Esempio n. 1
0
def extract():
    logging.info('Begin extract')

    # Reference variables
    candidate_file_agg = list()

    # Create list of candidate files
    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    # Convert list to a pandas DataFrame
    observations = pandas.DataFrame(data=candidate_file_agg,
                                    columns=['file_path'])
    logging.info('Found {} candidate files'.format(len(observations.index)))

    # Subset candidate files to supported extensions
    observations['extension'] = observations['file_path'].apply(
        lambda x: os.path.splitext(x)[1])
    observations = observations[observations['extension'].isin(
        lib.AVAILABLE_EXTENSIONS)]
    logging.info(
        'Subset candidate files to extensions w/ available parsers. {} files remain'
        .format(len(observations.index)))

    # Attempt to extract text from files
    observations['text'] = observations['file_path'].apply(lib.convert_pdf)

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
Esempio n. 2
0
def load(observations, nlp):
    logging.info('Begin load')
    output_path = os.path.join(lib.get_conf('summary_output_directory'),
                               'resume_summary.csv')

    logging.info('Results being output to {}'.format(output_path))
    print('Results output to {}'.format(output_path))

    observations.to_csv(path_or_buf=output_path, index_label='index')
    logging.info('End transform')
    pass
Esempio n. 3
0
def save(observations):
    logging.info('Begin load')
    
    output_path = os.path.join(lib.get_conf('summary_output_directory'), 'resume_summary.csv')
    
    tmp = observations.copy()
    del tmp['text']
    tmp.to_csv(path_or_buf=output_path, index_label='index')
    
    logging.info('End transform')
    pass
Esempio n. 4
0
def extract_fields(df):
    for extractor, items_of_interest in lib.get_conf('extractors').items():
        df[extractor] = df['text'].apply(lambda x: extract_skills(x, extractor, items_of_interest))
    return df