Example #1
import logging
import re

import field_extraction
import lib


def transform(observations, nlp):
    """Extract candidate name, contact fields, and skills from resume text."""
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))

    # Fall back to a regex name search on rows where the NLP extractor found nothing
    def name_fallback(row):
        if row['candidate_name'] == 'NOT FOUND':
            match = re.search(field_extraction.NAME_REGEX, row['text'],
                              re.IGNORECASE)
            if match:
                return match.group(0)
        return row['candidate_name']

    observations['candidate_name'] = observations.apply(name_fallback, axis=1)

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))

    # Extract skills
    observations = field_extraction.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp
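
Both contact extractions above delegate to lib.term_match. For reference, here is a minimal sketch of what such a regex helper might look like; the body below is an assumption for illustration, not the repo's actual implementation:

import re


def term_match(text, term_regex):
    # Hypothetical sketch of lib.term_match: return the first case-insensitive
    # regex match in the text, or None when nothing (or a non-string) is found
    try:
        match = re.search(term_regex, text, re.IGNORECASE)
        return match.group(0) if match else None
    except TypeError:
        # e.g. NaN left behind by a failed text extraction
        return None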
Example #2
import logging
import os

import pandas

import lib


def extract():
    """Walk the configured resume directory and load raw text from supported files."""
    logging.info('Begin extract')

    # Reference variables
    candidate_file_agg = list()

    # Create list of candidate files
    for root, subdirs, files in os.walk(lib.get_conf('resume_directory')):
        folder_files = map(lambda x: os.path.join(root, x), files)
        candidate_file_agg.extend(folder_files)

    # Convert list to a pandas DataFrame
    observations = pandas.DataFrame(data=candidate_file_agg,
                                    columns=['file_path'])
    logging.info('Found {} candidate files'.format(len(observations.index)))

    # Subset candidate files to supported extensions
    observations['extension'] = observations['file_path'].apply(
        lambda x: os.path.splitext(x)[1])
    observations = observations[observations['extension'].isin(
        lib.AVAILABLE_EXTENSIONS)]
    logging.info(
        'Subset candidate files to extensions w/ available parsers. {} files remain'
        .format(len(observations.index)))

    # Attempt to extract text from files
    observations['text'] = observations['file_path'].apply(lib.convert_pdf)

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
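
A minimal sketch of how extract and transform might be chained into a pipeline, assuming a spaCy model supplies the nlp object (the model name and output path are assumptions):

import spacy

# Assumption: a small English spaCy model drives the entity extraction
nlp = spacy.load('en_core_web_sm')

observations = extract()
observations, nlp = transform(observations, nlp)

# Persist the enriched observations for inspection
observations.to_csv('resume_summary.csv', index=False)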
Example #3
import logging

import field_extraction
import lib


def transform(observations, nlp):
    """Extract candidate name, contact, education, and skill fields from resume text."""
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))

    # Extract education data
    observations['university'] = observations['text'].apply(
        lambda x: field_extraction.university_extractor(x, nlp))
    observations['major'] = observations['text'].apply(
        lambda x: field_extraction.major_extractor(x, nlp))

    # Extract skills
    observations = field_extraction.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp
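
Every transform variant leans on field_extraction.candidate_name_extractor. A plausible sketch of such an extractor built on spaCy's named entities, assuming the candidate's name is the first PERSON entity in the document (this body is illustrative, not the repo's code):

def candidate_name_extractor(text, nlp):
    # Hypothetical sketch: return the first PERSON entity spaCy finds,
    # since a resume usually opens with the candidate's name
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            return ent.text
    return 'NOT FOUND'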