Example #1
def transform(observations, nlp):
    logging.info('Begin transform')

    print("Extracting name, email, phone, GPA, and dates of work experience")
    observations = observations.fillna('')
    observations['candidate_name'] = observations['text'].apply(lambda x: field_extraction.candidate_name_extractor(x, nlp))
    observations['email'] = observations['text'].apply(lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))
    observations['GPA'] = observations['text'].apply(lambda x: field_extraction.gpa_extractor(x))
    observations['years_experience'] = observations['Work'].apply(lambda x: field_extraction.years_of_experience(x))
    observations['mos_experience'] = field_extraction.months_of_experience(observations['years_experience'])


    # observations['work_dates'] = observations['Work'].apply(
    #     lambda x: field_extraction.spacy_extractor_by_type(str(x).replace('\n', '. '), nlp, 'DATE', 2))

    # observations['uni'] = observations['Edu'].apply(
    #     lambda x: field_extraction.spacy_extractor_by_type(str(x), nlp, 'ORG', 2))

    observations = field_extraction.extract_fields(observations)  # search for terms in whole resume

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations
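
All of the examples on this page lean on lib.term_match to pull a regex hit out of the raw resume text. The project-local lib module is not shown here; below is a minimal sketch of such a helper, assuming it returns the first match or None. Only the name and signature are taken from the call sites above; the body is an assumption.

import re

def term_match(text, term_regex):
    # Hypothetical sketch of lib.term_match, inferred from its call sites above.
    # Return the first substring matching term_regex, or None if nothing matches.
    match = re.search(term_regex, str(text))
    return match.group(0) if match else None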
Example #2
def transform(observations, nlp):
    """Enrich parsed resumes with contact fields, candidate name, and skills.

    :param observations: DataFrame of parsed resumes with a `text` column
    :param nlp: loaded spaCy language model
    :return: tuple of (enriched observations DataFrame, nlp)
    """
    logging.info('Begin transform')

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(
            x, nlp, observations['email']))
    # Extract skills
    observations = field_extraction.extract_fields(observations)
    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp
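
Unlike Examples #1 and #3, this variant also hands the email column to candidate_name_extractor, presumably so the extractor can rule out name candidates that overlap the e-mail address. The extractor itself lives in the project-local field_extraction module; here is a minimal sketch of the two-argument variant, assuming it returns the first PERSON entity spaCy finds near the top of the resume. The body is an assumption; only the name and signature come from the calls on this page.

def candidate_name_extractor(text, nlp):
    # Hypothetical sketch: candidate names usually sit in the first few lines,
    # so run spaCy NER over the head of the document only.
    doc = nlp(str(text)[:500])
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            return ent.text
    return None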
Example #3
def transform(observations, root_file_path, nlp):
    logging.info("Extracting email, phone, GPA, and dates of work experience")
    observations = observations.fillna('')
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))
    observations['GPA'] = observations['text'].apply(
        lambda x: field_extraction.gpa_extractor(x))
    observations['years_experience'] = observations['Work'].apply(
        lambda x: field_extraction.years_of_experience(x))
    observations['mos_experience'] = field_extraction.months_of_experience(
        observations['years_experience'])

    # Convert the GPA match list to a single number
    GPA_REGEX = re.compile(r"[0-4]\.[0-9]{1,3}")
    observations.GPA.fillna('[]', inplace=True)
    observations['GPAnum'] = observations.GPA.apply(
        lambda x: GPA_REGEX.findall(str(x)))

    def getmax(x):
        # max() over an empty match list raises ValueError; treat as "no GPA found"
        try:
            return max(float(v) for v in x)
        except ValueError:
            return 0

    observations['GPAmax'] = observations['GPAnum'].apply(getmax)
    # 0 is the "no GPA found" sentinel; mark those rows as missing
    observations['GPAmax'] = observations['GPAmax'].apply(
        lambda x: np.nan if x == 0 else x)
    observations.drop('GPAnum', axis=1, inplace=True)

    observations = field_extraction.extract_fields(
        observations, root_file_path)  # search for terms in whole resume

    return observations
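
A typical driver for any of these transform variants looks like the sketch below, assuming the project-local field_extraction and lib modules are importable and that resumes have already been parsed into the text and Work columns. The sample data and the choice of spaCy model are assumptions for illustration.

import logging
import pandas as pd
import spacy

logging.basicConfig(level=logging.INFO)

nlp = spacy.load('en_core_web_sm')  # any English spaCy pipeline with NER works
observations = pd.DataFrame({
    'text': ['Jane Doe\njane.doe@example.com\n555-123-4567\nGPA: 3.8'],
    'Work': ['2015-2019: Data Analyst at ExampleCorp'],
})

# Example #1 signature: returns the enriched DataFrame
observations = transform(observations, nlp)
print(observations[['candidate_name', 'email', 'phone', 'GPA']])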