import logging  # stdlib; `extract_entities` and `lib` below are project-local helper modules


def transform(observations, nlp):
    """Extract candidate name, nationality, contact details, and skills from resume text."""
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: extract_entities.candidate_name_extractor(x, nlp))

    # Extract nationality
    observations['nationality'] = observations['text'].apply(
        lambda x: extract_entities.nationality_extractor(x, nlp))

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.PHONE_REGEX))
    observations['birthdate'] = observations['text'].apply(
        lambda x: lib.birthdate_match(x, extract_entities.BIRTHDATE_REGEX))
    observations['unit_postcode'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.UNIT_POSTCODE_REGEX))
    observations['url'] = observations['text'].apply(
        lambda x: lib.term_match(x, extract_entities.URL_REGEX))

    # Extract skills
    observations = extract_entities.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp
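# A minimal sketch of the `lib.term_match` helper used above, under the
# assumption that it scans the text with a regex and returns the matches
# (or None when there are none); the real `lib` module may differ.
import re

EMAIL_REGEX = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+")  # assumed pattern


def term_match(text, pattern):
    """Return all non-overlapping matches of `pattern` in `text`, or None."""
    matches = re.findall(pattern, text)
    return matches if matches else None


print(term_match("Contact: jane.doe@example.com", EMAIL_REGEX))
# ['jane.doe@example.com']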
def transform(observations, nlp):
    """Extract candidate name, contact fields, universities, and skills from resume text."""
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))

    # Extract universities
    observations['universities'] = observations['text'].apply(
        field_extraction.extract_universities)

    # Extract skills
    observations['skills'] = observations['text'].apply(
        field_extraction.extract_skills)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp
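# Hypothetical sketch of a vocabulary-based extractor in the spirit of
# `field_extraction.extract_skills`: keep every term from a fixed skill list
# that appears as a whole word in the lowercased resume. The vocabulary and
# matching rule are assumptions, not the project's actual logic.
import re

SKILL_TERMS = ['python', 'sql', 'machine learning', 'excel']  # illustrative list


def extract_skills(text):
    text_lower = text.lower()
    return [term for term in SKILL_TERMS
            if re.search(r'\b' + re.escape(term) + r'\b', text_lower)]


print(extract_skills('Experienced in Python, SQL, and machine learning.'))
# ['python', 'sql', 'machine learning']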
def transform(observations, nlp):
    logging.info('Begin transform')
    print("Extracting name, email, phone, GPA, and dates of work experience")

    # Replace NaNs so the string-based extractors below don't fail
    observations = observations.fillna('')

    # Extract candidate name and contact fields
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))

    # Extract GPA and work experience
    observations['GPA'] = observations['text'].apply(
        lambda x: field_extraction.gpa_extractor(x))
    observations['years_experience'] = observations['Work'].apply(
        lambda x: field_extraction.years_of_experience(x))
    observations['mos_experience'] = field_extraction.months_of_experience(
        observations['years_experience'])

    # observations['work_dates'] = observations['Work'].apply(
    #     lambda x: field_extraction.spacy_extractor_by_type(str(x).replace('\n', '. '), nlp, 'DATE', 2))
    # observations['uni'] = observations['Edu'].apply(
    #     lambda x: field_extraction.spacy_extractor_by_type(str(x), nlp, 'ORG', 2))

    # Search for terms in the whole resume
    observations = field_extraction.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations
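# Example call for the variant above, assuming a pandas DataFrame with the
# columns it reads ('text' and 'Work') and an installed spaCy English model;
# the model name and sample rows are illustrative only.
import pandas as pd
import spacy

observations = pd.DataFrame({
    'text': ['Jane Doe\njane.doe@example.com\n(555) 123-4567\nGPA: 3.8'],
    'Work': ['Data Analyst, Acme Corp, 2018-2021'],
})
nlp = spacy.load('en_core_web_sm')  # assumed model
observations = transform(observations, nlp)
print(observations[['candidate_name', 'email', 'phone', 'GPA']])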
import logging
import re

import numpy as np


def transform(observations, root_file_path, nlp):
    logging.info("Extracting email, phone, GPA, and dates of work experience")

    # Replace NaNs so the string-based extractors below don't fail
    observations = observations.fillna('')

    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))
    observations['GPA'] = observations['text'].apply(
        lambda x: field_extraction.gpa_extractor(x))
    observations['years_experience'] = observations['Work'].apply(
        lambda x: field_extraction.years_of_experience(x))
    observations['mos_experience'] = field_extraction.months_of_experience(
        observations['years_experience'])

    # Convert the extracted GPA strings to a single number per candidate
    GPA_REGEX = r"[01234]\.[0-9]{1,3}"
    observations.GPA.fillna('[]', inplace=True)
    observations['GPAnum'] = observations.GPA.apply(
        lambda x: re.findall(GPA_REGEX, str(x)))

    def getmax(x):
        # Numerically largest GPA found; 0 when the list is empty
        try:
            return max(x, key=float)
        except ValueError:
            return 0

    observations['GPAmax'] = observations['GPAnum'].apply(getmax)
    observations['GPAmax'] = observations['GPAmax'].apply(
        lambda x: np.nan if x == 0 else x)
    observations.drop('GPAnum', axis=1, inplace=True)

    # Search for terms in the whole resume
    observations = field_extraction.extract_fields(observations, root_file_path)
    return observations
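# Standalone demonstration of the GPA-normalization step above: re.findall
# pulls candidate numbers out of the stringified extractor output, and the
# numerically largest one is kept (None when nothing matched). Sample inputs
# are illustrative.
import re

GPA_REGEX = r"[01234]\.[0-9]{1,3}"

for raw in ["['GPA: 3.71', '3.9/4.0']", "[]"]:
    nums = re.findall(GPA_REGEX, raw)
    best = max(nums, key=float, default=None)
    print(nums, best)
# ['3.71', '3.9', '4.0'] 4.0
# [] None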
def transform(observations, nlp):
    logging.info('Begin transform')

    # Extract candidate name
    observations['name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))

    # Extract contact fields
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))

    # Extract GPA
    observations['gpa'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.GPA_REGEX))

    # observations['company'] = observations['text'].apply(
    #     lambda x: field_extraction.company_name_extractor(x, nlp))

    # Extract skills
    observations = field_extraction.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp
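# One plausible shape for the `candidate_name_extractor` helper that all five
# variants share: return the first PERSON entity spaCy finds near the top of
# the resume. This is an assumption about the helper, not its actual code.
import spacy


def candidate_name_extractor(text, nlp, max_chars=500):
    doc = nlp(text[:max_chars])  # candidate names usually sit in the header
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            return ent.text
    return None


nlp = spacy.load('en_core_web_sm')  # assumed model
print(candidate_name_extractor('Jane Doe\nData Scientist\n...', nlp))
# Jane Doe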