def transform(observations, nlp):
    """Extract candidate name, contact fields, and skills from resume text.

    Args:
        observations (pandas.DataFrame): one row per resume; must contain a
            'text' column with the raw resume text.
        nlp: spaCy language model passed through to the extractors.

    Returns:
        tuple: (observations with added 'candidate_name', 'email', 'phone'
        and skill columns, nlp).
    """
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))

    # Fallback: the original code compared the whole Series to "NOT FOUND",
    # which raises ValueError (truth value of a Series is ambiguous) and then
    # passed a Series to re.search. Apply the regex fallback per row instead,
    # keeping "NOT FOUND" when the regex also finds nothing.
    def _name_fallback(row):
        if row['candidate_name'] == "NOT FOUND":
            match = re.search(field_extraction.NAME_REGEX, row['text'],
                              re.IGNORECASE)
            if match:
                return match[0]
        return row['candidate_name']

    observations['candidate_name'] = observations.apply(_name_fallback, axis=1)

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))

    # Extract skills
    observations = field_extraction.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp
def extract():
    """Walk the configured resume directory and load parseable files.

    Returns:
        pandas.DataFrame: one row per supported file, with 'file_path',
        'extension' and extracted 'text' columns.
    """
    logging.info('Begin extract')

    # Collect every file path under the configured resume directory.
    candidate_paths = []
    for dirpath, _dirnames, filenames in os.walk(lib.get_conf('resume_directory')):
        candidate_paths.extend(os.path.join(dirpath, name) for name in filenames)

    # One row per candidate file.
    observations = pandas.DataFrame(data=candidate_paths, columns=['file_path'])
    logging.info('Found {} candidate files'.format(len(observations.index)))

    # Keep only files whose extension has an available parser.
    observations['extension'] = observations['file_path'].apply(
        lambda path: os.path.splitext(path)[1])
    supported = observations['extension'].isin(lib.AVAILABLE_EXTENSIONS)
    observations = observations[supported]
    logging.info(
        'Subset candidate files to extensions w/ available parsers. {} files remain'
        .format(len(observations.index)))

    # Pull raw text out of each remaining file.
    observations['text'] = observations['file_path'].apply(lib.convert_pdf)

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations
def transform(observations, nlp):
    """Extract name, contact, education, and skill fields from resume text.

    Args:
        observations (pandas.DataFrame): one row per resume; must contain a
            'text' column with the raw resume text.
        nlp: spaCy language model passed through to the extractors.

    Returns:
        tuple: (observations with added 'candidate_name', 'email', 'phone',
        'university', 'Major' and skill columns, nlp).
    """
    logging.info('Begin transform')

    # Extract candidate name
    observations['candidate_name'] = observations['text'].apply(
        lambda x: field_extraction.candidate_name_extractor(x, nlp))

    # Extract contact fields
    observations['email'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.EMAIL_REGEX))
    observations['phone'] = observations['text'].apply(
        lambda x: lib.term_match(x, field_extraction.PHONE_REGEX))

    # Extract education data (removed unused `count = 0` local)
    observations['university'] = observations['text'].apply(
        lambda x: field_extraction.university_extractor(x, nlp))
    observations['Major'] = observations['text'].apply(
        lambda x: field_extraction.major_extractor(x, nlp))

    # Extract skills
    observations = field_extraction.extract_fields(observations)

    # Archive schema and return
    lib.archive_dataset_schemas('transform', locals(), globals())
    logging.info('End transform')
    return observations, nlp