def extract_skills_case_sensitive(resume_text, items_of_interest):
    """Return the skills from *items_of_interest* that occur in *resume_text*.

    Each entry of ``items_of_interest`` is either a single string (the skill
    name, which is also its only alias) or a non-empty list/tuple of strings
    whose first element is the skill name and whose elements are all the
    aliases to search for. Entries of any other shape (``None``, numbers,
    empty sequences, ...) are silently skipped.

    Before searching, hyphens in the resume text are replaced by spaces and
    colons, commas and apostrophes are removed. Every alias is lower-cased and
    counted with ``lib.term_count``; a skill matches when the total count over
    all of its aliases is greater than zero.

    NOTE(review): despite the function name, aliases are lower-cased before
    the search, exactly as in ``extract_skills_case_agnostic``; whether the
    match is actually case sensitive depends on ``lib.term_count`` -- confirm.

    Args:
        resume_text: The resume contents as a string.
        items_of_interest: Iterable of skill strings and/or alias sequences.

    Returns:
        A list (in arbitrary order) of the matched skill names with all
        spaces removed; an empty list when nothing matched.
    """
    # Map each canonical skill name to the aliases that count as a match.
    aliases_by_skill = {}
    for skill_input in items_of_interest:
        if isinstance(skill_input, str):
            aliases_by_skill[skill_input] = [skill_input]
        elif isinstance(skill_input, (list, tuple)) and skill_input:
            aliases_by_skill[skill_input[0]] = skill_input
        # Any other entry is ignored instead of crashing on len()/indexing.

    if not aliases_by_skill:
        return []

    # Normalise the resume once rather than once per alias.
    searchable_text = (
        resume_text.replace('-', ' ')
        .replace(':', '')
        .replace(',', '')
        .replace('\'', '')
    )

    matched_skills = set()
    for skill_name, skill_aliases in aliases_by_skill.items():
        # TODO incorporate word2vec here?
        total_matches = sum(
            lib.term_count(searchable_text, alias.lower())
            for alias in skill_aliases
        )
        if total_matches > 0:
            # Stored without spaces, e.g. 'machine learning' -> 'machinelearning'.
            matched_skills.add(skill_name.replace('\x20', ''))

    # list() of an empty set is already [], so no special case is needed.
    return list(matched_skills)
def extract_skills(resume_text, extractor, items_of_interest):
    """Return the set of skills from *items_of_interest* found in *resume_text*.

    Each entry of ``items_of_interest`` is either a single string (the skill
    name, which is also its only alias) or a non-empty list of strings whose
    first element is the skill name and whose elements are all the aliases to
    search for. Any other entry is reported with a warning and skipped.

    Every alias is lower-cased and counted in ``resume_text`` with
    ``lib.term_count``; a skill matches when the total count over all of its
    aliases is greater than zero.

    Args:
        resume_text: The resume contents, passed unchanged to
            ``lib.term_count``.
        extractor: Unused by this function; kept so existing callers keep
            working.
        items_of_interest: Iterable of skill strings and/or alias lists.

    Returns:
        A set of the matched skill names (possibly empty).
    """
    # TODO This skill input formatting could happen once per run, instead of
    # once per observation.
    aliases_by_skill = {}
    for skill_input in items_of_interest:
        if isinstance(skill_input, list) and skill_input:
            # List input: first element names the skill, all are aliases.
            aliases_by_skill[skill_input[0]] = skill_input
        elif isinstance(skill_input, str):
            # String input: the skill is its own single alias.
            aliases_by_skill[skill_input] = [skill_input]
        else:
            # logging.warn is a deprecated alias; use logging.warning with
            # lazy %-style arguments.
            logging.warning(
                'Unknown skill listing type: %s. Please format as either a '
                'single string or a list of strings',
                skill_input,
            )

    matched_skills = set()
    for skill_name, skill_aliases in aliases_by_skill.items():
        # Add up the number of matches over every alias of this skill.
        total_matches = sum(
            lib.term_count(resume_text, alias.lower())
            for alias in skill_aliases
        )
        # If at least one alias is found, record the skill name.
        if total_matches > 0:
            matched_skills.add(skill_name)
    return matched_skills
def extract_skills_case_agnostic(resume_text, items_of_interest):
    """Return the skills from *items_of_interest* that occur in *resume_text*.

    Each entry of ``items_of_interest`` is either a single string (the skill
    name, which is also its only alias) or a non-empty list/tuple of strings
    (e.g. one line of equivalent words from the yaml file) whose first element
    is the skill name and whose elements are all the aliases to search for.
    Entries of any other shape (``None``, numbers, empty sequences, ...) are
    silently skipped.

    Before searching, hyphens in the resume text are replaced by spaces and
    colons, commas and apostrophes are removed. Every alias is lower-cased and
    counted with ``lib.term_count``; a skill matches when the total count over
    all of its aliases is greater than zero.

    NOTE(review): only the aliases are lower-cased here, not the resume text;
    the match is case agnostic only if ``lib.term_count`` ignores case or the
    caller passes lower-cased text -- confirm.

    Args:
        resume_text: The resume contents as a string.
        items_of_interest: Iterable of skill strings and/or alias sequences.

    Returns:
        A list (in arbitrary order) of the matched skill names with all
        spaces removed; an empty list when nothing matched.
    """
    # Map each canonical skill name to the aliases that count as a match.
    skill_aliases_by_name = {}
    for skill_input in items_of_interest:
        if isinstance(skill_input, str):
            # String input: the skill is its own single alias.
            skill_aliases_by_name[skill_input] = [skill_input]
        elif isinstance(skill_input, (list, tuple)) and skill_input:
            # Sequence input: first element names the skill.
            skill_aliases_by_name[skill_input[0]] = skill_input
        # Any other entry is ignored instead of crashing on len()/indexing.

    if not skill_aliases_by_name:
        return []

    # Normalise the resume once rather than once per alias.
    cleaned_text = (
        resume_text.replace('-', ' ')
        .replace(':', '')
        .replace(',', '')
        .replace('\'', '')
    )

    found = set()
    for skill_name, alias_list in skill_aliases_by_name.items():
        # TODO incorporate word2vec here?
        hits = sum(
            lib.term_count(cleaned_text, alias.lower()) for alias in alias_list
        )
        if hits > 0:
            # At least one alias was found; store the name without spaces.
            found.add(skill_name.replace('\x20', ''))

    # An empty set converts to [], so callers never see 'set()' in the csv.
    return list(found)
def extract_universities(resume_text):
    """Return the configured universities that appear in *resume_text*.

    Both the resume and every name from ``lib.get_conf('universities')`` are
    run through ``simple_preprocess`` and re-joined with single spaces, so the
    comparison happens on the same normalised form. A university is kept when
    ``lib.term_count`` reports at least one occurrence of it in the resume.

    Returns:
        A set of the matched university names in their normalised form
        (possibly empty).
    """
    # Normalise the resume once; it is the haystack for every lookup below.
    haystack = ' '.join(simple_preprocess(resume_text))

    # Lazily normalise each configured university the same way.
    normalised_names = (
        ' '.join(simple_preprocess(raw_name))
        for raw_name in lib.get_conf('universities')
    )

    return {
        name
        for name in normalised_names
        if lib.term_count(haystack, name) > 0
    }