def xml_features(data):
    """
    Extract details from selected xml tags and a tag-free copy of the resume.

    Parameters:
    -----------
    data -- parsed xml object
        Must provide ``.xpath()`` and be accepted by ``etree.tostring``
        (presumably an lxml element or element tree -- confirm with callers).

    Returns:
    --------
    features -- dict
        "jobs", "employers", "institution", "degree" hold the lists returned
        by the corresponding xpath queries; "raw_resume" holds the serialized
        xml with every ``<...>`` tag removed.
    """
    # Job titles are stored exactly as they appear in the xml; no
    # normalization is applied to the returned values.
    features = {
        "jobs": data.xpath('//job/title/text()'),
        "employers": data.xpath('//job/employer/text()'),
        "institution": data.xpath('//education/school/institution/text()'),
        "degree": data.xpath('//education/school/degree/text()'),
    }

    # Serialize the tree, then drop anything that looks like a tag.
    tag_pattern = re.compile(r'<.*?>')
    serialized = etree.tostring(data, pretty_print=True)
    features["raw_resume"] = tag_pattern.sub('', serialized)
    return features
def test_different_permutations_of_same_title_should_be_properly_normalized():
    # Case and word-order permutations of one title must all normalize to
    # the same canonical string.
    canonical_title = "director of finance"
    permutations = [
        "director of finance",
        "Director of Finance",
        "DIRECTOR OF FINANCE",
        "finance director",
        "Finance Director",
        "FINANCE DIRECTOR",
    ]

    for result in normalize_job_titles(permutations):
        assert_equals(canonical_title, result)
def test_different_variants_of_same_title_should_be_properly_normalized():
    # Abbreviated spellings (with and without trailing dots) must all expand
    # to the same canonical title.
    canonical_title = "senior project manager"
    abbreviated_variants = [
        "sr project mgr",
        "sr project mgr.",
        "sr project mngr",
        "sr project mngr.",
        "sr. project mgr",
        "sr. project mgr.",
        "sr. project mngr",
        "sr. project mngr.",
    ]

    for result in normalize_job_titles(abbreviated_variants):
        assert_equals(canonical_title, result)
def test_actual_titles_list_and_normalized_titles_list_should_be_of_same_length():
    # Normalization must yield exactly one output entry per input title.
    raw_titles = [
        "software developer",
        "ceo",
        "business analyst",
        "vp",
    ]
    result = normalize_job_titles(raw_titles)

    expected_length, actual_length = len(raw_titles), len(result)
    assert_equals(expected_length, actual_length)
def test_software_developer_should_be_normalized_as_software_engineer():
    # "software developer" is expected to map onto "software engineer".
    result = normalize_job_titles(["software developer"])
    assert_equals(result, ["software engineer"])
def extract_top_jobs(source_dir):
    """
    Function to extract top jobs

    Walks ``source_dir`` recursively, parses every ``.txt`` file as xml,
    collects the titles of jobs whose ``end`` attribute is "present" (one
    resume per distinct candidate name), normalizes them and returns the most
    frequent normalized titles. Summary statistics are printed along the way.

    Args:
        source_dir -- path to the source xml files.

    Returns:
        list with up to 200 normalized job titles, most frequent first.
    """
    # Keep the directory each file was found in: os.walk descends into
    # sub-directories, so joining only source_dir with the bare file name
    # would point at the wrong location for nested files.
    xml_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(source_dir)
        for filename in filenames
        if filename.endswith('.txt')
    ]

    seen_names = set()
    job_titles = []

    for xml_path in xml_paths:
        xml = etree.parse(xml_path)
        current_job_titles = xml.xpath('//job[@end = "present"]/title/text()')

        given_names = xml.xpath('//givenname/text()')
        surnames = xml.xpath('//surname/text()')
        if not given_names or not surnames:
            # Without a full candidate name the resume cannot be
            # de-duplicated, so it is skipped.
            continue

        # There are many duplicate resumes in the data; only the first resume
        # seen for each candidate name contributes titles.
        name = given_names[0] + ' ' + surnames[0]
        if name in seen_names:
            continue
        seen_names.add(name)

        job_titles.extend(current_job_titles)

    # Diagnostics on the raw titles.
    print(job_titles[:10])
    print(len(job_titles))
    print(len(set(job_titles)))
    print(Counter(job_titles).most_common(200))

    normalized_titles = normalize_job_titles(job_titles)

    # Diagnostics on the normalized titles.
    print(len(normalized_titles))
    print(len(set(normalized_titles)))
    top_normalized_jobs_counter = Counter(normalized_titles).most_common(200)
    print(top_normalized_jobs_counter)

    return [title for title, _count in top_normalized_jobs_counter]
def clean_data_and_extract_job_titles(fname, paths, names, job_titles, labels_list):
    """
    Function to clean data and extract job titles from resume.

    The resume xml is parsed, the contact section and every job whose ``end``
    attribute is "present" are removed, the remaining xml is converted to
    plain text with ``stripxml`` and the first normalized current job title
    is erased from that text. If that title is in ``top_jobs`` and its
    counter in ``job_count`` is still below 300 after being incremented, the
    text is written to the plaintext directory and a (file name, title) pair
    is appended to ``labels_list``.

    ``top_jobs`` and ``job_count`` are not parameters; they are read from the
    enclosing scope (presumably module-level globals -- confirm).

    Any ordinary exception raised while processing the file is swallowed and
    the three lists are returned in whatever state they were in.

    Parameters:
    -----------
    fname - string.
        Name of the resume file
    paths - dict
        Dict containing paths of source directories. Keys read here:
        'main_source_directory', 'xml_data_directory',
        'plaintext_data_directory'.
    names - list.
        Candidate names already seen. Used to remove duplicate resumes.
        Mutated in place.
    job_titles - list.
        Titles extracted from resume. Mutated in place.
    labels_list - list.
        Titles that will be used as labels for classifier. Mutated in place.

    Returns:
    --------
    names - list.
        Candidate names already seen. Used to remove duplicate resumes.
    job_titles - list.
        Titles extracted from resume
    labels_list - list.
        Titles that will be used as labels for classifier.
    """
    source_dir = paths['main_source_directory'] + '/' + paths['xml_data_directory']

    try:
        xml = etree.parse(source_dir + '/' + fname)

        # Current job title(s) and the matching job elements.
        current_job_title = normalize_job_titles(
            xml.xpath('//job[@end = "present"]/title/text()'))
        current_job = xml.xpath('//job[@end = "present"]')
        contact = xml.xpath('//contact')

        # Many resumes are duplicated in the data; keep only the first resume
        # seen for each candidate name. A missing name raises IndexError and
        # the resume is skipped by the handler below.
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]
        if name in names:
            return names, job_titles, labels_list
        names.append(name)

        # Remove the candidate contact information from the resume.
        if contact:
            contact[0].getparent().remove(contact[0])

        # Remove every current-job section: its title is the label to be
        # predicted, so it must not remain in the text.
        for job in current_job:
            job.getparent().remove(job)

        # Convert xml to string and strip the xml tags.
        text_data = stripxml(etree.tostring(xml, pretty_print=True))

        if current_job_title:
            title = current_job_title[0].strip()

            # Erase remaining mentions of the current title from the text.
            text_data = text_data.replace(title, '')

            if title in top_jobs:
                # The counter is incremented before the cap is checked, so it
                # counts every matching resume, saved or not.
                job_count[title] += 1
                if job_count[title] < 300:
                    job_titles.append(title)
                    plaintext_name = fname[:-4] + '_plaintext.txt'
                    directory = (paths['main_source_directory'] + '/' +
                                 paths['plaintext_data_directory'] + '/')
                    # The context manager closes the file even if the write
                    # fails.
                    with open(directory + plaintext_name, 'w') as plaintext_file:
                        plaintext_file.write(text_data)
                    labels_list.append((plaintext_name, title.replace('\n', '')))

        return names, job_titles, labels_list
    except Exception:
        # Best effort: a resume that cannot be processed is skipped.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        return names, job_titles, labels_list
def extract_top_jobs(source_dir):
    """
    Function to extract top jobs

    Every ``.txt`` file found under ``source_dir`` (sub-directories included)
    is parsed as xml. For the first resume seen per candidate name, the
    titles of jobs with ``end="present"`` are collected. The collected titles
    are normalized and the most frequent normalized titles are returned.
    Counts and samples are printed as diagnostics.

    Args:
        source_dir -- path to the source xml files.

    Returns:
        list with up to 200 normalized job titles, most frequent first.
    """
    unique_candidates = set()
    collected_titles = []

    for dirpath, _dirnames, filenames in os.walk(source_dir):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue

            # Build the path from the directory os.walk is currently in, so
            # files inside sub-directories are opened from where they live.
            xml = etree.parse(os.path.join(dirpath, filename))
            present_titles = xml.xpath('//job[@end = "present"]/title/text()')

            first_names = xml.xpath('//givenname/text()')
            last_names = xml.xpath('//surname/text()')
            if not (first_names and last_names):
                # No complete candidate name: the resume cannot take part in
                # duplicate filtering and is ignored.
                continue

            # Duplicate resumes are filtered out by candidate name.
            candidate = first_names[0] + ' ' + last_names[0]
            if candidate not in unique_candidates:
                unique_candidates.add(candidate)
                collected_titles.extend(present_titles)

    # Diagnostics before normalization.
    print(collected_titles[:10])
    print(len(collected_titles))
    print(len(set(collected_titles)))
    print(Counter(collected_titles).most_common(200))

    normalized_titles = normalize_job_titles(collected_titles)

    # Diagnostics after normalization.
    print(len(normalized_titles))
    print(len(set(normalized_titles)))
    ranked = Counter(normalized_titles).most_common(200)
    print(ranked)

    # most_common yields (title, count) pairs; only the titles are returned.
    return [pair[0] for pair in ranked]
def clean_data_and_extract_job_titles(fname, paths, names, job_titles, labels_list):
    """
    Function to clean data and extract job titles from resume.

    The resume xml is parsed (parse errors propagate to the caller), the
    contact section and every job whose ``end`` attribute is "present" are
    removed, the remaining xml is converted to plain text with ``stripxml``
    and the first normalized current job title is erased from that text and
    appended to ``job_titles``. If that title is in ``top_jobs`` and its
    counter in ``job_count`` is still below 250 after being incremented, the
    text is written to the plaintext directory and a (file name, title) pair
    is appended to ``labels_list``.

    ``top_jobs`` and ``job_count`` are not parameters; they are read from the
    enclosing scope (presumably module-level globals -- confirm).

    Ordinary exceptions raised after parsing are swallowed and the three
    lists are returned in whatever state they were in.

    Parameters:
    -----------
    fname - string.
        Name of the resume file
    paths - dict
        Dict containing paths of source directories. Keys read here:
        'main_source_directory', 'xml_data_directory',
        'plaintext_data_directory'.
    names - list.
        Candidate names already seen. Used to remove duplicate resumes.
        Mutated in place.
    job_titles - list.
        Titles extracted from resume. Mutated in place.
    labels_list - list.
        Titles that will be used as labels for classifier. Mutated in place.

    Returns:
    --------
    names - list.
        Candidate names already seen. Used to remove duplicate resumes.
    job_titles - list.
        Titles extracted from resume
    labels_list - list.
        Titles that will be used as labels for classifier.
    """
    source_dir = paths['main_source_directory'] + '/' + paths['xml_data_directory']
    xml = etree.parse(source_dir + '/' + fname)

    # Current job title(s) and the matching job elements.
    current_job_title = normalize_job_titles(
        xml.xpath('//job[@end = "present"]/title/text()'))
    current_job = xml.xpath('//job[@end = "present"]')
    contact = xml.xpath('//contact')

    try:
        # Many resumes are duplicated in the data; keep only the first resume
        # seen for each candidate name. A missing name raises IndexError and
        # the resume is skipped by the handler below.
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]
        if name in names:
            return names, job_titles, labels_list
        names.append(name)

        # Remove the candidate contact information from the resume.
        if contact:
            contact[0].getparent().remove(contact[0])

        # Remove every current-job section: its title is the label to be
        # predicted, so it must not remain in the text.
        for job in current_job:
            job.getparent().remove(job)

        # Convert xml to string and strip the xml tags.
        text_data = stripxml(etree.tostring(xml, pretty_print=True))

        if current_job_title:
            title = current_job_title[0].strip()

            # Erase remaining mentions of the current title from the text.
            text_data = text_data.replace(title, '')
            job_titles.append(title)

            if title in top_jobs:
                # The counter is incremented before the cap is checked, so it
                # counts every matching resume, saved or not.
                job_count[title] += 1
                if job_count[title] < 250:
                    plaintext_name = fname[:-4] + '_plaintext.txt'
                    directory = (paths['main_source_directory'] + '/' +
                                 paths['plaintext_data_directory'] + '/')
                    # The context manager closes the file even if the write
                    # fails.
                    with open(directory + plaintext_name, 'w') as plaintext_file:
                        plaintext_file.write(text_data)
                    labels_list.append((plaintext_name, title.replace('\n', '')))

        return names, job_titles, labels_list
    except Exception:
        # Best effort: a resume that cannot be processed is skipped.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        return names, job_titles, labels_list
def test_actual_titles_list_and_normalized_titles_list_should_be_of_same_length():
    # The normalizer must return one entry for each title it was given.
    given_titles = ["software developer", "ceo", "business analyst", "vp"]
    normalized = normalize_job_titles(given_titles)

    given_count = len(given_titles)
    normalized_count = len(normalized)
    assert_equals(given_count, normalized_count)