def extract_univ(data, univ_dict, univ_normalize):
    """
    Extract a university name from resume text.

    Parameters:
    -----------
    data -- string
        Resume text
    univ_dict -- dict
        University lookup dictionary
    univ_normalize -- dict
        University alias dictionary

    Returns:
    --------
    university -- string
        Matched university name, "" when no match is found.  On a
        Unicode encode/decode failure the partially-cleaned text is
        returned unchanged (original behavior, preserved for callers).
    """
    try:
        data = stripxml(str(data))
        data = data.lower()
        data = data.replace('\xc2\xa0', ' ')
        data = re.sub('[^A-Za-z0-9 ]+', ' ', data)
        # Bug fix: the original pattern ' ' replaced a single space with a
        # single space (a no-op); ' +' actually collapses runs of spaces
        # left behind by the punctuation-stripping sub above.
        data = re.sub(' +', ' ', data)
        # Search only the 150 characters after "education" when that
        # heading is present -- university names usually follow it.
        if 'education' in data:
            second = data.split('education')[1][:150]
        else:
            second = data
        # Try the longest n-grams (10 words) first so the most specific
        # university name wins over shorter partial matches.
        n = 10
        while n > 1:
            for ngram in ngrams(second, n):
                key = ngram.lower()
                if key in univ_normalize:
                    return univ_normalize[key]
                elif key in univ_dict:
                    return ngram.title()
            n -= 1
        return ""
    except (UnicodeEncodeError, UnicodeDecodeError):
        return data
def extract_univ(data, univ_dict, univ_normalize):
    """
    Look up a university name inside resume text.

    Parameters:
    -----------
    data -- string
        Resume text
    univ_dict -- dict
        University lookup dictionary
    univ_normalize -- dict
        University alias dictionary

    Returns:
    --------
    university -- string
        Matched university name, "" when nothing matches, or the
        cleaned text itself when a Unicode error occurs.
    """
    try:
        data = stripxml(str(data))
        data = data.lower()
        data = data.replace('\xc2\xa0', ' ')
        data = re.sub('[^A-Za-z0-9 ]+', ' ', str(data))
        data = re.sub(' ', ' ', str(data))
        # Prefer the 150 characters right after the "education" heading.
        if 'education' not in data:
            segment = data
        else:
            segment = data.split('education')[1][:150]
        # Scan n-grams from the longest (10 words) down to 2-grams so a
        # more specific name is matched before a shorter one.
        for size in range(10, 1, -1):
            for gram in ngrams(str(segment).lower(), size):
                lowered = gram.lower()
                if lowered in univ_normalize:
                    return univ_normalize[str(lowered)]
                if lowered in univ_dict:
                    return gram.title()
        return ""
    except (UnicodeEncodeError, UnicodeDecodeError):
        return data
def extract_from_resume(data):
    """
    Extract a university name from resume text by matching against a
    CSV catalogue of known university names.

    Parameters:
    -----------
    data -- string
        Resume text

    Returns:
    --------
    out -- dict
        'split2' -- 150-char window after "education" (when present)
        'univ'   -- first multi-word university name found (when any)
    """
    out = {}
    data = stripxml(str(data))
    data = data.lower()
    data = re.sub('[^A-Za-z0-9 ]+', ' ', data)
    # Bug fix: the original ' ' -> ' ' substitution was a no-op; ' +'
    # actually collapses the space runs left by the sub above, so the
    # substring test below compares consistently-spaced text.
    data = re.sub(' +', ' ', data)
    # Limit the search window to the text right after "education".
    if 'education' in data:
        second = data.split('education')[1][:150]
        out['split2'] = second
    else:
        second = data
    # NOTE(review): 'rb' is the Python 2 csv convention; under Python 3
    # this should be open(..., 'r', newline='') -- confirm interpreter.
    with open("static/UniqueFBUnivNames.csv", 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            # str(row) renders the whole row list ("['name']"); the sub
            # strips brackets/quotes so only the name's words remain.
            row = re.sub('[^A-Za-z0-9 ]+', ' ', str(row))
            row = re.sub(' +', ' ', row)
            if row.lower() in second:
                # Require at least two words to avoid false positives
                # on single common words.
                if len(row.split()) > 1:
                    out['univ'] = row
                    # Bug fix: removed the unreachable `break` that
                    # followed this return.
                    return out
    return out
def extract_from_resume(data):
    """
    Scan resume text for a known university name.

    Parameters:
    -----------
    data -- string
        Resume text

    Returns:
    --------
    out -- dict
        Dictionary of university details ('split2' and/or 'univ' keys).
    """
    out = {}
    text = stripxml(str(data)).lower()
    text = re.sub('[^A-Za-z0-9 ]+', ' ', str(text))
    text = re.sub(' ', ' ', str(text))
    # Restrict matching to the 150 chars after the "education" heading
    # when that heading exists; otherwise use the whole text.
    if 'education' not in text:
        window = text
    else:
        window = text.split('education')[1][:150]
        out['split2'] = window
    # Compare each catalogued university name against the text window.
    with open("static/UniqueFBUnivNames.csv", 'rb') as f:
        for entry in csv.reader(f):
            name = re.sub('[^A-Za-z0-9 ]+', ' ', str(entry))
            name = re.sub(' ', ' ', str(name))
            # Multi-word requirement guards against single common words.
            if str(name).lower() in window and len(str(name).split()) > 1:
                out['univ'] = str(name)
                return out
    return out
# Walk the resume XML corpus and collect (university, major) pairs per
# unique candidate.
# NOTE(review): this fragment's original indentation was lost and its
# try-block's `except` clause is outside this chunk -- the structure
# below is a reconstruction; confirm against the full file.
for root, dirs, files in os.walk(xml_directory, topdown=False):
    for f in files:
        try:
            xml = etree.parse(xml_directory + '/' + f)
            # Candidate name, used to process each resume only once.
            name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]
            if name not in names:
                names.append(name)
                education = xml.xpath('//education')[0]
                schools = education.xpath('//school')
                resume_id = f.split('.')[0]
                temp_univ_major_list = []
                for school in schools:
                    school_text = stripxml(etree.tostring(school))
                    #institution = school.xpath('institution/text()')[0]
                    institution = extract_univ(
                        school_text, univ_dict, univ_normalize)
                    # Strip punctuation from the extracted name.
                    institution = re.sub ('[^A-Za-z0-9 ]+',' ',str(institution))
                    institution = re.sub (' ',' ',str(institution))
                    #print institution
                    # Map known aliases to a canonical university name.
                    # NOTE(review): membership is tested with
                    # institution.lower() but the lookup uses the
                    # unlowered key -- may KeyError when case differs;
                    # confirm univ_normalize's key casing.
                    if institution.lower() in univ_normalize:
                        #print "NORMALIZED"
                        institution = univ_normalize[institution]
                    degree_level = school.xpath('degree/@level')[0]
                    degree = school.xpath('degree/text()')[0]
                    major_code = str(school.xpath('major/@code')[0])
                    major = school.xpath('major/text()')[0]
                    # Accumulate "institution_majorcode" keys for this resume.
                    temp_univ_major_list.append(str(institution + '_' + major_code).lower())
def clean_data_and_extract_job_titles(fname, paths, names, job_titles, labels_list):
    """
    Clean a resume and extract its current job title for use as a
    classifier label.

    Parameters:
    -----------
    fname - string.
        Name of the resume file
    paths - dict
        Dict containing paths of source directories
    names - list.
        Candidate names already seen; used to skip duplicate resumes.
    job_titles - list.
        Titles extracted from resumes so far.
    labels_list - list.
        (plaintext filename, title) pairs used as classifier labels.

    Returns:
    --------
    names, job_titles, labels_list -- the (possibly extended) inputs.
    """
    source_dir = paths['main_source_directory'] + '/' + paths['xml_data_directory']
    try:
        xml = etree.parse(source_dir + '/' + fname)
        # The current job title becomes the label, so the current-job
        # section must be scrubbed from the training text below.
        current_job_title = xml.xpath('//job[@end = "present"]/title/text()')
        current_job_title = normalize_job_titles(current_job_title)
        current_job = xml.xpath('//job[@end = "present"]')
        contact = xml.xpath('//contact')
        # Since there are many duplicate resumes in the data, filter by
        # candidate name and only keep the first occurrence.
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]
        if name in names:
            return names, job_titles, labels_list
        names.append(name)
        # Remove personally-identifying contact information.
        if contact:
            contact[0].getparent().remove(contact[0])
        # Remove every current-job element (the original index-while loop
        # and single-element special case collapse to one for loop).
        for job in current_job:
            job.getparent().remove(job)
        # Convert the pruned tree back to plain text.
        text_data = stripxml(etree.tostring(xml, pretty_print=True))
        flag = 0
        if current_job_title:
            title = current_job_title[0].strip()
            # Scrub the label text itself from the training data.
            text_data = text_data.replace(title, '')
            if title in top_jobs:
                flag = 1
                job_count[title] += 1
        # Only keep resumes whose current title is a top job, capped at
        # 300 resumes per title.  flag == 1 implies title is defined.
        if flag == 1 and job_count[title] < 300:
            job_titles.append(title)
            directory = paths['main_source_directory'] + '/' + paths['plaintext_data_directory'] + '/'
            # Bug fix: context manager closes the file even if write fails.
            with open(directory + '%s' % fname[:-4] + '_plaintext.txt', 'w') as out:
                out.write(text_data)
            labels_list.append((fname[:-4] + '_plaintext.txt', title.replace('\n', '')))
        return names, job_titles, labels_list
    except Exception:
        # Bug fix: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt still propagate; malformed resumes are
        # skipped best-effort as before.
        return names, job_titles, labels_list
def clean_data_and_extract_job_titles(fname, paths, names, job_titles, labels_list):
    """
    Function to clean data and extract job titles from resume.

    Parameters:
    -----------
    fname - string.
        Name of the resume file
    paths - dict
        Dict containing paths of source directories
    names - string.
        Extracted candidate names from resume. Used to remove duplicate resumes.
    job_titles - list.
        Titles extracted from resume
    labels_list - list.
        Titles that will be used as labels for classifier.

    Returns:
    --------
    names - string.
        Extracted candidate names from resume. Used to remove duplicate resumes.
    job_titles - list.
        Titles extracted from resume
    labels_list - list.
        Titles that will be used as labels for classifier.
    """
    source_dir = paths['main_source_directory'] + '/' + paths['xml_data_directory']
    # NOTE(review): unlike the sibling variant of this function, the parse
    # below sits OUTSIDE the try, so a malformed XML file raises instead
    # of being skipped -- confirm that is intended.
    xml = etree.parse(source_dir + '/' + fname)
    # Extract the current job title, and current job element from xml
    current_job_title = xml.xpath('//job[@end = "present"]/title/text()')
    current_job_title = normalize_job_titles(current_job_title)
    current_job = xml.xpath('//job[@end = "present"]')
    # Extract the contact information from xml.
    contact = xml.xpath('//contact')
    try:
        # Since there are many duplicate resumes in the data, filter out the resumes based on candidate name.
        # Extract the candidate name from the resume
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]
        # Avoid duplicate resumes by only choosing resumes with unique names
        if name not in names:
            names.append(name)
        else:
            return names, job_titles, labels_list
        # Remove the candidate contact information from the resume.
        if contact:
            contact[0].getparent().remove(contact[0])
        # Remove the current job section from the resume as we will be using current job title as lable and
        # use our algorithm to predict it.
        if current_job:
            if len(current_job) > 1:
                i = 0
                while i < len(current_job):
                    current_job[i].getparent().remove(current_job[i])
                    i += 1
            else:
                current_job[0].getparent().remove(current_job[0])
        # Convert xml to string.
        xml = etree.tostring(xml, pretty_print=True)
        # Strip the xml tags from the resume.
        text_data = stripxml(xml)
        i = 0
        flag = 0
        # From the resume text remove all the words matching the current job title as we do not want any
        # information about the current job in the resume text.
        if current_job_title:
            text_data = text_data.replace(current_job_title[0].strip(), '')
            # NOTE(review): in this variant every title is appended to
            # job_titles, even when the resume is not saved below --
            # the sibling variant appends only saved resumes; confirm
            # which behavior is wanted.
            job_titles.append(current_job_title[0].strip())
            if current_job_title[0].strip() in top_jobs:
                flag = 1
                job_count[current_job_title[0].strip()] += 1
        # Only save the resumes whose current job title is present in the top 100 jobs,
        # capped at 250 resumes per title in this variant.
        if flag == 1 and job_count[current_job_title[0].strip()] < 250:
            if current_job_title:
                directory = paths['main_source_directory'] + '/' + paths['plaintext_data_directory'] + '/'
                f = open(directory + '%s' % fname[:-4] + '_plaintext.txt', 'w')
                f.write(text_data)
                f.close()
                labels_list.append((fname[:-4] + '_plaintext.txt', current_job_title[0].strip().replace('\n', '')))
        return names, job_titles, labels_list
    except:
        # NOTE(review): bare except silently swallows everything,
        # including KeyboardInterrupt -- consider `except Exception:`.
        return names, job_titles, labels_list
# Walk the resume XML corpus and extract per-school university/degree
# fields for each unique candidate.
# NOTE(review): original indentation was lost and the fragment is
# truncated -- the `except` for this try and the use of the extracted
# fields lie outside this chunk; structure is a reconstruction.
for root, dirs, files in os.walk(xml_directory, topdown=False):
    for f in files:
        try:
            xml = etree.parse(xml_directory + '/' + f)
            # Candidate name, used to process each resume only once.
            name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath(
                '//surname/text()')[0]
            if name not in names:
                names.append(name)
                education = xml.xpath('//education')[0]
                schools = education.xpath('//school')
                resume_id = f.split('.')[0]
                temp_univ_major_list = []
                for school in schools:
                    school_text = stripxml(etree.tostring(school))
                    #institution = school.xpath('institution/text()')[0]
                    institution = extract_univ(school_text, univ_dict, univ_normalize)
                    # Strip punctuation from the extracted name.
                    institution = re.sub('[^A-Za-z0-9 ]+', ' ', str(institution))
                    institution = re.sub(' ', ' ', str(institution))
                    #print institution
                    # Map known aliases to a canonical university name.
                    # NOTE(review): membership uses institution.lower()
                    # but the lookup uses the unlowered key -- possible
                    # KeyError when case differs; confirm key casing.
                    if institution.lower() in univ_normalize:
                        #print "NORMALIZED"
                        institution = univ_normalize[institution]
                    degree_level = school.xpath('degree/@level')[0]
                    degree = school.xpath('degree/text()')[0]
                    major_code = str(school.xpath('major/@code')[0])
                    major = school.xpath('major/text()')[0]