def extract_univ(data, univ_dict, univ_normalize):
    """
    Extract the university name from resume text.

    Parameters:
    -----------
    data -- string
        Resume text

    univ_dict -- dict
        University lookup dictionary

    univ_normalize -- dict
        University alias dictionary

    Returns:
    --------
    university -- string
        Matched university name, or an empty string if no match is found

    """
    try:
        data = stripxml(str(data))

        data = data.lower()
        data = data.replace('\xc2\xa0', ' ')  # replace UTF-8 non-breaking spaces
        #print data
        data = re.sub('[^A-Za-z0-9 ]+', ' ', data)
        data = re.sub(' +', ' ', data)  # collapse runs of spaces left by the substitution

        # Limit the search to the 150 characters following the word 'education'.
        if 'education' in data:
            parted = data.split('education')[1]
            second = parted[:150]
        else:
            second = data
        # Scan word n-grams from longest (10 words) down to shortest (2 words) and
        # return the first match found in either lookup dictionary.
        n = 10
        while n > 1:
            for ngram in ngrams(second, n):
                if ngram.lower() in univ_normalize:
                    return univ_normalize[ngram.lower()]
                elif ngram.lower() in univ_dict:
                    return ngram.title()
            n -= 1
        return ""
    except (UnicodeEncodeError, UnicodeDecodeError):
        return data
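# extract_univ depends on an ngrams helper that is not shown in these snippets.
# A minimal sketch is given below, assuming it yields space-joined word n-grams
# as strings; the original helper may differ (for example, nltk.util.ngrams
# returns tuples of tokens instead).
def ngrams(text, n):
    """Yield space-joined word n-grams of length n (hypothetical helper)."""
    words = text.split()
    for i in range(len(words) - n + 1):
        yield ' '.join(words[i:i + n])

# Hypothetical usage of extract_univ with toy lookup dictionaries; the real
# univ_dict and univ_normalize are presumably built from the project's
# university lookup files. Assuming stripxml passes plain text through
# unchanged, this prints "University Of Southern California".
sample_text = 'Education B S Computer Science University of Southern California 2012'
sample_univ_dict = {'university of southern california': 1}
sample_univ_normalize = {'usc': 'University of Southern California'}
print(extract_univ(sample_text, sample_univ_dict, sample_univ_normalize))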
def extract_from_resume(data):
    """
    Extract university details from resume text by matching against a CSV list of university names.

    Parameters:
    -----------
    data -- string
        Resume text

    Returns:
    --------
    out -- dict
        Dictionary with the education text window ('split2') and, if matched, the university name ('univ')

    """
    out = {}
    data = stripxml(str(data))
    data = data.lower()
    data = re.sub('[^A-Za-z0-9 ]+', ' ', data)
    data = re.sub(' +', ' ', data)  # collapse runs of spaces left by the substitution
    if 'education' in data:
        parted = data.split('education')[1]
        second = parted[:150]
        out['split2'] = second
    else:
        second = data
    with open("static/UniqueFBUnivNames.csv", 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            row = re.sub('[^A-Za-z0-9 ]+', ' ', str(row))
            row = re.sub('  ', ' ', str(row))
            # Only accept multi-word matches to reduce false positives on single common words.
            if str(row).lower() in second and len(str(row).split()) > 1:
                out['univ'] = str(row)
                return out
    return out
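# Hypothetical usage of extract_from_resume. It requires the CSV file
# static/UniqueFBUnivNames.csv (one university name per row), which belongs to
# the original project and is assumed to exist here.
sample_resume = 'John Doe Education Bachelor of Science Stanford University 2010'
details = extract_from_resume(sample_resume)
# details['split2'] holds up to 150 characters following the word "education";
# details['univ'] is set only when a multi-word CSV entry appears in that window.
print(details)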
for root, dirs, files in os.walk(xml_directory, topdown=False):
    for f in files:

        try:
            xml = etree.parse(xml_directory + '/' + f)
            name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]
            if name not in names:
                names.append(name)
                education = xml.xpath('//education')[0]
                schools = education.xpath('.//school')  # search within the education element

                resume_id = f.split('.')[0]
                temp_univ_major_list = []
                for school in schools:
                    school_text = stripxml(etree.tostring(school))
                    #institution = school.xpath('institution/text()')[0]
                    institution = extract_univ(school_text, univ_dict, univ_normalize)
                    institution = re.sub('[^A-Za-z0-9 ]+', ' ', str(institution))
                    institution = re.sub(' +', ' ', str(institution))
                    #print institution
                    if institution.lower() in univ_normalize:
                        #print "NORMALIZED"
                        institution = univ_normalize[institution]

                    degree_level = school.xpath('degree/@level')[0]
                    degree = school.xpath('degree/text()')[0]
                    major_code = str(school.xpath('major/@code')[0])
                    major = school.xpath('major/text()')[0]

                    temp_univ_major_list.append(str(institution + '_' + major_code).lower())
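# stripxml is another external helper used throughout these snippets. A minimal
# sketch is shown below, assuming it simply drops markup and returns the text
# content; the project's real implementation may normalize whitespace differently.
import re

def stripxml(data):
    """Remove XML/HTML tags from a string and return the text (hypothetical)."""
    return re.sub(r'<[^>]+>', ' ', data)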
def clean_data_and_extract_job_titles(fname, paths, names, job_titles, labels_list):
    """
    Function to clean data and extract job titles from resume.

    Parameters:
    -----------
    fname - string.
        Name of the resume file

    paths - dict
        Dict containing paths of source directories

    names - list.
        Extracted candidate names from resume. Used to remove duplicate resumes.

    job_titles - list.
        Titles extracted from resume

    labels_list - list.
        Titles that will be used as labels for classifier.

    Returns:
    --------
    names - list.
        Extracted candidate names from resume. Used to remove duplicate resumes.

    job_titles - list.
        Titles extracted from resume

    labels_list - list.
        Titles that will be used as labels for classifier.

    """

    source_dir = paths['main_source_directory'] + '/' + paths['xml_data_directory']

    try:
        xml = etree.parse(source_dir + '/' + fname)

        # Extract the current job title, and current job element from xml
        current_job_title = xml.xpath('//job[@end = "present"]/title/text()')
        current_job_title = normalize_job_titles(current_job_title)
        current_job = xml.xpath('//job[@end = "present"]')

        # Extract the contact information from xml.
        contact = xml.xpath('//contact')

        # Since there are many duplicate resumes in the data, filter out the resumes based on candidate name.
        # Extract the candidate name from the resume
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]

        # Avoid duplicate resumes by only choosing resumes with unique names
        if name not in names:
            names.append(name)
        else:
            return names, job_titles, labels_list

        # Remove the candidate contact information from the resume.
        if contact:
            contact[0].getparent().remove(contact[0])

        # Remove the current job section from the resume, since the current job title
        # is used as the label that the algorithm will predict.
        if current_job:
            for job in current_job:
                job.getparent().remove(job)

            # Convert xml to string.
            xml = etree.tostring(xml, pretty_print=True)

            # Strip the xml tags from the resume.
            text_data = stripxml(xml)
            flag = 0

            # Remove the current job title from the resume text so that no information
            # about the current job remains in the text.
            if current_job_title:
                text_data = text_data.replace(current_job_title[0].strip(), '')
                if current_job_title[0].strip() in top_jobs:
                    flag = 1
                    job_count[current_job_title[0].strip()] += 1

        # Only save resumes whose current job title is among the top jobs, capped at 300 resumes per title.
        if flag == 1 and job_count[current_job_title[0].strip()] < 300:

            if current_job_title:
                job_titles.append(current_job_title[0].strip())
                directory = paths['main_source_directory'] + '/' + paths['plaintext_data_directory'] + '/'
                with open(directory + fname[:-4] + '_plaintext.txt', 'w') as f:
                    f.write(text_data)

                labels_list.append((fname[:-4] + '_plaintext.txt', current_job_title[0].strip().replace('\n', '')))

        return names, job_titles, labels_list
    except Exception:
        return names, job_titles, labels_list
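# A hedged sketch of how clean_data_and_extract_job_titles might be driven over
# a directory of resume XML files. The paths keys mirror what the function
# expects; top_jobs, job_count, and the directory layout below are placeholders,
# not the project's actual configuration, and the plaintext output directory is
# assumed to already exist. normalize_job_titles and stripxml are assumed to be
# importable from the project.
import os
from collections import defaultdict

paths = {
    'main_source_directory': 'data',
    'xml_data_directory': 'xml',
    'plaintext_data_directory': 'plaintext',
}
top_jobs = ['software engineer', 'project manager']  # placeholder list of top titles
job_count = defaultdict(int)

names, job_titles, labels_list = [], [], []
xml_dir = paths['main_source_directory'] + '/' + paths['xml_data_directory']
for fname in os.listdir(xml_dir):
    names, job_titles, labels_list = clean_data_and_extract_job_titles(
        fname, paths, names, job_titles, labels_list)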
def clean_data_and_extract_job_titles(fname, paths, names, job_titles, labels_list):
    """
    Function to clean data and extract job titles from resume.

    Parameters:
    -----------
    fname - string.
        Name of the resume file

    paths - dict
        Dict containing paths of source directories

    names - list.
        Extracted candidate names from resume. Used to remove duplicate resumes.

    job_titles - list.
        Titles extracted from resume

    labels_list - list.
        Titles that will be used as labels for classifier.

    Returns:
    --------
    names - list.
        Extracted candidate names from resume. Used to remove duplicate resumes.

    job_titles - list.
        Titles extracted from resume

    labels_list - list.
        Titles that will be used as labels for classifier.

    """
    source_dir = paths['main_source_directory'] + '/' + paths['xml_data_directory']
    xml = etree.parse(source_dir + '/' + fname)

    # Extract the current job title, and current job element from xml
    current_job_title = xml.xpath('//job[@end = "present"]/title/text()')
    current_job_title = normalize_job_titles(current_job_title)
    current_job = xml.xpath('//job[@end = "present"]')

    # Extract the contact information from xml.
    contact = xml.xpath('//contact')

    try:
        # Since there are many duplicate resumes in the data, filter out the resumes based on candidate name.
        # Extract the candidate name from the resume
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]

        # Avoid duplicate resumes by only choosing resumes with unique names
        if name not in names:
            names.append(name)
        else:
            return names, job_titles, labels_list

        # Remove the candidate contact information from the resume.
        if contact:
            contact[0].getparent().remove(contact[0])

        # Remove the current job section from the resume, since the current job title
        # is used as the label that the algorithm will predict.
        if current_job:
            for job in current_job:
                job.getparent().remove(job)

            # Convert xml to string.
            xml = etree.tostring(xml, pretty_print=True)

            # Strip the xml tags from the resume.
            text_data = stripxml(xml)
            flag = 0

            # Remove the current job title from the resume text so that no information
            # about the current job remains in the text.
            if current_job_title:
                text_data = text_data.replace(current_job_title[0].strip(), '')
                job_titles.append(current_job_title[0].strip())
                if current_job_title[0].strip() in top_jobs:
                    flag = 1
                    job_count[current_job_title[0].strip()] += 1

        # Only save resumes whose current job title is among the top jobs, capped at 250 resumes per title.
        if flag == 1 and job_count[current_job_title[0].strip()] < 250:

            if current_job_title:
                directory = paths['main_source_directory'] + '/' + paths['plaintext_data_directory'] + '/'
                with open(directory + fname[:-4] + '_plaintext.txt', 'w') as f:
                    f.write(text_data)

                labels_list.append((fname[:-4] + '_plaintext.txt', current_job_title[0].strip().replace('\n', '')))

        return names, job_titles, labels_list
    except Exception:
        return names, job_titles, labels_list