def xml_features(data):
    """
    Extract details from selected xml tags
    Strip the xml tags from the xml to make it plaintext

    Parameters:
    -----------
    data -- parsed xml document or element
        Resume xml. It must provide ``.xpath()`` and be accepted by
        ``etree.tostring`` (a plain string will not work).

    Returns:
    --------
    features -- dict
        Dictionary with the keys "jobs", "employers", "institution" and
        "degree" (each the list returned by the matching xpath query) and
        "raw_resume" (the serialized resume with every ``<...>`` tag removed).
    """
    # NOTE: job titles are stored exactly as extracted; no normalization is
    # applied to them here.
    features = {}
    features["jobs"] = data.xpath('//job/title/text()')
    features["employers"] = data.xpath('//job/employer/text()')
    features["institution"] = data.xpath('//education/school/institution/text()')
    features["degree"] = data.xpath('//education/school/degree/text()')

    serialized = etree.tostring(data, pretty_print=True)
    # On Python 3 the serializer hands back bytes, which a text pattern
    # cannot be applied to; on Python 2 the value is already a ``str``.
    if not isinstance(serialized, str):
        serialized = serialized.decode('utf-8')

    features["raw_resume"] = re.sub(r'<.*?>', '', serialized)
    return features
def xml_features(data):
    """
    Extract details from selected xml tags
    Strip the xml tags from the xml to make it plaintext

    Parameters:
    -----------
    data -- parsed xml document or element
        Resume xml; it is queried with ``.xpath()`` and serialized with
        ``etree.tostring``, so a raw string is not accepted.

    Returns:
    --------
    result -- dict
        "jobs", "employers", "institution", "degree": lists returned by the
        corresponding xpath queries.
        "raw_resume": the serialized xml with all ``<...>`` tags stripped.
    """
    queries = (
        ("jobs", '//job/title/text()'),
        ("employers", '//job/employer/text()'),
        ("institution", '//education/school/institution/text()'),
        ("degree", '//education/school/degree/text()'),
    )
    # Titles are kept as found in the document (not normalized).
    result = {}
    for key, expression in queries:
        result[key] = data.xpath(expression)

    tag_pattern = re.compile(r'<.*?>')
    markup = etree.tostring(data, pretty_print=True)
    if not isinstance(markup, str):
        # Python 3: tostring() yields bytes; decode so the text pattern applies.
        markup = markup.decode('utf-8')

    result["raw_resume"] = tag_pattern.sub('', markup)
    return result
def test_different_permutations_of_same_title_should_be_properly_normalized():
    """Every casing / word-order permutation must normalize to one title."""
    expected = "director of finance"
    permutations = [
        "director of finance",
        "Director of Finance",
        "DIRECTOR OF FINANCE",
        "finance director",
        "Finance Director",
        "FINANCE DIRECTOR",
    ]

    for actual in normalize_job_titles(permutations):
        assert_equals(expected, actual)
Esempio n. 4
0
def test_different_permutations_of_same_title_should_be_properly_normalized():
    """Check that re-cased and re-ordered spellings collapse to the same title."""
    # Spellings in "X of Y" order, in three casings.
    of_order = ["director of finance", "Director of Finance", "DIRECTOR OF FINANCE"]
    # The same title with the words swapped, in three casings.
    swapped_order = ["finance director", "Finance Director", "FINANCE DIRECTOR"]

    results = normalize_job_titles(of_order + swapped_order)

    target = "director of finance"
    for result in results:
        assert_equals(target, result)
def test_different_variants_of_same_title_should_be_properly_normalized():
    """Abbreviated spellings of a title must all expand to the same full title."""
    expected = "senior project manager"
    abbreviated = [
        "sr project mgr",
        "sr project mgr.",
        "sr project mngr",
        "sr project mngr.",
        "sr. project mgr",
        "sr. project mgr.",
        "sr. project mngr",
        "sr. project mngr.",
    ]

    for actual in normalize_job_titles(abbreviated):
        assert_equals(expected, actual)
Esempio n. 6
0
def test_different_variants_of_same_title_should_be_properly_normalized():
    """Check that dotted / undotted abbreviations normalize identically."""
    # Variants starting with the undotted "sr" prefix.
    undotted_prefix = [
        "sr project mgr", "sr project mgr.", "sr project mngr", "sr project mngr.",
    ]
    # Variants starting with the dotted "sr." prefix.
    dotted_prefix = [
        "sr. project mgr", "sr. project mgr.", "sr. project mngr", "sr. project mngr.",
    ]

    results = normalize_job_titles(undotted_prefix + dotted_prefix)

    target = "senior project manager"
    for result in results:
        assert_equals(target, result)
def test_actual_titles_list_and_normalized_titles_list_should_be_of_same_length():
    """Normalizing must return exactly one entry per input title."""
    titles = ["software developer", "ceo", "business analyst", "vp"]
    input_count = len(titles)

    output_count = len(normalize_job_titles(titles))

    assert_equals(input_count, output_count)
def test_software_developer_should_be_normalized_as_software_engineer():
    """A lone "software developer" title must come back as "software engineer"."""
    result = normalize_job_titles(["software developer"])
    assert_equals(result, ["software engineer"])
Esempio n. 9
0
def extract_top_jobs(source_dir):
    """
    Function to extract top jobs

    Walks ``source_dir`` for files ending in ``.txt``, parses each one as xml
    and collects the titles of every job whose ``end`` attribute is
    "present". Only the first resume seen for a given candidate name is used.
    Summary statistics are printed along the way.

    Args:
        source_dir -- path to the source xml files.

    Returns:
        list -- the (at most 200) most common normalized job titles, most
        frequent first.
    """
    # os.walk also yields files from sub-directories, so keep the directory
    # each file was found in instead of assuming it sits in source_dir.
    files = [
        os.path.join(dirpath, f)
        for (dirpath, dirnames, filenames) in os.walk(source_dir)
        for f in filenames if f[-4:] == '.txt'
    ]

    seen_names = set()
    job_titles = []

    for path in files:
        # Create an xml parser object
        xml = etree.parse(path)

        # Extract the current job title(s) from xml
        current_job_title = xml.xpath('//job[@end = "present"]/title/text()')

        # Since there are many duplicate resumes in the data, filter out the
        # resumes based on candidate name.
        try:
            name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath(
                '//surname/text()')[0]
        except IndexError:
            # Resume without a given name or surname: cannot be de-duplicated,
            # so it is skipped.
            continue

        if name in seen_names:
            continue
        seen_names.add(name)

        job_titles.extend(current_job_title)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(job_titles[:10])
    print(len(job_titles))
    print(len(set(job_titles)))
    print(Counter(job_titles).most_common(200))
    normalized_titles = normalize_job_titles(job_titles)
    print(len(normalized_titles))
    print(len(set(normalized_titles)))
    top_normalized_jobs_counter = Counter(normalized_titles).most_common(200)
    print(top_normalized_jobs_counter)

    # most_common() yields (title, count) pairs; only the titles are returned.
    return [title for title, _count in top_normalized_jobs_counter]
def clean_data_and_extract_job_titles(fname, paths, names, job_titles, labels_list):
    """
    Function to clean data and extract job titles from resume.

    The resume is parsed, its contact section and current-job sections are
    removed, the xml tags are stripped and the current job title is erased
    from the remaining text. The plaintext is written to disk only when the
    title is in ``top_jobs`` and its running count in ``job_count`` is still
    below 300 after being incremented for this resume.

    ``top_jobs`` and ``job_count`` are not parameters; they are looked up in
    the enclosing module scope. NOTE(review): ``job_count`` presumably maps a
    title to the number of resumes seen for it -- confirm against its
    definition.

    Any failure while processing (unparsable file, missing candidate name,
    write error, ...) is swallowed and the resume is skipped.

    Parameters:
    -----------
    fname - string.
        Name of the resume file

    paths - dict
        Dict containing paths of source directories. The keys
        'main_source_directory', 'xml_data_directory' and
        'plaintext_data_directory' are read.

    names - list.
        Candidate names seen so far. Used to remove duplicate resumes.
        Appended to in place.

    job_titles - list.
        Titles extracted from resume. Appended to in place.

    labels_list - list.
        (plaintext file name, title) pairs that will be used as labels for
        classifier. Appended to in place.

    Returns:
    --------
    names - list.
        The same ``names`` object that was passed in.

    job_titles - list.
        The same ``job_titles`` object that was passed in.

    labels_list - list.
        The same ``labels_list`` object that was passed in.

    """

    source_dir = paths['main_source_directory'] + '/' + paths['xml_data_directory']

    try:
        xml = etree.parse(source_dir + '/' + fname)

        # Extract the current job title, and current job element from xml
        current_job_title = xml.xpath('//job[@end = "present"]/title/text()')
        current_job_title = normalize_job_titles(current_job_title)
        current_job = xml.xpath('//job[@end = "present"]')

        # Extract the contact information from xml.
        contact = xml.xpath('//contact')

        # Since there are many duplicate resumes in the data, filter out the resumes based on candidate name.
        # Extract the candidate name from the resume
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]

        # Avoid duplicate resumes by only choosing resumes with unique names
        if name in names:
            return names, job_titles, labels_list
        names.append(name)

        # Remove the candidate contact information from the resume.
        if contact:
            contact[0].getparent().remove(contact[0])

        # Without a current job (or a title for it) there is no label to
        # predict, so there is nothing to save for this resume.
        if not current_job or not current_job_title:
            return names, job_titles, labels_list

        # Remove the current job section(s) from the resume as we will be using current job title as label and
        # use our algorithm to predict it.
        for job in current_job:
            job.getparent().remove(job)

        # Convert xml to string and strip the xml tags from the resume.
        text_data = stripxml(etree.tostring(xml, pretty_print=True))

        # From the resume text remove all the words matching the current job title as we do not want any
        # information about the current job in the resume text.
        title = current_job_title[0].strip()
        text_data = text_data.replace(title, '')

        # Only save the resumes whose current job title is one of the top jobs
        if title not in top_jobs:
            return names, job_titles, labels_list

        # The count is incremented before the cap is checked, so the resume
        # that reaches the cap is itself not saved.
        job_count[title] += 1
        if job_count[title] >= 300:
            return names, job_titles, labels_list

        job_titles.append(title)
        directory = paths['main_source_directory'] + '/' + paths['plaintext_data_directory'] + '/'
        with open(directory + '%s' % fname[:-4] + '_plaintext.txt', 'w') as f:
            f.write(text_data)

        labels_list.append((fname[:-4] + '_plaintext.txt', title.replace('\n', '')))

        return names, job_titles, labels_list
    except Exception:
        # Best effort: a resume that cannot be processed is skipped, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return names, job_titles, labels_list
def extract_top_jobs(source_dir):
    """
    Function to extract top jobs

    Every ``.txt`` file found under ``source_dir`` is parsed as xml. For the
    first resume seen per candidate name, the titles of all jobs whose
    ``end`` attribute is "present" are collected. Counts of the raw and the
    normalized titles are printed.

    Args:
        source_dir -- path to the source xml files.

    Returns:
        list -- normalized job titles ordered from most to least common,
        limited to the 200 most common.
    """
    # Get the files from the source directory. The containing directory is
    # kept with each name because os.walk also visits sub-directories.
    files = []
    for dirpath, _dirnames, filenames in os.walk(source_dir):
        for filename in filenames:
            if filename.endswith('.txt'):
                files.append(os.path.join(dirpath, filename))

    processed_names = set()
    job_titles = []

    # From each xml file extract the current job titles.
    for xml_path in files:
        # Create an xml parser object
        xml = etree.parse(xml_path)

        # Extract the current job title(s) from xml
        current_job_title = xml.xpath('//job[@end = "present"]/title/text()')

        given_names = xml.xpath('//givenname/text()')
        surnames = xml.xpath('//surname/text()')
        if not given_names or not surnames:
            # No candidate name to de-duplicate on: skip this resume.
            continue

        # Since there are many duplicate resumes in the data, filter out the
        # resumes based on candidate name.
        name = given_names[0] + ' ' + surnames[0]
        if name not in processed_names:
            processed_names.add(name)
            job_titles.extend(current_job_title)

    # print(...) with one argument works on both Python 2 and Python 3.
    print(job_titles[:10])
    print(len(job_titles))
    print(len(set(job_titles)))
    print(Counter(job_titles).most_common(200))
    normalized_titles = normalize_job_titles(job_titles)
    print(len(normalized_titles))
    print(len(set(normalized_titles)))
    top_normalized_jobs_counter = Counter(normalized_titles).most_common(200)
    print(top_normalized_jobs_counter)

    # Drop the counts and keep the titles in ranking order.
    top_normalized_jobs = [entry[0] for entry in top_normalized_jobs_counter]

    return top_normalized_jobs
def clean_data_and_extract_job_titles(fname, paths, names, job_titles, labels_list):
    """
    Function to clean data and extract job titles from resume.

    The resume is parsed, its contact section and current-job sections are
    removed, the xml tags are stripped and the current job title is erased
    from the remaining text. The title is always recorded in ``job_titles``;
    the plaintext is written to disk only when the title is in ``top_jobs``
    and its running count in ``job_count`` is still below 250 after being
    incremented for this resume.

    ``top_jobs`` and ``job_count`` are not parameters; they are looked up in
    the enclosing module scope. NOTE(review): ``job_count`` presumably maps a
    title to the number of resumes seen for it -- confirm against its
    definition.

    Errors raised while parsing the file or normalizing its titles propagate
    to the caller; failures after that point (missing candidate name, write
    error, ...) are swallowed and the resume is skipped.

    Parameters:
    -----------
    fname - string.
        Name of the resume file

    paths - dict
        Dict containing paths of source directories. The keys
        'main_source_directory', 'xml_data_directory' and
        'plaintext_data_directory' are read.

    names - list.
        Candidate names seen so far. Used to remove duplicate resumes.
        Appended to in place.

    job_titles - list.
        Titles extracted from resume. Appended to in place.

    labels_list - list.
        (plaintext file name, title) pairs that will be used as labels for
        classifier. Appended to in place.

    Returns:
    --------
    names - list.
        The same ``names`` object that was passed in.

    job_titles - list.
        The same ``job_titles`` object that was passed in.

    labels_list - list.
        The same ``labels_list`` object that was passed in.

    """
    source_dir = paths['main_source_directory'] + '/' + paths['xml_data_directory']
    xml = etree.parse(source_dir + '/' + fname)

    # Extract the current job title, and current job element from xml
    current_job_title = xml.xpath('//job[@end = "present"]/title/text()')
    current_job_title = normalize_job_titles(current_job_title)
    current_job = xml.xpath('//job[@end = "present"]')

    # Extract the contact information from xml.
    contact = xml.xpath('//contact')

    try:
        # Since there are many duplicate resumes in the data, filter out the resumes based on candidate name.
        # Extract the candidate name from the resume
        name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]

        # Avoid duplicate resumes by only choosing resumes with unique names
        if name in names:
            return names, job_titles, labels_list
        names.append(name)

        # Remove the candidate contact information from the resume.
        if contact:
            contact[0].getparent().remove(contact[0])

        # Without a current job (or a title for it) there is no label to
        # predict, so there is nothing to record for this resume.
        if not current_job or not current_job_title:
            return names, job_titles, labels_list

        # Remove the current job section(s) from the resume as we will be using current job title as label and
        # use our algorithm to predict it.
        for job in current_job:
            job.getparent().remove(job)

        # Convert xml to string and strip the xml tags from the resume.
        text_data = stripxml(etree.tostring(xml, pretty_print=True))

        # From the resume text remove all the words matching the current job title as we do not want any
        # information about the current job in the resume text.
        title = current_job_title[0].strip()
        text_data = text_data.replace(title, '')
        job_titles.append(title)

        # Only save the resumes whose current job title is one of the top jobs
        if title not in top_jobs:
            return names, job_titles, labels_list

        # The count is incremented before the cap is checked, so the resume
        # that reaches the cap is itself not saved.
        job_count[title] += 1
        if job_count[title] >= 250:
            return names, job_titles, labels_list

        directory = paths['main_source_directory'] + '/' + paths['plaintext_data_directory'] + '/'
        with open(directory + '%s' % fname[:-4] + '_plaintext.txt', 'w') as f:
            f.write(text_data)

        labels_list.append((fname[:-4] + '_plaintext.txt', title.replace('\n', '')))

        return names, job_titles, labels_list
    except Exception:
        # Best effort: a resume that cannot be processed is skipped, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return names, job_titles, labels_list
Esempio n. 13
0
def test_actual_titles_list_and_normalized_titles_list_should_be_of_same_length():
    """The normalized list must have as many entries as the list passed in."""
    given = ["software developer", "ceo", "business analyst", "vp"]
    produced = normalize_job_titles(given)

    expected_length, actual_length = len(given), len(produced)
    assert_equals(expected_length, actual_length)
Esempio n. 14
0
def test_software_developer_should_be_normalized_as_software_engineer():
    """Check the "software developer" -> "software engineer" mapping."""
    wanted = ["software engineer"]
    got = normalize_job_titles(["software developer"])

    assert_equals(got, wanted)