Example no. 1
0
def extract_features(res_text, xml_text):
    """Build a feature Counter for a resume from its text and XML metadata.

    Word features are stemmed, lower-cased tokens with stopwords and
    tokens of length <= 2 excluded.  When the XML carries them, a
    normalized 'university' and the highest 'degree_level' are added.
    """
    xml_tree = etree.fromstring(xml_text)

    stems = []
    for token in nltk.word_tokenize(res_text.lower()):
        if token in stopwords or len(token) <= 2:
            continue
        stems.append(st.stem(token))
    features = Counter(stems)

    institutions = xml_tree.xpath('//institution/text()')
    if institutions:
        normalized = extract_univ(institutions[0], univ_dict, univ_normalize)
        if normalized:
            features['university'] = normalized
        else:
            # Fall back to the raw institution string from the XML.
            features['university'] = institutions[0]

    levels = xml_tree.xpath('//degree/@level')
    if levels:
        features['degree_level'] = max(levels)

    return features
def analyze():
    # global results_json
    global university
    global tree_json
    if request.method:
        # Get and save file from browser upload
        files = request.files['file']
        if files:
            filename = str(files.filename)
            extension = filename.rsplit('.', 1)[1]
            filename_without_extension = filename.rsplit('.', 1)[0]
            files.save(os.path.join(iHire.config['UPLOAD_FOLDER'], filename))

            if extension == 'pdf':
                text_from_pdf = extract_text_from_pdf(filename)
                text_from_pdf = text_from_pdf.replace('\xc2\xa0', ' ')
                with open(filename_without_extension + '.txt', 'wb') as write_file:
                    write_file.write(text_from_pdf)

                textfile_name = filename_without_extension + '.txt'
            else:
                textfile_name = filename

            university = extract_univ(open(textfile_name).read(), univ_dict, univ_normalize)
            print filename

            # create_data_for_graph(university, "", skills_employer, univ_major_number, major_code_lookup)

            tree_json = create_data_for_tree(
                university,
                "",
                skills_employer_tree,
                univ_major_number,
                major_code_lookup,
                employer_second_degree_tree
            )

            resume_text = [open(textfile_name).read()]
            predicted_decision = model.decision_function(resume_text)

            top_predictions, normalized_prediction_score = get_top_predictions(predicted_decision)

            out = dict()

            skills_map_with_percent_list = []
            titles = sorted(skills_map_with_percent.keys())
            for title in titles:
                temp_skill_map = dict()
                temp_skill_map[title] = skills_map_with_percent[title]
                skills_map_with_percent_list.append(temp_skill_map)

            out["university"] = university
            out["skills_map"] = skills_map_with_percent_list
            out["titles"] = titles
            out["candidate_skills"] = dict()
            out["title_data"] = dict()

            try:
                tokens = nltk.word_tokenize(resume_text[0].lower())
            except UnicodeDecodeError:
                tokens = nltk.word_tokenize(resume_text[0].decode('utf-8').lower())

            skill_score = []
            for pred in top_predictions:
                try:
                    top15 = skills_map_with_percent[title_title_map[pred]]["skills"][:15]
                except KeyError:
                    top15 = []
                temp_skill_list = [t for t in top15 if len(t) > 1 and t.lower() in tokens]

                out["candidate_skills"][title_title_map[pred]] = temp_skill_list
                out["title_data"][title_title_map[pred]] = titles_data[title_title_map[pred]]
                skill_score.append(int(len(temp_skill_list) / 15.0 * 100.0))

            final_score = [sum(x)/2 for x in zip(normalized_prediction_score, skill_score)]

            final_titles_list = []
            sorted_score_indexes = [i[0] for i in sorted(enumerate(final_score), key=lambda x:x[1], reverse=True)]

            for s in sorted_score_indexes:
                final_titles_list.append(title_title_map[top_predictions[s]])

            final_score_sorted = sorted(final_score, reverse=True)

            out["final_prediction_list"] = final_titles_list
            out["final_score_sorted"] = final_score_sorted
            out["tree_json"] = json.dumps(tree_json)

            print final_titles_list[:5]
            print final_score_sorted[:5]

            if os.path.isfile(textfile_name):
                    os.remove(textfile_name)

            if os.path.isfile(filename):
                    os.remove(filename)

            # results_json = OrderedDict(out)
            return json.dumps(OrderedDict(out))
    for f in files:

        try:
            xml = etree.parse(xml_directory + '/' + f)
            name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath('//surname/text()')[0]
            if name not in names:
                names.append(name)
                education = xml.xpath('//education')[0]
                schools = education.xpath('//school')

                resume_id = f.split('.')[0]
                temp_univ_major_list = []
                for school in schools:
                    school_text = stripxml(etree.tostring(school))
                    #institution = school.xpath('institution/text()')[0]
                    institution = extract_univ( school_text, univ_dict, univ_normalize)
                    institution = re.sub ('[^A-Za-z0-9 ]+',' ',str(institution))
                    institution = re.sub ('  ',' ',str(institution))
                    #print institution
                    if institution.lower() in univ_normalize:
                        #print "NORMALIZED"
                        institution = univ_normalize[institution]

                    degree_level = school.xpath('degree/@level')[0]
                    degree = school.xpath('degree/text()')[0]
                    major_code = str(school.xpath('major/@code')[0])
                    major = school.xpath('major/text()')[0]

                    temp_univ_major_list.append(str(institution + '_' + major_code).lower())
                    if str(institution + '_' + major_code).lower() not in univ_major_list:
                        counter += 1
def analyze():
    """Handle a resume upload: save the file, extract its text, predict
    matching job titles with the model, and return a JSON payload with
    university, per-title skill matches, tree data and sorted scores.

    NOTE(review): relies on module-level names (request, iHire, model,
    nltk, the skills/title lookup dicts) not visible in this chunk.
    """
    # global results_json
    global university
    global tree_json
    if request.method:
        # Get and save file from browser upload
        files = request.files['file']
        if files:
            filename = str(files.filename)
            # NOTE(review): rsplit('.', 1)[1] raises IndexError for a
            # dot-less filename — assumes uploads always have an extension.
            extension = filename.rsplit('.', 1)[1]
            filename_without_extension = filename.rsplit('.', 1)[0]
            files.save(os.path.join(iHire.config['UPLOAD_FOLDER'], filename))

            if extension == 'pdf':
                text_from_pdf = extract_text_from_pdf(filename)
                # Replace UTF-8-encoded non-breaking spaces with spaces.
                text_from_pdf = text_from_pdf.replace('\xc2\xa0', ' ')
                with open(filename_without_extension + '.txt',
                          'wb') as write_file:
                    write_file.write(text_from_pdf)

                textfile_name = filename_without_extension + '.txt'
            else:
                textfile_name = filename

            # NOTE(review): the text file is opened twice (here and below
            # for resume_text) and the handles are never closed.
            university = extract_univ(
                open(textfile_name).read(), univ_dict, univ_normalize)
            print filename

            # create_data_for_graph(university, "", skills_employer, univ_major_number, major_code_lookup)

            tree_json = create_data_for_tree(university, "",
                                             skills_employer_tree,
                                             univ_major_number,
                                             major_code_lookup,
                                             employer_second_degree_tree)

            # The model expects a list of documents; wrap the single resume.
            resume_text = [open(textfile_name).read()]
            predicted_decision = model.decision_function(resume_text)

            top_predictions, normalized_prediction_score = get_top_predictions(
                predicted_decision)

            out = dict()

            # One {title: skill-map} entry per title, alphabetical order.
            skills_map_with_percent_list = []
            titles = sorted(skills_map_with_percent.keys())
            for title in titles:
                temp_skill_map = dict()
                temp_skill_map[title] = skills_map_with_percent[title]
                skills_map_with_percent_list.append(temp_skill_map)

            out["university"] = university
            out["skills_map"] = skills_map_with_percent_list
            out["titles"] = titles
            out["candidate_skills"] = dict()
            out["title_data"] = dict()

            try:
                tokens = nltk.word_tokenize(resume_text[0].lower())
            except UnicodeDecodeError:
                # Byte string that is not ASCII: decode as UTF-8 first.
                tokens = nltk.word_tokenize(
                    resume_text[0].decode('utf-8').lower())

            # For each predicted title, score how many of its top-15 skills
            # appear in the resume tokens (expressed as a percentage).
            skill_score = []
            for pred in top_predictions:
                try:
                    top15 = skills_map_with_percent[
                        title_title_map[pred]]["skills"][:15]
                except KeyError:
                    top15 = []
                temp_skill_list = [
                    t for t in top15 if len(t) > 1 and t.lower() in tokens
                ]

                out["candidate_skills"][
                    title_title_map[pred]] = temp_skill_list
                out["title_data"][title_title_map[pred]] = titles_data[
                    title_title_map[pred]]
                skill_score.append(int(len(temp_skill_list) / 15.0 * 100.0))

            # Final score per title: mean of model score and skill score.
            final_score = [
                sum(x) / 2
                for x in zip(normalized_prediction_score, skill_score)
            ]

            # Indexes of final_score sorted by descending score, used to
            # emit titles in best-match-first order.
            final_titles_list = []
            sorted_score_indexes = [
                i[0] for i in sorted(
                    enumerate(final_score), key=lambda x: x[1], reverse=True)
            ]

            for s in sorted_score_indexes:
                final_titles_list.append(title_title_map[top_predictions[s]])

            final_score_sorted = sorted(final_score, reverse=True)

            out["final_prediction_list"] = final_titles_list
            out["final_score_sorted"] = final_score_sorted
            out["tree_json"] = json.dumps(tree_json)

            print final_titles_list[:5]
            print final_score_sorted[:5]

            # Clean up the uploaded file and any derived text file.
            if os.path.isfile(textfile_name):
                os.remove(textfile_name)

            if os.path.isfile(filename):
                os.remove(filename)

            # results_json = OrderedDict(out)
            return json.dumps(OrderedDict(out))
Example no. 5
0
        try:
            xml = etree.parse(xml_directory + '/' + f)
            name = xml.xpath('//givenname/text()')[0] + ' ' + xml.xpath(
                '//surname/text()')[0]
            if name not in names:
                names.append(name)
                education = xml.xpath('//education')[0]
                schools = education.xpath('//school')

                resume_id = f.split('.')[0]
                temp_univ_major_list = []
                for school in schools:
                    school_text = stripxml(etree.tostring(school))
                    #institution = school.xpath('institution/text()')[0]
                    institution = extract_univ(school_text, univ_dict,
                                               univ_normalize)
                    institution = re.sub('[^A-Za-z0-9 ]+', ' ',
                                         str(institution))
                    institution = re.sub('  ', ' ', str(institution))
                    #print institution
                    if institution.lower() in univ_normalize:
                        #print "NORMALIZED"
                        institution = univ_normalize[institution]

                    degree_level = school.xpath('degree/@level')[0]
                    degree = school.xpath('degree/text()')[0]
                    major_code = str(school.xpath('major/@code')[0])
                    major = school.xpath('major/text()')[0]

                    temp_univ_major_list.append(
                        str(institution + '_' + major_code).lower())