def get_ies_scores(self):
    """
    parameters: none

    Extracts labelled entities from the true xml labels of the resumes in
    the ies accuracy test folder and from the entity lists predicted for
    the same resumes (annotated with Annotator, tagged with CrfSuite).
    Both sides are tokenised and lower-cased, then compared with
    score_matches. The four scores are printed and also returned, in the
    same order as get_zylon_parser_scores returns them.

    return: edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score
    """
    extractor = Extractor()
    test_folder = self.__ies_accuracy_test

    ies_filenames = extractor.populate_file_names(test_folder)
    ies_filenames = extractor.filter_by_valid_exts(ies_filenames)
    filenames, resume_content = extractor.read_resume_content_tika_api(
        ies_filenames, test_folder)
    # Only the filtered filenames are used below; the filtered content
    # returned alongside them is never read again, so it is discarded.
    filenames, _ = extractor.remove_empty_resumes(filenames, resume_content)
    resume_labels = extractor.read_resume_labels(test_folder, filenames)

    # Ground-truth entities, one entry per labelled resume.
    true_edu_insts = [
        extractor.get_edu_institutions(xml_tree) for xml_tree in resume_labels
    ]
    true_edu_majors = [
        extractor.get_edu_majors(xml_tree) for xml_tree in resume_labels
    ]
    true_emp_names = [
        extractor.get_company_names(xml_tree) for xml_tree in resume_labels
    ]
    true_emp_jtitles = [
        extractor.get_job_titles(xml_tree) for xml_tree in resume_labels
    ]

    cs = CrfSuite()
    cs.load_tagger()
    annotator = Annotator()
    # Each filename is indexed as filename[0] + filename[1] -- presumably a
    # (stem, extension) pair; confirm against Extractor.
    annotated_resumes = [
        annotator.annotate_using_trained_model(
            test_folder + self.__seperator + filename[0] + filename[1])
        for filename in filenames
    ]
    predicted_entity_list = [
        cs.tag_doc(resume) for resume in annotated_resumes
    ]

    # Predicted entities, one entry per tagged resume.
    ies_edu_insts = [
        extractor.get_edu_institutions_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]
    ies_edu_majors = [
        extractor.get_edu_major_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]
    ies_emp_names = [
        extractor.get_company_names_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]
    ies_emp_jtitles = [
        extractor.get_company_position_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]

    tokeniser = Tokeniser()

    def normalise(docs):
        # Same two-step normalisation for truth and prediction so that the
        # comparison is made on lower-cased word tokens on both sides.
        return tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(docs))

    true_edu_insts = normalise(true_edu_insts)
    true_edu_majors = normalise(true_edu_majors)
    true_emp_names = normalise(true_emp_names)
    true_emp_jtitles = normalise(true_emp_jtitles)
    ies_edu_insts = normalise(ies_edu_insts)
    ies_edu_majors = normalise(ies_edu_majors)
    ies_emp_names = normalise(ies_emp_names)
    ies_emp_jtitles = normalise(ies_emp_jtitles)

    edu_insts_match_score = self.score_matches(ies_edu_insts, true_edu_insts)
    edu_majors_match_score = self.score_matches(ies_edu_majors,
                                                true_edu_majors)
    emp_names_match_score = self.score_matches(ies_emp_names, true_emp_names)
    emp_jtitles_match_score = self.score_matches(ies_emp_jtitles,
                                                 true_emp_jtitles)

    # Printing is kept so existing console output does not change.
    print(edu_insts_match_score)
    print(edu_majors_match_score)
    print(emp_names_match_score)
    print(emp_jtitles_match_score)

    # Returned as well, matching get_zylon_parser_scores, so callers can use
    # the scores programmatically instead of scraping stdout.
    return (edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score)
def get_zylon_parser_scores(self):
    """
    parameters: none

    Reads zylon's xml output and the true xml output for the same file
    names, extracts education institutions, education majors, company
    names and job titles from both, tokenises and lower-cases every
    list, and scores zylon's entities against the true ones with
    score_matches. A higher score is better.

    return: edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score
    """
    extractor = Extractor()

    zylon_filenames = extractor.populate_file_names(
        self.__zylon_parser_labels_folder)
    parsed_trees = extractor.read_resume_labels(
        self.__zylon_parser_labels_folder, zylon_filenames)
    reference_trees = extractor.read_resume_labels(
        self.__dataset_raw_folder, zylon_filenames)

    # Getter tables share one order: institutions, majors, company names,
    # job titles. That order is also the order of the returned scores.
    reference_getters = (
        extractor.get_edu_institutions,
        extractor.get_edu_majors,
        extractor.get_company_names,
        extractor.get_job_titles,
    )
    zylon_getters = (
        extractor.get_edu_institutions_zy,
        extractor.get_edu_majors_zy,
        extractor.get_company_names_zy,
        extractor.get_job_titles_zy,
    )

    # True entities are extracted first, then zylon's, one category at a time.
    reference_entities = [
        [getter(tree) for tree in reference_trees]
        for getter in reference_getters
    ]
    zylon_entities = [
        [getter(tree) for tree in parsed_trees]
        for getter in zylon_getters
    ]

    tokeniser = Tokeniser()

    def lowered_tokens(docs):
        # Split into words first, then lower-case the tokenised docs.
        words = tokeniser.tokenise_doclines_to_words(docs)
        return tokeniser.docs_tolower(words)

    reference_entities = [lowered_tokens(docs) for docs in reference_entities]
    zylon_entities = [lowered_tokens(docs) for docs in zylon_entities]

    # score_matches receives zylon's entities first and the truth second.
    scores = [
        self.score_matches(predicted, expected)
        for predicted, expected in zip(zylon_entities, reference_entities)
    ]
    (edu_insts_match_score, edu_majors_match_score,
     emp_names_match_score, emp_jtitles_match_score) = scores

    return (edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score)