def get_zylon_parser_scores(self):
        """
        parameters: none

        Extracts labelled entities from zylon's xml output and from the
        true xml output, tokenises each entity list to lower-cased words
        and compares them. Higher scores are better.

        return: edu_insts_match_score, edu_majors_match_score,
                emp_names_match_score, emp_jtitles_match_score
        """
        extractor = Extractor()
        zylon_filenames = extractor.populate_file_names(
            self.__zylon_parser_labels_folder)

        # zylon output and ground truth share filenames, so the two tree
        # lists below are aligned index-by-index
        zylon_xml_trees = extractor.read_resume_labels(
            self.__zylon_parser_labels_folder, zylon_filenames)
        true_xml_trees = extractor.read_resume_labels(
            self.__dataset_raw_folder, zylon_filenames)

        tokeniser = Tokeniser()

        def normalised_entities(xml_trees, getter):
            # apply one entity getter per tree, then tokenise the entity
            # lines to words and lower-case them so that the comparison in
            # score_matches is case-insensitive
            docs = [getter(xml_tree) for xml_tree in xml_trees]
            return tokeniser.docs_tolower(
                tokeniser.tokenise_doclines_to_words(docs))

        true_edu_insts = normalised_entities(true_xml_trees,
                                             extractor.get_edu_institutions)
        true_edu_majors = normalised_entities(true_xml_trees,
                                              extractor.get_edu_majors)
        true_emp_names = normalised_entities(true_xml_trees,
                                             extractor.get_company_names)
        true_emp_jtitles = normalised_entities(true_xml_trees,
                                               extractor.get_job_titles)

        # zylon's xml layout differs from the true labels, hence the _zy
        # getter variants
        zylon_edu_insts = normalised_entities(
            zylon_xml_trees, extractor.get_edu_institutions_zy)
        zylon_edu_majors = normalised_entities(zylon_xml_trees,
                                               extractor.get_edu_majors_zy)
        zylon_emp_names = normalised_entities(zylon_xml_trees,
                                              extractor.get_company_names_zy)
        zylon_emp_jtitles = normalised_entities(zylon_xml_trees,
                                                extractor.get_job_titles_zy)

        edu_insts_match_score = self.score_matches(zylon_edu_insts,
                                                   true_edu_insts)
        edu_majors_match_score = self.score_matches(zylon_edu_majors,
                                                    true_edu_majors)
        emp_names_match_score = self.score_matches(zylon_emp_names,
                                                   true_emp_names)
        emp_jtitles_match_score = self.score_matches(zylon_emp_jtitles,
                                                     true_emp_jtitles)

        return edu_insts_match_score, edu_majors_match_score, emp_names_match_score, emp_jtitles_match_score
    def get_ies_scores(self):
        """
        parameters: none

        Runs the trained CRF model over the accuracy-test resumes,
        extracts the four entity types from the predictions and from the
        true xml labels, then compares the tokenised, lower-cased entity
        lists. Prints each score and — unlike before, and consistent with
        get_zylon_parser_scores — also returns them.

        return: edu_insts_match_score, edu_majors_match_score,
                emp_names_match_score, emp_jtitles_match_score
        """
        extractor = Extractor()
        ies_filenames = extractor.populate_file_names(self.__ies_accuracy_test)
        ies_filenames = extractor.filter_by_valid_exts(ies_filenames)
        filenames, resume_content = extractor.read_resume_content_tika_api(
            ies_filenames, self.__ies_accuracy_test)
        # resumes whose content could not be read are dropped so that
        # content, labels and predictions stay aligned
        filenames, resume_content = extractor.remove_empty_resumes(
            filenames, resume_content)
        resume_labels = extractor.read_resume_labels(self.__ies_accuracy_test,
                                                     filenames)

        cs = CrfSuite()
        cs.load_tagger()
        annotator = Annotator()
        # each filename is a (base, extension) pair
        annotated_resumes = [
            annotator.annotate_using_trained_model(self.__ies_accuracy_test +
                                                   self.__seperator +
                                                   filename[0] + filename[1])
            for filename in filenames
        ]
        predicted_entity_list = [
            cs.tag_doc(resume) for resume in annotated_resumes
        ]

        tokeniser = Tokeniser()

        def normalised(docs):
            # tokenise entity lines to words and lower-case them so that
            # the comparison in score_matches is case-insensitive
            return tokeniser.docs_tolower(
                tokeniser.tokenise_doclines_to_words(docs))

        true_edu_insts = normalised([
            extractor.get_edu_institutions(xml_tree)
            for xml_tree in resume_labels
        ])
        true_edu_majors = normalised([
            extractor.get_edu_majors(xml_tree) for xml_tree in resume_labels
        ])
        true_emp_names = normalised([
            extractor.get_company_names(xml_tree) for xml_tree in resume_labels
        ])
        true_emp_jtitles = normalised([
            extractor.get_job_titles(xml_tree) for xml_tree in resume_labels
        ])

        ies_edu_insts = normalised([
            extractor.get_edu_institutions_from_list(entity_list)
            for entity_list in predicted_entity_list
        ])
        ies_edu_majors = normalised([
            extractor.get_edu_major_from_list(entity_list)
            for entity_list in predicted_entity_list
        ])
        ies_emp_names = normalised([
            extractor.get_company_names_from_list(entity_list)
            for entity_list in predicted_entity_list
        ])
        ies_emp_jtitles = normalised([
            extractor.get_company_position_from_list(entity_list)
            for entity_list in predicted_entity_list
        ])

        edu_insts_match_score = self.score_matches(ies_edu_insts,
                                                   true_edu_insts)
        edu_majors_match_score = self.score_matches(ies_edu_majors,
                                                    true_edu_majors)
        emp_names_match_score = self.score_matches(ies_emp_names,
                                                   true_emp_names)
        emp_jtitles_match_score = self.score_matches(ies_emp_jtitles,
                                                     true_emp_jtitles)
        print(edu_insts_match_score)
        print(edu_majors_match_score)
        print(emp_names_match_score)
        print(emp_jtitles_match_score)
        # previously these scores were only printed; return them as well so
        # callers can use them programmatically (matches the sibling method)
        return (edu_insts_match_score, edu_majors_match_score,
                emp_names_match_score, emp_jtitles_match_score)
class Annotator():
    """Annotates tokenised résumé docs with entity tags and pos tags."""

    # entity tags assigned to tokens that match the pre-extracted labels
    __job_position_tag = "EMP-POS"
    __job_company_tag = "EMP-COMP"

    __education_course_tag = "EDU-MAJOR"
    __education_institution_tag = "EDU-INST"

    def __init__(self):
        self.__extractor = Extractor()
        self.__tokeniser = Tokeniser()
        self.__tagger = Tagger()
        self.__dataset = Dataset()
        self.__logger = Logger()

    def prepare_dataset(self, nr_of_docs=-1):
        """Read raw résumé files, annotate them and persist the dataset.

        nr_of_docs: how many docs to read; -1 reads everything.
        """
        raw_docs, raw_labels = self.__extractor.read_raw_files(nr_of_docs)

        tokenised_docs = self.__tokeniser.tokenise_docs_to_lines(raw_docs)
        tokenised_docs = self.__tokeniser.tokenise_doclines_to_words(
            tokenised_docs)

        self.__dataset.resume_content = self.annotate_docs(
            tokenised_docs, raw_labels)
        self.__dataset.save()

    def annotate_docs(self, resumes, labels):
        """Annotate every résumé doc with its true labels and pos tags.

        resumes: list of résumé docs tokenised by line and word
        labels: xml structures holding labels, aligned index-wise with resumes
        """
        self.__logger.println("annotating resumes")
        doc_count = len(resumes)
        annotated = []
        for position, resume in enumerate(resumes):
            annotated.append(self.annotate_doc(resume, labels[position]))
            self.__logger.println(
                "annotating resume %s/%s with true labels and pos tags" %
                (position + 1, doc_count))

        # non-local ner tagging processes the whole batch at once for speed
        annotated = self.__tagger.nonlocal_ner_tag(annotated)
        self.__logger.println("completed annotating resumes")
        return annotated

    def annotate_doc(self, doc, labels):
        """Annotate a single tokenised résumé doc from its xml labels.

        doc: one résumé document with token strings in each slot of list
        labels: xml structure storing pre-extracted information
        """
        # pair each pre-extracted entity list with the tag it should get;
        # more label types can be added to this table
        entity_tag_pairs = (
            (self.__extractor.get_job_titles(labels),
             self.__job_position_tag),
            (self.__extractor.get_company_names(labels),
             self.__job_company_tag),
            (self.__extractor.get_edu_majors(labels),
             self.__education_course_tag),
            (self.__extractor.get_edu_institutions(labels),
             self.__education_institution_tag),
        )

        prepared = self.__tagger.prepare_doc(doc)
        for entity_list, tag in entity_tag_pairs:
            prepared = self.__match_entity(prepared, entity_list, tag)
        prepared = self.__tagger.add_default_entity_tags(prepared)

        return self.__tagger.pos_tag(prepared)

    def __match_entity(self, doc, entity_list, tag):
        """Assign tag to every occurrence of each entity found in doc.

        doc: résumé doc to be annotated
        entity_list: labels to be matched in doc
        tag: tag assigned where a match is found
        """
        annotated = doc
        for entity in entity_list:
            annotated = self.__tagger.match_label(annotated, entity, tag)
        return annotated

    def annotate_using_trained_model(self, filepath):
        """Annotate the résumé file at filepath for tagging.

        Intended as a one-off path-based entry point for testing.
        """
        content = self.__extractor.read_resume_content(filepath)

        content = self.__tokeniser.tokenise_docs_to_lines(content)
        content = self.__tokeniser.tokenise_doclines_to_words(content)

        # read_resume_content yields a one-doc list; prepare that single doc
        prepared = self.__tagger.prepare_doc(content[0])
        prepared = self.__tagger.pos_tag(prepared)
        return self.__tagger.nonlocal_ner_tag([prepared])[0]