Example no. 1
def __init__(self, f):
    self.in_name, self.out_name = f
    self.output = []
    self.tokeniser = Tokeniser(f)
    self.st_handler = SymbolTable()
    self.writer = VMCodeWriter(f)
    self.local_state = {'labeler': labeler()}
    self.parse()
    self.writer.close()
    def __init__(self):
        extractor = Extractor()
        filenames = extractor.populate_file_names(self.__manual_anno_folder)
        valid_filenames = extractor.filter_by_valid_exts(filenames)
        valid_filenames, resume_content = extractor.read_resume_content_tika_api(
            valid_filenames, self.__manual_anno_folder)

        tokeniser = Tokeniser()
        tokenised_docs = tokeniser.tokenise_docs_to_lines(resume_content)

        dataset = Dataset()
        dataset.save_doc_lines(tokenised_docs, valid_filenames,
                               self.__manual_anno_processed)
    def test_tokenise_lines(self):
        tokeniser = Tokeniser()
        # each slot is the plain text of one résumé
        input_docs = [
            "sample resume output\rsample resume output",
            "\rsample resume output\rsample resume output",
            "sample resume output\nsample resume output",
            "\nsample resume output\nsample resume output"
        ]

        # each slot is the list of lines found in the corresponding résumé
        correct_output = [["sample resume output", "sample resume output"],
                          ["", "sample resume output", "sample resume output"],
                          ["sample resume output", "sample resume output"],
                          ["", "sample resume output", "sample resume output"]]

        output = tokeniser.tokenise_docs_to_lines(input_docs)
        self.assertEqual(output, correct_output)
    def test_tokenise_words(self):
        tokeniser = Tokeniser()

        # each slot is one résumé given as a list of lines
        input_lines = [[
            "sample resume output sample resume output",
            "  sample resume output sample resume output  ",
            "sample resume output.            sample resume output", ""
        ]]

        # each slot is one résumé as a list of lines, each a list of word tokens
        correct_output = [
            [["sample", "resume", "output", "sample", "resume", "output"],
             ["sample", "resume", "output", "sample", "resume", "output"],
             ["sample", "resume", "output", "sample", "resume", "output"]]
        ]

        output = tokeniser.tokenise_doclines_to_words(input_lines)
        self.assertEqual(output, correct_output)
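The two tests above pin down the expected behaviour of tokenise_docs_to_lines and tokenise_doclines_to_words. A minimal sketch consistent with those tests (not the project's actual Tokeniser, whose internals are not shown in these examples) could look like this:

import re


class Tokeniser:
    """Minimal sketch inferred from the two tests above; the real class may differ."""

    def tokenise_docs_to_lines(self, docs):
        # Split each document into lines on \r, \n or \r\n.
        return [re.split(r'\r\n|\r|\n', doc) for doc in docs]

    def tokenise_doclines_to_words(self, docs):
        # Split each non-empty line into alphanumeric word tokens,
        # dropping punctuation and blank lines.
        tokenised_docs = []
        for doc in docs:
            lines = []
            for line in doc:
                words = re.findall(r'[A-Za-z0-9]+', line)
                if words:
                    lines.append(words)
            tokenised_docs.append(lines)
        return tokenised_docs

    def docs_tolower(self, docs):
        # Lower-case every token in the doc -> line -> word nesting.
        return [[[w.lower() for w in line] for line in doc] for doc in docs]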
    def get_ies_scores(self):
        extractor = Extractor()
        ies_filenames = extractor.populate_file_names(self.__ies_accuracy_test)
        ies_filenames = extractor.filter_by_valid_exts(ies_filenames)
        filenames, resume_content = extractor.read_resume_content_tika_api(
            ies_filenames, self.__ies_accuracy_test)
        filenames, resume_content = extractor.remove_empty_resumes(
            filenames, resume_content)
        resume_labels = extractor.read_resume_labels(self.__ies_accuracy_test,
                                                     filenames)

        true_edu_insts = [
            extractor.get_edu_institutions(xml_tree)
            for xml_tree in resume_labels
        ]
        true_edu_majors = [
            extractor.get_edu_majors(xml_tree) for xml_tree in resume_labels
        ]
        true_emp_names = [
            extractor.get_company_names(xml_tree) for xml_tree in resume_labels
        ]
        true_emp_jtitles = [
            extractor.get_job_titles(xml_tree) for xml_tree in resume_labels
        ]

        cs = CrfSuite()
        cs.load_tagger()
        annotator = Annotator()
        annotated_resumes = [
            annotator.annotate_using_trained_model(self.__ies_accuracy_test +
                                                   self.__seperator +
                                                   filename[0] + filename[1])
            for filename in filenames
        ]
        predicted_entity_list = [
            cs.tag_doc(resume) for resume in annotated_resumes
        ]

        ies_edu_insts = [
            extractor.get_edu_institutions_from_list(entity_list)
            for entity_list in predicted_entity_list
        ]
        ies_edu_majors = [
            extractor.get_edu_major_from_list(entity_list)
            for entity_list in predicted_entity_list
        ]
        ies_emp_names = [
            extractor.get_company_names_from_list(entity_list)
            for entity_list in predicted_entity_list
        ]
        ies_emp_jtitles = [
            extractor.get_company_position_from_list(entity_list)
            for entity_list in predicted_entity_list
        ]

        tokeniser = Tokeniser()
        true_edu_insts = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_edu_insts))
        true_edu_majors = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_edu_majors))
        true_emp_names = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_emp_names))
        true_emp_jtitles = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_emp_jtitles))

        ies_edu_insts = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(ies_edu_insts))
        ies_edu_majors = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(ies_edu_majors))
        ies_emp_names = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(ies_emp_names))
        ies_emp_jtitles = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(ies_emp_jtitles))

        edu_insts_match_score = self.score_matches(ies_edu_insts,
                                                   true_edu_insts)
        edu_majors_match_score = self.score_matches(ies_edu_majors,
                                                    true_edu_majors)
        emp_names_match_score = self.score_matches(ies_emp_names,
                                                   true_emp_names)
        emp_jtitles_match_score = self.score_matches(ies_emp_jtitles,
                                                     true_emp_jtitles)
        print(edu_insts_match_score)
        print(edu_majors_match_score)
        print(emp_names_match_score)
        print(emp_jtitles_match_score)
    def get_zylon_parser_scores(self):
        """
        parameters: none

        Extracts labelled entities from zylon's xml output and true xml
        output. Compares the entity lists and returns a score, higher is
        better.
        
        return: edu_insts_match_score, edu_majors_match_score, emp_names_match_score, emp_jtitles_match_score
        """
        extractor = Extractor()
        zylon_filenames = extractor.populate_file_names(
            self.__zylon_parser_labels_folder)

        zylon_xml_trees = extractor.read_resume_labels(
            self.__zylon_parser_labels_folder, zylon_filenames)
        true_xml_trees = extractor.read_resume_labels(
            self.__dataset_raw_folder, zylon_filenames)

        true_edu_insts = [
            extractor.get_edu_institutions(xml_tree)
            for xml_tree in true_xml_trees
        ]
        true_edu_majors = [
            extractor.get_edu_majors(xml_tree) for xml_tree in true_xml_trees
        ]
        true_emp_names = [
            extractor.get_company_names(xml_tree)
            for xml_tree in true_xml_trees
        ]
        true_emp_jtitles = [
            extractor.get_job_titles(xml_tree) for xml_tree in true_xml_trees
        ]

        zylon_edu_insts = [
            extractor.get_edu_institutions_zy(xml_tree)
            for xml_tree in zylon_xml_trees
        ]
        zylon_edu_majors = [
            extractor.get_edu_majors_zy(xml_tree)
            for xml_tree in zylon_xml_trees
        ]
        zylon_emp_names = [
            extractor.get_company_names_zy(xml_tree)
            for xml_tree in zylon_xml_trees
        ]
        zylon_emp_jtitles = [
            extractor.get_job_titles_zy(xml_tree)
            for xml_tree in zylon_xml_trees
        ]

        tokeniser = Tokeniser()
        true_edu_insts = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_edu_insts))
        true_edu_majors = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_edu_majors))
        true_emp_names = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_emp_names))
        true_emp_jtitles = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_emp_jtitles))

        zylon_edu_insts = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(zylon_edu_insts))
        zylon_edu_majors = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(zylon_edu_majors))
        zylon_emp_names = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(zylon_emp_names))
        zylon_emp_jtitles = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(zylon_emp_jtitles))

        edu_insts_match_score = self.score_matches(zylon_edu_insts,
                                                   true_edu_insts)
        edu_majors_match_score = self.score_matches(zylon_edu_majors,
                                                    true_edu_majors)
        emp_names_match_score = self.score_matches(zylon_emp_names,
                                                   true_emp_names)
        emp_jtitles_match_score = self.score_matches(zylon_emp_jtitles,
                                                     true_emp_jtitles)

        return edu_insts_match_score, edu_majors_match_score, emp_names_match_score, emp_jtitles_match_score
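score_matches itself is not shown in these examples; one way such a comparison could be implemented (purely illustrative, assuming the doc -> entity -> token nesting produced by tokenise_doclines_to_words and docs_tolower above) is:

    def score_matches(self, predicted_docs, true_docs):
        # Illustrative sketch only: for each document, count the true
        # entities whose tokens all appear in some predicted entity, then
        # average the per-document fractions. Higher is better.
        per_doc_scores = []
        for predicted, true in zip(predicted_docs, true_docs):
            if not true:
                continue
            predicted_sets = [set(entity) for entity in predicted]
            hits = sum(
                1 for entity in true
                if any(set(entity) <= p for p in predicted_sets)
            )
            per_doc_scores.append(hits / len(true))
        return sum(per_doc_scores) / len(per_doc_scores) if per_doc_scores else 0.0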
def __init__(self, report_filename=""):
    self.report_filename = report_filename
    self.scanner = Scanner(self, self.report_filename)
    self.tokeniser = Tokeniser(self)
    self.token_num = 0
Example no. 8
# imports assumed from the companion driver script (Example no. 11);
# tInDir and tOutDir are expected to be defined elsewhere in the original module
import os
import sys
from tokeniser import Token, Tokeniser

n = 0
ok = 0

if sys.argv[1] == 'T':
    inFiles = os.listdir(tInDir)
    outFiles = os.listdir(tOutDir)

    for f in outFiles:
        ffile = open(tOutDir + '/' + f[0:7] + '.out', 'w')
        ffile.close()

    for f in inFiles:
        n += 1
        print(f)
        inp = open(tInDir + '/' + f, 'r')
        lex = Tokeniser(''.join(inp.readlines()))
        while True:
            out = open(tOutDir + '/' + f[0:7] + '.out', 'a')
            try:
                t = lex.Next()
            except Exception as err:
                out.write(''.join(err.args) + '\n')
            else:
                if t.tokenType == Token.tokenTypeEOF:
                    expstr = ''
                    exp = open('exp/' + f[0:7] + '.txt', 'r')
                    for line in exp:
                        expstr += line

                    out.close()
                    out = open(tOutDir + '/' + f[0:7] + '.out', 'r')
def __init__(self):
    self.__extractor = Extractor()
    self.__tokeniser = Tokeniser()
    self.__tagger = Tagger()
    self.__dataset = Dataset()
    self.__logger = Logger()
Example no. 10
def __init__(self, f):
    self.in_name, self.out_name = f
    self.output = []
    self.tokeniser = Tokeniser(f)
    self.depth = 0
    self.parse()
Example no. 11
import sys
from tokeniser import Token, Tokeniser
from parser1 import Parser
from nodes import *
from semanticAnalyser import SemanticAnalyser
import treePrinter

path = sys.argv[2]
# the source file is read twice: one tokeniser feeds the parser used for
# semantic analysis, the other the parser used for tree printing
f = open(path, 'r', encoding='utf-8')
ff = open(path, 'r', encoding='utf-8')
lex = Tokeniser(''.join(f.readlines()))
lexx = Tokeniser(''.join(ff.readlines()))

# 'T': dump the token stream until end of file
if sys.argv[1] == 'T':
    while True:
        try:
            t = lex.Next()
        except Exception as err:
            print(''.join(err.args))
        else:
            if t.tokenType == Token.tokenTypeEOF:
                break
            print(t)
# 'P': parse the program, run semantic analysis, and print the syntax tree
elif sys.argv[1] == 'P':
    p = Parser(lex)
    pp = Parser(lexx)
    #x = tree(p.ParseProgramModule())
    semantic = SemanticAnalyser(p)
    semantic.analyse()
    print(treePrinter.getTree('', pp.ParseProgramModule()))