def __init__(self, f):
    self.in_name, self.out_name = f
    self.output = []
    self.tokeniser = Tokeniser(f)
    self.st_handler = SymbolTable()
    self.writer = VMCodeWriter(f)
    self.local_state = {'labeler': labeler()}
    self.parse()
    self.writer.close()
def __init__(self):
    extractor = Extractor()
    filenames = extractor.populate_file_names(self.__manual_anno_folder)
    valid_filenames = extractor.filter_by_valid_exts(filenames)
    valid_filenames, resume_content = extractor.read_resume_content_tika_api(
        valid_filenames, self.__manual_anno_folder)
    tokeniser = Tokeniser()
    tokenised_docs = tokeniser.tokenise_docs_to_lines(resume_content)
    dataset = Dataset()
    dataset.save_doc_lines(tokenised_docs, valid_filenames,
                           self.__manual_anno_processed)
def test_tokenise_lines(self):
    tokeniser = Tokeniser()
    # each slot is résumé plain text
    input_docs = [
        "sample resume output\rsample resume output",
        "\rsample resume output\rsample resume output",
        "sample resume output\nsample resume output",
        "\nsample resume output\nsample resume output"
    ]
    # each slot has a list of lines found in each résumé inputted
    correct_output = [["sample resume output", "sample resume output"],
                      ["", "sample resume output", "sample resume output"],
                      ["sample resume output", "sample resume output"],
                      ["", "sample resume output", "sample resume output"]]
    output = tokeniser.tokenise_docs_to_lines(input_docs)
    self.assertEqual(output, correct_output)
def test_tokenise_words(self):
    tokeniser = Tokeniser()
    # each slot is a line within a résumé
    input_lines = [[
        "sample resume output sample resume output",
        " sample resume output sample resume output ",
        "sample resume output. sample resume output",
        ""
    ]]
    # each slot is a token
    correct_output = [
        [["sample", "resume", "output", "sample", "resume", "output"],
         ["sample", "resume", "output", "sample", "resume", "output"],
         ["sample", "resume", "output", "sample", "resume", "output"]]
    ]
    output = tokeniser.tokenise_doclines_to_words(input_lines)
    self.assertEqual(output, correct_output)
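# A minimal sketch of a Tokeniser that would satisfy the two tests above.
# This is an assumption about the behaviour, not the project's actual
# implementation: splitlines() covers the \r / \n cases in
# test_tokenise_lines, a word regex drops the trailing punctuation seen in
# test_tokenise_words, and lines that yield no tokens are skipped.
import re


class Tokeniser:
    def tokenise_docs_to_lines(self, docs):
        # one list of lines per document; splitlines() keeps the leading
        # empty line produced by a document that starts with \r or \n
        return [doc.splitlines() for doc in docs]

    def tokenise_doclines_to_words(self, docs):
        # one list of token lists per document, skipping empty lines
        tokenised = []
        for lines in docs:
            doc_tokens = [re.findall(r"[\w']+", line) for line in lines]
            tokenised.append([tokens for tokens in doc_tokens if tokens])
        return tokenised

    def docs_tolower(self, docs):
        # lower-case every token, preserving the nested structure
        return [[[token.lower() for token in line] for line in doc]
                for doc in docs]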
def get_ies_scores(self):
    extractor = Extractor()
    ies_filenames = extractor.populate_file_names(self.__ies_accuracy_test)
    ies_filenames = extractor.filter_by_valid_exts(ies_filenames)
    filenames, resume_content = extractor.read_resume_content_tika_api(
        ies_filenames, self.__ies_accuracy_test)
    filenames, resume_content = extractor.remove_empty_resumes(
        filenames, resume_content)
    resume_labels = extractor.read_resume_labels(self.__ies_accuracy_test,
                                                 filenames)

    # ground-truth entities taken from the labelled xml files
    true_edu_insts = [
        extractor.get_edu_institutions(xml_tree) for xml_tree in resume_labels
    ]
    true_edu_majors = [
        extractor.get_edu_majors(xml_tree) for xml_tree in resume_labels
    ]
    true_emp_names = [
        extractor.get_company_names(xml_tree) for xml_tree in resume_labels
    ]
    true_emp_jtitles = [
        extractor.get_job_titles(xml_tree) for xml_tree in resume_labels
    ]

    # entities predicted by tagging each resume with the trained CRF model
    cs = CrfSuite()
    cs.load_tagger()
    annotator = Annotator()
    annotated_resumes = [
        annotator.annotate_using_trained_model(
            self.__ies_accuracy_test + self.__seperator + filename[0] +
            filename[1]) for filename in filenames
    ]
    predicted_entity_list = [
        cs.tag_doc(resume) for resume in annotated_resumes
    ]

    ies_edu_insts = [
        extractor.get_edu_institutions_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]
    ies_edu_majors = [
        extractor.get_edu_major_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]
    ies_emp_names = [
        extractor.get_company_names_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]
    ies_emp_jtitles = [
        extractor.get_company_position_from_list(entity_list)
        for entity_list in predicted_entity_list
    ]

    # tokenise and lower-case both sides before comparing
    tokeniser = Tokeniser()
    true_edu_insts = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_edu_insts))
    true_edu_majors = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_edu_majors))
    true_emp_names = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_emp_names))
    true_emp_jtitles = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_emp_jtitles))
    ies_edu_insts = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(ies_edu_insts))
    ies_edu_majors = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(ies_edu_majors))
    ies_emp_names = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(ies_emp_names))
    ies_emp_jtitles = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(ies_emp_jtitles))

    edu_insts_match_score = self.score_matches(ies_edu_insts, true_edu_insts)
    edu_majors_match_score = self.score_matches(ies_edu_majors,
                                                true_edu_majors)
    emp_names_match_score = self.score_matches(ies_emp_names, true_emp_names)
    emp_jtitles_match_score = self.score_matches(ies_emp_jtitles,
                                                 true_emp_jtitles)

    print(edu_insts_match_score)
    print(edu_majors_match_score)
    print(emp_names_match_score)
    print(emp_jtitles_match_score)
def get_zylon_parser_scores(self):
    """
    parameters: none
    Extracts labelled entities from zylon's xml output and the true xml
    output. Compares the entity lists and returns a score per entity type;
    higher is better.
    return: edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score
    """
    extractor = Extractor()
    zylon_filenames = extractor.populate_file_names(
        self.__zylon_parser_labels_folder)
    zylon_xml_trees = extractor.read_resume_labels(
        self.__zylon_parser_labels_folder, zylon_filenames)
    true_xml_trees = extractor.read_resume_labels(
        self.__dataset_raw_folder, zylon_filenames)

    true_edu_insts = [
        extractor.get_edu_institutions(xml_tree)
        for xml_tree in true_xml_trees
    ]
    true_edu_majors = [
        extractor.get_edu_majors(xml_tree) for xml_tree in true_xml_trees
    ]
    true_emp_names = [
        extractor.get_company_names(xml_tree) for xml_tree in true_xml_trees
    ]
    true_emp_jtitles = [
        extractor.get_job_titles(xml_tree) for xml_tree in true_xml_trees
    ]

    zylon_edu_insts = [
        extractor.get_edu_institutions_zy(xml_tree)
        for xml_tree in zylon_xml_trees
    ]
    zylon_edu_majors = [
        extractor.get_edu_majors_zy(xml_tree)
        for xml_tree in zylon_xml_trees
    ]
    zylon_emp_names = [
        extractor.get_company_names_zy(xml_tree)
        for xml_tree in zylon_xml_trees
    ]
    zylon_emp_jtitles = [
        extractor.get_job_titles_zy(xml_tree)
        for xml_tree in zylon_xml_trees
    ]

    tokeniser = Tokeniser()
    true_edu_insts = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_edu_insts))
    true_edu_majors = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_edu_majors))
    true_emp_names = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_emp_names))
    true_emp_jtitles = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_emp_jtitles))
    zylon_edu_insts = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(zylon_edu_insts))
    zylon_edu_majors = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(zylon_edu_majors))
    zylon_emp_names = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(zylon_emp_names))
    zylon_emp_jtitles = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(zylon_emp_jtitles))

    edu_insts_match_score = self.score_matches(zylon_edu_insts,
                                               true_edu_insts)
    edu_majors_match_score = self.score_matches(zylon_edu_majors,
                                                true_edu_majors)
    emp_names_match_score = self.score_matches(zylon_emp_names,
                                               true_emp_names)
    emp_jtitles_match_score = self.score_matches(zylon_emp_jtitles,
                                                 true_emp_jtitles)

    return (edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score)
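# A hypothetical illustration of the kind of comparison score_matches could
# compute (the real implementation lives elsewhere in this class and may
# differ): for each document, take the fraction of true entity tokens that
# also appear among the predicted tokens, then average over documents that
# have at least one true token. Higher is better; 1.0 means every true
# token was recovered.
def overlap_score(predicted_docs, true_docs):
    fractions = []
    for predicted, true in zip(predicted_docs, true_docs):
        # flatten the per-line token lists for one document
        predicted_tokens = {tok for line in predicted for tok in line}
        true_tokens = [tok for line in true for tok in line]
        if true_tokens:
            matched = sum(1 for tok in true_tokens if tok in predicted_tokens)
            fractions.append(matched / len(true_tokens))
    return sum(fractions) / len(fractions) if fractions else 0.0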
def __init__(self, report_filename=""):
    self.report_filename = report_filename
    self.scanner = Scanner(self, self.report_filename)
    self.tokeniser = Tokeniser(self)
    self.token_num = 0
n = 0
ok = 0
if sys.argv[1] == 'T':
    inFiles = os.listdir(tInDir)
    outFiles = os.listdir(tOutDir)
    # blank out the previous .out files before the run
    for f in outFiles:
        ffile = open(tOutDir + '/' + f[0:7] + '.out', 'w')
        ffile.close()
    for f in inFiles:
        n += 1
        print(f)
        inp = open(tInDir + '/' + f, 'r')
        lex = Tokeniser(''.join(inp.readlines()))
        while True:
            out = open(tOutDir + '/' + f[0:7] + '.out', 'a')
            try:
                t = lex.Next()
            except Exception as err:
                out.write(''.join(err.args) + '\n')
            else:
                if t.tokenType == Token.tokenTypeEOF:
                    # at EOF, read the expected output for this test case
                    expstr = ''
                    exp = open('exp/' + f[0:7] + '.txt', 'r')
                    for line in exp:
                        expstr += line
                    out.close()
                    out = open(tOutDir + '/' + f[0:7] + '.out', 'r')
def __init__(self):
    self.__extractor = Extractor()
    self.__tokeniser = Tokeniser()
    self.__tagger = Tagger()
    self.__dataset = Dataset()
    self.__logger = Logger()
def __init__(self, f):
    self.in_name, self.out_name = f
    self.output = []
    self.tokeniser = Tokeniser(f)
    self.depth = 0
    self.parse()
import sys

from tokeniser import Token, Tokeniser
from parser1 import Parser
from nodes import *
from semanticAnalyser import SemanticAnalyser
import treePrinter

path = sys.argv[2]
f = open(path, 'r', encoding='utf-8')
ff = open(path, 'r', encoding='utf-8')
lex = Tokeniser(''.join(f.readlines()))
lexx = Tokeniser(''.join(ff.readlines()))

if sys.argv[1] == 'T':
    # tokenise only: print every token until EOF
    while True:
        try:
            t = lex.Next()
        except Exception as err:
            print(''.join(err.args))
        else:
            if t.tokenType == Token.tokenTypeEOF:
                break
            print(t)
elif sys.argv[1] == 'P':
    # parse, run semantic analysis and print the syntax tree
    p = Parser(lex)
    pp = Parser(lexx)
    #x = tree(p.ParseProgramModule())
    semantic = SemanticAnalyser(p)
    semantic.analyse()
    print(treePrinter.getTree('', pp.ParseProgramModule()))