def __init__(self):
    extractor = Extractor()
    filenames = extractor.populate_file_names(self.__manual_anno_folder)
    valid_filenames = extractor.filter_by_valid_exts(filenames)
    valid_filenames, resume_content = extractor.read_resume_content_tika_api(
        valid_filenames, self.__manual_anno_folder)
    tokeniser = Tokeniser()
    tokenised_docs = tokeniser.tokenise_docs_to_lines(resume_content)
    dataset = Dataset()
    dataset.save_doc_lines(tokenised_docs, valid_filenames,
                           self.__manual_anno_processed)
def test_tokenise_nt(self):
    cases = {
        "Don't": [WordToken("do"), WordToken("not")],
        "hasn't": [WordToken("has"), WordToken("not")]
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_ve(self):
    # I've -> I have, as there is no ambiguity
    cases = {
        "I've": [WordToken("i"), WordToken("have")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
from lxml import etree
from urlparse import urlparse

def add_content_features(featuredict, content):
    """Add features from note content.

    Derive the following features from the note content and add them to
    featuredict with binary values:

    CONTENT-TOKEN-<token>: Set for each unique, case-folded token in the
        note content (not including markup).
    CONTENT-MEDIA-<mimetype>: Set for each mimetype used for media in
        the note.
    CONTENT-HASLINK: Set if the note contains one or more links.
    CONTENT-LINK-<domain>: Set with the domain of each link in the note.
    CONTENT-TODO: Set if the note contains a todo.

    Args:
        featuredict: A dict.
        content: File-like object containing the note content.
    """
    parser = etree.HTMLParser()
    root = etree.parse(content, parser).getroot()
    string_content = unicode(root.xpath('string()'))
    for token in Tokeniser.split(string_content):
        featuredict["CONTENT-TOKEN-%s" % token.lower()] = 1
    for media in root.iterfind(".//en-media"):
        featuredict["CONTENT-MEDIA-%s" % media.get("type")] = 1
    for link in root.iterfind(".//a"):
        url = link.get("href")
        if url is not None:
            featuredict["CONTENT-HASLINK"] = 1
            netloc = urlparse(url).netloc
            if netloc:
                featuredict["CONTENT-LINK-%s" % netloc] = 1
    if root.find(".//en-todo") is not None:
        featuredict["CONTENT-TODO"] = 1
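# A hedged usage sketch for add_content_features: the ENML snippet and the
# StringIO wrapper are made-up inputs, not taken from the original project;
# it assumes lxml.etree and a Python 2 runtime (unicode, StringIO).
from StringIO import StringIO

features = {}
add_content_features(
    features,
    StringIO('<en-note>Buy milk <en-todo/> '
             '<a href="http://example.com/x">link</a></en-note>'))
# Expected keys include CONTENT-TOKEN-buy, CONTENT-TODO, CONTENT-HASLINK
# and CONTENT-LINK-example.com.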
def test_tokenise_comma(self):
    cases = {
        "I, for one.": [WordToken("i"), PunctuationToken(","),
                        WordToken("for"), WordToken("one"),
                        PunctuationToken(".")]
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_ll(self):
    cases = {
        "I'll": [WordToken("i"), WordToken("will")],
        "Sam'll": [WordToken("sam"), WordToken("will")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_s(self):
    # has / genitive / is: ambiguous, so just leave it as is
    cases = {
        "It's": [WordToken("it"), WordToken("'s")],
        "He's": [WordToken("he"), WordToken("'s")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_d(self):
    # he'd -> he had / he would is ambiguous, so just leave it
    cases = {
        "It'd": [WordToken("it"), WordToken("'d")],
        "He'd": [WordToken("he"), WordToken("'d")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
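# A minimal sketch of the contraction handling the tests above pin down,
# assuming the WordToken wrapper from the expected outputs; case folding
# and the surrounding sentence tokeniser are elided. Unambiguous clitics
# (n't, 've, 'll) are expanded; ambiguous ones ('s, 'd) are kept as
# separate tokens. Not the project's actual implementation.
EXPANSIONS = {"n't": "not", "'ve": "have", "'ll": "will"}
AMBIGUOUS = ("'s", "'d")

def split_contraction(word):
    lowered = word.lower()
    for suffix, expansion in EXPANSIONS.items():
        if lowered.endswith(suffix):
            return [WordToken(lowered[:-len(suffix)]), WordToken(expansion)]
    for suffix in AMBIGUOUS:
        if lowered.endswith(suffix):
            return [WordToken(lowered[:-len(suffix)]), WordToken(suffix)]
    return [WordToken(word)]

# split_contraction("Don't") -> [WordToken("do"), WordToken("not")]
# split_contraction("He'd")  -> [WordToken("he"), WordToken("'d")]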
def test_compute_weighted(self):
    cases = [
        ("agilely agility agitate", 0),
        ("agilely agitated of", -0.5)
    ]
    for s, expected in cases:
        tokens = Tokeniser.tokenise_sentence(s)
        assert SymbolicScore.compute_weighted(tokens, self.lexicon) == expected
def test_tokenise_bracket(self):
    cases = {
        "(I, for one.)": [PunctuationToken("("), WordToken("i"),
                          PunctuationToken(","), WordToken("for"),
                          WordToken("one"), PunctuationToken("."),
                          PunctuationToken(")")]
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_hyphen(self):
    # "eight-year-old child" is split on hyphens;
    # cf. "8-years-old" in test_tokenise_id, which is kept whole
    cases = {
        "eight-year-old child": [WordToken("eight"), PunctuationToken("-"),
                                 WordToken("year"), PunctuationToken("-"),
                                 WordToken("old"), WordToken("child")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_id(self):
    # identifiers like B456F7-3 are kept whole
    cases = {
        "like B456F7-3": [WordToken("like"), WordToken("B456F7-3")],
        "like B456F7-3-like": [WordToken("like"),
                               WordToken("B456F7-3-like")],
        "8-years-old": [WordToken("8-years-old")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_capital_middle(self):
    # I thought it was GREAT. -> GREAT should stay capitalised
    # I love Paris -> Paris should stay capitalised
    # Paris I love -> paris + i + love (sentence-initial word and I go lowercase)
    cases = {
        "I thought it was GREAT": [WordToken("i"), WordToken("thought"),
                                   WordToken("it"), WordToken("was"),
                                   WordToken("GREAT")],
        "I love Paris": [WordToken("i"), WordToken("love"),
                         WordToken("Paris")],
        "Paris I love": [WordToken("paris"), WordToken("i"),
                         WordToken("love")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_period(self):
    # Ph.D -> Ph.D
    # U.S.A. -> U.S.A + period if end of sentence, U.S.A if middle
    cases = {
        "I have a Ph.D.": [WordToken("i"), WordToken("have"),
                           WordToken("a"), WordToken("Ph.D"),
                           PunctuationToken(".")],
        "Make U.K. great again.": [WordToken("make"), WordToken("U.K"),
                                   PunctuationToken("."), WordToken("great"),
                                   WordToken("again"), PunctuationToken(".")]
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_tokenise_slash(self):
    # love/hate relationship
    cases = {
        "love/hate relationship": [WordToken("love"), PunctuationToken("/"),
                                   WordToken("hate"),
                                   WordToken("relationship")],
        "this love/ hate relationship": [WordToken("this"),
                                         WordToken("love"),
                                         PunctuationToken("/"),
                                         WordToken("hate"),
                                         WordToken("relationship")],
        "weird-love /hate relationship": [WordToken("weird"),
                                          PunctuationToken("-"),
                                          WordToken("love"),
                                          PunctuationToken("/"),
                                          WordToken("hate"),
                                          WordToken("relationship")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
class Lexer:
    """Intel C/C++ Compiler optimization report lexical analyser"""

    def __init__(self, report_filename=""):
        self.report_filename = report_filename
        self.scanner = Scanner(self, self.report_filename)
        self.tokeniser = Tokeniser(self)
        self.token_num = 0

    def get_token_num(self):
        return self.token_num

    def get_next_token(self):
        token = self.tokeniser.tokenise_lexeme(
            self.scanner.get_next_lexeme())
        if token != TokenClass.EOR:
            self.token_num += 1
        return token
def test_tokenise_lines(self):
    tokeniser = Tokeniser()
    # each slot is résumé plain text
    input_docs = [
        "sample resume output\rsample resume output",
        "\rsample resume output\rsample resume output",
        "sample resume output\nsample resume output",
        "\nsample resume output\nsample resume output"
    ]
    # each slot holds the list of lines found in the corresponding résumé
    correct_output = [["sample resume output", "sample resume output"],
                      ["", "sample resume output", "sample resume output"],
                      ["sample resume output", "sample resume output"],
                      ["", "sample resume output", "sample resume output"]]
    output = tokeniser.tokenise_docs_to_lines(input_docs)
    self.assertEqual(output, correct_output)
def test_tokenise_words(self):
    tokeniser = Tokeniser()
    # each slot is a line within a résumé
    input_lines = [[
        "sample resume output sample resume output",
        " sample resume output sample resume output ",
        "sample resume output. sample resume output",
        ""
    ]]
    # each slot is a token; empty lines produce no slot
    correct_output = [
        [["sample", "resume", "output", "sample", "resume", "output"],
         ["sample", "resume", "output", "sample", "resume", "output"],
         ["sample", "resume", "output", "sample", "resume", "output"]]
    ]
    output = tokeniser.tokenise_doclines_to_words(input_lines)
    self.assertEqual(output, correct_output)
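# A minimal sketch of the two Tokeniser methods these tests exercise,
# assuming the simplest behaviour consistent with the expected outputs
# (split on \r or \n keeping leading empties; split lines into word
# tokens, dropping punctuation and empty lines). Not the project's
# actual implementation.
import re

class TokeniserSketch(object):
    def tokenise_docs_to_lines(self, docs):
        # one list of lines per document
        return [re.split(r"\r\n|\r|\n", doc) for doc in docs]

    def tokenise_doclines_to_words(self, docs):
        out = []
        for doc in docs:
            lines = []
            for line in doc:
                words = re.findall(r"\w+", line)
                if words:  # empty lines are dropped
                    lines.append(words)
            out.append(lines)
        return out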
def add_metadata_features(featuredict, note):
    """Add features from note metadata.

    Derive the following features from the Note and add them to
    featuredict with binary values:

    META-TITLETOKEN-<token>: Set for each unique, case-folded token in
        the note title.
    META-URL-<domain>: Set with the domain of the note URL, if one is
        provided.
    META-HASURL: Set if the note has a URL.
    META-HASLOCATION: Set if the note has a latitude.
    META-SOURCE-<source>: Set with the source of the note, if it is
        provided.
    META-PLACE-<place>: Set with the place name of the note, if it is
        provided.
    META-CONTENTCLASS-<class>: Set with the content class of the note,
        if it is provided.

    Args:
        featuredict: A dict.
        note: Note object.
    """
    for token in Tokeniser.split(unicode(note.title, encoding="utf-8")):
        featuredict["META-TITLETOKEN-%s" % token.lower()] = 1
    if note.attributes.sourceURL:
        netloc = urlparse(note.attributes.sourceURL).netloc
        if netloc:
            featuredict["META-URL-%s" % netloc] = 1
        featuredict["META-HASURL"] = 1
    if note.attributes.latitude is not None:
        featuredict["META-HASLOCATION"] = 1
    if note.attributes.source:
        featuredict["META-SOURCE-%s" % note.attributes.source] = 1
    if note.attributes.placeName:
        featuredict["META-PLACE-%s" % note.attributes.placeName] = 1
    if note.attributes.contentClass:
        featuredict["META-CONTENTCLASS-%s" % note.attributes.contentClass] = 1
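# A hedged usage sketch for add_metadata_features: _Note and _Attrs are
# made-up stand-ins for the Evernote Note type (same attribute names,
# nothing else), so no real API beyond those names is implied.
class _Attrs(object):
    sourceURL = "http://example.com/page"
    latitude = 51.5
    source = "web.clip"
    placeName = None
    contentClass = None

class _Note(object):
    title = "Shopping list"  # byte string, decoded as UTF-8 by the function
    attributes = _Attrs()

features = {}
add_metadata_features(features, _Note())
# Expected keys include META-TITLETOKEN-shopping, META-HASURL,
# META-URL-example.com, META-HASLOCATION and META-SOURCE-web.clip.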
def test_currency(self):
    tokens = Tokeniser.split("hi there $100 man")
    self.assertEqual(tokens, ["hi", "there", "$100", "man"])
def test_punctuation(self):
    tokens = Tokeniser.split("hi there, you ...")
    self.assertEqual(tokens, ["hi", "there", ",", "you", "..."])
def test_unicode(self):
    tokens = Tokeniser.split(u'hi theré')
    self.assertEqual(tokens, ["hi", u'theré'])
def test_tokenise_whitespace(self):
    cases = {
        "an apple.": [WordToken("an"), WordToken("apple"),
                      PunctuationToken(".")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def test_multiline(self):
    tokens = Tokeniser.split("hi there\nsecond line")
    self.assertEqual(tokens, ["hi", "there", "second", "line"])
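# A minimal sketch of a Tokeniser.split consistent with the four tests
# above (currency symbols attach to numbers, punctuation runs like "..."
# are single tokens, unicode letters survive, newlines count as
# whitespace). The regex is an assumption, not the project's own.
import re

def split_sketch(text):
    return re.findall(r"\$?\w+|\.{3}|[^\w\s]", text, re.UNICODE)

# split_sketch("hi there $100 man") -> ["hi", "there", "$100", "man"]
# split_sketch("hi there, you ...") -> ["hi", "there", ",", "you", "..."]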
import os

import spacy

from naive_bayes_neg import NaiveBayesNeg
from negation import compute_neg_punc, compute_neg_dir_dep, compute_neg_head_obj, compute_neg_after_x
from symbolic import LexiconGenerator
from symbolic_neg import compute_negation_terms, SymbolicClassifier
from tokeniser import Tokeniser

no_negation = lambda x, y: [False] * len(x)

pos_path = os.path.abspath("../data/POS")
pos_files = [os.path.join(pos_path, f) for f in os.listdir(pos_path)]
neg_path = os.path.abspath("../data/NEG")
neg_files = [os.path.join(neg_path, f) for f in os.listdir(neg_path)]
dataset = [pos_files, neg_files]

# list of (tokens, label), where label is either 0 (positive) or 1 (negative)
datas = [(list(Tokeniser.tokenise(data)), label)
         for label in xrange(0, 2) for data in dataset[label]]

# load spaCy language model, used to compute dependency structures
nlp = spacy.load('en')
print "spacy loaded"

# TODO rename methods as they are called in the report
methods = [(compute_neg_punc, [], "punc"),
           (compute_neg_dir_dep, [nlp], "dir_dep"),
           (compute_neg_head_obj, [nlp], "head_obj")]
for i in xrange(1, 6, 2):
    methods += [(compute_neg_after_x, [i], "after_{}".format(i))]
# negation_terms = compute_negation_terms()
import os
import sys

n = 0
ok = 0
if sys.argv[1] == 'T':
    inFiles = os.listdir(tInDir)
    outFiles = os.listdir(tOutDir)
    for f in outFiles:
        # truncate any previous output for this test case
        ffile = open(tOutDir + '/' + f[0:7] + '.out', 'w')
        ffile.close()
    for f in inFiles:
        n += 1
        print(f)
        inp = open(tInDir + '/' + f, 'r')
        lex = Tokeniser(''.join(inp.readlines()))
        while True:
            out = open(tOutDir + '/' + f[0:7] + '.out', 'a')
            try:
                t = lex.Next()
            except Exception as err:
                out.write(''.join(err.args) + '\n')
            else:
                if t.tokenType == Token.tokenTypeEOF:
                    expstr = ''
                    exp = open('exp/' + f[0:7] + '.txt', 'r')
                    for line in exp:
                        expstr += line
                    out.close()
                    out = open(tOutDir + '/' + f[0:7] + '.out', 'r')
def check_should_return_true_for_valid_positive_int(number):
    assert Tokeniser.is_positive_int(number)
def check_should_return_true_for_valid_coordinates(number):
    assert Tokeniser.is_coordinate('{},{}'.format(number, number))
def check_should_return_true_for_valid_numbers(number):
    assert Tokeniser.is_number(number)
def check_should_return_true_for_valid_directions(direction):
    assert Tokeniser.is_direction(direction)
def test_tokenise_i(self):
    cases = {
        "well I think": [WordToken("well"), WordToken("i"),
                         WordToken("think")],
    }
    for input, expected in cases.iteritems():
        assert Tokeniser.tokenise_sentence(input) == expected
def check_should_return_true_for_valid_bi_actions(action):
    assert Tokeniser.is_bi_action(action)
def get_ies_scores(self):
    extractor = Extractor()
    ies_filenames = extractor.populate_file_names(self.__ies_accuracy_test)
    ies_filenames = extractor.filter_by_valid_exts(ies_filenames)
    filenames, resume_content = extractor.read_resume_content_tika_api(
        ies_filenames, self.__ies_accuracy_test)
    filenames, resume_content = extractor.remove_empty_resumes(
        filenames, resume_content)
    resume_labels = extractor.read_resume_labels(self.__ies_accuracy_test,
                                                 filenames)

    true_edu_insts = [extractor.get_edu_institutions(xml_tree)
                      for xml_tree in resume_labels]
    true_edu_majors = [extractor.get_edu_majors(xml_tree)
                       for xml_tree in resume_labels]
    true_emp_names = [extractor.get_company_names(xml_tree)
                      for xml_tree in resume_labels]
    true_emp_jtitles = [extractor.get_job_titles(xml_tree)
                        for xml_tree in resume_labels]

    cs = CrfSuite()
    cs.load_tagger()
    annotator = Annotator()
    annotated_resumes = [
        annotator.annotate_using_trained_model(
            self.__ies_accuracy_test + self.__seperator + filename[0] +
            filename[1])
        for filename in filenames
    ]
    predicted_entity_list = [cs.tag_doc(resume)
                             for resume in annotated_resumes]

    ies_edu_insts = [extractor.get_edu_institutions_from_list(entity_list)
                     for entity_list in predicted_entity_list]
    ies_edu_majors = [extractor.get_edu_major_from_list(entity_list)
                      for entity_list in predicted_entity_list]
    ies_emp_names = [extractor.get_company_names_from_list(entity_list)
                     for entity_list in predicted_entity_list]
    ies_emp_jtitles = [extractor.get_company_position_from_list(entity_list)
                       for entity_list in predicted_entity_list]

    tokeniser = Tokeniser()
    true_edu_insts = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_edu_insts))
    true_edu_majors = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_edu_majors))
    true_emp_names = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_emp_names))
    true_emp_jtitles = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_emp_jtitles))
    ies_edu_insts = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(ies_edu_insts))
    ies_edu_majors = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(ies_edu_majors))
    ies_emp_names = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(ies_emp_names))
    ies_emp_jtitles = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(ies_emp_jtitles))

    edu_insts_match_score = self.score_matches(ies_edu_insts,
                                               true_edu_insts)
    edu_majors_match_score = self.score_matches(ies_edu_majors,
                                                true_edu_majors)
    emp_names_match_score = self.score_matches(ies_emp_names,
                                               true_emp_names)
    emp_jtitles_match_score = self.score_matches(ies_emp_jtitles,
                                                 true_emp_jtitles)

    print(edu_insts_match_score)
    print(edu_majors_match_score)
    print(emp_names_match_score)
    print(emp_jtitles_match_score)
class Annotator():
    __job_position_tag = "EMP-POS"
    __job_company_tag = "EMP-COMP"
    __education_course_tag = "EDU-MAJOR"
    __education_institution_tag = "EDU-INST"

    def __init__(self):
        self.__extractor = Extractor()
        self.__tokeniser = Tokeniser()
        self.__tagger = Tagger()
        self.__dataset = Dataset()
        self.__logger = Logger()

    def prepare_dataset(self, nr_of_docs=-1):
        resumes, labels = self.__extractor.read_raw_files(nr_of_docs)
        resumes = self.__tokeniser.tokenise_docs_to_lines(resumes)
        resumes = self.__tokeniser.tokenise_doclines_to_words(resumes)
        self.__dataset.resume_content = self.annotate_docs(resumes, labels)
        self.__dataset.save()

    # resumes: list of tokenised (by line and word) résumé docs
    # labels: xml structure storing labels for several résumés
    def annotate_docs(self, resumes, labels):
        self.__logger.println("annotating resumes")
        annotated_resumes = []
        for idx, resume in enumerate(resumes):
            annotated_resumes.append(self.annotate_doc(resume, labels[idx]))
            self.__logger.println(
                "annotating resume %s/%s with true labels and pos tags" %
                (idx + 1, len(resumes)))
        # non-local NER tags the entire dataset at a time for speed
        annotated_resumes = self.__tagger.nonlocal_ner_tag(annotated_resumes)
        self.__logger.println("completed annotating resumes")
        return annotated_resumes

    # doc: a single résumé document with token strings in each slot of list
    # labels: xml structure storing pre-extracted information
    def annotate_doc(self, doc, labels):
        job_title_list = self.__extractor.get_job_titles(labels)
        job_company_list = self.__extractor.get_company_names(labels)
        edu_major_list = self.__extractor.get_edu_majors(labels)
        edu_inst_list = self.__extractor.get_edu_institutions(labels)
        # can extract more labels here
        prepared_doc = self.__tagger.prepare_doc(doc)
        prepared_doc = self.__match_entity(prepared_doc, job_title_list,
                                           self.__job_position_tag)
        prepared_doc = self.__match_entity(prepared_doc, job_company_list,
                                           self.__job_company_tag)
        prepared_doc = self.__match_entity(prepared_doc, edu_major_list,
                                           self.__education_course_tag)
        prepared_doc = self.__match_entity(prepared_doc, edu_inst_list,
                                           self.__education_institution_tag)
        prepared_doc = self.__tagger.add_default_entity_tags(prepared_doc)
        prepared_doc = self.__tagger.pos_tag(prepared_doc)
        return prepared_doc

    # doc: résumé doc to be annotated
    # entity_list: list of labels to be matched in doc
    # tag: tag to be assigned if a match is found
    def __match_entity(self, doc, entity_list, tag):
        for entity in entity_list:
            doc = self.__tagger.match_label(doc, entity, tag)
        return doc

    # takes a path to a file and annotates it for tagging;
    # ideally used to tag as a one-off for testing
    # filepath: path to résumé
    def annotate_using_trained_model(self, filepath):
        resume_content = self.__extractor.read_resume_content(filepath)
        resume_content = self.__tokeniser.tokenise_docs_to_lines(
            resume_content)
        resume_content = self.__tokeniser.tokenise_doclines_to_words(
            resume_content)
        prepared_doc = self.__tagger.prepare_doc(resume_content[0])
        prepared_doc = self.__tagger.pos_tag(prepared_doc)
        prepared_doc = self.__tagger.nonlocal_ner_tag([prepared_doc])
        return prepared_doc[0]
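# A hedged usage sketch for Annotator: both calls exist in the class above;
# the file path is hypothetical.
annotator = Annotator()
annotator.prepare_dataset(nr_of_docs=100)
tagged_doc = annotator.annotate_using_trained_model(
    "data/raw/sample_resume.pdf")  # hypothetical path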
def check_should_return_true_for_valid_repeat_commands(cmd):
    assert Tokeniser.is_valid_repeat(cmd)
def get_zylon_parser_scores(self):
    """Extract labelled entities from Zylon's xml output and the true
    xml output, compare the entity lists, and return a match score per
    entity type; higher is better.

    Returns:
        edu_insts_match_score, edu_majors_match_score,
        emp_names_match_score, emp_jtitles_match_score
    """
    extractor = Extractor()
    zylon_filenames = extractor.populate_file_names(
        self.__zylon_parser_labels_folder)
    zylon_xml_trees = extractor.read_resume_labels(
        self.__zylon_parser_labels_folder, zylon_filenames)
    true_xml_trees = extractor.read_resume_labels(
        self.__dataset_raw_folder, zylon_filenames)

    true_edu_insts = [extractor.get_edu_institutions(xml_tree)
                      for xml_tree in true_xml_trees]
    true_edu_majors = [extractor.get_edu_majors(xml_tree)
                       for xml_tree in true_xml_trees]
    true_emp_names = [extractor.get_company_names(xml_tree)
                      for xml_tree in true_xml_trees]
    true_emp_jtitles = [extractor.get_job_titles(xml_tree)
                        for xml_tree in true_xml_trees]

    zylon_edu_insts = [extractor.get_edu_institutions_zy(xml_tree)
                       for xml_tree in zylon_xml_trees]
    zylon_edu_majors = [extractor.get_edu_majors_zy(xml_tree)
                        for xml_tree in zylon_xml_trees]
    zylon_emp_names = [extractor.get_company_names_zy(xml_tree)
                       for xml_tree in zylon_xml_trees]
    zylon_emp_jtitles = [extractor.get_job_titles_zy(xml_tree)
                         for xml_tree in zylon_xml_trees]

    tokeniser = Tokeniser()
    true_edu_insts = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_edu_insts))
    true_edu_majors = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_edu_majors))
    true_emp_names = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_emp_names))
    true_emp_jtitles = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(true_emp_jtitles))
    zylon_edu_insts = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(zylon_edu_insts))
    zylon_edu_majors = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(zylon_edu_majors))
    zylon_emp_names = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(zylon_emp_names))
    zylon_emp_jtitles = tokeniser.docs_tolower(
        tokeniser.tokenise_doclines_to_words(zylon_emp_jtitles))

    edu_insts_match_score = self.score_matches(zylon_edu_insts,
                                               true_edu_insts)
    edu_majors_match_score = self.score_matches(zylon_edu_majors,
                                                true_edu_majors)
    emp_names_match_score = self.score_matches(zylon_emp_names,
                                               true_emp_names)
    emp_jtitles_match_score = self.score_matches(zylon_emp_jtitles,
                                                 true_emp_jtitles)

    return (edu_insts_match_score, edu_majors_match_score,
            emp_names_match_score, emp_jtitles_match_score)
def check_should_normalise_correctly_for_bi_commands(cmd):
    t = Tokeniser()
    assert t.normalise(cmd) == {'action': cmd[0], 'magnitude': cmd[1],
                                'direction': None}
class Parser:
    def __init__(self, f):
        self.in_name, self.out_name = f
        self.output = []
        self.tokeniser = Tokeniser(f)
        self.st_handler = SymbolTable()
        self.writer = VMCodeWriter(f)
        self.local_state = {'labeler': labeler()}
        self.parse()
        self.writer.close()

    def parse(self):
        if self.tokeniser.has_next():
            self.compileClass()
        return self.out_name, self.output

    def compileClass(self):
        self.expect(TokenType.KEYWORD, 'class')
        self.local_state['class'] = self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.SYMBOL, '{')
        while self.peek(TokenType.KEYWORD, CLASS_VAR_KEYWORDS):
            self.compileClassVarDec()
        while self.peek(TokenType.KEYWORD, FXN_KEYWORDS):
            self.st_handler.start_subroutine()
            self.compileSubroutine()
        self.expect(TokenType.SYMBOL, '}')
        del self.local_state['class']

    def compileClassVarDec(self):
        kind = keyword_to_kind[self.expect(TokenType.KEYWORD)]
        taipu = self.compileType()
        name = self.expect(TokenType.IDENTIFIER)
        self.st_handler.define(name, taipu, kind)
        self.tryCompileVarList(taipu=taipu, kind=kind)
        self.expect(TokenType.SYMBOL, ";")

    def compileType(self):
        ttype, token = self.tokeniser.peek()
        if ttype == TokenType.KEYWORD and token in BI_TYPES:
            return self.expect(TokenType.KEYWORD, token)
        elif ttype == TokenType.IDENTIFIER:
            return self.expect(TokenType.IDENTIFIER)
        else:
            raise SyntaxError(
                "Expected type in {} or identifier, got: {} of type {}".format(
                    BI_TYPES, token, ttype))

    def tryCompileVarList(self, exp_type=False, taipu=None, kind=None):
        varlist = []
        while self.peek(TokenType.SYMBOL, ","):
            self.expect(TokenType.SYMBOL, ",")
            if exp_type:
                taipu = self.compileType()
            name = self.expect(TokenType.IDENTIFIER)
            varlist.append((name, taipu, kind))
        for entry in varlist:
            self.st_handler.define(*entry)

    def compileSubroutine(self):
        fxn_kind = self.expect(TokenType.KEYWORD, FXN_KEYWORDS)
        if fxn_kind == 'method':
            self.st_handler.define('this', self.local_state['class'],
                                   IdentifierKind.ARGUMENT)
        self.compileType()
        fxn_name = self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.SYMBOL, "(")
        self.compileParameterList()
        self.expect(TokenType.SYMBOL, ")")
        self.compileSubroutineBody(fxn_name, fxn_kind)

    def compileParameterList(self):
        if not self.peek(TokenType.SYMBOL, ")"):
            kind = IdentifierKind.ARGUMENT
            taipu = self.compileType()
            name = self.expect(TokenType.IDENTIFIER)
            self.st_handler.define(name, taipu, kind)
            self.tryCompileVarList(exp_type=True, kind=kind)

    def compileSubroutineBody(self, fxn_name, fxn_kind):
        self.expect(TokenType.SYMBOL, "{")
        while self.peek(TokenType.KEYWORD, "var"):
            self.compileVarDec()
        self.writer.fun_dec(fxn_name,
                            self.st_handler.var_count(IdentifierKind.VAR))
        self.compileFxnKind(fxn_kind)
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")

    def compileFxnKind(self, kind):
        if kind == 'constructor':
            num_fields = self.st_handler.var_count(IdentifierKind.FIELD)
            self.writer.alloc(num_fields)
            self.writer.pop_this_ptr()
        elif kind == 'method':
            self.writer.push_variable('this', self.st_handler)
            self.writer.pop_this_ptr()

    def compileVarDec(self):
        self.expect(TokenType.KEYWORD, "var")
        kind = IdentifierKind.VAR
        taipu = self.compileType()
        name = self.expect(TokenType.IDENTIFIER)
        self.st_handler.define(name, taipu, kind)
        self.tryCompileVarList(taipu=taipu, kind=kind)
        self.expect(TokenType.SYMBOL, ";")

    def compileStatements(self):
        while self.peek(TokenType.KEYWORD, STMT_KEYWORDS):
            self.compileStatement()

    def compileStatement(self):
        if self.peek(TokenType.KEYWORD, "let"):
            self.compileLet()
        elif self.peek(TokenType.KEYWORD, "if"):
            self.compileIf()
        elif self.peek(TokenType.KEYWORD, "while"):
            self.compileWhile()
        elif self.peek(TokenType.KEYWORD, "do"):
            self.compileDo()
        elif self.peek(TokenType.KEYWORD, "return"):
            self.compileReturn()

    def compileLet(self):
        self.expect(TokenType.KEYWORD, "let")
        var = self.expect(TokenType.IDENTIFIER)
        array_assignment = False
        if self.peek(TokenType.SYMBOL, "["):
            array_assignment = True
            self.compileBasePlusOffset(var)
        self.expect(TokenType.SYMBOL, "=")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, ";")
        if array_assignment:
            self.saveToTemp()
            self.popToArray()
        else:
            self.writer.pop_variable(var, self.st_handler)

    def compileBasePlusOffset(self, base):
        self.writer.push_variable(base, self.st_handler)
        self.expect(TokenType.SYMBOL, "[")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, "]")
        self.writer.binary_op("+")

    def saveToTemp(self):
        self.writer.pop('temp', 0)

    def popToArray(self):
        self.writer.pop_that_ptr()
        self.writer.push('temp', 0)
        self.writer.pop_that()

    def compileIf(self):
        self.expect(TokenType.KEYWORD, "if")
        endif = next(self.local_state['labeler'])
        self.compileCond(endif)
        if self.peek(TokenType.KEYWORD, "else"):
            self.expect(TokenType.KEYWORD, "else")
            self.expectBracedStatements()
        self.writer.label(endif)

    def expectBracedStatements(self):
        self.expect(TokenType.SYMBOL, "{")
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")

    def compileWhile(self):
        self.expect(TokenType.KEYWORD, "while")
        loop = next(self.local_state['labeler'])
        self.writer.label(loop)
        self.compileCond(loop)

    def compileCond(self, ret):
        self.expectGroupedExpression()
        self.writer.unary_op('~')
        not_cond = next(self.local_state['labeler'])
        self.writer.ifgoto(not_cond)
        self.expectBracedStatements()
        self.writer.goto(ret)
        self.writer.label(not_cond)

    def expectGroupedExpression(self):
        self.expect(TokenType.SYMBOL, "(")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, ")")

    def compileDo(self):
        self.expect(TokenType.KEYWORD, "do")
        caller = self.expect(TokenType.IDENTIFIER)
        self.compileSubroutineCall(caller)
        self.writer.pop("temp", "0")
        self.expect(TokenType.SYMBOL, ";")

    def compileReturn(self):
        self.expect(TokenType.KEYWORD, "return")
        if not self.peek(TokenType.SYMBOL, ";"):
            self.compileExpression()
        else:
            self.writer.int_const(0)
        self.writer.ret()
        self.expect(TokenType.SYMBOL, ";")

    def compileExpression(self):
        self.compileTerm()
        while self.peek(TokenType.SYMBOL, EXP_SYMBOLS):
            op = self.compileOp()
            self.compileTerm()
            self.writer.binary_op(op)

    def compileOp(self):
        return self.expect(TokenType.SYMBOL)

    def compileTerm(self):
        if self.peek(TokenType.INT_CONST):
            value = self.expect(TokenType.INT_CONST)
            self.writer.int_const(value)
        elif self.peek(TokenType.STR_CONST):
            text = self.compileStrConst()
            self.writer.str_const(text)
        elif self.peek(TokenType.KEYWORD, KEYWORD_CONSTANTS):
            kw = self.expect(TokenType.KEYWORD, KEYWORD_CONSTANTS)
            self.writer.kw_const(kw)
        elif self.peek(TokenType.SYMBOL, UNARY_OPS):
            self.compileUnaryOp()
        elif self.peek(TokenType.SYMBOL, "("):
            self.expectGroupedExpression()
        elif self.tokeniser.has_next():
            t1, token1 = self.tokeniser.next()
            if self.tokeniser.has_next():
                t2, token2 = self.tokeniser.peek()
            if self.peek(TokenType.SYMBOL, "["):
                self.compileArrayAccess(token1)
            elif self.peek(TokenType.SYMBOL, ["(", "."]):
                self.compileSubroutineCall(token1)
            else:
                self.writer.push_variable(token1, self.st_handler)

    def compileStrConst(self):
        ttype, token = self.tokeniser.next()
        return token[1:-1]  # strip the surrounding quotes

    def compileUnaryOp(self):
        op = self.expect(TokenType.SYMBOL, ["-", "~"])
        self.compileTerm()
        self.writer.unary_op(op)

    def compileArrayAccess(self, arr):
        self.compileBasePlusOffset(arr)
        self.writer.pop_that_ptr()
        self.writer.push_that()

    def compileSubroutineCall(self, caller):
        if self.peek(TokenType.SYMBOL, "("):
            method, nargs = self.compileSelfFunctionCall(caller)
            qualified_name = self.local_state['class'] + '.' + method
        elif self.peek(TokenType.SYMBOL, "."):
            method, nargs = self.compileMethodCall(caller)
            qualified_name = self.st_handler.qualify(caller, method)
        self.writer.call(qualified_name, nargs)

    def compileSelfFunctionCall(self, method):
        self.writer.push_this_ptr()
        nargs = self.expectExpressionList() + 1
        return method, nargs

    def compileMethodCall(self, caller):
        nargs = 0
        if self.st_handler.is_object(caller):
            nargs += 1
            self.writer.push_variable(caller, self.st_handler)
        self.expect(TokenType.SYMBOL, ".")
        method = self.expect(TokenType.IDENTIFIER)
        nargs += self.expectExpressionList()
        return method, nargs

    def expectExpressionList(self):
        self.expect(TokenType.SYMBOL, "(")
        nexps = self.compileExpressionList()
        self.expect(TokenType.SYMBOL, ")")
        return nexps

    def compileExpressionList(self):
        nexps = 0
        if not self.peek(TokenType.SYMBOL, ")"):
            self.compileExpression()
            nexps += 1
            while self.peek(TokenType.SYMBOL, ","):
                self.expect(TokenType.SYMBOL, ",")
                self.compileExpression()
                nexps += 1
        return nexps

    def peek(self, e_type, e_token=None):
        if not self.tokeniser.has_next():
            return False
        a_type, a_token = self.tokeniser.peek()
        return self.token_match(e_type, e_token, a_type, a_token)

    def expect(self, e_type, e_token=None):
        a_type, a_token = self.tokeniser.next()
        if self.token_match(e_type, e_token, a_type, a_token):
            return a_token
        else:
            raise SyntaxError(
                "Expected {} of type {}, got {} of type {}".format(
                    e_token, e_type, a_token, a_type))

    def token_match(self, e_type, e_token, a_type, a_token):
        return (e_type == a_type or
                (type(e_type) == list and a_type in e_type)) and \
               (e_token is None or e_token == a_token or
                (type(e_token) == list and a_token in e_token))
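# A hedged usage sketch: __init__ drives the whole pipeline (parse, then
# close the writer), so constructing the parser compiles the file. It
# assumes f is the (input, output) filename pair unpacked in __init__;
# the filenames are hypothetical.
Parser(('Square.jack', 'Square.vm'))  # emits VM code via VMCodeWriter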
def test_should_return_true_for_empty_lists():
    assert Tokeniser.is_empty([])
def check_should_return_true_for_valid_comments(comment):
    assert Tokeniser.is_comment(comment)
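# Minimal sketches of some Tokeniser predicates exercised by the checks
# above, assuming string-like inputs; the command grammar (directions,
# bi-actions, repeat and comment syntax) is not specified in the source,
# so those predicates are deliberately left out rather than guessed.
def is_positive_int(value):
    return str(value).isdigit() and int(value) > 0

def is_number(value):
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False

def is_coordinate(value):
    parts = str(value).split(',')
    return len(parts) == 2 and all(is_number(p) for p in parts)

def is_empty(tokens):
    return len(tokens) == 0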
class Parser:
    def __init__(self, f):
        self.in_name, self.out_name = f
        self.output = []
        self.tokeniser = Tokeniser(f)
        self.depth = 0
        self.parse()

    def parse(self):
        if self.tokeniser.has_next():
            self.compileClass()
        return self.out_name, self.output

    def compileClass(self):
        self.open_tag('class')
        self.expect(TokenType.KEYWORD, 'class')
        self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.SYMBOL, '{')
        while self.peek(TokenType.KEYWORD, ['static', 'field']):
            self.compileClassVarDec()
        while self.peek(TokenType.KEYWORD,
                        ['function', 'constructor', 'method']):
            self.compileSubroutine()
        self.expect(TokenType.SYMBOL, '}')
        self.close_tag('class')

    def compileClassVarDec(self):
        self.open_tag('classVarDec')
        self.expect(TokenType.KEYWORD)
        self.compileType()
        self.expect(TokenType.IDENTIFIER)
        self.tryCompileVarList()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag('classVarDec')
        return (self.tokeniser.peek() if self.tokeniser.has_next()
                else (None, None))

    def compileType(self):
        ttype, token = self.tokeniser.peek()
        type_list = ["void", "int", "char", "boolean"]
        if ttype == TokenType.KEYWORD and token in type_list:
            self.expect(TokenType.KEYWORD, token)
        elif ttype == TokenType.IDENTIFIER:
            self.expect(TokenType.IDENTIFIER)
        else:
            raise SyntaxError(
                "Expected type in {} or identifier, got: {} of type {}".format(
                    type_list, token, ttype))

    def tryCompileVarList(self, exp_type=False):
        while self.peek(TokenType.SYMBOL, ","):
            self.expect(TokenType.SYMBOL, ",")
            if exp_type:
                self.compileType()
            self.expect(TokenType.IDENTIFIER)

    def compileSubroutine(self):
        self.open_tag('subroutineDec')
        self.expect(TokenType.KEYWORD, ['constructor', 'function', 'method'])
        self.expect([TokenType.KEYWORD, TokenType.IDENTIFIER])
        self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.SYMBOL, "(")
        self.compileParameterList()
        self.expect(TokenType.SYMBOL, ")")
        self.compileSubroutineBody()
        self.close_tag('subroutineDec')
        return (self.tokeniser.peek() if self.tokeniser.has_next()
                else (None, None))

    def compileParameterList(self):
        self.open_tag('parameterList')
        if not self.peek(TokenType.SYMBOL, ")"):
            self.compileType()
            self.expect(TokenType.IDENTIFIER)
            self.tryCompileVarList(True)
        self.close_tag('parameterList')

    def compileSubroutineBody(self):
        self.open_tag('subroutineBody')
        self.expect(TokenType.SYMBOL, "{")
        while self.peek(TokenType.KEYWORD, "var"):
            self.compileVarDec()
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")
        self.close_tag('subroutineBody')

    def compileVarDec(self):
        self.open_tag("varDec")
        self.expect(TokenType.KEYWORD, "var")
        self.compileType()
        self.expect(TokenType.IDENTIFIER)
        self.tryCompileVarList()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag("varDec")

    def compileStatements(self):
        self.open_tag("statements")
        while self.peek(TokenType.KEYWORD,
                        ['let', 'if', 'while', 'do', 'return']):
            self.compileStatement()
        self.close_tag("statements")

    def compileStatement(self):
        if self.peek(TokenType.KEYWORD, "let"):
            self.compileLet()
        elif self.peek(TokenType.KEYWORD, "if"):
            self.compileIf()
        elif self.peek(TokenType.KEYWORD, "while"):
            self.compileWhile()
        elif self.peek(TokenType.KEYWORD, "do"):
            self.compileDo()
        elif self.peek(TokenType.KEYWORD, "return"):
            self.compileReturn()

    def compileLet(self):
        self.open_tag("letStatement")
        self.expect(TokenType.KEYWORD, "let")
        self.expect(TokenType.IDENTIFIER)
        if self.peek(TokenType.SYMBOL, "["):
            self.expect(TokenType.SYMBOL, "[")
            self.compileExpression()
            self.expect(TokenType.SYMBOL, "]")
        self.expect(TokenType.SYMBOL, "=")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag("letStatement")

    def compileIf(self):
        self.open_tag("ifStatement")
        self.expect(TokenType.KEYWORD, "if")
        self.expectGroupedExpression()
        self.expect(TokenType.SYMBOL, "{")
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")
        if self.peek(TokenType.KEYWORD, "else"):
            self.expect(TokenType.KEYWORD, "else")
            self.expect(TokenType.SYMBOL, "{")
            self.compileStatements()
            self.expect(TokenType.SYMBOL, "}")
        self.close_tag("ifStatement")

    def expectGroupedExpression(self):
        self.expect(TokenType.SYMBOL, "(")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, ")")

    def compileWhile(self):
        self.open_tag("whileStatement")
        self.expect(TokenType.KEYWORD, "while")
        self.expectGroupedExpression()
        self.expect(TokenType.SYMBOL, "{")
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")
        self.close_tag("whileStatement")

    def compileDo(self):
        self.open_tag("doStatement")
        self.expect(TokenType.KEYWORD, "do")
        self.expect(TokenType.IDENTIFIER)
        self.compileSubroutineCall()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag("doStatement")

    def compileReturn(self):
        self.open_tag("returnStatement")
        self.expect(TokenType.KEYWORD, "return")
        if not self.peek(TokenType.SYMBOL, ";"):
            self.compileExpression()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag("returnStatement")

    def compileExpression(self):
        self.open_tag("expression")
        self.compileTerm()
        while self.peek(TokenType.SYMBOL, list("+-*/&|<>=")):
            self.compileOp()
            self.compileTerm()
        self.close_tag("expression")

    def compileOp(self):
        self.expect(TokenType.SYMBOL)

    def compileTerm(self):
        self.open_tag("term")
        if self.peek(TokenType.INT_CONST):
            self.expect(TokenType.INT_CONST)
        elif self.peek(TokenType.STR_CONST):
            self.compileStrConst()
        elif self.peek(TokenType.KEYWORD, ['true', 'false', 'null', 'this']):
            self.compileKeywordConstant()
        elif self.peek(TokenType.SYMBOL, ["-", "~"]):
            self.compileUnaryOp()
        elif self.peek(TokenType.SYMBOL, "("):
            self.expectGroupedExpression()
        elif self.tokeniser.has_next():
            t1, token1 = self.tokeniser.next()
            self.terminal_tag(t1, token1)
            if self.tokeniser.has_next():
                t2, token2 = self.tokeniser.peek()
            if self.peek(TokenType.SYMBOL, "["):
                self.compileArrayAccess()
            elif self.peek(TokenType.SYMBOL, ["(", "."]):
                self.compileSubroutineCall()
        self.close_tag("term")

    def compileStrConst(self):
        ttype, token = self.tokeniser.next()
        self.terminal_tag(TokenType.STR_CONST, token[1:-1])

    def compileKeywordConstant(self):
        self.expect(TokenType.KEYWORD, ['true', 'false', 'null', 'this'])

    def compileUnaryOp(self):
        self.expect(TokenType.SYMBOL, ["-", "~"])
        self.compileTerm()

    def compileArrayAccess(self):
        self.expect(TokenType.SYMBOL, "[")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, "]")

    def compileSubroutineCall(self):
        if self.peek(TokenType.SYMBOL, "("):
            self.expectExpressionList()
        elif self.peek(TokenType.SYMBOL, "."):
            self.expect(TokenType.SYMBOL, ".")
            self.expect(TokenType.IDENTIFIER)
            self.expectExpressionList()

    def expectExpressionList(self):
        self.expect(TokenType.SYMBOL, "(")
        self.compileExpressionList()
        self.expect(TokenType.SYMBOL, ")")

    def compileExpressionList(self):
        self.open_tag("expressionList")
        if not self.peek(TokenType.SYMBOL, ")"):
            self.compileExpression()
            while self.peek(TokenType.SYMBOL, ","):
                self.expect(TokenType.SYMBOL, ",")
                self.compileExpression()
        self.close_tag("expressionList")

    def peek(self, e_type, e_token=None, expect=True):
        if not self.tokeniser.has_next():
            return False
        a_type, a_token = self.tokeniser.peek()
        return self.token_match(e_type, e_token, a_type, a_token)

    def expect(self, e_type, e_token=None):
        if not self.tokeniser.has_next():
            return None, None
        a_type, a_token = self.tokeniser.next()
        if self.token_match(e_type, e_token, a_type, a_token):
            self.terminal_tag(a_type, a_token)
            return (self.tokeniser.peek() if self.tokeniser.has_next()
                    else (None, None))
        else:
            raise SyntaxError(
                "Expected {} of type {}, got {} of type {}".format(
                    e_token, e_type, a_token, a_type))

    def token_match(self, e_type, e_token, a_type, a_token):
        return (e_type == a_type or
                (type(e_type) == list and a_type in e_type)) and \
               (e_token is None or e_token == a_token or
                (type(e_token) == list and a_token in e_token))

    def open_tag(self, tag_name, value=''):
        self.output.append('{}<{}>{}'.format(' ' * self.depth,
                                             escape(tag_name),
                                             escape(value)))
        self.depth += 2

    def close_tag(self, tag_name, newline=True):
        self.depth -= 2
        tag = '</{}>'.format(escape(tag_name))
        if newline or not self.output:
            self.output.append(' ' * self.depth + tag)
        else:
            self.output[-1] += tag

    def terminal_tag(self, tag_name, value):
        self.depth += 2
        self.open_tag(str(tag_name), value=value)
        self.close_tag(str(tag_name), False)
        self.depth -= 2