Example no. 1
 def __init__(self, f):
     self.in_name, self.out_name = f
     self.output = []
     self.tokeniser = Tokeniser(f)
     self.st_handler = SymbolTable()
     self.writer = VMCodeWriter(f)
     self.local_state = {'labeler': labeler()}
     self.parse()
     self.writer.close()
    def __init__(self):
        extractor = Extractor()
        filenames = extractor.populate_file_names(self.__manual_anno_folder)
        valid_filenames = extractor.filter_by_valid_exts(filenames)
        valid_filenames, resume_content = extractor.read_resume_content_tika_api(
            valid_filenames, self.__manual_anno_folder)

        tokeniser = Tokeniser()
        tokenised_docs = tokeniser.tokenise_docs_to_lines(resume_content)

        dataset = Dataset()
        dataset.save_doc_lines(tokenised_docs, valid_filenames,
                               self.__manual_anno_processed)
Example no. 3
 def test_tokenise_nt(self):
     cases = {
         "Don't": [WordToken("do"), WordToken("not")],
         "hasn't": [WordToken("has"), WordToken("not")]
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 4
 def test_tokenise_ve(self):
     # I've -> I have, as there is no ambiguity
     cases = {
         "I've": [WordToken("i"), WordToken("have")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
def add_content_features(featuredict, content):
    """Add features from note content.

    Derive the following features from note content and add them to the
    featuredict with binary values:

        CONTENT-TOKEN-<token>: Set for each unique, case-folded token in the
            note content (not including markup).
        CONTENT-MEDIA-<mimetype>: Set for each mimetype used for media in the
            note.
        CONTENT-HASLINK: Set if the note contains one or more links.
        CONTENT-LINK-<domain>: Set with the domain of each link in the note.
        CONTENT-TODO: Set if the note contains a todo.

    Args:
        featuredict: A dict.
        content: File-like object containing the note content.
    """
    parser = etree.HTMLParser()
    root = etree.parse(content, parser).getroot()
    string_content = unicode(root.xpath('string()'))
    for token in Tokeniser.split(string_content):
        featuredict["CONTENT-TOKEN-%s" % token.lower()] = 1
    for media in root.iterfind(".//en-media"):
        featuredict["CONTENT-MEDIA-%s" % media.get("type")] = 1
    for link in root.iterfind(".//a"):
        url = link.get("href")
        if url is not None:
            featuredict["CONTENT-HASLINK"] = 1
            netloc = urlparse(link.get("href")).netloc
            if netloc:
                featuredict["CONTENT-LINK-%s" % netloc] = 1
    if root.find(".//en-todo") is not None:
        featuredict["CONTENT-TODO"] = 1
Example no. 6
 def test_tokenise_comma(self):
     cases = {
         "I, for one.": [WordToken("i"), PunctuationToken(","), WordToken("for"), WordToken("one"),
                         PunctuationToken(".")]
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 7
 def test_tokenise_ll(self):
     cases = {
         "I'll": [WordToken("i"), WordToken("will")],
         "Sam'll": [WordToken("sam"), WordToken("will")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 8
 def test_tokenise_s(self):
     # 's is ambiguous (has / genitive / is), so keep the clitic unexpanded
     cases = {
         "It's": [WordToken("it"), WordToken("'s")],
         "He's": [WordToken("he"), WordToken("'s")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 9
 def test_tokenise_d(self):
     # 'd is ambiguous (he'd -> he had / he would), so keep the clitic unexpanded
     cases = {
         "It'd": [WordToken("it"), WordToken("'d")],
         "He'd": [WordToken("he"), WordToken("'d")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
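
The contraction tests above (Examples 3, 4, 7, 8 and 9) encode one policy for apostrophe clitics: expand the unambiguous ones (n't, 've, 'll) and keep the ambiguous ones ('s, 'd) as separate, unexpanded tokens. A minimal sketch of that policy follows; the helper name and the surrounding WordToken/case-folding machinery are assumptions, not the project's tokenise_sentence.

import re

# Hypothetical sketch of the clitic rules exercised by the tests above; the
# real Tokeniser.tokenise_sentence, WordToken construction and case folding
# are not shown here.
UNAMBIGUOUS_CLITICS = {"n't": "not", "'ve": "have", "'ll": "will"}

def split_clitics(word):
    """Return the pieces a single word should be tokenised into."""
    lowered = word.lower()
    for suffix, expansion in UNAMBIGUOUS_CLITICS.items():
        if lowered.endswith(suffix) and len(word) > len(suffix):
            # "don't" -> ["do", "not"], "I'll" -> ["I", "will"]
            return [word[:-len(suffix)], expansion]
    match = re.search(r"('s|'d)$", word, re.IGNORECASE)
    if match:
        # 's (has / genitive / is) and 'd (had / would) are ambiguous, so the
        # clitic is kept as its own token instead of being expanded.
        return [word[:match.start()], match.group(1).lower()]
    return [word]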
Example no. 10
 def test_compute_weighted(self):
     cases = [
         ("agilely agility agitate", 0),
         ("agilely agitated of", -0.5)
     ]
     for s, expected in cases:
         tokens = Tokeniser.tokenise_sentence(s)
         assert SymbolicScore.compute_weighted(tokens, self.lexicon) == expected
Example no. 11
 def test_tokenise_bracket(self):
     cases = {
         "(I, for one.)": [PunctuationToken("("), WordToken("i"), PunctuationToken(","), WordToken("for"),
                           WordToken("one"),
                           PunctuationToken("."), PunctuationToken(")")]
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 12
 def test_tokenise_hyphen(self):
     # "eight-year-old-child"
     # 8-year-old ? cf tokenise_id
     cases = {
         "eight-year-old child": [WordToken("eight"), PunctuationToken("-"), WordToken("year"),
                                  PunctuationToken("-"), WordToken("old"), WordToken("child")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 13
 def test_tokenise_id(self):
     # B456F7-3
     cases = {
         "like B456F7-3": [WordToken("like"), WordToken("B456F7-3")],
         "like B456F7-3-like": [WordToken("like"), WordToken("B456F7-3-like")],
         "8-years-old": [WordToken("8-years-old")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 14
 def test_tokenise_capital_middle(self):
     # I thought it was GREAT -> GREAT should keep its capitalisation
     # I love Paris -> Paris should keep its capitalisation
     # Paris I love -> paris + i + love (sentence-initial Paris and I both go lowercase)
     cases = {
         "I thought it was GREAT": [WordToken("i"), WordToken("thought"), WordToken("it"), WordToken("was"),
                                    WordToken("GREAT")],
         "I love Paris": [WordToken("i"), WordToken("love"), WordToken("Paris")],
         "Paris I love": [WordToken("paris"), WordToken("i"), WordToken("love")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 15
 def test_period(self):
     # Ph.D -> Ph.D
     # U.S.A. -> U.S.A + period if end of sentence, U.S.A if middle
     cases = {
         "I have a Ph.D.": [WordToken("i"), WordToken("have"), WordToken("a"), WordToken("Ph.D"),
                            PunctuationToken(".")],
         "Make U.K. great again.": [WordToken("make"), WordToken("U.K"), PunctuationToken("."), WordToken("great"),
                                    WordToken("again"),
                                    PunctuationToken(".")]
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 16
 def test_tokenise_slash(self):
     # love/hate relationship
     cases = {
         "love/hate relationship": [WordToken("love"), PunctuationToken("/"), WordToken("hate"),
                                    WordToken("relationship")],
         "this love/ hate relationship": [WordToken("this"), WordToken("love"), PunctuationToken("/"),
                                          WordToken("hate"),
                                          WordToken("relationship")],
         "weird-love /hate relationship": [WordToken("weird"), PunctuationToken("-"), WordToken("love"),
                                           PunctuationToken("/"), WordToken("hate"),
                                           WordToken("relationship")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 17
class Lexer:

    """ Intel C/C++ Compiler optimization report lexical analyser  """

    def __init__(self, report_filename = ""):
        self.report_filename = report_filename
        self.scanner = Scanner(self, self.report_filename)
        self.tokeniser = Tokeniser(self)
        self.token_num = 0

    def get_token_num(self):
        return self.token_num

    def get_next_token(self):
        token = self.tokeniser.tokenise_lexeme( self.scanner.get_next_lexeme() )
        if token != TokenClass.EOR:
            self.token_num += 1
        return token
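
A hypothetical driver loop for this Lexer; TokenClass.EOR (end of report) is taken from the comparison inside get_next_token(), and the report filename is made up.

# Hypothetical driver; Scanner, Tokeniser and TokenClass are the dependencies
# of the Lexer shown above.
lexer = Lexer("optimisation_report.txt")
token = lexer.get_next_token()
while token != TokenClass.EOR:   # EOR marks the end of the report
    token = lexer.get_next_token()
print("tokens read: {}".format(lexer.get_token_num()))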
    def test_tokenise_lines(self):
        tokeniser = Tokeniser()
        # each slot is résumé plain text
        input_docs = [
            "sample resume output\rsample resume output",
            "\rsample resume output\rsample resume output",
            "sample resume output\nsample resume output",
            "\nsample resume output\nsample resume output"
        ]

        # each slot is the list of lines found in the corresponding résumé
        correct_output = [["sample resume output", "sample resume output"],
                          ["", "sample resume output", "sample resume output"],
                          ["sample resume output", "sample resume output"],
                          ["", "sample resume output", "sample resume output"]]

        output = tokeniser.tokenise_docs_to_lines(input_docs)
        self.assertEqual(output, correct_output)
    def test_tokenise_words(self):
        tokeniser = Tokeniser()

        # each slot is a line within a résumé
        input_lines = [[
            "sample resume output sample resume output",
            "  sample resume output sample resume output  ",
            "sample resume output.            sample resume output", ""
        ]]

        # each line becomes a list of word tokens; empty lines are dropped
        correct_output = [
            [["sample", "resume", "output", "sample", "resume", "output"],
             ["sample", "resume", "output", "sample", "resume", "output"],
             ["sample", "resume", "output", "sample", "resume", "output"]]
        ]

        output = tokeniser.tokenise_doclines_to_words(input_lines)
        self.assertEqual(output, correct_output)
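
A minimal sketch of a Tokeniser that would satisfy the two tests above, assuming the real class does nothing more elaborate than line splitting and a word regex; this is an illustration, not the project's implementation.

import re

class Tokeniser(object):
    # Hypothetical sketch matching only the behaviour the two tests above
    # check; the real class is not shown in this example.
    def tokenise_docs_to_lines(self, docs):
        # splitlines() treats \r, \n and \r\n uniformly, which is what the
        # expected output in test_tokenise_lines requires.
        return [doc.splitlines() for doc in docs]

    def tokenise_doclines_to_words(self, docs):
        tokenised_docs = []
        for doc in docs:
            doc_tokens = []
            for line in doc:
                words = re.findall(r"\w+", line)  # drops punctuation like "."
                if words:                         # empty lines produce no slot
                    doc_tokens.append(words)
            tokenised_docs.append(doc_tokens)
        return tokenised_docs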
def add_metadata_features(featuredict, note):
    """Add features from note metadata.

    Derive the following features from the Note and add them to the
    featuredict with binary values:

        META-TITLETOKEN-<token>: Set for each unique, case-folded token in
            the note title.
        META-URL-<domain>: Set with the domain of the note URL, if one is
            provided.
        META-HASURL: Set if the note has a URL.
        META-HASLOCATION: Set if the note has a latitude.
        META-SOURCE-<source>: Set with the source of the note, if it is
            provided.
        META-PLACE-<place>: Set with the place name of the note, if it is
            provided.
        META-CONTENTCLASS-<class>: Set with the content class of the note, if
            it is provided.

    Args:
        featuredict: A dict.
        note: Note object.
    """
    for token in Tokeniser.split(unicode(note.title, encoding="utf-8")):
        featuredict["META-TITLETOKEN-%s" % token.lower()] = 1
    if note.attributes.sourceURL:
        netloc = urlparse(note.attributes.sourceURL).netloc
        if netloc:
            featuredict["META-URL-%s" % netloc] = 1
            featuredict["META-HASURL"] = 1
    if note.attributes.latitude is not None:
        featuredict["META-HASLOCATION"] = 1
    if note.attributes.source:
        featuredict["META-SOURCE-%s" % note.attributes.source] = 1
    if note.attributes.placeName:
        featuredict["META-PLACE-%s" % note.attributes.placeName] = 1
    if note.attributes.contentClass:
        featuredict["META-CONTENTCLASS-%s" % note.attributes.contentClass] = 1
Example no. 21
 def __init__(self, report_filename = ""):
     self.report_filename = report_filename
     self.scanner = Scanner(self, self.report_filename)
     self.tokeniser = Tokeniser(self)
     self.token_num = 0
 def test_currency(self):
     tokens = Tokeniser.split("hi there $100 man")
     self.assertEqual(tokens, ["hi", "there", "$100", "man"])
 def test_punctuation(self):
     tokens = Tokeniser.split("hi there, you ...")
     self.assertEqual(tokens, ["hi", "there", ",", "you", "..."])
 def test_unicode(self):
     tokens = Tokeniser.split(u'hi theré')
     self.assertEqual(tokens, ["hi", u'theré'])
Example no. 25
 def test_tokenise_whitespace(self):
     cases = {
         "an    apple.": [WordToken("an"), WordToken("apple"), PunctuationToken(".")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
 def test_multiline(self):
     tokens = Tokeniser.split("hi there\nsecond line")
     self.assertEqual(tokens, ["hi", "there", "second", "line"])
Example no. 27
from naive_bayes_neg import NaiveBayesNeg
from negation import compute_neg_punc, compute_neg_dir_dep, compute_neg_head_obj, compute_neg_after_x
from symbolic import LexiconGenerator
from symbolic_neg import compute_negation_terms, SymbolicClassifier
from tokeniser import Tokeniser

no_negation = lambda x, y: [False] * len(x)

pos_path = os.path.abspath("../data/POS")
pos_files = [os.path.join(pos_path, f) for f in os.listdir(pos_path)]
neg_path = os.path.abspath("../data/NEG")
neg_files = [os.path.join(neg_path, f) for f in os.listdir(neg_path)]
dataset = [pos_files, neg_files]

# list of (tokens, label) pairs, where label is 0 for positive or 1 for negative
datas = [(list(Tokeniser.tokenise(data)), label) for label in xrange(0, 2)
         for data in dataset[label]]

# load spacy language model, used to compute dependency structures
nlp = spacy.load('en')
print "spacy loaded"

# TODO rename methods as they are called in the report
methods = [(compute_neg_punc, [], "punc"),
           (compute_neg_dir_dep, [nlp], "dir_dep"),
           (compute_neg_head_obj, [nlp], "head_obj")]
for i in xrange(1, 6, 2):
    methods += [(compute_neg_after_x, [i], "after_{}".format(i))]

#
negation_terms = compute_negation_terms()
Example no. 28
n = 0
ok = 0

if sys.argv[1] == 'T':
    inFiles = os.listdir(tInDir)
    outFiles = os.listdir(tOutDir)

    for f in outFiles:
        ffile = open(tOutDir + '/' + f[0:7] + '.out', 'w')
        ffile.close()

    for f in inFiles:
        n += 1
        print(f)
        inp = open(tInDir + '/' + f, 'r')
        lex = Tokeniser(''.join(inp.readlines()))
        while True:
            out = open(tOutDir + '/' + f[0:7] + '.out', 'a')
            try:
                t = lex.Next()
            except Exception as err:
                out.write(''.join(err.args) + '\n')
            else:
                if t.tokenType == Token.tokenTypeEOF:
                    expstr = ''
                    exp = open('exp/' + f[0:7] + '.txt', 'r')
                    for line in exp:
                        expstr += line

                    out.close()
                    out = open(tOutDir + '/' + f[0:7] + '.out', 'r')
Example no. 29
def check_should_return_true_for_valid_positive_int(number):
  assert Tokeniser.is_positive_int(number)
Example no. 30
def check_should_return_true_for_valid_coordinates(number):
  assert Tokeniser.is_coordinate('{},{}'.format(number, number))
Example no. 31
def check_should_return_true_for_valid_numbers(number):
  assert Tokeniser.is_number(number)
Example no. 32
 def __init__(self, f):
     self.in_name, self.out_name = f
     self.output = []
     self.tokeniser = Tokeniser(f)
     self.depth = 0
     self.parse()
Example no. 33
def check_should_return_true_for_valid_directions(direction):
  assert Tokeniser.is_direction(direction)
Example no. 34
 def test_tokenise_i(self):
     cases = {
         "well I think": [WordToken("well"), WordToken("i"), WordToken("think")],
     }
     for input, expected in cases.iteritems():
         assert Tokeniser.tokenise_sentence(input) == expected
Example no. 35
def check_should_return_true_for_valid_bi_actions(action):
  assert Tokeniser.is_bi_action(action)
    def get_ies_scores(self):
        extractor = Extractor()
        ies_filenames = extractor.populate_file_names(self.__ies_accuracy_test)
        ies_filenames = extractor.filter_by_valid_exts(ies_filenames)
        filenames, resume_content = extractor.read_resume_content_tika_api(
            ies_filenames, self.__ies_accuracy_test)
        filenames, resume_content = extractor.remove_empty_resumes(
            filenames, resume_content)
        resume_labels = extractor.read_resume_labels(self.__ies_accuracy_test,
                                                     filenames)

        true_edu_insts = [
            extractor.get_edu_institutions(xml_tree)
            for xml_tree in resume_labels
        ]
        true_edu_majors = [
            extractor.get_edu_majors(xml_tree) for xml_tree in resume_labels
        ]
        true_emp_names = [
            extractor.get_company_names(xml_tree) for xml_tree in resume_labels
        ]
        true_emp_jtitles = [
            extractor.get_job_titles(xml_tree) for xml_tree in resume_labels
        ]

        cs = CrfSuite()
        cs.load_tagger()
        annotator = Annotator()
        annotated_resumes = [
            annotator.annotate_using_trained_model(self.__ies_accuracy_test +
                                                   self.__seperator +
                                                   filename[0] + filename[1])
            for filename in filenames
        ]
        predicted_entity_list = [
            cs.tag_doc(resume) for resume in annotated_resumes
        ]

        ies_edu_insts = [
            extractor.get_edu_institutions_from_list(entity_list)
            for entity_list in predicted_entity_list
        ]
        ies_edu_majors = [
            extractor.get_edu_major_from_list(entity_list)
            for entity_list in predicted_entity_list
        ]
        ies_emp_names = [
            extractor.get_company_names_from_list(entity_list)
            for entity_list in predicted_entity_list
        ]
        ies_emp_jtitles = [
            extractor.get_company_position_from_list(entity_list)
            for entity_list in predicted_entity_list
        ]

        tokeniser = Tokeniser()
        true_edu_insts = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_edu_insts))
        true_edu_majors = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_edu_majors))
        true_emp_names = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_emp_names))
        true_emp_jtitles = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_emp_jtitles))

        ies_edu_insts = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(ies_edu_insts))
        ies_edu_majors = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(ies_edu_majors))
        ies_emp_names = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(ies_emp_names))
        ies_emp_jtitles = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(ies_emp_jtitles))

        edu_insts_match_score = self.score_matches(ies_edu_insts,
                                                   true_edu_insts)
        edu_majors_match_score = self.score_matches(ies_edu_majors,
                                                    true_edu_majors)
        emp_names_match_score = self.score_matches(ies_emp_names,
                                                   true_emp_names)
        emp_jtitles_match_score = self.score_matches(ies_emp_jtitles,
                                                     true_emp_jtitles)
        print(edu_insts_match_score)
        print(edu_majors_match_score)
        print(emp_names_match_score)
        print(emp_jtitles_match_score)
class Annotator():
    __job_position_tag = "EMP-POS"
    __job_company_tag = "EMP-COMP"

    __education_course_tag = "EDU-MAJOR"
    __education_institution_tag = "EDU-INST"

    def __init__(self):
        self.__extractor = Extractor()
        self.__tokeniser = Tokeniser()
        self.__tagger = Tagger()
        self.__dataset = Dataset()
        self.__logger = Logger()

    def prepare_dataset(self, nr_of_docs=-1):
        resumes, labels = self.__extractor.read_raw_files(nr_of_docs)

        resumes = self.__tokeniser.tokenise_docs_to_lines(resumes)
        resumes = self.__tokeniser.tokenise_doclines_to_words(resumes)

        self.__dataset.resume_content = self.annotate_docs(resumes, labels)
        self.__dataset.save()

    # resumes: list of tokenised (by line and word) résumé docs
    # labels: xml structure storing labels for several resumes
    def annotate_docs(self, resumes, labels):
        self.__logger.println("annotating resumes")
        annotated_resumes = []
        for idx, resume in enumerate(resumes):
            annotated_resumes.append(self.annotate_doc(resume, labels[idx]))
            self.__logger.println(
                "annotating resume %s/%s with true labels and pos tags" %
                (idx + 1, len(resumes)))

        # non-local NER tagging runs over the entire dataset at once for speed
        annotated_resumes = self.__tagger.nonlocal_ner_tag(annotated_resumes)
        self.__logger.println("completed annotating resumes")
        return annotated_resumes

    # doc: a single résumé document with token strings in each slot of list
    # labels: xml structure storing pre-extracted information
    def annotate_doc(self, doc, labels):
        job_title_list = self.__extractor.get_job_titles(labels)
        job_company_list = self.__extractor.get_company_names(labels)
        edu_major_list = self.__extractor.get_edu_majors(labels)
        edu_inst_list = self.__extractor.get_edu_institutions(labels)
        # can extract more labels here

        prepared_doc = self.__tagger.prepare_doc(doc)
        prepared_doc = self.__match_entity(prepared_doc, job_title_list,
                                           self.__job_position_tag)
        prepared_doc = self.__match_entity(prepared_doc, job_company_list,
                                           self.__job_company_tag)
        prepared_doc = self.__match_entity(prepared_doc, edu_major_list,
                                           self.__education_course_tag)
        prepared_doc = self.__match_entity(prepared_doc, edu_inst_list,
                                           self.__education_institution_tag)
        prepared_doc = self.__tagger.add_default_entity_tags(prepared_doc)

        prepared_doc = self.__tagger.pos_tag(prepared_doc)

        return prepared_doc

    # doc: résumé doc to be annotated
    # entity_list: list of labels to matched in doc
    # tag: tag to be assigned if match found
    def __match_entity(self, doc, entity_list, tag):
        for entity in entity_list:
            doc = self.__tagger.match_label(doc, entity, tag)
        return doc

    # takes a path to a file and annotates it for tagging;
    # ideally used as a one-off to tag a single résumé for testing
    # filepath: path to résumé
    def annotate_using_trained_model(self, filepath):
        resume_content = self.__extractor.read_resume_content(filepath)

        resume_content = self.__tokeniser.tokenise_docs_to_lines(
            resume_content)
        resume_content = self.__tokeniser.tokenise_doclines_to_words(
            resume_content)

        prepared_doc = self.__tagger.prepare_doc(resume_content[0])
        prepared_doc = self.__tagger.pos_tag(prepared_doc)
        prepared_doc = self.__tagger.nonlocal_ner_tag([prepared_doc])

        return prepared_doc[0]
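
A hypothetical way to drive the Annotator above; the corpus size and the résumé path are illustrative only.

# Hypothetical usage sketch of the Annotator class above.
annotator = Annotator()

# Build and save an annotated training set from the first 50 raw résumés.
annotator.prepare_dataset(nr_of_docs=50)

# Or annotate a single résumé with an already-trained model.
tagged_doc = annotator.annotate_using_trained_model("/tmp/sample_resume.pdf")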
Example no. 38
def check_should_return_true_for_valid_repeat_commands(cmd):
  assert Tokeniser.is_valid_repeat(cmd)
 def __init__(self):
     self.__extractor = Extractor()
     self.__tokeniser = Tokeniser()
     self.__tagger = Tagger()
     self.__dataset = Dataset()
     self.__logger = Logger()
    def get_zylon_parser_scores(self):
        """
        parameters: none

        Extracts labelled entities from zylon's xml output and true xml
        output. Compares the entity lists and returns a score, higher is
        better.
        
        return: edu_insts_match_score, edu_majors_match_score, emp_names_match_score, emp_jtitles_match_score
        """
        extractor = Extractor()
        zylon_filenames = extractor.populate_file_names(
            self.__zylon_parser_labels_folder)

        zylon_xml_trees = extractor.read_resume_labels(
            self.__zylon_parser_labels_folder, zylon_filenames)
        true_xml_trees = extractor.read_resume_labels(
            self.__dataset_raw_folder, zylon_filenames)

        true_edu_insts = [
            extractor.get_edu_institutions(xml_tree)
            for xml_tree in true_xml_trees
        ]
        true_edu_majors = [
            extractor.get_edu_majors(xml_tree) for xml_tree in true_xml_trees
        ]
        true_emp_names = [
            extractor.get_company_names(xml_tree)
            for xml_tree in true_xml_trees
        ]
        true_emp_jtitles = [
            extractor.get_job_titles(xml_tree) for xml_tree in true_xml_trees
        ]

        zylon_edu_insts = [
            extractor.get_edu_institutions_zy(xml_tree)
            for xml_tree in zylon_xml_trees
        ]
        zylon_edu_majors = [
            extractor.get_edu_majors_zy(xml_tree)
            for xml_tree in zylon_xml_trees
        ]
        zylon_emp_names = [
            extractor.get_company_names_zy(xml_tree)
            for xml_tree in zylon_xml_trees
        ]
        zylon_emp_jtitles = [
            extractor.get_job_titles_zy(xml_tree)
            for xml_tree in zylon_xml_trees
        ]

        tokeniser = Tokeniser()
        true_edu_insts = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_edu_insts))
        true_edu_majors = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_edu_majors))
        true_emp_names = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_emp_names))
        true_emp_jtitles = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(true_emp_jtitles))

        zylon_edu_insts = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(zylon_edu_insts))
        zylon_edu_majors = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(zylon_edu_majors))
        zylon_emp_names = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(zylon_emp_names))
        zylon_emp_jtitles = tokeniser.docs_tolower(
            tokeniser.tokenise_doclines_to_words(zylon_emp_jtitles))

        edu_insts_match_score = self.score_matches(zylon_edu_insts,
                                                   true_edu_insts)
        edu_majors_match_score = self.score_matches(zylon_edu_majors,
                                                    true_edu_majors)
        emp_names_match_score = self.score_matches(zylon_emp_names,
                                                   true_emp_names)
        emp_jtitles_match_score = self.score_matches(zylon_emp_jtitles,
                                                     true_emp_jtitles)

        return edu_insts_match_score, edu_majors_match_score, emp_names_match_score, emp_jtitles_match_score
Example no. 41
def check_should_normalise_correctly_for_bi_commands(cmd):
  t = Tokeniser()
  assert t.normalise(cmd) == {'action': cmd[0],
                              'magnitude': cmd[1],
                              'direction': None}
Example no. 42
class Parser:
    def __init__(self, f):
        self.in_name, self.out_name = f
        self.output = []
        self.tokeniser = Tokeniser(f)
        self.st_handler = SymbolTable()
        self.writer = VMCodeWriter(f)
        self.local_state = {'labeler': labeler()}
        self.parse()
        self.writer.close()

    def parse(self):
        if self.tokeniser.has_next():
            self.compileClass()
        return self.out_name, self.output

    def compileClass(self):
        self.expect(TokenType.KEYWORD, 'class')
        self.local_state['class'] = self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.SYMBOL, '{')
        while self.peek(TokenType.KEYWORD, CLASS_VAR_KEYWORDS):
            self.compileClassVarDec()
        while self.peek(TokenType.KEYWORD, FXN_KEYWORDS):
            self.st_handler.start_subroutine()
            self.compileSubroutine()
        self.expect(TokenType.SYMBOL, '}')
        del self.local_state['class']

    def compileClassVarDec(self):
        kind = keyword_to_kind[self.expect(TokenType.KEYWORD)]
        taipu = self.compileType()
        name = self.expect(TokenType.IDENTIFIER)
        self.st_handler.define(name, taipu, kind)
        varlist = self.tryCompileVarList(taipu=taipu, kind=kind)
        self.expect(TokenType.SYMBOL, ";")

    def compileType(self):
        ttype, token = self.tokeniser.peek()
        if ttype == TokenType.KEYWORD and token in BI_TYPES:
            return self.expect(TokenType.KEYWORD, token)
        elif ttype == TokenType.IDENTIFIER:
            return self.expect(TokenType.IDENTIFIER)
        else:
            raise SyntaxError(
                "Expected type in {} or identifier, got: {} of type {}".format(
                    BI_TYPES, token, ttype))

    def tryCompileVarList(self, exp_type=False, taipu=None, kind=None):
        varlist = []
        while self.peek(TokenType.SYMBOL, ","):
            self.expect(TokenType.SYMBOL, ",")
            if exp_type:
                taipu = self.compileType()
            name = self.expect(TokenType.IDENTIFIER)
            varlist.append((name, taipu, kind))
        for entry in varlist:
            self.st_handler.define(*entry)

    def compileSubroutine(self):
        fxn_kind = self.expect(TokenType.KEYWORD, FXN_KEYWORDS)
        if fxn_kind == 'method':
            self.st_handler.define('this', self.local_state['class'],
                                   IdentifierKind.ARGUMENT)
        self.compileType()
        fxn_name = self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.SYMBOL, "(")
        self.compileParameterList()
        self.expect(TokenType.SYMBOL, ")")
        self.compileSubroutineBody(fxn_name, fxn_kind)

    def compileParameterList(self):
        if not self.peek(TokenType.SYMBOL, ")"):
            kind = IdentifierKind.ARGUMENT
            taipu = self.compileType()
            name = self.expect(TokenType.IDENTIFIER)
            self.st_handler.define(name, taipu, kind)
            self.tryCompileVarList(exp_type=True, kind=kind)

    def compileSubroutineBody(self, fxn_name, fxn_kind):
        self.expect(TokenType.SYMBOL, "{")
        while self.peek(TokenType.KEYWORD, "var"):
            self.compileVarDec()
        self.writer.fun_dec(fxn_name,
                            self.st_handler.var_count(IdentifierKind.VAR))
        self.compileFxnKind(fxn_kind)
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")

    def compileFxnKind(self, kind):
        if kind == 'constructor':
            num_fields = self.st_handler.var_count(IdentifierKind.FIELD)
            self.writer.alloc(num_fields)
            self.writer.pop_this_ptr()
        elif kind == 'method':
            self.writer.push_variable('this', self.st_handler)
            self.writer.pop_this_ptr()

    def compileVarDec(self):
        self.expect(TokenType.KEYWORD, "var")
        kind = IdentifierKind.VAR
        taipu = self.compileType()
        name = self.expect(TokenType.IDENTIFIER)
        self.st_handler.define(name, taipu, kind)
        self.tryCompileVarList(taipu=taipu, kind=kind)
        self.expect(TokenType.SYMBOL, ";")

    def compileStatements(self):
        while self.peek(TokenType.KEYWORD, STMT_KEYWORDS):
            self.compileStatement()

    def compileStatement(self):
        if self.peek(TokenType.KEYWORD, "let"):
            self.compileLet()
        elif self.peek(TokenType.KEYWORD, "if"):
            self.compileIf()
        elif self.peek(TokenType.KEYWORD, "while"):
            self.compileWhile()
        elif self.peek(TokenType.KEYWORD, "do"):
            self.compileDo()
        elif self.peek(TokenType.KEYWORD, "return"):
            self.compileReturn()

    def compileLet(self):
        self.expect(TokenType.KEYWORD, "let")
        var = self.expect(TokenType.IDENTIFIER)
        array_assignment = False
        if self.peek(TokenType.SYMBOL, "["):
            array_assignment = True
            self.compileBasePlusOffset(var)
        self.expect(TokenType.SYMBOL, "=")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, ";")
        if array_assignment:
            self.saveToTemp()
            self.popToArray()
        else:
            self.writer.pop_variable(var, self.st_handler)

    def compileBasePlusOffset(self, base):
        self.writer.push_variable(base, self.st_handler)
        self.expect(TokenType.SYMBOL, "[")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, "]")
        self.writer.binary_op("+")

    def saveToTemp(self):
        self.writer.pop('temp', 0)

    def popToArray(self):
        self.writer.pop_that_ptr()
        self.writer.push('temp', 0)
        self.writer.pop_that()

    def compileIf(self):
        self.expect(TokenType.KEYWORD, "if")
        endif = next(self.local_state['labeler'])
        self.compileCond(endif)
        if self.peek(TokenType.KEYWORD, "else"):
            self.expect(TokenType.KEYWORD, "else")
            self.expectBracedStatements()
        self.writer.label(endif)

    def expectBracedStatements(self):
        self.expect(TokenType.SYMBOL, "{")
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")

    def compileWhile(self):
        self.expect(TokenType.KEYWORD, "while")
        loop = next(self.local_state['labeler'])
        self.writer.label(loop)
        self.compileCond(loop)

    def compileCond(self, ret):
        self.expectGroupedExpression()
        self.writer.unary_op('~')
        not_cond = next(self.local_state['labeler'])
        self.writer.ifgoto(not_cond)
        self.expectBracedStatements()
        self.writer.goto(ret)
        self.writer.label(not_cond)

    def expectGroupedExpression(self):
        self.expect(TokenType.SYMBOL, "(")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, ")")

    def compileDo(self):
        self.expect(TokenType.KEYWORD, "do")
        caller = self.expect(TokenType.IDENTIFIER)
        self.compileSubroutineCall(caller)
        self.writer.pop("temp", "0")
        self.expect(TokenType.SYMBOL, ";")

    def compileReturn(self):
        self.expect(TokenType.KEYWORD, "return")
        if not self.peek(TokenType.SYMBOL, ";"):
            self.compileExpression()
        else:
            self.writer.int_const(0)
        self.writer.ret()
        self.expect(TokenType.SYMBOL, ";")

    def compileExpression(self):
        self.compileTerm()
        while self.peek(TokenType.SYMBOL, EXP_SYMBOLS):
            op = self.compileOp()
            self.compileTerm()
            self.writer.binary_op(op)

    def compileOp(self):
        return self.expect(TokenType.SYMBOL)

    def compileTerm(self):
        if self.peek(TokenType.INT_CONST):
            int = self.expect(TokenType.INT_CONST)
            self.writer.int_const(int)
        elif self.peek(TokenType.STR_CONST):
            str = self.compileStrConst()
            self.writer.str_const(str)
        elif self.peek(TokenType.KEYWORD, KEYWORD_CONSTANTS):
            kw = self.expect(TokenType.KEYWORD, KEYWORD_CONSTANTS)
            self.writer.kw_const(kw)
        elif self.peek(TokenType.SYMBOL, UNARY_OPS):
            self.compileUnaryOp()
        elif self.peek(TokenType.SYMBOL, "("):
            self.expectGroupedExpression()
        elif self.tokeniser.has_next():
            t1, token1 = self.tokeniser.next()
            if self.tokeniser.has_next():
                t2, token2 = self.tokeniser.peek()
                if self.peek(TokenType.SYMBOL, "["):
                    self.compileArrayAccess(token1)
                elif self.peek(TokenType.SYMBOL, ["(", "."]):
                    self.compileSubroutineCall(token1)
                else:
                    self.writer.push_variable(token1, self.st_handler)

    def compileStrConst(self):
        ttype, token = self.tokeniser.next()
        return token[1:-1]

    def compileUnaryOp(self):
        op = self.expect(TokenType.SYMBOL, ["-", "~"])
        self.compileTerm()
        self.writer.unary_op(op)

    def compileArrayAccess(self, arr):
        self.compileBasePlusOffset(arr)
        self.writer.pop_that_ptr()
        self.writer.push_that()

    def compileSubroutineCall(self, caller):
        if self.peek(TokenType.SYMBOL, "("):
            method, nargs = self.compileSelfFunctionCall(caller)
            qualified_name = self.local_state['class'] + '.' + method
        elif self.peek(TokenType.SYMBOL, "."):
            method, nargs = self.compileMethodCall(caller)
            qualified_name = self.st_handler.qualify(caller, method)
        self.writer.call(qualified_name, nargs)

    def compileSelfFunctionCall(self, method):
        self.writer.push_this_ptr()
        nargs = self.expectExpressionList() + 1
        return method, nargs

    def compileMethodCall(self, caller):
        nargs = 0
        if self.st_handler.is_object(caller):
            nargs += 1
            self.writer.push_variable(caller, self.st_handler)
        self.expect(TokenType.SYMBOL, ".")
        method = self.expect(TokenType.IDENTIFIER)
        nargs += self.expectExpressionList()
        return method, nargs

    def expectExpressionList(self):
        self.expect(TokenType.SYMBOL, "(")
        nexps = self.compileExpressionList()
        self.expect(TokenType.SYMBOL, ")")
        return nexps

    def compileExpressionList(self):
        nexps = 0
        if not self.peek(TokenType.SYMBOL, ")"):
            self.compileExpression()
            nexps += 1
            while self.peek(TokenType.SYMBOL, ","):
                self.expect(TokenType.SYMBOL, ",")
                self.compileExpression()
                nexps += 1
        return nexps

    def peek(self, e_type, e_token=None):
        if not self.tokeniser.has_next():
            return False
        a_type, a_token = self.tokeniser.peek()
        return self.token_match(e_type, e_token, a_type, a_token)

    def expect(self, e_type, e_token=None):
        a_type, a_token = self.tokeniser.next()
        if self.token_match(e_type, e_token, a_type, a_token):
            return a_token
        else:
            raise SyntaxError(
                "Expected {} of type {}, got {} of type {}".format(
                    e_token, e_type, a_token, a_type))

    def token_match(self, e_type, e_token, a_type, a_token):
        return (e_type == a_type or (type(e_type) == list and a_type in e_type)) and \
                    (e_token is None or e_token == a_token or (type(e_token) == list and a_token in e_token))
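
A hypothetical driver for this Parser; the (input filename, output filename) pair format is inferred from the unpacking in __init__ and from Tokeniser and VMCodeWriter receiving the same argument, and is not confirmed by the source.

# Hypothetical usage sketch; parsing and VM emission already run inside
# __init__, which also closes the VMCodeWriter.
parser = Parser(("Main.jack", "Main.vm"))
print(parser.out_name)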
Example no. 43
def test_should_return_true_for_empty_lists():
  assert Tokeniser.is_empty([])
Example no. 44
def check_should_return_true_for_valid_comments(comment):
  assert Tokeniser.is_comment(comment)
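
The scattered check_* tests above (Examples 29-35, 38, 41, 43 and 44) exercise a family of Tokeniser predicates. The sketch below shows plausible shapes for a few of the generic ones; every pattern is a guess based only on the values the tests pass in, and the domain-specific checks (is_direction, is_bi_action, is_valid_repeat, is_comment, normalise) are deliberately omitted.

import re

class Tokeniser(object):
    # Hypothetical predicate sketches; none of these are the project's real
    # implementations.
    @staticmethod
    def is_positive_int(value):
        return bool(re.match(r"^\d+$", str(value))) and int(value) > 0

    @staticmethod
    def is_number(value):
        try:
            float(value)
            return True
        except (TypeError, ValueError):
            return False

    @staticmethod
    def is_coordinate(value):
        return bool(re.match(r"^-?\d+(\.\d+)?,-?\d+(\.\d+)?$", value))

    @staticmethod
    def is_empty(items):
        return len(items) == 0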
Example no. 45
class Parser:
    def __init__(self, f):
        self.in_name, self.out_name = f
        self.output = []
        self.tokeniser = Tokeniser(f)
        self.depth = 0
        self.parse()

    def parse(self):
        if self.tokeniser.has_next():
            self.compileClass()
        return self.out_name, self.output

    def compileClass(self):
        self.open_tag('class')
        self.expect(TokenType.KEYWORD, 'class')
        self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.SYMBOL, '{')
        while self.peek(TokenType.KEYWORD, ['static', 'field']):
            self.compileClassVarDec()
        while self.peek(TokenType.KEYWORD,
                        ['function', 'constructor', 'method']):
            self.compileSubroutine()
        self.expect(TokenType.SYMBOL, '}')
        self.close_tag('class')

    def compileClassVarDec(self):
        self.open_tag('classVarDec')
        self.expect(TokenType.KEYWORD)
        self.compileType()
        self.expect(TokenType.IDENTIFIER)
        self.tryCompileVarList()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag('classVarDec')
        if self.tokeniser.has_next():
            return self.tokeniser.peek()
        return None, None

    def compileType(self):
        ttype, token = self.tokeniser.peek()
        type_list = ["void", "int", "char", "boolean"]
        if ttype == TokenType.KEYWORD and token in type_list:
            self.expect(TokenType.KEYWORD, token)
        elif ttype == TokenType.IDENTIFIER:
            self.expect(TokenType.IDENTIFIER)
        else:
            raise SyntaxError(
                "Expected type in {} or identifier, got: {} of type {}".format(
                    type_list, token, ttype))

    def tryCompileVarList(self, exp_type=False):
        while self.peek(TokenType.SYMBOL, ","):
            self.expect(TokenType.SYMBOL, ",")
            if exp_type:
                self.compileType()
            self.expect(TokenType.IDENTIFIER)

    def compileSubroutine(self):
        self.open_tag('subroutineDec')
        self.expect(TokenType.KEYWORD, ['constructor', 'function', 'method'])
        self.expect([TokenType.KEYWORD, TokenType.IDENTIFIER])
        self.expect(TokenType.IDENTIFIER)
        self.expect(TokenType.SYMBOL, "(")
        self.compileParameterList()
        self.expect(TokenType.SYMBOL, ")")
        self.compileSubroutineBody()
        self.close_tag('subroutineDec')
        if self.tokeniser.has_next():
            return self.tokeniser.peek()
        return None, None

    def compileParameterList(self):
        self.open_tag('parameterList')
        if not self.peek(TokenType.SYMBOL, ")"):
            self.compileType()
            self.expect(TokenType.IDENTIFIER)
            self.tryCompileVarList(True)
        self.close_tag('parameterList')

    def compileSubroutineBody(self):
        self.open_tag('subroutineBody')
        self.expect(TokenType.SYMBOL, "{")
        while self.peek(TokenType.KEYWORD, "var"):
            self.compileVarDec()
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")
        self.close_tag('subroutineBody')

    def compileVarDec(self):
        self.open_tag("varDec")
        self.expect(TokenType.KEYWORD, "var")
        self.compileType()
        self.expect(TokenType.IDENTIFIER)
        self.tryCompileVarList()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag("varDec")

    def compileStatements(self):
        self.open_tag("statements")
        while self.peek(TokenType.KEYWORD,
                        ['let', 'if', 'while', 'do', 'return']):
            self.compileStatement()
        self.close_tag("statements")

    def compileStatement(self):
        if self.peek(TokenType.KEYWORD, "let"):
            self.compileLet()
        elif self.peek(TokenType.KEYWORD, "if"):
            self.compileIf()
        elif self.peek(TokenType.KEYWORD, "while"):
            self.compileWhile()
        elif self.peek(TokenType.KEYWORD, "do"):
            self.compileDo()
        elif self.peek(TokenType.KEYWORD, "return"):
            self.compileReturn()

    def compileLet(self):
        self.open_tag("letStatement")
        self.expect(TokenType.KEYWORD, "let")
        self.expect(TokenType.IDENTIFIER)
        if self.peek(TokenType.SYMBOL, "["):
            self.expect(TokenType.SYMBOL, "[")
            self.compileExpression()
            self.expect(TokenType.SYMBOL, "]")
        self.expect(TokenType.SYMBOL, "=")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag("letStatement")

    def compileIf(self):
        self.open_tag("ifStatement")
        self.expect(TokenType.KEYWORD, "if")
        self.expectGroupedExpression()
        self.expect(TokenType.SYMBOL, "{")
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")
        if self.peek(TokenType.KEYWORD, "else"):
            self.expect(TokenType.KEYWORD, "else")
            self.expect(TokenType.SYMBOL, "{")
            self.compileStatements()
            self.expect(TokenType.SYMBOL, "}")
        self.close_tag("ifStatement")

    def expectGroupedExpression(self):
        self.expect(TokenType.SYMBOL, "(")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, ")")

    def compileWhile(self):
        self.open_tag("whileStatement")
        self.expect(TokenType.KEYWORD, "while")
        self.expectGroupedExpression()
        self.expect(TokenType.SYMBOL, "{")
        self.compileStatements()
        self.expect(TokenType.SYMBOL, "}")
        self.close_tag("whileStatement")

    def compileDo(self):
        self.open_tag("doStatement")
        self.expect(TokenType.KEYWORD, "do")
        self.expect(TokenType.IDENTIFIER)
        self.compileSubroutineCall()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag("doStatement")

    def compileReturn(self):
        self.open_tag("returnStatement")
        self.expect(TokenType.KEYWORD, "return")
        if not self.peek(TokenType.SYMBOL, ";"):
            self.compileExpression()
        self.expect(TokenType.SYMBOL, ";")
        self.close_tag("returnStatement")

    def compileExpression(self):
        self.open_tag("expression")
        self.compileTerm()
        while self.peek(TokenType.SYMBOL, list("+-*/&|<>=")):
            self.compileOp()
            self.compileTerm()
        self.close_tag("expression")

    def compileOp(self):
        self.expect(TokenType.SYMBOL)

    def compileTerm(self):
        self.open_tag("term")
        if self.peek(TokenType.INT_CONST):
            self.expect(TokenType.INT_CONST)
        elif self.peek(TokenType.STR_CONST):
            self.compileStrConst()
        elif self.peek(TokenType.KEYWORD, ['true', 'false', 'null', 'this']):
            self.compileKeywordConstant()
        elif self.peek(TokenType.SYMBOL, ["-", "~"]):
            self.compileUnaryOp()
        elif self.peek(TokenType.SYMBOL, "("):
            self.expectGroupedExpression()
        elif self.tokeniser.has_next():
            t1, token1 = self.tokeniser.next()
            self.terminal_tag(t1, token1)
            if self.tokeniser.has_next():
                t2, token2 = self.tokeniser.peek()
                if self.peek(TokenType.SYMBOL, "["):
                    self.compileArrayAccess()
                elif self.peek(TokenType.SYMBOL, ["(", "."]):
                    self.compileSubroutineCall()
        self.close_tag("term")

    def compileStrConst(self):
        ttype, token = self.tokeniser.next()
        self.terminal_tag(TokenType.STR_CONST, token[1:-1])

    def compileKeywordConstant(self):
        self.expect(TokenType.KEYWORD, ['true', 'false', 'null', 'this'])

    def compileUnaryOp(self):
        self.expect(TokenType.SYMBOL, ["-", "~"])
        self.compileTerm()

    def compileArrayAccess(self):
        self.expect(TokenType.SYMBOL, "[")
        self.compileExpression()
        self.expect(TokenType.SYMBOL, "]")

    def compileSubroutineCall(self):
        if self.peek(TokenType.SYMBOL, "("):
            self.expectExpressionList()
        elif self.peek(TokenType.SYMBOL, "."):
            self.expect(TokenType.SYMBOL, ".")
            self.expect(TokenType.IDENTIFIER)
            self.expectExpressionList()

    def expectExpressionList(self):
        self.expect(TokenType.SYMBOL, "(")
        self.compileExpressionList()
        self.expect(TokenType.SYMBOL, ")")

    def compileExpressionList(self):
        self.open_tag("expressionList")
        if not self.peek(TokenType.SYMBOL, ")"):
            self.compileExpression()
            while self.peek(TokenType.SYMBOL, ","):
                self.expect(TokenType.SYMBOL, ",")
                self.compileExpression()
        self.close_tag("expressionList")

    def peek(self, e_type, e_token=None, expect=True):
        if not self.tokeniser.has_next():
            return False
        a_type, a_token = self.tokeniser.peek()
        return self.token_match(e_type, e_token, a_type, a_token)

    def expect(self, e_type, e_token=None):
        if not self.tokeniser.has_next():
            return None, None
        a_type, a_token = self.tokeniser.next()
        if self.token_match(e_type, e_token, a_type, a_token):
            self.terminal_tag(a_type, a_token)
            if self.tokeniser.has_next():
                return self.tokeniser.peek()
            return None, None
        else:
            raise SyntaxError(
                "Expected {} of type {}, got {} of type {}".format(
                    e_token, e_type, a_token, a_type))

    def token_match(self, e_type, e_token, a_type, a_token):
        return (e_type == a_type or (type(e_type) == list and a_type in e_type)) and \
                    (e_token is None or e_token == a_token or (type(e_token) == list and a_token in e_token))

    def open_tag(self, tag_name, value=''):
        self.output.append('{}<{}>{}'.format(' ' * self.depth,
                                             escape(tag_name), escape(value)))
        self.depth += 2

    def close_tag(self, tag_name, newline=True):
        self.depth -= 2
        tag = '</{}>'.format(escape(tag_name))
        if newline or not self.output:
            self.output.append(' ' * self.depth + tag)
        else:
            self.output[-1] += tag

    def terminal_tag(self, tag_name, value):
        self.depth += 2
        self.open_tag(str(tag_name), value=value)
        self.close_tag(str(tag_name), False)
        self.depth -= 2
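
A hypothetical driver for this XML-emitting Parser; the (input, output) filename pair and the idea of writing self.output to the output file are assumptions based on the attributes the class exposes, not something the source confirms.

# Hypothetical usage sketch; parsing runs inside __init__ and the collected
# XML lines end up in parser.output.
parser = Parser(("Main.jack", "Main.xml"))
with open(parser.out_name, "w") as fh:
    fh.write("\n".join(parser.output) + "\n")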