Example #1
def LispLexer(text, compiler):
    pos = tokens.Position("", 1, 1, 1)
    yield tokens.Token(Tag.LP, pos, pos)
    yield tokens.Token(Tag.atom, pos, pos)
    yield tokens.Token(Tag.atom, pos, pos)
    yield tokens.Token(Tag.RP, pos, pos)
    yield tokens.Token(Tag.END_OF_PROGRAM, pos, pos)
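The generator above ignores its text and compiler arguments and always yields the fixed five-token stream of a two-atom s-expression. A minimal usage sketch, assuming only the Tag and tokens modules the function itself relies on:

# Hypothetical driver; the token stream is fixed regardless of input.
for tok in LispLexer("(foo bar)", compiler=None):
    print(tok)  # LP, atom, atom, RP, END_OF_PROGRAM in order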
Example #2
def main(output=False):
    phonecmps = []
    timecmps = []
    doclist = CreateDoclist()
    for pair in PAIRS_:
        token1 = tokens.Token(pair[0])
        TestPronunciations(token1)
        token2 = tokens.Token(pair[1])
        TestPronunciations(token2)
        comparator = token_comp.OldPhoneticDistanceComparator(token1, token2)
        comparator.ComputeDistance()
        phonecmps.append(comparator)
    if output:
        p = open(GOLDEN_FILE_, 'w')  ## clear golden file
        p.close()
        for pc in phonecmps:
            pc.ComparisonResult().Print(GOLDEN_FILE_, 'a')
        TestCorrelations(doclist, GOLDEN_FILE_)
        TestSnowActivations(doclist, GOLDEN_FILE_)
    else:
        p = open(TEST_FILE_, 'w')  ## clear test file
        p.close()
        for pc in phonecmps:
            pc.ComparisonResult().Print(TEST_FILE_, 'a')
        TestCorrelations(doclist, TEST_FILE_)
        TestSnowActivations(doclist, TEST_FILE_)
        unittest.TestUnitOutputs(sys.argv[0] + ' (main test & perceptron test)',
                                 GOLDEN_FILE_, TEST_FILE_)
        TestAuxiliaryComparators(sys.argv[0])
Example #3
    def test_operators(self):
        toks = list(lexer.Lexer("+-*/").make_token())
        self.assertEqual(toks, [
            tokens.Token(tokens.TokenType.PLUS),
            tokens.Token(tokens.TokenType.MINUS),
            tokens.Token(tokens.TokenType.MULTIPLY),
            tokens.Token(tokens.TokenType.DIVIDE)
        ])
Example #4
    def test_numbers(self):
        toks = list(lexer.Lexer("1245.4 .234 1234. .").make_token())
        self.assertEqual(toks, [
            tokens.Token(tokens.TokenType.NUMBER, 1245.4),
            tokens.Token(tokens.TokenType.NUMBER, .234),
            tokens.Token(tokens.TokenType.NUMBER, 1234.),
            tokens.Token(tokens.TokenType.NUMBER, 000.000),
        ])
Example #5
def driver(f, line):
    this_state = 0
    next_state = 0
    tk = tokens.Token()
    literal = ""

    while -1 < this_state < 1000:
        fpos = f.tell()
        datum = f.read(1)
        if datum == '#':
            # skip comment text through the end of the line
            while True:
                datum = f.read(1)
                if datum == '\n' or not datum:  # newline or EOF ends the comment
                    break
        fsa_state = get_column(datum)
        next_state = fsa_table[this_state][fsa_state]

        if next_state >= 1000 or next_state < 0:
            if next_state >= 1000:
                tk = get_tokens(next_state, literal, line)
                tk.location = line
                f.seek(fpos, os.SEEK_SET)
                return tk, line
            if next_state == -1:
                tk.identity = tokens.token_ids.token_names[35]
                tk.instance = 'EOF'
                tk.location = line
                return tk, line
            if next_state == -2:
                print("SCANNER ERROR: Illegal character '%s' on line %d" % (
                    datum, line))
                tk.identity = tokens.token_ids.token_names[36]
                tk.instance = 'bad token'
                tk.location = line
                return tk, line
        else:
            unit = datum
            if unit in specials and unit not in symbols:
                print("SCANNER ERROR: Illegal keyword character '%s' on line %d" % (
                    datum, line))
                tk.identity = tokens.token_ids.token_names[36]
                tk.instance = unit
                tk.location = line
                return tk, line
            if not unit.isspace():
                literal += unit
            if len(literal) > 7:
                print("SCANNER ERROR: Illegal keyword '%s' on line %d" % (
                    literal, line))
                return tokens.Token(tokens.token_ids.token_names[36],
                                    'illegal size', line)
            if datum == '\n':
                line = line + 1
            this_state = next_state
    return tokens.Token(tokens.token_ids.token_names[36], 'bad token', line)
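As the branches above imply, table states below 1000 are transient: a next_state of 1000 or more is an accepting state (the lookahead character is pushed back with f.seek and get_tokens builds the token), -1 signals end of input, and -2 an illegal character. token_names[35] and token_names[36] appear to be the EOF and error token identities respectively.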
Example #6
    def TestString(self):
        program = ast.Program()
        program.Statements = [
            ast.LetStatement(
                tkn.Token(tkn.LET, "let"),
                ast.Identifier(tkn.Token(tkn.IDENT, "myVar"), "myVar"),
                ast.Identifier(tkn.Token(tkn.IDENT, "anotherVar"),
                               "anotherVar"))
        ]

        if program.String() != "let myVar = anotherVar":
            raise Exception(f'program.String() wrong. got={program.String()}')
Example #7
def test_string():
    program = ast.Program(statements=[
        ast.LetStatement(token=tokens.Token(typ=tokens.LET, literal="let"),
                         name=ast.Identifier(token=tokens.Token(
                             typ=tokens.IDENT, literal="my_var"),
                                             value="my_var"),
                         value=ast.Identifier(token=tokens.Token(
                             typ=tokens.IDENT, literal="another_var"),
                                              value="another_var"))
    ])

    expected = "let my_var = another_var;"
    assert str(program) == expected, \
        f"str(program) wrong. got '{str(program)}' but expected '{expected}'"
Example #8
def assem_conv(op, words, line):
    logging.debug("Conv %s words:%s" % (op, hl_parser.format_word_list(words)))

    token = tokens.Token("conv", err, line)

    if (op == "cmptime"):
        if (len(words) != 1):
            err.report_error("Cmptime needs one arguement")
            return
        else:
            token.add_bits(0, 6, 3, 2)
            token.add_bits(0, 3, 1, 0)
            token.add_bits(0, 0, 7, 7)
            if (words[0].type() == "arg"):
                token.add_byte(1, words[0].num())
            elif (words[0].type() == "var"):
                token.add_byte(1, 0)
                token.add_vname(1, 0, words[0].val())
            else:
                err.report_error("Cmptime takes a variable as it's argument")
                return
    else:
        if (len(words) != 0):
            err.report_error("Conversions don't that arguments")
            return
        else:
            token.add_bits(0, 6, 3, 2)
            token.add_bits(0, 0, 7, 3)
            if (op == "convm"):
                token.add_bits(0, 3, 1, 1)

    token.finish(token_stream)
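The assembler examples here all drive the same token.add_bits(byte, shift, mask, value) call, which judging from the usage ORs a masked value into a bit field of an opcode byte. A hedged sketch of that convention (the bytearray buffer and the field-clearing step are assumptions for illustration, not the library's actual code):

def add_bits_sketch(buf, byte, shift, mask, value):
    # Clear the field, then install (value & mask) at bit `shift`
    # of opcode byte `byte`; buf is a bytearray.
    buf[byte] &= ~(mask << shift) & 0xFF
    buf[byte] |= (value & mask) << shift

opcode = bytearray(1)
add_bits_sketch(opcode, 0, 6, 3, 2)  # the conversion group tag from above
add_bits_sketch(opcode, 0, 0, 7, 7)  # the low three bits, as in cmptime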
Example #9
def parse_tokens(text, size, source_id):
    """Parse sentence and return tokens"""
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace

    string_start = STRING_START_TEXT * (size - 1)
    text = string_start + text  # prepend special start tokens to the text

    words = split_into_words(text)
    if not words:  #empty sentence
        return []
    length = len(words)
    lists = []
    for i in range(0, length):
        wordlist = []
        for j in range(0, size):
            if i + j < length:  #we can append more words
                wordlist.append(words[i + j])
        lists.append(wordlist)
    result = []
    length = len(lists)
    for i in range(0, length):
        is_begin = 1 if i <= size - 1 else 0
        is_end = 1 if i + size >= length - 1 else 0
        start = ' '.join(lists[i]) if i < length else ''
        end = lists[i + size][0] if i + size < length else ''
        token = tokens.Token(start, end, source_id, is_begin, is_end)
        result.append(token)
    return result
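In other words, parse_tokens slides a window of size words across the sentence (after padding the front with size - 1 copies of STRING_START_TEXT); each Token pairs a window with the single word that follows it, plus flags for whether the window sits at the start or end of the sentence, much like n-gram training data.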
Example #10
def assem_spec_insert(words, line):
    logging.debug("Spec_insert words:%s" % (hl_parser.format_word_list(words)))
    if (len(words) != 2):
        err.report_error("INSERT needs type and filename arguments")
        return

    type = words[0].astr()
    f_name = words[1].astr()

    if (type.lower() not in ['tokens', 'binary']):
        err.report_error("INSERT type must be one of: 'tokens', 'binary'")
        return

    if (type.lower() == 'tokens'):
        err.report_error("INSERT TOKENS should have been consumed higher up! Eek!");
        return

    # we have an insert binary special to deal with
    if (not os.path.isfile(f_name) or not os.access(f_name, os.R_OK)):
        err.report_error("INSERT BINARY file:%s doesn't exist or isn't readable")
        return

    token = tokens.Token("binary", err, line)
    token.add_binary_file(f_name)
    token.finish(token_stream)
Example #11
    def LineSegment(self, line):
        try:
            # Python 2: decode bytes; Python 3 raises and falls through
            utext = unicode(line.strip(), 'utf-8')
        except (NameError, TypeError):
            utext = line.strip()
        for i in range(len(utext)):
            for k in [4, 3, 2]:
                sub = utext[i:i + k]
                if len(sub) != k:
                    continue
                if k > 2 and sub[:2].encode('utf-8') in FAMILY_NAMES_:
                    if not (Utils.script.HasDigit(sub)
                            or Utils.script.HasPunctuation(sub)):
                        self.tokens_.append(tokens.Token(sub))
                elif k < 4 and sub[:1].encode('utf-8') in FAMILY_NAMES_:
                    if not Utils.script.HasDigit(sub):
                        self.tokens_.append(tokens.Token(sub))
Example #12
def assem_uni_math(op, size, words, line):
    logging.debug("Mathu %s size:%d, words:%s" % (op, size, hl_parser.format_word_list(words)))
    if (len(words) > 1):
        err.report_error("Unary Math has at most one argument")
        return

    token = tokens.Token("uni-math", err, line)
    token.add_bits(0, 6, 3, 2)
    token.add_bits(0, 3, 1, size)
    token.add_bits(0, 0, 3, uni_assem_map[op])

    if (not words or
        (words[0].type() == "modreg" and words[0].val() == hl_parser.modreg_names["acc"])):
        token.add_bits(0, 2, 1, 0)
        token.finish(token_stream)
    elif (words[0].type() in ["var", "arg"]):
        token.add_bits(0, 2, 1, 1)
        if (words[0].type() == "arg"):
            token.add_byte(1, words[0].num())
        else:
            token.add_byte(1, 0)
            token.add_vname(1, size, words[0].val())
        token.finish(token_stream)
    else:
        err.report_error("Unary Math - invalid argument type")
        return
Example #13
def assem_basic_math(op, size, words, line):
    logging.debug("Mathb %s size:%s, words:%s" % (op, size, hl_parser.format_word_list(words)))
    if (len(words) != 1):
        err.report_error("Basic Math needs one argument")
        return

    token = tokens.Token("basic-math", err, line)
    token.add_bits(0, 6, 3, 2)
    token.add_bits(0, 4, 3, 1)
    token.add_bits(0, 3, 1, size)
    token.add_bits(0, 0, 3, basic_assem_map[op])

    if (words[0].type() in ["var", "arg"]):
        # math with a variable
        token.add_bits(0, 2, 1, 0)
        if (words[0].type() == "arg"):
            token.add_byte(1, words[0].num())
        else:
            token.add_byte(1, 0)
            token.add_vname(1, size, words[0].val())
        token.finish(token_stream)
    elif (words[0].type() == "const"):
        token.add_bits(0, 2, 1, 1)
        if (size == 0):
            token.add_byte(1, words[0].val())
        else:
            token.add_word(1, words[0].val())

        token.finish(token_stream)
    else:
        err.report_error("Basic Math - invalid argument type")
        return
Example #14
    def push_macro(self, optimizer):
        macro = optimizer.rewind()
        end = tokens.Token(tokens.Token.END)
        optimizer.push_node(end)
        optimizer.push_node(macro.statements)
        optimizer.open_scope()
        return True
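Reading the calls above, push_macro appears to rewind the optimizer to the macro just encountered, push an END sentinel token followed by the macro's statements onto the node stack, and open a fresh scope, so the optimizer replays the macro body until it pops the sentinel.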
Example #15
def assem_other_math(op, size, words, line):
    logging.debug("Matho %s size:%d, words:%s" % (op, size, hl_parser.format_word_list(words)))

    if (len(words) != 1):
        err.report_error("Logic Math needs one arguement")
        return

    token = tokens.Token("log-math", err, line)
    token.add_bits(0, 6, 3, 2)
    token.add_bits(0, 3, 1, size)

    if (op in other1_assem_map):
        token.add_bits(0, 4, 3, 2)
        token.add_bits(0, 0, 3, other1_assem_map[op])
    else:
        token.add_bits(0, 4, 3, 3)
        token.add_bits(0, 0, 3, other2_assem_map[op])

    if (words[0].type() in ["var", "arg"]):
        # math with a variable
        token.add_bits(0, 2, 1, 0)
        if (words[0].type() == "arg"):
            token.add_byte(1, words[0].val())
        else:
            token.add_byte(1, 0)
            token.add_vname(1, size, words[0].val())
        token.finish(token_stream)
    elif (words[0].type() == "const"):
        token.add_bits(0, 2, 1, 1)
        token.add_byte(1, words[0].val())

        token.finish(token_stream)
    else:
        err.report_error("Logic Math - invalid argument type")
        return
Example #16
def assem_jump(op, cond, words, line):
    logging.debug("Jump %s cond:%s, words:%s" %
                  (op, cond, hl_parser.format_word_list(words)))
    # ops is one of: branch, sub, ret, dbnz, dsnz
    # cond is one of: a, e, ne, g, l, le, lg or empty for ret, d?nz
    token = tokens.Token("jump", err, line)
    token.add_bits(0, 6, 3, 3)

    if (op == "ret"):
        if (len(words) != 0):
            err.report_error("Ret don't take an argument")
            return
        else:
            token.add_bits(0, 0, 0x3f, 0x28)
    elif (len(words) != 1):
        err.report_error("Jumps need a target to jump to")
        return
    else:
        if (op[:2] in ['su', 'ds']):
            # a call to a subroutine - push a frame
            token.add_bits(0, 3, 1, 1)
        else:
            token.add_bits(0, 3, 1, 0)

        if (not cond):
            # one of dbnz or dsnz
            token.add_bits(0, 0, 7, 7)
        else:
            token.add_bits(0, 0, 7, jcond_assem_map[cond])

        # now the target
        if (words[0].type() == "const"):
            offset = words[0].val()
            if (offset >= tokens.MIN_SBYTE and offset <= tokens.MAX_SBYTE):
                if (offset < 0):
                    # convert to a signed offset
                    offset += 256
                token.add_byte(1, offset)
            elif (offset >= tokens.MIN_WORD and offsec <= tokens.MAX_WORD):
                token.add_bits(0, 4, 1, 1)
                token.add_word(1, offset)

        elif (words[0].type() == "label"):
            if (words[0].val().startswith(':')):
                # globals are always long jumps
                token.add_bits(0, 4, 1, 1)
                token.set_jump_label(1, words[0].val(), True)
                token.add_word(1, 0)  # A placeholder
            else:
                token.set_jump_label(1, words[0].val())
                token.add_byte(1, 0)  # A placeholder

        else:
            err.report_error(
                "Jumps need either a constant or a label as argument, not a: "
                + words[0].type())
            return

    token.finish(token_stream)
Example #17
    def make_number(self):
        num_str = ''
        dot_count = 0
        pos_start = self.pos.copy()

        while self.current_char is not None and self.current_char in constants.DIGITS + '.':
            if self.current_char == '.':
                if dot_count == 1:
                    break
                dot_count += 1
            num_str += self.current_char
            self.advance()

        if dot_count == 0:
            return t.Token(t.TT_INT, int(num_str), pos_start, self.pos)
        else:
            return t.Token(t.TT_FLOAT, float(num_str), pos_start, self.pos)
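Note the dot handling: the loop breaks at a second '.', so input like 3.14.15 yields TT_FLOAT 3.14 and leaves the second dot in the stream for the next call.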
Example #18
def tester(fn):
    t = tokens.Token()
    line = 1
    with open(fn) as f:
        while True:
            t, line = scanner.driver(f, line)
            print "%s '%s' on line %d" % (t.identity, t.instance, t.location)
            if t.identity == tokens.token_ids.token_names[36]: break
            if t.identity == tokens.token_ids.token_names[35]: break
Example #19
    def _tokenize_line(cls, _line: str) -> typing.List[typing.Any]:

        _result = []
        for i in re.findall(cls.grammer, _line):
            # take the first token spec whose pattern matches this lexeme
            _r = [[*c, i] for c in cls.token_list if re.findall(c[-1], i)][0]
            _result.append(tokens.Token(*_r))
        return _result
Example #20
    def make_identifier(self):
        id_str = ''
        pos_start = self.pos.copy()

        while self.current_char is not None and self.current_char in constants.LETTERS_DIGITS + '_':
            id_str += self.current_char
            self.advance()

        tok_type = t.TT_KEYWORD if id_str in t.KEYWORDS else t.TT_IDENTIFIER
        return t.Token(tok_type, id_str, pos_start, self.pos)
Example #21
    def make_greater_than(self):
        tok_type = t.TT_GT
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = t.TT_GTE

        return t.Token(tok_type, pos_start=pos_start, pos_end=self.pos)
Example #22
    def make_minus_or_arrow(self):
        tok_type = t.TT_MINUS
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '>':
            self.advance()
            tok_type = t.TT_ARROW

        return t.Token(tok_type, pos_start=pos_start, pos_end=self.pos)
Example #23
    def make_not_equals(self):
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            return t.Token(t.TT_NE, pos_start=pos_start, pos_end=self.pos), None

        self.advance()
        return None, e.ExpectedCharError(pos_start, self.pos, "'=' (after '!')")
Example #24
    def make_equals(self):
        tok_type = t.TT_EQ
        pos_start = self.pos.copy()
        self.advance()

        if self.current_char == '=':
            self.advance()
            tok_type = t.TT_EE

        return t.Token(tok_type, pos_start=pos_start, pos_end=self.pos)
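Examples #21, #22 and #24 share one shape: save pos_start, consume the first character, then upgrade the token type if the next character completes a two-character operator ('>=', '->', '=='). Example #23 is the odd one out: a bare '!' is not a token in this language, so the method returns a (token, error) pair instead of a plain token.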
Example #25
def assem_stack(op, size, words, line):
    logging.debug("Stack %s size:%d, words:%s" % (op, size, hl_parser.format_word_list(words)))
    # op in ['push', 'pop'], size in [0, 1], one word
    if (len(words) != 1):
        err.report_error("Stack ops need 1 argument")
        return

    token = tokens.Token("stack", err, line)
    token.add_bits(0, 6, 3, 3)
    token.add_bits(0, 5, 1, 1)
    token.add_bits(0, 4, 1, size)

    if (op == "push"):
        if (words[0].type() == 'const'):
            if (size == 0):
                token.add_byte(1, words[0].val())
            else:
                token.add_word(1, words[0].val())
        elif (words[0].type() == 'modreg'):
            if (words[0].val() == hl_parser.modreg_names["acc"]):
                token.add_bits(0, 0, 0xf, 0x3)
            else:
                token.add_bits(0, 0, 0xf, 2)
                token.add_byte(1, words[0].val())
        elif (words[0].type() in ["var", "arg"]):
            token.add_bits(0, 0, 0xf, 1)
            if (words[0].type() == "arg"):
                token.add_byte(1, words[0].num())
            else:
                token.add_byte(1, 0)
                token.add_vname(1, size, words[0].val())
        else:
            err.report_error("Push - invalid operand: %s" % (words[0].type()))
            return

    else: # pop
        if (words[0].type() == 'modreg'):
            if (words[0].val() == hl_parser.modreg_names["acc"]):
                token.add_bits(0, 0, 0xf, 0x4)
            else:
                token.add_bits(0, 0, 0xf, 6)
                token.add_byte(1, words[0].val())
        elif (words[0].type() in ["var", "arg"]):
            token.add_bits(0, 0, 0xf, 5)
            if (words[0].type() == "arg"):
                token.add_byte(1, words[0].num())
            else:
                token.add_byte(1, 0)
                token.add_vname(1, size, words[0].val())
        else:
            err.report_error("Pop - invalid operand: %s" % (words[0].type()))
            return


    token.finish(token_stream)
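The low nibble written by add_bits(0, 0, 0xf, ...) selects the operand form: for push, an immediate leaves it at 0, 1 means var/arg, 2 mod/reg, and 3 the accumulator; for pop, 4 is the accumulator, 5 var/arg, and 6 mod/reg.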
Example #26
    def LineSegment(self, line):
        try:
            # Python 2: decode bytes; Python 3 raises and falls through
            utext = unicode(line.strip(), 'utf-8')
        except (NameError, TypeError):
            utext = line.strip()
        word = []
        for u in utext:
            if Utils.script.CharacterToScript(u) == 'Katakana':
                word.append(u.encode('utf-8'))
            else:
                if word and word != ['・']:
                    self.tokens_.append(tokens.Token(''.join(word)))
                word = []
        # flush a katakana run that reaches the end of the line
        if word and word != ['・']:
            self.tokens_.append(tokens.Token(''.join(word)))
Example #27
def get_tokens(state, literal, line):
    state_token = tokens.Token()
    if literal in keywords:
        state_token.identity = keywords.get(literal)
        state_token.instance = literal
        state_token.location = line
    elif state in final_states:
        state_token.identity = final_states.get(state)
        state_token.instance = literal
        state_token.location = line
    return state_token
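Note the lookup order in get_tokens: the keyword table is consulted before the accepting-state table, so reserved words shadow whatever generic state the scanner finished in; if neither table matches, the freshly constructed empty Token falls through unchanged.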
Example #28
def TestAuxiliaryComparators(unitname):
    ## Added tests for Wade-Giles and Pinyin comparators
    t1 = tokens.Token('毛泽东')
    t2 = tokens.Token('周恩来')
    t1py = tokens.Token('Mao Zedong')
    t2py = tokens.Token('Zhou Enlai')
    t1wg = tokens.Token('Mao Tse-tung')
    t2wg = tokens.Token('Chou Enlai')
    comparator = auxiliary_comp.PinyinComparator(t1, t1py)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.MATCH_, \
        '%s should match %s' % (t1.String(), t1py.String())
    comparator = auxiliary_comp.PinyinComparator(t2, t2py)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.MATCH_, \
        '%s should match %s' % (t2.String(), t2py.String())
    comparator = auxiliary_comp.WadeGilesComparator(t1, t1wg)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.MATCH_, \
        '%s should match %s' % (t1.String(), t1wg.String())
    comparator = auxiliary_comp.WadeGilesComparator(t2, t2wg)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.MATCH_, \
        '%s should match %s' % (t2.String(), t2wg.String())
    comparator = auxiliary_comp.WadeGilesComparator(t2, t2py)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.NO_MATCH_, \
        '%s should not match %s' % (t2.String(), t2py.String())
    print('%s (auxiliary tests) successful' % unitname)
Example #29
def assem_misc(op, words, line):
    logging.debug("Misc op:%s words:%s" %
                  (op, hl_parser.format_word_list(words)))
    if (op == "stop"):
        if (len(words) != 0):
            err.report_error("Stop doesn't take arguments")
            return

        token = tokens.Token("misc", err, line)
        token.add_bits(0, 0, 0xff, 0xff)
        token.finish(token_stream)

    elif (op in ["bitset", "bitclr"]):
        if (len(words) != 2):
            err.report_error(
                "Bitset/bitclr needs 2 arguments: bit and mod/reg")
            return

        bit = words[0].anum()
        if (bit < 0 or bit > 7):
            err.report_error(
                "Bitset/bitclr bit must be between 0 and 7 (not %d)" % (bit))
            return

        modreg = words[1].amodreg()

        token = tokens.Token("misc", err, line)
        token.add_bits(0, 4, 0x0f, 0x00)
        if (op == "bitset"):
            token.add_bits(0, 3, 0x1, 0x1)
        else:
            token.add_bits(0, 3, 0x1, 0x0)

        token.add_bits(0, 0, 0x7, bit)
        token.add_byte(1, modreg)
        token.finish(token_stream)

    else:
        err.report_error("Unknown misc operator: " + op)
        return
Example #30
def parseSimpleState(text):
    # Parses a simple statement into a token list.
    # Assumes that the inputted text is a valid statement.

    # But we check anyway
    if text is None:
        return None

    tokenList = []
    pos = 0

    # This loop handles most of the things
    while pos < len(text):
        cc = text[pos]  # cc = current char

        if cc.isalnum():
            # Extend the current name token when the previous
            # character was alphanumeric too
            if pos > 0 and text[pos - 1].isalnum():
                tokenList[-1].value += cc
            else:
                tokenList.append(tokens.Token(tokens.OD_NAME, cc))

        elif cc in '+-*/=':
            tokenList.append(tokens.Token(tokens.OP, cc))

        pos += 1

    return tokenList
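A hedged usage sketch, assuming a tokens module whose Token(kind, value) stores its second argument in a mutable value attribute (which the name-extension branch above relies on):

for tok in parseSimpleState("x1 = a + b"):
    print(tok.value)  # x1, =, a, +, b in order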