Example #1
 def work(self, body):
     data = parse(body, DataSet)
     print "Tagging " + data.id
     for sent in data.sentences:
         tokens = [t.word for t in sent.tokens]
         sent.tokens = [Token(t[0], t[1]) for t in pos_tag(tokens)]
     self.write(as_json(data))
Example #2
def generate_token(db, user_id):
    token = Token()
    token.id = _generate_uuid_token()
    token.expires = datetime.datetime.now() + datetime.timedelta(seconds=conf.token_expires)
    token.user_id = user_id
    db.add(token)
    db.commit()
    return token
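A minimal call sketch for the helper above. The session object, the Token model, and conf.token_expires are assumptions about the surrounding project; only generate_token itself comes from the snippet:

# Hypothetical call site; `session` stands in for whatever SQLAlchemy-style
# object the project passes as `db`, and user_id 42 is illustrative only.
new_token = generate_token(session, user_id=42)
print(new_token.id, new_token.expires)   # fields set inside generate_token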
Example #3
 def get_tokens(self, tagged_tokens):
     tagConversionDict = {
         'NN': wn.NOUN,
         'JJ': wn.ADJ,
         'VB': wn.VERB,
         'RB': wn.ADV
     }
     tokens = []
     for index, tagged_token in enumerate(tagged_tokens):
         token = Token.Token(tagged_token[0], index, tagged_token[1])
         if token.penn_tag[:2] in tagConversionDict:
             token.wn_tag = tagConversionDict[token.penn_tag[:2]]
             token.lemma = self.lemmatizer.lemmatize(
                 token.token, token.wn_tag)
         tokens.append(token)
     return tokens
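The method above appears to rely on NLTK for tagging and lemmatization. A minimal sketch of that assumed setup (the module names are guesses based on the identifiers used, not taken from the original):

# Assumed NLTK setup; `lemmatizer` is what self.lemmatizer likely holds.
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn          # provides wn.NOUN, wn.ADJ, ...
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
tagged = pos_tag(word_tokenize("The dogs are running"))
# `tagged` is a list of (word, Penn Treebank tag) pairs, e.g.
# [('The', 'DT'), ('dogs', 'NNS'), ...] -- the input shape get_tokens expects.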
Example #4
 def get_all_tokens(self):
     tokens = self.collection.find()
     target_tokens = []
     for token in tokens:
         target_tokens.append(Token(token))
     return target_tokens
Example #5
 def get_token_info(self, token):
     token = self.collection.find_one({'token': token})
     if token is None:
         return None
     target_token = Token(token)
     return target_token
Example #6
def tokenize(lexemes, line_nums, d):
    token_stream = []

    count = 0
    for lexeme in lexemes:
        token = ''
        token_type = 0
        # only 1 comparison
        if lexeme == '!':
            token = 'NEGATION'
            token_type = 1

        elif lexeme == ';':
            token = 'SEMICOLON'
            token_type = 2

        elif lexeme == ',':
            token = 'COMMA'
            token_type = 2

        elif lexeme == '{':
            token = 'BRACE_L'
            token_type = 3

        elif lexeme == '}':
            token = 'BRACE_R'
            token_type = 3

        elif lexeme == '(':
            token = 'PARENTHESIS_L'
            token_type = 4

        elif lexeme == ')':
            token = 'PARENTHESIS_R'
            token_type = 4

        elif lexeme == '[':
            token = 'BRACKET_L'
            token_type = 5

        elif lexeme == ']':
            token = 'BRACKET_R'
            token_type = 5

        elif lexeme[0] == '"':
            token = 'STRING_LITERAL'
            token_type = 6

        # simple comparisons
        elif lexeme[0:2] == '//':
            token = 'COMMENT'
            token_type = 7

        elif lexeme == '=':
            token = 'ASSIGN'
            token_type = 8

        elif lexeme == '+=':
            token = 'PLUS_ASSIGN'
            token_type = 8

        elif lexeme == '-=':
            token = 'MINUS_ASSIGN'
            token_type = 8

        elif lexeme == '%':
            token = 'MOD'
            token_type = 9

        elif lexeme == '/':
            token = 'DIV'
            token_type = 9

        elif lexeme == '*':
            token = 'MULT'
            token_type = 9

        elif lexeme == '-':
            token = 'MINUS'
            token_type = 9

        elif lexeme == '+':
            token = 'SUM'
            token_type = 9

        elif lexeme == '>':
            token = 'GREATER_THAN'
            token_type = 10

        elif lexeme == '<':
            token = 'LESS_THAN'
            token_type = 10

        elif lexeme == '>=':
            token = 'GREATER_EQUALS_THAN'
            token_type = 10

        elif lexeme == '<=':
            token = 'LESS_EQUALS_THAN'
            token_type = 10

        elif lexeme == '==':
            token = 'EQUALS'
            token_type = 11

        elif lexeme == '!=':
            token = 'NOT_EQUALS'
            token_type = 11

        elif lexeme == '&&':
            token = 'AND'
            token_type = 12

        elif lexeme == '||':
            token = 'OR'
            token_type = 12

        # complex comparisons
        elif lexeme == 'class':
            token = 'RW_CLASS'
            token_type = 13

        elif lexeme == 'void':
            token = 'RW_VOID'
            token_type = 13

        elif lexeme == 'if':
            token = 'RW_IF'
            token_type = 13

        elif lexeme == 'else':
            token = 'RW_ELSE'
            token_type = 13

        elif lexeme == 'for':
            token = 'RW_FOR'
            token_type = 13

        elif lexeme == 'return':
            token = 'RW_RETURN'
            token_type = 13

        elif lexeme == 'break':
            token = 'RW_BREAK'
            token_type = 13

        elif lexeme == 'continue':
            token = 'RW_CONTINUE'
            token_type = 13

        elif lexeme == 'callout':
            token = 'RW_CALLOUT'
            token_type = 13

        elif lexeme == 'main':
            token = 'RW_MAIN'
            token_type = 13

        elif lexeme == 'int':
            token = 'VT_INTEGER'
            token_type = 14

        elif lexeme == 'boolean':
            token = 'VT_BOOLEAN'
            token_type = 14

        elif lexeme == 'true':
            token = 'TRUE_LITERAL'
            token_type = 15

        elif lexeme == 'false':
            token = 'FALSE_LITERAL'
            token_type = 15

        elif is_float(lexeme):
            token = 'DECIMAL_LITERAL'
            token_type = 16

        elif lexeme[0:2] in ('0x', '0X') and is_hex(lexeme):
            token = 'HEXADECIMAL_LITERAL'
            token_type = 17

        else:
            token = 'ID'
            token_type = 18

        tokenized = Token.Token(lexeme, token, token_type, line_nums[count])
        if d:
            print('\nlexeme:       ', tokenized.lexeme,
                  '\ntoken:        ', tokenized.token,
                  '\ntoken_type:   ', tokenized.token_type,
                  '\nline_num:     ', tokenized.line_num,
                  '\nobj:          ', type(tokenized))
        token_stream.append(tokenized)
        count += 1
        
    return token_stream
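A small driver for the function above. The input lists are made up for illustration; real lexemes and line numbers would come from whatever scanner precedes this step:

# Illustrative input only; the attribute names match the debug print above.
lexemes   = ['class', 'Program', '{', 'int', 'x', '=', '0x1A', ';', '}']
line_nums = [1, 1, 1, 2, 2, 2, 2, 2, 3]

for tok in tokenize(lexemes, line_nums, d=False):
    print(tok.line_num, tok.token, tok.lexeme)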
Example #7
from model import Token
from archives import DFA_ex

# token_stream = Lexer.scan('../examples/example2.dcf', True)
# print('\nmmmm token:', token_stream[0].token)

# x = x
token_stream = []
token_stream.append(Token.Token('', 'x', '', 0))
token_stream.append(Token.Token('', '=', '', 0))
token_stream.append(Token.Token('', 'x', '', 0))

token_stream.append(Token.Token('$', '$', '', 0))

input_length = len(token_stream)

for i in range(0, input_length):
    print(token_stream[i].token)

state_stack = [1]
token_stack = []

state_type = ''
state = 0
dfa_input = ''
i = 0

successful_parsing = False

while i < input_length:
    if i == 0:
Example #8
from collections import defaultdict

def index_tokens(tokens):
    # Group tokens by their source line number.
    indexs = defaultdict(list)
    for token in tokens:
        tk = Token(token)
        indexs[tk.lineno].append(tk)
    return indexs