def work(self, body):
    # Parse the incoming message into a DataSet, POS-tag every sentence,
    # and write the tagged result back out as JSON.
    data = parse(body, DataSet)
    print("Tagging " + data.id)
    for sent in data.sentences:
        tokens = [t.word for t in sent.tokens]
        sent.tokens = [Token(t[0], t[1]) for t in pos_tag(tokens)]
    self.write(as_json(data))
import datetime

def generate_token(db, user_id):
    # Create and persist a new token for user_id; Token, conf and
    # _generate_uuid_token() are project-level helpers defined elsewhere.
    token = Token()
    token.id = _generate_uuid_token()
    token.expires = datetime.datetime.now() + datetime.timedelta(
        seconds=conf.token_expires)
    token.user_id = user_id
    db.add(token)
    db.commit()
    return token
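# Hedged sketch only: _generate_uuid_token() is a project helper whose definition
# is not shown in this snippet. A minimal stand-in, assuming the token id is just
# a random UUID string, could look like the following (the real helper may differ):
import uuid

def _generate_uuid_token():
    # Random hex string from a version-4 UUID; the project's actual format is unknown.
    return uuid.uuid4().hex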
def get_tokens(self, tagged_tokens):
    # Map Penn Treebank tag prefixes to WordNet POS constants so each token
    # can be lemmatized with the correct part of speech.
    tag_conversion = {
        'NN': wn.NOUN,
        'JJ': wn.ADJ,
        'VB': wn.VERB,
        'RB': wn.ADV,
    }
    tokens = []
    for index, tagged_token in enumerate(tagged_tokens):
        token = Token.Token(tagged_token[0], index, tagged_token[1])
        if token.penn_tag[:2] in tag_conversion:
            token.wn_tag = tag_conversion[token.penn_tag[:2]]
            token.lemma = self.lemmatizer.lemmatize(token.token, token.wn_tag)
        tokens.append(token)
    return tokens
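# Hedged usage sketch: get_tokens() expects (word, Penn-Treebank-tag) pairs and a
# lemmatizer with a lemmatize(word, pos) method. Assuming NLTK is the underlying
# library (suggested by the wn.* constants), the inputs could be produced like this:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

# One-time setup if the models/corpora are not installed yet:
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')

tagged = pos_tag(word_tokenize("The cats were running quickly"))
print(tagged)  # e.g. [('The', 'DT'), ('cats', 'NNS'), ('were', 'VBD'), ...]

lemmatizer = WordNetLemmatizer()  # the kind of object assumed to back self.lemmatizer
print(lemmatizer.lemmatize('cats', 'n'))     # 'cat'
print(lemmatizer.lemmatize('running', 'v'))  # 'run'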
def get_all_tokens(self):
    return [Token(document) for document in self.collection.find()]
def get_token_info(self, token):
    # Look up the stored document for this token value; use a separate name so
    # the lookup result does not shadow the token string argument.
    document = self.collection.find_one({'token': token})
    if document is None:
        return None
    return Token(document)
def tokenize(lexemes, line_nums, d):
    # Map each lexeme to a (token, token_type) pair, wrap it in a Token together
    # with its source line number, and return the resulting stream.
    # Set d to True for verbose debug output.
    token_stream = []
    for count, lexeme in enumerate(lexemes):
        # only 1 comparison
        if lexeme == '!':
            token, token_type = 'NEGATION', 1
        elif lexeme == ';':
            token, token_type = 'SEMICOLON', 2
        elif lexeme == ',':
            token, token_type = 'COMMA', 2
        elif lexeme == '{':
            token, token_type = 'BRACE_L', 3
        elif lexeme == '}':
            token, token_type = 'BRACE_R', 3
        elif lexeme == '(':
            token, token_type = 'PARENTHESIS_L', 4
        elif lexeme == ')':
            token, token_type = 'PARENTHESIS_R', 4
        elif lexeme == '[':
            token, token_type = 'BRACKET_L', 5
        elif lexeme == ']':
            token, token_type = 'BRACKET_R', 5
        elif lexeme[0] == '"':
            token, token_type = 'STRING_LITERAL', 6
        # simple comparisons
        elif lexeme[0:2] == '//':
            token, token_type = 'COMMENT', 7
        elif lexeme == '=':
            token, token_type = 'ASSIGN', 8
        elif lexeme == '+=':
            token, token_type = 'PLUS_ASSIGN', 8
        elif lexeme == '-=':
            token, token_type = 'MINUS_ASSIGN', 8
        elif lexeme == '%':
            token, token_type = 'MOD', 9
        elif lexeme == '/':
            token, token_type = 'DIV', 9
        elif lexeme == '*':
            token, token_type = 'MULT', 9
        elif lexeme == '-':
            token, token_type = 'MINUS', 9
        elif lexeme == '+':
            token, token_type = 'SUM', 9
        elif lexeme == '>':
            token, token_type = 'GREATER_THAN', 10
        elif lexeme == '<':
            token, token_type = 'LESS_THAN', 10
        elif lexeme == '>=':
            token, token_type = 'GREATER_EQUALS_THAN', 10
        elif lexeme == '<=':
            token, token_type = 'LESS_EQUALS_THAN', 10
        elif lexeme == '==':
            token, token_type = 'EQUALS', 11
        elif lexeme == '!=':
            token, token_type = 'NOT_EQUALS', 11
        elif lexeme == '&&':
            token, token_type = 'AND', 12
        elif lexeme == '||':
            token, token_type = 'OR', 12
        # complex comparisons
        elif lexeme == 'class':
            token, token_type = 'RW_CLASS', 13
        elif lexeme == 'void':
            token, token_type = 'RW_VOID', 13
        elif lexeme == 'if':
            token, token_type = 'RW_IF', 13
        elif lexeme == 'else':
            token, token_type = 'RW_ELSE', 13
        elif lexeme == 'for':
            token, token_type = 'RW_FOR', 13
        elif lexeme == 'return':
            token, token_type = 'RW_RETURN', 13
        elif lexeme == 'break':
            token, token_type = 'RW_BREAK', 13
        elif lexeme == 'continue':
            token, token_type = 'RW_CONTINUE', 13
        elif lexeme == 'callout':
            token, token_type = 'RW_CALLOUT', 13
        elif lexeme == 'main':
            token, token_type = 'RW_MAIN', 13
        elif lexeme == 'int':
            token, token_type = 'VT_INTEGER', 14
        elif lexeme == 'boolean':
            token, token_type = 'VT_BOOLEAN', 14
        elif lexeme == 'true':
            token, token_type = 'TRUE_LITERAL', 15
        elif lexeme == 'false':
            token, token_type = 'FALSE_LITERAL', 15
        elif is_float(lexeme):
            token, token_type = 'DECIMAL_LITERAL', 16
        elif lexeme[0:2] in ('0x', '0X') and is_hex(lexeme):
            token, token_type = 'HEXADECIMAL_LITERAL', 17
        else:
            token, token_type = 'ID', 18

        tokenized = Token.Token(lexeme, token, token_type, line_nums[count])
        if d:
            print('\nlexeme: ', tokenized.lexeme,
                  '\ntoken: ', tokenized.token,
                  '\ntoken_type: ', tokenized.token_type,
                  '\nline_num: ', tokenized.line_num,
                  '\nobj: ', type(tokenized))
        token_stream.append(tokenized)
    return token_stream
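# Hedged sketch only: is_float() and is_hex() are helpers used above but not defined
# in this snippet. Plausible stand-ins, assuming they simply test whether the whole
# lexeme parses as the given literal kind (the project's actual rules may be stricter):
def is_float(lexeme):
    # True for decimal literals such as '3' or '3.14'.
    try:
        float(lexeme)
        return True
    except ValueError:
        return False

def is_hex(lexeme):
    # True for hexadecimal literals such as '0x1A' or '0X1a'.
    try:
        int(lexeme, 16)
        return True
    except ValueError:
        return False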
from model import Token
from archives import DFA_ex

# token_stream = Lexer.scan('../examples/example2.dcf', True)
# print('\nmmmm token:', token_stream[0].token)

# x = x
token_stream = []
token_stream.append(Token.Token('', 'x', '', 0))
token_stream.append(Token.Token('', '=', '', 0))
token_stream.append(Token.Token('', 'x', '', 0))
token_stream.append(Token.Token('$', '$', '', 0))

input_length = len(token_stream)
for i in range(0, input_length):
    print(token_stream[i].token)

state_stack = [1]
token_stack = []
state_type = ''
state = 0
dfa_input = ''
i = 0
successful_parsing = False
while i < input_length:
    if i == 0:
from collections import defaultdict

def index_tokens(tokens):
    # Group tokens by their source line number.
    index = defaultdict(list)
    for token in tokens:
        tk = Token(token)
        index[tk.lineno].append(tk)
    return index
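# Hedged usage sketch: the Token class used above is project-specific and not shown
# here. With a minimal stand-in exposing the .lineno attribute that index_tokens()
# relies on, the grouping behaves like this (the real Token surely carries more fields):
class Token:
    def __init__(self, raw):
        # Stand-in for illustration only.
        self.lineno = raw['lineno']
        self.value = raw['value']

raw_tokens = [
    {'lineno': 1, 'value': 'x'},
    {'lineno': 1, 'value': '='},
    {'lineno': 2, 'value': 'y'},
]
by_line = index_tokens(raw_tokens)
print(sorted(by_line))                   # [1, 2]
print([t.value for t in by_line[1]])     # ['x', '=']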