def classify(self, test_path, output_file, filter_func, filter_words=None):
    """Classify every file under *test_path* as spam or ham and write results.

    Files are processed in sorted order; each produces one output line
    (1-based line number, file name, predicted class, both scores, and the
    true label derived from the file name). All lines are written to
    *output_file* joined by newlines.

    Returns True once the output file has been written.
    """
    results = []
    # enumerate(start=1) replaces the hand-rolled line counter
    for line_no, file_name in enumerate(sorted(os.listdir(test_path)), start=1):
        tokens = generate_tokens(os.path.join(test_path, file_name),
                                 filter_func, filter_words)
        spam_score = self.score_spam(tokens)
        ham_score = self.score_ham(tokens)
        # ties (spam == ham) fall through to HAM, as in the original
        predicted = SPAM if spam_score > ham_score else HAM
        expected = get_label_from_file(file_name)
        results.append(self.compile_output_line(
            line_no, file_name, predicted, ham_score, spam_score, expected))
    with open(output_file, 'w') as out:
        out.write('\n'.join(results))
    return True
def __init__(self, readline, offset=(0, 0), is_fast_parser=False):
    """Set up a token stream over *readline* and reset all parser state.

    :param readline: callable returning one source line per call, as
        expected by ``tokenize.generate_tokens``.
    :param offset: (line, column) pair added to token positions.
    :param is_fast_parser: enables the fast-parser bookkeeping below.
    """
    # token source
    self.readline = readline
    self.gen = tokenize.generate_tokens(readline)
    self.offset = offset

    # stream state
    self.closed = False
    self.is_first = True
    self.push_backs = []

    # fast parser options
    self.is_fast_parser = is_fast_parser
    self.previous = [None, None, (0, 0), (0, 0), '']
    # NOTE: current and previous deliberately start as the SAME list
    # object, exactly as the original chained assignment produced.
    self.current = self.previous
    self.in_flow = False
    self.new_indent = False
    self.parser_indent = 0
    self.old_parser_indent = 0
    self.is_decorator = False
    self.first_stmt = True
def _get_path_until_cursor(self, start_pos=None):
    """Collect the dotted path (e.g. ``foo.bar(x).baz``) ending at *start_pos*.

    Works by feeding the source to ``tokenize`` REVERSED, line by line, so
    the path is scanned right-to-left from the cursor; the accumulated
    string is reversed back before returning.

    Returns a tuple ``(path_string, start_cursor)`` where *start_cursor*
    is the (line, column) position at which the path begins.

    NOTE(review): the default ``start_pos=None`` would crash the tuple
    unpacking below — callers apparently always pass an explicit
    position. Confirm before relying on the default.
    """
    def fetch_line():
        # Produce the "next" line for tokenize — actually the previous
        # source line, reversed, walking upward from the cursor.
        if self._is_first:
            self._is_first = False
            self._line_length = self._column_temp
            line = self._first_line  # first call: only text left of the cursor
        else:
            line = self.get_line(self._line_temp)
            self._line_length = len(line)
            line = line + '\n'
        # add lines with a backslash at the end
        while True:
            self._line_temp -= 1
            last_line = self.get_line(self._line_temp)
            #print self._line_temp, repr(last_line)
            if last_line and last_line[-1] == '\\':
                # join continuation lines into one logical line
                line = last_line[:-1] + ' ' + line
                self._line_length = len(last_line)
            else:
                break
        return line[::-1]  # reversed, so tokenize sees the path backwards

    self._is_first = True
    self._line_temp, self._column_temp = start_cursor = start_pos
    # only the text before the cursor on the starting line is relevant
    self._first_line = self.get_line(self._line_temp)[:self._column_temp]

    open_brackets = ['(', '[', '{']
    close_brackets = [')', ']', '}']

    gen = tokenize.generate_tokens(fetch_line)
    string = ''
    level = 0  # bracket nesting depth (in reversed order: close opens, open closes)
    force_point = False  # next token must be a '.' for the path to continue
    last_type = None
    try:
        for token_type, tok, start, end, line in gen:
            # print 'tok', token_type, tok, force_point
            if last_type == token_type == tokenize.NAME:
                # two adjacent NAMEs means a keyword boundary; keep them apart
                string += ' '
            if level > 0:
                # inside brackets: swallow everything, only track nesting
                if tok in close_brackets:
                    level += 1
                if tok in open_brackets:
                    level -= 1
            elif tok == '.':
                force_point = False
            elif force_point:
                # it is reversed, therefore a number is getting recognized
                # as a floating point number
                if token_type == tokenize.NUMBER and tok[0] == '.':
                    force_point = False
                else:
                    break  # path ended: NAME/STRING not followed by '.'
            elif tok in close_brackets:
                level += 1
            elif token_type in [tokenize.NAME, tokenize.STRING]:
                force_point = True
            elif token_type == tokenize.NUMBER:
                pass
            else:
                # any other token terminates the path; record the column
                self._column_temp = self._line_length - end[1]
                break

            # token accepted: update the start cursor and accumulate it
            x = start_pos[0] - end[0] + 1
            l = self.get_line(x)
            l = self._first_line if x == start_pos[0] else l
            start_cursor = x, len(l) - end[1]
            self._column_temp = self._line_length - end[1]
            string += tok
            last_type = token_type
    except tokenize.TokenError:
        # incomplete source (e.g. unbalanced brackets) — keep what we have
        # NOTE(review): sys.exc_info is passed uncalled; likely meant
        # sys.exc_info() — confirm against the debug.warning signature.
        debug.warning("Tokenize couldn't finish", sys.exc_info)

    # string can still contain spaces at the end
    return string[::-1].strip(), start_cursor
def _get_path_until_cursor(self, start_pos=None):
    """Collect the dotted path (e.g. ``foo.bar(x).baz``) ending at the cursor.

    Works by feeding the source to ``tokenize`` REVERSED, line by line, so
    the path is scanned right-to-left; the accumulated string is reversed
    back before returning.

    :param start_pos: (line, column) to scan back from; defaults to
        ``self.position`` when None.
    :return: the path as a stripped string.
    """
    def fetch_line():
        # Produce the "next" line for tokenize — actually the previous
        # source line, reversed, walking upward from the cursor.
        line = self.get_line(self._line_temp)
        if self._is_first:
            self._is_first = False
            self._line_length = self._column_temp
            line = line[:self._column_temp]  # first call: only text left of the cursor
        else:
            self._line_length = len(line)
            line = line + '\n'
        # add lines with a backslash at the end
        while 1:
            self._line_temp -= 1
            last_line = self.get_line(self._line_temp)
            if last_line and last_line[-1] == '\\':
                # join continuation lines into one logical line
                line = last_line[:-1] + ' ' + line
                self._line_length = len(last_line)
            else:
                break
        return line[::-1]  # reversed, so tokenize sees the path backwards

    self._is_first = True
    if start_pos is None:
        self._line_temp = self.position[0]
        self._column_temp = self.position[1]
    else:
        self._line_temp, self._column_temp = start_pos

    open_brackets = ['(', '[', '{']
    close_brackets = [')', ']', '}']

    gen = tokenize.generate_tokens(fetch_line)
    string = ''
    level = 0  # bracket nesting depth (in reversed order: close opens, open closes)
    force_point = False  # next token must be a '.' for the path to continue
    last_type = None
    try:
        for token_type, tok, start, end, line in gen:
            #print 'tok', token_type, tok, force_point
            if last_type == token_type == tokenize.NAME:
                # two adjacent NAMEs means a keyword boundary; keep them apart
                string += ' '
            if level > 0:
                # inside brackets: swallow everything, only track nesting
                if tok in close_brackets:
                    level += 1
                if tok in open_brackets:
                    level -= 1
            elif tok == '.':
                force_point = False
            elif force_point:
                # it is reversed, therefore a number is getting recognized
                # as a floating point number
                if token_type == tokenize.NUMBER and tok[0] == '.':
                    force_point = False
                else:
                    break  # path ended: NAME/STRING not followed by '.'
            elif tok in close_brackets:
                level += 1
            elif token_type in [tokenize.NAME, tokenize.STRING]:
                force_point = True
            elif token_type == tokenize.NUMBER:
                pass
            else:
                # any other token terminates the path; record the column
                self._column_temp = self._line_length - end[1]
                break
            # token accepted: record its column and accumulate it
            self._column_temp = self._line_length - end[1]
            string += tok
            last_type = token_type
    except tokenize.TokenError:
        # incomplete source (e.g. unbalanced brackets) — keep what we have
        # NOTE(review): sys.exc_info is passed uncalled; likely meant
        # sys.exc_info() — confirm against the debug.warning signature.
        debug.warning("Tokenize couldn't finish", sys.exc_info)
    # string can still contain spaces at the end
    return string[::-1].strip()
def test_tokenize_comment(self):
    """The JS fixture must produce at least one in-line comment token."""
    tokens = generate_tokens(self.code_js)
    self.assertIn(TokenType.COMMENT_IN_LINE, tokens)
def test_tokenize_line_terminator(self):
    """The JS fixture must produce at least one line-terminator token."""
    tokens = generate_tokens(self.code_js)
    self.assertIn(TokenType.LINE_TERMINATOR, tokens)
def test_tokenize_space(self):
    """The JS fixture must produce at least one whitespace token."""
    tokens = generate_tokens(self.code_js)
    self.assertIn(TokenType.SPACE, tokens)