def test_filter_tokens(self):
    t = Tokenizer()
    tokens = t.tokenize(string1)  # string1 is a module-level test fixture (not shown here)
    filtered = t.filter_tokens(tokens)
    # assertEquals is a deprecated alias; use assertEqual
    self.assertEqual(filtered, [
        'microscopy', 'use', 'microscopes', 'see', 'micro', 'sized', 'objects'
    ])
def getData():
    """Tokenize every labelled file and split its vectors into train/test sets."""
    TRAIN_DATA, TRAIN_POS, TRAIN_NEG = [], 0, 0
    TEST_DATA, TEST_POS, TEST_NEG = [], 0, 0
    for i in range(1, TOTAL_FILE_COUNT + 1):
        fname = get_file_name(i)
        F = Tokenizer("labelled/" + fname + ".txt")
        F.tokenize()
        F.filter_tokens()
        # F.print_tokens()
        d, p, n = F.vectorize()
        # Every vector from a single file carries the same 'fid', so the
        # train/test decision can be made once per file instead of re-checking
        # it for the counts afterwards.
        in_train = bool(d) and int(d[0]['fid']) in train_files
        if in_train:
            TRAIN_DATA.extend(d)
            TRAIN_POS += p
            TRAIN_NEG += n
        else:
            TEST_DATA.extend(d)
            TEST_POS += p
            TEST_NEG += n
    print("Token generation completed.")
    row = '{0: <10} {1: <10} {2: <10}'
    print('Train data:')
    print(row.format("Total", "Positive", "Negative"))
    print(row.format(len(TRAIN_DATA), TRAIN_POS, TRAIN_NEG))
    print('Test data:')
    print(row.format("Total", "Positive", "Negative"))
    print(row.format(len(TEST_DATA), TEST_POS, TEST_NEG))
    return TRAIN_DATA, TEST_DATA
def getData(startIndex, endIndex):
    """Tokenize labelled files startIndex..endIndex and return all their vectors."""
    all_data = []
    all_pos = 0
    all_neg = 0
    for i in range(startIndex, endIndex + 1):
        fname = get_file_name(i)
        F = Tokenizer("labelled/" + fname + ".txt")
        F.tokenize()
        F.filter_tokens()
        # F.print_tokens()
        d, p, n = F.vectorize()
        all_data.extend(d)  # extend(), not a side-effecting list comprehension
        all_pos += p
        all_neg += n
    print("Token generation completed.")
    print('{0: <10} {1: <10} {2: <10}'.format("Total", "Positive", "Negative"))
    print('{0: <10} {1: <10} {2: <10}'.format(len(all_data), all_pos, all_neg))
    return all_data
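# Usage sketch (hypothetical, not in the original source). The two getData
# definitions above have different signatures and appear to come from different
# revisions; in a single module the second would shadow the first. Assuming the
# ranged variant and a module-level TOTAL_FILE_COUNT, a minimal driver:
if __name__ == "__main__":
    all_vectors = getData(1, TOTAL_FILE_COUNT)
    print("total vectors loaded:", len(all_vectors))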
def test_tokenizer_long(self):
    # Smoke test: `str` here is a module-level fixture (a long input string)
    # that shadows the builtin; the test only checks that tokenize/filter run
    # without raising.
    t = Tokenizer()
    tokens = t.tokenize(str)
    filtered = t.filter_tokens(tokens)
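# A hedged suggestion (not in the original tests): the smoke test above could
# assert basic invariants without pinning exact tokens, e.g.:
#     self.assertIsInstance(filtered, list)
#     self.assertGreater(len(filtered), 0)
# (Avoid isinstance(tok, str) checks here, since the fixture shadows the
# builtin `str` at module scope.)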