def test_parse_sentence_to_dict(self):
    """parse_sentence_to_dict lowercases words and maps each to its count."""
    text = 'This test is unit test and unit is unit'
    expected = {'and': 1, 'is': 2, 'test': 2, 'this': 1, 'unit': 3}
    result = BagOfWords().parse_sentence_to_dict(text)
    self.assertEqual(expected, result)
def get_model(self, cross_item_score):
    """Return a BagOfWords model trained on the packages in *cross_item_score*.

    Delegates to the parent's get_model hook first, then trains a fresh
    bag-of-words model over the score dict's package names (no files saved).
    """
    super(CrossValidationBOW, self).get_model(cross_item_score)
    model = BagOfWords()
    model.train_model(cross_item_score.keys(), self.axi, save_files=False)
    return model
def test_parse_sentence_to_tuple(self):
    """parse_sentence_to_tuple yields the sorted vocabulary and a count row."""
    text = 'This test is unit test and unit is unit'
    expected_words = ['and', 'is', 'test', 'this', 'unit']
    expected_counts = [1, 2, 2, 1, 3]
    words, counts = BagOfWords().parse_sentence_to_tuple(text)
    # counts is matrix-like: first row holds the per-word frequencies
    self.assertEqual(counts.tolist()[0], expected_counts)
    self.assertEqual(words, expected_words)
def main():
    """Train an averaged perceptron on bag-of-words features.

    Reads train/dev/test splits from fixed filenames, reports per-epoch
    train/dev accuracy, and plots the accuracy curves at the end.
    """
    EPOCH = 10
    MIN_FREQ = 5
    SHUFFLE = True

    def _read_lines(path):
        # Fix: the original used bare open(path).read(), leaking one file
        # handle per dataset; the with-block guarantees closure.
        with open(path) as f:
            return f.read().strip().split("\n")

    trn_texts = _read_lines("trn.data")
    trn_labels = _read_lines("trn.label")
    dev_texts = _read_lines("dev.data")
    dev_labels = _read_lines("dev.label")
    tst_texts = _read_lines("tst.data")

    print('averaged perceptron')
    print('-' * 40)
    print('trn data size:', len(trn_texts))
    print('dev data size:', len(dev_texts))

    bag_of_words = BagOfWords(True, True, MIN_FREQ)
    trn_data = bag_of_words.fit_transform(trn_texts, trn_labels)
    dev_data = bag_of_words.transform(dev_texts)
    # tst_data is built but not evaluated here — kept for parity with the
    # original flow (presumably consumed elsewhere; confirm before removing).
    tst_data = bag_of_words.transform(tst_texts)

    print('min vocabulary freq:', MIN_FREQ)
    print('vocabulary size:', len(trn_data[0]))
    print('shuffle after epoch:', SHUFFLE)

    perceptron = AvgPerceptron(bag_of_words)
    print('training start\n')
    start = time()
    print('data accurary')
    print_cells(['epoch', 'trn', 'dev'], 9)
    print('-' * 30)

    trn_acc = []
    dev_acc = []

    def epoch_callback(i):
        # Record accuracy after each epoch so both curves can be plotted.
        trn_acc.append(perceptron.accuracy(trn_data, trn_labels))
        dev_acc.append(perceptron.accuracy(dev_data, dev_labels))
        print_cells([i, trn_acc[i], dev_acc[i]], 9)

    perceptron.fit(trn_data, trn_labels, EPOCH, SHUFFLE, epoch_callback)
    print('\ntraining end')
    print('duration:', round(time() - start))
    plot(list(range(0, EPOCH)), trn_acc, dev_acc, 'Perceptron')
def get_feature_extractor(feature_extractor):
    """Return a newly constructed extractor for the given name.

    'raw' maps to None (no extraction); an unrecognized name raises
    Exception("Unknown feature extractor").
    """
    factories = {
        'hog': HOGFeatureExtractor,
        'hog_fisher': lambda: FisherFeatureExtractor(local_feature_extractor_name='hog'),
        'sift': SIFTFeatureExtractor,
        'sift_fisher': lambda: FisherFeatureExtractor(local_feature_extractor_name='sift'),
        'kernel_descriptors': KernelDescriptorsExtractor,
        'bag_of_words_hog': lambda: BagOfWords(local_feature_extractor_name='hog'),
        'raw': lambda: None,
    }
    if feature_extractor not in factories:
        raise Exception("Unknown feature extractor")
    # Factories are lazy (class or lambda) so only the requested extractor
    # is ever constructed.
    return factories[feature_extractor]()
from bag_of_words import BagOfWords

# Demo: build a bag-of-words model over a tiny two-document corpus and
# print its derived attributes plus two example transformations.
corpus = [
    'most of the statistical algorithms, e.g machine learning and deep learning techniques, work with numeric data',
    'bag of words converts text to numbers'
]
number = 3
bow = BagOfWords(data_entries=corpus, number_of_most_frequent_words=number)
for item in (bow.data_entries, bow.wordfreq, bow.most_frequent,
             bow.corpus_bow(), bow.text_to_bow('machine learning')):
    print(item)
def createFeatureExtractor(self, num_features=100):
    """Create a BagOfWords feature extractor.

    Args:
        num_features: vocabulary size for the bag-of-words model.
            Defaults to 100, the previously hard-coded value, so existing
            callers are unaffected.
    """
    return BagOfWords(num_features)
# -*- coding: utf-8 -*-
#!/usr/bin/env python
"""Runner: build a bag-of-words model over the Geomedia AGENDA extracts."""
import os
import sys

from wordcount import WordCount
from dictionary_maker import DictionaryMaker
from pressnote import PressNote
from bag_of_words import BagOfWords

if __name__ == '__main__':
    INPUT_DIR = 'geomedia' + os.sep + 'Geomedia_extract_AGENDA'
    OUTPUT_DIR = 'outputs'
    DICTIONARY_PATH = OUTPUT_DIR + os.sep + 'dictionary_aus.txt'
    CLUSTERS_PATH = OUTPUT_DIR + os.sep + 'clusters_aus.txt'

    if len(sys.argv) != 1:
        # Script takes no arguments: show usage when any are given.
        # Fix: the original used a Python-2-only print statement; the
        # call form runs under both Python 2 and 3.
        print("python runner.py")
    else:
        # 1000 caps the dictionary size; change to None for no cap.
        bag_of_words = BagOfWords('en', OUTPUT_DIR, INPUT_DIR, 1000)
        # Alternate languages, kept for reference:
        # bag_of_words = BagOfWords('es', OUTPUT_DIR, INPUT_DIR, 1000)
        # bag_of_words = BagOfWords('fr', OUTPUT_DIR, INPUT_DIR, 1000)
def createFeatureExtractor(self):
    """Build a BagOfWords extractor configured from this object's settings."""
    settings = (
        self._num_features,
        self._use_stemming,
        self._use_elminating,
        self._sort_threshold,
        self._use_good_turing,
    )
    # Positional expansion matches the original call order exactly.
    return BagOfWords(*settings)
# NOTE(review): this chunk is the interior of a larger routine — `df`,
# `datetime_list`, `pd`, `pp`, and `BagOfWords` are defined outside this view.
df['datetime'] = datetime_list
# Normalize to timezone-aware UTC timestamps.
df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
# df = df.set_index('datetime')
df = df.drop(['date'], axis=1)
# df.sort_index(inplace=True)
# pp.pprint(df)
""" Apply BagOfWords() to each body of """
token_list = []
for x in range(len(df)):
    # assumes df['body'][x] is a sequence whose first element is the
    # message text — TODO confirm against the loader that built df
    tokens = BagOfWords().get_tokens(df['body'][x][0])
    token_list.append(tokens)
df['body_tokens'] = token_list
# pp.pprint(df)
""" Iterate through the files and count words in body """
# NOTE(review): the loop below duplicates the tokenization above verbatim
# and its result is never written back to df — looks like leftover
# scaffolding for a word-count step; confirm intent before removing.
token_list = []
for x in range(len(df)):
    tokens = BagOfWords().get_tokens(df['body'][x][0])
    token_list.append(tokens)