Example #1
    def test_parse_sentence_to_dict(self):
        sentence = 'This test is unit test and unit is unit'
        words_dict = {'and': 1, 'is': 2, 'test': 2, 'this': 1, 'unit': 3}

        bag_of_words = BagOfWords()
        ret_words_dict = bag_of_words.parse_sentence_to_dict(sentence)

        self.assertEqual(words_dict, ret_words_dict)
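Note: the test above pins down the behavior of parse_sentence_to_dict without showing it. A minimal sketch consistent with the expected dict (lowercasing plus whitespace tokenization; not the project's actual code) could be:

from collections import Counter

def parse_sentence_to_dict(sentence):
    # lowercase, split on whitespace, count occurrences:
    # 'This test is unit test and unit is unit' ->
    # {'and': 1, 'is': 2, 'test': 2, 'this': 1, 'unit': 3}
    return dict(Counter(sentence.lower().split()))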
Example #2

    def get_model(self, cross_item_score):
        super(CrossValidationBOW, self).get_model(cross_item_score)

        # train a fresh bag-of-words model on the packages being scored
        pkgs_list = cross_item_score.keys()
        bag_of_words = BagOfWords()

        bag_of_words.train_model(pkgs_list, self.axi, save_files=False)

        return bag_of_words
Example #3
    def test_parse_sentence_to_tuple(self):
        sentence = 'This test is unit test and unit is unit'
        words_list = ['and', 'is', 'test', 'this', 'unit']
        words_count = [1, 2, 2, 1, 3]

        bag_of_words = BagOfWords()
        [ret_words_list,
         ret_words_count] = bag_of_words.parse_sentence_to_tuple(sentence)

        self.assertEqual(ret_words_count.tolist()[0], words_count)
        self.assertEqual(ret_words_list, words_list)
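Note: the .tolist()[0] in the assertion implies the counts come back as a 2-D row (a numpy or densified sparse row). A hypothetical implementation that satisfies this test, assuming the same lowercase/whitespace tokenization as above:

from collections import Counter

import numpy as np

def parse_sentence_to_tuple(sentence):
    counts = Counter(sentence.lower().split())
    words = sorted(counts)  # alphabetical, matching words_list in the test
    # a 1xV array keeps the counts 2-D, so .tolist()[0] yields the flat row
    return words, np.array([[counts[w] for w in words]])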
Example #4
from time import time  # BagOfWords, AvgPerceptron, print_cells and plot are project-local imports


def main():
    EPOCH = 10
    MIN_FREQ = 5
    SHUFFLE = True

    trn_texts = open("trn.data").read().strip().split("\n")
    trn_labels = open("trn.label").read().strip().split("\n")
    dev_texts = open("dev.data").read().strip().split("\n")
    dev_labels = open("dev.label").read().strip().split("\n")
    tst_texts = open("tst.data").read().strip().split("\n")

    print('averaged perceptron')
    print('-' * 40)
    print('trn data size:', len(trn_texts))
    print('dev data size:', len(dev_texts))

    bag_of_words = BagOfWords(True, True, MIN_FREQ)
    trn_data = bag_of_words.fit_transform(trn_texts, trn_labels)
    dev_data = bag_of_words.transform(dev_texts)
    tst_data = bag_of_words.transform(tst_texts)

    print('min vocabulary freq:', MIN_FREQ)
    print('vocabulary size:', len(trn_data[0]))
    print('shuffle after epoch:', SHUFFLE)

    perceptron = AvgPerceptron(bag_of_words)

    print('training start\n')
    start = time()
    print('data accuracy')
    print_cells(['epoch', 'trn', 'dev'], 9)
    print('-' * 30)

    trn_acc = []
    dev_acc = []

    def epoch_callback(i):
        trn_acc.append(perceptron.accuracy(trn_data, trn_labels))
        dev_acc.append(perceptron.accuracy(dev_data, dev_labels))
        print_cells([i, trn_acc[i], dev_acc[i]], 9)

    perceptron.fit(trn_data, trn_labels, EPOCH, SHUFFLE, epoch_callback)

    print('\ntraining end')
    print('duration:', round(time() - start))

    plot(list(range(0, EPOCH)), trn_acc, dev_acc, 'Perceptron')
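Note: print_cells is a project-local helper not shown here. Given the call print_cells(['epoch', 'trn', 'dev'], 9), a plausible sketch is fixed-width cell padding (hypothetical, not the project's actual code):

def print_cells(cells, width):
    # pad each cell to a fixed width so the epoch/accuracy rows line up
    print(''.join(str(cell).ljust(width) for cell in cells))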
Example #5
def get_feature_extractor(feature_extractor):
    if feature_extractor == 'hog':
        return HOGFeatureExtractor()
    elif feature_extractor == 'hog_fisher':
        return FisherFeatureExtractor(local_feature_extractor_name='hog')
    elif feature_extractor == 'sift':
        return SIFTFeatureExtractor()
    elif feature_extractor == 'sift_fisher':
        return FisherFeatureExtractor(local_feature_extractor_name='sift')
    elif feature_extractor == 'kernel_descriptors':
        return KernelDescriptorsExtractor()
    elif feature_extractor == 'bag_of_words_hog':
        return BagOfWords(local_feature_extractor_name='hog')
    elif feature_extractor == 'raw':
        return None
    else:
        raise Exception("Unknown feature extractor")
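Note: a hypothetical call site for this factory; 'raw' deliberately returns None, so callers must handle the no-extractor case:

extractor = get_feature_extractor('bag_of_words_hog')  # BagOfWords over HOG features

raw = get_feature_extractor('raw')
assert raw is None  # 'raw' means: skip feature extraction entirely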
Example #6
from bag_of_words import BagOfWords
corpus = [
    'most of the statistical algorithms, e.g machine learning and deep learning techniques, work with numeric data',
    'bag of words converts text to numbers'
]
number = 3
bow = BagOfWords(data_entries=corpus, number_of_most_frequent_words=number)

print(bow.data_entries)
print(bow.wordfreq)
print(bow.most_frequent)
print(bow.corpus_bow())
print(bow.text_to_bow('machine learning'))
Example #7
    def createFeatureExtractor(self):
        return BagOfWords(100)
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
from wordcount import WordCount
from dictionary_maker import DictionaryMaker
from pressnote import PressNote
from bag_of_words import BagOfWords

if __name__ == '__main__':
    INPUT_DIR = 'geomedia' + os.sep + 'Geomedia_extract_AGENDA'
    OUTPUT_DIR = 'outputs'
    DICTIONARY_PATH = OUTPUT_DIR + os.sep + 'dictionary_aus.txt'
    CLUSTERS_PATH = OUTPUT_DIR + os.sep + 'clusters_aus.txt'

    if len(sys.argv) != 1:
        print "python runner.py"
    else:
        '''dictionary_maker = DictionaryMaker('en')
        dictionary_maker.parse(INPUT_DIR)
        dictionary_maker.dump(DICTIONARY_PATH)

        bag_of_words = BagOfWords('en', DICTIONARY_PATH, INPUT_DIR, 1000)  # change None -> dictionary size
        bag_of_words.cluster(CLUSTERS_PATH)'''

        bag_of_words = BagOfWords('en', OUTPUT_DIR, INPUT_DIR,
                                  1000)  # change None -> dictionary size
        # bag_of_words = BagOfWords('es', OUTPUT_DIR, INPUT_DIR, 1000)  # change None -> dictionary size
        # bag_of_words = BagOfWords('fr', OUTPUT_DIR, INPUT_DIR, 1000)  # change None -> dictionary size
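Note: the path constants above concatenate with os.sep, which works; os.path.join is the more common idiom and reads a bit shorter:

import os

OUTPUT_DIR = 'outputs'
DICTIONARY_PATH = os.path.join(OUTPUT_DIR, 'dictionary_aus.txt')
CLUSTERS_PATH = os.path.join(OUTPUT_DIR, 'clusters_aus.txt')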
Example #9
    def createFeatureExtractor(self):
        return BagOfWords(self._num_features, self._use_stemming,
                          self._use_elminating, self._sort_threshold,
                          self._use_good_turing)
Example #10
df['datetime'] = datetime_list
df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
# df = df.set_index('datetime')
df = df.drop(['date'], axis=1)
# df.sort_index(inplace=True)

# pp.pprint(df)
"""
Apply BagOfWords() to each body of 
"""

token_list = []

for x in range(len(df)):
    tokens = BagOfWords().get_tokens(df['body'][x][0])
    token_list.append(tokens)

df['body_tokens'] = token_list

# pp.pprint(df)
"""
Iterate through the files and count words in body
"""

token_list = []

for x in range(len(df)):
    tokens = BagOfWords().get_tokens(df['body'][x][0])
    token_list.append(tokens)
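Note: both loops above rebuild a BagOfWords instance per row. An equivalent, more idiomatic version reuses one instance and maps get_tokens over the column (same calls as in the snippet):

bow = BagOfWords()
df['body_tokens'] = df['body'].apply(lambda body: bow.get_tokens(body[0]))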