Python ImdbDataHandler Examples

Programming Language: Python

Namespace/Package Name: datahandlers

Class/Type: ImdbDataHandler

Examples at hotexamples.com: 2

Python ImdbDataHandler - 2 examples found. These are the top rated real world Python examples of datahandlers.ImdbDataHandler extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

get_data(1)

to_sentence_vectors(1)

Example #1

Show file

File: prepare_imdb_new.py Project: alfredolainez/deep-text-classification

if __name__ == '__main__':

    log('Building word vectors from {}'.format(IMDB_WV_FILE))
    gb = GloVeBox(IMDB_WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    pickle.dump(gb, open(IMDB_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
    pickle.dump(global_gb, open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to sentences: global word vectors')
    train_global_wvs_reviews = imdb.to_sentence_vectors(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                    WORDS_PER_SENTENCE, global_gb)
    test_global_wvs_reviews = imdb.to_sentence_vectors(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                   WORDS_PER_SENTENCE, global_gb)

    log('Converting to sentences: only imdb word vectors')
    train_imdb_wvs_reviews = imdb.to_sentence_vectors(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                    WORDS_PER_SENTENCE, gb)
    test_imdb_wvs_reviews = imdb.to_sentence_vectors(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                   WORDS_PER_SENTENCE, gb)

Example #2

Show file

File: classic_classification.py Project: alfredolainez/deep-text-classification

from classic.classifiers import TextClassifier, NaiveBayesClassifier, SGDTextClassifier, \
    LogisticClassifier, SVMClassifier, PerceptronClassifier, RandomForestTextClassifier
from datahandlers import ImdbDataHandler

IMDB_DATA = './datasets/aclImdb/aclImdb'

if __name__ == '__main__':

    print "Loading data from original source"
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)
    # TODO: Shuffle data

    # Simple bag of words with SGD
    sgd = SGDTextClassifier(train_reviews, train_labels,
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(verbose=0, n_jobs=4)

    # Simple bag of words with NB
    nb = NaiveBayesClassifier(train_reviews, train_labels,
                              test_texts=test_reviews, test_labels=test_labels)
    nb.set_bag_of_ngrams() # Also can compute bag of words manually
    nb.grid_search_cv(n_jobs=4)

    # Now shit with bigrams too
    sgd = SGDTextClassifier(train_reviews, train_labels, ngram_range=(1,2),
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(n_jobs=4, verbose=1)