Example 1
from nlpdatahandlers import ImdbDataHandler
# CharMapper and log are assumed to come from the surrounding project; their
# exact modules are not shown in this snippet.

IMDB_DATA = './datasets/aclImdb/aclImdb'

CHARACTERS_PER_WORD = 15
WORDS_PER_DOCUMENT = 300
PREPEND = False

if __name__ == '__main__':

    log('Initializing CharMapper')
    cm = CharMapper()

    log('Load data from original source')
    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews,
     train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)

    log('Converting to character level representations')
    train_global_wvs_reviews = imdb.to_char_level_idx(
        train_reviews,
        char_container=cm,
        chars_per_word=CHARACTERS_PER_WORD,
        words_per_document=WORDS_PER_DOCUMENT,
        prepend=PREPEND)

    test_global_wvs_reviews = imdb.to_char_level_idx(
        test_reviews,
        char_container=cm,
        chars_per_word=CHARACTERS_PER_WORD,
        words_per_document=WORDS_PER_DOCUMENT,
        prepend=PREPEND)
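For intuition, here is a minimal sketch of the kind of fixed-shape index structure a character-level conversion like the one above produces. The alphabet and the helper below are hypothetical stand-ins for CharMapper and to_char_level_idx, assuming each word is cut or padded to CHARACTERS_PER_WORD indices and each document to WORDS_PER_DOCUMENT words:

def to_char_indices(text, chars_per_word=15, words_per_document=300):
    # Hypothetical stand-in for CharMapper + to_char_level_idx: each
    # character maps to an integer id (0 is reserved for padding) and the
    # output shape is fixed at words_per_document x chars_per_word.
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    char_to_idx = dict((c, i + 1) for i, c in enumerate(alphabet))
    doc = []
    for word in text.lower().split()[:words_per_document]:
        idx = [char_to_idx.get(c, 0) for c in word[:chars_per_word]]
        idx += [0] * (chars_per_word - len(idx))  # pad the word
        doc.append(idx)
    # pad the document with all-zero "words"
    doc += [[0] * chars_per_word] * (words_per_document - len(doc))
    return doc

print(to_char_indices("a great movie", chars_per_word=5, words_per_document=4))
# [[1, 0, 0, 0, 0], [7, 18, 5, 1, 20], [13, 15, 22, 9, 5], [0, 0, 0, 0, 0]]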
Example 2
from cervantes.language import OneLevelEmbedding
from cervantes.nn.models import RNNClassifier
from nlpdatahandlers import ImdbDataHandler

YELP_FUNNY_TRAIN = '../yelp-dataset/TrainSet_funny_75064'
YELP_FUNNY_DEV = '../yelp-dataset/DevSet_funny_75064'
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'

IMDB_DATA = '../deep-text/datasets/aclImdb/aclImdb'
WV_FILE = '../deep-text/embeddings/wv/glove.42B.300d.120000.txt'

if __name__ == '__main__':

    print "Getting data in format texts / labels"

    imdb = ImdbDataHandler(source=IMDB_DATA)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST)
    train_reviews = train_reviews[:5000]
    test_reviews = test_reviews[:1000]
    train_labels = list(train_labels)[:5000]
    test_labels = list(test_labels)[:1000]

    #yelp = YelpDataHandler()
    #(train_reviews, train_labels, test_reviews, test_labels) = \
    #    yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

    print "Building language embeddings. This requires parsing text so it might " \
          "be pretty slow "
    # Compute text embeddings, containing the processed text tokens together with a vector-to-index
    # translation object (the vector box), should be pickled in order to be efficiently used with
    # different models. Hence, we can save time once we have precomputed a language embedding
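The compute-once-then-pickle pattern that comment describes can be sketched with the standard library alone; EMBEDDING_CACHE and build_embedding below are placeholders, not part of the cervantes API:

import os
import pickle

EMBEDDING_CACHE = 'imdb_embedding.pkl'  # hypothetical cache file

if os.path.exists(EMBEDDING_CACHE):
    # Cheap path: load the precomputed language embedding.
    with open(EMBEDDING_CACHE, 'rb') as fh:
        embedding = pickle.load(fh)
else:
    # Expensive path: parse the corpus and build the embedding once.
    embedding = build_embedding(train_reviews)  # placeholder builder
    with open(EMBEDDING_CACHE, 'wb') as fh:
        pickle.dump(embedding, fh)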
Example 3

from nlpdatahandlers import ImdbDataHandler
# SGDTextClassifier is used below; it comes from the surrounding project and
# its exact module is not shown in this snippet.

import sys

IMDB_DATA_DEFAULT = '../deep-text/datasets/aclImdb/aclImdb'

if __name__ == '__main__':

    if len(sys.argv) > 1 and sys.argv[1] != "":
        source = sys.argv[1]
    else:
        source = IMDB_DATA_DEFAULT

    print "Loading data from original source"
    imdb = ImdbDataHandler(source=source)
    (train_reviews, train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN, shuffle=True)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST, shuffle=True)

    # Simple bag of words with SGD
    sgd = SGDTextClassifier(train_reviews, train_labels,
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(verbose=5, n_jobs=4)
    test_error = sgd.get_test_error()
    print "Test error in held out set: " + str(test_error)
    print "=" * 20

    # Now with bigrams too
    sgd = SGDTextClassifier(train_reviews, train_labels, ngram_range=(1,2),
                            test_texts=test_reviews, test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(verbose=5, n_jobs=4)
    test_error = sgd.get_test_error()
    print "Test error in held out set: " + str(test_error)
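For reference, the bag-of-words-plus-SGD setup that SGDTextClassifier appears to wrap can be approximated directly with scikit-learn. This is a sketch assuming a standard TF-IDF pipeline with a linear model trained by SGD, not the project's actual implementation:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))),  # plain bag of words
    ('sgd', SGDClassifier(loss='hinge')),            # linear SVM trained by SGD
])

# Small grid over the regularization strength; widen as needed.
grid = GridSearchCV(pipeline,
                    {'sgd__alpha': [1e-5, 1e-4, 1e-3]},
                    verbose=5, n_jobs=4)
grid.fit(train_reviews, train_labels)
test_error = 1.0 - grid.score(test_reviews, test_labels)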
Example 4

from nlpdatahandlers import ImdbDataHandler
# As in Example 3, SGDTextClassifier comes from the surrounding project.

import sys

IMDB_DATA_DEFAULT = '../deep-text/datasets/aclImdb/aclImdb'

if __name__ == '__main__':

    if len(sys.argv) > 1 and sys.argv[1] != "":
        source = sys.argv[1]
    else:
        source = IMDB_DATA_DEFAULT

    print "Loading data from original source"
    imdb = ImdbDataHandler(source=source)
    (train_reviews,
     train_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TRAIN,
                                   shuffle=True)
    (test_reviews, test_labels) = imdb.get_data(type=ImdbDataHandler.DATA_TEST,
                                                shuffle=True)

    # Simple bag of words with SGD
    sgd = SGDTextClassifier(train_reviews,
                            train_labels,
                            test_texts=test_reviews,
                            test_labels=test_labels,
                            compute_features=True)
    sgd.grid_search_cv(verbose=5, n_jobs=4)
    test_error = sgd.get_test_error()
    print "Test error in held out set: " + str(test_error)
    print "=" * 20

    # Now with bigrams too
    sgd = SGDTextClassifier(train_reviews,
                            train_labels,
                            ngram_range=(1, 2),
                            test_texts=test_reviews,
                            test_labels=test_labels,
                            compute_features=True)
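For intuition about ngram_range=(1, 2): the vectorizer then emits both unigrams and bigrams as features. A quick illustration with scikit-learn's CountVectorizer (assuming SGDTextClassifier forwards the parameter to a vectorizer in this way):

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(ngram_range=(1, 2))
vec.fit(["not a good movie"])
print(list(vec.get_feature_names_out()))  # scikit-learn >= 1.0
# ['good', 'good movie', 'movie', 'not', 'not good']
# (the single-character token "a" is dropped by the default tokenizer)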