# Paths to pre-split Yelp review datasets; the numeric suffix in each
# filename is presumably the number of samples in the split -- TODO confirm.
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'

# Train/dev/test splits for the "cool" vote prediction task.
YELP_COOL_TRAIN = '../yelp-dataset/TrainSet_cool_88698'
YELP_COOL_DEV = '../yelp-dataset/DevSet_cool_88698'
YELP_COOL_TEST = '../yelp-dataset/TestSet_cool_88698'

# Character-level representation settings, passed to
# YelpDataHandler.to_char_level_idx below.
CHARACTERS_PER_WORD = 15   # characters kept per word
WORDS_PER_DOCUMENT = 300   # words kept per document
PREPEND = False            # padding side flag -- semantics defined by to_char_level_idx

if __name__ == '__main__':

    # NOTE(review): this fragment is truncated at a scrape boundary --
    # the second to_char_level_idx call below is cut off mid-argument-list,
    # so this snippet is not runnable as-is.
    log('Initializing CharMapper')
    cm = CharMapper()

    yelp = YelpDataHandler()

    def get_yelp_char(train_reviews, test_reviews):
        """Convert train and test review texts to character-level index
        representations via the shared ``cm`` CharMapper, using the
        module-level CHARACTERS_PER_WORD / WORDS_PER_DOCUMENT / PREPEND
        settings.
        """
        log('Converting to character level representations')
        log('    --> Starting Training Data...')
        train_reviews = yelp.to_char_level_idx(train_reviews, 
            char_container=cm,
            chars_per_word=CHARACTERS_PER_WORD,
            words_per_document=WORDS_PER_DOCUMENT,
            prepend=PREPEND)
        log('    --> Training Data Complete')
        log('    --> Starting Testing Data...')
        test_reviews = yelp.to_char_level_idx(test_reviews, 
            char_container=cm,
            chars_per_word=CHARACTERS_PER_WORD,
            words_per_document=WORDS_PER_DOCUMENT,
# --- Example #2 (scrape-source snippet separator; original marker "Exemple #2", score 0) ---
if __name__ == '__main__':

    # Build and normalize the Yelp-domain GloVe vector box.
    log('Building word vectors from {}'.format(YELP_WV_FILE))
    yelp_gb = GloVeBox(YELP_WV_FILE)
    yelp_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    # Build and normalize the general-domain GloVe vector box.
    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    # Persist both boxes next to their source .txt files.
    # Fix: use context managers so each pickle file is flushed and closed
    # deterministically -- the original `pickle.dump(obj, open(...), ...)`
    # pattern leaked the file handles.
    log('writing GloVeBox pickle...')
    with open(YELP_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as fh:
        pickle.dump(yelp_gb, fh, pickle.HIGHEST_PROTOCOL)
    with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as fh:
        pickle.dump(global_gb, fh, pickle.HIGHEST_PROTOCOL)

    yelp = YelpDataHandler()

    ##################################
    ### YELP USEFUL
    ##################################
    log('Creating "useful" reviews sentence-datasets')
    (train_reviews, train_labels, test_reviews, test_labels) = \
        yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST)

    # Index the reviews as sentence/word-index structures with the global
    # vocabulary box.
    log('Converting to sentences: global word vectors')
    train_global_wvs_reviews = yelp.to_sentence_level_idx(train_reviews, SENTENCES_PER_PARAGRAPH,
                                                    WORDS_PER_SENTENCE, global_gb)
    test_global_wvs_reviews = yelp.to_sentence_level_idx(test_reviews, SENTENCES_PER_PARAGRAPH,
                                                   WORDS_PER_SENTENCE, global_gb)

    # NOTE(review): snippet truncated at a scrape boundary here -- the
    # yelp-vocabulary conversion this log line announces is missing.
    log('Converting to sentences: yelp word vectors')
# --- Example #3 (scrape-source snippet separator; original marker "Exemple #3", score 0) ---
from keras.layers.recurrent import GRU
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers.core import Dense, Activation, Dropout, Reshape, Flatten

# Train/dev/test splits for the "funny" vote prediction task; the numeric
# suffix is presumably the sample count -- TODO confirm.
YELP_FUNNY_TRAIN = '../yelp-dataset/TrainSet_funny_75064'
YELP_FUNNY_DEV = '../yelp-dataset/DevSet_funny_75064'
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'

# Pre-trained GloVe embedding file (from the filename: 42B-token corpus,
# 300-dim vectors, truncated to a 120000-word vocabulary -- verify).
WV_FILE = '../deep-text/embeddings/wv/glove.42B.300d.120000.txt'

if __name__ == '__main__':

    # NOTE(review): Python 2 `print` statements below. This fragment also
    # appears stitched together from several scraped snippets (the GloVeBox
    # building and the "useful" data saving sections repeat with slightly
    # different formatting) and is truncated mid-call at the end, so it is
    # not runnable as-is.
    print "Getting data in format texts / labels"
    yelp = YelpDataHandler()
    (train_reviews, train_labels, test_reviews, test_labels) = \
        yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

    # Build (or load a cached) character-level embedding over the training
    # reviews.
    print "Building character embedding"
    EMBEDDING_FILE = "YelpChar.pkl"
    if not os.path.isfile(EMBEDDING_FILE):

        cbox = EnglishCharBox(vector_dim=300)

        # Build the language embedding with the given vector box and 300 words per text
        lembedding = OneLevelEmbedding(cbox, type=OneLevelEmbedding.CHAR_EMBEDDING, size=5000)
        lembedding.compute(train_reviews)
        lembedding.save(EMBEDDING_FILE)
    else:
        lembedding = OneLevelEmbedding.load(EMBEDDING_FILE)
    # Build and pickle the general-domain GloVe vector box.
    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    pickle.dump(global_gb, open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)

    # Build and pickle the Yelp-domain GloVe vector box.
    log('Building global word vectors from {}'.format(YELP_WV_FILE))
    yelp_gb = GloVeBox(YELP_WV_FILE)
    yelp_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    pickle.dump(yelp_gb, open(YELP_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)

    yelp = YelpDataHandler()

    ##################################
    ### YELP USEFUL
    ##################################
    log('Saving "useful" votes data')
    (train_reviews, train_labels, test_reviews, test_labels) = \
        yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST)

    # Word-index the training reviews with the global vocabulary and save.
    reviews_wvs_train = yelp.to_word_level_idx(train_reviews, global_gb, WORDS_PER_TEXT)
    # -- training data save
    np.save('Yelp_useful_train_fulltext_glove_300_X.npy', reviews_wvs_train)
    np.save('Yelp_useful_train_fulltext_glove_300_y.npy', train_labels)

    # NOTE(review): `reviews_wvs_train` is immediately recomputed with the
    # yelp vocabulary but never saved before the snippet repeats itself.
    reviews_wvs_train = yelp.to_word_level_idx(train_reviews, yelp_gb, WORDS_PER_TEXT)
    # -- training data save
    pickle.dump(global_gb,
                open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'),
                pickle.HIGHEST_PROTOCOL)

    log('Building global word vectors from {}'.format(YELP_WV_FILE))
    yelp_gb = GloVeBox(YELP_WV_FILE)
    yelp_gb.build(zero_token=True,
                  normalize_variance=False,
                  normalize_norm=True)

    log('writing GloVeBox pickle...')
    pickle.dump(yelp_gb,
                open(YELP_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'),
                pickle.HIGHEST_PROTOCOL)

    yelp = YelpDataHandler()

    ##################################
    ### YELP USEFUL
    ##################################
    log('Saving "useful" votes data')
    (train_reviews, train_labels, test_reviews, test_labels) = \
        yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST)

    reviews_wvs_train = yelp.to_word_level_idx(train_reviews, global_gb,
                                               WORDS_PER_TEXT)
    # -- training data save
    np.save('Yelp_useful_train_fulltext_glove_300_X.npy', reviews_wvs_train)
    np.save('Yelp_useful_train_fulltext_glove_300_y.npy', train_labels)

    # NOTE(review): snippet truncated mid-call at the scrape boundary below.
    reviews_wvs_train = yelp.to_word_level_idx(train_reviews, yelp_gb,
# NOTE(review): duplicate of the constants block at the top of this file
# (scrape artifact).
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'

# Train/dev/test splits for the "cool" vote prediction task.
YELP_COOL_TRAIN = '../yelp-dataset/TrainSet_cool_88698'
YELP_COOL_DEV = '../yelp-dataset/DevSet_cool_88698'
YELP_COOL_TEST = '../yelp-dataset/TestSet_cool_88698'

# Character-level representation settings for to_char_level_idx.
CHARACTERS_PER_WORD = 15   # characters kept per word
WORDS_PER_DOCUMENT = 300   # words kept per document
PREPEND = False            # padding side flag -- semantics defined by to_char_level_idx

if __name__ == '__main__':

    # NOTE(review): reformatted duplicate of the first snippet in this file;
    # it is truncated mid-argument-list at the end and not runnable as-is.
    log('Initializing CharMapper')
    cm = CharMapper()

    yelp = YelpDataHandler()

    def get_yelp_char(train_reviews, test_reviews):
        """Convert train and test review texts to character-level index
        representations via the shared ``cm`` CharMapper, using the
        module-level CHARACTERS_PER_WORD / WORDS_PER_DOCUMENT / PREPEND
        settings.
        """
        log('Converting to character level representations')
        log('    --> Starting Training Data...')
        train_reviews = yelp.to_char_level_idx(
            train_reviews,
            char_container=cm,
            chars_per_word=CHARACTERS_PER_WORD,
            words_per_document=WORDS_PER_DOCUMENT,
            prepend=PREPEND)
        log('    --> Training Data Complete')
        log('    --> Starting Testing Data...')
        test_reviews = yelp.to_char_level_idx(
            test_reviews,
            char_container=cm,