YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064' YELP_COOL_TRAIN = '../yelp-dataset/TrainSet_cool_88698' YELP_COOL_DEV = '../yelp-dataset/DevSet_cool_88698' YELP_COOL_TEST = '../yelp-dataset/TestSet_cool_88698' CHARACTERS_PER_WORD = 15 WORDS_PER_DOCUMENT = 300 PREPEND = False if __name__ == '__main__': log('Initializing CharMapper') cm = CharMapper() yelp = YelpDataHandler() def get_yelp_char(train_reviews, test_reviews): log('Converting to character level representations') log(' --> Starting Training Data...') train_reviews = yelp.to_char_level_idx(train_reviews, char_container=cm, chars_per_word=CHARACTERS_PER_WORD, words_per_document=WORDS_PER_DOCUMENT, prepend=PREPEND) log(' --> Training Data Complete') log(' --> Starting Testing Data...') test_reviews = yelp.to_char_level_idx(test_reviews, char_container=cm, chars_per_word=CHARACTERS_PER_WORD, words_per_document=WORDS_PER_DOCUMENT,
# Script entry point: build two GloVe vector boxes (yelp-specific and global),
# pickle both for later reuse, then convert the Yelp "useful" review dataset
# into sentence-level index representations for each vocabulary.
if __name__ == '__main__':
    log('Building word vectors from {}'.format(YELP_WV_FILE))
    yelp_gb = GloVeBox(YELP_WV_FILE)
    yelp_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

    log('writing GloVeBox pickle...')
    # Context managers close the pickle files promptly; the original
    # pickle.dump(obj, open(path, 'wb'), ...) calls leaked the file handles.
    with open(YELP_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as fh:
        pickle.dump(yelp_gb, fh, pickle.HIGHEST_PROTOCOL)
    with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as fh:
        pickle.dump(global_gb, fh, pickle.HIGHEST_PROTOCOL)

    yelp = YelpDataHandler()

    ##################################
    ### YELP USEFUL
    ##################################
    log('Creating "useful" reviews sentence-datasets')
    (train_reviews, train_labels, test_reviews, test_labels) = \
        yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST)

    log('Converting to sentences: global word vectors')
    train_global_wvs_reviews = yelp.to_sentence_level_idx(
        train_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb)
    test_global_wvs_reviews = yelp.to_sentence_level_idx(
        test_reviews, SENTENCES_PER_PARAGRAPH, WORDS_PER_SENTENCE, global_gb)

    log('Converting to sentences: yelp word vectors')
from keras.layers.recurrent import GRU
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers.core import Dense, Activation, Dropout, Reshape, Flatten

# Pre-split Yelp "funny"-vote dataset and the GloVe word-vector file.
YELP_FUNNY_TRAIN = '../yelp-dataset/TrainSet_funny_75064'
YELP_FUNNY_DEV = '../yelp-dataset/DevSet_funny_75064'
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064'
WV_FILE = '../deep-text/embeddings/wv/glove.42B.300d.120000.txt'

# Script entry point: load the "funny" dataset and build (or reload) a cached
# character-level embedding over the training reviews.
if __name__ == '__main__':
    # Parenthesized single-argument print behaves identically on Python 2
    # and 3 (the original bare `print "..."` statements were Py2-only).
    print("Getting data in format texts / labels")
    yelp = YelpDataHandler()
    (train_reviews, train_labels, test_reviews, test_labels) = \
        yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

    print("Building character embedding")
    EMBEDDING_FILE = "YelpChar.pkl"
    if not os.path.isfile(EMBEDDING_FILE):
        # Character vector box with 300-dimensional vectors.
        cbox = EnglishCharBox(vector_dim=300)
        # Build the language embedding with the given vector box;
        # size=5000 presumably caps characters per text (the original
        # comment said "300 words" which contradicts the value) —
        # TODO confirm against OneLevelEmbedding.
        lembedding = OneLevelEmbedding(
            cbox, type=OneLevelEmbedding.CHAR_EMBEDDING, size=5000)
        lembedding.compute(train_reviews)
        lembedding.save(EMBEDDING_FILE)
    else:
        # Reuse the cached embedding instead of recomputing it.
        lembedding = OneLevelEmbedding.load(EMBEDDING_FILE)
# Build the global and yelp-specific GloVe vector boxes, pickle both, then
# convert the Yelp "useful" reviews to word-level index arrays and save the
# training matrices to .npy files.
log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
global_gb = GloVeBox(GLOBAL_WV_FILE)
global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

log('writing GloVeBox pickle...')
# `with` closes the pickle file promptly; the original
# pickle.dump(obj, open(path, 'wb'), ...) leaked the handle.
with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as fh:
    pickle.dump(global_gb, fh, pickle.HIGHEST_PROTOCOL)

# BUG FIX: this step builds the *yelp* vectors from YELP_WV_FILE, but the
# original log message was copy-pasted and claimed "global word vectors".
log('Building yelp word vectors from {}'.format(YELP_WV_FILE))
yelp_gb = GloVeBox(YELP_WV_FILE)
yelp_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)

log('writing GloVeBox pickle...')
with open(YELP_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as fh:
    pickle.dump(yelp_gb, fh, pickle.HIGHEST_PROTOCOL)

yelp = YelpDataHandler()

##################################
### YELP USEFUL
##################################
log('Saving "useful" votes data')
(train_reviews, train_labels, test_reviews, test_labels) = \
    yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST)

# Word-level index representation under the *global* vocabulary.
reviews_wvs_train = yelp.to_word_level_idx(train_reviews, global_gb, WORDS_PER_TEXT)

# -- training data save
np.save('Yelp_useful_train_fulltext_glove_300_X.npy', reviews_wvs_train)
np.save('Yelp_useful_train_fulltext_glove_300_y.npy', train_labels)

# Word-level index representation under the yelp-specific vocabulary.
reviews_wvs_train = yelp.to_word_level_idx(train_reviews, yelp_gb, WORDS_PER_TEXT)

# -- training data save
pickle.dump(global_gb, open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) log('Building global word vectors from {}'.format(YELP_WV_FILE)) yelp_gb = GloVeBox(YELP_WV_FILE) yelp_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True) log('writing GloVeBox pickle...') pickle.dump(yelp_gb, open(YELP_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) yelp = YelpDataHandler() ################################## ### YELP USEFUL ################################## log('Saving "useful" votes data') (train_reviews, train_labels, test_reviews, test_labels) = \ yelp.get_data(YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST) reviews_wvs_train = yelp.to_word_level_idx(train_reviews, global_gb, WORDS_PER_TEXT) # -- training data save np.save('Yelp_useful_train_fulltext_glove_300_X.npy', reviews_wvs_train) np.save('Yelp_useful_train_fulltext_glove_300_y.npy', train_labels) reviews_wvs_train = yelp.to_word_level_idx(train_reviews, yelp_gb,
YELP_FUNNY_TEST = '../yelp-dataset/TestSet_funny_75064' YELP_COOL_TRAIN = '../yelp-dataset/TrainSet_cool_88698' YELP_COOL_DEV = '../yelp-dataset/DevSet_cool_88698' YELP_COOL_TEST = '../yelp-dataset/TestSet_cool_88698' CHARACTERS_PER_WORD = 15 WORDS_PER_DOCUMENT = 300 PREPEND = False if __name__ == '__main__': log('Initializing CharMapper') cm = CharMapper() yelp = YelpDataHandler() def get_yelp_char(train_reviews, test_reviews): log('Converting to character level representations') log(' --> Starting Training Data...') train_reviews = yelp.to_char_level_idx( train_reviews, char_container=cm, chars_per_word=CHARACTERS_PER_WORD, words_per_document=WORDS_PER_DOCUMENT, prepend=PREPEND) log(' --> Training Data Complete') log(' --> Starting Testing Data...') test_reviews = yelp.to_char_level_idx( test_reviews, char_container=cm,