"""Build a word2vec model from the review corpus and save it to disk."""
from gensim.models.word2vec import Word2Vec
from spacy.en import English

from regression import BaseBowRegressor
from language import tokenize_document  # better tokenizer

nlp = English()

# Hyperparameters; the model filename encodes them so different runs
# don't overwrite each other's saved models.
NUM_PARTITIONS = 70
WINDOW_SIZE = 4
VECTOR_SIZE = 100
MODEL_FILE = "w2v_%d_parts_%d_vector_%d_window" % (NUM_PARTITIONS,
                                                   VECTOR_SIZE,
                                                   WINDOW_SIZE)

reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = \
    BaseBowRegressor.get_reviews_data(range(1, NUM_PARTITIONS))

# tokenize_document takes an (index, unicode_text) pair -- match the call
# convention used by the other scripts in this project.
sentences = [tokenize_document((i, unicode(txt)))
             for (i, txt) in enumerate(reviews_texts)]

# Build the word2vec model and save it.
w2v = Word2Vec(sentences=sentences, size=VECTOR_SIZE, alpha=0.025,
               window=WINDOW_SIZE, min_count=2, sample=1e-5, workers=4,
               negative=10)
# Normalize vectors in place; saves memory but makes the model read-only.
w2v.init_sims(replace=True)
w2v.save(MODEL_FILE)
reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data( PARTITIONS_TRAINING) count = 0 for votes in funny_votes_train: if votes > 0: count += 1 print "Total non-zero votes: %d of %d" % (count, len(funny_votes_train)) print "Tokenizing" NUM_ELEMENTS_TRAIN = None NUM_ELEMENTS_TEST = None reviews_tokens_train = [ language.tokenize_document(txt) for txt in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN]) ] X_train = tokens_to_word_vectors(reviews_tokens_train, model) reviews_tokens_train = None reviews_train = None gc.collect() X_train = np.array(X_train) y_train = np.array(funny_votes_train[:NUM_ELEMENTS_TRAIN]).astype('float32') maxlen = 100 # cut texts after this number of words batch_size = 32
model = Word2Vec.load(WORD2VEC_MODEL) reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data(PARTITIONS_TRAINING) count = 0 for votes in funny_votes_train: if votes > 0: count += 1 print "Total non-zero votes: %d of %d" % (count, len(funny_votes_train)) print "Tokenizing" NUM_ELEMENTS_TRAIN = None NUM_ELEMENTS_TEST = None reviews_tokens_train = [language.tokenize_document(txt) for txt in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN])] X_train = tokens_to_word_vectors(reviews_tokens_train, model) reviews_tokens_train = None reviews_train = None gc.collect() X_train = np.array(X_train) y_train = np.array(funny_votes_train[:NUM_ELEMENTS_TRAIN]).astype('float32') maxlen = 100 # cut texts after this number of words batch_size = 32 print("Pad sequences (samples x time)")
WORD2VEC_MODEL = "w2v_70_parts_100_vector_4_window" PARTITIONS_TRAINING = range(1, 30) #15 PARTITIONS_TESTING = range(50, 53) #22 w2vmodel = Word2Vec.load(WORD2VEC_MODEL) reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data( PARTITIONS_TRAINING) reviews_train, labels_train = give_balanced_classes(reviews_train, funny_votes_train) print "Tokenizing" NUM_ELEMENTS_TRAIN = None NUM_ELEMENTS_TEST = None reviews_tokens_train = [ language.tokenize_document((i, unicode(txt))) for (i, txt) in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN]) ] X_train = tokens_to_word_vectors(reviews_tokens_train, w2vmodel) reviews_tokens_train = None reviews_train = None gc.collect() X_train = np.array(X_train) labels_train = np.array(labels_train[:NUM_ELEMENTS_TRAIN]) # Load test material print "LOADING TEST DATA"
""" Script to compute word vectors from the reviews """ from gensim.models.word2vec import Word2Vec from spacy.en import English from regression import BaseBowRegressor from language import tokenize_document # better tokenizer nlp = English() NUM_PARTITIONS = 70 WINDOW_SIZE = 4 VECTOR_SIZE = 100 MODEL_FILE = "w2v_%d_parts_%d_vector_%d_window" % (NUM_PARTITIONS, VECTOR_SIZE, WINDOW_SIZE) reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = BaseBowRegressor.get_reviews_data(range(1, NUM_PARTITIONS)) sentences = [tokenize_document(txt) for txt in enumerate(reviews_texts)] # build the word2vec model and save it w2v = Word2Vec(sentences=sentences, size=VECTOR_SIZE, alpha=0.025, window=WINDOW_SIZE, min_count=2, sample=1e-5, workers=4, negative=10) w2v.init_sims(replace=True) w2v.save(MODEL_FILE)