Example #1
from gensim.models.word2vec import Word2Vec
from spacy.en import English
from regression import BaseBowRegressor
from language import tokenize_document

# better tokenizer
nlp = English()

NUM_PARTITIONS = 70
WINDOW_SIZE = 4
VECTOR_SIZE = 100
MODEL_FILE = "w2v_%d_parts_%d_vector_%d_window" % (NUM_PARTITIONS, VECTOR_SIZE,
                                                   WINDOW_SIZE)

reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = BaseBowRegressor.get_reviews_data(
    range(1, NUM_PARTITIONS))

# tokenize_document expects an (index, text) tuple, as produced by enumerate
sentences = [tokenize_document((i, txt)) for (i, txt) in enumerate(reviews_texts)]

# build the word2vec model and save it
w2v = Word2Vec(sentences=sentences,
               size=VECTOR_SIZE,
               alpha=0.025,
               window=WINDOW_SIZE,
               min_count=2,
               sample=1e-5,
               workers=4,
               negative=10)
w2v.init_sims(replace=True)
w2v.save(MODEL_FILE)
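
As a quick sanity check, the saved model can be reloaded and queried. This is a minimal sketch assuming the same older gensim API used above (item access, .vocab and most_similar directly on the model); "food" is just an illustrative probe token that may or may not be in the review vocabulary.

from gensim.models.word2vec import Word2Vec

w2v = Word2Vec.load(MODEL_FILE)

probe = "food"  # illustrative token, assumed (not guaranteed) to be in the vocabulary
if probe in w2v.vocab:
    # vectors were L2-normalised by init_sims(replace=True) before saving
    print("vector head: %s" % w2v[probe][:5])
    for word, score in w2v.most_similar(probe, topn=5):
        print("%s %.3f" % (word, score))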
Example #2
reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data(
    PARTITIONS_TRAINING)

count = 0
for votes in funny_votes_train:
    if votes > 0:
        count += 1

print "Total non-zero votes: %d of %d" % (count, len(funny_votes_train))

print "Tokenizing"
NUM_ELEMENTS_TRAIN = None
NUM_ELEMENTS_TEST = None
reviews_tokens_train = [
    language.tokenize_document((i, txt))
    for (i, txt) in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN])
]

X_train = tokens_to_word_vectors(reviews_tokens_train, model)

reviews_tokens_train = None
reviews_train = None
gc.collect()

X_train = np.array(X_train)

y_train = np.array(funny_votes_train[:NUM_ELEMENTS_TRAIN]).astype('float32')

maxlen = 100  # cut texts after this number of words
batch_size = 32
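
tokens_to_word_vectors is not shown in these snippets. The sketch below is one hypothetical way such a helper could work, looking up each token in the Word2Vec model and dropping out-of-vocabulary tokens so that every review becomes a (num_tokens, VECTOR_SIZE) float array; the name and behaviour here are assumptions, not the repository's actual implementation.

import numpy as np

def tokens_to_word_vectors(tokenized_reviews, w2v_model):
    # hypothetical sketch: one array of word vectors per review,
    # out-of-vocabulary tokens are silently skipped
    vectors = []
    for tokens in tokenized_reviews:
        review_vecs = [w2v_model[tok] for tok in tokens if tok in w2v_model.vocab]
        vectors.append(np.array(review_vecs, dtype='float32'))
    return vectors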
Example #3
model = Word2Vec.load(WORD2VEC_MODEL)

reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data(PARTITIONS_TRAINING)

count = 0
for votes in funny_votes_train:
    if votes > 0:
        count += 1

print "Total non-zero votes: %d of %d" % (count, len(funny_votes_train))


print "Tokenizing"
NUM_ELEMENTS_TRAIN = None
NUM_ELEMENTS_TEST = None
reviews_tokens_train = [language.tokenize_document((i, txt)) for (i, txt) in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN])]

X_train = tokens_to_word_vectors(reviews_tokens_train, model)

reviews_tokens_train = None
reviews_train = None
gc.collect()

X_train = np.array(X_train)

y_train = np.array(funny_votes_train[:NUM_ELEMENTS_TRAIN]).astype('float32')

maxlen = 100 # cut texts after this number of words
batch_size = 32

print("Pad sequences (samples x time)")
Example #4
WORD2VEC_MODEL = "w2v_70_parts_100_vector_4_window"
PARTITIONS_TRAINING = range(1, 30)  #15
PARTITIONS_TESTING = range(50, 53)  #22

w2vmodel = Word2Vec.load(WORD2VEC_MODEL)

reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data(
    PARTITIONS_TRAINING)
reviews_train, labels_train = give_balanced_classes(reviews_train,
                                                    funny_votes_train)

print "Tokenizing"
NUM_ELEMENTS_TRAIN = None
NUM_ELEMENTS_TEST = None
reviews_tokens_train = [
    language.tokenize_document((i, unicode(txt)))
    for (i, txt) in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN])
]

X_train = tokens_to_word_vectors(reviews_tokens_train, w2vmodel)

reviews_tokens_train = None
reviews_train = None
gc.collect()

X_train = np.array(X_train)

labels_train = np.array(labels_train[:NUM_ELEMENTS_TRAIN])

# Load test material
print "LOADING TEST DATA"
Example #5
"""
Script to compute word vectors from the reviews
"""
from gensim.models.word2vec import Word2Vec
from spacy.en import English
from regression import BaseBowRegressor
from language import tokenize_document

# better tokenizer
nlp = English()

NUM_PARTITIONS = 70
WINDOW_SIZE = 4
VECTOR_SIZE = 100
MODEL_FILE = "w2v_%d_parts_%d_vector_%d_window" % (NUM_PARTITIONS, VECTOR_SIZE, WINDOW_SIZE)


reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = BaseBowRegressor.get_reviews_data(range(1, NUM_PARTITIONS))

sentences = [tokenize_document((i, txt)) for (i, txt) in enumerate(reviews_texts)]

# build the word2vec model and save it
w2v = Word2Vec(sentences=sentences, size=VECTOR_SIZE, alpha=0.025, window=WINDOW_SIZE, min_count=2, sample=1e-5, workers=4, negative=10)
w2v.init_sims(replace=True)
w2v.save(MODEL_FILE)
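
One consequence of init_sims(replace=True) worth spelling out: the stored vectors are overwritten with their L2-normalised versions (so the model cannot be trained further), which means cosine similarity between two in-vocabulary words reduces to a plain dot product. A minimal sketch, assuming the older gensim API above and two illustrative probe tokens:

import numpy as np

w1, w2 = "pizza", "burger"  # illustrative tokens, assumed to be in the vocabulary
if w1 in w2v.vocab and w2 in w2v.vocab:
    v1, v2 = w2v[w1], w2v[w2]
    print("norms: %.3f %.3f" % (np.linalg.norm(v1), np.linalg.norm(v2)))  # both ~1.0
    print("dot product:       %.3f" % np.dot(v1, v2))
    print("gensim similarity: %.3f" % w2v.similarity(w1, w2))  # should match the dot product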