# Step 1: materialize the raw JSON as pickles. Strictly speaking we could
# build the final datasets straight from JSON, but keeping the pickle files
# around makes it easy to extend the data later.
pickles_from_json(NUM_PARTITIONS)

# Business metadata is needed to drop reviews from non-US (non-English) venues
business_data = get_business_data()

# Accumulate every partition's reviews into flat, parallel lists
review_texts = []
useful_votes = []
funny_votes = []
cool_votes = []
review_stars = []
for part_idx in range(1, NUM_PARTITIONS + 1):
    # Non-US states (Edinburgh, Quebec, Baden-Wuerttemberg) are excluded
    texts, useful, funny, cool, stars = get_reviews_data(
        (part_idx,), business_data, not_include_states=["EDH", "QC", "BW"])
    review_texts += texts
    useful_votes += useful
    funny_votes += funny
    cool_votes += cool
    review_stars += stars

# Build the "funny"-reviews dataset (balanced positive/negative classes)
reviews, labels = give_balanced_classes(review_texts, funny_votes, votes_threshold=3)
result = create_data_sets(reviews, labels, write_to_pickle=True, problem="funny")
(train_reviews, train_labels, dev_reviews, dev_labels, test_reviews, test_labels) = result

# Build the "useful"-reviews dataset the same way
reviews, labels = give_balanced_classes(review_texts, useful_votes, votes_threshold=3)
result = create_data_sets(reviews, labels, write_to_pickle=True, problem="useful")
# Example #2
# 0
"""
Compute WordVectors using Yelp Data
"""
from gensim.models.word2vec import Word2Vec
from util.language import detect_language, tokenize_text
from data_handling import get_reviews_data

# Set to true for zero in in English reviews. Makes the process much slower
FILTER_ENGLISH = True
# Name for output w2v model file
OUTPUT_MODEL_FILE = "w2v_yelp_100_alpha_0.025_window_4"
PICKLED_DATA = "/home/alfredo/deep-nlp/data/reviews.pickle."

NUM_PARTITIONS = 2  # Use all data
reviews_texts, _, _, _, _ = get_reviews_data(range(1, NUM_PARTITIONS),
                                             PICKLED_DATA)

# Each review will be considered a sentence
sentences = []
for num, text in enumerate(reviews_texts):
    if num % 10000 == 0:
        print "%d out of %d reviews read" % (num, len(reviews_texts))
    if FILTER_ENGLISH:
        if detect_language(text) == u"english":
            sentences.append(tokenize_text(text))
    else:
        sentences.append(text)

# Build a w2v model
w2v = Word2Vec(sentences=sentences,
               size=100,
"""
Compute WordVectors using Yelp Data
"""
from gensim.models.word2vec import Word2Vec
from util.language import detect_language, tokenize_text
from data_handling import get_reviews_data

# Set to true for zero in in English reviews. Makes the process much slower
FILTER_ENGLISH = True
# Name for output w2v model file
OUTPUT_MODEL_FILE = "w2v_yelp_100_alpha_0.025_window_4"
PICKLED_DATA = "/home/alfredo/deep-nlp/data/reviews.pickle."

NUM_PARTITIONS = 2 # Use all data
reviews_texts, _, _, _, _ = get_reviews_data(range(1, NUM_PARTITIONS), PICKLED_DATA)

# Each review will be considered a sentence
sentences = []
for num, text in enumerate(reviews_texts):
    if num % 10000 == 0:
        print "%d out of %d reviews read" % (num, len(reviews_texts))
    if FILTER_ENGLISH:
        if detect_language(text) == u"english":
            sentences.append(tokenize_text(text))
    else:
        sentences.append(text)

# Build a w2v model
w2v = Word2Vec(sentences=sentences, size=100, alpha=0.025, window=4, min_count=2, sample=1e-5, workers=4, negative=10)
w2v.save(OUTPUT_MODEL_FILE)