Esempio n. 1
0
# Fit the forest on the training feature matrix, using the "stars" column of
# the training data as the target values.
# This may take a few minutes to run.
# NOTE(review): the value returned by fit() is rebound to `forest`; this
# assumes fit() returns the fitted estimator itself -- confirm.
forest = forest.fit(train_data_features, train["stars"])

# ********** Create an empty list and append the clean reviews one by one *******
# Every raw review is split into words by Word2VecUtility.review_to_wordlist
# and re-joined into a single space-separated string.
# NOTE(review): the second argument (True) presumably turns on stop-word
# removal -- confirm against Word2VecUtility.
clean_test_reviews = []
num_test_reviews = len(test["review"])

# A parenthesised single-argument print produces the same output as the
# Python 2 print statement and is also valid Python 3.
print("Cleaning and parsing the test set reviews...\n")

# Walk the reviews by position rather than looking each one up with
# test["review"][i]. If `test` is a pandas DataFrame that lookup goes by index
# label, so it fails (or picks the wrong rows) whenever the index is not
# exactly 0..n-1, e.g. after filtering or a train/test split.
for i, raw_review in enumerate(test["review"]):
    # Progress report after every 10000 reviews.
    if (i + 1) % 10000 == 0:
        print("Review %d of %d" % (i + 1, num_test_reviews))
    clean_test_reviews.append(
        " ".join(Word2VecUtility.review_to_wordlist(raw_review, True)))

# Run the cleaned test reviews through `vectorizer` to obtain their
# bag-of-words representation, converted to a numpy array via toarray()
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Let the random forest predict a label for every test review
print("Predicting test labels...\n")
pred_results = forest.predict(test_data_features)

# Compare the predictions against the "stars" column of the test data
# NOTE(review): presumably compare_prediction returns an accuracy score --
# confirm against Word2VecUtility.
accuracy = Word2VecUtility.compare_prediction(
    pred_results,
    test['stars'],
)

# # Copy the results to a pandas dataframe with an "id" column and
# # a "sentiment" column
# output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# # Use pandas to write the comma-separated output file
# output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model.csv'), index=False, quoting=3)
# print "Wrote results to Bag_of_Words_model.csv"

# ********** Create an empty list and append the clean reviews one by one *******
# NOTE(review): an identical cleaning pass over the same `test` reviews
# already appears earlier in this file; confirm whether repeating it here is
# intended.
# Every raw review is split into words by Word2VecUtility.review_to_wordlist
# and re-joined into a single space-separated string.
# NOTE(review): the second argument (True) presumably turns on stop-word
# removal -- confirm against Word2VecUtility.
clean_test_reviews = []
num_test_reviews = len(test["review"])

# A parenthesised single-argument print produces the same output as the
# Python 2 print statement and is also valid Python 3.
print("Cleaning and parsing the test set reviews...\n")

# Iterate over the reviews positionally instead of indexing with
# test["review"][i]. If `test` is a pandas DataFrame that lookup is
# label-based and breaks (or returns the wrong rows) when the index is not
# exactly 0..n-1.
for i, raw_review in enumerate(test["review"]):
    # Progress report after every 10000 reviews.
    if (i + 1) % 10000 == 0:
        print("Review %d of %d" % (i + 1, num_test_reviews))
    clean_test_reviews.append(
        " ".join(Word2VecUtility.review_to_wordlist(raw_review, True)))

# Build the bag-of-words features for the cleaned test reviews with
# `vectorizer` and convert them to a numpy array via toarray()
# NOTE(review): the same transform/predict/compare sequence already appears
# earlier in this file; confirm whether the repetition is intended.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Predict a label for each test review using the random forest
print("Predicting test labels...\n")
pred_results = forest.predict(test_data_features)

# Compare the predictions against the "stars" column of the test data
# NOTE(review): presumably compare_prediction returns an accuracy score --
# confirm against Word2VecUtility.
accuracy = Word2VecUtility.compare_prediction(
    pred_results,
    test['stars'],
)



# # Copy the results to a pandas dataframe with an "id" column and
# # a "sentiment" column
# output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# # Use pandas to write the comma-separated output file
# output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model.csv'), index=False, quoting=3)
# print "Wrote results to Bag_of_Words_model.csv"