def main(data_dir='C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'):
    """Cluster Word2Vec vectors with k-means and classify reviews by centroid bags.

    Steps, in order:
      1. Load the saved Word2Vec model from ``data_dir``.
      2. Run k-means over the word vectors (about 5 words per cluster) and
         print the words of the first ten clusters.
      3. Read and clean the labeled training reviews and the test reviews.
      4. Turn every review into a "bag of centroids" feature row.
      5. Fit a 100-tree random forest and write the test predictions to
         ``BagOfCentroids.csv`` inside ``data_dir``.

    Args:
        data_dir: Directory holding the model file and the two ``.tsv`` data
            files; the output CSV is written there as well. Defaults to the
            location that used to be hard-coded.

    NOTE(review): this source defines a second ``main()`` further down. If both
    definitions live in the same module, the later one replaces this one at
    import time -- confirm which entry point is intended.
    """
    import os

    model = Word2Vec.load(
        os.path.join(data_dir, '300features_40minwords_10context'))

    # Run k-means on the word vectors and time it.
    start = time.time()

    # Set "k" (num_clusters) to 1/5th of the vocabulary size, i.e. an average
    # of 5 words per cluster. Floor division keeps k an integer even where
    # "/" means true division.
    word_vectors = model.syn0
    num_clusters = word_vectors.shape[0] // 5

    print('Running K means')
    kmeans_clustering = KMeans(n_clusters=num_clusters)
    idx = kmeans_clustering.fit_predict(word_vectors)

    elapsed = time.time() - start
    # The two spaces after the colon reproduce the output of the original
    # comma-separated print statement exactly.
    print('Time taken for K Means clustering:  %s seconds.' % elapsed)

    # Word -> cluster-number dictionary covering the whole vocabulary.
    word_centroid_map = dict(zip(model.index2word, idx))

    # Print the first ten clusters. A single pass over items() per cluster
    # replaces repeated values()[i] / keys()[i] lookups, which rebuilt both
    # lists on every iteration; the resulting word order is the same.
    for cluster in range(10):
        print('\nCluster %d' % cluster)
        words = [word for word, centroid in word_centroid_map.items()
                 if centroid == cluster]
        print(words)

    # Read data from files; quoting=3 is csv.QUOTE_NONE.
    train = pd.read_csv(os.path.join(data_dir, 'labeledTrainData.tsv'),
                        header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(os.path.join(data_dir, 'testData.tsv'),
                       header=0, delimiter='\t', quoting=3)

    def clean(raw_reviews):
        # One cleaned word list per raw review, stop words removed.
        return [KaggleUtility.review_to_wordlist(review, remove_stopwords=True)
                for review in raw_reviews]

    def bags_of_centroids(clean_reviews):
        # Pre-allocate the feature matrix (for speed), then fill it row by row.
        # create_bag_of_centroids is defined outside this block; presumably it
        # returns one row of length num_clusters -- confirm.
        bags = np.zeros((len(clean_reviews), num_clusters), dtype='float32')
        for row, review in enumerate(clean_reviews):
            bags[row] = create_bag_of_centroids(review, word_centroid_map)
        return bags

    print('Cleaning training reviews')
    clean_train_reviews = clean(train['review'])

    print('Cleaning test reviews')
    clean_test_reviews = clean(test['review'])

    # Create bags of centroids for both data sets.
    train_centroids = bags_of_centroids(clean_train_reviews)
    test_centroids = bags_of_centroids(clean_test_reviews)

    # Fit a random forest and extract predictions.
    forest = RandomForestClassifier(n_estimators=100)

    # Fitting the forest may take a few minutes.
    print('Fitting a random forest to labeled training data...')
    forest = forest.fit(train_centroids, train['sentiment'])
    result = forest.predict(test_centroids)

    # Write the test results.
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv(os.path.join(data_dir, 'BagOfCentroids.csv'),
                  index=False, quoting=3)
    print('Wrote BagOfCentroids.csv')
def get_clean_reviews(reviews, remove_stopwords=True):
    """Clean every entry of ``reviews["review"]`` and return the results as a list.

    Args:
        reviews: Any object supporting ``reviews["review"]`` that yields the raw
            review texts when iterated (for example a pandas DataFrame with a
            ``review`` column, or a plain dict holding a list).
        remove_stopwords: Forwarded to ``KaggleUtility.review_to_wordlist``.
            Defaults to ``True``, the value that used to be hard-coded.

    Returns:
        A list with one element per review, in input order. Each element is
        whatever ``KaggleUtility.review_to_wordlist`` returns (presumably a list
        of words -- confirm against that helper). Empty input gives ``[]``.
    """
    return [
        KaggleUtility.review_to_wordlist(review,
                                         remove_stopwords=remove_stopwords)
        for review in reviews["review"]
    ]
def main(data_dir='C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'):
    """Train a bag-of-words random forest and write test-set predictions.

    Steps, in order:
      1. Read the labeled training data and the test data from ``data_dir``.
      2. Show the first training review and wait for the user to press Enter.
      3. Clean the training reviews and build a 5000-feature bag of words.
      4. Fit a 100-tree random forest on those features and the ``sentiment``
         column.
      5. Clean and vectorize the test reviews with the same vocabulary,
         predict their labels and write ``Bag_of_Words_model.csv`` into
         ``data_dir``.

    Args:
        data_dir: Directory holding ``labeledTrainData.tsv`` and
            ``testData.tsv``; the output CSV is written there as well.
            Defaults to the location that used to be hard-coded.

    This function targets Python 2: it pauses with ``raw_input``.

    NOTE(review): this source also defines an earlier ``main()``. If both
    definitions live in the same module, this later one replaces the earlier
    one at import time -- confirm which entry point is intended.
    """
    import os

    # quoting=3 is csv.QUOTE_NONE.
    train = pd.read_csv(os.path.join(data_dir, 'labeledTrainData.tsv'),
                        header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(os.path.join(data_dir, 'testData.tsv'),
                       header=0, delimiter='\t', quoting=3)

    print('The first review is:')
    # Positional access, so this is the first row whatever the index labels.
    print(train['review'].iloc[0])
    raw_input('Press Enter to continue...')

    # NOTE(review): the script used to carry a disabled nltk.download() call
    # at this point. Presumably the cleaning helper needs NLTK data that must
    # be downloaded once by hand -- confirm against KaggleUtility.

    def clean(raw_reviews):
        # One space-joined string of cleaned words per raw review. Iterating
        # the column directly avoids label-based lookups by row number.
        return [' '.join(KaggleUtility.review_to_wordlist(review, True))
                for review in raw_reviews]

    print('Cleaning and parsing the training set movie reviews...\n')
    clean_train_reviews = clean(train['review'])

    # Create a bag of words from the training set.
    print('Creating the bag of words...\n')

    # CountVectorizer is scikit-learn's bag of words tool; the vocabulary is
    # capped at 5000 features.
    vectorizer = CountVectorizer(analyzer='word', tokenizer=None,
                                 preprocessor=None, stop_words=None,
                                 max_features=5000)

    # fit_transform() first learns the vocabulary, then transforms the
    # training data into feature vectors. Its input is a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Convert the result to a dense numpy array.
    train_data_features = train_data_features.toarray()

    # Train a random forest using the bag of words.
    print('Training the random forest (this may take a while)...')
    forest = RandomForestClassifier(n_estimators=100)

    # Features are the bag of words, the response is the sentiment label.
    # This may take a few minutes to run.
    forest = forest.fit(train_data_features, train['sentiment'])

    print('Cleaning and parsing the test set movie reviews...\n')
    clean_test_reviews = clean(test['review'])

    # transform() (not fit_transform) reuses the vocabulary learned from the
    # training set for the test features.
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make sentiment label predictions.
    print('Predicting test labels...\n')
    result = forest.predict(test_data_features)

    # Copy the results to a pandas dataframe with an "id" column and a
    # "sentiment" column, then write the comma-separated output file.
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv(os.path.join(data_dir, 'Bag_of_Words_model.csv'),
                  index=False, quoting=3)
    print('Wrote results to Bag_of_Words_model.csv')