def main(): data_dir = "C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\" # Read data from files train = pd.read_csv(data_dir + "labeledTrainData.tsv", header=0, delimiter="\t", quoting=3) test = pd.read_csv(data_dir + "testData.tsv", header=0, delimiter="\t", quoting=3) unlabeled_train = pd.read_csv(data_dir + "unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3) # Verify the number of reviews that were read (100,000 in total) print "Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews\n" % ( train["review"].size, test["review"].size, unlabeled_train["review"].size, ) # Load the punkt tokenizer tokenizer = nltk.data.load("tokenizers/punkt/english.pickle") # Split the labeled and unlabeled training sets into clean sentences # Initialize an empty list of sentences sentences = [] print "Parsing sentences from training set" for review in train["review"]: sentences += KaggleUtility.review_to_sentences(review, tokenizer) print "Parsing sentences from unlabeled set" for review in unlabeled_train["review"]: sentences += KaggleUtility.review_to_sentences(review, tokenizer) # Set parameters and train the word2vec model # Import the built-in logging module and configure it so that Word2Vec # creates nice output messages logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO) # Set values for various parameters num_features = 300 # Word vector dimensionality min_word_count = 40 # Minimum word count num_workers = 4 # Number of threads to run in parallel context = 10 # Context window size downsampling = 1e-3 # Downsample setting for frequent words # Initialize and train the model (this will take some time) print "Training Word2Vec model..." model = Word2Vec( sentences, workers=num_workers, size=num_features, min_count=min_word_count, window=context, sample=downsampling, seed=1, ) # If you don't plan to train the model any further, calling # init_sims will make the model much more memory-efficient. model.init_sims(replace=True) # It can be helpful to create a meaningful model name and # save the model for later use. You can load it later using Word2Vec.load() model_name = "300features_40minwords_10context" model.save(data_dir + model_name) model.doesnt_match("man woman child kitchen".split()) model.doesnt_match("france england germany berlin".split()) model.doesnt_match("paris berlin london austria".split()) model.most_similar("man") model.most_similar("queen") model.most_similar("awful") # Create average vectors for the training and test sets print "Creating average feature vecs for training reviews" train_data_vecs = get_avg_feature_vecs(get_clean_reviews(train), model, num_features) print "Creating average feature vecs for test reviews" test_data_vecs = get_avg_feature_vecs(get_clean_reviews(test), model, num_features) # Fit a random forest to the training set, then make predictions # Fit a random forest to the training data, using 100 trees forest = RandomForestClassifier(n_estimators=100) print "Fitting a random forest to labeled training data..." forest = forest.fit(train_data_vecs, train["sentiment"]) # Test & extract results result = forest.predict(test_data_vecs) # Write the test results output = pd.DataFrame(data={"id": test["id"], "sentiment": result}) output.to_csv(data_dir + "Word2Vec_AverageVectors.csv", index=False, quoting=3) print "Wrote Word2Vec_AverageVectors.csv"
def main():
    data_dir = 'C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'
    model = Word2Vec.load(data_dir + '300features_40minwords_10context')

    # Run k-means on the word vectors and print a few clusters
    start = time.time()  # Start time

    # Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
    # average of 5 words per cluster
    word_vectors = model.syn0
    num_clusters = word_vectors.shape[0] // 5

    # Initialize a k-means object and use it to extract centroids
    print 'Running K means'
    kmeans_clustering = KMeans(n_clusters=num_clusters)
    idx = kmeans_clustering.fit_predict(word_vectors)

    # Get the end time and print how long the process took
    end = time.time()
    elapsed = end - start
    print 'Time taken for K Means clustering: ', elapsed, 'seconds.'

    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number
    word_centroid_map = dict(zip(model.index2word, idx))

    # Print the first ten clusters
    for cluster in xrange(0, 10):
        # Print the cluster number
        print '\nCluster %d' % cluster

        # Find all of the words for that cluster number, and print them out
        words = [word for word, c in word_centroid_map.items() if c == cluster]
        print words

    # Create clean_train_reviews and clean_test_reviews as we did before
    # Read data from files
    train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)

    print 'Cleaning training reviews'
    clean_train_reviews = []
    for review in train['review']:
        clean_train_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))

    print 'Cleaning test reviews'
    clean_test_reviews = []
    for review in test['review']:
        clean_test_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))

    # Create bags of centroids
    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros((train['review'].size, num_clusters), dtype='float32')

    # Transform the training set reviews into bags of centroids
    counter = 0
    for review in clean_train_reviews:
        train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1

    # Repeat for test reviews
    test_centroids = np.zeros((test['review'].size, num_clusters), dtype='float32')
    counter = 0
    for review in clean_test_reviews:
        test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1

    # Fit a random forest and extract predictions
    forest = RandomForestClassifier(n_estimators=100)

    # Fitting the forest may take a few minutes
    print 'Fitting a random forest to labeled training data...'
    forest = forest.fit(train_centroids, train['sentiment'])
    result = forest.predict(test_centroids)

    # Write the test results
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv(data_dir + 'BagOfCentroids.csv', index=False, quoting=3)
    print 'Wrote BagOfCentroids.csv'
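# The bag-of-centroids script calls create_bag_of_centroids(), which is not
# shown in this section. A minimal sketch is given below: it counts how many
# words of a cleaned review fall into each k-means cluster, producing a
# fixed-length feature vector. It assumes numpy is imported as np and that
# word_centroid_map maps each vocabulary word to its cluster index, as built
# above; the real helper may differ.
def create_bag_of_centroids(wordlist, word_centroid_map):
    # One slot per cluster; unknown words are simply skipped
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype='float32')
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1
    return bag_of_centroids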
def get_clean_reviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))
    return clean_reviews
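# All of these scripts rely on KaggleUtility.review_to_wordlist() and
# review_to_sentences(), which are not shown in this section. The sketch below
# is a hypothetical version modeled on the usual cleaning steps for this data
# set (strip HTML, keep letters only, lower-case, optionally drop stop words);
# it assumes bs4 and the nltk stopwords corpus are available, and the real
# utility class may differ.
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleUtility(object):
    # Hypothetical sketch of the review-cleaning helpers used above

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        # Strip HTML, keep letters only, lower-case, and optionally drop stop words
        review_text = BeautifulSoup(review, "html.parser").get_text()
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        words = review_text.lower().split()
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        # Split a review into sentences, each returned as a list of cleaned words
        raw_sentences = tokenizer.tokenize(review.strip())
        sentences = []
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                sentences.append(KaggleUtility.review_to_wordlist(raw_sentence, remove_stopwords))
        return sentences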
def main():
    data_dir = 'C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'
    train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)

    print 'The first review is:'
    print train['review'][0]
    raw_input('Press Enter to continue...')

    # print 'Downloading text data sets...'
    # nltk.download()

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list
    print 'Cleaning and parsing the training set movie reviews...\n'
    for i in xrange(0, len(train['review'])):
        clean_train_reviews.append(' '.join(KaggleUtility.review_to_wordlist(train['review'][i], True)))

    # Create a bag of words from the training set
    print 'Creating the bag of words...\n'

    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool
    vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=5000)

    # fit_transform() does two things: first, it fits the model and learns the
    # vocabulary; second, it transforms our training data into feature vectors.
    # The input to fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Numpy arrays are easy to work with, so convert the result to an array
    train_data_features = train_data_features.toarray()

    # Train a random forest using the bag of words
    print 'Training the random forest (this may take a while)...'

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    # Fit the forest to the training set, using the bag of words as
    # features and the sentiment labels as the response variable.
    # This may take a few minutes to run.
    forest = forest.fit(train_data_features, train['sentiment'])

    # Create an empty list and append the clean test reviews one by one
    clean_test_reviews = []

    print 'Cleaning and parsing the test set movie reviews...\n'
    for i in xrange(0, len(test['review'])):
        clean_test_reviews.append(' '.join(KaggleUtility.review_to_wordlist(test['review'][i], True)))

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make sentiment label predictions
    print 'Predicting test labels...\n'
    result = forest.predict(test_data_features)

    # Copy the results to a pandas dataframe with an "id" column and
    # a "sentiment" column
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})

    # Use pandas to write the comma-separated output file
    output.to_csv(data_dir + 'Bag_of_Words_model.csv', index=False, quoting=3)
    print 'Wrote results to Bag_of_Words_model.csv'
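# The scripts above assume a common set of module-level imports and an entry
# point, neither of which is shown in this section. The block below is only a
# sketch of what those might look like; in particular, the import path for
# KaggleUtility is hypothetical and depends on how the repository lays out its
# modules.
import logging
import time

import nltk.data
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

from kaggle_utility import KaggleUtility  # hypothetical module providing the cleaning helpers


if __name__ == '__main__':
    main()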