Code example #1
# NOTE: KaggleUtility (review cleaning) and create_bag_of_centroids are
# assumed to be project-local helpers from the M4573R/kaggle repository.
import time

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier


def main():
    data_dir = 'C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'
    model = Word2Vec.load(data_dir + '300features_40minwords_10context')

    # Run k-means on the word vectors and print a few clusters

    # Start time
    start = time.time()

    # Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
    # average of 5 words per cluster
    word_vectors = model.syn0
    num_clusters = word_vectors.shape[0] // 5

    # Initialize a k-means object and use it to extract centroids
    print 'Running k-means'
    kmeans_clustering = KMeans(n_clusters=num_clusters)
    idx = kmeans_clustering.fit_predict(word_vectors)

    # Get the end time and print how long the process took
    end = time.time()
    elapsed = end - start
    print 'Time taken for k-means clustering: ', elapsed, 'seconds.'

    # Create a word / cluster-index dictionary, mapping each vocabulary
    # word to the number of the cluster it was assigned to
    word_centroid_map = dict(zip(model.index2word, idx))

    # Print the first ten clusters
    for cluster in xrange(0, 10):

        # Print the cluster number
        print '\nCluster %d' % cluster

        # Find all of the words for that cluster number, and print them out
        words = [word for word, c in word_centroid_map.iteritems() if c == cluster]
        print words

    # Create clean_train_reviews and clean_test_reviews as we did before

    # Read data from files
    train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)

    print 'Cleaning training reviews'
    clean_train_reviews = []
    for review in train['review']:
        clean_train_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))

    print 'Cleaning test reviews'
    clean_test_reviews = []
    for review in test['review']:
        clean_test_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))

    # Create bags of centroids

    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros((train['review'].size, num_clusters), dtype='float32')

    # Transform the training set reviews into bags of centroids
    counter = 0
    for review in clean_train_reviews:
        train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1

    # Repeat for test reviews
    test_centroids = np.zeros((test['review'].size, num_clusters), dtype='float32')

    counter = 0
    for review in clean_test_reviews:
        test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1

    # Fit a random forest and extract predictions
    forest = RandomForestClassifier(n_estimators=100)

    # Fitting the forest may take a few minutes
    print 'Fitting a random forest to labeled training data...'
    forest = forest.fit(train_centroids, train['sentiment'])
    result = forest.predict(test_centroids)

    # Write the test results
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv(data_dir + 'BagOfCentroids.csv', index=False, quoting=3)
    print 'Wrote BagOfCentroids.csv'
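
The create_bag_of_centroids helper called above is not part of this listing. Below is a minimal sketch of such a helper, assuming it simply counts how many words of a cleaned review fall into each cluster; the name and signature are taken from the call sites above, but the body is an assumption:

import numpy as np


def create_bag_of_centroids(wordlist, word_centroid_map):
    # The number of clusters is one more than the highest cluster index
    num_centroids = max(word_centroid_map.values()) + 1

    # Pre-allocate the bag-of-centroids vector (for speed)
    bag_of_centroids = np.zeros(num_centroids, dtype='float32')

    # For each word in the review that is in the vocabulary,
    # increment the count for the cluster it belongs to
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1

    return bag_of_centroids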
Code example #2
def get_clean_reviews(reviews):
    # Clean every review in the DataFrame's 'review' column, dropping
    # stop words (KaggleUtility is the project's text-cleaning helper)
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))
    return clean_reviews
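
For reference, this helper condenses the two cleaning loops from code example #1 into one call per data set. A hypothetical call site, with pd and data_dir as in the other examples:

train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
clean_train_reviews = get_clean_reviews(train)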
Code example #3
File: bag_of_words.py (project: M4573R/kaggle)
# NOTE: KaggleUtility is assumed to be a project-local helper module
# from the M4573R/kaggle repository.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer


def main():
    data_dir = 'C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'

    train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)

    print 'The first review is:'
    print train['review'][0]

    raw_input('Press Enter to continue...')

    # print 'Downloading text data sets...'
    # nltk.download()

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review, clean it, and join the word list back into
    # a single space-separated string

    print 'Cleaning and parsing the training set movie reviews...\n'
    for review in train['review']:
        clean_train_reviews.append(' '.join(KaggleUtility.review_to_wordlist(review, remove_stopwords=True)))

    # Create a bag of words from the training set

    print 'Creating the bag of words...\n'

    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool
    vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=5000)

    # fit_transform() does two things: first, it fits the model and
    # learns the vocabulary; second, it transforms the training data
    # into feature vectors. The input to fit_transform should be a
    # list of strings
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # NumPy arrays are easy to work with, so convert the result to an array
    train_data_features = train_data_features.toarray()

    # Train a random forest using the bag of words

    print 'Training the random forest (this may take a while)...'

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    # Fit the forest to the training set, using the bag of words as
    # features and the sentiment labels as the response variable

    # This may take a few minutes to run
    forest = forest.fit(train_data_features, train['sentiment'])

    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = []

    print 'Cleaning and parsing the test set movie reviews...\n'
    for review in test['review']:
        clean_test_reviews.append(' '.join(KaggleUtility.review_to_wordlist(review, remove_stopwords=True)))

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make sentiment label predictions
    print 'Predicting test labels...\n'
    result = forest.predict(test_data_features)

    # Copy the results to a pandas dataframe with an "id" column and
    # a "sentiment" column
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})

    # Use pandas to write the comma-separated output file
    output.to_csv(data_dir + 'Bag_of_Words_model.csv', index=False, quoting=3)
    print 'Wrote results to Bag_of_Words_model.csv'
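
Neither script shows how main() is invoked; presumably each file ends with the standard entry-point guard:

if __name__ == '__main__':
    main()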