Example #1
def main():
    data_dir = '/home/john/data/bag-of-popcorn/'

    train = pd.read_csv(data_dir + 'labeledTrainData.tsv',
                        header=0,
                        delimiter='\t',
                        quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv',
                       header=0,
                       delimiter='\t',
                       quoting=3)

    print 'The first review is:'
    print train['review'][0]

    raw_input('Press Enter to continue...')

    # print 'Downloading text data sets...'
    # nltk.download()

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; the index i runs from 0 up to (but not including)
    # the length of the movie review list

    print 'Cleaning and parsing the training set movie reviews...\n'
    for i in xrange(0, len(train['review'])):
        clean_train_reviews.append(' '.join(
            KaggleUtility.review_to_wordlist(train['review'][i], True)))

    # Create a bag of words from the training set

    print 'Creating the bag of words...\n'

    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool
    vectorizer = CountVectorizer(analyzer='word',
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    # fit_transform() does two things: first, it fits the model and learns the
    # vocabulary; second, it transforms our training data into feature vectors.
    # The input to fit_transform should be a list of strings
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Numpy arrays are easy to work with, so convert the result to an array
    train_data_features = train_data_features.toarray()

    # Train a random forest using the bag of words

    print 'Training the random forest (this may take a while)...'

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    # Fit the forest to the training set, using the bag of words as
    # features and the sentiment labels as the response variable

    # This may take a few minutes to run
    forest = forest.fit(train_data_features, train['sentiment'])

    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = []

    print 'Cleaning and parsing the test set movie reviews...\n'
    for i in xrange(0, len(test['review'])):
        clean_test_reviews.append(' '.join(
            KaggleUtility.review_to_wordlist(test['review'][i], True)))

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make sentiment label predictions
    print 'Predicting test labels...\n'
    result = forest.predict(test_data_features)

    # Copy the results to a pandas dataframe with an "id" column and
    # a "sentiment" column
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})

    # Use pandas to write the comma-separated output file
    output.to_csv(data_dir + 'Bag_of_Words_model.csv', index=False, quoting=3)
    print 'Wrote results to Bag_of_Words_model.csv'
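
The examples in this listing call KaggleUtility.review_to_wordlist without showing it, and the main() functions assume the usual imports (pandas as pd, numpy as np, scikit-learn's CountVectorizer, RandomForestClassifier and KMeans, gensim's Word2Vec, plus nltk, time and logging). A minimal sketch of the helper, assuming it mirrors the Kaggle "Bag of Words Meets Bags of Popcorn" tutorial's KaggleWord2VecUtility (only the name and signature come from the calls above; the body is an assumption):

import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


def review_to_wordlist(review, remove_stopwords=False):
    # Strip the HTML markup left in the raw IMDB reviews
    review_text = BeautifulSoup(review, 'html.parser').get_text()
    # Keep letters only, then lower-case and split into words
    review_text = re.sub('[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()
    # Optionally drop English stop words
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]
    return words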
Example #2
def main():
    data_dir = 'C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'

    train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)

    print 'The first review is:'
    print train['review'][0]

    raw_input('Press Enter to continue...')

    # print 'Downloading text data sets...'
    # nltk.download()

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; the index i runs from 0 up to (but not including)
    # the length of the movie review list

    print 'Cleaning and parsing the training set movie reviews...\n'
    for i in xrange(0, len(train['review'])):
        clean_train_reviews.append(' '.join(KaggleUtility.review_to_wordlist(train['review'][i], True)))

    # Create a bag of words from the training set

    print 'Creating the bag of words...\n'

    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool
    vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=5000)

    # fit_transform() does two things: first, it fits the model and learns the
    # vocabulary; second, it transforms our training data into feature vectors.
    # The input to fit_transform should be a list of strings
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Numpy arrays are easy to work with, so convert the result to an array
    train_data_features = train_data_features.toarray()

    # Train a random forest using the bag of words

    print 'Training the random forest (this may take a while)...'

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    # Fit the forest to the training set, using the bag of words as
    # features and the sentiment labels as the response variable

    # This may take a few minutes to run
    forest = forest.fit(train_data_features, train['sentiment'])

    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = []

    print 'Cleaning and parsing the test set movie reviews...\n'
    for i in xrange(0, len(test['review'])):
        clean_test_reviews.append(' '.join(KaggleUtility.review_to_wordlist(test['review'][i], True)))

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make sentiment label predictions
    print 'Predicting test labels...\n'
    result = forest.predict(test_data_features)

    # Copy the results to a pandas dataframe with an "id" column and
    # a "sentiment" column
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})

    # Use pandas to write the comma-separated output file
    output.to_csv(data_dir + 'Bag_of_Words_model.csv', index=False, quoting=3)
    print 'Wrote results to Bag_of_Words_model.csv'
def main():
    data_dir = '/home/ajay/data/bag-of-popcorn/'
    model = Word2Vec.load(data_dir + '300features_40minwords_10context')

    # Run k-means on the word vectors and print a few clusters

    # Start time
    start = time.time()

    # Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
    # average of 5 words per cluster
    word_vectors = model.syn0
    num_clusters = word_vectors.shape[0] / 5

    # Initialize a k-means object and use it to extract centroids
    print 'Running K means'
    kmeans_clustering = KMeans(n_clusters=num_clusters)
    idx = kmeans_clustering.fit_predict(word_vectors)

    # Get the end time and print how long the process took
    end = time.time()
    elapsed = end - start
    print 'Time taken for K Means clustering: ', elapsed, 'seconds.'

    # Create a word-to-cluster dictionary, mapping each vocabulary word to
    # its cluster number
    word_centroid_map = dict(zip(model.index2word, idx))

    # Print the first ten clusters
    for cluster in xrange(0, 10):

        # Print the cluster number
        print '\nCluster %d' % cluster

        # Find all of the words for that cluster number, and print them out
        words = []
        for word, c in word_centroid_map.items():
            if c == cluster:
                words.append(word)
        print words

    # Create clean_train_reviews and clean_test_reviews as we did before

    # Read data from files
    train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)

    print 'Cleaning training reviews'
    clean_train_reviews = []
    for review in train['review']:
        clean_train_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))

    print 'Cleaning test reviews'
    clean_test_reviews = []
    for review in test['review']:
        clean_test_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))

    # Create bags of centroids

    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros((train['review'].size, num_clusters), dtype='float32')

    # Transform the training set reviews into bags of centroids
    counter = 0
    for review in clean_train_reviews:
        train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1

    # Repeat for test reviews
    test_centroids = np.zeros((test['review'].size, num_clusters), dtype='float32')

    counter = 0
    for review in clean_test_reviews:
        test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1

    # Fit a random forest and extract predictions
    forest = RandomForestClassifier(n_estimators=100)

    # Fitting the forest may take a few minutes
    print 'Fitting a random forest to labeled training data...'
    forest = forest.fit(train_centroids, train['sentiment'])
    result = forest.predict(test_centroids)

    # Write the test results
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv(data_dir + 'BagOfCentroids.csv', index=False, quoting=3)
    print 'Wrote BagOfCentroids.csv'
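
The two clustering examples call create_bag_of_centroids but never define it. A minimal sketch, assuming it follows the Kaggle tutorial: for one review, count how many of its words fall into each k-means cluster.

import numpy as np


def create_bag_of_centroids(wordlist, word_centroid_map):
    # One slot per cluster; cluster ids run from 0 to the largest id
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype='float32')
    # Increment the count of the cluster each in-vocabulary word belongs to
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids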
def main():
    data_dir = 'C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'
    model = Word2Vec.load(data_dir + '300features_40minwords_10context')

    # Run k-means on the word vectors and print a few clusters

    # Start time
    start = time.time()

    # Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
    # average of 5 words per cluster
    word_vectors = model.syn0
    num_clusters = word_vectors.shape[0] / 5

    # Initialize a k-means object and use it to extract centroids
    print 'Running K means'
    kmeans_clustering = KMeans(n_clusters=num_clusters)
    idx = kmeans_clustering.fit_predict(word_vectors)

    # Get the end time and print how long the process took
    end = time.time()
    elapsed = end - start
    print 'Time taken for K Means clustering: ', elapsed, 'seconds.'

    # Create a word-to-cluster dictionary, mapping each vocabulary word to
    # its cluster number
    word_centroid_map = dict(zip(model.index2word, idx))

    # Print the first ten clusters
    for cluster in xrange(0, 10):

        # Print the cluster number
        print '\nCluster %d' % cluster

        # Find all of the words for that cluster number, and print them out
        words = []
        for word, c in word_centroid_map.items():
            if c == cluster:
                words.append(word)
        print words

    # Create clean_train_reviews and clean_test_reviews as we did before

    # Read data from files
    train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)

    print 'Cleaning training reviews'
    clean_train_reviews = []
    for review in train['review']:
        clean_train_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))

    print 'Cleaning test reviews'
    clean_test_reviews = []
    for review in test['review']:
        clean_test_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))

    # Create bags of centroids

    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros((train['review'].size, num_clusters), dtype='float32')

    # Transform the training set reviews into bags of centroids
    counter = 0
    for review in clean_train_reviews:
        train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1

    # Repeat for test reviews
    test_centroids = np.zeros((test['review'].size, num_clusters), dtype='float32')

    counter = 0
    for review in clean_test_reviews:
        test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1

    # Fit a random forest and extract predictions
    forest = RandomForestClassifier(n_estimators=100)

    # Fitting the forest may take a few minutes
    print 'Fitting a random forest to labeled training data...'
    forest = forest.fit(train_centroids, train['sentiment'])
    result = forest.predict(test_centroids)

    # Write the test results
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv(data_dir + 'BagOfCentroids.csv', index=False, quoting=3)
    print 'Wrote BagOfCentroids.csv'
def main():
    data_dir = 'C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'

    # Read data from files
    train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)
    unlabeled_train = pd.read_csv(data_dir + 'unlabeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

    # Verify the number of reviews that were read (100,000 in total)
    print 'Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews\n' % \
          (train['review'].size, test['review'].size, unlabeled_train['review'].size)

    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Split the labeled and unlabeled training sets into clean sentences

    # Initialize an empty list of sentences
    sentences = []

    print 'Parsing sentences from training set'
    for review in train['review']:
        sentences += KaggleUtility.review_to_sentences(review, tokenizer)

    print 'Parsing sentences from unlabeled set'
    for review in unlabeled_train['review']:
        sentences += KaggleUtility.review_to_sentences(review, tokenizer)

    # Set parameters and train the word2vec model

    # Configure the built-in logging module so that Word2Vec creates nice
    # output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    print 'Training Word2Vec model...'
    model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count=min_word_count,
                     window=context, sample=downsampling, seed=1)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # It can be helpful to create a meaningful model name and
    # save the model for later use. You can load it later using Word2Vec.load()
    model_name = '300features_40minwords_10context'
    model.save(data_dir + model_name)

    # Sanity-check the trained model; the return values of these calls are not
    # printed here, so run them interactively to inspect the answers
    model.doesnt_match('man woman child kitchen'.split())
    model.doesnt_match('france england germany berlin'.split())
    model.doesnt_match('paris berlin london austria'.split())
    model.most_similar('man')
    model.most_similar('queen')
    model.most_similar('awful')

    # Create average vectors for the training and test sets

    print 'Creating average feature vecs for training reviews'

    train_data_vecs = get_avg_feature_vecs(get_clean_reviews(train), model, num_features)

    print 'Creating average feature vecs for test reviews'

    test_data_vecs = get_avg_feature_vecs(get_clean_reviews(test), model, num_features)

    # Fit a random forest to the training set, then make predictions

    # Fit a random forest to the training data, using 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    print 'Fitting a random forest to labeled training data...'
    forest = forest.fit(train_data_vecs, train['sentiment'])

    # Test & extract results
    result = forest.predict(test_data_vecs)

    # Write the test results
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv(data_dir + 'Word2Vec_AverageVectors.csv', index=False, quoting=3)
    print 'Wrote Word2Vec_AverageVectors.csv'
def get_clean_reviews(reviews):
    clean_reviews = []
    for review in reviews['review']:
        clean_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))
    return clean_reviews
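
The average-vector examples rely on get_avg_feature_vecs, which is not shown. A minimal sketch, assuming it matches the Kaggle tutorial and the older gensim API used above (model.index2word, model[word]): average the Word2Vec vectors of each review's in-vocabulary words.

import numpy as np


def make_feature_vec(words, model, num_features):
    # Sum the vectors of all words that are in the model's vocabulary,
    # then divide by the number of such words
    feature_vec = np.zeros((num_features,), dtype='float32')
    nwords = 0.
    index2word_set = set(model.index2word)
    for word in words:
        if word in index2word_set:
            nwords += 1.
            feature_vec = np.add(feature_vec, model[word])
    if nwords > 0:
        feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(reviews, model, num_features):
    # Pre-allocate one row per review for speed, then fill it in
    review_feature_vecs = np.zeros((len(reviews), num_features), dtype='float32')
    for counter, review in enumerate(reviews):
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
    return review_feature_vecs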
def main():
    data_dir = 'C:\\Users\\John\\Documents\\Kaggle\\Word2Vec\\'

    # Read data from files
    train = pd.read_csv(data_dir + 'labeledTrainData.tsv',
                        header=0,
                        delimiter='\t',
                        quoting=3)
    test = pd.read_csv(data_dir + 'testData.tsv',
                       header=0,
                       delimiter='\t',
                       quoting=3)
    unlabeled_train = pd.read_csv(data_dir + 'unlabeledTrainData.tsv',
                                  header=0,
                                  delimiter='\t',
                                  quoting=3)

    # Verify the number of reviews that were read (100,000 in total)
    print 'Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews\n' % \
          (train['review'].size, test['review'].size, unlabeled_train['review'].size)

    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Split the labeled and unlabeled training sets into clean sentences

    # Initialize an empty list of sentences
    sentences = []

    print 'Parsing sentences from training set'
    for review in train['review']:
        sentences += KaggleUtility.review_to_sentences(review, tokenizer)

    print 'Parsing sentences from unlabeled set'
    for review in unlabeled_train['review']:
        sentences += KaggleUtility.review_to_sentences(review, tokenizer)

    # Set parameters and train the word2vec model

    # Configure the built-in logging module so that Word2Vec creates nice
    # output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Set values for various parameters
    num_features = 300  # Word vector dimensionality
    min_word_count = 40  # Minimum word count
    num_workers = 4  # Number of threads to run in parallel
    context = 10  # Context window size
    downsampling = 1e-3  # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    print 'Training Word2Vec model...'
    model = Word2Vec(sentences,
                     workers=num_workers,
                     size=num_features,
                     min_count=min_word_count,
                     window=context,
                     sample=downsampling,
                     seed=1)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # It can be helpful to create a meaningful model name and
    # save the model for later use. You can load it later using Word2Vec.load()
    model_name = '300features_40minwords_10context'
    model.save(data_dir + model_name)

    # Sanity-check the trained model; the return values of these calls are not
    # printed here, so run them interactively to inspect the answers
    model.doesnt_match('man woman child kitchen'.split())
    model.doesnt_match('france england germany berlin'.split())
    model.doesnt_match('paris berlin london austria'.split())
    model.most_similar('man')
    model.most_similar('queen')
    model.most_similar('awful')

    # Create average vectors for the training and test sets

    print 'Creating average feature vecs for training reviews'

    train_data_vecs = get_avg_feature_vecs(get_clean_reviews(train), model,
                                           num_features)

    print 'Creating average feature vecs for test reviews'

    test_data_vecs = get_avg_feature_vecs(get_clean_reviews(test), model,
                                          num_features)

    # Fit a random forest to the training set, then make predictions

    # Fit a random forest to the training data, using 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    print 'Fitting a random forest to labeled training data...'
    forest = forest.fit(train_data_vecs, train['sentiment'])

    # Test & extract results
    result = forest.predict(test_data_vecs)

    # Write the test results
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv(data_dir + 'Word2Vec_AverageVectors.csv',
                  index=False,
                  quoting=3)
    print 'Wrote Word2Vec_AverageVectors.csv'
def get_clean_reviews(reviews):
    clean_reviews = []
    for review in reviews['review']:
        clean_reviews.append(
            KaggleUtility.review_to_wordlist(review, remove_stopwords=True))
    return clean_reviews
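
The Word2Vec training examples also call KaggleUtility.review_to_sentences, which is not shown either. A minimal sketch, assuming it follows the Kaggle tutorial: split a review into sentences with the punkt tokenizer and turn each sentence into a word list via review_to_wordlist (sketched earlier).

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Split the review into sentences, then convert each non-empty sentence
    # into a list of words
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences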