Ejemplo n.º 1
0
def get_avgfeatures_word2vec(
        data,
        column,
        model,
        num_features=300,
        writeFeaturesFileName="./model/imdb_avgfeatures.pickle"):
    """Return one feature vector per review, using a pickle file as a cache.

    If ``writeFeaturesFileName`` already exists, the object pickled in it is
    loaded and returned immediately; ``data``, ``column`` and ``model`` are
    not used in that case.  Otherwise the reviews are obtained from
    ``read_article.data_to_reviews(data, column)``, each review is converted
    with ``makeAvgVec(review, model)``, and the resulting matrix is pickled
    to ``writeFeaturesFileName`` before being returned.

    Args:
        data: Passed unchanged to ``read_article.data_to_reviews``.
        column: Passed unchanged to ``read_article.data_to_reviews``.
        model: Passed unchanged to ``makeAvgVec``.
        num_features: Number of columns of the preallocated result matrix.
            It is not forwarded to ``makeAvgVec``, so that helper is
            presumably expected to return vectors of this length -- confirm
            against its definition.
        writeFeaturesFileName: Path of the pickle cache file.

    Returns:
        On a cache miss, a ``float32`` numpy array of shape
        ``(len(reviews), num_features)``.  On a cache hit, whatever object
        was stored in the cache file.
    """
    # Cache hit: reuse the previously computed features.
    # Pickle data is binary, so the file must be opened in 'rb' mode; the
    # context manager guarantees the handle is closed.
    # NOTE: unpickling executes arbitrary code -- only point this at cache
    # files this program wrote itself.
    if os.path.isfile(writeFeaturesFileName):
        with open(writeFeaturesFileName, 'rb') as cache_file:
            return cPickle.load(cache_file)

    reviews = read_article.data_to_reviews(data, column)
    total = len(reviews)

    # Preallocate the whole matrix once instead of growing it per review.
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for index, review in enumerate(reviews):
        # Progress message on every 1000th review (including the first).
        if index % 1000 == 0:
            print("Review %d of %d" % (index, total))
        reviewFeatureVecs[index] = makeAvgVec(review, model)

    # Binary mode plus an explicit close make sure the cache is fully
    # flushed to disk and readable again by the branch above.
    with open(writeFeaturesFileName, 'wb') as cache_file:
        cPickle.dump(reviewFeatureVecs, cache_file)
    return reviewFeatureVecs
Ejemplo n.º 2
0
def get_indices_word2vec(data, column, model, maxLength=50, writeIndexFileName="./model/word2vec_indices.pickle",
                         padLeft=True, keep_freqwords=[]):
    """Return one index vector per review, using a pickle file as a cache.

    If ``writeIndexFileName`` already exists, the object pickled in it is
    loaded and returned immediately; the other arguments are not used in
    that case.  Otherwise the reviews are obtained from
    ``read_article.data_to_reviews``, each review is converted with
    ``makeIndexVec``, and the resulting matrix is pickled to
    ``writeIndexFileName`` before being returned.

    Args:
        data: Passed unchanged to ``read_article.data_to_reviews``.
        column: Passed unchanged to ``read_article.data_to_reviews``.
        model: Passed unchanged to ``makeIndexVec``.
        maxLength: Number of columns of the result matrix; also passed to
            ``makeIndexVec``.
        writeIndexFileName: Path of the pickle cache file.
        padLeft: Forwarded to ``makeIndexVec`` as ``padLeft``.
        keep_freqwords: Forwarded to ``read_article.data_to_reviews`` as
            ``keep_freqwords``.  The default list object is shared between
            calls; this function only passes it through.

    Returns:
        On a cache miss, an ``int32`` numpy array of shape
        ``(len(reviews), maxLength)``.  On a cache hit, whatever object was
        stored in the cache file.
    """
    # Cache hit: reuse the previously computed indices.  The context manager
    # closes the handle instead of leaving it to the garbage collector.
    # NOTE: unpickling executes arbitrary code -- only point this at cache
    # files this program wrote itself.
    if os.path.isfile(writeIndexFileName):
        with open(writeIndexFileName, 'rb') as cache_file:
            return pickle.load(cache_file)

    reviews = read_article.data_to_reviews(data, column, keep_freqwords=keep_freqwords)
    total = len(reviews)

    # Preallocate the whole matrix once instead of growing it per review.
    reviewIndexVecs = np.zeros((total, maxLength), dtype="int32")

    for index, review in enumerate(reviews):
        # Progress message on every 1000th review (including the first).
        if index % 1000 == 0:
            print("Review %d of %d" % (index, total))
        # Convert the review into its row of the index matrix.
        reviewIndexVecs[index] = makeIndexVec(review, model, maxLength, padLeft=padLeft)

    # Closing the file explicitly guarantees the cache is flushed to disk
    # before the function returns.
    with open(writeIndexFileName, 'wb') as cache_file:
        pickle.dump(reviewIndexVecs, cache_file)
    return reviewIndexVecs