    # Create clean_train_reviews and clean_test_reviews as we did before
    # (this fragment assumes os, pandas as pd, numpy as np and
    # KaggleWord2VecUtility have already been imported, as in the later examples)
    #

    # Read data from files
    train = pd.read_csv( os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3 )
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3 )


    print("Cleaning training reviews")
    clean_train_reviews = []
    for review in train["review"]:
        clean_train_reviews.append(
            KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))

    print("Cleaning test reviews")
    clean_test_reviews = []
    for review in test["review"]:
        clean_test_reviews.append(
            KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))


    # ****** Create bags of centroids
    #
    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros((train["review"].size, num_clusters),
                               dtype="float32")

    # Transform the training set reviews into bags of centroids
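    # The example is cut off at this point. In the original Kaggle tutorial the
    # comment above is followed by a loop built on a create_bag_of_centroids()
    # helper; the sketch below assumes that helper plus the word_centroid_map
    # and num_clusters produced by the earlier KMeans clustering step.
    counter = 0
    for review in clean_train_reviews:
        train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1

    # Repeat for the test set reviews
    test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")
    counter = 0
    for review in clean_test_reviews:
        test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
        counter += 1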
Example #4
def getCleanReviews(reviews):
    """Return every review in the DataFrame as a word list with stop words removed."""
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))
    return clean_reviews
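# Typical usage in the original tutorial pairs this helper with an averaging
# step (a sketch; getAvgFeatureVecs, model and num_features are assumptions
# taken from the word-vector-averaging example, not shown here):
# trainDataVecs = getAvgFeatureVecs(getCleanReviews(train), model, num_features)
# testDataVecs = getAvgFeatureVecs(getCleanReviews(test), model, num_features)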
Example #5
     test["review"].size, unlabeled_train["review"].size ))



    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')



    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

    print("Parsing sentences from training set")
    for review in train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    print("Parsing sentences from unlabeled set")
    for review in unlabeled_train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    # ****** Set parameters and train the word2vec model
    #
    # Import the built-in logging module and configure it so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
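    # The example is truncated here. The remaining parameters and the training
    # call in the original tutorial look roughly like the sketch below (the
    # values and the save name are assumptions; it also assumes
    # "from gensim.models import word2vec" earlier in the script, and newer
    # gensim releases use vector_size instead of size).
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    print("Training Word2Vec model...")
    model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                              min_count=min_word_count, window=context,
                              sample=downsampling)

    # Freeze the model (no further training planned) to make it memory-efficient
    model.init_sims(replace=True)
    model.save("300features_40minwords_10context")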
from Kaggle_bag_of_words.KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed; use model_selection
import pandas as pd
import numpy as np

path = 'D:/dataset/word2vec/'
train = pd.read_csv(path+'labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
test = pd.read_csv(path+'testData.tsv', header=0, delimiter="\t", quoting=3 )
y = train["sentiment"]

print("Cleaning and parsing movie reviews...\n")
traindata = []
for i in range(0, len(train["review"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in range(0, len(test["review"])):
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))

print('vectorizing... ')
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word',
                      token_pattern=r'\w{1,}', ngram_range=(1, 2),
                      use_idf=True, smooth_idf=True, sublinear_tf=True,
                      stop_words='english')
X_all = traindata + testdata
lentrain = len(traindata)

print("fitting pipeline... ")
tfv.fit(X_all)
X_all = tfv.transform(X_all)
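# The example stops after vectorizing. The usual continuation splits the TF-IDF
# matrix back into train and test rows and cross-validates a logistic regression
# (a sketch; the C value, fold count, scoring choice and output filename are
# assumptions, not part of the original snippet).
X = X_all[:lentrain]
X_test = X_all[lentrain:]

model = LogisticRegression(C=1.0)
print("20 Fold CV Score:",
      np.mean(cross_val_score(model, X, y, cv=20, scoring='roc_auc')))

model.fit(X, y)
result = model.predict_proba(X_test)[:, 1]
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv('tfidf_logistic_submission.csv', index=False, quoting=3)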
    input("Press Enter to continue...")

    print(
        "Downloading text data sets. If you already have the NLTK data sets, "
        "just close the download window..."
    )
    # nltk.download()  # Download text data sets, including stop words

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list

    print("Cleaning and parsing the training set movie reviews...\n")
    for i in range(0, len(train["review"])):
        clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))

    # ****** Create a bag of words from the training set
    #
    print("Creating the bag of words...\n")

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)

    # fit_transform() does two things: first, it fits the model and learns the
    # vocabulary; second, it transforms our training data into feature vectors.
    # The input to fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
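    # The example is truncated here. The original bag-of-words tutorial goes on
    # to densify the feature matrix and fit a random forest on it (a sketch;
    # the RandomForestClassifier import and n_estimators=100 mirror the
    # tutorial but should be treated as assumptions here).
    from sklearn.ensemble import RandomForestClassifier

    train_data_features = train_data_features.toarray()

    print("Training the random forest...")
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_features, train["sentiment"])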