# TF-IDF feature-extraction script (duplicate of the full version later in
# this file). This copy was truncated mid-call in the original; the
# TfidfVectorizer arguments are completed to match that full version.
import pandas as pd
import numpy as np

# Windows-local dataset directory — TODO confirm this path on the target machine.
path = 'D:/dataset/word2vec/'

# quoting=3 (csv.QUOTE_NONE): the raw reviews contain embedded quote characters.
train = pd.read_csv(path + 'labeledTrainData.tsv', header=0,
                    delimiter="\t", quoting=3)
test = pd.read_csv(path + 'testData.tsv', header=0,
                   delimiter="\t", quoting=3)

# Binary sentiment labels for the training set.
y = train["sentiment"]

print("Cleaning and parsing movie reviews...\n")
# Second argument False -> keep stop words here; the vectorizer below
# drops English stop words itself.
traindata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(review, False))
             for review in train["review"]]
testdata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(review, False))
            for review in test["review"]]

print('vectorizing... ')
# Word 1- and 2-grams, sublinear TF scaling, smoothed IDF; min_df=3 prunes
# very rare terms. (Arguments completed from the duplicate script below.)
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=1, smooth_idf=1,
                      sublinear_tf=1, stop_words='english')
# Create clean_train_reviews and clean_test_reviews as we did before
#
# Read data from files. Paths are resolved relative to this script's
# directory so the current working directory does not matter.
# quoting=3 (csv.QUOTE_NONE): the raw reviews contain embedded quotes.
train = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
    header=0, delimiter="\t", quoting=3
)
test = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
    header=0, delimiter="\t", quoting=3
)

print("Cleaning training reviews")
# remove_stopwords=True: stop words carry no signal for centroid features.
clean_train_reviews = [
    KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]

print("Cleaning test reviews")
clean_test_reviews = [
    KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]

# ****** Create bags of centroids
#
# Pre-allocate an array for the training set bags of centroids (for speed).
# num_clusters is defined elsewhere in this file (k-means step) — TODO confirm.
train_centroids = np.zeros((train["review"].size, num_clusters),
                           dtype="float32")

# Transform the training set reviews into bags of centroids
# Read data from files. Paths are built relative to this script's location
# so the script runs regardless of the current working directory.
# quoting=3 (csv.QUOTE_NONE): the raw reviews contain embedded quotes.
train = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
    header=0, delimiter="\t", quoting=3
)
test = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
    header=0, delimiter="\t", quoting=3
)

print("Cleaning training reviews")
# remove_stopwords=True: stop words carry no signal for centroid features.
clean_train_reviews = [
    KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]

print("Cleaning test reviews")
clean_test_reviews = [
    KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]

# ****** Create bags of centroids
#
# Pre-allocate an array for the training set bags of centroids (for speed).
# num_clusters is defined elsewhere in this file (k-means step) — TODO confirm.
train_centroids = np.zeros((train["review"].size, num_clusters),
                           dtype="float32")

# Transform the training set reviews into bags of centroids
counter = 0
def getCleanReviews(reviews):
    """Return the cleaned word list for every review in *reviews*.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must contain a "review" column of raw review strings.

    Returns
    -------
    list
        One entry per review, each the result of
        ``KaggleWord2VecUtility.review_to_wordlist`` with stop words removed.
    """
    # Comprehension instead of a manual append loop (same order, same calls).
    return [
        KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
        for review in reviews["review"]
    ]
# TF-IDF + logistic-regression feature-extraction script.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; alias
# model_selection under the old name so any later `cross_validation.` uses
# in this file keep working unchanged.
from sklearn import model_selection as cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

from Kaggle_bag_of_words.KaggleWord2VecUtility import KaggleWord2VecUtility

# Windows-local dataset directory — TODO confirm this path on the target machine.
path = 'D:/dataset/word2vec/'

# quoting=3 (csv.QUOTE_NONE): the raw reviews contain embedded quotes.
train = pd.read_csv(path + 'labeledTrainData.tsv', header=0,
                    delimiter="\t", quoting=3)
test = pd.read_csv(path + 'testData.tsv', header=0,
                   delimiter="\t", quoting=3)

# Binary sentiment labels for the training set.
y = train["sentiment"]

print("Cleaning and parsing movie reviews...\n")
# Second argument False -> keep stop words here; the vectorizer below
# drops English stop words itself.
traindata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(review, False))
             for review in train["review"]]
testdata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(review, False))
            for review in test["review"]]

print('vectorizing... ')
# Word 1- and 2-grams, sublinear TF scaling, smoothed IDF; min_df=3 prunes
# very rare terms.
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=1, smooth_idf=1,
                      sublinear_tf=1, stop_words='english')

X_all = traindata + testdata
lentrain = len(traindata)

print("fitting pipeline... ")
# Fit on train+test combined so the vocabulary/IDF covers both splits
# (common Kaggle trick; note it leaks test-set term statistics into features).
tfv.fit(X_all)
X_all = tfv.transform(X_all)
input("Press Enter to continue...")
print( "Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window..." )
# nltk.download()  # Download text data sets, including stop words

# Clean every training review. Second argument False -> keep stop words;
# the CountVectorizer below could drop them itself if desired.
print("Cleaning and parsing the training set movie reviews...\n")
clean_train_reviews = [
    " ".join(KaggleWord2VecUtility.review_to_wordlist(review, False))
    for review in train["review"]
]

# ****** Create a bag of words from the training set
print("Creating the bag of words...\n")

# Initialize the "CountVectorizer" object, scikit-learn's bag-of-words tool;
# cap the vocabulary at the 5000 most frequent words.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

# fit_transform() does two things: it learns the vocabulary, then transforms
# the training data into feature vectors. Its input must be a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)