# ****** Create clean_train_reviews and clean_test_reviews as we did before
#
import os

import numpy as np
import pandas as pd

from KaggleWord2VecUtility import KaggleWord2VecUtility

# Read data from files
train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
                   header=0, delimiter="\t", quoting=3)

print("Cleaning training reviews")
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(
        KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))

print("Cleaning test reviews")
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(
        KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))

# ****** Create bags of centroids
#
# Pre-allocate an array for the training set bags of centroids (for speed).
# num_clusters is assumed to come from the earlier k-means clustering step.
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")

# Transform the training set reviews into bags of centroids
counter = 0
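# ****** The excerpt stops at counter = 0. Below is a minimal sketch of the
# transformation loop it sets up, assuming a create_bag_of_centroids helper
# and a word_centroid_map dict (word -> cluster index) produced by the
# earlier k-means step over the word vectors.
def create_bag_of_centroids(wordlist, word_centroid_map):
    # The number of clusters is the highest cluster index plus one
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    # Count how many words in the review fall into each cluster
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids

for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
    counter += 1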
def getCleanReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(
            KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))
    return clean_reviews
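# Example usage (hypothetical; assumes the train/test DataFrames loaded by
# the surrounding scripts):
clean_train_reviews = getCleanReviews(train)
clean_test_reviews = getCleanReviews(test)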
test["review"].size, unlabeled_train["review"].size )) # Load the punkt tokenizer tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # ****** Split the labeled and unlabeled training sets into clean sentences # sentences = [] # Initialize an empty list of sentences print("Parsing sentences from training set") for review in train["review"]: sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer) print("Parsing sentences from unlabeled set") for review in unlabeled_train["review"]: sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer) # ****** Set parameters and train the word2vec model # # Import the built-in logging module and configure it so that Word2Vec # creates nice output messages logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\ level=logging.INFO) # Set values for various parameters num_features = 300 # Word vector dimensionality min_word_count = 40 # Minimum word count
from Kaggle_bag_of_words.KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

path = 'D:/dataset/word2vec/'
train = pd.read_csv(path + 'labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
test = pd.read_csv(path + 'testData.tsv', header=0, delimiter="\t", quoting=3)
y = train["sentiment"]

print("Cleaning and parsing movie reviews...\n")
traindata = []
for i in range(0, len(train["review"])):
    traindata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in range(0, len(test["review"])):
    testdata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))

print('vectorizing... ')
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=1, smooth_idf=1,
                      sublinear_tf=1, stop_words='english')

X_all = traindata + testdata
lentrain = len(traindata)

print("fitting pipeline... ")
tfv.fit(X_all)
X_all = tfv.transform(X_all)
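# ****** The script ends after the TF-IDF transform. Below is a minimal
# sketch of the usual next step: splitting the combined matrix back into
# train/test halves and scoring a logistic regression with cross-validated
# ROC AUC. C=30 and cv=20 are illustrative choices, not values from the
# original excerpt.
X = X_all[:lentrain]
X_test = X_all[lentrain:]

model = LogisticRegression(C=30, max_iter=1000)
print("20 Fold CV Score: ",
      np.mean(cross_val_score(model, X, y, cv=20, scoring='roc_auc')))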
input("Press Enter to continue...") print( "Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window..." ) # nltk.download() # Download text data sets, including stop words # Initialize an empty list to hold the clean reviews clean_train_reviews = [] # Loop over each review; create an index i that goes from 0 to the length # of the movie review list print("Cleaning and parsing the training set movie reviews...\n") for i in range(0, len(train["review"])): clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False))) # ****** Create a bag of words from the training set # print("Creating the bag of words...\n") # Initialize the "CountVectorizer" object, which is scikit-learn's # bag of words tool. vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000) # fit_transform() does two functions: First, it fits the model # and learns the vocabulary; second, it transforms our training data # into feature vectors. The data to fit_transform should be a list of # strings. train_data_features = vectorizer.fit_transform(clean_train_reviews)