def bag_of_words():
    train = pd.read_csv("train_dummy.csv", delimiter=',')
    num_reviews = train["review"].size

    # Clean and parse the training set movie reviews
    clean_train_reviews = []
    for i in xrange(0, num_reviews):
        clean_train_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))

    # Create the bag of words (TF-IDF weighted)
    #vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, min_df=1, max_features=500)
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    train_data_features = train_data_features.toarray()

    # Standardize features before feeding them to the MLP
    scaler = StandardScaler()
    scaler.fit(train_data_features)
    train_data_features = scaler.transform(train_data_features)

    clf = MLPClassifier(activation='logistic', hidden_layer_sizes=(500,))
    clf = clf.fit(train_data_features, train["rating"])

    # Clean and parse the test set movie reviews
    clean_test_reviews = []
    test = pd.read_csv("test_dummy.csv", delimiter=',')
    num_reviews1 = test["review"].size
    for i in xrange(0, num_reviews1):
        clean_test_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    test_data_features = scaler.transform(test_data_features)

    # Predict test labels and report metrics
    result = clf.predict(test_data_features)
    check = test['rating'].values
    print accuracy_score(check, result)
    print confusion_matrix(check, result)
    print classification_report(check, result)
def getCleanReviews(reviews, skip=0, limit=0, dispose_percent=(0, 0)):
    clean_reviews = []
    # limit == 0 means "no upper bound": take every review from `skip` onwards
    if limit == 0:
        for review in reviews["review"][skip:]:
            clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(
                review, remove_stopwords=True, dispose_percent=dispose_percent))
    else:
        for review in reviews["review"][skip:limit]:
            clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(
                review, remove_stopwords=True, dispose_percent=dispose_percent))
    return clean_reviews
def getCleanReviews(reviews, useSmall=None, remove_stopwords=True):
    clean_reviews = []
    # useSmall, when set, caps the number of reviews that are cleaned
    if useSmall:
        for review in reviews["review"][0:useSmall]:
            clean_reviews.append(
                KaggleWord2VecUtility.review_to_wordlist(
                    review, remove_stopwords=remove_stopwords))
        return clean_reviews
    else:
        for review in reviews["review"]:
            clean_reviews.append(
                KaggleWord2VecUtility.review_to_wordlist(
                    review, remove_stopwords=remove_stopwords))
        return clean_reviews
def clean_text_field(dataset, fieldname, translate):
    cleaned_field = []
    for i in range(0, len(dataset[fieldname])):
        if translate:
            # Translate Brazilian Portuguese text to English before cleaning
            blob = TextBlob(dataset[fieldname][i]).translate(from_lang='pt-br', to="en")
            cleaned_field.append(" ".join(
                KaggleWord2VecUtility.review_to_wordlist(str(blob), True)))
        else:
            cleaned_field.append(" ".join(
                KaggleWord2VecUtility.review_to_wordlist(dataset[fieldname][i], True)))
    return cleaned_field
def main():
    start_time = datetime.now()
    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
                     header=0, delimiter="\t", quoting=3)
    # test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3)

    # Hold out 40% of the labeled data as a test split
    train, test, y, y_test = cross_validation.train_test_split(
        df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)

    print "Cleaning and parsing movie reviews...\n"
    traindata = []
    for i in xrange(0, len(train)):
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], False)))
    testdata = []
    for i in xrange(0, len(test)):
        testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], False)))

    print 'vectorizing... ',
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                          analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
                          use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

    # Fit the vectorizer on train + test together so both share one vocabulary
    X_all = traindata + testdata
    lentrain = len(traindata)

    print "fitting pipeline... ",
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)
    X = X_all[:lentrain]
    X_test = X_all[lentrain:]

    model = LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1,
                               fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None)
    print "10 Fold CV Score: ", np.mean(cross_validation.cross_val_score(
        model, X, y, cv=10, scoring='roc_auc'))

    print "Retrain on all training data, predicting test labels...\n"
    model.fit(X, y)
    # result = model.predict_proba(X_test)[:,1]  # predict as probability
    result = model.predict(X_test)

    # Copy the results to a pandas dataframe and count hits and misses
    output = pd.DataFrame(data={"sentiment": y_test, "predict_sentiment": result})
    output['succeed'] = output['sentiment'] == output['predict_sentiment']
    groupby = output.groupby('succeed')
    print 'Result Evaluation'
    print groupby['sentiment'].agg(['count'])

    # Use pandas to write the comma-separated output file
    output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model_linear.csv'),
                  index=False, quoting=3)
    print "Wrote results to Bag_of_Words_model_linear.csv"
    print datetime.now() - start_time
def bag_of_words():
    train = pd.read_csv("train_dummy.csv", delimiter=',')
    num_reviews = train["review"].size

    print "Cleaning and parsing the training set movie reviews...\n"
    clean_train_reviews = []
    for i in xrange(0, num_reviews):
        clean_train_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))

    print "Creating the bag of words...\n"
    #vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, min_df=1)
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    train_data_features = train_data_features.toarray()

    clf = DecisionTreeClassifier(criterion="gini", splitter="best")
    clf = clf.fit(train_data_features, train["rating"])

    clean_test_reviews = []
    test = pd.read_csv("test_dummy.csv", delimiter=',')
    num_reviews1 = test["review"].size
    print "Cleaning and parsing the test set movie reviews...\n"
    for i in xrange(0, num_reviews1):
        clean_test_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    # Export the fitted tree for inspection with Graphviz
    tree.export_graphviz(clf, out_file='tree.dot')

    print "Predicting test labels ...\n"
    result = clf.predict(test_data_features)
    check = test['rating'].values
    print "The accuracy score is"
    print accuracy_score(check, result)
def generate_word2vec(model_name, dataset_list):
    sentences = []
    nltk.download()  # interactive download of NLTK data (punkt, stopwords, ...)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for i, dataset in enumerate(dataset_list):
        print("Parsing sentences from dataset " + str(i + 1))
        for review in dataset["review"]:
            sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    print("Training Word2Vec model...")
    model = Word2Vec(sentences, workers=NUM_WORKERS,
                     size=NUM_FEATURES, min_count=MIN_WORD_COUNT,
                     window=CONTEXT, sample=DOWNSAMPLING, seed=1)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # You can load the model later using Word2Vec.load()
    model.save(model_name)
    return model
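# A minimal usage sketch (added; not from the original source): reload the model
# saved by generate_word2vec() and query it with the pre-4.0 gensim API that the
# snippet above already uses. The file name here is a hypothetical example.
from gensim.models import Word2Vec

w2v = Word2Vec.load("imdb_word2vec.model")  # hypothetical model_name
print(w2v.most_similar("movie", topn=5))    # nearest neighbours in vector space
print(w2v.doesnt_match("france england germany berlin".split()))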
def tokenize(sentence, grams):
    words = KaggleWord2VecUtility.review_to_wordlist(sentence)
    tokens = []
    # For each requested n-gram size, emit every contiguous run of n words
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i:i + gram])]
    return tokens
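# Illustrative call of tokenize() (added): with grams=[1, 2] each review yields
# its unigrams plus "_*_"-joined bigrams. The output shown assumes
# review_to_wordlist lowercases and strips punctuation, as in the Kaggle utility.
tokens = tokenize("This movie was great!", grams=[1, 2])
# -> ['this', 'movie', 'was', 'great', 'this_*_movie', 'movie_*_was', 'was_*_great']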
def load_data_and_labels_kaggle2(test_data_file):
    """
    Loads MR polarity data from files, splits the data into words
    and generates labels. Returns split sentences and labels.
    """
    y = []
    # Load data from file
    test_examples = pd.read_csv(test_data_file, header=0, delimiter='\t', quoting=3)

    # Generate binary labels: 1 = positive, 0 = negative
    for x in test_examples["Sentiment"]:
        if x == 1:
            test_labels = [1]
        else:
            test_labels = [0]
        y = np.concatenate([y, test_labels], 0)
    print(y)
    print(y.shape)
    print("sentiment complete")

    # Preprocessing
    sentences = []
    for review in test_examples["review"]:
        tmpstr = KaggleWord2VecUtility.review_to_corpus(review, remove_stopwords=False)
        sentences.append(tmpstr)
    print("preprocessing complete")
    return [sentences, y]
def getCleanReviews(reviews):
    # Clean reviews in parallel, keeping the tag attached to each word
    clean_reviews = KaggleWord2VecUtility.apply_by_multiprocessing(
        reviews["review"], KaggleWord2VecUtility.review_to_wordlist_with_tag,
        workers=4)
    return clean_reviews
def generate_doc2vec(model_name, dataset_list, NUM_FEATURES=100, CONTEXT=5):
    print(model_name)
    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # ****** Split the training sets into clean docs
    # Flatten each list of sentences from a review into a single list of words
    docs = []  # Initialize an empty list of docs
    for i, dataset in enumerate(dataset_list):
        print("Parsing sentences from dataset " + str(i + 1))
        for review in dataset["review"]:
            sentences = KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
            docs.append([word for sentence in sentences for word in sentence])

    # Initialize and train the model
    print("Training Doc2Vec model...")
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
    model = Doc2Vec(documents, workers=NUM_WORKERS,
                    vector_size=NUM_FEATURES, min_count=MIN_WORD_COUNT,
                    window=CONTEXT, seed=1)
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    # You can load the model later using Doc2Vec.load()
    print(model_name)
    model.save(model_name)
    return model
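# A minimal usage sketch (added; not from the original source): reload the saved
# Doc2Vec model and infer a vector for an unseen, pre-tokenized document. The
# file name is a hypothetical example.
from gensim.models import Doc2Vec

d2v = Doc2Vec.load("imdb_doc2vec.model")  # hypothetical model_name
vec = d2v.infer_vector(["an", "unseen", "review", "split", "into", "words"])
print(vec.shape)  # (NUM_FEATURES,)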
def create_task():
    if not request.json or 'id' not in request.json:
        abort(400)
    task = {
        'id': request.json['id'],
        'text': request.json['text'],
    }
    clean_test_descripciones = []
    app.logger.info('petition_classification: ' + task['text'])
    features = review_words(task['text'])
    clean_test_descripciones.append(u" ".join(
        KaggleWord2VecUtility.review_to_wordlist(features, True)))

    # Uses chord to run two jobs and a callback after processing ends
    # 1) A text classifier
    # 2) A profanity filter
    # 3) A callback to put all together in a JSON
    callback = update_remote_petition.subtask()
    chord([
        evaluate_petition.s(task['id'], clean_test_descripciones),
        catch_bad_words_in_text.s(task['text'])
    ])(callback)

    return jsonify({'id': request.json['id'], 'text': request.json['text']}), 201
def getCleanReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(
            KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))
    return clean_reviews
def classify(sentiment_str):
    # Clean the input and wrap it in a list, since the vectorizer expects
    # an iterable of documents
    sentiment_str = [
        " ".join(KaggleWord2VecUtility.review_to_wordlist(sentiment_str, True))
    ]
    sentiment_str = vectorizer.transform(sentiment_str)
    return forest.predict(sentiment_str)
def predict_review(self, review, dispose_percent=(0, 0)):
    tok_sents = KaggleWord2VecUtility.review_to_sentences(
        review, self.tokenizer, case_sensitive=True, dispose_percent=dispose_percent)
    num_sarcastic = 0
    num_regular = 0
    for sent in tok_sents:
        prediction = self.predict_sentence(sent)
        # Count a sentence only if the classifier is confident enough either way
        for i in range(2):
            if prediction[i] > sarcasm_confidence:
                if self.classifier.classes_[i] == 'ironic':
                    num_sarcastic += 1
                else:
                    num_regular += 1
    if num_regular == 0 and num_sarcastic == 0:
        return 'regular'
    # The review is classified sarcastic only when sarcastic sentences
    # outnumber regular ones by a factor of sarcasm_thres.
    if num_sarcastic > num_regular * sarcasm_thres:
        return 'ironic'
    return 'regular'
def getCleanReviews(reviews): """ Use multiple workers (multi threads) """ clean_reviews = [] clean_reviews = KaggleWord2VecUtility.apply_by_multiprocessing( reviews["review"], KaggleWord2VecUtility.review_to_wordlist, workers=4) return clean_reviews
def getCleanDescriptions(descriptions):
    clean_descriptions = []
    local_counter = 0
    for description in descriptions["description"]:
        clean_descriptions.append(
            KaggleWord2VecUtility.review_to_wordlist(
                description, remove_stopwords=True))
        local_counter = local_counter + 1
        print('Adding line : ' + str(local_counter))
    return clean_descriptions
def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, True))

    # Pair each cleaned review with its id so Doc2Vec can tag the documents
    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(LabeledSentence(clean_reviews[i], [id_label]))
    return labelized
def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review))

    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(LabeledSentence(clean_reviews[i], [id_label]))
    return labelized
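# Hedged usage sketch (added): the LabeledSentence objects built above can be fed
# directly to gensim's Doc2Vec (old-style API with `size`); the hyperparameters
# are example values and `train` is assumed to be a DataFrame with "review" and
# "id" columns.
from gensim.models import Doc2Vec

labelized = getCleanLabeledReviews(train)
d2v = Doc2Vec(labelized, size=100, window=5, min_count=2, workers=4)
print(d2v.docvecs[labelized[0].tags[0]])  # vector for the first review's id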
def process_data_extend(is_char=False):
    # Load the training and test sets
    x_train, y_train = get_data(train_path, convert_label=False)
    x_test, y_test = get_data(test_path, convert_label=False)

    # Convert each text into a sentence: a whitespace-joined list of words
    train_sentences = []  # Initialize an empty list of sentences
    test_sentences = []   # Initialize an empty list of sentences
    for news in x_train:
        if len(news) > 0:
            train_sentences.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(
                news, is_char=is_char, remove_stopwords=True)))
    for news in x_test:
        if len(news) > 0:
            test_sentences.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(
                news, is_char=is_char, remove_stopwords=True)))

    # Expand the short texts with similar words
    train_extend = get_similar_words(train_sentences)
    test_extend = get_similar_words(test_sentences)

    # Convert the texts to padded integer-index sequences
    all_text = train_sentences
    all_text.extend(test_sentences)
    sequences, word_index = gen_word_index(all_text, is_char=is_char)
    data = sequence.pad_sequences(sequences, maxlen=maxlen)
    X_train = data[:train_size]
    X_test = data[train_size:]

    all_text_extend = train_extend
    all_text_extend.extend(test_extend)
    sequences, word_index_extend = gen_word_index(all_text_extend, is_char=is_char)
    data_extend = sequence.pad_sequences(sequences, maxlen=maxlen)
    X_train_extend = data_extend[:train_size]
    X_test_extend = data_extend[train_size:]

    # Convert the labels to numpy arrays, then one-hot encode them
    Y_train = np.array(y_train)
    Y_test = np.array(y_test)
    encoder = LabelBinarizer().fit(Y_train)
    Y_train = encoder.transform(Y_train)
    Y_test = encoder.transform(Y_test)

    return X_train, X_train_extend, Y_train, X_test, X_test_extend, Y_test, word_index, word_index_extend
def get_words(reviews):
    """
    Gets list of relevant words per review using Kaggle's Word2VecUtility
    https://github.com/danielfrg/kaggle-word2vec/blob/master/DeepLearningMovies/KaggleWord2VecUtility.py

    :param reviews: list of reviews which should be transformed to words
    :return: list of whitespace-joined word strings, one per review
    """
    words = []
    for review in reviews:
        # Join with a space so individual words stay separated
        words.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(review)))
    return words
def bag_of_words():
    train = pd.read_csv("train_dummy.csv", delimiter=',')
    num_reviews = train["review"].size

    # Clean and parse the training set movie reviews
    clean_train_reviews = []
    for i in xrange(0, num_reviews):
        clean_train_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))

    vectorizer = TfidfVectorizer(max_features=2500, min_df=4)
    train_data_features = vectorizer.fit_transform(clean_train_reviews).todense()
    print train_data_features.shape

    train_output = train['rating']
    clf = SVC(C=100.0, kernel='sigmoid', cache_size=1000, gamma=1.0)
    clf = clf.fit(train_data_features, train_output)  # fixed: was undefined `train_input`

    clean_test_reviews = []
    test = pd.read_csv("test_dummy.csv", delimiter=',')
    num_reviews1 = test["review"].size
    for i in xrange(0, num_reviews1):
        clean_test_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

    test_data_features = vectorizer.transform(clean_test_reviews).todense()
    result = clf.predict(test_data_features)
    check = test['rating']
    print check
    print accuracy_score(check, result)
    print confusion_matrix(check, result)
    print classification_report(check, result)
def perform_preprocess_on_review_data_of_train(review_data_in_train):
    sentences = []
    for review in review_data_in_train:
        processed = KaggleWord2VecUtility.review_to_sentences(
            review, remove_stopwords=False)
        # `processed` is a list of tokenized (stemmed) sentences, e.g.
        # [['with', 'all', 'this', 'stuff', 'go', 'down', 'at', 'the', 'moment', ...], ...]
        sentences += processed
    return sentences
def weight_file_processing(all):
    traindata = []
    for i in range(0, len(all["text"])):
        traindata.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(all["text"][i], True)))

    # Count word frequencies across the whole corpus
    all_wordlist = " ".join(traindata).split()
    counts = dict()
    for i in all_wordlist:
        counts[i] = counts.get(i, 0) + 1

    # Write the vocabulary sorted by descending frequency; `with` ensures
    # the file is closed
    sorted_x = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    with open("data/reuters_vocab.txt", "ab") as weight_file_build:
        for each in sorted_x:
            weight_file_build.write(each[0] + " " + str(each[1]) + " " + "\n")
def create_classifier(subreddits, X_train, y_train, num_features):
    # nltk.download()  # Download text data sets, including stop words
    clean_train_set = []
    print "Cleaning and parsing the training set...\n"
    for i in xrange(0, len(X_train)):
        clean_train_set.append(KaggleWord2VecUtility.review_to_wordlist(X_train[i], True))

    # ****** Set parameters and train the word2vec model
    #
    # Import the built-in logging module and configure it so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Set values for various parameters
    # num_features, min_word_count, num_workers = options
    # num_features = 300   # Word vector dimensionality
    min_word_count = 40    # Minimum word count
    num_workers = 4        # Number of threads to run in parallel
    context = 10           # Context window size
    downsampling = 1e-3    # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    print "Training Word2Vec model..."
    model = Word2Vec(clean_train_set, workers=num_workers,
                     size=num_features, min_count=min_word_count,
                     window=context, sample=downsampling, seed=1)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # It can be helpful to create a meaningful model name and
    # save the model for later use. You can load it later using Word2Vec.load()
    model_name = "300features_40minwords_10context"
    model.save(model_name)

    print "Creating average feature vecs for training comments"
    train_data_features = getAvgFeatureVecs(clean_train_set, model, num_features)

    print "Training the model (this may take a while)..."
    classifier = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    classifier = classifier.fit(train_data_features, y_train)
    return (model, classifier)
def getLabeledSentences(reviews, prefix, skip=0, limit=0, dispose_percent=0):
    labels = []
    index = 0
    # limit == 0 means "no upper bound": label every review from `skip` onwards
    if limit == 0:
        for review in reviews["review"][skip:]:
            index += 1
            labels.append(LabeledSentence(
                KaggleWord2VecUtility.review_to_wordlist(
                    review, remove_stopwords=False, dispose_percent=dispose_percent),
                [prefix + str(index)]))
    else:
        for review in reviews["review"][skip:limit]:
            index += 1
            labels.append(LabeledSentence(
                KaggleWord2VecUtility.review_to_wordlist(
                    review, remove_stopwords=False, dispose_percent=dispose_percent),
                [prefix + str(index)]))
    return labels
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words.
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(
                KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
def test_classifier(model, classifier, subreddits, X_test, y_test, num_features):
    # Create an empty list and append the clean reviews one by one
    clean_test_set = []
    print "Cleaning and parsing the test set ...\n"
    for i in xrange(0, len(X_test)):
        clean_test_set.append(KaggleWord2VecUtility.review_to_wordlist(X_test[i], True))

    # Average the word vectors of each review into one feature vector
    test_data_features = getAvgFeatureVecs(clean_test_set, model, num_features)

    # Use the trained classifier to make label predictions
    print "Predicting test labels...\n"
    predicted = classifier.predict(test_data_features)
    print metrics.accuracy_score(y_test, predicted)
    # print metrics.confusion_matrix(y_test, predicted)
    print metrics.classification_report(y_test, predicted)
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words.
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(
                KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
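# Usage sketch for review_to_sentences() (added): assumes NLTK's punkt models
# are already downloaded and that review_to_wordlist lowercases and strips
# punctuation, as in the Kaggle utility.
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = review_to_sentences("Great film. Terrible ending.", tokenizer)
# -> [['great', 'film'], ['terrible', 'ending']]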
def predict(utterance):
    clean = []
    clean.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(utterance, True)))
    features = vectorizer.transform(clean)

    # Use the trained model to make a sentiment label prediction
    result = model.predict(features)
    return result[0]
def predict_sentiment(test):
    clean_test_reviews = []
    reviewstext = []
    with open('scraped.csv', 'rt') as csvfile1:
        reader = csv.reader(csvfile1)
        headers = next(reader, None)  # skip the header row
        for row in reader:
            reviewstext.append(row[1])
            clean_test_reviews.append(" ".join(
                KaggleWord2VecUtility.review_to_wordlist(
                    (row[1]).decode('utf-8', 'ignore'), True)))

    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()
    result = forest.predict(test_data_features)
    return {"review": reviewstext, "sentiment": result}
test_pkl = 'shortened_' + test_pkl
try:
    # Open pickle files in binary mode
    traindata = pickle.load(open(os.path.join(base_path, 'data', train_pkl), 'rb'))
    testdata = pickle.load(open(os.path.join(base_path, 'data', test_pkl), 'rb'))
except IOError as e:
    if e.errno != errno.ENOENT:
        raise e
    else:
        _logger.info('cleaning and parsing movie reviews')
        traindata = []
        for i in xrange(0, len(train["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)
            if SHORT_REVIEW:
                review = review[:4]  # keep only the first four words
            traindata.append(' '.join(review))
        testdata = []
        for i in xrange(0, len(test["review"])):
            review = KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)
            if SHORT_REVIEW:
                review = review[:4]
            testdata.append(' '.join(review))
        pickle.dump(traindata, open(os.path.join(base_path, 'data', train_pkl), 'wb'))
        pickle.dump(testdata, open(os.path.join(base_path, 'data', test_pkl), 'wb'))
# Rows whose column 11 is -1 form the test set; the rest are training data
train_i, test_i = data.ix[:, 11] != -1, data.ix[:, 11] == -1
train = data.ix[train_i]
test = data.ix[test_i]
#train_i, valid_i = train_test_split(np.arange(len(train)), train_size=0.8, random_state=88)
#train = train.ix[train_i]
#validation = train.ix[valid_i]

print "Parsing train job titles..."
clean_train_reviews = []
for title in train['abstract']:
    clean_train_reviews.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(title, remove_stopwords=False)))

print "Parsing test reviews..."
clean_test_reviews = []
for title in test['abstract']:
    clean_test_reviews.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(title, remove_stopwords=False)))

#print "Parsing validation reviews..."
#clean_valid_reviews = []
#for title in validation['title']:
#    clean_valid_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(title, remove_stopwords=False)))
def clean_review_function(review):
    global master_word_dict, number_of_rows
    list_of_words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=False)
    return ' '.join(list_of_words)
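# Hedged usage sketch (added): clean_review_function is shaped for element-wise
# use, e.g. over a pandas column; the DataFrame and column name here are
# assumptions for illustration.
df['clean_review'] = df['review'].apply(clean_review_function)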
    if x in tag:
        Y.append(tag[x])

#for i in O_test:
#    test.append(O_test[i][0])

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences = []  # Initialize an empty list of sentences
#print("Parsing sentences from training set")
for review in train:
    sentences += KaggleWord2VecUtility.review_to_sentences(review.encode('utf-8'), tokenizer)
#for review in test:
#    sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

# ****** Set parameters and train the word2vec model
#
# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
# Initialize empty lists to hold the clean reviews and their tag labels
traindata = []
testdata = []
Y1 = []
Y2 = []

# Loop over each review and collect its cleaned text plus its known tags
for i in train:
    buf = []
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i][0], True)))
    for j in train[i][3].split():
        if j in tag_dic:
            buf.append(tag_dic[j])
    Y1.append(buf)

for i in test:
    buf = []
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i][0], True)))
    for j in test[i][3].split():
        if j in tag_dic:
            buf.append(tag_dic[j])
    Y2.append(buf)

# ****** Create a bag of words from the training set
from sklearn import cross_validation

model = Word2Vec.load_word2vec_format(constants.GOOGLE_WORD2VEC, binary=True)

train = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'fixtures', 'labeledTrainData.tsv'),
                    header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'fixtures', 'testData.tsv'),
                   header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
y = train["sentiment"]

print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
vectorizer = CountVectorizer(min_df=4)
vectorizer.fit(X_all)

start = time.time()

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
model = None
if os.path.isfile(classifier_filename):
    model = Word2Vec.load(classifier_filename)
else:
    unlabeled_train = pd.read_csv('data/unlabeledTrainData.tsv', header=0,
                                  delimiter="\t", quoting=3)

    # Verify the number of reviews that were read (100,000 in total)
    print ("Read {0} labeled train reviews, {1} test reviews, and {2} unlabeled reviews\n".format(
        len(train["review"][local_test_size:]), test["review"].size, unlabeled_train["review"].size))

    # Used for tokenizing paragraphs into individual sentences
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    sentences = []
    print ("Parsing sentences from training set")
    for review in train["review"][local_test_size:]:
        sentences += KaggleWord2VecUtility.review_to_sentences(
            review, sent_tokenizer, dispose_percent=percent_disposal)
    print ("Parsing sentences from unlabeled set")
    for review in unlabeled_train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(
            review, sent_tokenizer, dispose_percent=percent_disposal)
    del unlabeled_train

    #===================================================================
    # Train the model
    #===================================================================
    # Initialize and train the model (this will take some time)
    print ("Training Word2Vec model...")
    model = Word2Vec(sentences, workers=num_workers, size=num_features,
                     min_count=min_word_count, window=context,
                     sample=downsampling, seed=1)
    # Unloads everything from memory not related to querying the model
    model.init_sims(replace=True)
# # Return the "bag of centroids" return bag_of_centroids if __name__ == '__main__': train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, \ delimiter="\t", quoting=3) test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", \ quoting=3 ) unlabeled_train = pd.read_csv( os.path.join(os.path.dirname(__file__), 'data', "unlabeledTrainData.tsv"), header=0, delimiter="\t", quoting=3 ) tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') sentences = [] # Initialize an empty list of sentences print "Parsing sentences from training set" for review in train["review"]: sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer) print "Parsing sentences from unlabeled set" for review in unlabeled_train["review"]: sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer) # ****** Set parameters and train the word2vec model # # Import the built-in logging module and configure it so that Word2Vec # creates nice output messages logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\ level=logging.INFO) # Set values for various parameters num_features = 300 # Word vector dimensionality min_word_count = 40 # Minimum word count
def clean_review_function(review):
    list_of_sentences = KaggleWord2VecUtility.review_to_sentences(
        review, tokenizer, remove_stopwords=False)
    return list_of_sentences
import os

from KaggleWord2VecUtility import KaggleWord2VecUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'labeledTrainData.tsv'),
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), '../', 'data', 'testData.tsv'),
                   header=0, delimiter="\t", quoting=3)
y = train["sentiment"]

print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))

print 'vectorizing... ',
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
                      use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

# Fit on train + test together so both share one vocabulary
X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)
def getCleanTestReviews(skucollection):
    clean_skucollection = []
    for sku in skucollection["query"]:
        clean_skucollection.append(
            KaggleWord2VecUtility.sku_to_wordlist(sku, remove_stopwords=False))
    return clean_skucollection
def getCleanTrainReviews(skucollection):
    clean_skucollection = []
    for sku in skucollection["product_title"]:
        clean_skucollection.append(
            KaggleWord2VecUtility.sku_to_wordlist(sku, remove_stopwords=False))
    return clean_skucollection
# Create clean_train_reviews and clean_test_reviews as we did before
#
# Read data from files
train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
                   header=0, delimiter="\t", quoting=3)

print "Cleaning training reviews"
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(
        KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))

print "Cleaning test reviews"
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(
        KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))

# ****** Create bags of centroids
#
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")

# Transform the training set reviews into bags of centroids, as sketched below
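# Hedged continuation sketch (added), following the Kaggle tutorial's pattern:
# fill the pre-allocated array one review at a time. It assumes a
# create_bag_of_centroids(wordlist, word_centroid_map) helper, whose tail appears
# in an earlier snippet, plus a word_centroid_map produced by a KMeans step.
counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
    counter += 1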
raw_input("Press Enter to continue...") print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...' #nltk.download() # Download text data sets, including stop words # Initialize an empty list to hold the clean reviews clean_train_reviews = [] # Loop over each review; create an index i that goes from 0 to the length # of the movie review list print "Cleaning and parsing the training set movie reviews...\n" for i in xrange( 0, len(train["review"])): print KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True) clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True))) # ****** Create a bag of words from the training set # print "Creating the bag of words...\n" # Initialize the "CountVectorizer" object, which is scikit-learn's # bag of words tool. vectorizer = CountVectorizer(analyzer = "word", \ tokenizer = None, \ preprocessor = None, \ stop_words = None, \ max_features = 15000)
print "Read %d labeled train skucollection " % (train["product_title"].size) # Load the punkt tokenizer tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # ****** Split the labeled and unlabeled training sets into clean sentences # sentences = [] # Initialize an empty list of sentences print "Parsing sentences from training set" for sku in train["product_title"]: sentences += KaggleWord2VecUtility.sku_to_sentences(sku, tokenizer) # ****** Set parameters and train the word2vec model # # Import the built-in logging module and configure it so that Word2Vec # creates nice output messages logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\ level=logging.INFO) # Set values for various parameters num_features = 300 # Word vector dimensionality min_word_count = 40 # Minimum word count num_workers = 4 # Number of threads to run in parallel context = 10 # Context window size downsampling = 1e-3 # Downsample setting for frequent words
csv.field_size_limit(sys.maxsize)

# Read train data.
train_word_vector = pd.read_pickle('all.pkl')

# Use the NLTK tokenizer to split the paragraph into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = []
print "Parsing sentences from training set..."

# Loop over each news article.
for review in train_word_vector["text"]:
    try:
        # Split a review into parsed sentences.
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
    except:
        continue

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

num_features = int(sys.argv[1])  # Word vector dimensionality
min_word_count = 20              # Minimum word count
num_workers = 40                 # Number of threads to run in parallel
context = 10                     # Context window size
downsampling = 1e-3              # Downsample setting for frequent words

print "Training Word2Vec model..."
# Train a skip-gram Word2Vec model with negative sampling. The call was
# truncated in the source; the remaining keyword arguments are assumed
# from the parameters defined above.
model = Word2Vec(sentences, workers=num_workers, hs=0, sg=1, negative=10, iter=25,
                 size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling, seed=1)
def main():
    start_time = datetime.now()
    df = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
                     header=0, delimiter="\t", quoting=3)

    if GO_FOR_REAL:
        test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
                           header=0, delimiter="\t", quoting=3)
        train = df['review']
        train_sentiment = df['sentiment']
        test_id = test['id'].str.replace('"', '')
        test = test['review']
    else:
        train, test, train_sentiment, test_sentiment = cross_validation.train_test_split(
            df['review'].values, df['sentiment'].values, test_size=0.4, random_state=0)

    print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
    #nltk.download()  # Download text data sets, including stop words

    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the movie review list
    print "Cleaning and parsing the training set movie reviews...\n"
    for i in xrange(0, len(train)):
        clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train[i], True)))

    # ****** Create a bag of words from the training set
    print "Creating the bag of words...\n"

    # CountVectorizer is scikit-learn's bag of words tool: it transforms each
    # document into a word count vector, with each word as a feature.
    # Stop words are very frequent words that carry little semantic weight and
    # can dilute the importance of more meaningful words.
    # N-grams add word combinations as new features.
    # stop_words: 'english' uses sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS;
    #   it seems to make the results less stable here.
    # max_features: keeps only the most commonly appearing words; None allows
    #   all features at the cost of a larger vector.
    # vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words='english', max_features=None)

    # Tf-idf is a normalization that reduces the weight of words appearing too
    # frequently in the dataset. TfidfVectorizer is a CountVectorizer that
    # applies tf-idf normalization during transform.
    vectorizer = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                                 analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
                                 use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

    # fit_transform() does two things: first, it fits the model and learns the
    # vocabulary; second, it transforms our training data into feature vectors.
    # The input to fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    print 'Train data feature shape: ' + str(train_data_features.shape)
    print 'Number of vocabularies/features: %d\n' % len(vectorizer.get_feature_names())

    # Numpy arrays are easy to work with, so convert the result to an array
    train_data_features = train_data_features.toarray()

    # ******* Train a model using the bag of words
    print "Training the model (this may take a while)..."
    # clf = RandomForestClassifier(n_estimators=100)  # Random Forest with 100 trees
    # clf = svm.LinearSVC(C=1)
    clf = LogisticRegressionCV(cv=3, scoring='roc_auc', solver='liblinear', Cs=[3, 4, 5, 6, 7])

    # Cross validation, this takes a long time ...
# print "4 Fold CV Score: ", np.mean(cross_validation.cross_val_score(clf, train_data_features, train_sentiment, cv=4, scoring='accuracy', n_jobs=4)) # Fit the svc to the training set, using the bag of words as # features and the sentiment labels as the response variable # # This may take a few minutes to run model = clf.fit(train_data_features, train_sentiment) # Create an empty list and append the clean reviews one by one clean_test_reviews = [] print "Cleaning and parsing the test set movie reviews...\n" for i in xrange(0, len(test)): clean_test_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test[i], True))) # Get a bag of words for the test set, and convert to a numpy array test_data_features = vectorizer.transform(clean_test_reviews) test_data_features = test_data_features.toarray() # Use svc to make sentiment label predictions print "Predicting test labels...\n" # Copy the results to a pandas dataframe with an "id" column and # a "sentiment" column if GO_FOR_REAL: result = model.predict_proba(test_data_features)[:, 1] # predict as probability output = pd.DataFrame(data={"id": test_id, "sentiment":result}) else: result = model.predict(test_data_features) output = pd.DataFrame(data={"sentiment":test_sentiment, "predict_sentiment":result}) output['succeed'] = output['sentiment'] == output['predict_sentiment'] groupby = output.groupby('succeed') print 'Result Evaluation' print groupby['sentiment'].agg(['count']) # Use pandas to write the comma-separated output file output.to_csv(os.path.join(os.path.dirname(__file__), 'data', 'Bag_of_Words_model.csv'), index=False, quoting=csv.QUOTE_MINIMAL) print "Wrote results to Bag_of_Words_model.csv" print datetime.now() - start_time print 'Cs_' print getattr(model, 'Cs_') print 'scores_' print getattr(model, 'scores_') print 'C_' print getattr(model,'C_')
# In[12]:

import pandas as pd

# Read data from files
article = pd.read_csv("train_trend_1.csv")
article_test = pd.read_csv("test_trend_1.csv")


# In[ ]:

print "Parsing train reviews..."
opinions = []
for opinion in article['Articles']:
    opinions.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(opinion)))


# In[ ]:

print "Parsing test reviews..."
opinions_test = []
for opinion_test in article_test['Articles']:
    opinions_test.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(opinion_test)))


# In[ ]:

# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros((article['Articles'].size, num_clusters), dtype="float32")
def clean_review_function(review):
    list_of_words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=False)
    return ' '.join(list_of_words)
train_file = 'data/labeledTrainData.tsv'
test_file = 'data/testData.tsv'
output_file = 'data/bow_predictions.csv'

train = pd.read_csv(train_file, header=0, delimiter="\t", quoting=3)
test = pd.read_csv(test_file, header=0, delimiter="\t", quoting=3)

print "Parsing train reviews..."
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(review)))

print "Parsing test reviews..."
clean_test_reviews = []
for review in test['review']:
    clean_test_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(review)))

print "Vectorizing train..."
vectorizer = TfidfVectorizer(max_features=40000, ngram_range=(1, 3), sublinear_tf=True)
train_x = vectorizer.fit_transform(clean_train_reviews)
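# Hedged continuation sketch (added; not part of the original script): transform
# the test reviews with the already-fitted vectorizer and train a linear model.
# The classifier choice is an assumption; "sentiment" is the label column in
# labeledTrainData.tsv.
from sklearn.linear_model import LogisticRegression

print "Vectorizing test..."
test_x = vectorizer.transform(clean_test_reviews)

clf = LogisticRegression()
clf.fit(train_x, train['sentiment'])
predictions = clf.predict(test_x)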
testing_documents = [item[2] + ' ' + item[3] for item in fetched_testing_data]

print("Predicting the labels of the test set...")
print("%d documents" % len(testing_documents))
print("%d categories" % len(testing_outputs))

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list
print("Cleaning and parsing the training set...\n")
for i in xrange(0, len(training_documents)):
    print('Row ' + str(i))
    clean_train_reviews.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(training_documents[i], True)))

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

# fit_transform() does two functions: first, it fits the model and learns the
# vocabulary; second, it transforms our training data into feature vectors.
# The input to fit_transform should be a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an array
train_data_features = train_data_features.toarray()
test["review"].size, unlabeled_train["review"].size ) # Load the punkt tokenizer tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # ****** Split the labeled and unlabeled training sets into clean sentences # sentences = [] # Initialize an empty list of sentences print "Parsing sentences from training set" for review in train["review"]: sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer) print "Parsing sentences from unlabeled set" for review in unlabeled_train["review"]: sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer) # ****** Set parameters and train the word2vec model # # Import the built-in logging module and configure it so that Word2Vec # creates nice output messages logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\ level=logging.INFO) # Set values for various parameters num_features = 300 # Word vector dimensionality min_word_count = 40 # Minimum word count
# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list

# Make a connection to MongoDB
client = MongoClient('localhost', 27017)
db = client.cs336
db.create_collection("unlabeled_review")

print "Cleaning and parsing the training set movie reviews...\n"
# for i in xrange(0, len(train["review"])):
for i in xrange(0, 500):
    #clean_train_reviews.append(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], True))
    #clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_worddict(train["review"][i], True)))
    print i
    clean_train_review = KaggleWord2VecUtility.review_to_worddict(train["review"][i], True)

    # Store each cleaned review as one MongoDB document
    record = {}
    record["id"] = train["id"][i]
    # record["sentiment"] = train["sentiment"][i]
    record["review"] = clean_train_review
    db.unlabeled_review.insert_one(record)

print "Inserted all documents to the collection"

# ****** Create a bag of words from the training set
import os

import numpy as np
import pandas as pd
import pickle

train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
                   header=0, delimiter="\t", quoting=3)
y = train["sentiment"]

print "Cleaning and parsing movie reviews...\n"
traindata = []
for i in xrange(0, len(train["review"])):
    traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in xrange(0, len(test["review"])):
    testdata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))

print 'vectorizing... ',
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
                      use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

X_all = traindata + testdata
lentrain = len(traindata)

print "fitting pipeline... ",
tfv.fit(X_all)
X_all = tfv.transform(X_all)
name_list = ['sentiment', 'id', 'time', 'query', 'user', 'text']
train = pd.read_csv(data_path + "training.1600000.processed.noemoticon.csv",
                    header=None, names=name_list)
test = pd.read_csv(data_path + "testdata.manual.2009.06.14.csv",
                   header=None, names=name_list)

# Initialize an empty list to hold the clean tweets
clean_train_tweets = []

# Loop over each tweet; create an index i that goes from 0 to the length
# of the tweet list
print "Cleaning and parsing the training set tweets...\n"
for i in xrange(0, len(train["text"])):
    print (str(i))
    clean_train_tweets.append(KaggleWord2VecUtility.review_to_wordlist(train["text"][i], remove_stop_words))
train_labels = train['sentiment'].tolist()

# Initialize an empty list to hold the clean tweets
clean_test_tweets = []

print "Cleaning and parsing the test set tweets...\n"
for i in xrange(0, len(test["text"])):
    print (str(i))
    clean_test_tweets.append(KaggleWord2VecUtility.review_to_wordlist(test["text"][i], remove_stop_words))
test_labels = test['sentiment'].tolist()
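# Hedged sketch (added): the loops above keep each tweet as a token list, so join
# the tokens back into strings before handing them to a scikit-learn vectorizer;
# the feature cap is an arbitrary example value.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=5000)
train_features = vectorizer.fit_transform([" ".join(t) for t in clean_train_tweets])
test_features = vectorizer.transform([" ".join(t) for t in clean_test_tweets])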