# Imports assumed by the functions below. get_feature_vector, extract_features,
# apply_tf_idf, and the upper-case constants (FREQ_DIST_FILE, UNIGRAM_SIZE, ...)
# are provided elsewhere in this project.
import random

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

import utils


def process_tweets(csv_file, test_file=True):
    """Returns a list of tuples of type (tweet_id, feature_vector)
    or (tweet_id, sentiment, feature_vector).

    Args:
        csv_file (str): Name of processed csv file generated by preprocess.py
        test_file (bool, optional): If processing test file

    Returns:
        list: Of tuples
    """
    tweets = []
    print('Generating feature vectors')
    with open(csv_file, 'r') as f:  # renamed from `csv` to avoid shadowing the stdlib module
        lines = f.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                # maxsplit keeps any commas inside the tweet body intact
                tweet_id, tweet = line.split(',', 1)
            else:
                tweet_id, sentiment, tweet = line.split(',', 2)
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
            utils.write_status(i + 1, total)
    print('\n')
    return tweets
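# --- Illustrative sketch (not part of the original file) ---
# get_feature_vector is defined elsewhere in this project; based on how
# process_tweets uses it, it is assumed to turn a preprocessed tweet into a
# list of unigram tokens (plus word-pair bigrams when enabled). A minimal
# sketch under that assumption, with a hypothetical name to avoid clashing
# with the real helper:
def get_feature_vector_sketch(tweet, use_bigrams=False):
    words = tweet.strip().split()
    feature_vector = list(words)
    if use_bigrams:
        # pair each word with its successor to form bigram features
        feature_vector += list(zip(words[:-1], words[1:]))
    return feature_vector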
def dtmain(dataset):
    results = {}  # renamed from `dict` to avoid shadowing the builtin
    np.random.seed(1337)
    # top unigrams/bigrams are loaded here; extract_features is presumed to
    # consume them via module state
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = DecisionTreeClassifier(max_depth=25)
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(
            train_tweets, test_file=False, feat_type=FEAT_TYPE,
            batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        clf.fit(training_set_X, training_set_y)
    print('\n')
    print('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
        batch_size = len(val_tweets)
        n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
        for val_set_X, val_set_y in extract_features(
                val_tweets, test_file=False, feat_type=FEAT_TYPE,
                batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                val_set_X = tfidf.transform(val_set_X)
            prediction = clf.predict(val_set_X)
            correct += np.sum(prediction == val_set_y)
            utils.write_status(i, n_val_batches)
            i += 1
        results.update({'dataset': dataset})
        results.update({'correct': correct})
        results.update({'total': total})
        rslt = correct * 100. / total
        results.update({'result': round(rslt, 2)})
        print('\nCorrect: %d/%d = %.4f %%' % (correct, total, rslt))
    else:
        del train_tweets
        test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
        predictions = np.array([])
        print('Predicting batches')
        i = 1
        # batch_size is now passed explicitly (the original omitted it), so
        # n_test_batches matches the number of batches actually yielded
        for test_set_X, _ in extract_features(
                test_tweets, test_file=True, feat_type=FEAT_TYPE,
                batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                test_set_X = tfidf.transform(test_set_X)
            prediction = clf.predict(test_set_X)
            predictions = np.concatenate((predictions, prediction))
            utils.write_status(i, n_test_batches)
            i += 1
        predictions = [(str(j), int(predictions[j]))
                       for j in range(len(test_tweets))]
        utils.save_results_to_csv(predictions, 'decisiontree.csv')
        print('\nSaved to decisiontree.csv')
    return results
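# --- Illustrative sketch (not part of the original file) ---
# apply_tf_idf is defined elsewhere in this project. Given that dtmain fits it
# on a raw term-frequency matrix and then calls .transform() on later batches,
# it is assumed to wrap sklearn's TfidfTransformer; a minimal sketch under
# that assumption, with a hypothetical name:
from sklearn.feature_extraction.text import TfidfTransformer

def apply_tf_idf_sketch(X):
    # fit IDF weights on the term-frequency matrix X and return the fitted
    # transformer so callers can reuse it on validation/test batches
    transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True)
    transformer.fit(X)
    return transformer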
def mainmethod(self):
    USE_BIGRAMS = True  # local flag; the related lookups below are currently disabled
    results = {}  # renamed from `dict` to avoid shadowing the builtin
    np.random.seed(1337)
    # unigrams = utils.top_n_words(self.FREQ_DIST_FILE, self.UNIGRAM_SIZE)
    # if USE_BIGRAMS:
    #     bigrams = utils.top_n_bigrams(self.BI_FREQ_DIST_FILE, self.BIGRAM_SIZE)
    tweets = self.process_tweets(self.TRAIN_PROCESSED_FILE, test_file=False)
    if self.TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = MultinomialNB()
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in self.extract_features(
            train_tweets, test_file=False, feat_type=self.FEAT_TYPE,
            batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if self.FEAT_TYPE == 'frequency':
            tfidf = self.apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        # partial_fit supports incremental training across batches
        clf.partial_fit(training_set_X, training_set_y, classes=[0, 1])
    print('\n')
    print('Testing')
    if self.TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
        batch_size = len(val_tweets)
        n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
        for val_set_X, val_set_y in self.extract_features(
                val_tweets, test_file=False, feat_type=self.FEAT_TYPE,
                batch_size=batch_size):
            if self.FEAT_TYPE == 'frequency':
                val_set_X = tfidf.transform(val_set_X)
            prediction = clf.predict(val_set_X)
            correct += np.sum(prediction == val_set_y)
            utils.write_status(i, n_val_batches)
            i += 1
        results.update({'dataset': self.TEST_PROCESSED_FILE})
        results.update({'correct': correct})
        results.update({'total': total})
        rslt = correct * 100. / total
        results.update({'result': round(rslt, 2)})
        print('\nCorrect: %d/%d = %.4f %%' % (correct, total, rslt))
    else:
        del train_tweets
        test_tweets = self.process_tweets(self.TEST_PROCESSED_FILE,
                                          test_file=True)
        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
        predictions = np.array([])
        print('Predicting batches')
        i = 1
        # fixed: the original checked the bare name FEAT_TYPE (a NameError in
        # this method) and omitted batch_size, so n_test_batches could
        # disagree with the number of batches actually yielded
        for test_set_X, _ in self.extract_features(
                test_tweets, test_file=True, feat_type=self.FEAT_TYPE,
                batch_size=batch_size):
            if self.FEAT_TYPE == 'frequency':
                test_set_X = tfidf.transform(test_set_X)
            prediction = clf.predict(test_set_X)
            predictions = np.concatenate((predictions, prediction))
            utils.write_status(i, n_test_batches)
            i += 1
        predictions = [(str(j), int(predictions[j]))
                       for j in range(len(test_tweets))]
        utils.save_results_to_csv(predictions, 'naivebayes.csv')
        print('\nSaved to naivebayes.csv')
    return results
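# --- Illustrative usage (not part of the original file) ---
# Hypothetical entry point showing how the two classifiers might be run. The
# class enclosing mainmethod is not shown above, so `NaiveBayesRunner` is a
# placeholder name for it, and 'twitter' is a placeholder dataset label:
if __name__ == '__main__':
    dt_results = dtmain('twitter')
    print(dt_results)
    nb_results = NaiveBayesRunner().mainmethod()
    print(nb_results)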