# NOTE(review): collapsed (newline-stripped) fragment of a tweet-sentiment training script, "LinearSVC" variant.
# It is truncated at BOTH ends: it opens with a bare `else:` (the tail of a per-line CSV parse inside what is
# presumably a `process_tweets(csv_file, test_file)` loop — splits `line` into (tweet_id, sentiment, tweet),
# extracts a feature vector, and appends either (id, features) for test data or (id, int(sentiment), features)
# for training data), and it ends with a `for … in extractFeatures(...):` loop whose body is cut off.
# The `__main__` section seeds numpy, loads top-N unigrams (and bigrams when UseBigrams is set), processes the
# training file, splits into train/validation via utils.split_data, and builds an svm.LinearSVC(C=0.1).
# NOTE(review): `batch_size = len(train_tweets)` makes n_train_batches == 1 (the `#!!!` markers suggest the
# author flagged this as a temporary/debug choice — confirm intended batch size).
# TODO(review): restore original line breaks/indentation — this line is not valid Python as it stands.
else: tweet_id, sentiment, tweet =line.split(',') feature_vector=getFeatureVector(tweet) if test_file: tweets.append((tweet_id, feature_vector)) else: tweets.append((tweet_id, int(sentiment), feature_vector)) #utils.write_status(i+1,total) print ('\n') return tweets if __name__ =='__main__': np.random.seed(1337) unigrams=utils.top_n_words(FreqDistFile, UnigramSize) if UseBigrams: bigrams=utils.top_n_bigrams(BiFreqDistFile, BigramSize) tweets=process_tweets(TrainProcessedFile, test_file=False) #if TRAIN: train_tweets, val_tweets = utils.split_data(tweets)#validation # else: # random.shuffle(tweets) # train_tweets=tweets del tweets print ('Extracting Features and Training batches') clf = svm.LinearSVC(C=0.1) batch_size=len(train_tweets)#!!!!!!!!!!!!! i=1 n_train_batches=int(np.ceil(len(train_tweets)/float(batch_size)))#!!!!!!!!!!!!!!! for training_set_x, training_set_y in extractFeatures(train_tweets, test_file=False, feat_type=FeatType, batch_size=batch_size): #utils.write_status(i, n_train_batches)
# NOTE(review): collapsed (newline-stripped) fragment — a cleaned-up snake_case variant of the script on the
# line above, using DecisionTreeClassifier(max_depth=25) instead of LinearSVC. Truncated at BOTH ends: it
# opens mid-loop with the CSV parse `line.split(',')` -> (tweet_id, sentiment, tweet) and ends mid-call with
# `extract_features( train_tweets, test_file=False,` — the remaining arguments and the loop body are cut off.
# Flow mirrors the svm variant: parse each tweet line into (id[, label], feature_vector) tuples, then in
# `__main__`: seed numpy, load top-N unigrams/bigrams from the *_FILE constants, process the training file,
# split train/validation when TRAIN is set (else shuffle and use everything), free `tweets`, and train.
# NOTE(review): `batch_size = len(train_tweets)` again forces a single batch — presumably deliberate here
# since the DecisionTreeClassifier is fit once on the full feature matrix; confirm.
# TODO(review): restore original line breaks/indentation — this line is not valid Python as it stands.
tweet_id, sentiment, tweet = line.split(',') feature_vector = get_feature_vector(tweet) if test_file: tweets.append((tweet_id, feature_vector)) else: tweets.append((tweet_id, int(sentiment), feature_vector)) utils.write_status(i + 1, total) print('\n') return tweets if __name__ == '__main__': np.random.seed(1337) unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE) if USE_BIGRAMS: bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE) tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False) if TRAIN: train_tweets, val_tweets = utils.split_data(tweets) else: random.shuffle(tweets) train_tweets = tweets del tweets print('Extracting features & training batches') clf = DecisionTreeClassifier(max_depth=25) batch_size = len(train_tweets) i = 1 n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size))) for training_set_X, training_set_y in extract_features( train_tweets, test_file=False,
# NOTE(review): collapsed (newline-stripped) fragment of a notebook-style driver script. Truncated at BOTH
# ends: it opens mid-main with `print(wordsList[:3])` (wordsList is defined somewhere above this view) and
# ends mid-call at `extract_features(` with the entire argument list and loop body cut off.
# Visible flow: split `wordsList` into train (first 31963 entries) and test (from index 31962 — NOTE(review):
# indices overlap by one entry, rows 31962 appears in both splits; confirm whether the off-by-one is intended)
# and persist each via save_processed_file(...); then the same pipeline as the previous line: seed numpy,
# load top-N unigrams (and bigrams when USE_BIGRAMS) from the freqdist .pkl files, process the training CSV,
# split or shuffle depending on TRAIN, and fit a DecisionTreeClassifier(max_depth=25) on a single full-size
# batch (batch_size == len(train_tweets) -> n_train_batches == 1).
# TODO(review): restore original line breaks/indentation — this line is not valid Python as it stands.
print(wordsList[:3]) # FUNCTION CALL TO SAVE THE PROCESSED DATA INTO CSV FILES save_processed_file(wordsList[:31963], './datasets/TRAIN_PROCESSED_FILE.csv', './datasets/train_tweets.csv', False) save_processed_file(wordsList[31962:], './datasets/TEST_PROCESSED_FILE.csv', './datasets/test_tweets.csv', True) np.random.seed(1337) unigrams = utils.top_n_words('./datasets/train_tweets-freqdist.pkl', UNIGRAM_SIZE) if USE_BIGRAMS: bigrams = utils.top_n_bigrams( './datasets/train_tweets-freqdist-bi.pkl', BIGRAM_SIZE) tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False) if TRAIN: train_tweets, val_tweets = utils.split_data(tweets) else: random.shuffle(tweets) train_tweets = tweets del tweets # TREE CONSTRUCTION print('Extracting features & training batches') clf = DecisionTreeClassifier(max_depth=25) batch_size = len(train_tweets) i = 1 n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size))) for training_set_X, training_set_y in extract_features(