def sentiment_analysis(indata):
    print("Predicting: {0}".format(indata))
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    bigrams = None
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    batch_size = 500
    model = load_model('best_model.h5')
    # Wrap the single input as (id, feature_vector) so it matches the batch pipeline.
    test_data = [(1, get_feature_vector(indata, unigrams, bigrams))]
    n_test_batches = int(np.ceil(len(test_data) / float(batch_size)))
    predictions = np.array([])
    for test_set_X, _ in extract_features(test_data, unigrams, bigrams,
                                          feat_type=FEAT_TYPE,
                                          batch_size=batch_size,
                                          test_file=True):
        prediction = np.round(model.predict_on_batch(test_set_X).flatten())
        predictions = np.concatenate((predictions, prediction))
    predictions = [(str(j), int(predictions[j])) for j in range(len(test_data))]
    if predictions[0][1] == 0:
        print("Feedback ruim")   # "Bad feedback"
        return 0
    else:
        print("Feedback bom")    # "Good feedback"
        return 1
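
# Example driver (a minimal sketch, not part of the original script): calls the
# function above on a free-text review. It assumes the module-level constants and
# helpers used above (FREQ_DIST_FILE, get_feature_vector, extract_features, ...)
# are already defined and that 'best_model.h5' exists on disk.
if __name__ == '__main__':
    label = sentiment_analysis("the product arrived quickly and works great")
    print("Predicted class:", label)  # 1 = positive feedback, 0 = negative feedback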
                  optimizer='adam', metrics=['accuracy'])
    return model


def evaluate_model(model, val_tweets):
    correct, total = 0, len(val_tweets)
    for val_set_X, val_set_y in extract_features(val_tweets, feat_type=FEAT_TYPE,
                                                 test_file=False):
        prediction = model.predict_on_batch(val_set_X)
        prediction = np.round(prediction)
        correct += np.sum(prediction == val_set_y[:, None])
    return float(correct) / total


if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    nb_epochs = 20
    batch_size = 500
    model = build_model()
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    best_val_acc = 0.0
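    # A minimal sketch of the epoch loop that would typically follow here (not the
    # original code): train on mini-batches, track validation accuracy with
    # evaluate_model, and keep the best weights. It assumes extract_features accepts
    # a batch_size argument and yields (X, y) pairs for training data.
    for epoch in range(nb_epochs):
        for i, (train_set_X, train_set_y) in enumerate(
                extract_features(train_tweets, feat_type=FEAT_TYPE,
                                 batch_size=batch_size, test_file=False)):
            model.train_on_batch(train_set_X, train_set_y)
            utils.write_status(i + 1, n_train_batches)
        val_acc = evaluate_model(model, val_tweets)
        print('\nEpoch {0}: validation accuracy = {1:.4f}'.format(epoch + 1, val_acc))
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            model.save('best_model.h5')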
        tweets.append(feature_vector)
        labels.append(int(sentiment))
        utils.write_status(i + 1, total)
    print('\n')
    return tweets, np.array(labels)


if __name__ == '__main__':
    train = len(sys.argv) == 1
    np.random.seed(1337)
    vocab_size = 90000
    batch_size = 500
    max_length = 40
    filters = 600
    kernel_size = 3
    vocab = utils.top_n_words(FREQ_DIST_FILE, vocab_size, shift=1)
    glove_vectors = get_glove_vectors(vocab)
    tweets, labels = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    # Create an embedding matrix; dim is the dimensionality of the GloVe vectors
    # (defined earlier in the script).
    embedding_matrix = np.random.randn(vocab_size + 1, dim) * 0.01
    # Seed it with GloVe vectors
    for word, i in vocab.items():
        glove_vector = glove_vectors.get(word)
        if glove_vector is not None:
            embedding_matrix[i] = glove_vector
    # Pad/truncate every tweet to max_length tokens and shuffle the data.
    tweets = pad_sequences(tweets, maxlen=max_length, padding='post')
    shuffled_indices = np.random.permutation(tweets.shape[0])
    tweets = tweets[shuffled_indices]
    labels = labels[shuffled_indices]
    if train:
        model = Sequential()
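        # A minimal sketch of how the network might be assembled from the
        # hyperparameters above (filters, kernel_size, max_length): an illustrative
        # Conv1D text classifier, not necessarily the original architecture. It
        # assumes the usual Keras layer imports (Embedding, Dropout, Conv1D,
        # GlobalMaxPooling1D, Dense) are present at the top of the script.
        model.add(Embedding(vocab_size + 1, dim,
                            weights=[embedding_matrix], input_length=max_length))
        model.add(Dropout(0.4))
        model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu'))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        model.fit(tweets, labels, batch_size=batch_size, epochs=5,
                  validation_split=0.1)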
    for val_set_X, val_set_y in extractFeatures(val_tweets, 500, feat_type=FeatType,
                                                test_file=False):
        prediction = model.predict_on_batch(val_set_X)
        # np.round uses round-half-to-even ("bankers' rounding", common in finance):
        # ties go to the nearest even integer, which avoids the upward bias of always
        # rounding .5 up. See the short illustration at the end of this excerpt.
        prediction = np.round(prediction)
        correct += np.sum(prediction == val_set_y[:, None])
    return float(correct) / total


if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FreqDistFile, UnigramSize)
    if UseBigrams:
        bigrams = utils.top_n_bigrams(BiFreqDistFile, BigramSize)
    tweets = process_tweets(TrainProcessedFile, test_file=False)
    # if TRAIN:
    train_tweets, val_tweets = utils.split_data(tweets)  # validation split
    # else:
    #     random.shuffle(tweets)
    #     train_tweets = tweets
    del tweets
    print('Extracting features and training batches')
    nb_epochs = 20
    batch_size = 500
    model = build_model()
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
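    # Quick illustration of the round-half-to-even behaviour mentioned above
    # (demonstration only; not part of the training pipeline).
    print(np.round(np.array([0.5, 1.5, 2.5, 3.5])))  # -> [0. 2. 2. 4.]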
# Join each token list back into a whitespace-separated string.
for i in range(len(wordsList)):
    wordsList[i] = ' '.join(wordsList[i])
print(wordsList[:3])

# Save the processed data into CSV files.
save_processed_file(wordsList[:31963], './datasets/TRAIN_PROCESSED_FILE.csv',
                    './datasets/train_tweets.csv', False)
save_processed_file(wordsList[31962:], './datasets/TEST_PROCESSED_FILE.csv',
                    './datasets/test_tweets.csv', True)

np.random.seed(1337)
unigrams = utils.top_n_words('./datasets/train_tweets-freqdist.pkl', UNIGRAM_SIZE)
if USE_BIGRAMS:
    bigrams = utils.top_n_bigrams('./datasets/train_tweets-freqdist-bi.pkl', BIGRAM_SIZE)
tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
if TRAIN:
    train_tweets, val_tweets = utils.split_data(tweets)
else:
    random.shuffle(tweets)
    train_tweets = tweets
del tweets

# Tree construction
print('Extracting features & training batches')
clf = DecisionTreeClassifier(max_depth=25)
batch_size = len(train_tweets)
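
# A minimal sketch of how the tree might be fit and validated (not necessarily the
# original continuation). With batch_size set to the full training set,
# extract_features is assumed to yield a single (X, y) pair; the signature used here
# mirrors the other scripts in this repo and may differ in this file.
for train_set_X, train_set_y in extract_features(train_tweets, unigrams, bigrams,
                                                 feat_type=FEAT_TYPE,
                                                 batch_size=batch_size,
                                                 test_file=False):
    clf.fit(train_set_X, train_set_y)
if TRAIN:
    for val_set_X, val_set_y in extract_features(val_tweets, unigrams, bigrams,
                                                 feat_type=FEAT_TYPE,
                                                 batch_size=len(val_tweets),
                                                 test_file=False):
        print('Validation accuracy:', clf.score(val_set_X, val_set_y))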