if __name__ == "__main__": from tfn.preprocess import Dataset from tfn.helper import export_results from sklearn.metrics import accuracy_score, roc_auc_score, f1_score from sklearn.model_selection import train_test_split import argparse parser = argparse.ArgumentParser() parser.add_argument("-x", "--export-results", dest="export", action='store_true', help="Exports results to results.csv") args = parser.parse_args() data = Dataset('twitter') X_train, X_test, y_train, y_test = train_test_split(data.X, data.y) grad_boost = GradientBoost() grad_boost.fit(X_train, y_train) y_pred = grad_boost.predict(X_test) print(y_pred) acc = accuracy_score(y_test, y_pred) roc = roc_auc_score(y_test, y_pred) f1 = f1_score(y_test, y_pred) print('TF-IDF + xgb accuracy:', round(acc, 4)) print('TF-IDF + xgb AUC:', round(roc, 4)) print('TF-IDF + xgb F1:', round(f1, 4))
parser.add_argument("--emb-type", "-t", dest="type", default="glove", type=str, help="Embedding type. Can be 'glove' or 'char'.") parser.add_argument("-x", "--export-results", dest="export", action='store_true', help="Exports results to results.csv") args = parser.parse_args() if args.type == "glove": emb_size = args.emb_size data = Dataset(args.type) emb = GloveEmbedding(data.X, emb_size=emb_size, type=args.type) X = emb.corpus_vectors y = np.array(data.y) elif args.type == "char": data = Dataset(args.type) emb = CharEmbedding(data.X) X = emb.X_enc y = np.array(data.y) emb_size = 100 X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True) # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) lstm = LSTMModel(num_features=emb_size, seq_length=X.shape[1])
help="Proportion of data used for testing.") parser.add_argument("--val-prop", "-v", dest="val_prop", default=0.2, type=float, help="Proportion of data used for validation.") parser.add_argument( "--no-export", "-n", dest="no_export", action="store_false", help="Results of running will not be stored in results.csv.") args = parser.parse_args() data_t = Dataset('twitter') # Load augmentation data if args.load_aug: with open(AUG_PATH / args.load_aug, 'rb') as aug_file: num_copies, aug_t = pickle.load(aug_file) else: # Run data augmentation (takes a long time) num_copies = args.aug_copies aug_t = AugmentWithEmbeddings(data_t.X, data_t.y, num_copies=num_copies, replace_pr=args.repl_prob) if args.save_aug: with open(AUG_PATH / args.save_aug, 'wb') as aug_file: pickle.dump((num_copies, aug_t), aug_file)
        for X_batch in self.pred_test_loader:
            X_batch = X_batch[0]
            y_pred = self.model(X_batch)
            # .detach().cpu() is a no-op on CPU tensors, so one path
            # covers both devices.
            predictions_list.append(y_pred.detach().cpu().numpy())
        predictions = np.vstack(predictions_list)
        return predictions


if __name__ == '__main__':
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

    data = Dataset("glove")
    embedding = GloveEmbedding(data.X, emb_size=EMBEDDING_DIM, type="glove")
    X = embedding.corpus_vectors
    y = np.array(data.y)

    cnn = CNNModel(EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM,
                   DROPOUT, BATCH_SIZE)

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

    optimiser = optim.Adam(cnn.model.parameters())
    cnn.fit(X_train, y_train, optimiser, epochs=30)
    y_pred = cnn.predict(X_test)
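# A minimal sketch of a convolutional text classifier in the shape CNNModel's
# constructor suggests: one Conv1d per filter size over the embedded sequence,
# max-pooled and concatenated. Hypothetical; the real tfn CNNModel may differ.
import torch
import torch.nn as nn
import torch.nn.functional as F


class CNNSketch(nn.Module):
    def __init__(self, emb_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        self.convs = nn.ModuleList(
            [nn.Conv1d(emb_dim, n_filters, fs) for fs in filter_sizes])
        self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch, seq_len, emb_dim) -> (batch, emb_dim, seq_len) for Conv1d.
        x = x.permute(0, 2, 1)
        pooled = [F.relu(conv(x)).max(dim=2).values for conv in self.convs]
        return self.fc(self.dropout(torch.cat(pooled, dim=1)))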
class Callback(CallbackAny2Vec):
    """Callback to print loss after each epoch."""

    def __init__(self):
        self.epoch = 0
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        # get_latest_training_loss() is cumulative, so report the delta
        # since the previous epoch.
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(
            self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss


if __name__ == "__main__":
    from tfn.preprocess import Dataset

    # Test GloveEmbedding
    emb_type = 'glove'
    ds = Dataset(emb_type)
    emb = GloveEmbedding(ds.X, type=emb_type)
    print(emb.corpus_vectors.shape)
    print(emb.corpus[0])
    print(emb.corpus_vectors[0])

    # Test OneHotCharEmbedding
    # ds = Dataset('char')
    # emb = CharEmbedding(ds.X, train=True, training_path="../data/training.1600000.processed.noemoticon.csv")
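# Example of wiring a CallbackAny2Vec subclass like the one above into gensim
# training (gensim 4 parameter names assumed). compute_loss=True is required
# for get_latest_training_loss() to report anything; the corpus is a stand-in.
from gensim.models import Word2Vec

sentences = [["flood", "warning", "issued"], ["lovely", "day", "outside"]]
w2v = Word2Vec(sentences, vector_size=100, min_count=1, epochs=5,
               compute_loss=True, callbacks=[Callback()])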
    # Split multi-word entries into individual tokens
    for entry in vocab:
        new_vocab.extend(entry.split())

    # Remove duplicates while preserving first-seen order
    newlist = sorted(set(new_vocab), key=lambda x: new_vocab.index(x))

    print('Got ' + str(gram) + '-gram vocabulary for target ' + str(target))
    return newlist


if __name__ == '__main__':
    trainX, y = Dataset('twitter')._get_training_data_from_csv()

    vocab1_disaster = get_word(gram=1, target=1, length=500)
    vocab1_nondisaster = get_word(gram=1, target=0, length=50)
    vocab2_disaster = get_word(gram=2, target=1, length=50)
    vocab2_nondisaster = get_word(gram=2, target=0, length=50)
    vocab3_disaster = get_word(gram=3, target=1, length=50)
    vocab3_nondisaster = get_word(gram=3, target=0, length=50)

    word2vecmod = word2vec_model(corpus=trainX, update=False)

    tsne_plot(word2vecmod, vocab1_disaster, "Unigram disaster word embedding")
    tsne_plot(word2vecmod, vocab1_nondisaster, "Unigram non-disaster word embedding")
    tsne_plot(word2vecmod, vocab2_disaster, "Bigram disaster word embedding")
    tsne_plot(word2vecmod, vocab2_nondisaster, "Bigram non-disaster word embedding")
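# A sketch of what a tsne_plot helper like the one used above typically does:
# project the selected word vectors to 2-D with t-SNE and scatter-plot them
# with labels. This is an assumed implementation for illustration; the real
# tfn helper and the model's attribute layout may differ.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


def tsne_plot_sketch(model, words, title):
    # Keep only words the model actually has vectors for (gensim 4 API).
    words = [w for w in words if w in model.wv]
    if len(words) < 2:
        return  # t-SNE needs at least two points
    vectors = np.array([model.wv[w] for w in words])
    coords = TSNE(n_components=2, random_state=0,
                  perplexity=min(30, len(words) - 1)).fit_transform(vectors)
    plt.figure()
    plt.scatter(coords[:, 0], coords[:, 1])
    for (x, y), word in zip(coords, words):
        plt.annotate(word, (x, y))
    plt.title(title)
    plt.show()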