def main(ptrain, ntrain, ptest, ntest, out, modeltype):
    """Train a recurrent sentiment model and write per-example predictions.

    ptrain/ntrain and ptest/ntest are forwarded to ``load_data`` (positive and
    negative inputs for the train and test sets); ``out`` is the path of the
    prediction file; ``modeltype`` selects the recurrent layer and must be
    "gated_recurrent" or "lstm_recurrent".
    """
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]
    print("Using the %s model ..." % modeltype)

    print("Loading data ...")
    train_X, train_y = load_data(ptrain, ntrain)
    test_X, test_y = load_data(ptest, ntest)

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    train_X = tokenizer.fit_transform(train_X)
    test_X = tokenizer.transform(test_X)

    print("Training ...")
    # Both variants share an identical stack; only the recurrent layer class
    # differs, so pick the class first and build one layer list.
    recurrent_cls = GatedRecurrent if modeltype == "gated_recurrent" else LstmRecurrent
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        recurrent_cls(size=512, activation='tanh',
                      gate_activation='steeper_sigmoid', init='orthogonal',
                      seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal'),
    ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(train_X, train_y, n_epochs=10)

    # Predicting the probabilities of positive labels
    print("Predicting ...")
    pos_probs = model.predict(test_X).flatten()
    hard_labels = np.ones(len(test_y))
    hard_labels[pos_probs < 0.5] = -1

    # One line per example: hard label (+1/-1), P(positive), P(negative).
    with open(out, "w") as f:
        for lab, pos_pr, neg_pr in zip(hard_labels, pos_probs, 1 - pos_probs):
            f.write("%d %f %f\n" % (lab, pos_pr, neg_pr))
# NOTE(review): this chunk opened with a bare ``return`` statement — the
# enclosing ``def`` header was outside this view.  Reconstructed as ``clean``
# from the call sites below (``clean(tr_data['review'].values)``);
# TODO confirm against the original file.
def clean(texts):
    """Strip HTML markup and normalize each text to lower-cased, trimmed plain text."""
    return [html.fromstring(text).text_content().lower().strip() for text in texts]


if __name__ == "__main__":
    # Kaggle "Bag of Words Meets Bags of Popcorn" workflow: train a GRU
    # sentiment model on the labeled reviews, then score the test file.
    tr_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
    trX = clean(tr_data['review'].values)
    trY = tr_data['sentiment'].values
    print("Training data loaded and cleaned.")

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    print("Training data tokenized.")

    # Embedding -> GRU -> sigmoid head for binary sentiment.
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        GatedRecurrent(size=512, activation='tanh',
                       gate_activation='steeper_sigmoid', init='orthogonal',
                       seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal'),
    ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    # Score the unlabeled test reviews with the fitted tokenizer + model.
    te_data = pd.read_csv('testData.tsv', delimiter='\t')
    ids = te_data['id'].values
    teX = clean(te_data['review'].values)
    teX = tokenizer.transform(teX)
    pr_teX = model.predict(teX).flatten()

    pd.DataFrame(np.asarray([ids, pr_teX]).T).to_csv(
        'submission.csv', index=False, header=["id", "sentiment"])
from passage.models import RNN from passage.updates import NAG, Regularizer from passage.layers import Generic, GatedRecurrent, Dense from passage.utils import load, save from load import load_mnist trX, teX, trY, teY = load_mnist() #Use generic layer - RNN processes a size 28 vector at a time scanning from left to right layers = [ Generic(size=28), GatedRecurrent(size=512, p_drop=0.2), Dense(size=10, activation='softmax', p_drop=0.5) ] #A bit of l2 helps with generalization, higher momentum helps convergence updater = NAG(momentum=0.95, regularizer=Regularizer(l2=1e-4)) #Linear iterator for real valued data, cce cost for softmax model = RNN(layers=layers, updater=updater, iterator='linear', cost='cce') model.fit(trX, trY, n_epochs=20) tr_preds = model.predict(trX[:len(teY)]) te_preds = model.predict(teX) tr_acc = np.mean(trY[:len(teY)] == np.argmax(tr_preds, axis=1)) te_acc = np.mean(teY == np.argmax(te_preds, axis=1)) # Test accuracy should be between 98.9% and 99.3% print 'train accuracy', tr_acc, 'test accuracy', te_acc
# Fit the tokenizer on the (already cleaned) training reviews bound earlier.
tokenizer = Tokenizer(min_df=10, max_features=100000)
trX = tokenizer.fit_transform(trX)
print("Training data tokenized.")

# Embedding -> GRU -> sigmoid head for binary sentiment.
layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(
        size=512,
        activation='tanh',
        gate_activation='steeper_sigmoid',
        init='orthogonal',
        seq_output=False,
        p_drop=0.75,
    ),
    Dense(size=1, activation='sigmoid', init='orthogonal'),
]

model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
model.fit(trX, trY, n_epochs=10)

# Score the Kaggle test file with the fitted tokenizer + model.
te_data = pd.read_csv('testData.tsv', delimiter='\t')
ids = te_data['id'].values
teX = tokenizer.transform(clean(te_data['review'].values))
pr_teX = model.predict(teX).flatten()

submission = pd.DataFrame(np.asarray([ids, pr_teX]).T)
submission.to_csv('submission.csv', index=False, header=["id", "sentiment"])
ntest=1000) # Can increase up to 250K or so print len(trX), len(trY), len(teX), len(teY) print teX.shape() tokenizer = Tokenizer(min_df=10, max_features=50000) #print trX[1] # see a blog example trX = tokenizer.fit_transform(trX) text = "Evropa je v jeho politika naprosto impotent ." teX = tokenizer.transform(text) print "number of tokens:" + str(len(trX)) print "number of feathures:" + str(tokenizer.n_features) layers = [ Embedding(size=256, n_features=tokenizer.n_features), GatedRecurrent(size=512, p_drop=0.2), Dense(size=10, activation='softmax', p_drop=0.5) ] model = RNN( layers=layers, cost='cce' ) # bce is classification loss for binary classification and sigmoid output model = load('modelEcho.pkl') # How to load te_pred = model.predict(teX) #tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5) #te_acc = metrics.accuracy_score(teY, te_preds > 0.5) print te_pred
# Grid search over embedding size, GRU size, and epoch count
# (``embedding_sizes`` is defined earlier, outside this view).
gru_sizes = [10, 20, 50, 100, 200, 1000]
epochs = [1, 3, 5, 7, 10]

for embedding_size, gru_size, num_epochs in product(embedding_sizes, gru_sizes, epochs):
    # Hold out 10% for evaluation; fixed seed so every configuration is
    # scored on the same split.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_text, train_labels, test_size=0.1, random_state=0
    )

    layers = [
        Embedding(size=embedding_size, n_features=tokenizer.n_features),
        GatedRecurrent(size=gru_size),
        Dense(size=1, activation="sigmoid"),
    ]
    model = RNN(layers=layers, cost="BinaryCrossEntropy")

    # fixed: the model previously trained on the FULL corpus
    # (train_tokens / train_labels), so X_test was part of the training data
    # and the reported accuracy was inflated.  Train only on the 90% split.
    model.fit(tokenizer.transform(X_train), y_train, n_epochs=int(num_epochs))

    modelfile_name = "stubborn_model.paramsearch.embedding{}.gru{}.epoch{}".format(
        embedding_size, gru_size, num_epochs)
    save(model, modelfile_name + ".pkl")
    pickle.dump(tokenizer, open(modelfile_name + "-tokenizer.pkl", "wb"))

    # Accuracy at the 0.5 decision threshold on the held-out split.
    # (renamed: the original reused ``results`` for both the predictions and
    # the accuracy figure)
    predictions = model.predict(tokenizer.transform(X_test))
    correct = sum(1 for r, g in zip(predictions, y_test) if int(r >= 0.5) == int(g))
    accuracy = 1.0 * correct / len(y_test)
    print(modelfile_name + "\t" + str(accuracy))
def train_model(modeltype, delta): assert modeltype in ["gated_recurrent", "lstm_recurrent"] print "Begin Training" df_imdb_reviews = pd.read_csv('../data/imdb_review_data.tsv', escapechar='\\', delimiter='\t') X = clean(df_imdb_reviews['review'].values) y = df_imdb_reviews['sentiment'].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42) print "Tokenize" tokenizer = Tokenizer(min_df=10, max_features=100000) X_train = tokenizer.fit_transform(X_train) X_train = [[float(x) for x in y] for y in X_train] X_test = tokenizer.transform(X_test) X_test = [[float(x) for x in y] for y in X_test] print "Number of featers: {}".format(tokenizer.n_features) print "Training model" if modeltype == "gated_recurrent": layers = [ Embedding(size=256, n_features=tokenizer.n_features), GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=True, p_drop=0.5), Dense(size=1, activation='sigmoid', init='orthogonal') ] else: layers = [ Embedding(size=256, n_features=tokenizer.n_features), LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=True, p_drop=0.5), Dense(size=1, activation='sigmoid', init='orthogonal') ] # bce is classification loss for binary classification and sigmoid output model = RNN(layers=layers, cost='bce', updater=Adadelta, (lr=delta)) model.fit(X_train, y_train, n_epochs=20) with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f: vectorizer = pickle.dump(tokenizer, f) with open('../data/{}_model_delta_{}._pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f: model = pickle.dump(model, f) try: y_pred_te = model.predict(X_test).flatten() >= 0.5 y_pred_tr = model.predict(X_train).flatten() >= 0.5 print 'Test Accuracy: {}'.format(accuracy_score(y_test,y_pred_te)) print 'Test Precision: {}'.format(precision_score(y_test,y_pred_te)) print 'Test Recall: 
{}'.format(recall_score(y_test,y_pred_te)) print 'Train Accuracy: {}'.format(accuracy_score(y_train,y_pred_tr)) print 'Train Precision: {}'.format(precision_score(y_train,y_pred_tr)) print 'Train Recall: {}'.format(recall_score(y_train,y_pred_tr)) except: print "Unable to perform metrics" return tokenizer, model
X_test = tokenizer.transform(newsgroups_test.data) Y_train = newsgroups_train.target Y_test = newsgroups_test.target print tokenizer.n_features layers = [ Embedding(size=128, n_features=tokenizer.n_features), GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False), Dense(size=1, activation='sigmoid', init='orthogonal') # sigmoid for binary classification ] model = RNN( layers=layers, cost='bce' ) # bce is classification loss for binary classification and sigmoid output for i in range(2): model.fit(X_train, Y_train, n_epochs=1) tr_preds = model.predict(X_train[:len(Y_test)]) te_preds = model.predict(X_test) tr_acc = metrics.accuracy_score(Y_train[:len(Y_test)], tr_preds > 0.5) te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5) print i, tr_acc, te_acc # dataset too small to fully utilize Passage save(model, 'model.pkl')
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb')) print "number of tokens:" + str(len(trX)) teX = tokenizer.transform(teX) print "number of feathures:" + str(tokenizer.n_features) layers = [ Embedding(size=256, n_features=tokenizer.n_features), GatedRecurrent(size=725), Dense(size=10, activation='softmax') ] model = RNN(layers=layers, cost='cce') model.fit(trX, trY, n_epochs=10) save(model, 'modelEcho.pkl') tr_preds = model.predict(trX) te_preds = model.predict(teX) data = pd.DataFrame(trY) data.to_csv('data/trY.vec') data = pd.DataFrame(tr_preds) data.to_csv('data/tr_preds.vec') tr_acc = np.mean(np.argmax(trY, axis=1) == np.argmax(tr_preds, axis=1)) indexy = np.argmax(teY, axis=1) data = pd.DataFrame(indexy) data.to_csv('data/ev_agrmax.txt') data = pd.DataFrame(np.argmax(te_preds, axis=1)) data.to_csv('data/ev_pred_agrmax.txt')
from passage.layers import Embedding, GatedRecurrent, Dense from passage.models import RNN from passage.utils import save tokenizer = Tokenizer(min_df=10, max_features=50000) X_train = tokenizer.fit_transform(newsgroups_train.data) X_test = tokenizer.transform(newsgroups_test.data) Y_train = newsgroups_train.target Y_test = newsgroups_test.target print tokenizer.n_features layers = [ Embedding(size=128, n_features=tokenizer.n_features), GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False), Dense(size=1, activation='sigmoid', init='orthogonal') # sigmoid for binary classification ] model = RNN(layers=layers, cost='bce') # bce is classification loss for binary classification and sigmoid output for i in range(2): model.fit(X_train, Y_train, n_epochs=1) tr_preds = model.predict(X_train[:len(Y_test)]) te_preds = model.predict(X_test) tr_acc = metrics.accuracy_score(Y_train[:len(Y_test)], tr_preds > 0.5) te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5) print i, tr_acc, te_acc # dataset too small to fully utilize Passage save(model, 'model.pkl')
#!/usr/bin/env python # coding=utf-8 from passage.preprocessing import Tokenizer from passage.layers import Embedding, GatedRecurrent, Dense from passage.models import RNN from passage.utils import save, load train_text = ['hello world','foo bar'] train_labels = [0,1] test_text = ['good man'] tokenizer = Tokenizer() train_tokens = tokenizer.fit_transform(train_text) layers = [ Embedding(size=128, n_features=tokenizer.n_features), GatedRecurrent(size=128), Dense(size=1, activation='sigmoid') ] model = RNN(layers=layers, cost='BinaryCrossEntropy') model.fit(train_tokens, train_labels) print model.predict(tokenizer.transform(test_text)) save(model, 'save_test.pkl') model = load('save_test.pkl')
# Sequence-labeling experiment: predict a token-level tag per position
# (``flatten`` and ``train_tokens`` are defined earlier, outside this view).
num_feats = len(set(flatten(train_tokens)))


def get_labels(token_id):
    """Return a one-hot pair: [1, 0] for token id 3, [0, 1] otherwise."""
    # fixed: the parameter was named ``id``, shadowing the builtin.
    if token_id == 3:
        return [1, 0]
    return [0, 1]


# fixed: the original used the Python-2-only tuple-parameter lambda
# ``map(lambda (l): map(get_labels, l), train_tokens)``; this nested
# comprehension is equivalent and portable.
# NOTE(review): seq_labels is never used below — kept for parity with the
# commented-out experiments.
seq_labels = [[get_labels(tok) for tok in seq] for seq in train_tokens]

layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax'),
]

# Earlier iterator experiments, kept for reference:
# iterator = SortedPadded(y_pad=True, y_dtype=intX)
# iterator = SortedPadded(y_dtype=intX)
# model = RNN(layers=layers, cost='seq_cce', iterator=iterator, Y=T.imatrix())
model = RNN(layers=layers, cost='seq_cce')
# model.fit(train_tokens, [1,0,1])
model.fit(train_tokens, train_tokens)

# model.predict(tokenizer.transform(["Frogs are awesome", "frogs are amphibious"]))
model.predict(train_tokens)
save(model, 'save_test.pkl')
model = load('save_test.pkl')

"""
This model, although doing sequential prediction, predicts a tag per document not per word.
"""