def RNN(data_train, labels_train, data_test, labels_test, n_features):
    """Train a gated-recurrent binary sentiment classifier and predict test labels.

    Adapted from Passage's sentiment.py at
    https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    License: MIT

    Parameters:
        data_train: iterable of raw training text strings.
        labels_train: binary (0/1) training labels.
        data_test: iterable of raw test text strings.
        labels_test: test labels (accepted for interface symmetry; not used here).
        n_features: vocabulary size for the Embedding layer.

    Returns:
        numpy array of predicted 0/1 labels for `data_test`
        (original computed this but never returned it).
    """
    import numpy as np
    from passage.models import RNN
    from passage.updates import Adadelta
    from passage.layers import Embedding, GatedRecurrent, Dense
    from passage.preprocessing import Tokenizer

    layers = [
        Embedding(size=128, n_features=n_features),
        GatedRecurrent(size=128, activation='tanh',
                       gate_activation='steeper_sigmoid', init='orthogonal',
                       seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal'),
    ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))

    tokenizer = Tokenizer(min_df=10)
    # BUG FIX: original referenced undefined names `data` / `labels`.
    X = tokenizer.fit_transform(data_train)
    model.fit(X, labels_train, n_epochs=10)

    # BUG FIX: `model.predit(...)` (typo) and `.flatten` without call parens
    # (that bound the method instead of flattening). Test text must also go
    # through the fitted tokenizer before prediction, as in the other
    # snippets in this file.
    predi = model.predict(tokenizer.transform(data_test)).flatten()
    labels_predicted = np.ones(len(data_test))
    labels_predicted[predi < 0.5] = 0
    return labels_predicted
def rnn(train_text, train_label):
    """Fit a small gated-recurrent binary classifier on raw text and return it."""
    tok = Tokenizer()
    token_seqs = tok.fit_transform(train_text)

    net = RNN(
        layers=[
            Embedding(size=50, n_features=tok.n_features),
            GatedRecurrent(size=128),
            Dense(size=1, activation='sigmoid'),
        ],
        cost='BinaryCrossEntropy',
    )
    net.fit(token_seqs, train_label)
    return net
def main(ptrain, ntrain, ptest, ntest, out, modeltype):
    """Train a recurrent sentiment model and write test predictions to `out`.

    Each output line is "label pos_prob neg_prob", with label in {1, -1}
    thresholded at 0.5 on the predicted positive probability.
    """
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]
    print("Using the %s model ..." % modeltype)

    print("Loading data ...")
    trX, trY = load_data(ptrain, ntrain)
    teX, teY = load_data(ptest, ntest)

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    teX = tokenizer.transform(teX)

    print("Training ...")
    # Both branches differed only in the recurrent layer class, so dispatch
    # on the class instead of duplicating the layer list.
    recurrent_cls = GatedRecurrent if modeltype == "gated_recurrent" else LstmRecurrent
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        recurrent_cls(size=512, activation='tanh',
                      gate_activation='steeper_sigmoid', init='orthogonal',
                      seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal'),
    ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    # Predicting the probabilities of positive labels
    print("Predicting ...")
    pos_prob = model.predict(teX).flatten()
    predY = np.ones(len(teY))
    predY[pos_prob < 0.5] = -1

    with open(out, "w") as f:
        for lab, p_pos, p_neg in zip(predY, pos_prob, 1 - pos_prob):
            f.write("%d %f %f\n" % (lab, p_pos, p_neg))
from passage.preprocessing import Tokenizer

# Download the data at kaggle.com/c/word2vec-nlp-tutorial/data


def clean(texts):
    """Strip HTML from each raw review, lower-case it and trim whitespace."""
    return [html.fromstring(text).text_content().lower().strip()
            for text in texts]


if __name__ == "__main__":
    tr_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
    trX = clean(tr_data['review'].values)
    trY = tr_data['sentiment'].values
    print("Training data loaded and cleaned.")

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    print("Training data tokenized.")

    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        GatedRecurrent(size=512, activation='tanh',
                       gate_activation='steeper_sigmoid', init='orthogonal',
                       seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal'),
    ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    te_data = pd.read_csv('testData.tsv', delimiter='\t')
    ids = te_data['id'].values
    teX = clean(te_data['review'].values)
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

# Tiny smoke-test corpus.
train_text = ['hello world', 'foo bar']
train_labels = [0, 1]
test_text = ['hello you', 'not']

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

# Full train / predict / save / load round-trip, kept for reference:
# layers = [
#     Embedding(size=128, n_features=tokenizer.n_features),
#     GatedRecurrent(size=128),
#     Dense(size=1, activation='sigmoid'),
# ]
# model = RNN(layers=layers, cost='BinaryCrossEntropy')
# model.fit(train_tokens, train_labels)
# model.predict(tokenizer.transform(test_text))
# save(model, 'save_test.pkl')
# model = load('save_test.pkl')

print(train_tokens)
import sys

# ---
# ---

print('loading dataset')
d = Dataset(settings['FN_DATASET'], settings['FN_VOCABULARY'])
d.load()

print('generating labeled training set')
train_text, train_labels = d.getNextWordPredTrainset(10)
# for t, l in zip(train_text, train_labels):
#     print t, '->', l

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)
save(train_tokens, settings['FN_TRAINED_TOKENIZER'])

model = RNN(
    layers=[
        Embedding(size=128, n_features=tokenizer.n_features),
        GatedRecurrent(size=128),
        Dense(size=1, activation='sigmoid'),
    ],
    cost='BinaryCrossEntropy',
)
model.fit(train_tokens, train_labels)
save(model, settings['FN_MODEL_NEXTWORDPRED'])
import pandas as pd
from sklearn import metrics
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import load, save
from load import load_gender_data

# Can increase up to 250K or so
trX, teX, trY, teY = load_gender_data(ntrain=10000)

tokenizer = Tokenizer(min_df=10, max_features=50000)
print(trX[1])  # see a blog example
trX = tokenizer.fit_transform(trX)
teX = tokenizer.transform(teX)
print(tokenizer.n_features)

# sigmoid output + bce cost = binary classification.
net_layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh',
                   gate_activation='steeper_sigmoid', init='orthogonal',
                   seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal'),
]
model = RNN(layers=net_layers, cost='bce')

for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)
def train_model(modeltype, delta):
    """Train a recurrent sentiment model on the IMDB review TSV.

    Parameters:
        modeltype: "gated_recurrent" or "lstm_recurrent".
        delta: learning rate passed to the Adadelta updater.

    Returns:
        (tokenizer, model) — the fitted Tokenizer and trained RNN.

    Side effects: pickles the tokenizer and model under ../data/ and prints
    train/test accuracy, precision and recall.
    """
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]
    print("Begin Training")

    df_imdb_reviews = pd.read_csv('../data/imdb_review_data.tsv',
                                  escapechar='\\', delimiter='\t')
    X = clean(df_imdb_reviews['review'].values)
    y = df_imdb_reviews['sentiment'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=42)

    print("Tokenize")
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    X_train = tokenizer.fit_transform(X_train)
    X_train = [[float(tok) for tok in row] for row in X_train]
    X_test = tokenizer.transform(X_test)
    X_test = [[float(tok) for tok in row] for row in X_test]
    # BUG FIX: message typo "featers" -> "features".
    print("Number of features: {}".format(tokenizer.n_features))

    print("Training model")
    if modeltype == "gated_recurrent":
        recurrent = GatedRecurrent(size=512, activation='tanh',
                                   gate_activation='steeper_sigmoid',
                                   init='orthogonal', seq_output=True,
                                   p_drop=0.5)
    else:
        recurrent = LstmRecurrent(size=512, activation='tanh',
                                  gate_activation='steeper_sigmoid',
                                  init='orthogonal', seq_output=True,
                                  p_drop=0.5)
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        recurrent,
        Dense(size=1, activation='sigmoid', init='orthogonal'),
    ]

    # bce is classification loss for binary classification and sigmoid output.
    # BUG FIX: original read `updater=Adadelta, (lr=delta)` — a syntax error;
    # the updater must be an Adadelta *instance* built with the learning rate.
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=delta))
    model.fit(X_train, y_train, n_epochs=20)

    # BUG FIX: original rebound `vectorizer`/`model` to pickle.dump's return
    # value (None), so every model.predict below raised and the bare except
    # swallowed it ("Unable to perform metrics" on every run).
    with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f:
        pickle.dump(tokenizer, f)
    with open('../data/{}_model_delta_{}._pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f:
        pickle.dump(model, f)

    try:
        y_pred_te = model.predict(X_test).flatten() >= 0.5
        y_pred_tr = model.predict(X_train).flatten() >= 0.5
        print('Test Accuracy: {}'.format(accuracy_score(y_test, y_pred_te)))
        print('Test Precision: {}'.format(precision_score(y_test, y_pred_te)))
        print('Test Recall: {}'.format(recall_score(y_test, y_pred_te)))
        print('Train Accuracy: {}'.format(accuracy_score(y_train, y_pred_tr)))
        print('Train Precision: {}'.format(precision_score(y_train, y_pred_tr)))
        print('Train Recall: {}'.format(recall_score(y_train, y_pred_tr)))
    except Exception:
        # Metrics are best-effort; keep the original behavior of not failing
        # the whole training run, but no longer mask SystemExit/KeyboardInterrupt.
        print("Unable to perform metrics")

    return tokenizer, model
# NOTE(review): fragment — the opening fetch_20newsgroups(...) call for the
# training split begins before this chunk; these are its trailing arguments.
remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',
    remove=('headers', 'footers', 'quotes'), categories=categories)
print len(newsgroups_train.data), len(newsgroups_test.data)

from sklearn import metrics
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save

# Tokenize train and test text with a shared vocabulary fitted on train only.
tokenizer = Tokenizer(min_df=10, max_features=50000)
X_train = tokenizer.fit_transform(newsgroups_train.data)
X_test = tokenizer.transform(newsgroups_test.data)
Y_train = newsgroups_train.target
Y_test = newsgroups_test.target
print tokenizer.n_features

# NOTE(review): fragment — the layer list is cut off here; its closing
# bracket and the model construction lie outside this chunk.
layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
        init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal') # sigmoid for binary classification
# NOTE(review): fragment — the opening fetch_20newsgroups(...) call for the
# training split begins before this chunk; these are its trailing arguments.
remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',
    remove=('headers', 'footers', 'quotes'), categories=categories)
print len(newsgroups_train.data), len(newsgroups_test.data)

from sklearn import metrics
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save

# Tokenize train and test text with a shared vocabulary fitted on train only.
tokenizer = Tokenizer(min_df=10, max_features=50000)
X_train = tokenizer.fit_transform(newsgroups_train.data)
X_test = tokenizer.transform(newsgroups_test.data)
Y_train = newsgroups_train.target
Y_test = newsgroups_test.target
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
        init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal') # sigmoid for binary classification
]
model = RNN(layers=layers, cost='bce') # bce is classification loss for binary classification and sigmoid output

# NOTE(review): fragment — the body of this loop lies outside this chunk.
for i in range(2):
def predict(model, test_text, tokenizer=None):
    """Predict scores for raw text with a trained Passage RNN model.

    Parameters:
        model: a trained passage RNN model.
        test_text: iterable of raw text strings to score.
        tokenizer: optional Tokenizer already fitted on the training text.
            If provided, the test text is transformed with the training
            vocabulary (the correct usage). If None, a fresh Tokenizer is
            fitted on the test text itself — this preserves the original
            behavior, but the resulting token ids do not match the
            vocabulary the model was trained on, so scores are unreliable.
            Callers should pass the training-time tokenizer.

    Returns:
        Whatever model.predict yields for the tokenized input
        (per the other snippets here, an array of probabilities).
    """
    if tokenizer is None:
        # Original (questionable) behavior: fit a brand-new vocabulary on
        # the test text.
        tokens = Tokenizer().fit_transform(test_text)
    else:
        tokens = tokenizer.transform(test_text)
    return model.predict(tokens)