import numpy as np

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, LstmRecurrent, Dense
from passage.models import RNN
from passage.updates import Adadelta


def main(ptrain, ntrain, ptest, ntest, out, modeltype):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]
    print("Using the %s model ..." % modeltype)

    print("Loading data ...")
    # load_data is assumed to be defined elsewhere in this script; it returns
    # (texts, labels) built from the positive/negative example files.
    trX, trY = load_data(ptrain, ntrain)
    teX, teY = load_data(ptest, ntest)

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    teX = tokenizer.transform(teX)

    print("Training ...")
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    # Predict the probabilities of positive labels.
    print("Predicting ...")
    pr_teX = model.predict(teX).flatten()
    predY = np.ones(len(teY))
    predY[pr_teX < 0.5] = -1

    with open(out, "w") as f:
        for lab, pos_pr, neg_pr in zip(predY, pr_teX, 1 - pr_teX):
            f.write("%d %f %f\n" % (lab, pos_pr, neg_pr))
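
# A minimal command-line entry point for main() above -- a sketch, not the
# original script's actual interface; the flag names are illustrative assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train a Passage RNN sentiment model.")
    parser.add_argument("--ptrain", required=True, help="file of positive training examples")
    parser.add_argument("--ntrain", required=True, help="file of negative training examples")
    parser.add_argument("--ptest", required=True, help="file of positive test examples")
    parser.add_argument("--ntest", required=True, help="file of negative test examples")
    parser.add_argument("--out", required=True, help="where to write predictions")
    parser.add_argument("--modeltype", default="gated_recurrent",
                        choices=["gated_recurrent", "lstm_recurrent"])
    args = parser.parse_args()
    main(args.ptrain, args.ntrain, args.ptest, args.ntest, args.out, args.modeltype)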

import numpy as np
import pandas as pd
from lxml import html

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.updates import Adadelta


def clean(texts):
    # Strip HTML markup, lowercase, and trim whitespace.
    return [html.fromstring(text).text_content().lower().strip() for text in texts]


if __name__ == "__main__":
    tr_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
    trX = clean(tr_data['review'].values)
    trY = tr_data['sentiment'].values
    print("Training data loaded and cleaned.")

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    print("Training data tokenized.")

    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                       init='orthogonal', seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    te_data = pd.read_csv('testData.tsv', delimiter='\t')
    ids = te_data['id'].values
    teX = clean(te_data['review'].values)
    teX = tokenizer.transform(teX)
    pr_teX = model.predict(teX).flatten()

    pd.DataFrame(np.asarray([ids, pr_teX]).T).to_csv(
        'submission.csv', index=False, header=["id", "sentiment"])
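
    # Optional sanity check -- a sketch, not part of the original script: confirm
    # the submission file has the expected shape before uploading.
    check = pd.read_csv('submission.csv')
    assert list(check.columns) == ['id', 'sentiment']
    assert len(check) == len(ids)
    print("Submission looks well-formed: %d rows." % len(check))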

import pandas as pd
from sklearn import metrics

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import load, save

from load import load_gender_data

trX, teX, trY, teY = load_gender_data(ntrain=10000)  # Can increase up to 250K or so

tokenizer = Tokenizer(min_df=10, max_features=50000)
print trX[1]  # see a blog example
trX = tokenizer.fit_transform(trX)
teX = tokenizer.transform(teX)
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce is the classification loss for binary classification with sigmoid output

for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)

    tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

    print i, tr_acc, te_acc  # progress after each pass over the data
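
# Persisting the fitted model -- a sketch using passage.utils.save/load (already
# imported above); the filename is an illustrative choice, not from the original.
save(model, 'gender_model.pkl')
model = load('gender_model.pkl')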

import pickle
from itertools import product

from sklearn import cross_validation

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save

# train_text, train_labels, and a fitted tokenizer are assumed to be prepared
# earlier in the script. The embedding-size grid was not shown in the original
# snippet; the values below are illustrative.
embedding_sizes = [64, 128, 256]
gru_sizes = [10, 20, 50, 100, 200, 1000]
epochs = [1, 3, 5, 7, 10]

for embedding_size, gru_size, num_epochs in product(embedding_sizes, gru_sizes, epochs):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_text, train_labels, test_size=0.1, random_state=0
    )
    layers = [
        Embedding(size=embedding_size, n_features=tokenizer.n_features),
        GatedRecurrent(size=gru_size),
        Dense(size=1, activation="sigmoid"),
    ]
    model = RNN(layers=layers, cost="BinaryCrossEntropy")
    # Fit on the training split only; fitting on all of train_tokens (as the
    # original did) would leak the held-out X_test into training.
    model.fit(tokenizer.transform(X_train), y_train, n_epochs=int(num_epochs))

    modelfile_name = "stubborn_model.paramsearch.embedding{}.gru{}.epoch{}".format(
        embedding_size, gru_size, num_epochs)
    save(model, modelfile_name + ".pkl")
    with open(modelfile_name + "-tokenizer.pkl", "wb") as f:
        pickle.dump(tokenizer, f)

    results = model.predict(tokenizer.transform(X_test))
    count = 0
    for r, g in zip(results, y_test):
        if int(r >= 0.5) == int(g):
            count += 1
    accuracy = 1.0 * count / len(y_test)  # keep the score distinct from `results`
    print(modelfile_name + "\t" + str(accuracy))
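
# Reloading one of the saved search artifacts -- a sketch; the filename below is
# illustrative and must match a file the loop above actually wrote.
from passage.utils import load
model = load("stubborn_model.paramsearch.embedding128.gru100.epoch5.pkl")
with open("stubborn_model.paramsearch.embedding128.gru100.epoch5-tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)
print(model.predict(tokenizer.transform(["an example document"]))[0])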

import pickle

import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, LstmRecurrent, Dense
from passage.models import RNN
from passage.updates import Adadelta


def train_model(modeltype, delta):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]

    print "Begin Training"
    df_imdb_reviews = pd.read_csv('../data/imdb_review_data.tsv', escapechar='\\', delimiter='\t')
    # clean() is assumed to be defined elsewhere in this script (HTML -> text, lowercase, strip).
    X = clean(df_imdb_reviews['review'].values)
    y = df_imdb_reviews['sentiment'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    print "Tokenize"
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    X_train = tokenizer.fit_transform(X_train)
    X_train = [[float(x) for x in y] for y in X_train]
    X_test = tokenizer.transform(X_test)
    X_test = [[float(x) for x in y] for y in X_test]
    print "Number of features: {}".format(tokenizer.n_features)

    print "Training model"
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # bce is the classification loss for binary classification with sigmoid output
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=delta))
    model.fit(X_train, y_train, n_epochs=20)

    # Persist the tokenizer and model (binary mode for pickle).
    with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'wb') as f:
        pickle.dump(tokenizer, f)
    with open('../data/{}_model_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'wb') as f:
        pickle.dump(model, f)

    try:
        y_pred_te = model.predict(X_test).flatten() >= 0.5
        y_pred_tr = model.predict(X_train).flatten() >= 0.5
        print 'Test Accuracy: {}'.format(accuracy_score(y_test, y_pred_te))
        print 'Test Precision: {}'.format(precision_score(y_test, y_pred_te))
        print 'Test Recall: {}'.format(recall_score(y_test, y_pred_te))
        print 'Train Accuracy: {}'.format(accuracy_score(y_train, y_pred_tr))
        print 'Train Precision: {}'.format(precision_score(y_train, y_pred_tr))
        print 'Train Recall: {}'.format(recall_score(y_train, y_pred_tr))
    except Exception:
        print "Unable to perform metrics"

    return tokenizer, model
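
# Companion loader for the artifacts written above -- a sketch, assuming the same
# modeltype/delta naming scheme; plain pickle mirrors how train_model saved them.
def load_trained_model(modeltype, delta):
    with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'rb') as f:
        tokenizer = pickle.load(f)
    with open('../data/{}_model_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'rb') as f:
        model = pickle.load(f)
    return tokenizer, model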

from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save

# `categories` is assumed to be defined earlier and to hold two newsgroup names,
# since the network below ends in a single sigmoid unit.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)
print len(newsgroups_train.data), len(newsgroups_test.data)

tokenizer = Tokenizer(min_df=10, max_features=50000)
X_train = tokenizer.fit_transform(newsgroups_train.data)
X_test = tokenizer.transform(newsgroups_test.data)
Y_train = newsgroups_train.target
Y_test = newsgroups_test.target
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce is the classification loss for binary classification with sigmoid output

for i in range(2):
    model.fit(X_train, Y_train, n_epochs=1)
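
# Held-out evaluation -- a sketch mirroring the accuracy check in the gender
# example above; assumes the two-category (binary) setup noted earlier.
te_preds = model.predict(X_test)
te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5)
print te_acc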

#!/usr/bin/env python
# coding=utf-8
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

train_text = ['hello world', 'foo bar']
train_labels = [0, 1]
test_text = ['good man']

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

print model.predict(tokenizer.transform(test_text))
save(model, 'save_test.pkl')
model = load('save_test.pkl')
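
# Round-trip check -- a sketch, not part of the original example: the reloaded
# model should reproduce the prediction printed above.
print model.predict(tokenizer.transform(test_text))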

import joblib
import pandas as pd

import passage.utils
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.updates import Adadelta

# PASSAGE_RNN_MODEL, PASSAGE_TOKENIZER, PASSAGE_CHAR_RNN_MODEL, and
# PASSAGE_CHAR_TOKENIZER are assumed to be file-path constants defined at module level.


def train_and_save_passage_tokenizer_and_rnn_model(x_train, y_train, x_test, character_model=False):
    """Train and save Passage tokenizer and Passage RNN model.

    x_train and x_test should each be a series that's already been pre-processed:
    html->text, lowercase, removed punct/#s.

    x_train + x_test are used to build the tokenizer.

    Note that the character-based RNN is a work in progress and not actually
    implemented as of now.
    """
    # We assume the train/test reviews have been preprocessed: html->text, lowercased,
    # punctuation/numbers removed. Note that in
    # https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    # they only extract text from html, lowercase, and strip (no punctuation removal).

    # Tokenization: assign each word in the reviews an ID to be used in all reviews.
    tokenizer = Tokenizer(min_df=10, max_features=100000, character=character_model)
    train_reviews_list = x_train.tolist()
    tokenizer.fit(train_reviews_list + x_test.tolist())

    # Tokenize training reviews (so we can fit the RNN model on them).
    train_reviews_tokenized = tokenizer.transform(train_reviews_list)

    # Based on https://github.com/vinhkhuc/kaggle-sentiment-popcorn/blob/master/scripts/passage_nn.py,
    # which is based on https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    #
    # RNN network:
    # - Each tokenized review is converted into a sequence of words, where each word
    #   has a 256-dimensional embedding representation.
    # - The RNN layer (GRU) attempts to find patterns in the sequence of words.
    # - The final dense layer acts as a logistic classifier, turning the RNN output
    #   into a probability/prediction.
    if not character_model:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            # May replace with LstmRecurrent for an LSTM layer
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        # Character-level RNN. The idea is to convert character tokenizations into
        # one-hot encodings, in which case the embedding layer is no longer needed.
        train_reviews_tokenized = map(
            lambda r_indexes: pd.get_dummies(
                r_indexes, columns=range(tokenizer.n_features + 1)).values,
            train_reviews_tokenized)
        layers = [
            # May replace with LstmRecurrent for an LSTM layer
            GatedRecurrent(size=100, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # The RNN classifier uses binary cross-entropy as the cost function.
    classifier = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))

    NUM_EPOCHS = 10  # 10 epochs may take 10+ hours to run depending on the machine
    classifier.fit(train_reviews_tokenized, y_train.tolist(), n_epochs=NUM_EPOCHS)

    # Store the model and tokenizer.
    if character_model:
        passage.utils.save(classifier, PASSAGE_CHAR_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_CHAR_TOKENIZER, compress=9)
    else:
        passage.utils.save(classifier, PASSAGE_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_TOKENIZER, compress=9)
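
# Companion loader -- a sketch, assuming the same module-level path constants;
# passage.utils.load restores the RNN and joblib.load restores the tokenizer.
def load_passage_tokenizer_and_rnn_model(character_model=False):
    if character_model:
        classifier = passage.utils.load(PASSAGE_CHAR_RNN_MODEL)
        tokenizer = joblib.load(PASSAGE_CHAR_TOKENIZER)
    else:
        classifier = passage.utils.load(PASSAGE_RNN_MODEL)
        tokenizer = joblib.load(PASSAGE_TOKENIZER)
    return tokenizer, classifier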