def fit(self, trX, trY, batch_size=64, n_epochs=1, len_filter=LenFilter(),
        snapshot_freq=1, path=None):
    """Train model on given training examples and return the list of costs
    after each minibatch is processed.

    Args:
        trX (list) -- Inputs
        trY (list) -- Outputs
        batch_size (int, optional) -- number of examples in a minibatch (default 64)
        n_epochs (int, optional) -- number of epochs to train for (default 1)
        len_filter (object, optional) -- object to filter training examples by length (default LenFilter())
        snapshot_freq (int, optional) -- number of epochs between saving model snapshots (default 1)
        path (str, optional) -- prefix of path where model snapshots are saved.
            If None, no snapshots are saved (default None)

    Returns:
        list -- costs of model after processing each minibatch
    """
    if len_filter is not None:
        trX, trY = len_filter.filter(trX, trY)
    trY = standardize_targets(trY, cost=self.cost)

    n = 0.
    t = time()
    costs = []
    for e in range(n_epochs):
        epoch_costs = []
        for xmb, ymb in self.iterator.iterXY(trX, trY):
            c = self._train(xmb, ymb)
            epoch_costs.append(c)
            n += len(ymb)
            if self.verbose >= 2:
                n_per_sec = n / (time() - t)
                n_left = len(trY) - n % len(trY)
                time_left = n_left / n_per_sec
                sys.stdout.write("\rEpoch %d Seen %d samples Avg cost %0.4f Time left %d seconds" % (
                    e, n, np.mean(epoch_costs[-250:]), time_left))
                sys.stdout.flush()
        costs.extend(epoch_costs)

        status = "Epoch %d Seen %d samples Avg cost %0.4f Time elapsed %d seconds" % (
            e, n, np.mean(epoch_costs[-250:]), time() - t)
        if self.verbose >= 2:
            sys.stdout.write("\r" + status)
            sys.stdout.flush()
            sys.stdout.write("\n")
        elif self.verbose == 1:
            print(status)
        if path and e % snapshot_freq == 0:
            save(self, "{0}.{1}".format(path, e))
    return costs
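# --- Usage sketch for the fit() above (not from the original source) --------
# A minimal, hedged example assuming the Passage library's public API, which
# mirrors the small example later in this collection; the toy data and the
# snapshot prefix 'snap/model' are invented for illustration.
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN

train_text = ['hello world', 'foo bar']   # hypothetical toy inputs
train_labels = [0, 1]

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)
model = RNN(layers=[Embedding(size=128, n_features=tokenizer.n_features),
                    GatedRecurrent(size=128),
                    Dense(size=1, activation='sigmoid')],
            cost='BinaryCrossEntropy')

# fit() returns one cost per minibatch; with path set, a snapshot is saved
# every snapshot_freq epochs as 'snap/model.0', 'snap/model.1', ...
costs = model.fit(train_tokens, train_labels, n_epochs=2,
                  snapshot_freq=1, path='snap/model')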
import pickle

from sklearn.preprocessing import Binarizer
from passage.utils import save

model = train_RNN(tokenizer, X_tokens, y_train)
y_pred_tr = model.predict(X_tokens).flatten()

# Check overall performance
test_tokens = tokenizer.transform(X_test)
y_pred_tst = model.predict(test_tokens).flatten()

# Convert predictions to binary
yhat_train = y_pred_tr.reshape(-1, 1)
yhat_test = y_pred_tst.reshape(-1, 1)
binarizer = Binarizer(threshold=0.5).fit(yhat_train)
yhat_tr_b = binarizer.transform(yhat_train).astype(int)
yhat_tst_b = binarizer.transform(yhat_test).astype(int)

save(model, 'review_score_full.pkl')
with open('review_tokenizer_full.pkl', 'wb') as fileObject:
    pickle.dump(tokenizer, fileObject)

# # Save model for future use
# save(model, 'review_scorer1.pkl')
# # model = load('review_scorer.pkl')
# with open('review_tokenizer1.pkl', 'wb') as fileObject:
#     pickle.dump(tokenizer, fileObject)

# Scorers to consider
# score()
# Scoring method: roc_auc_score
# Train: 0.99, Test: 0.97
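# --- Companion sketch (not from the original source) ------------------------
# A hedged example of reloading the artifacts saved above to score new text.
# load() comes from passage.utils (used elsewhere in this collection); the
# `new_reviews` variable is invented for illustration.
import pickle
from passage.utils import load

model = load('review_score_full.pkl')
with open('review_tokenizer_full.pkl', 'rb') as fileObject:
    tokenizer = pickle.load(fileObject)

new_reviews = ['great product, would buy again']   # hypothetical input
scores = model.predict(tokenizer.transform(new_reviews)).flatten()
labels = (scores > 0.5).astype(int)   # same 0.5 cutoff as the Binarizer above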
import sys

# ---

print 'loading dataset'
d = Dataset(settings['FN_DATASET'], settings['FN_VOCABULARY'])
d.load()

print 'generating labeled training set'
train_text, train_labels = d.getNextWordPredTrainset(10)
#for t, l in zip(train_text, train_labels):
#    print t, '->', l

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)
save(train_tokens, settings['FN_TRAINED_TOKENIZER'])

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

save(model, settings['FN_MODEL_NEXTWORDPRED'])
teX = tokenizer.transform(teX)
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh',
                   gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce is the classification loss for binary classification with sigmoid output

for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)
    tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(teY, te_preds > 0.5)
    print i, tr_acc, te_acc

save(model, 'save_test.pkl')    # How to save

model = load('save_test.pkl')   # How to load

tr_preds = model.predict(trX[:len(teY)])
te_preds = model.predict(teX)
tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
te_acc = metrics.accuracy_score(teY, te_preds > 0.5)
print tr_acc, te_acc
def fit(self, trX, trY, valX, valY, n_epochs=1, early_stopping=None,
        len_filter=LenFilter(max_len=10000, percentile=100),
        snapshot_freq=1, path=None):
    """Train model on given training examples, tracking validation loss,
    and optionally stop early.

    Args:
        trX (list) -- Inputs
        trY (list) -- Outputs
        valX (list) -- Validation inputs
        valY (list) -- Validation outputs
        n_epochs (int, optional) -- number of epochs to train for (default 1)
        early_stopping (int, optional) -- number of consecutive qualifying epochs
            above the minimum validation loss before stopping (default None; no early stopping)
        len_filter (object, optional) -- object to filter training examples by length
            (default LenFilter(max_len=10000, percentile=100))
        snapshot_freq (int, optional) -- number of epochs between saving model snapshots (default 1)
        path (str, optional) -- prefix of path where model snapshots are saved.
            If None, no snapshots are saved (default None)

    Returns:
        tuple -- (mean training cost per epoch, validation loss per epoch,
            sensitivity per epoch, specificity per epoch)
    """
    if len_filter is not None:
        trX, trY = len_filter.filter(trX, trY)
    trY = standardize_targets(trY, cost=self.cost)

    n = 0.
    stats = []
    t = time()
    costs = []
    valY = np.asarray(valY)
    self.valcosts = []
    sensitivity = []
    specificity = []
    training_costs = []
    min_val = float('inf')
    min_train = float('inf')
    stopping_count = 0
    for e in range(n_epochs):
        weights = []
        epoch_costs = []
        for xmb, ymb in self.iterator.iterXY(trX, trY):
            c = self._train(xmb, ymb)
            epoch_costs.append(c)
            n += len(ymb)
            if self.verbose >= 2:
                n_per_sec = n / (time() - t)
                n_left = len(trY) - n % len(trY)
                time_left = n_left / n_per_sec
                sys.stdout.write("\rEpoch %d Seen %d samples Avg cost %0.4f Time left %d seconds" % (
                    e, n, np.mean(epoch_costs[-250:]), time_left))
                sys.stdout.flush()

        # dump the weights of every layer and abort if any have gone NaN
        weights.append('Epoch: {0} samples: {1} avg cost: {2}'.format(
            e, n, np.mean(epoch_costs[-250:])))
        for layer in self.settings['layers']:
            try:
                w = layer['config']['weights']
            except TypeError:
                w = [p.get_value() for p in layer.params]
            for p in w:
                weights.append(str(p))
                if np.any(np.isnan(p)):
                    err_file = 'error_0.txt'
                    i = 1
                    while exists(err_file):
                        err_file = 'err_{0}.txt'.format(i)
                        i += 1
                    with open(err_file, 'w') as out:
                        out.write('\n'.join(weights))
                    raise Exception('NaN weights')

        costs.extend(epoch_costs)
        training_costs.append(np.mean(epoch_costs))

        status = "Epoch %d Seen %d samples Avg cost %0.4f Time elapsed %d seconds" % (
            e, n, np.mean(epoch_costs[-250:]), time() - t)
        if self.verbose >= 2:
            sys.stdout.write("\r" + status)
            sys.stdout.flush()
            sys.stdout.write("\n")
        elif self.verbose == 1:
            print status
        if path and e % snapshot_freq == 0:
            save(self, "{0}.{1}".format(path, e))

        preds = self.batch_predict(valX)
        val_loss, sens, spec = self.val_loss_accuracy(preds, valY)
        print "Validation loss:", val_loss
        print "Sensitivity:", [round(x, 4) for x in sens]
        print "Specificity:", [round(x, 4) for x in spec]
        self.valcosts.append(val_loss)
        sensitivity.append(sens)
        specificity.append(spec)

        # early stopping
        if early_stopping is not None:
            if val_loss <= min_val:
                min_val = val_loss
                # reset count
                stopping_count = 0
                # keep track of the training cost at the minimum validation loss
                min_train = training_costs[-1]
            elif training_costs[-1] < min_train:
                # only increase the counter if the latest training loss is below
                # the training loss at the minimum validation loss
                stopping_count += 1
                if stopping_count >= early_stopping:
                    break
    return training_costs, self.valcosts, sensitivity, specificity
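# --- Usage sketch for the extended fit() above (not from the original) ------
# Hedged: assumes a model object exposing this signature; valX/valY, the
# patience value, and the snapshot prefix are invented for illustration.
train_costs, val_costs, sensitivity, specificity = model.fit(
    trX, trY, valX, valY,
    n_epochs=50,
    early_stopping=5,         # stop after 5 qualifying epochs past the best val loss
    snapshot_freq=10,
    path='snapshots/model')   # hypothetical snapshot prefix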
def train(X, y):
    from keras.layers.embeddings import Embedding
    from keras.layers.recurrent import LSTM, GRU, SimpleRNN
    from keras.layers.core import Dense
    from keras.models import Sequential
    from keras.layers.core import Dropout
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from math import e

    vocab = 10000
    tokenizer = Tokenizer(nb_words=vocab)
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    """
    index_word = {v: k for k, v in tokenizer.word_index.items()}
    for i in range(1, 10001):
        print str(i) + "," + index_word[i]
    return
    """
    maxlen = 50

    # drop empty sequences and their targets
    X1 = []
    y1 = []
    for thing, target in zip(X, y):
        if len(thing) != 0:
            X1.append(thing)
            y1.append(target)
    X = X1
    y = y1

    KERAS = False
    if KERAS:
        X = pad_sequences(X, maxlen=maxlen)

    # shuffle, then hold out the last 1000 examples for testing
    from random import shuffle
    xy = zip(X, y)
    shuffle(xy)
    X_s, y_s = zip(*xy)
    X_train, y_train, X_test, y_test = X_s[:-1000], y_s[:-1000], X_s[-1000:], y_s[-1000:]

    embedding_size = 256
    dropout = .3
    batch_size = 256
    recurrent_gate_size = 512

    """
    model = Sequential()
    model.add(Embedding(vocab, embedding_size, mask_zero=True))
    model.add(Dropout(dropout))
    model.add(LSTM(recurrent_gate_size))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    print "building model..."
    model.compile(loss="msle", optimizer="rmsprop")
    print "fitting model"
    #model.load_weights("mymodel")
    model.fit(np.asarray(X_train), np.asarray(y_train), nb_epoch=30, verbose=1,
              batch_size=batch_size,
              validation_data=(np.asarray(X_test), np.asarray(y_test)))
    model.save_weights("mymodel")
    """

    # NOTE: these Passage imports shadow the Keras Embedding/Dense/Tokenizer
    # names imported above; only the Passage branch below uses them.
    from passage.preprocessing import Tokenizer, LenFilter
    from passage.layers import Embedding, GatedRecurrent, Dense, OneHot, LstmRecurrent
    from passage.models import RNN
    from passage.utils import save, load
    from passage.iterators import Padded

    layers = [
        # OneHot(n_features=5),
        Embedding(size=embedding_size, n_features=vocab),
        # GatedRecurrent(size=recurrent_gate_size, seq_output=True, p_drop=dropout),
        # LstmRecurrent(size=recurrent_gate_size, p_drop=dropout),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=8, activation='softmax', p_drop=dropout)
    ]

    print >> sys.stderr, "learning model"
    model_iterator = Padded()
    model = load("mymodel.final.pkl")
    #model = RNN(layers=layers, cost='CategoricalCrossEntropy', verbose=2, updater="Adam")
    filter = LenFilter(max_len=maxlen)
    model.fit(np.asarray(X_train), np.asarray(y_train), batch_size=batch_size,
              n_epochs=1000, path="mymodel.pkl", snapshot_freq=49, len_filter=filter)
    save(model, "mymodel.final.pkl")

    # print "test cost"
    # print model._cost(np.asarray(X_test), np.asarray(y_test))
    print "test accuracy"
    passage_batch_predict(np.asarray(X_train), np.asarray(y_train), model)

    print "enter a sentence"
    while True:
        text = raw_input()
        if text == "exit":
            break
        else:
            tokens = tokenizer.texts_to_sequences([text])
            # texts_to_sequences always returns one list per input text, so
            # check the inner sequence (the original checked the outer list)
            if len(tokens[0]) == 0:
                print "Sentence too strange, try again"
                continue
            if KERAS:
                tokens = pad_sequences(tokens, maxlen=maxlen)
            prediction = np.argmax(model.predict(tokens)[0])
            try:
                print e**(prediction - 2)
            except Exception:
                pass
embedding_sizes = [64, 128, 256]  # assumed values; not defined in the original fragment
gru_sizes = [10, 20, 50, 100, 200, 1000]
epochs = [1, 3, 5, 7, 10]

for embedding_size, gru_size, num_epochs in product(embedding_sizes, gru_sizes, epochs):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_text, train_labels, test_size=0.1, random_state=0)

    layers = [
        Embedding(size=embedding_size, n_features=tokenizer.n_features),
        GatedRecurrent(size=gru_size),
        Dense(size=1, activation="sigmoid"),
    ]
    model = RNN(layers=layers, cost="BinaryCrossEntropy")
    # Fit on the training split only (the original fit on the full token set,
    # which leaks the held-out examples into training).
    model.fit(tokenizer.transform(X_train), y_train, n_epochs=num_epochs)

    modelfile_name = "stubborn_model.paramsearch.embedding{}.gru{}.epoch{}".format(
        embedding_size, gru_size, num_epochs)
    save(model, modelfile_name + ".pkl")
    pickle.dump(tokenizer, open(modelfile_name + "-tokenizer.pkl", "wb"))

    results = model.predict(tokenizer.transform(X_test))
    count = 0
    for r, g in zip(results, y_test):
        if int(r >= 0.5) == int(g):
            count += 1
    accuracy = 1.0 * count / len(y_test)
    print(modelfile_name + "\t" + str(accuracy))
Y_train = newsgroups_train.target
Y_test = newsgroups_test.target

print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh',
                   gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce is the classification loss for binary classification with sigmoid output

for i in range(2):
    model.fit(X_train, Y_train, n_epochs=1)
    tr_preds = model.predict(X_train[:len(Y_test)])
    te_preds = model.predict(X_test)
    tr_acc = metrics.accuracy_score(Y_train[:len(Y_test)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5)
    print i, tr_acc, te_acc  # dataset too small to fully utilize Passage

save(model, 'model.pkl')
tokenizer = Tokenizer(min_df=10, max_features=50000)
trX = tokenizer.fit_transform(trX)
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))
print "number of training examples: " + str(len(trX))

teX = tokenizer.transform(teX)
print "number of features: " + str(tokenizer.n_features)

layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=725),
    Dense(size=10, activation='softmax')
]

model = RNN(layers=layers, cost='cce')
model.fit(trX, trY, n_epochs=10)

save(model, 'modelEcho.pkl')

tr_preds = model.predict(trX)
te_preds = model.predict(teX)

data = pd.DataFrame(trY)
data.to_csv('data/trY.vec')
data = pd.DataFrame(tr_preds)
data.to_csv('data/tr_preds.vec')

tr_acc = np.mean(np.argmax(trY, axis=1) == np.argmax(tr_preds, axis=1))

indexy = np.argmax(teY, axis=1)
data = pd.DataFrame(indexy)
data.to_csv('data/ev_agrmax.txt')
data = pd.DataFrame(np.argmax(te_preds, axis=1))
from passage.models import RNN
from passage.utils import save

tokenizer = Tokenizer(min_df=10, max_features=50000)

X_train = tokenizer.fit_transform(newsgroups_train.data)
X_test = tokenizer.transform(newsgroups_test.data)
Y_train = newsgroups_train.target
Y_test = newsgroups_test.target

print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh',
                   gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce is the classification loss for binary classification with sigmoid output

for i in range(2):
    model.fit(X_train, Y_train, n_epochs=1)
    tr_preds = model.predict(X_train[:len(Y_test)])
    te_preds = model.predict(X_test)
    tr_acc = metrics.accuracy_score(Y_train[:len(Y_test)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5)
    print i, tr_acc, te_acc  # dataset too small to fully utilize Passage

save(model, 'model.pkl')
#!/usr/bin/env python
# coding=utf-8
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

train_text = ['hello world', 'foo bar']
train_labels = [0, 1]
test_text = ['good man']

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

print model.predict(tokenizer.transform(test_text))
save(model, 'save_test.pkl')
model = load('save_test.pkl')
num_feats = len(set(flatten(train_tokens)))

def get_labels(id):
    if id == 3:
        return [1, 0]
    else:
        return [0, 1]

seq_labels = map(lambda (l): map(get_labels, l), train_tokens)

layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax')
]

#iterator = SortedPadded(y_pad=True, y_dtype=intX)
#iterator = SortedPadded(y_dtype=intX)
#model = RNN(layers=layers, cost='seq_cce', iterator=iterator, Y=T.imatrix())
model = RNN(layers=layers, cost='seq_cce')
#model.fit(train_tokens, [1,0,1])
model.fit(train_tokens, train_tokens)
#model.predict(tokenizer.transform(["Frogs are awesome", "frogs are amphibious"]))
model.predict(train_tokens)
save(model, 'save_test.pkl')
model = load('save_test.pkl')

"""
This model, although doing sequential prediction, predicts a tag per document,
not per word.
"""
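# --- Hedged sketch (not from the original author) ---------------------------
# Wiring up the commented-out SortedPadded iterator above for token-level
# targets. This only assembles the pieces the comments already name; it
# assumes SortedPadded lives in passage.iterators and intX in
# passage.theano_utils, and is untested.
from theano import tensor as T
from passage.iterators import SortedPadded
from passage.theano_utils import intX

iterator = SortedPadded(y_pad=True, y_dtype=intX)
seq_model = RNN(layers=layers, cost='seq_cce', iterator=iterator, Y=T.imatrix())
# one target token per input token, so the loss is applied per word
seq_model.fit(train_tokens, train_tokens)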