def build_model(weights=None, embedding_size=256, recurrent_gate_size=512, n_features=5, dropout=0.4):
    """
    build_model

    Inputs:
        weights - Path to a weights file to load, or None if the model
                  should be built from scratch
        embedding_size - Size of the embedding layer
        recurrent_gate_size - Size of the gated recurrent layer
        n_features - Number of features for the embedding layer
        dropout - Dropout value

    Returns:
        A model object ready for training (or evaluation if a previous
        model was loaded via `weights`)
    """
    # vvvvv
    # Modify this if you want to change the structure of the network!
    # ^^^^^
    model_layers = [
        Embedding(size=embedding_size, n_features=n_features),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=1, activation='sigmoid', p_drop=dropout)
    ]
    model = RNN(layers=model_layers, cost='BinaryCrossEntropy', verbose=2, updater='Adam')
    if weights:
        # Discard the freshly built network and load the saved model instead.
        model = load(weights)
    return model
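# A hedged usage sketch for build_model. The sample texts, labels, and the
# commented snapshot path are placeholders, not files that ship with this
# code; only Passage calls already used elsewhere in these snippets appear.
from passage.preprocessing import Tokenizer

tokenizer = Tokenizer(min_df=1)
train_tokens = tokenizer.fit_transform(["a good example", "a bad example"])
train_labels = [1, 0]

# Fresh model sized to the tokenizer's vocabulary:
model = build_model(n_features=tokenizer.n_features)
model.fit(train_tokens, train_labels)

# Or resume from a saved snapshot instead (path is illustrative):
# model = build_model(weights='snapshots/model.pkl')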
def test():
    model_name = "resources/DeepLincM5.pkl.90"
    from tests import load_test_seqs
    from passage.utils import load
    from evaluate import batch_predict
    seqs = load_test_seqs()
    mrnn = load(model_name)
    for p in batch_predict([mrnn], seqs):
        print p
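# Hedged variant of test(): batch_predict is a project-local helper, so the
# exact shape of what it yields is an assumption here -- this sketch treats
# each yielded value as a sigmoid probability and thresholds it at 0.5.
def test_binary_calls(threshold=0.5):
    from tests import load_test_seqs
    from passage.utils import load
    from evaluate import batch_predict
    seqs = load_test_seqs()
    mrnn = load("resources/DeepLincM5.pkl.90")
    for p in batch_predict([mrnn], seqs):
        print int(p >= threshold)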
def sentiment_scorer():
    """
    INPUT: None

    Returns the previously trained RNN sentiment scoring model and the
    tokenizer used.
    """
    print "\nLoading tokenizer"
    with open('review_tokenizer.pkl', 'r') as fileObject:
        tokenizer = pickle.load(fileObject)
    print "\nLoading Recurrent Neural Network model"
    model = load('review_scorer.pkl')
    print "\nDone loading models"
    return model, tokenizer
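# Hedged usage sketch for sentiment_scorer(); the review text is made up and
# the 0.5 cutoff assumes the model ends in a single sigmoid unit.
model, tokenizer = sentiment_scorer()
reviews = ["The service was fantastic and the food even better."]
for review, score in zip(reviews, model.predict(tokenizer.transform(reviews))):
    print review, score, ("positive" if score >= 0.5 else "negative")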
teX = tokenizer.transform(teX)
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

# bce is the classification loss for binary classification with a sigmoid output
model = RNN(layers=layers, cost='bce')

for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)
    tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(teY, te_preds > 0.5)
    print i, tr_acc, te_acc

save(model, 'save_test.pkl')  # How to save
model = load('save_test.pkl')  # How to load

tr_preds = model.predict(trX[:len(teY)])
te_preds = model.predict(teX)
tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
te_acc = metrics.accuracy_score(teY, te_preds > 0.5)
print tr_acc, te_acc
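# Hedged sanity check on the save/load round trip above: the reloaded model's
# raw predictions should match the ones just computed (numpy is imported here
# in case the surrounding script only pulled in sklearn's `metrics`).
import numpy as np
reloaded = load('save_test.pkl')
print np.allclose(te_preds, reloaded.predict(teX))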
def train(X, y):
    import sys
    import numpy as np
    from keras.layers.embeddings import Embedding
    from keras.layers.recurrent import LSTM, GRU, SimpleRNN
    from keras.layers.core import Dense
    from keras.models import Sequential
    from keras.layers.core import Dropout
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from math import e

    vocab = 10000
    tokenizer = Tokenizer(nb_words=vocab)
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    """
    index_word = {v: k for k, v in tokenizer.word_index.items()}
    for i in range(1, 10001):
        print str(i) + "," + index_word[i]
    return
    """
    maxlen = 50

    # Drop empty sequences together with their targets.
    X1 = []
    y1 = []
    for thing, target in zip(X, y):
        if len(thing) != 0:
            X1.append(thing)
            y1.append(target)
    X = X1
    y = y1

    KERAS = False
    if KERAS:
        X = pad_sequences(X, maxlen=maxlen)

    # Shuffle, then hold out the last 1000 examples as a test set.
    from random import shuffle
    xy = zip(X, y)
    shuffle(xy)
    X_s, y_s = zip(*xy)
    X_train, y_train, X_test, y_test = (X_s[:-1000], y_s[:-1000],
                                        X_s[-1000:], y_s[-1000:])

    embedding_size = 256
    dropout = .3
    batch_size = 256
    recurrent_gate_size = 512

    """
    model = Sequential()
    model.add(Embedding(vocab, embedding_size, mask_zero=True))
    model.add(Dropout(dropout))
    model.add(LSTM(recurrent_gate_size))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    print "building model..."
    model.compile(loss="msle", optimizer="rmsprop")
    print "fitting model"
    #model.load_weights("mymodel")
    model.fit(np.asarray(X_train), np.asarray(y_train), nb_epoch=30, verbose=1,
              batch_size=batch_size,
              validation_data=(np.asarray(X_test), np.asarray(y_test)))
    model.save_weights("mymodel")
    """

    from passage.preprocessing import Tokenizer, LenFilter
    from passage.layers import Embedding, GatedRecurrent, Dense, OneHot, LstmRecurrent
    from passage.models import RNN
    from passage.utils import save, load
    from passage.iterators import Padded

    layers = [
        # OneHot(n_features=5),
        Embedding(size=embedding_size, n_features=vocab),
        # GatedRecurrent(size=recurrent_gate_size, seq_output=True, p_drop=dropout),
        # LstmRecurrent(size=recurrent_gate_size, p_drop=dropout),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=8, activation='softmax', p_drop=dropout)
    ]

    print >> sys.stderr, "learning model"
    model_iterator = Padded()
    model = load("mymodel.final.pkl")
    #model = RNN(layers=layers, cost='CategoricalCrossEntropy', verbose=2, updater="Adam")
    filter = LenFilter(max_len=maxlen)
    model.fit(np.asarray(X_train), np.asarray(y_train), batch_size=batch_size,
              n_epochs=1000, path="mymodel.pkl", snapshot_freq=49, len_filter=filter)
    save(model, "mymodel.final.pkl")

    # print "test cost"
    # print model._cost(np.asarray(X_test), np.asarray(y_test))
    print "test accuracy"
    passage_batch_predict(np.asarray(X_train), np.asarray(y_train), model)

    exit = False
    print "enter a sentence"
    while not exit:
        text = raw_input()
        if text == "exit":
            break
        else:
            tokens = tokenizer.texts_to_sequences([text])
            if len(tokens) == 0:
                print "Sentence too strange, try again"
                continue
            if KERAS:
                tokens = pad_sequences(tokens, maxlen=maxlen)
            prediction = np.argmax(model.predict(tokens)[0])
            try:
                print e ** (prediction - 2)
            except Exception:
                pass
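# `passage_batch_predict` is called above but not defined in this snippet; a
# hedged sketch of what such a helper might look like, assuming the model
# emits one softmax row per example and y holds integer class ids.
import numpy as np

def passage_batch_predict(X, y, model, batch_size=256):
    correct = 0
    for start in range(0, len(X), batch_size):
        preds = np.argmax(model.predict(X[start:start + batch_size]), axis=1)
        correct += np.sum(preds == np.asarray(y[start:start + batch_size]))
    print "accuracy:", correct / float(len(X))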
ntest=1000)  # Can increase up to 250K or so

print len(trX), len(trY), len(teX), len(teY)
print teX.shape  # shape is an attribute, not a method

tokenizer = Tokenizer(min_df=10, max_features=50000)
#print trX[1]  # see a blog example
trX = tokenizer.fit_transform(trX)
text = "Evropa je v jeho politika naprosto impotent ."
teX = tokenizer.transform([text])  # transform expects a list of texts
print "number of tokens: " + str(len(trX))
print "number of features: " + str(tokenizer.n_features)

layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, p_drop=0.2),
    Dense(size=10, activation='softmax', p_drop=0.5)
]

# cce is the classification loss for multi-class classification with a softmax output
model = RNN(layers=layers, cost='cce')
model = load('modelEcho.pkl')  # How to load

te_pred = model.predict(teX)
#tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
#te_acc = metrics.accuracy_score(teY, te_preds > 0.5)
print te_pred
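# Hedged follow-up: te_pred above is one softmax row per input, so the argmax
# gives the predicted class id (the class names themselves are not defined in
# this snippet).
import numpy as np
print "predicted class:", np.argmax(te_pred, axis=1)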
# -*- coding: utf-8 -*-
from settings import getSettings
settings = getSettings()

# from passage.preprocessing import Tokenizer
from passage.utils import load

# ---
tokenizer = load(settings['FN_TRAINED_TOKENIZER'])
model = load(settings['FN_MODEL_NEXTWORDPRED'])

while True:
    sentence = raw_input('>')
    # The prediction is computed but not yet decoded into a word; this loop
    # is still a stub.
    model.predict(tokenizer.transform([sentence]))
    print 'best next word for <%s>: None' % (sentence)
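# Hedged sketch of decoding the discarded prediction above into a word. It
# assumes the model's last layer is a softmax over the vocabulary and that the
# trained Passage tokenizer exposes a word->index dict named `encoder`; if
# your Passage version names it differently, adapt the lookup.
import numpy as np

def best_next_word(sentence, model, tokenizer):
    probs = model.predict(tokenizer.transform([sentence]))[0]
    index_to_word = dict((i, w) for w, i in tokenizer.encoder.items())
    return index_to_word.get(int(np.argmax(probs)), '<unknown>')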
import io
import random
import sys

try:
    import cPickle as pickle
except ImportError:
    import pickle

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

random.seed(0)

testfile, modelfile, tokenizerfile = sys.argv[1:]

test_text = []
with io.open(testfile, 'r', encoding='utf8') as fin:
    for text in fin:
        test_text.append(text.strip())

tokenizer = pickle.load(open(tokenizerfile, 'rb'))
model = load(modelfile)

results = model.predict(tokenizer.transform(test_text))
for r in results:
    print (int(r >= 0.5))
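# Hedged usage note -- the script expects three positional arguments and
# prints one binary label per input line; the file names are illustrative:
#
#     python predict.py test_sentences.txt model.pkl tokenizer.pkl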
#!/usr/bin/env python
# coding=utf-8
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

train_text = ['hello world', 'foo bar']
train_labels = [0, 1]
test_text = ['good man']

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

print model.predict(tokenizer.transform(test_text))
save(model, 'save_test.pkl')
model = load('save_test.pkl')
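# Hedged follow-up to the minimal example above: the sigmoid output is a
# probability, so thresholding it at 0.5 yields a hard 0/1 label.
probs = model.predict(tokenizer.transform(test_text))
print [int(p >= 0.5) for p in probs]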
num_feats = len(set(flatten(train_tokens)))

def get_labels(id):
    if id == 3:
        return [1, 0]
    else:
        return [0, 1]

seq_labels = map(lambda l: map(get_labels, l), train_tokens)

layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax')
]

#iterator = SortedPadded(y_pad=True, y_dtype=intX)
#iterator = SortedPadded(y_dtype=intX)
#model = RNN(layers=layers, cost='seq_cce', iterator=iterator, Y=T.imatrix())
model = RNN(layers=layers, cost='seq_cce')
#model.fit(train_tokens, [1,0,1])
model.fit(train_tokens, train_tokens)
#model.predict(tokenizer.transform(["Frogs are awesome", "frogs are amphibious"]))
model.predict(train_tokens)
save(model, 'save_test.pkl')
model = load('save_test.pkl')

# This model, although doing sequential prediction, predicts a tag per
# document, not per word.
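# Hedged sketch of the per-word variant that the commented-out lines above
# gesture at: seq_output=True plus a padded iterator applies the cost at every
# time step. The `intX` import path and iterator arguments follow those
# commented lines and may vary across Passage versions.
import theano.tensor as T
from passage.iterators import SortedPadded
from passage.theano_utils import intX

seq_layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax')
]
iterator = SortedPadded(y_pad=True, y_dtype=intX)
seq_model = RNN(layers=seq_layers, cost='seq_cce', iterator=iterator, Y=T.imatrix())
seq_model.fit(train_tokens, train_tokens)  # target: predict each token per position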
model = load(modelfile_name + '.pkl')
results = model.predict(train_tokens)

count = 0
for r, g in zip(results, train_labels):
    if int(r >= 0.5) == int(g):
        count += 1
results = 1.0 * count / len(train_labels)
print (modelfile_name + '\t' + str(results))

'''
models = []
predictions = []
for embedding_size, gru_size, num_epochs in best_hyperparams:
    modelfile_name = 'stubborn_model.gridsearch.embedding{}.gru{}.epoch{}'.format(embedding_size, gru_size, num_epochs)
    model = load(modelfile_name + '.pkl')
    models.append(model)

with io.open('ensemble.train', 'w') as fout:
    for instance, label in zip(train_tokens, train_labels):
        line = " ".join([str(model.predict([instance])[0][0]) for model in models])
        fout.write(unicode(line + " " + str(label) + '\n'))
'''

testfile = 'cwi_test.lemmatized.txt'
test_text = []
with io.open(testfile, 'r', encoding='utf8') as fin:
    for text in fin:
        test_text.append(text.strip())
test_tokens = tokenizer.transform(test_text)
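# Hedged companion to the commented-out ensemble block above: the same
# score-writing loop applied to the test tokens (there are no gold labels
# here, so only the per-model predictions are written; 'ensemble.test' is an
# illustrative file name). Kept commented out, like the block it mirrors.
'''
with io.open('ensemble.test', 'w') as fout:
    for instance in test_tokens:
        line = " ".join([str(model.predict([instance])[0][0]) for model in models])
        fout.write(unicode(line + '\n'))
'''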
print "Loading tagger..." taggerMorp = Tagger.load(sys.argv[1]) if not taggerMorp: sys.stderr.write("Cannot load tagger from file '%s'\n" % sys.argv[1]) sys.exit(1) # Init RNN print "Loading RNN..." tokenizerRNN = pickle.load(open('tokenizer.pkl', 'rb')) layers = [ Embedding(size=256, n_features=tokenizerRNN.n_features), GatedRecurrent(size=512, p_drop=0.2), Dense(size=10, activation='softmax', p_drop=0.5) ] model = RNN( layers=layers, cost='cce' ) # bce is classification loss for binary classification and sigmoid output model = load(sys.argv[2]) # How to load print "RNN loaded" s.listen(10) print 'Socket now listening' ####################################################################################################### while 1: conn, addr = s.accept() print 'Connected with ' + addr[0] + ':' + str(addr[1]) start_new_thread(clientthread, (conn, )) s.close()