Example #1
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import load


def build_model(weights=None,
                embedding_size=256,
                recurrent_gate_size=512,
                n_features=5,
                dropout=0.4):
    """
    build_model

    Inputs:
        weights - Path to a weights file to load, or None if the model should be built from scratch
        embedding_size - Size of the embedding layer
        recurrent_gate_size - Size of the gated recurrent layer
        n_features - Number of features for the embedding layer
        dropout - Dropout value

    Returns:
        A model object ready for training (or evaluation if a previous model was loaded via `weights`)
    """
    # vvvvv
    # Modify this if you want to change the structure of the network!
    # ^^^^^
    model_layers = [
        Embedding(size=embedding_size, n_features=n_features),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=1, activation='sigmoid', p_drop=dropout)
    ]
    model = RNN(layers=model_layers,
                cost='BinaryCrossEntropy',
                verbose=2,
                updater='Adam')
    if weights:  # Load a previously trained model instead of the freshly built one
        model = load(weights)
    return model
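A minimal usage sketch for build_model; the data names trX/trY and the file name deeplinc.pkl below are placeholders, not from the original code:

from passage.utils import save

model = build_model()                        # fresh model
model.fit(trX, trY)                          # trX: tokenized sequences, trY: labels
save(model, 'deeplinc.pkl')
model = build_model(weights='deeplinc.pkl')  # reload for evaluation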
Example #2
def test():
    model_name = "resources/DeepLincM5.pkl.90"
    from tests import load_test_seqs
    from passage.utils import load
    from evaluate import batch_predict

    seqs = load_test_seqs()
    mrnn = load(model_name)
    for p in batch_predict([mrnn], seqs):
        print p
Example #3
import pickle

from passage.utils import load


def sentiment_scorer():
    """
    INPUT: None
    Returns the previously trained RNN sentiment scoring model and the tokenizer used
    """
    print "\nLoading tokenizer"

    with open('review_tokenizer.pkl', 'rb') as fileObject:
        tokenizer = pickle.load(fileObject)

    print "\nLoading Recurrent Neral Network model"

    model = load('review_scorer.pkl')

    print "\nDone loading models"

    return model, tokenizer
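A hedged usage sketch for sentiment_scorer; the review text is an invented placeholder:

model, tokenizer = sentiment_scorer()
score = model.predict(tokenizer.transform(["Great movie, would watch again."]))
print score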
Example #5
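This snippet begins mid-script: the fitted tokenizer and the trX/trY/teX/teY splits come from lines that were cut off. Judging by Passage's README example, the missing preamble would look roughly like this (the data loader itself is not shown and is an assumption):

from sklearn import metrics

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

tokenizer = Tokenizer(min_df=10, max_features=50000)
trX = tokenizer.fit_transform(trX)  # trX/trY/teX/teY come from a data loader (not shown)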
teX = tokenizer.transform(teX)
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce = binary cross-entropy, the loss matching a sigmoid output
for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)

    tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

    print i, tr_acc, te_acc

save(model, 'save_test.pkl') # How to save

model = load('save_test.pkl') # How to load

tr_preds = model.predict(trX[:len(teY)])
te_preds = model.predict(teX)

tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

print tr_acc, te_acc
Example #6
def train(X, y):
    import sys

    import numpy as np
    from keras.layers.embeddings import Embedding
    from keras.layers.recurrent import LSTM, GRU, SimpleRNN
    from keras.layers.core import Dense, Dropout
    from keras.models import Sequential
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from math import e
    vocab = 10000
    tokenizer = Tokenizer(nb_words=vocab)
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    """
    index_word =  {v: k for k, v in tokenizer.word_index.items()}
    for i in range(1, 10001):
        print str(i) + "," + index_word[i]

    return
    """
    maxlen = 50
    X1 = []
    y1 = []
    for thing, target in zip(X, y):
        if len(thing) != 0:
            X1.append(thing)
            y1.append(target)

    X = X1
    y = y1
    KERAS = False
    if KERAS:
        X = pad_sequences(X, maxlen=maxlen)

    from random import shuffle
    xy = zip(X, y)
    shuffle(xy)
    X_s, y_s = zip(*xy)
    X_train, y_train = X_s[:-1000], y_s[:-1000]
    X_test, y_test = X_s[-1000:], y_s[-1000:]
    embedding_size = 256
    dropout = .3
    batch_size = 256
    recurrent_gate_size = 512
    """
    model = Sequential()
    model.add(Embedding(vocab, embedding_size, mask_zero=True))
    model.add(Dropout(dropout))
    model.add(LSTM(recurrent_gate_size))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    print "building model..."
    model.compile(loss="msle", optimizer="rmsprop")
    print "fitting model"
    #model.load_weights("mymodel")
    model.fit(np.asarray(X_train), np.asarray(y_train), nb_epoch=30, verbose=1, batch_size=batch_size, validation_data=(np.asarray(X_test), np.asarray(y_test)))
   
    model.save_weights("mymodel") 
    """
    from passage.preprocessing import Tokenizer, LenFilter
    from passage.layers import Embedding, GatedRecurrent, Dense, OneHot, LstmRecurrent
    from passage.models import RNN
    from passage.utils import save, load
    from passage.iterators import Padded

    layers = [
        #    OneHot(n_features=5),
        Embedding(size=embedding_size, n_features=vocab),
        #    GatedRecurrent(size=recurrent_gate_size, seq_output=True, p_drop=dropout),
        #    LstmRecurrent(size=recurrent_gate_size, p_drop=dropout),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=8, activation='softmax', p_drop=dropout)
    ]

    print >> sys.stderr, "learning model"
    model_iterator = Padded()
    model = load("mymodel.final.pkl")
    #model = RNN(layers=layers, cost='CategoricalCrossEntropy', verbose=2, updater="Adam")
    filter = LenFilter(max_len=maxlen)
    model.fit(np.asarray(X_train),
              np.asarray(y_train),
              batch_size=batch_size,
              n_epochs=1000,
              path="mymodel.pkl",
              snapshot_freq=49,
              len_filter=filter)
    save(model, "mymodel.final.pkl")
    #    print "test cost"
    #    print model._cost(np.asarray(X_test), np.asarray(y_test))
    print "test accuracy"
    passage_batch_predict(np.asarray(X_train), np.asarray(y_train), model)

    exit = False
    print "enter a sentence"
    while not exit:
        text = raw_input()
        if text == "exit":
            break
        else:
            tokens = tokenizer.texts_to_sequences([text])
            if len(tokens[0]) == 0:  # texts_to_sequences returns one list per text
                print "Sentence too strange, try again"
                continue
            if KERAS:
                tokens = pad_sequences(tokens, maxlen=maxlen)
            prediction = np.argmax(model.predict(tokens)[0])
            try:
                print e**(prediction - 2)
            except Exception:
                pass
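train calls passage_batch_predict, which is not shown in the snippet. A minimal sketch of what such a helper might look like, under the assumption that it just reports argmax accuracy (this body is not the project's code):

def passage_batch_predict(X, y, model):
    # Hypothetical helper: accuracy of argmax class predictions.
    import numpy as np
    preds = np.argmax(model.predict(X), axis=1)
    print float(np.sum(preds == np.asarray(y))) / len(y)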
Example #7
                               ntest=1000)  # Can increase up to 250K or so
print len(trX), len(trY), len(teX), len(teY)

print teX.shape  # shape is an attribute, not a callable
tokenizer = Tokenizer(min_df=10, max_features=50000)
#print trX[1] # see a blog example
trX = tokenizer.fit_transform(trX)

text = "Evropa je v jeho politika naprosto impotent ."
teX = tokenizer.transform([text])  # transform expects a list of texts
print "number of tokens:" + str(len(trX))
print "number of feathures:" + str(tokenizer.n_features)

layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, p_drop=0.2),
    Dense(size=10, activation='softmax', p_drop=0.5)
]

model = RNN(
    layers=layers, cost='cce'
)  # cce = categorical cross-entropy, for multi-class softmax output
model = load('modelEcho.pkl')  # How to load

te_pred = model.predict(teX)

#tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
#te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

print te_pred
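te_pred holds softmax probabilities from the 10-way output layer; a short follow-up (not in the original) to reduce them to class indices:

import numpy as np

print np.argmax(te_pred, axis=1)  # most probable class per input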
Example #8
# -*- coding: utf-8 -*-

from settings import getSettings
settings = getSettings()

# from passage.preprocessing import Tokenizer
from passage.utils import load

# ---


tokenizer = load(settings['FN_TRAINED_TOKENIZER'])
model = load(settings['FN_MODEL_NEXTWORDPRED'])

while True:
    sentence = raw_input('>')
    model.predict(tokenizer.transform([sentence]))
    print 'best next word for <%s>: None' % (sentence)
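The loop above computes a prediction but never uses it, so the script always prints None. A sketch of a loop body that decodes the top-scoring token instead, assuming the tokenizer keeps a token-to-index dict in an encoder attribute (an unverified assumption about this project's tokenizer):

import numpy as np

index_to_word = dict((i, w) for w, i in tokenizer.encoder.items())  # assumed attribute
probs = model.predict(tokenizer.transform([sentence]))
print 'best next word for <%s>: %s' % (sentence, index_to_word.get(int(np.argmax(probs)), '<unk>'))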
Example #9
import io
import random
import sys

try:
    import cPickle as pickle
except ImportError:
    import pickle

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

random.seed(0)

testfile, modelfile, tokenizerfile = sys.argv[1:]

test_text = []
with io.open(testfile, 'r', encoding='utf8') as fin:
    for text in fin:
        test_text.append(text.strip())

tokenizer = pickle.load(open(tokenizerfile, 'rb'))
model = load(modelfile)
results = model.predict(tokenizer.transform(test_text))

for r in results:
    print int(r >= 0.5)
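Run as, for example, python predict.py test.txt model.pkl tokenizer.pkl (the script and file names here are placeholders); it prints one 0/1 label per input line.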
Example #10
#!/usr/bin/env python
# coding=utf-8
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

train_text = ['hello world', 'foo bar']
train_labels = [0, 1]
test_text = ['good man']
tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

print model.predict(tokenizer.transform(test_text))
save(model, 'save_test.pkl')
model = load('save_test.pkl')
Example #11
num_feats = len(set(flatten(train_tokens)))  # flatten (helper not shown): list of lists -> flat list


def get_labels(id):
    if id == 3:
        return [1, 0]
    else:
        return [0, 1]


seq_labels = [map(get_labels, l) for l in train_tokens]  # built but not used below

layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax')
]

#iterator = SortedPadded(y_pad=True, y_dtype=intX)
#iterator = SortedPadded(y_dtype=intX)

#model = RNN(layers=layers, cost='seq_cce', iterator=iterator, Y=T.imatrix())
model = RNN(layers=layers, cost='seq_cce')
#model.fit(train_tokens, [1,0,1])
model.fit(train_tokens, train_tokens)

#model.predict(tokenizer.transform(["Frogs are awesome", "frogs are amphibious"]))
model.predict(train_tokens)
save(model, 'save_test.pkl')
model = load('save_test.pkl')
""" This model, although doing sequential prediction, predicts a tag per document not per word. """
Example #12
'''
    model = load(modelfile_name + '.pkl')
    results = model.predict(train_tokens)

    count = 0
    for r, g in zip(results, train_labels):
        if int(r >= 0.5) == int(g):
            count += 1
    results = 1.0 * count / len(train_labels)
    print (modelfile_name + '\t' + str(results))
'''

models = []
predictions = []
for embedding_size, gru_size, num_epochs in best_hyperparams:
    modelfile_name = 'stubborn_model.gridsearch.embedding{}.gru{}.epoch{}'.format(embedding_size, gru_size, num_epochs)
    model = load(modelfile_name + '.pkl')
    models.append(model)


with io.open('ensemble.train', 'w') as fout:
    for instance, label in zip(train_tokens, train_labels):
        line = " ".join([str(model.predict([instance])[0][0]) for model in models])
        fout.write(unicode(line + " " + str(label) + '\n'))

testfile = 'cwi_test.lemmatized.txt'
test_text = []
with io.open(testfile, 'r', encoding='utf8') as fin:
    for text in fin:
        test_text.append(text.strip())

test_tokens = tokenizer.transform(test_text)
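The fragment ends right after tokenizing the test set. A hedged continuation in the same pattern as the training loop above (the ensemble.test file name is an assumption):

with io.open('ensemble.test', 'w') as fout:
    for instance in test_tokens:
        line = " ".join([str(model.predict([instance])[0][0]) for model in models])
        fout.write(unicode(line + '\n'))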
Example #13
    print "Loading tagger..."
    taggerMorp = Tagger.load(sys.argv[1])
    if not taggerMorp:
        sys.stderr.write("Cannot load tagger from file '%s'\n" % sys.argv[1])
        sys.exit(1)

    # Init RNN
    print "Loading RNN..."
    tokenizerRNN = pickle.load(open('tokenizer.pkl', 'rb'))

    layers = [
        Embedding(size=256, n_features=tokenizerRNN.n_features),
        GatedRecurrent(size=512, p_drop=0.2),
        Dense(size=10, activation='softmax', p_drop=0.5)
    ]

    model = RNN(
        layers=layers, cost='cce'
    )  # cce = categorical cross-entropy, for multi-class softmax output
    model = load(sys.argv[2])  # How to load
    print "RNN loaded"

    s.listen(10)
    print 'Socket now listening'
    #######################################################################################################
    while 1:
        conn, addr = s.accept()
        print 'Connected with ' + addr[0] + ':' + str(addr[1])
        start_new_thread(clientthread, (conn,))
    s.close()
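clientthread is referenced above but not included in this fragment. A minimal sketch of what such a handler might do; the body and the reply protocol below are assumptions, not the project's code:

def clientthread(conn):
    # Hypothetical handler: read one sentence, score it with the RNN, reply.
    data = conn.recv(4096)
    if data:
        preds = model.predict(tokenizerRNN.transform([data.strip()]))
        conn.send(str(preds[0]) + '\n')
    conn.close()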