Example #1
    def fit(self,
            trX,
            trY,
            batch_size=64,
            n_epochs=1,
            len_filter=LenFilter(),
            snapshot_freq=1,
            path=None):
        """Train model on given training examples and return the list of costs after each minibatch is processed.

        Args:
          trX (list) -- Inputs
          trY (list) -- Outputs
          batch_size (int, optional) -- number of examples in a minibatch (default 64)
          n_epochs (int, optional)  -- number of epochs to train for (default 1)
          len_filter (object, optional) -- object to filter training examples by length (default LenFilter())
          snapshot_freq (int, optional) -- number of epochs between saving model snapshots (default 1)
          path (str, optional) -- prefix of path where model snapshots are saved.
            If None, no snapshots are saved (default None)

        Returns:
          list -- costs of model after processing each minibatch
        """
        if len_filter is not None:
            trX, trY = len_filter.filter(trX, trY)
        trY = standardize_targets(trY, cost=self.cost)

        n = 0.
        t = time()
        costs = []
        for e in range(n_epochs):
            epoch_costs = []
            for xmb, ymb in self.iterator.iterXY(trX, trY):
                c = self._train(xmb, ymb)
                epoch_costs.append(c)
                n += len(ymb)
                if self.verbose >= 2:
                    n_per_sec = n / (time() - t)
                    n_left = len(trY) - n % len(trY)
                    time_left = n_left / n_per_sec
                    sys.stdout.write(
                        "\rEpoch %d Seen %d samples Avg cost %0.4f Time left %d seconds"
                        % (e, n, np.mean(epoch_costs[-250:]), time_left))
                    sys.stdout.flush()
            costs.extend(epoch_costs)

            status = "Epoch %d Seen %d samples Avg cost %0.4f Time elapsed %d seconds" % (
                e, n, np.mean(epoch_costs[-250:]), time() - t)
            if self.verbose >= 2:
                sys.stdout.write("\r" + status)
                sys.stdout.flush()
                sys.stdout.write("\n")
            elif self.verbose == 1:
                print(status)
            if path and e % snapshot_freq == 0:
                save(self, "{0}.{1}".format(path, e))
        return costs
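A minimal usage sketch for the fit() documented above, assuming a Passage-style RNN model, a Tokenizer, and raw training text/labels; the variable names and the snapshot prefix are illustrative, not part of the example:

# Hypothetical usage of fit(); 'model', 'tokenizer', 'train_text', and
# 'train_labels' are assumed to exist, and 'snapshots/model' is illustrative.
train_tokens = tokenizer.fit_transform(train_text)
costs = model.fit(train_tokens,
                  train_labels,
                  batch_size=64,
                  n_epochs=3,
                  snapshot_freq=1,
                  path='snapshots/model')  # writes snapshots/model.0, .1, .2
print('final minibatch cost: %0.4f' % costs[-1])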
Example #2
    model = train_RNN(tokenizer, X_tokens, y_train)

    y_pred_tr = model.predict(X_tokens).flatten()

    # Check overall performance
    test_tokens = tokenizer.transform(X_test)
    y_pred_tst = model.predict(test_tokens).flatten()

    # Convert predictions to binary
    yhat_train = y_pred_tr.reshape(-1, 1)
    yhat_test = y_pred_tst.reshape(-1, 1)
    binarizer = Binarizer(threshold=0.5).fit(yhat_train)
    yhat_tr_b = binarizer.transform(yhat_train).astype(int)
    yhat_tst_b = binarizer.transform(yhat_test).astype(int)

    save(model, 'review_score_full.pkl')

    with open('review_tokenizer_full.pkl', 'wb') as fileObject:
        pickle.dump(tokenizer, fileObject)

    # # Save model for future use
    # save(model, 'review_scorer1.pkl')
    # # model = load('review_scorer.pkl')
    # with open('review_tokenizer1.pkl','wb') as fileObject:
    #     pickle.dump(tokenizer, fileObject)

    # Scorers to consider
    # score()

    # Scoring method: roc_auc_score
    # Train: 0.99, Test: 0.97
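The ROC AUC figures quoted in the comments above can be reproduced with scikit-learn's roc_auc_score; a minimal sketch, assuming the raw predictions y_pred_tr/y_pred_tst and the true labels y_train/y_test are in scope (y_test does not appear in the snippet and is an assumption):

from sklearn.metrics import roc_auc_score

# Score the raw (pre-binarization) predictions against the true labels.
print('Train ROC AUC: %0.2f' % roc_auc_score(y_train, y_pred_tr))
print('Test ROC AUC:  %0.2f' % roc_auc_score(y_test, y_pred_tst))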
Example #3
import sys

print 'loading dataset'
d = Dataset(settings['FN_DATASET'], settings['FN_VOCABULARY'])
d.load()

print 'generating labeled training set'
train_text,train_labels = d.getNextWordPredTrainset(10)
#for t,l in zip(train_text,train_labels):
#    print t,'->',l

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)
save(train_tokens, settings['FN_TRAINED_TOKENIZER'])

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

save(model, settings['FN_MODEL_NEXTWORDPRED'])
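To reuse the saved next-word model, it can be reloaded and applied to newly tokenized text; a minimal sketch, assuming load is imported from passage.utils (as in the other examples) and the tokenizer above is still in scope; the input string is illustrative:

# Reload the trained model and score new text.
model = load(settings['FN_MODEL_NEXTWORDPRED'])
new_tokens = tokenizer.transform(['some unseen text'])
print(model.predict(new_tokens))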
Example #4
teX = tokenizer.transform(teX)
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal') # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce') # bce is classification loss for binary classification and sigmoid output
for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)

    tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

    print i, tr_acc, te_acc

save(model, 'save_test.pkl') # How to save

model = load('save_test.pkl') # How to load

tr_preds = model.predict(trX[:len(teY)])
te_preds = model.predict(teX)

tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

print tr_acc, te_acc
Example #5
    def fit(self,
            trX,
            trY,
            valX,
            valY,
            n_epochs=1,
            early_stopping=None,
            len_filter=LenFilter(max_len=10000, percentile=100),
            snapshot_freq=1,
            path=None):
        """Train model on given training examples and return the list of costs after each minibatch is processed.
                Args:
                  trX (list) -- Inputs
                  trY (list) -- Outputs
                  valX (list) -- Validation Inputs
                  valY (list) -- Validation Outputs
                  n_epochs (int, optional)  -- number of epochs to train for (default 1)
                  early_stopping -- number of consecutive epochs above minimum validation before stopping (default None; no early stopping)
                  len_filter (object, optional) -- object to filter training example by length (default LenFilter())
                  snapshot_freq (int, optional) -- number of epochs between saving model snapshots (default 1)
                  path (str, optional) -- prefix of path where model snapshots are saved.
                        If None, no snapshots are saved (default None)
                Returns:
                  list -- costs of model after processing each minibatch
                """
        if len_filter is not None:
            trX, trY = len_filter.filter(trX, trY)
        trY = standardize_targets(trY, cost=self.cost)
        n = 0.
        stats = []
        t = time()
        costs = []
        valY = np.asarray(valY)
        self.valcosts = []
        sensitivity = []
        specificity = []
        training_costs = []

        min_val = float('inf')
        min_train = float('inf')
        stopping_count = 0

        for e in range(n_epochs):
            weights = []
            epoch_costs = []
            for xmb, ymb in self.iterator.iterXY(trX, trY):
                c = self._train(xmb, ymb)
                epoch_costs.append(c)
                n += len(ymb)
                if self.verbose >= 2:
                    n_per_sec = n / (time() - t)
                    n_left = len(trY) - n % len(trY)
                    time_left = n_left / n_per_sec
                    sys.stdout.write(
                        "\rEpoch %d Seen %d samples Avg cost %0.4f Time left %d seconds"
                        % (e, n, np.mean(epoch_costs[-250:]), time_left))
                    sys.stdout.flush()
                weights.append('Epoch: {0} samples: {1} avg cost: {2}'.format(
                    e, n, np.mean(epoch_costs[-250:])))
                for layer in self.settings['layers']:
                    try:
                        w = layer['config']['weights']
                    except TypeError:
                        w = [p.get_value() for p in layer.params]
                    for p in w:
                        weights.append(str(p))
                        if np.any(np.isnan(p)):
                            err_file = 'error_0.txt'
                            i = 1
                            while exists(err_file):
                                err_file = 'err_{0}.txt'.format(i)
                                i += 1
                            with open(err_file, 'w') as out:
                                out.write('\n'.join(weights))
                            raise Exception('NaN weights')
            costs.extend(epoch_costs)
            training_costs.append(np.mean(epoch_costs))
            status = "Epoch %d Seen %d samples Avg cost %0.4f Time elapsed %d seconds" % (
                e, n, np.mean(epoch_costs[-250:]), time() - t)
            if self.verbose >= 2:
                sys.stdout.write("\r" + status)
                sys.stdout.flush()
                sys.stdout.write("\n")
            elif self.verbose == 1:
                print status
            if path and e % snapshot_freq == 0:
                save(self, "{0}.{1}".format(path, e))
            preds = self.batch_predict(valX)
            val_loss, sens, spec = self.val_loss_accuracy(preds, valY)

            print "Validation loss:", val_loss
            print "Sensitivity:", [round(x, 4) for x in sens]
            print "Specificity:", [round(x, 4) for x in spec]
            self.valcosts.append(val_loss)
            sensitivity.append(sens)
            specificity.append(spec)
            #early stopping
            if early_stopping is not None:
                if val_loss <= min_val:
                    min_val = val_loss
                    #reset count
                    stopping_count = 0
                    #keep track of the traning cost at the minimum validation loss
                    min_train = training_costs[-1]
                elif training_costs[-1] < min_train:
                    #only increase counter if the latest training loss is below the training loss at minimum validation loss
                    stopping_count += 1
                    if stopping_count >= early_stopping:
                        break
        return training_costs, self.valcosts, sensitivity, specificity
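A minimal usage sketch for this validation-aware fit(); the model instance, the token/label lists, and every hyperparameter value below are illustrative assumptions:

# Train with validation monitoring, early stopping, and periodic snapshots.
train_costs, val_costs, sens, spec = model.fit(
    trX, trY, valX, valY,
    n_epochs=100,
    early_stopping=5,        # stop after 5 qualifying epochs with no new validation minimum
    snapshot_freq=10,
    path='snapshots/model')
print('best validation loss: %0.4f' % min(val_costs))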
Example #6
def train(X, y):
    from keras.layers.embeddings import Embedding
    from keras.layers.recurrent import LSTM, GRU, SimpleRNN
    from keras.layers.core import Dense
    from keras.models import Sequential
    from keras.layers.core import Dropout
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from math import e
    vocab = 10000
    tokenizer = Tokenizer(nb_words=vocab)
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    """
    index_word =  {v: k for k, v in tokenizer.word_index.items()}
    for i in range(1, 10001):
        print str(i) + "," + index_word[i]

    return
    """
    maxlen = 50
    X1 = []
    y1 = []
    for thing, target in zip(X, y):
        if len(thing) != 0:
            X1.append(thing)
            y1.append(target)

    X = X1
    y = y1
    KERAS = False
    if KERAS:
        X = pad_sequences(X, maxlen=maxlen)

    from random import shuffle
    xy = zip(X, y)
    shuffle(xy)
    X_s, y_s = zip(*xy)
    X_train, y_train, X_test, y_test = X_s[:-1000], y_s[:-1000], X_s[
        -1000:], y_s[-1000:]
    embedding_size = 256
    dropout = .3
    batch_size = 256
    recurrent_gate_size = 512
    """
    model = Sequential()
    model.add(Embedding(vocab, embedding_size, mask_zero=True))
    model.add(Dropout(dropout))
    model.add(LSTM(recurrent_gate_size))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    print "building model..."
    model.compile(loss="msle", optimizer="rmsprop")
    print "fitting model"
    #model.load_weights("mymodel")
    model.fit(np.asarray(X_train), np.asarray(y_train), nb_epoch=30, verbose=1, batch_size=batch_size, validation_data=(np.asarray(X_test), np.asarray(y_test)))
   
    model.save_weights("mymodel") 
    """
    from passage.preprocessing import Tokenizer, LenFilter
    from passage.layers import Embedding, GatedRecurrent, Dense, OneHot, LstmRecurrent
    from passage.models import RNN
    from passage.utils import save, load
    from passage.iterators import Padded

    layers = [
        #    OneHot(n_features=5),
        Embedding(size=embedding_size, n_features=vocab),
        #    GatedRecurrent(size=recurrent_gate_size, seq_output=True, p_drop=dropout),
        #    LstmRecurrent(size=recurrent_gate_size, p_drop=dropout),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=8, activation='softmax', p_drop=dropout)
    ]

    print >> sys.stderr, "learning model"
    model_iterator = Padded()
    model = load("mymodel.final.pkl")
    #model = RNN(layers=layers, cost='CategoricalCrossEntropy', verbose=2, updater="Adam")
    filter = LenFilter(max_len=maxlen)
    model.fit(np.asarray(X_train),
              np.asarray(y_train),
              batch_size=batch_size,
              n_epochs=1000,
              path="mymodel.pkl",
              snapshot_freq=49,
              len_filter=filter)
    save(model, "mymodel.final.pkl")
    #    print "test cost"
    #    print model._cost(np.asarray(X_test), np.asarray(y_test))
    print "test accuracy"
    passage_batch_predict(np.asarray(X_train), np.asarray(y_train), model)

    exit = False
    print "enter a sentence"
    while not exit:
        text = raw_input()
        if text == "exit":
            break
        else:
            tokens = tokenizer.texts_to_sequences([text])
            if len(tokens[0]) == 0:
                print "Sentence too strange, try again"
                continue
            if KERAS:
                tokens = pad_sequences(tokens, maxlen=maxlen)
            prediction = np.argmax(model.predict(tokens)[0])
            try:
                print e**(prediction - 2)
            except Exception:
                pass
Example #7
gru_sizes = [10, 20, 50, 100, 200, 1000]
epochs = [1, 3, 5, 7, 10]

for embedding_size, gru_size, num_epochs in product(embedding_sizes, gru_sizes, epochs):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_text, train_labels, test_size=0.1, random_state=0
    )

    layers = [
        Embedding(size=embedding_size, n_features=tokenizer.n_features),
        GatedRecurrent(size=gru_size),
        Dense(size=1, activation="sigmoid"),
    ]

    model = RNN(layers=layers, cost="BinaryCrossEntropy")
    model.fit(train_tokens, train_labels, n_epochs=int(num_epochs))

    modelfile_name = "stubborn_model.paramsearch.embedding{}.gru{}.epoch{}".format(embedding_size, gru_size, num_epochs)

    save(model, modelfile_name + ".pkl")
    pickle.dump(tokenizer, open(modelfile_name + "-tokenizer.pkl", "wb"))

    results = model.predict(tokenizer.transform(X_test))

    count = 0
    for r, g in zip(results, y_test):
        if int(r >= 0.5) == int(g):
            count += 1
    results = 1.0 * count / len(y_test)
    print(modelfile_name + "\t" + str(results))
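The counting loop above computes plain accuracy by hand; the same number can be obtained with scikit-learn's accuracy_score, as the other examples do. A minimal sketch, recomputing the raw predictions since results is overwritten above:

from sklearn import metrics

# Threshold the predictions at 0.5 and compare against the held-out labels.
preds = model.predict(tokenizer.transform(X_test)).flatten()
acc = metrics.accuracy_score(y_test, preds > 0.5)
print(modelfile_name + "\t" + str(acc))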
Example #8
Y_train = newsgroups_train.target
Y_test = newsgroups_test.target

print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256,
                   activation='tanh',
                   gate_activation='steeper_sigmoid',
                   init='orthogonal',
                   seq_output=False),
    Dense(size=1, activation='sigmoid',
          init='orthogonal')  # sigmoid for binary classification
]

model = RNN(
    layers=layers, cost='bce'
)  # bce is classification loss for binary classification and sigmoid output
for i in range(2):
    model.fit(X_train, Y_train, n_epochs=1)
    tr_preds = model.predict(X_train[:len(Y_test)])
    te_preds = model.predict(X_test)

    tr_acc = metrics.accuracy_score(Y_train[:len(Y_test)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5)

    print i, tr_acc, te_acc  # dataset too small to fully utilize Passage

save(model, 'model.pkl')
Example #9
tokenizer = Tokenizer(min_df=10, max_features=50000)
trX = tokenizer.fit_transform(trX)
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))
print "number of tokens:" + str(len(trX))
teX = tokenizer.transform(teX)
print "number of feathures:" + str(tokenizer.n_features)

layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=725),
    Dense(size=10, activation='softmax')
]

model = RNN(layers=layers, cost='cce')
model.fit(trX, trY, n_epochs=10)
save(model, 'modelEcho.pkl')

tr_preds = model.predict(trX)
te_preds = model.predict(teX)

data = pd.DataFrame(trY)
data.to_csv('data/trY.vec')

data = pd.DataFrame(tr_preds)
data.to_csv('data/tr_preds.vec')

tr_acc = np.mean(np.argmax(trY, axis=1) == np.argmax(tr_preds, axis=1))
indexy = np.argmax(teY, axis=1)
data = pd.DataFrame(indexy)
data.to_csv('data/ev_agrmax.txt')
data = pd.DataFrame(np.argmax(te_preds, axis=1))
Example #10
from passage.models import RNN
from passage.utils import save

tokenizer = Tokenizer(min_df=10, max_features=50000)
X_train = tokenizer.fit_transform(newsgroups_train.data)
X_test  = tokenizer.transform(newsgroups_test.data)
Y_train = newsgroups_train.target
Y_test  = newsgroups_test.target

print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal') # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce') # bce is classification loss for binary classification and sigmoid output
for i in range(2):
    model.fit(X_train, Y_train, n_epochs=1)
    tr_preds = model.predict(X_train[:len(Y_test)])
    te_preds = model.predict(X_test)

    tr_acc = metrics.accuracy_score(Y_train[:len(Y_test)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(Y_test, te_preds > 0.5)

    print i, tr_acc, te_acc # dataset too small to fully utilize Passage

save(model, 'model.pkl')
Example #11
#!/usr/bin/env python
# coding=utf-8
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

train_text = ['hello world','foo bar']
train_labels = [0,1]
test_text = ['good man']
tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

layers = [
        Embedding(size=128, n_features=tokenizer.n_features),
        GatedRecurrent(size=128),
        Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

print model.predict(tokenizer.transform(test_text))
save(model, 'save_test.pkl')
model = load('save_test.pkl')
num_feats = len(set(flatten(train_tokens)))


def get_labels(id):
    if id == 3:
        return [1, 0]
    else:
        return [0, 1]


seq_labels = map(lambda l: map(get_labels, l), train_tokens)

layers = [
    Embedding(size=128, n_features=num_feats),
    GatedRecurrent(size=128, seq_output=True),
    Dense(size=num_feats, activation='softmax')
]

#iterator = SortedPadded(y_pad=True, y_dtype=intX)
#iterator = SortedPadded(y_dtype=intX)

#model = RNN(layers=layers, cost='seq_cce', iterator=iterator, Y=T.imatrix())
model = RNN(layers=layers, cost='seq_cce')
#model.fit(train_tokens, [1,0,1])
model.fit(train_tokens, train_tokens)

#model.predict(tokenizer.transform(["Frogs are awesome", "frogs are amphibious"]))
model.predict(train_tokens)
save(model, 'save_test.pkl')
model = load('save_test.pkl')
""" This model, although doing sequential prediction, predicts a tag per document not per word. """