#Module-level imports assumed by this example (data_helpers and py_crepe are project modules)
import datetime
import json

import numpy as np

import data_helpers
import py_crepe


def myfunc(authorlist, doc_id, tdoc):
    subset = None

    #Whether to save model parameters
    save = False
    model_name_path = 'params/crepe_model.json'
    model_weights_path = 'params/crepe_model_weights.h5'

    #Maximum length. Longer gets chopped. Shorter gets padded.
    maxlen = 1014

    #Model params
    #Filters for conv layers
    nb_filter = 128  #initially 256
    #Number of units in the dense layer
    dense_outputs = 512  #Initially 1024
    #Conv layer kernel size
    filter_kernels = [3, 3, 3, 3, 3, 3]
    #Number of units in the final output layer equals the number of classes;
    #it is set below from authorlist.

    #Compile/fit params
    batch_size = 32
    nb_epoch = 10

    print('Loading data...')
    #Expect x to be a list of sentences. Y to be a one-hot encoding of the
    #categories.
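    #For example (hypothetical values), the loaded data might look like:
    #  trainX = ['first text chunk ...', 'second text chunk ...']
    #  trainY = [[1, 0, 0], [0, 1, 0]]   (one-hot over len(authorlist) classes)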

    ### 515-1122-122 and 1573 with remove 6 layers
    #authorlist=[121, 479 , 649 ]
    #doc_id = 14706

    #authorlist=[114, 1492, 124, 1228]
    #doc_id = [206, 205]
    cat_output = len(authorlist)  #number of output classes (one per candidate author)

    ((trainX, trainY), (valX,
                        valY)) = data_helpers.load_ag_data(authors=authorlist,
                                                           docID=doc_id,
                                                           tdoc=tdoc)

    print('Creating vocab...')
    vocab, reverse_vocab, vocab_size, check = data_helpers.create_vocab_set()

    #trainX = data_helpers.encode_data(trainX, maxlen, vocab, vocab_size, check)
    #test_data = data_helpers.encode_data(valX, maxlen, vocab, vocab_size, check)

    print('Build model...')

    classes = len(authorlist)
    (model, sgd,
     model_weights_path) = py_crepe.build_model(classes, filter_kernels,
                                                dense_outputs, maxlen,
                                                vocab_size, nb_filter)

    print('Fit model...')
    initial = datetime.datetime.now()
    for e in xrange(nb_epoch):
        xi, yi = data_helpers.shuffle_matrix(trainX, trainY)
        xi_test, yi_test = data_helpers.shuffle_matrix(valX, valY)
        if subset:
            batches = data_helpers.mini_batch_generator(xi[:subset],
                                                        yi[:subset],
                                                        vocab,
                                                        vocab_size,
                                                        check,
                                                        maxlen,
                                                        batch_size=batch_size)
        else:
            batches = data_helpers.mini_batch_generator(xi,
                                                        yi,
                                                        vocab,
                                                        vocab_size,
                                                        check,
                                                        maxlen,
                                                        batch_size=batch_size)

        test_batches = data_helpers.mini_batch_generator(xi_test,
                                                         yi_test,
                                                         vocab,
                                                         vocab_size,
                                                         check,
                                                         maxlen,
                                                         batch_size=batch_size)

        accuracy = 0.0
        loss = 0.0
        step = 1
        start = datetime.datetime.now()
        print('Epoch: {}'.format(e))
        for x_train, y_train in batches:

            f = model.train_on_batch(x_train, y_train)
            loss += f[0]
            loss_avg = loss / step
            accuracy += f[1]
            accuracy_avg = accuracy / step
            if step % 100 == 0:
                print('  Step: {}'.format(step))
                print('\tLoss: {}. Accuracy: {}'.format(
                    loss_avg, accuracy_avg))
            step += 1

        test_accuracy = 0.0
        test_loss = 0.0
        test_step = 1

        for x_test_batch, y_test_batch in test_batches:
            f_ev = model.test_on_batch(x_test_batch, y_test_batch)
            test_loss += f_ev[0]
            test_loss_avg = test_loss / test_step
            test_accuracy += f_ev[1]
            test_accuracy_avg = test_accuracy / test_step
            test_step += 1
        stop = datetime.datetime.now()
        e_elap = stop - start
        t_elap = stop - initial
        print(
            'Epoch {}. Test loss: {}. Test accuracy: {}\nEpoch time: {}. Total time: {}\n'
            .format(e, test_loss_avg, test_accuracy_avg, e_elap, t_elap))

    if save:
        print('Saving model params...')
        json_string = model.to_json()
        with open(model_name_path, 'w') as f:
            json.dump(json_string, f)

    #Weights are saved unconditionally because they are reloaded for prediction below.
    model.save_weights(model_weights_path)

    import cPickle as pickle
    with open('sgd.pickle', 'wb') as handle:
        pickle.dump(sgd, handle, protocol=pickle.HIGHEST_PROTOCOL)

    del trainX, trainY, valX, valY

    model.load_weights(model_weights_path)

    #from keras.optimizers import SGD
    #sgd = SGD(lr=0.01, momentum=0.9, nesterov= True)

    # Compile model again (required to make predictions)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    def predictModel(model, testX):
        # Run the model on the encoded test chunks and aggregate the
        # per-chunk predictions into a single probability vector.
        predY = np.array(model.predict(testX))

        predYList = predY[:]
        entro = []

        flag = False
        import math
        for row in predY:
            entroval = 0.0
            for p in row:
                if p <= 0:
                    # A zero probability would break log(); fall back to a plain mean.
                    flag = True
                else:
                    entroval += p * math.log(p, 2)
            entro.append(-entroval)

        if not flag:
            # Keep the 30% of chunks with the lowest prediction entropy
            # (the most confident ones) and average only those.
            yx = sorted(zip(entro, predY), key=lambda t: t[0])
            newPredY = [x for y, x in yx]
            predYEntroList = newPredY[:int(len(newPredY) * 0.3)]
            predY = np.mean(predYEntroList, axis=0)
        else:
            predY = np.mean(predYList, axis=0)

        return (predYList, predY)

    test_binary = []
    #d = []
    #d.append(doc_id)
    #for docs in d:
    (testX, testY) = data_helpers.load_doc_data(authors=authorlist,
                                                docID=doc_id)
    testX = data_helpers.encode_data(testX, maxlen, vocab, vocab_size, check)
    (predYList, predY) = predictModel(model, testX)
    testY = np.array(testY)
    # The chunks of the test document presumably share one label, so the mean
    # collapses to that single label.
    testY = testY.mean(axis=0)
    # Index of the highest-probability author in the averaged prediction.
    predLocation = predY.tolist().index(max(predY))
    if predLocation == testY:
        test_binary.append(1)
    else:
        test_binary.append(0)

    from IPython.display import clear_output
    clear_output()

    return test_binary
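
# A minimal usage sketch. The author IDs and doc_id are taken from the
# commented-out defaults near the top of myfunc; the tdoc value is a
# placeholder assumption, not a value from the original experiments.
if __name__ == '__main__':
    result = myfunc(authorlist=[121, 479, 649], doc_id=14706, tdoc=50)
    print(result)  # e.g. [1] if the disputed document was attributed correctly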
    """
Esempio n. 2
0
import data_helpers

maxlen = 9

vocab, _, vocab_size, check = data_helpers.create_vocab_set()

x_test = ['this is awesome']

res = data_helpers.encode_data(x_test, maxlen, vocab, 69, check)

print(res)
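
# For reference, a rough sketch of what a character-level one-hot encoder such
# as data_helpers.encode_data presumably does (the actual implementation is not
# shown on this page and may differ): each sentence becomes a
# (maxlen, vocab_size) matrix, truncated or zero-padded to maxlen characters.
import numpy as np

def encode_sketch(sentences, maxlen, vocab, vocab_size):
    out = np.zeros((len(sentences), maxlen, vocab_size), dtype=np.float32)
    for i, sentence in enumerate(sentences):
        for j, char in enumerate(sentence[:maxlen]):
            if char in vocab:
                out[i, j, vocab[char]] = 1.0  # mark the character's vocab index
    return out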
Example 3
#Compile/fit params
batch_size = 80
nb_epoch = 10

print('Loading data...')
#Expect x to be a list of sentences. Y to be a one-hot encoding of the
#categories.
(xt, yt), (x_test, y_test) = data_helpers.load_ag_data()

print('Creating vocab...')
vocab, reverse_vocab, vocab_size, check = data_helpers.create_vocab_set()

test_data = data_helpers.encode_data(x_test, maxlen, vocab, vocab_size, check)

print('Build model...')

model = py_crepe.model(filter_kernels, dense_outputs, maxlen, vocab_size,
                       nb_filter, cat_output)

print('Fit model...')
initial = datetime.datetime.now()
for e in xrange(nb_epoch):
    xi, yi = data_helpers.shuffle_matrix(xt, yt)
    xi_test, yi_test = data_helpers.shuffle_matrix(x_test, y_test)
    if subset:
        batches = data_helpers.mini_batch_generator(xi[:subset],
Example 4
# Compile/fit params
batch_size = 80
nb_epoch = 20

print('Loading data...')
# Expect x to be a list of sentences. Y to be index of the categories.
(xt, yt), (x_test, y_test) = data_helpers.load_ag_data()

print('Creating vocab...')
vocab, reverse_vocab, vocab_size, alphabet = data_helpers.create_vocab_set()

print('Build model...')
model = py_crepe.create_model(filter_kernels, dense_outputs, maxlen, vocab_size,
                              nb_filter, cat_output)
# Encode data
xt = data_helpers.encode_data(xt, maxlen, vocab)
x_test = data_helpers.encode_data(x_test, maxlen, vocab)

print('Chars vocab: {}'.format(alphabet))
print('Chars vocab size: {}'.format(vocab_size))
print('X_train.shape: {}'.format(xt.shape))
model.summary()
print('Fit model...')
model.fit(xt, yt,
          validation_data=(x_test, y_test), batch_size=batch_size, epochs=nb_epoch, shuffle=True)

if save:
    print('Saving model params...')
    json_string = model.to_json()
    with open(model_name_path, 'w') as f:
        json.dump(json_string, f)
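
# A hedged sketch of restoring the saved model later, assuming the weights were
# also written with model.save_weights(model_weights_path) as in the first
# example; model_from_json is the standard Keras counterpart of to_json().
import json
from keras.models import model_from_json

with open(model_name_path) as f:
    json_string = json.load(f)  # to_json() output was wrapped with json.dump above

restored = model_from_json(json_string)
restored.load_weights(model_weights_path)
restored.compile(loss='categorical_crossentropy', optimizer='sgd',
                 metrics=['accuracy'])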
Example 5
dense_outputs = 2048

# Compile/fit params
batch_size = 128
nb_epoch = 200

print('Loading data...')
# Expect x to be a list of sentences. Y to be a one-hot encoding of the categories.
(xt, yt), (x_test, y_test), label_array = data_helpers.load_ag_data()
# Number of units in the final output layer. Number of classes.
cat_output = len(label_array)  # 1007

print('Creating vocab...')
vocab, reverse_vocab, vocab_size, alphabet = data_helpers.create_vocab_set()

xt = data_helpers.encode_data(xt, maxlen, vocab)
x_test = data_helpers.encode_data(x_test, maxlen, vocab)

print('Chars vocab: {}'.format(alphabet))
print('Chars vocab size: {}'.format(vocab_size))
print('X_train.shape: {}'.format(xt.shape))

print('Build model...')

model = char_cnn_model.model(filter_kernels, dense_outputs, maxlen, vocab_size,
                             nb_filter, cat_output)

print('Fit model...')
model.summary()

model.fit(xt,
Example 6
# Filters for conv layers
nb_filter = 512

# Conv layer kernel size
filter_kernels = [7, 7, 3, 3]

lg.info('Loading data...')

# Expect x to be a list of sentences. Y to be a one-hot encoding of the categories.
(xt, yt), (x_test, y_test) = data_helpers.load_restoclub_data_for_encdec("data")

lg.info('Creating vocab...')
vocab, reverse_vocab, vocab_size, check = data_helpers.create_vocab_set()

lg.info(str(vocab))
test_data = data_helpers.encode_data(x_test, maxlen, vocab, vocab_size, check)

lg.info('Build model...')
autoencoder_model, encoder_model = tweet2vec.model(filter_kernels,
                                                   maxlen,
                                                   vocab_size,
                                                   nb_filter,
                                                   latent_dim,
                                                   latent_dim,
                                                   rnn_type)

lg.info('Fit model...')
initial = datetime.datetime.now()

for e in range(nb_epoch):