Example 1
def testSystem():
    t = Tokenizer()

    # load the training data and build a model with a 15,000-word vocabulary
    xTrain, yTrain = t.getData()
    np.random.seed(10)
    model = RNN(15000)

    # forward pass and predictions for a single training example
    o, s = model.forwardPropagation(xTrain[30])
    predictions = model.predict(xTrain[30])
    print(o.shape)
    print(o)
    print(predictions.shape)
    print(predictions)

    print("Expected Loss: \n" + str(np.log(model.vocab)))
    print("Actual Loss:")
    print(model.calculateLoss(xTrain[:100], yTrain[:100]))
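Why log(vocab) is the expected loss: an untrained model predicts each word with roughly uniform probability 1/V, so the per-word cross-entropy is -log(1/V) = log(V). A quick standalone check (V = 15000 matches the vocabulary size used above):

import numpy as np

V = 15000
print(np.log(V))  # ~9.62 -- an untrained model's loss should be close to this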
Example 2
    # Training loop
    for k in range(8):
        for i in tqdm(range(0, len(all_data) - 25, seq_length)):
            # select batch
            X_chars = all_data[i:i + seq_length]
            X = rnn.mapper[X_chars]
            Y_chars = all_data[i + 1:i + seq_length + 1]
            Y = rnn.mapper[Y_chars]

            # display loss every 100th update
            if np.mod(ll, 100) == 0:
                print(loss)

            # synthesize text every 500th update; append it to a file every 1000th
            if np.mod(ll, 500) == 0:
                txt = rnn.predict(x=X[:, 0], h=rnn.h, n=250)
                if np.mod(ll, 1000) == 0:
                    with open('trump_synthezed/out_trumpp2.txt', 'a') as f:
                        print("\n*iter =*" + str(ll) + "*, smooth_loss=*" +
                              str(loss) + "\n",
                              file=f)
                        print("".join(rnn.word(txt)), file=f)
                print("**iter =**" + str(ll) + "**, smooth_loss=**" +
                      str(loss) + "\n")
                print("".join(rnn.word(txt)))

            # reset the hidden state at the start of a new epoch
            if i == 0:
                rnn.h = None

            # update loss
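            # hypothetical completion (not the original code): char-RNN loops
            # typically keep an exponential moving average of the step loss
            # and advance an update counter at this point
            loss = 0.999 * loss + 0.001 * step_loss  # step_loss: this step's cost (assumed)
            ll += 1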
Example 3
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                      for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]]
                      for sent in tokenized_sentences])

np.random.seed(10)
model = RNN(vocabulary_size)
o, s = model.forward_propagation(X_train[10])

print(o.shape)
print(o)

predictions = model.predict(X_train[10])
print(predictions.shape)
print(predictions)

print("-------------------------------------------")
'''
Cross-entropy loss: L(y, o) = -\frac{1}{N}\sum_{n \in N} y_n \log o_n
'''
E_loss = np.log(vocabulary_size)
print('Expected loss for random predictions is {}'.format(E_loss))

actual_loss = model.calc_loss(X_train[:1000], y_train[:1000])
print('Actual loss for random predictions is {}'.format(actual_loss))

losses = train_with_sgd(model,
                        X_train[:100],
                        y_train[:100])
Example 4
old_valid_error = 0.0
result = []  # list for saving all predictions made by the network
# file = 'training_pred.pickle.gz'
for k in range(n_epoch):
    correct_number_train = 0.0
    correct_number_valid = 0.0
    class_occurrence_train = np.zeros(n_classes)
    class_occurrence_valid = np.zeros(n_classes)
    confusion_matrix_train = np.zeros((n_classes, n_classes))
    confusion_matrix_valid = np.zeros((n_classes, n_classes))
    n_data = 0
    for i in range(n_batches):
        cost, output, gradient, hidden_act, t, x_1, wout = rnn.train(
            input[i], mask[i], label[i], lrate)
        rnn.save(filesave)
        train_cost.append(cost)
        prob = rnn.predict(input[i], mask[i])
        # compare the top-ranked prediction with the true label at each position
        for jj in range(prob.shape[1]):
            for kk in range(prob.shape[0]):
                prediction = (-prob[kk, jj, :]).argsort()
                label_sorted = (-label[i][kk, jj, :]).argsort()
                if any(label[i][kk, jj, :]):
                    n_data += 1
                    if prediction[0] == label_sorted[0]:
                        correct_number_train += 1
                    confusion_matrix_train[prediction[0], label_sorted[0]] += 1
                    class_occurrence_train[label_sorted[0]] += 1
    log('> epoch ' + str(k) + ', training cost: ' + str(100 * np.mean(train_cost)))
    confusion_matrix_train = 100 * confusion_matrix_train / class_occurrence_train
    train_error_rate = 100 * (1.0 - correct_number_train / n_data)
    log('Error rate on training set is ' + str(train_error_rate) + ' (%)')
    log('Confusion Matrix on training set is \n\n ' + str(np.around(confusion_matrix_train, 2)) + ' (%)\n')
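The normalization above divides each column of the confusion matrix (the true class, via NumPy broadcasting over the last axis) by that class's occurrence count, so every column sums to 100%. A minimal standalone illustration of that step:

import numpy as np

cm = np.array([[8., 1.],
               [2., 9.]])    # cm[predicted, true], filled as in the loop above
occ = cm.sum(axis=0)         # per-true-class occurrence counts: [10., 10.]
print(100 * cm / occ)        # [[80. 10.], [20. 90.]] -- columns sum to 100 (%)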
Example 5
            temp = [0] * index
            temp[y[i][j]] = 1
            y[i][j] = temp
    return [x, y, index, dict]


if __name__ == '__main__':

    X, Y, index, dicts = read('1168.txt')
    print("read finished")
    print(index)
    rnn = RNN(index, 20)

    rnn.train(X, Y)
    # one-hot vector for the start token (index 1)
    s = [0] * index
    s[1] = 1
    # build a few random one-hot input vectors
    predict_x = []
    for i in range(5):
        temp = [0] * index
        p = random.randint(2, index - 1)
        temp[p] = 1
        predict_x.append(temp)
    pred_indices = rnn.predict(np.array([s, predict_x[0], predict_x[1],
                                         predict_x[2]]))
    print(pred_indices)
    # map each predicted index back to its word via reverse dictionary lookup
    for i in pred_indices:
        for word, idx in dicts.items():
            if idx == i:
                print(word)
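A note on the reverse lookup above: scanning the whole dictionary for every predicted index is O(V) per token. If dicts maps words to indices, building the inverse mapping once is equivalent and O(1) per token (a sketch, assuming the same dicts and pred_indices as above):

    index_to_word = {idx: word for word, idx in dicts.items()}
    for i in pred_indices:
        print(index_to_word[i])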
Example 6
    # load saved preprocessing parameters (mask, mu, sigma, V) into scope
    locals().update(cPickle.load(f))
    with open(os.path.expanduser(SCALINGFACTORS), 'rb') as f:
        w, b = cPickle.load(f)
        pca = lambda x: ((x[:, mask] - mu) / sigma).dot(V) * w + b

# Predict for each recording
for filename in os.listdir(INPUT_DIR):
    conf = {}
    print("Filename {}".format(filename))
    rec_id, ext = os.path.splitext(filename)
    if ext != '.htk':
        continue
    print("Predicting for {} ...".format(rec_id))
    feature = pca(readHtk(os.path.join(INPUT_DIR, filename))).astype('float32')
    x = feature.reshape((1, ) + feature.shape)
    m = numpy.ones(x.shape[:-1], dtype='int32')
    conf[rec_id] = net.predict(x, m)[0]

    # Save predictions
    with smart_open(os.path.join(OUTPUT_DIR, rec_id + '.confidence.pkl.gz'),
                    'wb') as f:
        cPickle.dump(conf, f)
        savemat(os.path.join(OUTPUT_DIR, rec_id + '.confidence.mat'), conf)

    result_ = conf[rec_id]

    # merge classes 1 and 2 (English speech and non-English speech)
    # into a single "Speech" class
    result = numpy.zeros((result_.shape[0], result_.shape[1] - 1))
    result[:, 0] = result_[:, 0]
    result[:, 1] = result_[:, 1] + result_[:, 2]
    result[:, 2:] = result_[:, 3:]
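A standalone check of the class-merging slices above (the 4-class vector and its values are made up for illustration):

import numpy

result_ = numpy.array([[0.1, 0.3, 0.2, 0.4]])    # one frame, 4 class scores
result = numpy.zeros((result_.shape[0], result_.shape[1] - 1))
result[:, 0] = result_[:, 0]
result[:, 1] = result_[:, 1] + result_[:, 2]     # merged "Speech" class
result[:, 2:] = result_[:, 3:]
print(result)    # [[0.1 0.5 0.4]] -- scores still sum to 1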
Example 7
old_valid_error = 0.0
result = []  # list for saving all predictions made by the network
# file = 'training_pred.pickle.gz'
for k in range(n_epoch):
    correct_number_train = 0.0
    correct_number_valid = 0.0
    class_occurrence_train = np.zeros(n_classes)
    class_occurrence_valid = np.zeros(n_classes)
    confusion_matrix_train = np.zeros((n_classes, n_classes))
    confusion_matrix_valid = np.zeros((n_classes, n_classes))
    n_data = 0
    for i in range(n_batches):
        cost, output = rnn.train(input[i], mask[i], label[i], lrate)
        rnn.save(filesave)
        train_cost.append(cost)
        prob = rnn.predict(input[i], mask[i])
        if multi_label == "false":
            # single-label case: score the top prediction at every position
            for jj in range(prob.shape[1]):
                for kk in range(prob.shape[0]):
                    prediction = (-prob[kk, jj, :]).argsort()
                    label_sorted = (-label[i][kk, jj, :]).argsort()
                    if any(label[i][kk, jj, :]):
                        n_data += 1
                        if prediction[0] == label_sorted[0]:
                            correct_number_train += 1
                        confusion_matrix_train[prediction[0],
                                               label_sorted[0]] += 1
                        class_occurrence_train[label_sorted[0]] += 1
    if multi_label == "true":
        train_error_rate = format_results(prob, label[i], multi_label,
                                          n_classes)
Example 8
# preparing dataset
size = 8
int2binary = {}
largest_number = pow(2, size)
binary = np.unpackbits(np.array([range(largest_number)], dtype=np.uint8).T,
                       axis=1)
for i in range(largest_number):
    int2binary[i] = binary[i]

X = []
for j in range(10000):
    a_int = np.random.randint(largest_number // 2)  # int version
    a = int2binary[a_int]  # binary encoding
    b_int = np.random.randint(largest_number // 2)  # int version
    b = int2binary[b_int]  # binary encoding
    c_int = a_int + b_int  # true answer
    c = int2binary[c_int]
    problem = BinaryAddition(a, b)
    problem.set_output(c)
    X.append(problem)

rnn = RNN(2, 16, 1)
rnn.train(X)

a = int2binary[np.random.randint(largest_number // 2)]  # binary encoding
b = int2binary[np.random.randint(largest_number // 2)]  # binary encoding
problem = BinaryAddition(a, b)

rnn.predict(problem)

problem.print()
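For reference, np.unpackbits produces most-significant-bit-first rows, so with size = 8 the lookup table built above behaves like this:

print(int2binary[3])    # [0 0 0 0 0 0 1 1]
print(int2binary[200])  # [1 1 0 0 1 0 0 0]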