Code Example #1
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# MNISTData, MNISTNet and accuracy are project-specific helpers assumed to be
# defined in (or imported from) this project's own modules.


def main():
    train_data = MNISTData(dset='train')
    val_data = MNISTData(dset='val')

    train_loader = DataLoader(train_data, batch_size=8)
    val_loader = DataLoader(val_data, batch_size=8)

    loss_func = F.cross_entropy
    # loss_func

    Net = MNISTNet()

    optimizer = optim.Adam(Net.parameters(), lr=1e-3)

    Net.train()
    for epoch in range(1):
        for x, y in tqdm(train_loader):
            pred = Net(x)
            loss = loss_func(pred, y)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print(loss.item())

    Net.eval()
    print("TRAIN ACCURACY")
    print(accuracy(Net, train_loader))

    print("VAL ACCURACY")
    print(accuracy(Net, val_loader))

    torch.save(Net.state_dict(), './model.pt')
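The accuracy helper called above (and in the next example) is project-specific and not shown. A minimal sketch of what such a helper could look like, assuming the loader yields (x, y) batches and the model returns class logits; the code below is illustrative, not the project's actual implementation:

import torch

@torch.no_grad()
def accuracy(model, loader):
    # Fraction of correctly classified samples across the whole loader.
    correct, total = 0, 0
    for x, y in loader:
        pred = model(x).argmax(dim=1)  # predicted class index per sample
        correct += (pred == y).sum().item()
        total += y.size(0)
    return correct / total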
Code Example #2
File: test.py  Project: shfd27/HeXA_Advanced_AI_Study
def main():
    train_data = MNISTData(dset='train')
    val_data = MNISTData(dset='val')

    train_loader = DataLoader(train_data, batch_size=8)
    val_loader = DataLoader(val_data, batch_size=8)

    Net = MNISTNet()
    Net.load_state_dict(torch.load('./model.pt'))
    Net.eval()

    print("TRAIN ACCURACY")
    print(accuracy(Net, train_loader))

    print("VAL ACCURACY")
    print(accuracy(Net, val_loader))
Code Example #3
def BSGD(X, y):
    start_time = time.time()
    Y = util.transformY(y)
    W = np.ones([5, X.shape[1]]) * 1.0 / X.shape[1]
    nabla_list = []
    lambdada = 0.05
    step = 0.001
    iter = 0
    batch_size = 500
    chunk_list = chunks(range(X.shape[0]), batch_size)
    round = int(math.ceil(X.shape[0]/(batch_size+0.0)))

    while iter < 30000:
        # iteratively update
        r = chunk_list[iter%round]
        sumover = np.zeros(X[r].shape[0]).reshape([1, X[r].shape[0]])
        for j in xrange(5):
            sumover += np.exp(W[j]*(X[r].transpose()))
        softmax = np.exp(W * (X[r].transpose()))/sumover
        temp = Y[r].T - softmax
        nabla = temp * X[r] - lambdada * W

        # adaptive learning rate
        #step = 10.0/(1000+iter)# adaptive learning rate
        W = W + step * nabla

        # if np.sum(nabla * step) < 0.001:
        #     break
        # train prediction
        if iter%round == 0:
            #nabla_list.append(step*np.linalg.norm(nabla))
            Sumover = 0
            for j in xrange(5):
                Sumover += np.exp(W[j]*(X.transpose()))
            distri = np.exp(W * (X.transpose()))/Sumover

            # hard prediction
            t = np.argmax(distri, axis=0)
            t = t + 1
            print eval.accuracy(t, y)

        iter += 1

    print 'time: %ss' % (time.time()-start_time)
    #plt.plot(nabla_list)
    #plt.show()
    return W
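BSGD above relies on a chunks(range(X.shape[0]), batch_size) helper that is not included in the excerpt. A minimal sketch of one plausible implementation, assuming it simply splits the row indices into consecutive mini-batches (illustrative only):

def chunks(indices, batch_size):
    # Split a sequence of row indices into consecutive batches;
    # the last batch may be smaller than batch_size.
    indices = list(indices)
    return [indices[i:i + batch_size] for i in range(0, len(indices), batch_size)]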
Code Example #4
STEPS = 100
dataset_size = 1111
save_dir = './'  # path where the trained network is saved
global_step = tf.Variable(0, trainable=False)
log_dir = './log/'
X = tf.placeholder(tf.float32, shape=(None, SIZE, SIZE, 3), name="input_x")
Y = tf.placeholder(tf.float32, shape=(None, SIZE, SIZE, 1), name="input_y")
y_ = PSP_model.y  # final model output (a list)

learning_rate = tf.train.exponential_decay(
    0.1, global_step, STEPS // 50, 0.9, staircase=True)
tf.summary.scalar('learning_rate', learning_rate)
loss = eval.accuracy(Y, y_)  # the loss should also include L2 regularization
tf.summary.scalar('loss', loss)
train_step = tf.train.AdamOptimizer(learning_rate).minimize(
    loss, global_step=global_step)  # could be improved with a moving average of the weights
saver = tf.train.Saver()
merged = tf.summary.merge_all()
with tf.Session() as sess:
    writer = tf.summary.FileWriter(log_dir, sess.graph)
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    for i in range(STEPS):
        start = (i * batch_size) % dataset_size
        end = min(start + batch_size, dataset_size)
        _, _ = sess.run([train_step, global_step],
                        feed_dict={
                            x: X[start:end],
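The excerpt above is cut off inside the feed_dict; as written, the feed key x is undefined and the value slices the placeholder X itself rather than the training data. A minimal sketch of how such a TF1 training step inside the with tf.Session() block is typically fed, assuming train_images and train_labels are NumPy arrays shaped (dataset_size, SIZE, SIZE, 3) and (dataset_size, SIZE, SIZE, 1); these array names are hypothetical, not part of the original snippet:

    for i in range(STEPS):
        start = (i * batch_size) % dataset_size
        end = min(start + batch_size, dataset_size)
        # Feed the X/Y placeholders with the current NumPy batch (hypothetical arrays).
        _, summary, step_val = sess.run(
            [train_step, merged, global_step],
            feed_dict={X: train_images[start:end],
                       Y: train_labels[start:end]})
        writer.add_summary(summary, global_step=step_val)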
Code Example #5
         handle_feat.remove()
         handle_grad.remove()
         atten = grad_cam(grad_block, fmap_block)
         att_loss = criterion(out, targets)
         out, x1, x2, loss_1, loss_2= model(img,cam=False,att=atten)
         loss = criterion(out, targets)
         #loss = (1-(1/(epoch+1))) * loss + (1/(epoch+1)) * att_loss + (1-(1/(epoch+1))) * lambda_ * loss_1.sum()
         loss = 0.4 * loss + 0.6 * att_loss + lambda_ * (loss_1.sum() + loss_2.sum())
     else:
         out = model(img)
         loss = criterion(out, targets)
     if args.model_type != 'norm' and args.model_type != 'gradcam' and args.model_type != 'atten':
         running_loss_1 += loss_1.sum().item() * targets.size(0)
         #running_loss_2 += loss_2.sum().item() * targets.size(0)
     running_loss += loss.item() * targets.size(0)
     prec1, prec5 = accuracy(out.data, targets.data, topk=(1, 5))
     #_, pred = torch.max(out, 1)     # index of the max prediction (the predicted label)
     #num_correct = (pred == targets).sum()
     #accuracy = (pred == targets).float().mean()
     #running_acc += num_correct.item()
     running_acc_1 += prec1.item()
     running_acc_5 += prec5.item()
     # backward pass
     optimizer.zero_grad()
     
     if args.model_type != 'norm' and args.model_type != 'gradcam' and args.model_type != 'atten':
         #compute and add gradient
         x1.retain_grad()
         #x2.retain_grad()
 
         loss_1.backward(Variable(torch.ones(*loss_1.size()).cuda(0)), retain_graph=True)
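The fragment above removes two hooks (handle_feat, handle_grad) and then builds an attention map from fmap_block and grad_block, but the hook registration happens outside the excerpt. A minimal sketch of how such Grad-CAM hooks are commonly registered in PyTorch, assuming target_layer is the convolutional layer being visualized (the names are illustrative, not the project's code):

fmap_block, grad_block = [], []

def forward_hook(module, inputs, output):
    # Save the layer's feature maps during the forward pass.
    fmap_block.append(output.detach())

def backward_hook(module, grad_input, grad_output):
    # Save the gradient w.r.t. the layer's output during the backward pass.
    grad_block.append(grad_output[0].detach())

handle_feat = target_layer.register_forward_hook(forward_hook)
handle_grad = target_layer.register_full_backward_hook(backward_hook)
# ... run the forward and backward pass for the Grad-CAM branch, then:
handle_feat.remove()
handle_grad.remove()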
Code Example #6
def train_joint_conv_net(w2vFile,
                         dataFile,
                         labelStructureFile,
                         cfswitch,
                         filter_hs,
                         n_epochs=1000,
                         batch_size=50,
                         feature_maps=100,
                         hasmlphidden=False,
                         usefscore=False):
    """
    function: learning and testing sentence level Question Classification Task
            in a joint fashion, i.e., adding the loss function of coarse label prediction
            and fine label prediction together.
    :param w2vFile: the path of the word embedding file(pickle file with numpy
            array value, produced by word2vec.py module)
    :param dataFile: the dataset file produced by process_data.py module
    :param labelStructureFile: a file that describes label structure of coarse and fine
            grains. It is produced in produce_data.py in outputlabelstructure()
    :param filter_hs: sliding window sizes.
            *** warning ***
            you cannot just change window size here, if you want to use a different window
            for the experiment. YOU NEED TO RE-PRODUCE A NEW DATASET IN process_data.py
            WITH THE CORRESPONDING WINDOW SIZE.
    :param n_epochs: the number of epochs the training needs to run
    :param batch_size: the size of the mini-batch
    :param feature_maps: how many dimensions you want the abstract sentence
            representation to be
    :param mlphiddensize: the size of the hidden layer in MLP
    :param logFile: the output file of the brief info of each epoch results, basically a
            save for the print out
    :param logTest: keep track of results on test set
    :return: a tuple of best fine grained prediction accuracy and its corresponding
            coarse grained prediction accuracy
    """
    """
    Loading and preparing data
    """
    datasets = load(dataFile)
    clbl_vec, flbl_vec = process_qc.label_structure(labelStructureFile)
    trainDataSetIndex = 0
    testDataSetIndex = 1
    validDataSetIndex = 2
    sentenceIndex = 0
    clblIndex = 1  # coarse label(clbl) index in the dataset structure
    flblIndex = 2  # fine label(flbl) index

    if cfswitch == 'c':
        lblIndex = clblIndex
        label_vec = clbl_vec
    elif cfswitch == 'f':
        lblIndex = flblIndex
        label_vec = flbl_vec
    else:
        print 'wrong arg value in: cfswitch!'
        sys.exit()

    label_size = len(label_vec)

    if hasmlphidden:
        layer_size = [feature_maps * len(filter_hs), 100, label_size]
    else:
        layer_size = [feature_maps * len(filter_hs), label_size]

    # train part
    train_y = shared_store(datasets[trainDataSetIndex][lblIndex])
    train_x = shared_store(datasets[trainDataSetIndex][sentenceIndex])

    # test part
    gold_test_y = datasets[testDataSetIndex][lblIndex]
    test_x = shared_store(datasets[testDataSetIndex][sentenceIndex])

    # valid part
    gold_valid_y = datasets[validDataSetIndex][lblIndex]
    valid_x = shared_store(datasets[validDataSetIndex][sentenceIndex])

    w2v = load(w2vFile)
    img_w = w2v.shape[1]  # the dimension of the word embedding
    img_h = len(datasets[trainDataSetIndex][sentenceIndex]
                [0])  # length of each sentence
    filter_w = img_w  # word embedding dimension
    image_shapes = []
    filter_shapes = []
    for i in xrange(len(filter_hs)):
        image_shapes.append((batch_size, 1, img_h, img_w * filter_hs[i]))
        filter_shapes.append((feature_maps, 1, 1, filter_w * filter_hs[i]))

    pool_size = (img_h, 1)

    train_size = len(datasets[trainDataSetIndex][sentenceIndex])
    print 'number of sentences in training set: ' + str(train_size)
    print 'max sentence length: ' + str(
        len(datasets[trainDataSetIndex][sentenceIndex][0]))
    print 'train data shape: ' + str(
        datasets[trainDataSetIndex][sentenceIndex].shape)
    print 'word embedding dim: ' + str(w2v.shape[1])
    """
    Building model in theano language, less comments here.
    You can refer to Theano web site for more details
    """
    batch_index = T.lvector('hello_batch_index')
    x = T.itensor3('hello_x')
    y = T.ivector('hello_y')
    w2v_shared = theano.shared(value=w2v, name='w2v', borrow=True)
    rng = np.random.RandomState(3435)

    conv_layer_outputs = []
    conv_layers = []
    for i in xrange(len(filter_hs)):
        input = w2v_shared[x.flatten()].reshape(
            (x.shape[0], 1, x.shape[1],
             x.shape[2] * img_w))[:, :, :, 0:filter_hs[i] * img_w]

        conv_layer = LeNetConvPoolLayer(rng,
                                        input=input,
                                        filter_shape=filter_shapes[i],
                                        poolsize=pool_size,
                                        image_shape=image_shapes[i],
                                        non_linear="relu")

        conv_layers.append(conv_layer)
        conv_layer_outputs.append(conv_layer.output.flatten(2))

    mlp_input = T.concatenate(conv_layer_outputs, 1)

    classifier = MLPDropout(
        rng=rng,
        input=mlp_input,
        layer_sizes=layer_size,  # [feature_maps * len(filter_hs), label_size],
        dropout_rate=0.5,
        activation=Iden)

    params = []
    for conv_layer in conv_layers:
        params += conv_layer.params
    params += classifier.params

    cost = classifier.negative_log_likelihood(y)
    updates = sgd_updates_adadelta(params, cost)

    n_batches = train_x.shape.eval()[0] / batch_size

    train_model = theano.function(
        inputs=[batch_index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_x[batch_index],
            y: train_y[batch_index],
        },
    )
    """
    Building test model
    """
    test_conv_layer_outputs = []
    for i, conv_layer in enumerate(conv_layers):
        test_input = w2v_shared[x.flatten()].reshape(
            (x.shape[0], 1, x.shape[1],
             x.shape[2] * img_w))[:, :, :, 0:filter_hs[i] * img_w]
        test_conv_layer_outputs.append(
            conv_layer.conv_layer_output(test_input,
                                         (test_x.shape.eval()[0], 1, img_h,
                                          img_w * filter_hs[i])).flatten(2))
    test_prediction = classifier.predict(
        T.concatenate(test_conv_layer_outputs, 1))

    # test on test set
    test_model = theano.function(inputs=[],
                                 outputs=test_prediction,
                                 givens={
                                     x: test_x,
                                 })

    # test on valid set
    valid_model = theano.function(inputs=[],
                                  outputs=test_prediction,
                                  givens={
                                      x: valid_x,
                                  })
    """
    Training part
    """
    print 'training....'
    best_valid_ep = 0
    best_valid_acc = 0.
    best_test_ep = 0
    best_test_acc = 0.
    final_acc = 0.
    epoch = 0
    last_acc = 0.

    # create gold value sequences, required by the eval.py
    with open('../exp/goldrs', 'w') as writer:
        for lbl in gold_test_y:
            writer.write(str(lbl) + '\n')

    # training loop
    while (epoch < n_epochs):
        epoch += 1
        print '************* epoch ' + str(epoch)
        batch_indexes = range(train_size)
        rng.shuffle(batch_indexes)
        for bchidx in xrange(n_batches):
            random_indexes = batch_indexes[bchidx * batch_size:(bchidx + 1) *
                                           batch_size]
            train_cost = train_model(random_indexes)

        test_y_preds = test_model()
        valid_y_preds = valid_model()
        if usefscore:
            test_acc = eval.fscore(gold_test_y, test_y_preds)
            valid_acc = eval.fscore(gold_valid_y, valid_y_preds)
        else:
            test_acc = eval.accuracy(gold_test_y, test_y_preds)
            valid_acc = eval.accuracy(gold_valid_y, valid_y_preds)
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_valid_ep = epoch
            if final_acc < test_acc:
                final_acc = test_acc
                with open('../exp/predictions', 'w') as writer:
                    for lblidx in test_y_preds:
                        writer.write(str(lblidx) + '\n')
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_test_ep = epoch
            # output predictions

        print 'test accuracy is: ' + str(test_acc)
        print 'valid accuracy is: ' + str(valid_acc)
        print 'current best valid prediction accuracy is: ' + str(
            best_valid_acc) + ' at epoch ' + str(best_valid_ep)
        print 'current best final prediction accuracy is: ' + str(
            final_acc) + ' at epoch ' + str(best_valid_ep)
        print 'current best test prediction accuracy is: ' + str(
            best_test_acc) + ' at epoch ' + str(best_test_ep)
        last_acc = test_acc
    # final_acc = last_acc
    return final_acc
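A hedged usage sketch for train_joint_conv_net, assuming word2vec.py and process_data.py have already produced the pickled inputs described in the docstring; every path and value below is illustrative, not taken from the project:

if __name__ == '__main__':
    best_acc = train_joint_conv_net(
        w2vFile='../data/w2v.pickle',            # embeddings from word2vec.py (hypothetical path)
        dataFile='../data/qc_dataset.pickle',    # dataset from process_data.py (hypothetical path)
        labelStructureFile='../data/label_structure.txt',
        cfswitch='f',          # 'c' = coarse labels, 'f' = fine labels
        filter_hs=[3, 4, 5],   # must match the window sizes the dataset was built with
        n_epochs=50,
        batch_size=50,
        feature_maps=100,
        usefscore=False,
    )
    print('best prediction accuracy: ' + str(best_acc))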
Code Example #7
def train(data, lst_label):
    print "START TRAINING!"
    label = getLabel(lst_label)
    label = sps.csr_matrix(label)
    num_feature = data.shape[1]
    epsilon = 0.0005  # learning rate
    lamda = 0.001  # regularization factor
    num_batch = 2500
    momentum = 0.4
    maxIter = 1500

    W = init(num_feature)
    W = sps.csr_matrix(W)
    prev_W_grad = sps.csr_matrix(W.shape)
    prev_train_error = 10.0

    prior = [
        0.2 / 0.10199, 0.2 / 0.08965, 0.2 / 0.14196, 0.2 / 0.29750,
        0.2 / 0.36689
    ]
    prior_m = np.array(prior)
    prior_m = np.matrix(prior_m)
    prior_m = prior_m.transpose()
    prior_m = np.repeat(prior_m, num_feature, axis=1)
    prior_m = sps.csr_matrix(prior_m)

    for iter in xrange(maxIter):
        shuffled_data, shuffled_label = shuffle(data, label)
        train_error = 0
        batch_size = data.shape[0] / num_batch
        for batch in xrange(num_batch):
            # print "start batch"
            # get batch size
            start = batch * batch_size
            end = (batch +
                   1) * batch_size if batch < num_batch - 1 else data.shape[0]
            batch_size = batch_size if batch < num_batch - 1 else data.shape[
                0] - batch * batch_size
            batch_data = shuffled_data[start:end, :]
            batch_label = shuffled_label[start:end, :]
            # print "start calculating prob"
            prob = getProb(W, batch_data)
            # calculate gradient
            # print "start calculating gradient"
            delta = batch_label - prob
            # train_error += delta.multiply(delta).sum()
            train_error += getError(batch_label, prob)
            dW = delta.transpose().dot(batch_data)
            # dW -= lamda * W.multiply(prior_m)
            dW -= lamda * W
            # update weights
            # print "start updating weights"
            W_grad = momentum * prev_W_grad + epsilon * dW.multiply(prior_m)
            W += W_grad
            prev_W_grad = W_grad
        epsilon *= 0.995
        train_error = train_error / data.shape[0]
        if math.fabs(train_error - prev_train_error) < 1e-8:
            break
        prev_train_error = train_error
        print "train_error:", train_error, " iter: ", iter

    print "training finished!"
    prob = getProb(W, data)
    # hard predict
    lst_pred_hard = pred_hard_helper(prob)
    # soft predict
    lst_pred_soft = pred_soft_helper(prob)
    print "accuracy:", eval.accuracy(lst_pred_hard,
                                     lst_label), " rmse:", eval.rmse(
                                         lst_pred_soft, lst_label)
    # save_model(W, "cf_model")
    return W
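The prediction helpers used at the end of train are not shown. A minimal sketch of a hard-prediction helper, assuming prob holds one row of class probabilities per sample and the classes are labeled 1 through 5 (illustrative only, not the project's code):

import numpy as np

def pred_hard_helper(prob):
    # Hard prediction: take the most probable class per row; labels are 1-based (1..5).
    prob = np.asarray(prob)
    return list(np.argmax(prob, axis=1) + 1)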