train_wrong_classification += _wrong_classification.tolist()
            train_batches += 1
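        # optional: decay the shared SGD learning rate when the epoch loss stops improving (currently disabled)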
        # if train_err > last_loss * sgd_lr_decay_threshold:
        #     sgd_lr.set_value((sgd_lr * sgd_lr_decay).eval())
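        # keep this epoch's loss around for the decay check above if it is re-enabled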
        last_loss = train_err
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}\t{:.6f}".format(train_err / train_batches, class_err / train_batches))
        print("  training accuracy:\t{:.6f} %".format(
            train_acc / train_batches * 100))

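        # evaluate on the held-out validation set once per epoch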
        valid_err, valid_acc, valid_wrong_samples, valid_wrong_classification = run_test(val_fn, vlX, vlY, "validation")
        # save backup model every 10 epochs
        if epoch % 10 == 0:
            io.write_model_data([unsupervised_graph, supervised_graph],
                                [best_validation_err, best_validation_acc, sgd_lr.get_value()],
                                LAST_MODEL_PATH)
        # when a new best model (lowest validation error) is found, save it and evaluate on the test set
        if run_parameters.test_model:
            if valid_err < best_validation_err:
                print('NEW BEST MODEL FOUND!')
                best_validation_acc = valid_acc
                best_validation_err = valid_err
                io.write_model_data([unsupervised_graph, supervised_graph],
                                    [best_validation_err, best_validation_acc, sgd_lr.get_value()],
                                    BEST_MODEL_PATH)
                _, _, test_wrong_samples, test_wrong_classification = run_test(val_fn, teX, teY, "test")
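                # persist the misclassified samples from all three splits for later error analysis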
                with open(WRONG_SAMPLES_PATH, 'wb') as f:
                    pickle.dump([train_wrong_samples, train_wrong_classification,
                                 valid_wrong_samples, valid_wrong_classification,
                                 test_wrong_samples, test_wrong_classification], f, pickle.HIGHEST_PROTOCOL)
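        # average the per-epoch loss components over the number of batches for plotting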
        train_loss[-1] /= train_batches
        train_loss1[-1] /= train_batches
        train_loss2[-1] /= train_batches
        train_regularize[-1] /= train_batches
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  training accuracy:\t{:.6f} %".format(
            train_acc / train_batches * 100))

        valid_err, valid_acc = run_test(val_fn, vlX, vlY, "validation")
        # save backup model every 10 epochs
        if epoch % 10 == 0:
            io.write_model_data([unsupervised_graph, supervised_graph], [best_validation_acc], LAST_MODEL_PATH)
        # if best model is found, save best model
        if valid_acc > best_validation_acc:
            print('NEW BEST MODEL FOUND!')
            best_validation_acc = valid_acc
            io.write_model_data([unsupervised_graph, supervised_graph], [best_validation_acc], BEST_MODEL_PATH)
            run_test(val_fn, teX, teY, "test")

    # plot the loss curves collected over training
    plt.clf()
    plt.plot(train_loss, 'r-', label='train_loss')
    plt.plot(train_loss1, 'g-', label='train_loss1')
    plt.plot(train_loss2, 'b-', label='train_loss2')
    plt.plot(train_regularize, 'k-', label='train_regularize')
    plt.xlabel('epoch')
    plt.legend()
    plt.show()