#     rng=npy_rng
# )

print "\n\n... fine-tuning the whole network"
init_lr = trainer.learningrate
prev_cost = numpy.inf
epc_cost = 0.
patience = 0
avg = 50
crnt_avg = [numpy.inf, ] * avg
hist_avg = [numpy.inf, ] * avg
# fine-tune for finetune_epc epochs; one epoch = 50000 / batchsize minibatch steps
for step in xrange(finetune_epc * 50000 / batchsize):
    # learn
    # one minibatch update; step_fast() reports the cost every verbose_stride
    # steps, and the commented lines below would label that printout
    # if (step - 1) % 500 == 0:
    #     print "normal cost: ",
    cost = trainer.step_fast(verbose_stride=500)
    # if (step - 1) % 500 == 0:
    #     print "gradient cost: ",
    # cost_grad = trainer2.step_fast(verbose_stride=500)
    # re-apply the masks on the first two layers after the update
    # (the third layer's mask is currently disabled)
    apply_mask[0]()
    apply_mask[1]()
    #apply_mask[2]()

    epc_cost += cost
    if step % (50000 / batchsize) == 0 and step > 0:
        # stop rule: keep the accumulated cost of each of the last `avg`
        # epochs in a circular buffer.  `crnt_avg` holds the most recent
        # `avg` epochs and `hist_avg` the `avg` epochs before those; once
        # the older window's total cost is lower than the newer one's, the
        # cost has stopped improving and fine-tuning ends.
        ind = (step / (50000 / batchsize)) % avg
        hist_avg[ind] = crnt_avg[ind]
        crnt_avg[ind] = epc_cost
        if sum(hist_avg) < sum(crnt_avg):
            break
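# A minimal, self-contained sketch of the stopping criterion used above, pulled
# out as a plain function for clarity.  It is illustrative only: the function
# name and the toy numbers in the examples are not part of the original code.
def window_stop_rule(epoch_costs, window):
    """Return True once the cost stops improving.

    `epoch_costs` is the list of per-epoch costs recorded so far; `window`
    plays the role of `avg` above.  The total cost of the most recent
    `window` epochs is compared against the total of the `window` epochs
    before them, mirroring the `hist_avg` / `crnt_avg` buffers.
    """
    if len(epoch_costs) < 2 * window:
        return False                             # not enough history yet
    crnt = epoch_costs[-window:]                 # most recent window
    hist = epoch_costs[-2 * window:-window]      # the window before it
    return sum(hist) < sum(crnt)                 # older window already better -> stop

# e.g. window_stop_rule([5., 4., 3., 2.5, 2.2, 2.0], window=3) -> False (still improving)
#      window_stop_rule([3., 3., 3., 3.5, 3.5, 3.5], window=3) -> True  (cost went back up)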
    rng=npy_rng
)

print "\n\n... fine-tuning the whole network"
init_lr = trainer.learningrate
prev_cost = numpy.inf
epc_cost = 0.
patience = 0
avg = 50
crnt_avg = [numpy.inf, ] * avg
hist_avg = [numpy.inf, ] * avg
for step in xrange(finetune_epc * 50000 / batchsize):
    # learn
    # label the cost that step_fast() reports every verbose_stride steps
    if (step - 1) % 500 == 0:
        print "normal cost: ",
    cost = trainer.step_fast(verbose_stride=500)
    if (step - 1) % 500 == 0:
        print "gradient cost: ",
    # re-apply the layer-0 mask, then take the gradient-cost update with trainer2
    apply_mask_l0()
    cost_grad = trainer2.step_fast(verbose_stride=500)
    #apply_mask[1]()
    #apply_mask[2]()

    epc_cost += cost
    if step % (50000 / batchsize) == 0 and step > 0:
        # stop rule: the same sliding-window criterion as in the loop above
        ind = (step / (50000 / batchsize)) % avg
        hist_avg[ind] = crnt_avg[ind]
        crnt_avg[ind] = epc_cost
        if sum(hist_avg) < sum(crnt_avg):
            break
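# `init_lr`, `prev_cost` and `patience` are initialised above, but the code
# that uses them is not part of this excerpt.  The sketch below shows one
# common way such variables drive a learning-rate schedule; it is an
# assumption for illustration only, not the original author's implementation,
# and the helper name `decay_on_plateau` is made up here.
def decay_on_plateau(learningrate, prev_cost, epc_cost, patience,
                     max_patience=10, decay=0.9):
    """Shrink the learning rate after `max_patience` non-improving epochs.

    Returns the (possibly decayed) learning rate and the updated patience
    counter; the caller would also remember `epc_cost` as the new
    `prev_cost` for the next epoch.
    """
    if prev_cost <= epc_cost:        # this epoch did not improve the cost
        patience += 1
    if patience > max_patience:      # too many bad epochs in a row
        learningrate *= decay
        patience = 0
    return learningrate, patience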