Example #1
def trial(N_HIDDEN_LAYERS, NUM_UNITS, OUTPUT_TYPE, MAIN_LOSS_TYPE, LAMBDA,
          FOLD, FINTUNE_SNAPSHOT, FINTUNE_SCALE):
    # BN parameters
    batch_size = 97
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    # alpha = .15
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    #NUM_UNITS = 25
    print("NUM_UNITS = " + str(NUM_UNITS))
    #N_HIDDEN_LAYERS = 1
    print("N_HIDDEN_LAYERS = " + str(N_HIDDEN_LAYERS))

    # Training parameters
    num_epochs = 1000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # 0. means no dropout
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    #LR_start = .003
    LR_start = 0.000003
    print("LR_start = " + str(LR_start))
    #LR_fin = 0.0000003
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...
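    # Note on the schedule (sanity check): binary_net.train presumably multiplies the LR
    # by LR_decay once per epoch, so LR_start * LR_decay**num_epochs == LR_fin by construction.
    # With the values above (LR_fin == LR_start), LR_decay == 1.0 and the LR stays constant.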

    # replace the dataset
    print('Loading SFEW2 dataset...')
    [train_x, train_y, val_x, val_y] = SFEW2.load_train_val()
    print(train_x.shape)
    print(train_y.shape)
    print(val_x.shape)
    print(val_y.shape)
    print('last training minibatch size: ' +
          str(train_x.shape[0] - train_x.shape[0] // batch_size * batch_size) +
          ' / ' + str(batch_size))
    print('the last training minibatch should not be too small (unless it is 0); '
          'prefer decreasing batch_size over adding more minibatches.')
    print('minibatch size: ' + str(batch_size))
    print('suggested minibatch size: ' + str(
        math.ceil(
            float(train_x.shape[0]) /
            math.ceil(float(train_x.shape[0]) / 100))))
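    # Worked example (hypothetical sample count, for illustration only): with 891 training
    # samples, ceil(891 / ceil(891 / 100)) = ceil(891 / 9) = 99, so 891 splits into 9 full
    # minibatches of 99 with no small remainder.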

    print('Building the MLP...')
    # Prepare Theano variables for inputs and targets
    input = T.matrix('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, train_x.shape[1]),
                                    input_var=input)

    mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_in)

    for k in range(N_HIDDEN_LAYERS):

        # pretrain-finetune
        if (k == 0):
            # fixed num_units
            mlp = binary_net.DenseLayer(
                mlp,
                binary=binary,
                stochastic=stochastic,
                H=H,
                W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity,
                num_units=1500)

            # scale down the LR of the transferred dense layer
            print('scale down the LR of the transferred dense layer from',
                  str(mlp.W_LR_scale))
            mlp.W_LR_scale *= np.float32(FINTUNE_SCALE)
            print('to', str(mlp.W_LR_scale))
        else:
            mlp = binary_net.DenseLayer(
                mlp,
                binary=binary,
                stochastic=stochastic,
                H=H,
                W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity,
                num_units=NUM_UNITS)

        mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)

        mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_hidden)

        # pretrain-finetune
        # only restore the first layer group
        if (k == 0):
            if (FINTUNE_SNAPSHOT != 0):
                print('Load ./W-%d.npz' % FINTUNE_SNAPSHOT)
                with np.load('./W-%d.npz' % FINTUNE_SNAPSHOT) as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                param_values = param_values[0:6]
                lasagne.layers.set_all_param_values(mlp, param_values)

    mlp = binary_net.DenseLayer(mlp,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=7)

    mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    # network output: BN (OUTPUT_TYPE 'C') or SGN (OUTPUT_TYPE 'D')
    if OUTPUT_TYPE == 'C':
        pass  # keep the batch-normalized output as-is
    elif OUTPUT_TYPE == 'D':
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
    else:
        assert (False)

    # loss weight nodes
    SPARSITY = 0.9
    SPARSITY_MAP = (np.float32(train_x == -1)).mean(0)
    LOSS_WEIGHT_1 = 1. + input * (2. * SPARSITY - 1)
    LOSS_WEIGHT_1 /= 4 * SPARSITY * (1 - SPARSITY)  # fixed 1->-1:5, -1->1:5/9 weights
    LOSS_WEIGHT_2 = 1. + input * (2. * SPARSITY_MAP - 1)
    LOSS_WEIGHT_2 /= 4 * SPARSITY_MAP * (1 - SPARSITY_MAP)  # weights considering each element's prior probability
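    # Worked check of the fixed weighting (follows from SPARSITY = 0.9):
    #   input == +1 : (1 + (2*0.9 - 1)) / (4 * 0.9 * 0.1) = 1.8 / 0.36 = 5
    #   input == -1 : (1 - (2*0.9 - 1)) / (4 * 0.9 * 0.1) = 0.2 / 0.36 = 5/9
    # so errors on the rare +1 entries weigh 9x more than errors on the common -1 entries.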

    # train loss nodes
    train_output = lasagne.layers.get_output(mlp, deterministic=False)
    if MAIN_LOSS_TYPE == 'SH':
        train_loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        train_loss = T.mean(
            T.sqr(T.maximum(0., (1. - target * train_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        train_loss = T.mean(
            T.sqr(T.maximum(0., (1. - target * train_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        train_loss = T.mean(T.maximum(0., 1. - target * train_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        train_loss = T.mean(
            T.maximum(0., (1. - target * train_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        train_loss = T.mean(
            T.maximum(0., (1. - target * train_output)) * LOSS_WEIGHT_2)
    else:
        assert (False)

    # + sparse penalty
    if LAMBDA > 0:
        train_pixel_wise_density = T.mean(
            T.reshape((train_output + 1.) / 2.,
                      [train_output.shape[0], train_output.shape[1] // 10, 10]),
            axis=2)
        train_penalty = LAMBDA * T.mean(
            T.sqr(train_pixel_wise_density - (1. - SPARSITY)))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty
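    # Shape note on the penalty: train_output is viewed as consecutive groups of 10 units
    # (its width must be divisible by 10 for the reshape), each group's mean activation is
    # rescaled to [0, 1], and that density is pulled toward 1 - SPARSITY = 0.1. With the
    # 7-unit output of this network, this branch presumably expects LAMBDA == 0.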

    # acc
    train_acc = T.mean(T.eq(T.argmax(train_output, axis=1),
                            T.argmax(target, axis=1)),
                       dtype=theano.config.floatX)

    # grad nodes
    if binary:
        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(train_loss, mlp)
        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
                                       learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)

        # other parameters updates
        params = lasagne.layers.get_all_params(mlp,
                                               trainable=True,
                                               binary=False)
        updates = OrderedDict(
            list(updates.items()) +
            list(lasagne.updates.adam(loss_or_grads=train_loss,
                                      params=params,
                                      learning_rate=LR).items()))

    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=train_loss,
                                       params=params,
                                       learning_rate=LR)

    # val loss nodes
    # must be created after grad nodes
    val_output = lasagne.layers.get_output(mlp, deterministic=True)
    if MAIN_LOSS_TYPE == 'SH':
        val_loss = T.mean(T.sqr(T.maximum(0., 1. - target * val_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        val_loss = T.mean(
            T.sqr(T.maximum(0., (1. - target * val_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        val_loss = T.mean(
            T.sqr(T.maximum(0., (1. - target * val_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        val_loss = T.mean(T.maximum(0., 1. - target * val_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        val_loss = T.mean(
            T.maximum(0., (1. - target * val_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        val_loss = T.mean(
            T.maximum(0., (1. - target * val_output)) * LOSS_WEIGHT_2)

    # + sparse penalty
    if LAMBDA > 0:
        val_pixel_wise_density = T.mean(
            T.reshape((val_output + 1.) / 2.,
                      [val_output.shape[0], val_output.shape[1] // 10, 10]),
            axis=2)
        val_penalty = LAMBDA * T.mean(
            T.sqr(val_pixel_wise_density - (1. - SPARSITY)))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty

    # acc
    val_acc = T.mean(T.eq(T.argmax(val_output, axis=1), T.argmax(target,
                                                                 axis=1)),
                     dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function(
        [input, target, LR],
        [train_loss, train_penalty, train_acc, train_output],
        updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target],
                             [val_loss, val_penalty, val_acc, val_output])

    print('Training...')
    train_x = binary_net.MoveParameter(train_x)
    binary_net.train(train_fn, val_fn, batch_size, LR_start, LR_decay,
                     num_epochs, train_x, train_y, val_x, val_y)
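
# A minimal usage sketch (illustrative argument values only, not taken from the source):
# trial(N_HIDDEN_LAYERS=2, NUM_UNITS=100, OUTPUT_TYPE='C', MAIN_LOSS_TYPE='SH',
#       LAMBDA=0, FOLD=0, FINTUNE_SNAPSHOT=0, FINTUNE_SCALE=0.1)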

Example #2
def trial(N_HIDDEN_LAYERS, NUM_UNITS, OUTPUT_TYPE, MAIN_LOSS_TYPE, LAMBDA,
          FOLD):
    # BN parameters
    batch_size = 100
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    # alpha = .15
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    #NUM_UNITS = 25
    print("NUM_UNITS = " + str(NUM_UNITS))
    #N_HIDDEN_LAYERS = 1
    print("N_HIDDEN_LAYERS = " + str(N_HIDDEN_LAYERS))

    # Training parameters
    num_epochs = 1000000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # 0. means no dropout
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    #LR_start = .003
    LR_start = 0.000003
    print("LR_start = " + str(LR_start))
    #LR_fin = 0.0000003
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    # replace the dataset
    print('Loading SFEW2 dataset...')
    [train_x] = SFEW2.load_lfw()
    assert (train_x.shape[0] == 26404)
    train_x = train_x[0:26400, :]
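    # 26400 = 264 * batch_size (100), so every minibatch is full; the last 4 of the 26404
    # samples returned by load_lfw() are simply dropped.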
    [val_x, _, _, _] = SFEW2.load_train_val()

    print(train_x.shape)
    print(val_x.shape)
    print('last training minibatch size: ' +
          str(train_x.shape[0] - train_x.shape[0] // batch_size * batch_size) +
          ' / ' + str(batch_size))
    print('the last training minibatch should not be too small (unless it is 0); '
          'prefer decreasing batch_size over adding more minibatches.')
    print('minibatch size: ' + str(batch_size))
    print('suggested minibatch size: ' + str(
        math.ceil(
            float(train_x.shape[0]) /
            math.ceil(float(train_x.shape[0]) / 100))))

    ##############################################################################################

    print('Building the MLP...')
    # Prepare Theano variables for inputs and targets
    input = T.matrix('inputs')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, train_x.shape[1]),
                                    input_var=input)

    mlp = lasagne.layers.DropoutLayer(
        mlp, p=0)  # train BAE-2: no dropout on input & BAE-1 layer

    for k in range(N_HIDDEN_LAYERS):
        if (k == 0):
            mlp = binary_net.DenseLayer(
                mlp,
                binary=binary,
                stochastic=stochastic,
                H=H,
                W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity,
                num_units=NUM_UNITS)
        elif (k == 1):
            mlp = binary_net.DenseLayer(
                mlp,
                binary=binary,
                stochastic=stochastic,
                H=H,
                W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity,
                num_units=NUM_UNITS * 2)
        else:
            assert (False)

        #if(k==0):
        #    print('scale down the LR of transfered dense layer from', str(mlp.W_LR_scale))
        #    mlp.W_LR_scale = 0
        #    print('to', str(mlp.W_LR_scale))

        if (k == 0):
            # BAE1 encoder: BN
            mlp = lasagne.layers.BatchNormLayer(mlp,
                                                epsilon=epsilon,
                                                alpha=alpha)
        elif (k == 1):
            # BAE2 encoder: do not use BN, to encourage sparsity
            pass
        else:
            # further layer use BN
            mlp = lasagne.layers.BatchNormLayer(mlp,
                                                epsilon=epsilon,
                                                alpha=alpha)

        # midactivation is taken just before the hard tanh;
        # the encoder and decoder should not use BatchNorm here,
        # so the "l1 reg"-style penalty can act on the midactivation
        if (k == 1):
            mlp_midactivation = mlp

        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)

        if (k == 0):
            mlp = lasagne.layers.DropoutLayer(
                mlp, p=0)  # train BAE-2: no dropout on input & BAE-1 layer
        else:
            mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_hidden)

        # pretrain-finetune
        # only restore the first layer group
        if (k == 0):
            print('Load ./W-1168.npz')
            with np.load('./W-1168.npz') as f:
                param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            param_values = param_values[0:6]
            lasagne.layers.set_all_param_values(mlp, param_values)

            mlp_groundtruth = mlp

    mlp = binary_net.DenseLayer(mlp,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=1500)

    mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)
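    # Architecture note: the 1500-unit DenseLayer above acts as the decoder. Its training
    # target below is not the raw input but mlp_groundtruth, the output of the frozen,
    # pre-loaded first layer group, so this trial appears to train BAE-2 on top of the
    # BAE-1 code.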

    # network output: BN (OUTPUT_TYPE 'C') or SGN (OUTPUT_TYPE 'D')
    if OUTPUT_TYPE == 'C':
        pass  # keep the batch-normalized output as-is
    elif OUTPUT_TYPE == 'D':
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
    else:
        assert (False)
    '''
    # equal transform validation
    # 1 set the AE transform to I
    # 2 modify AE DenseLayer.get_output_for() to use W (0/1) instead of Wb (+1/-1)
    # 3 set the encoder's dropout to 0
    # 4 comment out the encoder's and decoder's BatchNormLayer, and adjust set_all_param_values
    # the train loss should then be 0
    pv = lasagne.layers.get_all_param_values(mlp)
    pv[2] = np.identity(1500, np.float64)
    pv[4] = np.identity(1500, np.float64)
    lasagne.layers.set_all_param_values(mlp, pv)
    '''
    '''
    # loss weight nodes
    SPARSITY = 0.9
    SPARSITY_MAP = (np.float32(train_x==-1)).mean(0)
    LOSS_WEIGHT_1 = 1.+input*(2.*SPARSITY-1)
    LOSS_WEIGHT_1 /= 4*SPARSITY*(1 - SPARSITY)# fixed 1->-1:5 -1->1:5/9 weights
    LOSS_WEIGHT_2 = 1.+input*(2.*SPARSITY_MAP-1)#
    LOSS_WEIGHT_2 /= 4*SPARSITY_MAP*(1 - SPARSITY_MAP)# weights considering element's prior probability
    '''

    # train loss nodes
    '''
    train_output = lasagne.layers.get_output(mlp, deterministic=False)
    if MAIN_LOSS_TYPE=='SH':
        train_loss = T.mean(T.sqr(T.maximum(0.,1.-input*train_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - input * train_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - input * train_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        train_loss = T.mean(T.maximum(0.,1.-input*train_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        train_loss = T.mean(T.maximum(0., (1. - input * train_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        train_loss = T.mean(T.maximum(0., (1. - input * train_output)) * LOSS_WEIGHT_2)
    else:
        assert(False)
    '''
    [
        train_output_mlp_groundtruth, train_output_mlp_midactivation,
        train_output
    ] = lasagne.layers.get_output([mlp_groundtruth, mlp_midactivation, mlp],
                                  deterministic=False)
    train_loss = T.mean(
        T.maximum(0., 1. - train_output_mlp_groundtruth * train_output))

    # + sparse penalty
    '''
    if LAMBDA>0:
        train_pixel_wise_density = T.mean(T.reshape((train_output+1.)/2., [train_output.shape[0], train_output.shape[1]/10, 10]), axis=2)
        train_penalty = LAMBDA*T.mean(T.sqr(train_pixel_wise_density - (1.-SPARSITY)))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty
    '''
    if LAMBDA > 0:
        train_penalty = LAMBDA * T.mean(
            T.maximum(0., 1. + train_output_mlp_midactivation))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty
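    # Note on the penalty above: T.maximum(0., 1. + x) is a hinge that is zero only when
    # x <= -1, so it pushes the pre-binarization mlp_midactivation toward the -1 ("off")
    # state; this is the "l1 reg"-style sparsity pressure noted in the layer comments above.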

    # grad nodes
    if binary:
        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(train_loss, mlp)

        # untrainable W1
        assert (len(W) == 3)
        assert (len(W_grads) == 3)
        W = W[1:len(W)]
        W_grads = W_grads[1:len(W_grads)]
        assert (len(W) == 2)
        assert (len(W_grads) == 2)

        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
                                       learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)

        # other parameters updates
        params = lasagne.layers.get_all_params(mlp,
                                               trainable=True,
                                               binary=False)

        # untrainable b1 bn1
        assert (len(params) == 7)
        assert (params[0].name == 'b')  # fix
        assert (params[1].name == 'beta')  # fix
        assert (params[2].name == 'gamma')  # fix
        assert (params[3].name == 'b')
        assert (params[4].name == 'b')
        assert (params[5].name == 'beta')
        assert (params[6].name == 'gamma')
        params = params[3:len(params)]
        assert (len(params) == 4)

        updates = OrderedDict(
            list(updates.items()) +
            list(lasagne.updates.adam(loss_or_grads=train_loss,
                                      params=params,
                                      learning_rate=LR).items()))

    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=train_loss,
                                       params=params,
                                       learning_rate=LR)

    ##############################################################################################

    # val loss nodes
    # must be created after grad nodes
    '''
    val_output = lasagne.layers.get_output(mlp, deterministic=True)
    if MAIN_LOSS_TYPE=='SH':
        val_loss = T.mean(T.sqr(T.maximum(0.,1.-input*val_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - input * val_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - input * val_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        val_loss = T.mean(T.maximum(0.,1.-input*val_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        val_loss = T.mean(T.maximum(0., (1. - input * val_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        val_loss = T.mean(T.maximum(0., (1. - input * val_output)) * LOSS_WEIGHT_2)
    '''
    [val_output_mlp_groundtruth, val_output_mlp_midactivation, val_output
     ] = lasagne.layers.get_output([mlp_groundtruth, mlp_midactivation, mlp],
                                   deterministic=True)
    val_loss = T.mean(
        T.maximum(0., 1. - val_output_mlp_groundtruth * val_output))

    # + sparse penalty
    '''
    if LAMBDA>0:
        val_pixel_wise_density = T.mean(T.reshape((val_output + 1.) / 2., [val_output.shape[0], val_output.shape[1] / 10, 10]), axis=2)
        val_penalty = LAMBDA*T.mean(T.sqr(val_pixel_wise_density - (1. - SPARSITY)))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty
    '''
    if LAMBDA > 0:
        val_penalty = LAMBDA * T.mean(
            T.maximum(0., 1. + val_output_mlp_midactivation))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty

    ##############################################################################################

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, LR], [
        train_loss, train_penalty, train_output_mlp_groundtruth,
        train_output_mlp_midactivation, train_output
    ],
                               updates=updates)

    ##############################################################################################

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input], [
        val_loss, val_penalty, val_output_mlp_groundtruth,
        val_output_mlp_midactivation, val_output
    ])

    ##############################################################################################

    print('Training...')
    train_x = binary_net.MoveParameter(train_x)
    binary_net.train(train_fn, val_fn, batch_size, LR_start, LR_decay,
                     num_epochs, train_x, val_x, mlp)

    print('Save W')
    # saved parameter order: W b BN BN BN BN W b BN BN BN BN
    np.savez('./W.npz', *lasagne.layers.get_all_param_values(mlp))
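
# A minimal usage sketch for this trial (illustrative values only, not taken from the source).
# Note that N_HIDDEN_LAYERS must be 2 here: the k == 1 branch defines mlp_midactivation and
# the parameter-count asserts expect exactly three binary DenseLayers.
# trial(N_HIDDEN_LAYERS=2, NUM_UNITS=1000, OUTPUT_TYPE='C', MAIN_LOSS_TYPE='H',
#       LAMBDA=1.0, FOLD=0)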