Esempio n. 1
0
    def getGeneratorUpdates(self, loss, aLearningRate, aBeta1):
        """Build the ADAM update map for the generator's parameters.

        Parameters
        ----------
        loss : Theano scalar expression
            Training loss to minimise.
        aLearningRate : float or Theano scalar
            ADAM learning rate.
        aBeta1 : float
            ADAM beta1 (first-moment decay) coefficient.

        Returns
        -------
        OrderedDict
            Parameter -> update-expression map suitable for
            ``theano.function(..., updates=...)``.
        """
        LR = aLearningRate
        mlp = self.gen
        if self.IS_BINARY:
            # Binary weights: ADAM on the straight-through-estimator
            # gradients, then clipping & scaling for binarization.
            Wb_list = lasagne.layers.get_all_params(self.gen, binary=True)
            W_grad_list = binary_net.compute_grads(loss, mlp)

            updates_b0 = lasagne.updates.adam(loss_or_grads=W_grad_list,
                                              params=Wb_list,
                                              learning_rate=LR,
                                              beta1=aBeta1)

            # clipping & scaling for binarization
            updates_b1 = binary_net.clipping_scaling(updates_b0, mlp)

            # Remaining (real-valued) parameters get plain ADAM updates.
            Wr_list = lasagne.layers.get_all_params(mlp,
                                                    trainable=True,
                                                    binary=False)

            # Merge the two update maps: binary params + other params.
            # (The original concatenated .items() lists, which only works
            # on Python 2 — dict views cannot be added on Python 3.
            # Updating a copy is equivalent and version-independent.)
            updates = OrderedDict(updates_b1)
            updates.update(lasagne.updates.adam(loss_or_grads=loss,
                                                params=Wr_list,
                                                learning_rate=LR,
                                                beta1=aBeta1))

        else:
            Wr_list = lasagne.layers.get_all_params(mlp, trainable=True)
            updates = lasagne.updates.adam(loss_or_grads=loss,
                                           params=Wr_list,
                                           learning_rate=LR,
                                           beta1=aBeta1)

        return updates
Esempio n. 2
0
    # if this does not work, try
    # train_0_when_0 = batch_size - T.sum(T.or_(T.argmax(train_output,axis=1),T.argmax(target,axis=1))),dtype=theano.config.floatX)
    train_precision = train_0_when_0 / (train_0_when_0 + train_0_when_1
                                        )  # TP/(TP+FP)
    train_recall = train_0_when_0 / (train_0_when_0 + train_1_when_0
                                     )  # TP/(TP+FN)

    if binary:

        # W updates
        W = lasagne.layers.get_all_params(cnn, binary=True)
        W_grads = binary_net.compute_grads(loss, cnn)
        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
                                       learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, cnn)

        # other parameters updates
        params = lasagne.layers.get_all_params(cnn,
                                               trainable=True,
                                               binary=False)
        updates = OrderedDict(updates.items() + lasagne.updates.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items())

    else:
        params = lasagne.layers.get_all_params(cnn, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss,
                                       params=params,
                                       learning_rate=LR)

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
Esempio n. 3
0
            cnn,
            epsilon=epsilon, 
            alpha=alpha)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)
    
    # squared hinge loss 
    loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))
    
    if binary:
        
        # W updates
        W = lasagne.layers.get_all_params(cnn, binary=True)
        W_grads = binary_net.compute_grads(loss,cnn)
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = binary_net.clipping_scaling(updates,cnn)
        
        # other parameters updates
        params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
        updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR).items())
        
    else:
        params = lasagne.layers.get_all_params(cnn, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX)
    
    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary) 
    # and returning the corresponding training loss:
Esempio n. 4
0
def main():
    """Train a binary CNN lip-reading network and save the best model.

    Hyper-parameters are hard-coded below and echoed to stdout as they
    are set.  If a weight snapshot already exists at the model path it
    is restored before training resumes.
    """
    # BN parameters
    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    W_LR_scale = 1.
    #W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Training parameters
    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    # Set to True to skip training and only evaluate a restored model.
    # (The original passed an undefined name 'justTest' to
    # binary_net.train(), which raised a NameError at the call below.)
    justTest = False
    print("justTest = " + str(justTest))

    # Decaying LR
    LR_start = 0.01
    print("LR_start = " + str(LR_start))
    LR_fin = 0.000003

    # LR_start = 0.01
    # print("LR_start = " + str(LR_start))
    # LR_fin = 1e-6
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might good for the BN moving average...

    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    networkType = 'google'
    dataType = 'TCDTIMIT'

    #networkType='cifar10'
    #dataType='cifar10'

    # these batch sizes work for a GTX 1060 (6GB)
    if dataType == 'TCDTIMIT':
        if networkType == 'google': batch_size = 100
        else: batch_size = 24
    elif dataType == 'cifar10' and networkType == 'cifar10': batch_size = 400
    elif dataType == 'cifar10' and networkType == 'google': batch_size = 1000

    model_name = os.path.expanduser('~/TCDTIMIT/lipreading/TCDTIMIT/results/CNN_binaryNet/lipspeakers_') \
                 + networkType + "_phoneme39_binary" + "_" + dataType
    model_path = model_name + ".npz"
    if not os.path.exists(os.path.dirname(model_path)):
        os.makedirs(os.path.dirname(model_path))

    print('Building the CNN...')
    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = buildCNN(networkType, dataType, input, epsilon, alpha, activation,
                   binary, stochastic, H, W_LR_scale)

    # restore network weights
    if os.path.exists(model_path):
        with np.load(model_path) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            try:
                lasagne.layers.set_all_param_values(cnn, *param_values)
            except Exception:
                # some snapshots store the parameter list as one object
                lasagne.layers.set_all_param_values(cnn, param_values)
            print("\n\n\t Loaded model " + model_path)

    print('Loading ' + dataType + ' dataset...')
    X_train, y_train, X_val, y_val, X_test, y_test = loadDataset(dataType)

    print("Building Functions...")
    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    # W updates: ADAM on the straight-through gradients of the binary
    # weights, followed by clipping/scaling for binarization.
    W = lasagne.layers.get_all_params(cnn, binary=True)
    W_grads = binary_net.compute_grads(loss, cnn)
    updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                   params=W,
                                   learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, cnn)

    # Other parameters: plain ADAM, merged into the same update map.
    # (The original concatenated .items() lists, which only works on
    # Python 2 — dict views cannot be added on Python 3.)
    params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
    updates.update(lasagne.updates.adam(
        loss_or_grads=loss, params=params, learning_rate=LR))

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    binary_net.train(train_fn,
                     val_fn,
                     cnn,
                     batch_size,
                     LR_start,
                     LR_decay,
                     num_epochs,
                     X_train,
                     y_train,
                     X_val,
                     y_val,
                     X_test,
                     y_test,
                     save_name=model_name,
                     shuffle_parts=shuffle_parts,
                     justTest=justTest)
Esempio n. 5
0
    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if args.train:
        if binary:

            # W updates
            W = lasagne.layers.get_all_params(cnn, binary=True)
            W_grads = binary_net.compute_grads(loss, cnn)
            updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                           params=W,
                                           learning_rate=LR)
            updates = binary_net.clipping_scaling(
                updates, cnn)  # weight scaling disabled

            # other parameters updates
            params = lasagne.layers.get_all_params(cnn,
                                                   trainable=True,
                                                   binary=False)
            updates = OrderedDict(updates.items() + lasagne.updates.adam(
                loss_or_grads=loss, params=params, learning_rate=LR).items())

        else:
            params = lasagne.layers.get_all_params(cnn, trainable=True)
            updates = lasagne.updates.adam(loss_or_grads=loss,
                                           params=params,
                                           learning_rate=LR)

        train_fn = theano.function([input, target, LR], loss, updates=updates)
Esempio n. 6
0
def run(binary=False, noise=None, nalpha=0, result_path=None):
    """Train a small (optionally binarized) CNN on CIFAR-10 with label noise.

    Parameters
    ----------
    binary : bool
        When True, use binary_net layers with binary_tanh activations;
        otherwise an ordinary real-valued network with tanh units.
    noise : object or None
        Label-noise specification forwarded to CifarReader.get_train_data().
    nalpha : float
        Noise level forwarded to CifarReader.get_train_data() as ``alpha``.
    result_path : str or None
        Prefix under which "params.txt" and training results are written.
        NOTE(review): assumed to end with a path separator — confirm with
        callers.  When None, the parameter log is skipped (the original
        raised TypeError on the default call ``run()``).
    """
    # BN parameters
    batch_size = 128
    print("batch_size = " + str(batch_size))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    num_epochs = 150
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # default: .2
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5  # default: .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    if binary:
        activation = binary_net.binary_tanh_unit
        print("activation = binary_net.binary_tanh_unit")
    else:
        activation = lasagne.nonlinearities.tanh
        print("activation = lasagne.nonlinearities.tanh")

    # BinaryConnect
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    LR_start = 0.005
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000005  # 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might good for the BN moving average...

    train_set_size = 40000
    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    print("noise = " + str(noise))
    print("nalpha = " + str(nalpha))

    print('Loading CIFAR-10 dataset...')
    cifar = CifarReader("./data/cifar-10-batches-py/")

    train_X, train_y = cifar.get_train_data(n_samples=train_set_size,
                                            noise=noise,
                                            alpha=nalpha)
    valid_X, valid_y = cifar.get_validation_data()
    test_X, test_y = cifar.get_test_data()
    print("train_set_size = " + str(train_y.shape[0]))
    print("validation_set_size = " + str(valid_y.shape[0]))
    print("test_set_size = " + str(test_y.shape[0]))

    # Log output.  Guarded: the original crashed with TypeError when
    # run() was called with the default result_path=None.
    if result_path is not None:
        with open(result_path + "params.txt", "a+") as l:
            print("batch_size = " + str(batch_size), file=l)
            print("alpha = " + str(alpha), file=l)
            print("epsilon = " + str(epsilon), file=l)
            print("num_epochs = " + str(num_epochs), file=l)
            print("dropout_in = " + str(dropout_in), file=l)
            print("dropout_hidden = " + str(dropout_hidden), file=l)
            if binary:
                print("activation = binary_net.binary_tanh_unit", file=l)
            else:
                print("activation = lasagne.nonlinearities.tanh", file=l)
            print("binary = " + str(binary), file=l)
            print("stochastic = " + str(stochastic), file=l)
            print("H = " + str(H), file=l)
            print("W_LR_scale = " + str(W_LR_scale), file=l)
            print("LR_start = " + str(LR_start), file=l)
            print("LR_fin = " + str(LR_fin), file=l)
            print("LR_decay = " + str(LR_decay), file=l)
            print("shuffle_parts = " + str(shuffle_parts), file=l)
            print("noise = " + str(noise), file=l)
            print("nalpha = " + str(nalpha), file=l)
            print("train_set_size = " + str(train_y.shape[0]), file=l)
            print("validation_set_size = " + str(valid_y.shape[0]), file=l)
            print("test_set_size = " + str(test_y.shape[0]), file=l)

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_X = np.reshape(np.subtract(np.multiply(2. / 255., train_X), 1.),
                         (-1, 3, 32, 32))
    valid_X = np.reshape(np.subtract(np.multiply(2. / 255., valid_X), 1.),
                         (-1, 3, 32, 32))
    test_X = np.reshape(np.subtract(np.multiply(2. / 255., test_X), 1.),
                        (-1, 3, 32, 32))

    # flatten targets
    train_y = np.hstack(train_y)
    valid_y = np.hstack(valid_y)
    test_y = np.hstack(test_y)

    # Onehot the targets
    train_y = np.float32(np.eye(10)[train_y])
    valid_y = np.float32(np.eye(10)[valid_y])
    test_y = np.float32(np.eye(10)[test_y])

    # for hinge loss: map {0,1} one-hot rows to {-1,+1}
    train_y = 2 * train_y - 1.
    valid_y = 2 * valid_y - 1.
    test_y = 2 * test_y - 1.

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_in)

    # 32C3-64C3-P2
    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=32,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=64,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)

    cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)

    # 128FP-10FP
    cnn = binary_net.DenseLayer(cnn,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=128)

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)

    cnn = binary_net.DenseLayer(cnn,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=10)

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(
        cnn, nonlinearity=lasagne.nonlinearities.softmax)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if binary:

        # W updates: ADAM on straight-through gradients of the binary
        # weights, followed by clipping/scaling for binarization.
        W = lasagne.layers.get_all_params(cnn, binary=True)
        W_grads = binary_net.compute_grads(loss, cnn)
        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
                                       learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, cnn)

        # other parameters updates, merged into the same map
        params = lasagne.layers.get_all_params(cnn,
                                               trainable=True,
                                               binary=False)
        updates.update(
            lasagne.updates.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR))

    else:
        params = lasagne.layers.get_all_params(cnn, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss,
                                       params=params,
                                       learning_rate=LR)

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    binary_net.train(train_fn,
                     val_fn,
                     cnn,
                     batch_size,
                     LR_start,
                     LR_decay,
                     num_epochs,
                     train_X,
                     train_y,
                     valid_X,
                     valid_y,
                     test_X,
                     test_y,
                     shuffle_parts=shuffle_parts,
                     result_path=result_path)
Esempio n. 7
0
def main():
    """Train a (binary) CNN on CIFAR-10, with one-hot or int-label targets.

    NOTE(review): ``oneHot`` is read from module scope — confirm it is
    defined at file level; it selects squared-hinge loss on one-hot
    targets vs. categorical cross-entropy on integer labels.
    """
    # BN parameters
    batch_size = 200
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Training parameters
    num_epochs = 500
    print("num_epochs = " + str(num_epochs))

    # Decaying LR
    LR_start = 0.01
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might good for the BN moving average...

    train_set_size = 45000
    print("train_set_size = " + str(train_set_size))
    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    print('\nLoading CIFAR-10 dataset...')

    train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
    valid_set = CIFAR10(which_set="train", start=train_set_size, stop=50000)
    test_set = CIFAR10(which_set="test")

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., train_set.X), 1.), (-1, 3, 32, 32))
    valid_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., valid_set.X), 1.), (-1, 3, 32, 32))
    test_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., test_set.X), 1.), (-1, 3, 32, 32))

    # flatten targets
    train_set.y = np.hstack(train_set.y)
    valid_set.y = np.hstack(valid_set.y)
    test_set.y = np.hstack(test_set.y)

    if oneHot:
        #  Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss: map {0,1} one-hot rows to {-1,+1}
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.
    else:
        train_set.y = np.int32(train_set.y)
        valid_set.y = np.int32(valid_set.y)
        test_set.y = np.int32(test_set.y)

    #import pdb;pdb.set_trace()

    print('\nBuilding the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    if oneHot: target = T.matrix('targets')
    else: target = T.ivector('targets')

    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = buildCNN(dataType='cifar10',
                   networkType='cifar10',
                   oneHot=oneHot,
                   input=input,
                   epsilon=epsilon,
                   alpha=alpha,
                   activation=activation,
                   binary=binary,
                   stochastic=stochastic,
                   H=H,
                   W_LR_scale=W_LR_scale)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    if oneHot: loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))
    else:
        loss = LO.categorical_crossentropy(train_output, target)
        loss = loss.mean()

    # W updates: ADAM on straight-through gradients of the binary
    # weights, followed by clipping/scaling for binarization.
    W = lasagne.layers.get_all_params(cnn, binary=True)
    W_grads = binary_net.compute_grads(loss, cnn)
    updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                   params=W,
                                   learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, cnn)

    # Other parameters: plain ADAM, merged into the same update map.
    # (The original concatenated .items() lists, which only works on
    # Python 2 — dict views cannot be added on Python 3.)
    params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
    updates.update(lasagne.updates.adam(
        loss_or_grads=loss, params=params, learning_rate=LR))

    test_output = lasagne.layers.get_output(cnn, deterministic=True)

    if oneHot:
        test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
        test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                                T.argmax(target, axis=1)),
                          dtype=theano.config.floatX)
    else:
        test_loss = LO.categorical_crossentropy(test_output, target)
        test_loss = test_loss.mean()
        # target is an int vector of class labels here, so compare the
        # predicted class directly against it.  (The original used
        # T.argmax(target) — the index of the largest label value, a
        # scalar — which is not a per-sample comparison.)
        test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                                target),
                          dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    binary_net.train(train_fn,
                     val_fn,
                     cnn,
                     batch_size,
                     LR_start,
                     LR_decay,
                     num_epochs,
                     train_set.X,
                     train_set.y,
                     valid_set.X,
                     valid_set.y,
                     test_set.X,
                     test_set.y,
                     shuffle_parts=shuffle_parts)
Esempio n. 8
0
def trial(N_HIDDEN_LAYERS, NUM_UNITS, OUTPUT_TYPE, MAIN_LOSS_TYPE, LAMBDA,
          FOLD, FINTUNE_SNAPSHOT, FINTUNE_SCALE):
    # BN parameters
    batch_size = 97
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    # alpha = .15
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    #NUM_UNITS = 25
    print("NUM_UNITS = " + str(NUM_UNITS))
    #N_HIDDEN_LAYERS = 1
    print("N_HIDDEN_LAYERS = " + str(N_HIDDEN_LAYERS))

    # Training parameters
    num_epochs = 1000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # 0. means no dropout
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    #LR_start = .003
    LR_start = 0.000003
    print("LR_start = " + str(LR_start))
    #LR_fin = 0.0000003
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might good for the BN moving average...

    # replace the dataset
    print('Loading SFEW2 dataset...')
    [train_x, train_y, val_x, val_y] = SFEW2.load_train_val()
    print(train_x.shape)
    print(train_y.shape)
    print(val_x.shape)
    print(val_y.shape)
    print('last training minibatch size: ' +
          str(train_x.shape[0] - train_x.shape[0] / batch_size * batch_size) +
          ' / ' + str(batch_size))
    print(
        'last training minibatch size should not be too small (except 0). try decrease the batch_size, but not add more minibatches.'
    )
    print('minibatches size: ' + str(batch_size))
    print('suggested minibatches size: ' + str(
        math.ceil(
            float(train_x.shape[0]) /
            math.ceil(float(train_x.shape[0]) / 100))))

    print('Building the MLP...')
    # Prepare Theano variables for inputs and targets
    input = T.matrix('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, train_x.shape[1]),
                                    input_var=input)

    mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_in)

    for k in range(N_HIDDEN_LAYERS):

        # pretrain-finetune
        if (k == 0):
            # fixed num_units
            mlp = binary_net.DenseLayer(
                mlp,
                binary=binary,
                stochastic=stochastic,
                H=H,
                W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity,
                num_units=1500)

            # scale down the LR of transfered dense layer
            print('scale down the LR of transfered dense layer from',
                  str(mlp.W_LR_scale))
            mlp.W_LR_scale *= np.float32(FINTUNE_SCALE)
            print('to', str(mlp.W_LR_scale))
        else:
            mlp = binary_net.DenseLayer(
                mlp,
                binary=binary,
                stochastic=stochastic,
                H=H,
                W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity,
                num_units=NUM_UNITS)

        mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)

        mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_hidden)

        # pretrain-finetune
        # only restore the first layer group
        if (k == 0):
            if (FINTUNE_SNAPSHOT != 0):
                print('Load ./W-%d.npz' % FINTUNE_SNAPSHOT)
                with np.load('./W-%d.npz' % FINTUNE_SNAPSHOT) as f:
                    param_values = [
                        f['arr_%d' % i] for i in range(len(f.files))
                    ]
                param_values = param_values[0:6]
                lasagne.layers.set_all_param_values(mlp, param_values)

    mlp = binary_net.DenseLayer(mlp,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=7)

    mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    # network output BN or SGN
    if OUTPUT_TYPE == 'C':
        pass  #
    elif OUTPUT_TYPE == 'D':
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
    else:
        assert (False)

    # loss weight nodes
    SPARSITY = 0.9
    SPARSITY_MAP = (np.float32(train_x == -1)).mean(0)
    LOSS_WEIGHT_1 = 1. + input * (2. * SPARSITY - 1)
    LOSS_WEIGHT_1 /= 4 * SPARSITY * (1 - SPARSITY
                                     )  # fixed 1->-1:5 -1->1:5/9 weights
    LOSS_WEIGHT_2 = 1. + input * (2. * SPARSITY_MAP - 1)  #
    LOSS_WEIGHT_2 /= 4 * SPARSITY_MAP * (
        1 - SPARSITY_MAP)  # weights considering element's prior probability

    # train loss nodes
    train_output = lasagne.layers.get_output(mlp, deterministic=False)
    if MAIN_LOSS_TYPE == 'SH':
        train_loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        train_loss = T.mean(
            T.sqr(T.maximum(0., (1. - target * train_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        train_loss = T.mean(
            T.sqr(T.maximum(0., (1. - target * train_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        train_loss = T.mean(T.maximum(0., 1. - target * train_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        train_loss = T.mean(
            T.maximum(0., (1. - target * train_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        train_loss = T.mean(
            T.maximum(0., (1. - target * train_output)) * LOSS_WEIGHT_2)
    else:
        assert (False)

    # + sparse penalty
    if LAMBDA > 0:
        train_pixel_wise_density = T.mean(T.reshape(
            (train_output + 1.) / 2.,
            [train_output.shape[0], train_output.shape[1] / 10, 10]),
                                          axis=2)
        train_penalty = LAMBDA * T.mean(
            T.sqr(train_pixel_wise_density - (1. - SPARSITY)))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty

    # acc
    train_acc = T.mean(T.eq(T.argmax(train_output, axis=1),
                            T.argmax(target, axis=1)),
                       dtype=theano.config.floatX)

    # grad nodes
    if binary:
        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(train_loss, mlp)
        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
                                       learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)

        # other parameters updates
        params = lasagne.layers.get_all_params(mlp,
                                               trainable=True,
                                               binary=False)
        updates = OrderedDict(updates.items() + lasagne.updates.adam(
            loss_or_grads=train_loss, params=params, learning_rate=LR).items())

    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=train_loss,
                                       params=params,
                                       learning_rate=LR)

    # val loss nodes
    # must be created after grad nodes
    val_output = lasagne.layers.get_output(mlp, deterministic=True)
    if MAIN_LOSS_TYPE == 'SH':
        val_loss = T.mean(T.sqr(T.maximum(0., 1. - target * val_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        val_loss = T.mean(
            T.sqr(T.maximum(0., (1. - target * val_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        val_loss = T.mean(
            T.sqr(T.maximum(0., (1. - target * val_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        val_loss = T.mean(T.maximum(0., 1. - target * val_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        val_loss = T.mean(
            T.maximum(0., (1. - target * val_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        val_loss = T.mean(
            T.maximum(0., (1. - target * val_output)) * LOSS_WEIGHT_2)

    # + sparse penalty
    if LAMBDA > 0:
        val_pixel_wise_density = T.mean(T.reshape(
            (val_output + 1.) / 2.,
            [val_output.shape[0], val_output.shape[1] / 10, 10]),
                                        axis=2)
        val_penalty = LAMBDA * T.mean(
            T.sqr(val_pixel_wise_density - (1. - SPARSITY)))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty

    # acc
    val_acc = T.mean(T.eq(T.argmax(val_output, axis=1), T.argmax(target,
                                                                 axis=1)),
                     dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training train_loss:
    train_fn = theano.function(
        [input, target, LR],
        [train_loss, train_penalty, train_acc, train_output],
        updates=updates)

    # Compile a second function computing the validation train_loss and accuracy:
    val_fn = theano.function([input, target],
                             [val_loss, val_penalty, val_acc, val_output])

    print('Training...')
    train_x = binary_net.MoveParameter(train_x)
    binary_net.train(train_fn, val_fn, batch_size, LR_start, LR_decay,
                     num_epochs, train_x, train_y, val_x, val_y)
def main():
    """Build, (re)load and train the binary 'google' lipreading CNN on TCDTIMIT.

    Reads module-level configuration globals (``binary``, ``debug``,
    ``justTest``, ``logger_lip``, ``formatter``) and the project helpers
    (``binary_net``, ``load_model``, ``code.lipreading.*``).

    Side effects: creates the results directory, may preprocess the dataset
    into a .pkl file, attaches a file handler to ``logger_lip`` and starts
    the training loop via ``code.lipreading.train_lipreading.train``.
    """
    # BN parameters
    batch_size = 100
    logger_lip.info("batch_size = %s", batch_size)
    # alpha is the exponential moving average factor
    alpha = .1
    logger_lip.info("alpha = %s", alpha)
    epsilon = 1e-4
    logger_lip.info("epsilon = %s", epsilon)

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_tanh_unit")
    stochastic = True
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    #H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Training parameters
    num_epochs = 50
    logger_lip.info("num_epochs = %s", num_epochs)

    # Decaying LR
    LR_start = 0.1
    logger_lip.info("LR_start = %s", LR_start)
    LR_fin = 0.0000003
    logger_lip.info("LR_fin = %s", LR_fin)
    # LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    LR_decay = 0.5  # sqrt(0.5)
    logger_lip.info("LR_decay = %s", LR_decay)
    # BTW, LR decay might good for the BN moving average...

    shuffle_parts = 1
    logger_lip.info("shuffle_parts = %s", shuffle_parts)
    # binary nets are trained against one-hot targets (squared hinge loss below)
    if binary: oneHot = True
    else: oneHot = False

    ##############################################
    network_type = "google"
    viseme = False  # will set nbClasses and store path   vis: 6.498.828   phn: 7.176.231

    if viseme:
        nbClasses = 12
    else:
        nbClasses = 39

    # get the database
    # If it's small (lipspeakers) -> generate X_train, y_train etc here
    # otherwise we need to load and generate each speaker seperately in the training loop
    dataset = "TCDTIMIT"
    root_dir = os.path.join(
        os.path.expanduser('~/TCDTIMIT/lipreading/' + dataset))
    results_dir = root_dir + "/results/CNN_binaryNet"
    if not os.path.exists(results_dir): os.makedirs(results_dir)
    if viseme:
        database_binaryDir = root_dir + '/binaryViseme'
    else:
        database_binaryDir = root_dir + '/binary'
    datasetType = "lipspeakers"  # "lipspeakers" #"volunteers" #"volunteers" #    lipspeakers or volunteers"
    ##############################################

    if datasetType == "lipspeakers":
        loadPerSpeaker = False  # only lipspeakers small enough to fit in CPU RAM, generate X_train etc here
        storeProcessed = True
        processedDir = database_binaryDir + "_allLipspeakersProcessed"

        # TODO: prepLip_all can be used to generate pkl containing all the lipspeaker data. Not sure if this stil works, so use with care!
        if not oneHot: pkl_path = processedDir + os.sep + datasetType + ".pkl"
        else:
            pkl_path = processedDir + os.sep + datasetType + "_oneHot" + ".pkl"
        if not os.path.exists(pkl_path):
            logger_lip.info("dataset not yet processed. Processing...")
            code.lipreading.preprocessLipreading.prepLip_all(
                data_path=database_binaryDir,
                store_path=pkl_path,
                trainFraction=0.7,
                validFraction=0.1,
                testFraction=0.2,
                nbClasses=nbClasses,
                onehot=oneHot,
                type=datasetType,
                verbose=True)
        datasetFiles = code.lipreading.general_tools.unpickle(pkl_path)
        X_train, y_train, X_val, y_val, X_test, y_test = datasetFiles
        # cast everything to float32 so the arrays match theano.config.floatX
        dtypeX = 'float32'
        dtypeY = 'float32'
        X_train = X_train.astype(dtypeX)
        y_train = y_train.astype(dtypeY)
        X_val = X_val.astype(dtypeX)
        y_val = y_val.astype(dtypeY)
        X_test = X_test.astype(dtypeX)
        y_test = y_test.astype(dtypeY)
        datasetFiles = [X_train, y_train, X_val, y_val, X_test, y_test]

        # These files have been generated with datasetToPkl_fromCombined, so that the train/val/test set are the same as for combinedSR.
        # X_train, y_train = unpickle(os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersTrain.pkl"))
        # X_val, y_val = unpickle(os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersVal.pkl"))
        # X_test, y_test = unpickle(os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersTest.pkl"))
        # datasetFiles = [X_train, y_train, X_val, y_val, X_test, y_test]

    else:  # we need to load and preprocess each speaker before we evaluate, because dataset is too large and doesn't fit in CPU RAM
        loadPerSpeaker = True
        storeProcessed = True  # if you have about 10GB hdd space, you can increase the speed by not reprocessing it each iteration
        processedDir = database_binaryDir + "_finalProcessed"
        # you can just run this program and it will generate the files the first time it encounters them, or generate them manually with datasetToPkl.py

        # just get the names
        testVolunteerNumbers = [
            "13F", "15F", "21M", "23M", "24M", "25M", "28M", "29M", "30F",
            "31F", "34M", "36F", "37F", "43F", "47M", "51F", "54M"
        ]
        testVolunteers = [
            str(testNumber) + ".pkl" for testNumber in testVolunteerNumbers
        ]
        lipspeakers = ["Lipspkr1.pkl", "Lipspkr2.pkl", "Lipspkr3.pkl"]
        allSpeakers = [
            f for f in os.listdir(database_binaryDir)
            if os.path.isfile(os.path.join(database_binaryDir, f))
            and os.path.splitext(f)[1] == ".pkl"
        ]
        trainVolunteers = [
            f for f in allSpeakers
            if not (f in testVolunteers or f in lipspeakers)
        ]
        trainVolunteers = [vol for vol in trainVolunteers if vol is not None]

        if datasetType == "combined":
            trainingSpeakerFiles = trainVolunteers + lipspeakers
            testSpeakerFiles = testVolunteers
        elif datasetType == "volunteers":
            trainingSpeakerFiles = trainVolunteers
            testSpeakerFiles = testVolunteers
        else:
            raise Exception("invalid dataset entered")
        datasetFiles = [trainingSpeakerFiles, testSpeakerFiles]

    model_name = datasetType + "_" + network_type + "_" + ("viseme" if viseme else "phoneme") + str(nbClasses) \
        + ("_binary" if binary else "")
    model_save_name = os.path.join(results_dir, model_name)

    # log file
    logFile = results_dir + os.sep + model_name + '.log'
    # if os.path.exists(logFile):
    #     fh = logging.FileHandler(logFileT)  # append to existing log
    # else:
    fh = logging.FileHandler(logFile, 'w')  # create new logFile
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger_lip.addHandler(fh)

    logger_lip.info('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    inputs = T.tensor4('inputs')
    if oneHot:
        targets = T.matrix('targets')
    else:
        targets = T.ivector('targets')

    LR = T.scalar('LR', dtype=theano.config.floatX)

    # get the network structure
    l_out = code.lipreading.buildNetworks.build_network_google_binary(
        activation, alpha, epsilon, inputs, binary, stochastic, H,
        W_LR_scale)  # 7176231 params
    for layer in L.get_all_layers(l_out):
        print(layer)

    # print het amount of network parameters
    logger_lip.info("Using the %s network", network_type)
    logger_lip.info("The number of parameters of this network: %s",
                    L.count_params(l_out))

    logger_lip.info("loading %s", model_save_name + '.npz')
    load_model(model_save_name + '.npz', l_out)

    logger_lip.info("* COMPILING FUNCTIONS...")
    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - targets * train_output)))

    # W updates: gradients w.r.t. the binary weights, then clipped/rescaled
    W = lasagne.layers.get_all_params(l_out, binary=True)
    W_grads = binary_net.compute_grads(loss, l_out)
    updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                   params=W,
                                   learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, l_out)

    # other parameters updates
    params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False)
    # FIX: dict views cannot be concatenated with '+' on Python 3; merge the
    # two update OrderedDicts via explicit lists (binary updates kept first).
    updates = OrderedDict(
        list(updates.items()) +
        list(lasagne.updates.adam(loss_or_grads=loss,
                                  params=params,
                                  learning_rate=LR).items()))

    test_output = lasagne.layers.get_output(l_out, deterministic=True)
    out_fn = theano.function([inputs], test_output)

    test_loss = T.mean(T.sqr(T.maximum(0., 1. - targets * test_output)))
    # NOTE(review): argmax(targets, axis=1) assumes one-hot (matrix) targets;
    # the oneHot == False path builds an ivector -- confirm before using it.
    test_acc = T.mean(T.eq(T.argmax(test_output, axis=1),
                           T.argmax(targets, axis=1)),
                      dtype=theano.config.floatX)
    k = 3
    # top-k accuracy is stubbed out as a constant zero tensor here
    test_top3_acc = T.zeros((1, ))
    topk_acc_fn = theano.function([], test_top3_acc)
    val_fn = theano.function([inputs, targets],
                             [test_loss, test_acc, test_top3_acc])

    if debug:
        nb = 3
        debugX = X_train[0:nb]
        debugY = y_train[0:nb]
        out = out_fn(debugX)
        val = val_fn(debugX, debugY)
        import pdb
        pdb.set_trace()

        # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([inputs, targets, LR], loss, updates=updates)

    logger_lip.info('Training...')
    import code.lipreading.train_lipreading
    code.lipreading.train_lipreading.train(
        train_fn=train_fn,
        val_fn=val_fn,
        out_fn=out_fn,
        topk_acc_fn=topk_acc_fn,
        k=k,
        network_output_layer=l_out,
        batch_size=batch_size,
        LR_start=LR_start,
        LR_decay=LR_decay,
        num_epochs=num_epochs,
        dataset=datasetFiles,
        database_binaryDir=database_binaryDir,
        storeProcessed=storeProcessed,
        processedDir=processedDir,
        loadPerSpeaker=loadPerSpeaker,
        justTest=justTest,
        save_name=model_save_name,
        shuffleEnabled=True)
def trial(N_HIDDEN_LAYERS, NUM_UNITS, OUTPUT_TYPE, MAIN_LOSS_TYPE, LAMBDA,
          FOLD):
    """Train the second stage of a binary autoencoder (BAE-2) on SFEW2/LFW.

    Parameters
    ----------
    N_HIDDEN_LAYERS : int
        Number of hidden layer groups; this body supports exactly 2
        (k == 0 restores the pretrained BAE-1 encoder, k == 1 is trained).
    NUM_UNITS : int
        Units of the first hidden layer; the second uses NUM_UNITS * 2.
    OUTPUT_TYPE : str
        'C' keeps the linear + BN output, 'D' appends the binary activation.
    MAIN_LOSS_TYPE : str
        Kept for interface compatibility; the active loss here is a hinge
        between the restored BAE-1 code and the network output (the original
        per-type losses survive only in the commented-out string blocks).
    LAMBDA : float
        Weight of the sparsity penalty on the mid activation (0 disables it).
    FOLD
        Appears unused in this body; kept for interface compatibility.

    Side effects: loads './W-1168.npz' pretrained weights, trains with
    ``binary_net.train`` and saves all parameters to './W.npz'.
    """
    # BN parameters
    batch_size = 100
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    # alpha = .15
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    #NUM_UNITS = 25
    print("NUM_UNITS = " + str(NUM_UNITS))
    #N_HIDDEN_LAYERS = 1
    print("N_HIDDEN_LAYERS = " + str(N_HIDDEN_LAYERS))

    # Training parameters
    num_epochs = 1000000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # 0. means no dropout
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR (start == fin, so the decay factor is 1: constant LR)
    #LR_start = .003
    LR_start = 0.000003
    print("LR_start = " + str(LR_start))
    #LR_fin = 0.0000003
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might good for the BN moving average...

    # replace the dataset
    print('Loading SFEW2 dataset...')
    [train_x] = SFEW2.load_lfw()
    assert (train_x.shape[0] == 26404)
    train_x = train_x[0:26400, :]
    [val_x, _, _, _] = SFEW2.load_train_val()

    print(train_x.shape)
    print(val_x.shape)
    # FIX: use the modulo remainder directly; the old
    # `n - n / batch_size * batch_size` relied on Python 2 integer division
    # and prints a float (e.g. 0.0) under Python 3.
    print('last training minibatch size: ' +
          str(train_x.shape[0] % batch_size) +
          ' / ' + str(batch_size))
    print(
        'last training minibatch size should not be too small (except 0). try decrease the batch_size, but not add more minibatches.'
    )
    print('minibatches size: ' + str(batch_size))
    print('suggested minibatches size: ' + str(
        math.ceil(
            float(train_x.shape[0]) /
            math.ceil(float(train_x.shape[0]) / 100))))

    ##############################################################################################

    print('Building the MLP...')
    # Prepare Theano variables for inputs and targets
    input = T.matrix('inputs')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, train_x.shape[1]),
                                    input_var=input)

    mlp = lasagne.layers.DropoutLayer(
        mlp, p=0)  # train BAE-2: no dropout on input & BAE-1 layer

    for k in range(N_HIDDEN_LAYERS):
        if (k == 0):
            mlp = binary_net.DenseLayer(
                mlp,
                binary=binary,
                stochastic=stochastic,
                H=H,
                W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity,
                num_units=NUM_UNITS)
        elif (k == 1):
            mlp = binary_net.DenseLayer(
                mlp,
                binary=binary,
                stochastic=stochastic,
                H=H,
                W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity,
                num_units=NUM_UNITS * 2)
        else:
            assert (False)

        #if(k==0):
        #    print('scale down the LR of transfered dense layer from', str(mlp.W_LR_scale))
        #    mlp.W_LR_scale = 0
        #    print('to', str(mlp.W_LR_scale))

        if (k == 0):
            # BAE1 encoder: BN
            mlp = lasagne.layers.BatchNormLayer(mlp,
                                                epsilon=epsilon,
                                                alpha=alpha)
        elif (k == 1):
            # BAE2 encoder: do not use BN for encouraging sparsity
            pass
        else:
            # further layer use BN
            mlp = lasagne.layers.BatchNormLayer(mlp,
                                                epsilon=epsilon,
                                                alpha=alpha)

        # midactivation place before hard tanh
        # encoder and decoder should not use BatchNorm
        # "l1 reg" on midactivation
        if (k == 1):
            mlp_midactivation = mlp

        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)

        if (k == 0):
            mlp = lasagne.layers.DropoutLayer(
                mlp, p=0)  # train BAE-2: no dropout on input & BAE-1 layer
        else:
            mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_hidden)

        # pretrain-finetune
        # only restore the first layer group
        if (k == 0):
            print('Load ./W-1168.npz')
            with np.load('./W-1168.npz') as f:
                param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            param_values = param_values[0:6]
            lasagne.layers.set_all_param_values(mlp, param_values)

            mlp_groundtruth = mlp

    mlp = binary_net.DenseLayer(mlp,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=1500)

    mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    # network output BN or SGN
    if OUTPUT_TYPE == 'C':
        pass  #
    elif OUTPUT_TYPE == 'D':
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
    else:
        assert (False)
    '''
    # equal transform validation
    # 1 set AE transform to I
    # 1 modift AE DenseLayer.get_output_for() use W(0 1) instead of Wb(+1 -1)
    # 2 set encoder's dropout=0
    # 3 comment out encoder's and decoder's BatchNormLayer, modify set_all_param_values
    # will see train loss = 0
    pv = lasagne.layers.get_all_param_values(mlp)
    pv[2] = np.identity(1500, np.float64)
    pv[4] = np.identity(1500, np.float64)
    lasagne.layers.set_all_param_values(mlp, pv)
    '''
    '''
    # loss weight nodes
    SPARSITY = 0.9
    SPARSITY_MAP = (np.float32(train_x==-1)).mean(0)
    LOSS_WEIGHT_1 = 1.+input*(2.*SPARSITY-1)
    LOSS_WEIGHT_1 /= 4*SPARSITY*(1 - SPARSITY)# fixed 1->-1:5 -1->1:5/9 weights
    LOSS_WEIGHT_2 = 1.+input*(2.*SPARSITY_MAP-1)#
    LOSS_WEIGHT_2 /= 4*SPARSITY_MAP*(1 - SPARSITY_MAP)# weights considering element's prior probability
    '''

    # train loss nodes
    '''
    train_output = lasagne.layers.get_output(mlp, deterministic=False)
    if MAIN_LOSS_TYPE=='SH':
        train_loss = T.mean(T.sqr(T.maximum(0.,1.-input*train_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - input * train_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - input * train_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        train_loss = T.mean(T.maximum(0.,1.-input*train_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        train_loss = T.mean(T.maximum(0., (1. - input * train_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        train_loss = T.mean(T.maximum(0., (1. - input * train_output)) * LOSS_WEIGHT_2)
    else:
        assert(False)
    '''
    # hinge loss between the (restored, frozen) BAE-1 code and the output
    [
        train_output_mlp_groundtruth, train_output_mlp_midactivation,
        train_output
    ] = lasagne.layers.get_output([mlp_groundtruth, mlp_midactivation, mlp],
                                  deterministic=False)
    train_loss = T.mean(
        T.maximum(0., 1. - train_output_mlp_groundtruth * train_output))

    # + sparse penalty
    '''
    if LAMBDA>0:
        train_pixel_wise_density = T.mean(T.reshape((train_output+1.)/2., [train_output.shape[0], train_output.shape[1]/10, 10]), axis=2)
        train_penalty = LAMBDA*T.mean(T.sqr(train_pixel_wise_density - (1.-SPARSITY)))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty
    '''
    # "l1-style" penalty pushing the mid activation towards -1 (sparsity)
    if LAMBDA > 0:
        train_penalty = LAMBDA * T.mean(
            T.maximum(0., 1. + train_output_mlp_midactivation))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty

    # grad nodes
    if binary:
        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(train_loss, mlp)

        # untrainable W1: drop the restored BAE-1 encoder weight from updates
        assert (len(W) == 3)
        assert (len(W_grads) == 3)
        W = W[1:len(W)]
        W_grads = W_grads[1:len(W_grads)]
        assert (len(W) == 2)
        assert (len(W_grads) == 2)

        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
                                       learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)

        # other parameters updates
        params = lasagne.layers.get_all_params(mlp,
                                               trainable=True,
                                               binary=False)

        # untrainable b1 bn1: skip the restored layer's bias/BN parameters
        assert (len(params) == 7)
        assert (params[0].name == 'b')  # fix
        assert (params[1].name == 'beta')  # fix
        assert (params[2].name == 'gamma')  # fix
        assert (params[3].name == 'b')
        assert (params[4].name == 'b')
        assert (params[5].name == 'beta')
        assert (params[6].name == 'gamma')
        params = params[3:len(params)]
        assert (len(params) == 4)

        # FIX: dict views cannot be concatenated with '+' on Python 3; merge
        # the two update OrderedDicts via explicit lists (binary first).
        updates = OrderedDict(
            list(updates.items()) +
            list(lasagne.updates.adam(loss_or_grads=train_loss,
                                      params=params,
                                      learning_rate=LR).items()))

    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=train_loss,
                                       params=params,
                                       learning_rate=LR)

    ##############################################################################################

    # val loss nodes
    # must be created after grad nodes
    '''
    val_output = lasagne.layers.get_output(mlp, deterministic=True)
    if MAIN_LOSS_TYPE=='SH':
        val_loss = T.mean(T.sqr(T.maximum(0.,1.-input*val_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - input * val_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - input * val_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        val_loss = T.mean(T.maximum(0.,1.-input*val_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        val_loss = T.mean(T.maximum(0., (1. - input * val_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        val_loss = T.mean(T.maximum(0., (1. - input * val_output)) * LOSS_WEIGHT_2)
    '''
    [val_output_mlp_groundtruth, val_output_mlp_midactivation, val_output
     ] = lasagne.layers.get_output([mlp_groundtruth, mlp_midactivation, mlp],
                                   deterministic=True)
    val_loss = T.mean(
        T.maximum(0., 1. - val_output_mlp_groundtruth * val_output))

    # + sparse penalty
    '''
    if LAMBDA>0:
        val_pixel_wise_density = T.mean(T.reshape((val_output + 1.) / 2., [val_output.shape[0], val_output.shape[1] / 10, 10]), axis=2)
        val_penalty = LAMBDA*T.mean(T.sqr(val_pixel_wise_density - (1. - SPARSITY)))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty
    '''
    if LAMBDA > 0:
        val_penalty = LAMBDA * T.mean(
            T.maximum(0., 1. + val_output_mlp_midactivation))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty

    ##############################################################################################

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training train_loss:
    train_fn = theano.function([input, LR], [
        train_loss, train_penalty, train_output_mlp_groundtruth,
        train_output_mlp_midactivation, train_output
    ],
                               updates=updates)

    ##############################################################################################

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input], [
        val_loss, val_penalty, val_output_mlp_groundtruth,
        val_output_mlp_midactivation, val_output
    ])

    ##############################################################################################

    print('Training...')
    train_x = binary_net.MoveParameter(train_x)
    binary_net.train(train_fn, val_fn, batch_size, LR_start, LR_decay,
                     num_epochs, train_x, val_x, mlp)

    print('Save W')
    np.savez('./W.npz', *lasagne.layers.get_all_param_values(
        mlp))  # W b BN BN BN BN W b BN BN BN BN
# Esempio n. 11
# 0
def run(binary=False, noise=None, nalpha=0, result_path=None):
    """Train an MLP (optionally with binary weights/activations) on MNIST.

    Builds an MLP with ``n_hidden_layers`` dense+BN+nonlinearity+dropout
    stages, trains it with ADAM on a squared hinge loss, and logs every
    hyper-parameter both to stdout and to ``params.txt`` under *result_path*.

    Parameters
    ----------
    binary : bool
        If True, use binary weights (BinaryConnect) and the binary tanh
        activation from ``binary_net``; otherwise plain tanh.
    noise :
        Label-noise specification forwarded to ``MnistReader.get_train_data``.
        TODO confirm exact accepted values against MnistReader.
    nalpha : int
        Noise level forwarded as ``alpha`` to ``MnistReader.get_train_data``.
    result_path : str
        Directory that receives ``params.txt`` and is forwarded to
        ``binary_net.train`` for result output.
    """
    import os  # local import: only needed here for path joining

    # BN parameters
    batch_size = 128  # default: 100
    print("batch_size = " + str(batch_size))

    # alpha is the exponential moving average factor
    alpha = .1  # default: .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4  # default: 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    num_units = 300  # default: 4096
    print("num_units = " + str(num_units))
    n_hidden_layers = 1  # default: 3
    print("n_hidden_layers = " + str(n_hidden_layers))

    # Training parameters
    num_epochs = 500  # default: 1000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # default: .2
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5  # default: .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut: pick the hidden-layer nonlinearity
    if binary:
        activation = binary_net.binary_tanh_unit
        print("activation = binary_net.binary_tanh_unit")
    else:
        activation = lasagne.nonlinearities.tanh
        print("activation = lasagne.nonlinearities.tanh")

    # BinaryConnect
    print("binary = " + str(binary))
    stochastic = False  # default: False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.  # default: 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # default: "Glorot"
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR: exponential schedule from LR_start down to LR_fin
    LR_start = 0.005  # default: .003
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000005  # default: 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might good for the BN moving average...

    save_path = None  # default: "mnist_parameters.npz"
    print("save_path = " + str(save_path))

    # Load the dataset (https://github.com/mnielsen/neural-networks-and-deep-learning)
    print('Loading MNIST dataset...')
    mnist = MnistReader("./data/mnist.pkl.gz")

    shuffle_parts = 1  # default: 1
    print("shuffle_parts = " + str(shuffle_parts))

    print("noise = " + str(noise))
    print("nalpha = " + str(nalpha))

    train_set_size = 50000  # default: 50000
    train_X, train_y = mnist.get_train_data(n_samples=train_set_size, noise=noise, alpha=nalpha)
    validation_X, validation_y = mnist.get_validation_data()
    test_X, test_y = mnist.get_test_data()
    print("train_set_size = "+str(train_y.shape[0]))
    print("validation_set_size = "+str(validation_y.shape[0]))
    print("test_set_size = "+str(test_y.shape[0]))

    # Log output.
    # os.path.join tolerates result_path with or without a trailing
    # separator; plain string concatenation silently produced paths like
    # "resultsparams.txt" when the separator was missing.
    with open(os.path.join(result_path, "params.txt"), "a+") as log_file:
        print("batch_size = " + str(batch_size), file=log_file)
        print("alpha = " + str(alpha), file=log_file)
        print("epsilon = " + str(epsilon), file=log_file)
        print("num_units = " + str(num_units), file=log_file)
        print("n_hidden_layers = " + str(n_hidden_layers), file=log_file)
        print("num_epochs = " + str(num_epochs), file=log_file)
        print("dropout_in = " + str(dropout_in), file=log_file)
        print("dropout_hidden = " + str(dropout_hidden), file=log_file)
        if binary:
            print("activation = binary_net.binary_tanh_unit", file=log_file)
        else:
            print("activation = lasagne.nonlinearities.tanh", file=log_file)
        print("binary = " + str(binary), file=log_file)
        print("stochastic = " + str(stochastic), file=log_file)
        print("H = " + str(H), file=log_file)
        print("W_LR_scale = " + str(W_LR_scale), file=log_file)
        print("LR_start = " + str(LR_start), file=log_file)
        print("LR_fin = " + str(LR_fin), file=log_file)
        print("LR_decay = " + str(LR_decay), file=log_file)
        print("save_path = " + str(save_path), file=log_file)
        print("shuffle_parts = " + str(shuffle_parts), file=log_file)
        print("noise = " + str(noise), file=log_file)
        print("nalpha = " + str(nalpha), file=log_file)
        print("train_set_size = "+str(train_y.shape[0]), file=log_file)
        print("validation_set_size = "+str(validation_y.shape[0]), file=log_file)
        print("test_set_size = "+str(test_y.shape[0]), file=log_file)

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_X = 2 * train_X.reshape(-1, 1, 28, 28) - 1.
    validation_X = 2 * validation_X.reshape(-1, 1, 28, 28) - 1.
    test_X = 2 * test_X.reshape(-1, 1, 28, 28) - 1.

    # flatten targets
    train_y = np.hstack(train_y)
    validation_y = np.hstack(validation_y)
    test_y = np.hstack(test_y)

    # Onehot the targets
    train_y = np.float32(np.eye(10)[train_y])
    validation_y = np.float32(np.eye(10)[validation_y])
    test_y = np.float32(np.eye(10)[test_y])

    # for hinge loss: map {0,1} one-hot to {-1,+1}
    train_y = 2 * train_y - 1.
    validation_y = 2 * validation_y - 1.
    test_y = 2 * test_y - 1.

    print('Building the MLP...')

    # Prepare Theano variables for inputs and targets.
    # Named input_var (not "input") to avoid shadowing the builtin.
    input_var = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(
        shape=(None, 1, 28, 28),
        input_var=input_var)

    mlp = lasagne.layers.DropoutLayer(
        mlp,
        p=dropout_in)

    for k in range(n_hidden_layers):
        mlp = binary_net.DenseLayer(
            mlp,
            binary=binary,
            stochastic=stochastic,
            H=H,
            W_LR_scale=W_LR_scale,
            nonlinearity=lasagne.nonlinearities.identity,
            num_units=num_units)

        mlp = lasagne.layers.BatchNormLayer(
            mlp,
            epsilon=epsilon,
            alpha=alpha)

        mlp = lasagne.layers.NonlinearityLayer(
            mlp,
            nonlinearity=activation)

        mlp = lasagne.layers.DropoutLayer(
            mlp,
            p=dropout_hidden)

    # Output stage: 10-way dense layer + BN, no extra nonlinearity
    # (the hinge loss operates on the raw BN output).
    mlp = binary_net.DenseLayer(
        mlp,
        binary=binary,
        stochastic=stochastic,
        H=H,
        W_LR_scale=W_LR_scale,
        nonlinearity=lasagne.nonlinearities.identity,
        num_units=10)

    mlp = lasagne.layers.BatchNormLayer(
        mlp,
        epsilon=epsilon,
        alpha=alpha)

    train_output = lasagne.layers.get_output(mlp, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if binary:

        # Binary weights get custom gradients plus clipping/scaling ...
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(loss, mlp)
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)

        # ... while the remaining (real-valued) parameters use plain ADAM;
        # both update maps are merged into one dictionary.
        params = lasagne.layers.get_all_params(mlp, trainable=True, binary=False)
        updates.update(lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR))

    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

    # Deterministic pass (dropout off, BN in inference mode) for evaluation.
    test_output = lasagne.layers.get_output(mlp, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input_var, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target], [test_loss, test_err])

    print('Training...')

    binary_net.train(
        train_fn, val_fn,
        mlp,
        batch_size,
        LR_start, LR_decay,
        num_epochs,
        train_X, train_y,
        validation_X, validation_y,
        test_X, test_y,
        save_path,
        shuffle_parts,
        result_path)
# Esempio n. 12 (example separator; vote count: 0)
        # binary なパラメータだけを取り出す
        Wb_list = lasagne.layers.get_all_params(mlp, binary=True)
        for eW in Wb_list:
            print('eW:', type(eW), eW)

        # binary なパラメータのみに対する勾配を求めてリストアップ
        W_grad_list = binary_net.compute_grads(loss, mlp)
        print('W_grad_list', type(W_grad_list), W_grad_list)

        # ADAM学習則による更新式マップ(OrderedDict)
        updates_b0 = lasagne.updates.adam(loss_or_grads=W_grad_list,
                                          params=Wb_list,
                                          learning_rate=LR)

        # バイナリ化のためのクリッピング&スケーリング
        updates_b1 = binary_net.clipping_scaling(updates_b0, mlp)

        # other parameters updates
        # 非バイナリパラメータの更新則
        Wr_list = lasagne.layers.get_all_params(mlp,
                                                trainable=True,
                                                binary=False)

        # バイナリ+非バイナリ:パラメータ群をまとめる
        updates = OrderedDict(updates_b1.items() + lasagne.updates.adam(
            loss_or_grads=loss, params=Wr_list, learning_rate=LR).items())

    else:
        Wr_list = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss,
                                       params=Wr_list,