Ejemplo n.º 1
0
def get_updates(nnet, train_obj, trainable_params):
    """Build the update expressions for the solver configured on `nnet`.

    The solver name is taken from ``nnet.solver``.  If that attribute is
    missing, or its value is not one of the supported names, "nesterov" is
    used.  The name actually used is written to ``nnet.sgd_solver``.

    Args:
        nnet: object with an optional ``solver`` attribute; ``sgd_solver``
            is set on it as a side effect.
        train_obj: first positional argument forwarded to the ``l_updates``
            routine (presumably the training objective -- confirm).
        trainable_params: second positional argument forwarded to the
            ``l_updates`` routine.

    Returns:
        Whatever the selected ``l_updates`` routine returns.
    """
    # Supported solver name -> name of the routine on `l_updates`.
    routine_names = {
        "nesterov": "nesterov_momentum",
        "adagrad": "adagrad",
        "adadelta": "adadelta",
        "adam": "adam",
    }
    supported = ("nesterov", "adagrad", "adadelta", "adam")

    if hasattr(nnet, "solver") and nnet.solver in supported:
        nnet.sgd_solver = nnet.solver
    else:
        nnet.sgd_solver = "nesterov"

    chosen = nnet.sgd_solver
    routine = getattr(l_updates, routine_names[chosen])

    # Only the Nesterov solver takes an extra (fixed) momentum term.
    hyper = {"learning_rate": Cfg.learning_rate}
    if chosen == "nesterov":
        hyper["momentum"] = 0.9

    return routine(train_obj, trainable_params, **hyper)
Ejemplo n.º 2
0
    def train(self, X_train, X_valid, early_stop_count=20, X_test=None):
        """Train with Adadelta and stop once validation error stops improving.

        Compiles ``self.train_model`` and ``self.valid_model`` from the
        symbolic input ``self.X`` and the output of ``self.layers[-1]``, then
        runs epochs of shuffled mini-batches.  Whenever the mean validation
        error improves on the best seen so far, ``self.params`` is pickled to
        ``"model/" + self.name + ".model"`` and the patience counter resets.
        Training ends after ``early_stop_count`` consecutive epochs without
        improvement.

        Args:
            X_train: training samples, indexed along the first axis.
            X_valid: validation samples, indexed along the first axis.
            early_stop_count: number of consecutive non-improving epochs
                tolerated before training stops.
            X_test: accepted for interface compatibility; not used here.
        """
        # Objective: per-sample Euclidean reconstruction error averaged over
        # the batch, plus a weighted sum of every layer's `L2` term.
        l2_norm_squared = 0.001 * sum([layer.L2 for layer in self.layers])
        mae = T.mean(T.sqrt(T.sum(T.sqr(self.layers[-1].output.flatten(2) - self.X), axis=1)), axis=0)
        cost = mae + l2_norm_squared
        updates = adadelta(cost, self.params)

        self.train_model = theano.function(inputs=[self.X], outputs=[cost, mae], updates=updates)
        self.valid_model = theano.function(inputs=[self.X], outputs=[cost, mae])

        batch_size = self.mini_batch_size
        # Trailing samples that do not fill a whole mini-batch are dropped.
        num_training_batches = int(X_train.shape[0] / batch_size)
        num_validation_batches = int(X_valid.shape[0] / batch_size)

        counter = 0
        # Start from +inf so the first epoch always counts as an improvement
        # (a fixed starting value would block checkpointing for any model
        # whose error never drops below it).
        best_valid_err = float("inf")
        epoch_i = 0

        train_rand_idxs = list(range(0, X_train.shape[0]))
        valid_rand_idxs = list(range(0, X_valid.shape[0]))

        while counter < early_stop_count:
            epoch_i += 1
            train_costs = []
            train_errs = []

            valid_costs = []
            valid_errs = []

            np.random.shuffle(train_rand_idxs)
            for batch_i in range(num_training_batches):
                start = batch_i * batch_size
                mnb_X = X_train[train_rand_idxs[start:start + batch_size]]
                train_cost, train_err = self.train_model(mnb_X)
                train_costs.append(train_cost)
                train_errs.append(train_err)

            np.random.shuffle(valid_rand_idxs)
            for batch_i in range(num_validation_batches):
                start = batch_i * batch_size
                # Validate on the held-out set; evaluating on X_train here
                # would make early stopping track the training error.
                mnb_X = X_valid[valid_rand_idxs[start:start + batch_size]]
                valid_cost, valid_err = self.valid_model(mnb_X)
                valid_costs.append(valid_cost)
                valid_errs.append(valid_err)

            train_err = np.mean(np.array(train_errs))
            train_cost = np.mean(np.array(train_costs))
            val_err = np.mean(np.array(valid_errs))
            val_cost = np.mean(np.array(valid_costs))

            improved = val_err < best_valid_err
            if not improved:
                counter += 1

            # On an improving epoch the counter is reported before it is reset.
            summary = ("Epoch " + str(epoch_i) + " Train cost: " + str(train_cost)
                       + "Train mae: " + str(train_err)
                       + " Validation cost: " + str(val_cost)
                       + " Validation mae " + str(val_err)
                       + ",counter " + str(counter))

            if improved:
                best_valid_err = val_err
                sys.stdout.write(summary + " __best__ \n")
                sys.stdout.flush()
                counter = 0
                # Checkpoint the best parameters seen so far.
                with open("model/" + self.name + ".model", mode="wb") as f:
                    cPickle.dump(self.params, f)
            else:
                sys.stdout.write(summary + "\n")
                sys.stdout.flush()
def generate_theano_func(args, network, penalty, input_dict, target_var):
    """Compile the Theano training and validation functions for `network`.

    The training loss is the mean categorical cross-entropy of the
    (non-deterministic) network output against `target_var`, plus `penalty`.
    The validation loss is the same cross-entropy on the deterministic
    output, without the penalty.

    Args:
        args: object with ``optimizer`` (one of "sgd", "adagrad", "adadelta",
            "nesterov", "rms", "adam"), ``step`` (passed as the optimizer's
            learning rate) and ``task`` ("sts" or "ent").
        network: network passed to ``get_output`` / ``get_all_params``.
        penalty: term added to the training loss.
        input_dict: inputs forwarded to ``get_output``.
        target_var: symbolic target variable.

    Returns:
        ``(train_fn, val_fn)``.  ``train_fn`` returns the training loss and
        applies the updates.  ``val_fn`` returns ``[test_loss,
        test_prediction]`` for task "sts" and ``[test_loss, test_acc]`` for
        task "ent".

    Raises:
        ValueError: if ``args.optimizer`` or ``args.task`` is not supported.
    """
    supported_optimizers = ("sgd", "adagrad", "adadelta", "nesterov", "rms", "adam")
    supported_tasks = ("sts", "ent")

    # Fail fast, before any graph is built or compiled.
    if args.optimizer not in supported_optimizers:
        raise ValueError(
            "Need set optimizer correctly: got %r, expected one of %s"
            % (args.optimizer, ", ".join(supported_optimizers)))
    if args.task not in supported_tasks:
        raise ValueError(
            "Unsupported task %r, expected one of %s"
            % (args.task, ", ".join(supported_tasks)))

    prediction = get_output(network, input_dict)
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    else:  # "adam" -- the only remaining validated name
        updates = adam(loss, params, learning_rate=args.step)

    test_prediction = get_output(network, input_dict, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    # NOTE(review): the four input*_var names below are neither parameters
    # nor locals of this function, so they resolve from the enclosing module
    # scope -- confirm they are the same variables carried by `input_dict`.
    fn_inputs = [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var]

    train_fn = theano.function(
        fn_inputs,
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_outputs = [test_loss, test_prediction]
    else:  # "ent"
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_outputs = [test_loss, test_acc]

    val_fn = theano.function(
        fn_inputs,
        val_outputs,
        allow_input_downcast=True,
    )

    return train_fn, val_fn
Ejemplo n.º 4
0
    def __init__(self, output_size, meta_size, depth=2):
        """Build the network and compile its train / loss / predict functions.

        Two input layers (widths `output_size` and `meta_size`) are
        concatenated and passed through `depth` batch-normalised dense layers
        of width 64.  A final linear dense layer of width ``2 * output_size``
        is split along axis 1 into a `mu` half and a `log_var` half.

        Args:
            output_size: width of the first input and of each output half.
            meta_size: width of the second (meta) input.
            depth: number of hidden dense layers; only three widths are
                defined, so values above 3 fail with IndexError.
        """
        hidden_widths = [64, 64, 64]

        x_sym = TT.matrix()
        meta_sym = TT.matrix()
        target_sym = TT.matrix()
        mask_sym = TT.matrix()

        x_in = layers.InputLayer((None, output_size), input_var=x_sym)
        meta_in = layers.InputLayer((None, meta_size), input_var=meta_sym)
        net = layers.ConcatLayer([x_in, meta_in])

        for level in xrange(depth):
            net = layers.batch_norm(layers.DenseLayer(net, hidden_widths[level]))

        head = layers.DenseLayer(net, 2 * output_size, nonlinearity=nonlinearities.linear)
        mu = layers.SliceLayer(head, slice(0, output_size), axis=1)
        log_var = layers.SliceLayer(head, slice(output_size, None), axis=1)

        # NOTE(review): mask_sym feeds both losses but is not among the
        # inputs of the compiled functions below -- confirm that
        # neg_log_likelihood2 does not need a value for it.
        train_nll = neg_log_likelihood2(
            target_sym,
            layers.get_output(mu),
            layers.get_output(log_var),
            mask_sym
        )
        loss = train_nll.mean()

        eval_nll = neg_log_likelihood2(
            target_sym,
            layers.get_output(mu, deterministic=True),
            layers.get_output(log_var, deterministic=True),
            mask_sym
        )
        test_loss = eval_nll.mean()

        trainable = layers.get_all_params(head, trainable=True)
        param_updates = updates.adadelta(loss, trainable)

        supervised_inputs = [x_sym, meta_sym, target_sym]

        # Training step: returns the loss and applies the Adadelta updates.
        self._train_fn = theano.function(
            supervised_inputs,
            updates=param_updates,
            outputs=loss
        )

        # Evaluation loss, built from the deterministic outputs.
        self._loss_fn = theano.function(
            supervised_inputs,
            outputs=test_loss
        )

        # Prediction: deterministic mu and log_var for the given inputs.
        predicted = [
            layers.get_output(mu, deterministic=True),
            layers.get_output(log_var, deterministic=True)
        ]
        self._predict_fn = theano.function(
            [x_sym, meta_sym],
            outputs=predicted
        )
Ejemplo n.º 5
0
    def build_treatment_model(self, n_vars, **kwargs):
        """Build the treatment network and compile its Theano functions.

        The network is: input -> dropout(0.2) -> dense(2 * dense_size, relu)
        with batch norm -> dropout(0.2) -> (n_dense_layers - 1) x
        [dense(dense_size, relu) with batch norm] -> linear dense(1).  The
        output layer is stored on ``self.treatment_output``; the compiled
        functions on ``self._train_fn``, ``self._loss_fn`` and
        ``self._output_fn``.

        Args:
            n_vars: width of the input layer.
            **kwargs: must contain ``dense_size`` and ``n_dense_layers``.

        Returns:
            The network's parameter values as returned by
            ``layers.get_all_param_values`` right after construction.
        """
        x_sym = TT.matrix()
        z_sym = TT.matrix()
        y_sym = TT.vector()

        net = layers.DropoutLayer(
            layers.InputLayer((None, n_vars), x_sym), p=0.2)

        # First hidden block is twice as wide as the following ones.
        net = layers.DenseLayer(net, 2 * kwargs['dense_size'],
                                nonlinearity=nonlinearities.rectify)
        net = layers.batch_norm(net)
        net = layers.DropoutLayer(net, p=0.2)

        extra_blocks = kwargs['n_dense_layers'] - 1
        for _ in xrange(extra_blocks):
            net = layers.batch_norm(
                layers.DenseLayer(net, kwargs['dense_size'],
                                  nonlinearity=nonlinearities.rectify))

        self.treatment_output = layers.DenseLayer(
            net, 1, nonlinearity=nonlinearities.linear)
        init_params = layers.get_all_param_values(self.treatment_output)

        prediction = layers.get_output(self.treatment_output, deterministic=False)
        test_prediction = layers.get_output(self.treatment_output, deterministic=True)

        l2_cost = regularization.regularize_network_params(
            self.treatment_output, regularization.l2)
        loss = gmm_loss(prediction, y_sym, z_sym) + 1e-4 * l2_cost

        trainable = layers.get_all_params(self.treatment_output, trainable=True)
        param_updates = updates.adadelta(loss, trainable)

        loss_inputs = [x_sym, y_sym, z_sym]

        self._train_fn = theano.function(loss_inputs, loss, updates=param_updates)

        # NOTE(review): this reports the same expression that is trained on
        # (non-deterministic prediction, L2 term included) -- confirm that is
        # intended for evaluation.
        self._loss_fn = theano.function(loss_inputs, loss)

        self._output_fn = theano.function([x_sym], test_prediction)

        return init_params
Ejemplo n.º 6
0
def adadelta_momentum(grads,
                      params,
                      learning_rate=1.0,
                      momentum=0.9,
                      rho=0.95,
                      epsilon=1e-06):
    """Return `adadelta` updates passed through `apply_nesterov_momentum`.

    `grads`, `params`, `learning_rate`, `rho` and `epsilon` are forwarded
    positionally to `adadelta`; `params` and `momentum` are then forwarded
    to `apply_nesterov_momentum` together with the resulting updates.
    """
    base_updates = adadelta(grads, params, learning_rate, rho, epsilon)
    return apply_nesterov_momentum(base_updates,
                                   params=params,
                                   momentum=momentum)
Ejemplo n.º 7
0
    def build_instrument_model(self, n_vars, **kwargs):
        """Build the instrument network and compile its Theano functions.

        The network is: input -> dropout(0.2) -> dense(dense_size, tanh) ->
        dropout(0.2) -> (n_dense_layers - 1) x [dense(dense_size, tanh) ->
        dropout(0.5)] -> linear dense(1).  The output layer is stored on
        ``self.instrument_output``; the compiled functions on
        ``self._instrument_train_fn``, ``self._instrument_loss_fn`` and
        ``self._instrument_output_fn``.

        Args:
            n_vars: width of the input layer.
            **kwargs: must contain ``dense_size`` and ``n_dense_layers``.

        Returns:
            The network's parameter values as returned by
            ``layers.get_all_param_values`` right after construction.
        """
        y_sym = TT.vector()
        z_sym = TT.matrix()

        def mean_squared_error(predicted):
            # Both tensors are flattened before the element-wise comparison.
            return objectives.squared_error(predicted.flatten(),
                                            y_sym.flatten()).mean()

        net = layers.InputLayer((None, n_vars), z_sym)
        net = layers.DropoutLayer(net, p=0.2)

        net = layers.DenseLayer(net,
                                kwargs['dense_size'],
                                nonlinearity=nonlinearities.tanh)
        net = layers.DropoutLayer(net, p=0.2)

        extra_blocks = kwargs['n_dense_layers'] - 1
        for _ in xrange(extra_blocks):
            net = layers.DenseLayer(net,
                                    kwargs['dense_size'],
                                    nonlinearity=nonlinearities.tanh)
            net = layers.DropoutLayer(net, p=0.5)

        self.instrument_output = layers.DenseLayer(
            net, 1, nonlinearity=nonlinearities.linear)
        init_params = layers.get_all_param_values(self.instrument_output)

        prediction = layers.get_output(self.instrument_output,
                                       deterministic=False)
        test_prediction = layers.get_output(self.instrument_output,
                                            deterministic=True)

        # flexible here, endog variable can be categorical, continuous, etc.
        l2_cost = regularization.regularize_network_params(
            self.instrument_output, regularization.l2)
        loss = mean_squared_error(prediction) + 1e-4 * l2_cost
        # NOTE(review): the reported loss drops the L2 term but is still
        # built from the non-deterministic prediction -- confirm intended.
        loss_total = mean_squared_error(prediction)

        trainable = layers.get_all_params(self.instrument_output,
                                          trainable=True)
        param_updates = updates.adadelta(loss, trainable)

        self._instrument_train_fn = theano.function(
            [y_sym, z_sym], loss, updates=param_updates)

        self._instrument_loss_fn = theano.function(
            [y_sym, z_sym], loss_total)

        self._instrument_output_fn = theano.function(
            [z_sym], test_prediction)

        return init_params
Ejemplo n.º 8
0
    def run(self, parameter, parameterName, loss, **kwargs) :
        """Compute the Adadelta update for one parameter.

        Takes the gradient of `loss` with respect to the variable returned by
        ``parameter.getVar()`` and asks ``LUP.adadelta`` for the updates,
        using the "lr", "rho" and "epsilon" hyper-parameters of this
        optimizer.  Every other entry of the returned updates is registered
        on the result as a co-parameter named
        ``"<parameterName>_adadelta_<index>"``.

        Returns:
            An ``OptimizerResult`` for the parameter variable.
        """
        shared = parameter.getVar()
        gradient = tt.grad(loss, shared)
        step = LUP.adadelta(
            [ gradient ],
            [ shared ],
            learning_rate=self.getHP("lr"),
            rho=self.getHP("rho"),
            epsilon=self.getHP("epsilon"),
        )

        result = OptimizerResult(shared, parameterName, gradient, step[shared])

        # Everything in `step` other than the parameter itself.
        companions = ((var, new_value) for var, new_value in step.items()
                      if var is not shared)
        for index, (var, new_value) in enumerate(companions) :
            label = "%s_adadelta_%s" % (parameterName, index)
            result.addCoParameter(var, label, None, new_value)

        return result
Ejemplo n.º 9
0
def get_updates(nnet, train_obj, trainable_params, solver=None):
    """Build the update expressions for the requested solver.

    If `solver` is not one of the implemented names (including the default
    ``None``), "adam" is used.  The name actually used is written to
    ``nnet.sgd_solver``.  Hyper-parameters are read from ``Cfg``: every
    solver gets ``Cfg.learning_rate``; "momentum" and "nesterov" also get
    ``Cfg.momentum``; "rmsprop" and "adadelta" also get ``Cfg.rho``.

    Args:
        nnet: object on which ``sgd_solver`` is set as a side effect.
        train_obj: first positional argument forwarded to the ``l_updates``
            routine (presumably the training objective -- confirm).
        trainable_params: second positional argument forwarded to the
            ``l_updates`` routine.
        solver: name of the solver to use.

    Returns:
        Whatever the selected ``l_updates`` routine returns.
    """
    implemented_solvers = ("sgd", "momentum", "nesterov", "adagrad", "rmsprop",
                           "adadelta", "adam", "adamax")

    nnet.sgd_solver = solver if solver in implemented_solvers else "adam"
    chosen = nnet.sgd_solver

    # Cfg attributes each solver needs on top of the learning rate.
    extra_cfg_keys = {
        "momentum": ("momentum",),
        "nesterov": ("momentum",),
        "rmsprop": ("rho",),
        "adadelta": ("rho",),
    }

    # Every solver name matches its routine name, except "nesterov".
    routine_name = "nesterov_momentum" if chosen == "nesterov" else chosen
    routine = getattr(l_updates, routine_name)

    hyper = {"learning_rate": Cfg.learning_rate}
    for key in extra_cfg_keys.get(chosen, ()):
        hyper[key] = getattr(Cfg, key)

    return routine(train_obj, trainable_params, **hyper)
Ejemplo n.º 10
0
    def build_instrument_model(self, n_vars, **kwargs):
        """Build the instrument network and compile its Theano functions.

        Layers, in order: input, dropout(0.2), dense(dense_size, tanh),
        dropout(0.2), then (n_dense_layers - 1) repetitions of
        dense(dense_size, tanh) followed by dropout(0.5), and finally a
        linear dense layer with one unit, stored on
        ``self.instrument_output``.

        Sets ``self._instrument_train_fn``, ``self._instrument_loss_fn`` and
        ``self._instrument_output_fn``.

        Args:
            n_vars: width of the input layer.
            **kwargs: must contain ``dense_size`` and ``n_dense_layers``.

        Returns:
            The network's parameter values as returned by
            ``layers.get_all_param_values`` right after construction.
        """
        target_sym = TT.vector()
        instrument_sym = TT.matrix()

        stack = layers.DropoutLayer(
            layers.InputLayer((None, n_vars), instrument_sym), p=0.2)

        stack = layers.DropoutLayer(
            layers.DenseLayer(stack, kwargs['dense_size'], nonlinearity=nonlinearities.tanh),
            p=0.2)

        for _ in xrange(kwargs['n_dense_layers'] - 1):
            hidden = layers.DenseLayer(stack, kwargs['dense_size'], nonlinearity=nonlinearities.tanh)
            stack = layers.DropoutLayer(hidden, p=0.5)

        self.instrument_output = layers.DenseLayer(stack, 1, nonlinearity=nonlinearities.linear)
        init_params = layers.get_all_param_values(self.instrument_output)

        prediction = layers.get_output(self.instrument_output, deterministic=False)
        test_prediction = layers.get_output(self.instrument_output, deterministic=True)

        # flexible here, endog variable can be categorical, continuous, etc.
        l2_cost = regularization.regularize_network_params(self.instrument_output, regularization.l2)
        # Trained loss = mean squared error + weighted L2 term.
        loss = objectives.squared_error(prediction.flatten(), target_sym.flatten()).mean() + 1e-4 * l2_cost
        # Reported loss = mean squared error only.
        # NOTE(review): it is built from the non-deterministic prediction --
        # confirm that is intended for evaluation.
        loss_total = objectives.squared_error(prediction.flatten(), target_sym.flatten()).mean()

        trainable = layers.get_all_params(self.instrument_output, trainable=True)
        param_updates = updates.adadelta(loss, trainable)

        supervised_inputs = [target_sym, instrument_sym]

        self._instrument_train_fn = theano.function(supervised_inputs, loss, updates=param_updates)
        self._instrument_loss_fn = theano.function(supervised_inputs, loss_total)
        self._instrument_output_fn = theano.function([instrument_sym], test_prediction)

        return init_params
Ejemplo n.º 11
0
def train_setup():
    """Build the classification CNN and compile its train/validation functions.

    The network comes from ``cnn(x, config.input_length,
    config.output_length)``.  If ``config.init_model`` is set, parameter
    arrays named ``arr_0``, ``arr_1``, ... are loaded from that file with
    ``np.load`` and assigned to the network.

    The training objective is the mean categorical cross-entropy plus L1 and
    L2 penalties weighted by ``config.l1_weight`` / ``config.l2_weight``,
    optimised with Adadelta (``config.learning_rate``, ``config.rho``,
    ``config.eps``).

    Returns:
        ``(network, train_fn, val_fn)``.  ``train_fn(x, y)`` returns
        ``[ent, l1_norm, l2_norm, prediction]`` and applies the updates;
        ``val_fn(x, y)`` returns ``[val_ent, val_prediction]`` computed from
        the deterministic network output.
    """
    x = T.tensor3('input')
    y = T.lvector('output')

    network = cnn(x, config.input_length, config.output_length)

    # Single parenthesised argument: prints identically on Python 2 and 3.
    print('Number of Parameters {0}'.format(count_params(network)))

    if config.init_model is not None:

        with np.load(config.init_model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]

        # Load into the network built above; this function defines no
        # `decoding` network.
        set_all_param_values(network, param_values)

    # training tasks in sequence

    prediction = get_output(network)

    ent = categorical_crossentropy(prediction, y)
    ent = ent.mean()

    l1_norm = config.l1_weight * regularize_network_params(network, l1)
    l2_norm = config.l2_weight * regularize_network_params(network, l2)

    total_error = ent + l1_norm + l2_norm

    params = get_all_params(network, trainable=True)

    updates = adadelta(total_error, params, config.learning_rate,
                       config.rho, config.eps)

    train_fn = function([x, y], [ent, l1_norm, l2_norm, prediction],
                        updates=updates,
                        allow_input_downcast=True)

    val_prediction = get_output(network, deterministic=True)
    val_ent = categorical_crossentropy(val_prediction, y)
    val_ent = val_ent.mean()

    val_fn = function([x, y], [val_ent, val_prediction],
                      allow_input_downcast=True)

    return network, train_fn, val_fn
Ejemplo n.º 12
0
def train_setup():
    """Build the encoder/decoder CNN and compile its train/validation functions.

    ``cnn(...)`` returns the encoding and decoding networks.  If
    ``config.init_model`` is set, parameter arrays named ``arr_0``,
    ``arr_1``, ... are loaded from that file with ``np.load`` and assigned
    to the decoding network.

    The training objective is the mean squared error between `y` and the
    decoding output, plus L1 and L2 penalties weighted by
    ``config.l1_weight`` / ``config.l2_weight``, optimised with Adadelta.

    Returns:
        ``(encoding, decoding, train_fn, val_fn)``.  ``train_fn(x, y)``
        returns ``[error, l1_norm, l2_norm]`` and applies the updates;
        ``val_fn(x, y)`` returns the mean squared error of the deterministic
        output.
    """
    x = T.tensor3('input')
    y = T.matrix('output')

    encoding, decoding = cnn(x, config.input_length, config.output_length,
                             config.encoding_length)

    print('Number of Parameters {0}'.format(count_params(decoding)))

    if config.init_model is not None:
        with np.load(config.init_model) as archive:
            stored = [archive['arr_%d' % idx]
                      for idx in range(len(archive.files))]
        set_all_param_values(decoding, stored)

    # Training graph (non-deterministic forward pass).
    reconstruction = get_output(decoding)
    error = squared_error(y, reconstruction).mean()

    l1_norm = config.l1_weight * regularize_network_params(decoding, l1)
    l2_norm = config.l2_weight * regularize_network_params(decoding, l2)

    objective = error + l1_norm + l2_norm

    trainable = get_all_params(decoding, trainable=True)
    updates = adadelta(objective, trainable, config.learning_rate,
                       config.rho, config.eps)

    train_fn = function([x, y], [error, l1_norm, l2_norm],
                        updates=updates,
                        allow_input_downcast=True)

    # Validation graph (deterministic forward pass, no penalties).
    val_reconstruction = get_output(decoding, deterministic=True)
    val_error = squared_error(y, val_reconstruction).mean()

    val_fn = function([x, y], val_error, allow_input_downcast=True)

    return encoding, decoding, train_fn, val_fn
Ejemplo n.º 13
0
def main():
    """Train a convolutional autoencoder and save the results.

    Steps: install a SIGINT handler that sets the global ``terminate`` flag,
    load the data and options, build the network variant named by
    ``options['MODEL']``, compile the training / cost / reconstruction
    functions, train for ``options['NUM_EPOCHS']`` epochs (the learning rate
    is multiplied by 0.9 after each epoch once ``epoch > 10``), then write
    the reconstruction and cost plots, the first-conv-layer weight plot and
    both models to disk.

    Raises:
        ValueError: if ``options['MODEL']`` is not one of 'normal',
            'batchnorm', 'dropout' or 'bn+dropout'.
    """

    # NOTE(review): `terminate` is read below as a module-level global, so it
    # must already be initialised at module scope -- confirm.
    def signal_handler(signum, frame):
        # Parameter is named `signum` so it does not shadow the `signal` module.
        global terminate
        terminate = True
        print('terminating...')

    signal.signal(signal.SIGINT, signal_handler)
    configure_theano()
    options = parse_options()
    X, X_val = generate_data()

    print('X type and shape:', X.dtype, X.shape)
    print('X.min():', X.min())
    print('X.max():', X.max())

    print('X_val type and shape:', X_val.dtype, X_val.shape)
    print('X_val.min():', X_val.min())
    print('X_val.max():', X_val.max())

    # we need our target to be 1 dimensional
    X_out = X.reshape((X.shape[0], -1))
    X_val_out = X_val.reshape((X_val.shape[0], -1))
    print('X_out:', X_out.dtype, X_out.shape)
    print('X_val_out', X_val_out.dtype, X_val_out.shape)

    print('constructing and compiling model...')
    input_var = T.tensor3('input', dtype='float32')
    target_var = T.matrix('output', dtype='float32')
    # Shared variable so the learning rate can be decayed between epochs
    # without recompiling.
    lr = theano.shared(np.array(0.8, dtype=theano.config.floatX), name='learning_rate')
    lr_decay = np.array(0.9, dtype=theano.config.floatX)

    # The 1200-wide input is reshaped to single-channel 30x40 images.
    l_input = InputLayer((None, None, 1200), input_var, name='input')
    l_input = ReshapeLayer(l_input, (-1, 1, 30, 40), name='reshape_input')

    model_name = options['MODEL']
    if model_name == 'normal':
        network, encoder = avletters_convae.create_model(l_input, options)
    elif model_name == 'batchnorm':
        network, encoder = avletters_convae_bn.create_model(l_input, options)
    elif model_name == 'dropout':
        network, encoder = avletters_convae_drop.create_model(l_input, options)
    elif model_name == 'bn+dropout':
        network, encoder = avletters_convae_bndrop.create_model(l_input, options)
    else:
        # Without this, `network` would be unbound at the first use below.
        raise ValueError(
            "Unknown MODEL {!r}; expected 'normal', 'batchnorm', 'dropout' "
            "or 'bn+dropout'".format(model_name))

    print('AE Network architecture: {}'.format(options['MODEL']))
    print_network(network)

    recon = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(squared_error(recon, target_var))
    updates = adadelta(cost, all_params, lr)

    use_max_constraint = False
    print('apply max norm constraint: {}'.format(use_max_constraint))
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                updates[param] = norm_constraint(param, MAX_NORM)

    train = theano.function([input_var, target_var], recon, updates=updates, allow_input_downcast=True)
    train_cost_fn = theano.function([input_var, target_var], cost, allow_input_downcast=True)

    eval_recon = las.layers.get_output(network, deterministic=True)
    eval_cost = T.mean(las.objectives.squared_error(eval_recon, target_var))
    eval_cost_fn = theano.function([input_var, target_var], eval_cost, allow_input_downcast=True)
    recon_fn = theano.function([input_var], eval_recon, allow_input_downcast=True)

    # Honour a Ctrl-C received while the functions were compiling.
    if terminate:
        exit()

    NUM_EPOCHS = options['NUM_EPOCHS']
    EPOCH_SIZE = options['EPOCH_SIZE']
    NO_STRIDES = options['NO_STRIDES']
    VAL_NO_STRIDES = options['VAL_NO_STRIDES']

    print('begin training for {} epochs...'.format(NUM_EPOCHS))
    datagen = batch_iterator(X, X_out, 128)

    costs = []
    val_costs = []
    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            batch_X, batch_y = next(datagen)
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(batch_X), lr.get_value())
            print(print_str, end='')
            sys.stdout.flush()
            batch_X = batch_X.reshape((-1, 1, 1200))
            train(batch_X, batch_y)
            # Return to the start of the line so the next status overwrites it.
            print('\r', end='')
            if terminate:
                break
        if terminate:
            break

        cost = batch_compute_cost(X, X_out, NO_STRIDES, train_cost_fn)
        val_cost = batch_compute_cost(X_val, X_val_out, VAL_NO_STRIDES, eval_cost_fn)
        costs.append(cost)
        val_costs.append(val_cost)

        print("Epoch {} train cost = {}, validation cost = {} ({:.1f}sec) "
              .format(epoch + 1, cost, val_cost, time.time() - time_start))
        if epoch > 10:
            lr.set_value(lr.get_value() * lr_decay)

    X_val_recon = recon_fn(X_val)
    visualize_reconstruction(X_val_out[450:550], X_val_recon[450:550], shape=(30, 40), savefilename='avletters')
    plot_validation_cost(costs, val_costs, None, savefilename='valid_cost')

    conv2d1 = las.layers.get_all_layers(network)[2]
    visualize.plot_conv_weights(conv2d1, (15, 14)).savefig('conv2d1.png')

    print('saving encoder...')
    save_model(encoder, 'models/conv_encoder.dat')
    save_model(network, 'models/conv_ae.dat')
Ejemplo n.º 14
0
def main():
    """Train and evaluate an LSTM classifier on encoder features of video frames.

    Steps performed, in order:

    1. Read hyper-parameters and file locations from
       ``config/separate_train.ini``.
    2. Load the image matrix, targets, subject ids and video lengths from the
       configured .mat file and split them by the subject ids listed in
       ``data/train.txt``, ``data/val.txt`` and ``data/test.txt``.
    3. Optionally fine-tune, save and/or load the DBN auto-encoder, then encode
       every split with its encoder half.
    4. Normalise the encoded features with the training-set mean/std.
    5. Build the LSTM classifier, train it with Adadelta, and stop early when
       the validation-cost window satisfies ``early_stop``.
    6. Print the best validation results and plot the confusion matrix and
       cost curves.

    Raises:
        ValueError: if the configuration neither fine-tunes nor loads a DBN,
            or asks to save a fine-tuned DBN that was never produced.
    """
    configure_theano()
    config_file = 'config/separate_train.ini'
    print('loading config file: {}'.format(config_file))
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    ae_pretrained = config.get('models', 'pretrained')
    ae_finetuned = config.get('models', 'finetuned')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    lstm_units = int(config.get('training', 'lstm_units'))
    output_units = int(config.get('training', 'output_units'))
    do_finetune = config.getboolean('training', 'do_finetune')
    save_finetune = config.getboolean('training', 'save_finetune')
    load_finetune = config.getboolean('training', 'load_finetune')

    # 53 subjects, 70 utterances, 5 view angles
    # s[x]_v[y]_u[z].mp4
    # resized, height, width = (26, 44)
    # ['dataMatrix', 'targetH', 'targetsPerVideoVec', 'videoLengthVec', '__header__', 'targetsVec',
    # '__globals__', 'iterVec', 'filenamesVec', 'dataMatrixCells', 'subjectsVec', 'targetW', '__version__']

    print(data.keys())
    X = data['dataMatrix'].astype('float32')  # .reshape((-1, 26, 44), order='f').reshape((-1, 26 * 44))
    y = data['targetsVec'].astype('int32')
    y = y.reshape((len(y),))
    uniques = np.unique(y)
    print('number of classifications: {}'.format(len(uniques)))
    subjects = data['subjectsVec'].astype('int')
    subjects = subjects.reshape((len(subjects),))
    video_lens = data['videoLengthVec'].astype('int')
    video_lens = video_lens.reshape((len(video_lens),))

    train_subject_ids = read_data_split_file('data/train.txt')
    val_subject_ids = read_data_split_file('data/val.txt')
    test_subject_ids = read_data_split_file('data/test.txt')
    print('Train: {}'.format(train_subject_ids))
    print('Validation: {}'.format(val_subject_ids))
    print('Test: {}'.format(test_subject_ids))

    train_X, train_y, train_vidlens, train_subjects, \
    val_X, val_y, val_vidlens, val_subjects, \
    test_X, test_y, test_vidlens, test_subjects = \
        split_data(X, y, subjects, video_lens, train_subject_ids, val_subject_ids, test_subject_ids)

    # sanity check: the three splits together must cover every sample exactly once
    assert train_X.shape[0] + val_X.shape[0] + test_X.shape[0] == len(X)
    assert train_y.shape[0] + val_y.shape[0] + test_y.shape[0] == len(y)
    assert train_vidlens.shape[0] + val_vidlens.shape[0] + test_vidlens.shape[0] == len(video_lens)
    assert train_subjects.shape[0] + val_subjects.shape[0] + test_subjects.shape[0] == len(subjects)

    # Every split goes through the same encoder below, so every split needs the
    # same input preprocessing. The validation split was previously skipped
    # here, which fed un-normalised frames to the encoder.
    train_X = normalize_input(train_X, centralize=True)
    val_X = normalize_input(val_X, centralize=True)
    test_X = normalize_input(test_X, centralize=True)

    dbn = None
    if do_finetune:
        dbn = load_dbn(ae_pretrained)
        dbn.initialize()
        dbn.fit(train_X, train_X)
        recon = dbn.predict(test_X)
        visualize_reconstruction(reorder_data(test_X[800:864], (26, 44)),
                                 reorder_data(recon[800:864], (26, 44)),
                                 shape=(26, 44))

    if save_finetune:
        if dbn is None:
            raise ValueError('save_finetune is set but no DBN was fine-tuned '
                             '(enable do_finetune)')
        with open(ae_finetuned, 'wb') as handle:
            pickle.dump(dbn, handle)

    if load_finetune:
        print('loading pre-trained encoding layers...')
        # NOTE: pickle executes arbitrary code on load; only point the
        # 'finetuned' config entry at files you produced yourself.
        with open(ae_finetuned, 'rb') as handle:
            dbn = pickle.load(handle)
        dbn.initialize()
        # recon = dbn.predict(test_X)
        # visualize_reconstruction(reorder_data(test_X[800:864], (26, 44)),
        #                         reorder_data(recon[800:864], (26, 44)),
        #                         shape=(26, 44))

    if dbn is None:
        raise ValueError('no DBN available: enable do_finetune or load_finetune '
                         'in {}'.format(config_file))

    encoder = extract_encoder(dbn)
    train_X = encoder.predict(train_X)
    val_X = encoder.predict(val_X)
    test_X = encoder.predict(test_X)

    # train_X = concat_first_second_deltas(train_X, train_vidlens)
    # val_X = concat_first_second_deltas(val_X, val_vidlens)
    # test_X = concat_first_second_deltas(test_X, test_vidlens)

    # featurewise normalize: statistics come from the training split only and
    # are re-used for the validation and test splits
    train_X, mean, std = featurewise_normalize_sequence(train_X)
    val_X = (val_X - mean) / std
    test_X = (test_X - mean) / std

    # recon = dbn.predict(test_X)
    # visualize_reconstruction(test_X[550:650], recon[550:650], (26, 44))
    # exit()

    # IMPT: the encoder was trained with Fortran-ordered images, so to visualize
    # convert all the images to C order using reshape_images_order()
    # output = dbn.predict(test_X)
    # test_X = reshape_images_order(test_X, (26, 44))
    # output = reshape_images_order(output, (26, 44))
    # visualize_reconstruction(test_X[:36, :], output[:36, :], shape=(26, 44))

    inputs = T.tensor3('inputs', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    # shared variable so the learning rate can be decayed between epochs
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX), name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing lstm classifier...')
    network = lstm_classifier_baseline.create_model((None, None, 50), inputs,
                                                    (None, None), mask,
                                                    lstm_units, output_units)

    print_network(network)
    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)
    # updates = las.updates.apply_momentum(sgd(cost, all_params, learning_rate=lr), all_params, 0.1)

    use_max_constraint = False
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                updates[param] = norm_constraint(param, MAX_NORM * las.utils.compute_norms(param.get_value()).mean())

    train = theano.function(
        [inputs, targets, mask],
        cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function([inputs, targets, mask], cost, allow_input_downcast=True)

    # deterministic=True builds the evaluation-time graph
    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask], test_cost, allow_input_downcast=True)

    val_fn = theano.function([inputs, mask], test_predictions, allow_input_downcast=True)

    # Train for at most NUM_EPOCHS epochs of EPOCH_SIZE minibatches each
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 30
    EPOCH_SIZE = 120
    BATCH_SIZE = 10
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 10
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE,))
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0
    # Defined up front so the final report cannot hit an unbound name when the
    # validation cost never improves on +inf (e.g. NaN costs).
    test_cr = 0.0
    test_conf = None

    datagen = gen_lstm_batch_random(train_X, train_y, train_vidlens, batchsize=BATCH_SIZE)
    val_datagen = gen_lstm_batch_random(val_X, val_y, val_vidlens, batchsize=len(val_vidlens))
    test_datagen = gen_lstm_batch_random(test_X, test_y, test_vidlens, batchsize=len(test_vidlens))

    # One batch sized to the whole validation / test split is drawn once and
    # re-used for every evaluation.
    X_val, y_val, mask_val, _ = next(val_datagen)
    X_test, y_test, mask_test, _ = next(test_datagen)

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds the one before it.

        Windows with fewer than two entries never trigger a stop. The order is
        whatever iterating `cost_window` yields (presumably oldest first --
        confirm against circular_list).
        """
        if len(cost_window) < 2:
            return False
        else:
            curr = cost_window[0]
            for idx, window_cost in enumerate(cost_window):
                if curr < window_cost or idx == 0:
                    curr = window_cost
                else:
                    return False
            return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, _ = next(datagen)
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m)
            print('\r', end='')
        # the training cost is measured on the last minibatch of the epoch only
        cost = compute_train_cost(X, y, m)
        val_cost = compute_test_cost(X_val, y_val, mask_val)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # gl: percentage by which the current validation cost exceeds the best
        #     one seen so far.
        # pk: training progress over the last STRIP_SIZE epochs. While
        #     train_strip still holds its initial zeros this divides by zero
        #     and yields inf/nan.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, val_fn)
        class_rate.append(cr)

        if val_cost < best_val:
            # new best validation cost: remember it and score the test split
            best_val = val_cost
            best_conf = val_conf
            best_cr = cr
            test_cr, test_conf = evaluate_model(X_test, y_test, mask_test, val_fn)
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f}, Test CR= {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, test_cr, time.time() - time_start))
        else:
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, time.time() - time_start))

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch > decay_start:
            lr.set_value(lr.get_value() * lr_decay)

    phrases = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']

    print('Final Model')
    print('CR: {}, val loss: {}, Test CR: {}'.format(best_cr, best_val, test_cr))
    print('confusion matrix: ')
    if test_conf is not None:
        plot_confusion_matrix(test_conf, phrases, fmt='grid')
    plot_validation_cost(cost_train, cost_val, savefilename='valid_cost')
def build_network_2dconv(args, input_var, target_var, wordEmbeddings, maxlen=60):
    """Build a 2D-convolution sentence classifier and compile its functions.

    The network is: frozen embedding lookup -> one Conv2D layer whose filters
    span 3 tokens and the full embedding width -> max-pool over all positions
    -> sigmoid dense layer -> 2-unit softmax. The loss is the mean binary
    cross-entropy plus a weighted L2 penalty on the conv, hidden and output
    layers.

    Args:
        args: options object; reads ``args.hiddenDim`` (hidden layer width),
            ``args.optimizer`` (one of "sgd", "adagrad", "adadelta",
            "nesterov", "rms", "adam") and ``args.step`` (learning rate).
        input_var: Theano variable of token indices, declared to the input
            layer as shape (batch, maxlen).
        target_var: Theano variable holding the targets for the loss.
        wordEmbeddings: embedding matrix laid out as (wordDim, vocab_size); it
            is transposed before being handed to the embedding layer.
        maxlen: sentence length the network is built for.

    Returns:
        (train_fn, val_fn): ``train_fn(inputs, targets)`` applies one update
        and returns the regularised training loss; ``val_fn(inputs, targets)``
        returns ``[loss, accuracy]`` from the deterministic forward pass.

    Raises:
        ValueError: if ``args.optimizer`` is not one of the supported names.
    """
    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config: each filter covers 3 consecutive tokens across the
    # whole embedding width, and pooling collapses every remaining position.
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    input_layer = InputLayer((None, maxlen), input_var=input_var)
    batchsize, seqlen = input_layer.input_var.shape
    emb = EmbeddingLayer(input_layer, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    # keep the embeddings fixed during training
    emb.params[emb.W].remove("trainable")  # (batchsize, maxlen, wordDim)

    # add a singleton channel axis for the 2D convolution
    reshape = ReshapeLayer(emb, (batchsize, 1, maxlen, wordDim))

    conv2d = Conv2DLayer(
        reshape,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (batch, num_filters, maxlen - 2, 1)
    maxpool = MaxPool2DLayer(conv2d, pool_size=pool_size)  # (batch, num_filters, 1, 1)

    forward = FlattenLayer(maxpool)  # (batch, num_filters)

    hid = DenseLayer(forward, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    layers = {conv2d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Raising a bare string is itself a TypeError and hides the real
        # problem; raise a proper exception instead.
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn
Ejemplo n.º 16
0
    def __init__(self,
                 hidden_size=100,
                 nclasses=73,
                 num_embeddings=11359,
                 embedding_dim=100,
                 window_size=1,
                 memory_size=40,
                 n_memory_slots=8,
                 go_code=1,
                 depth=2,
                 load_dir=None):
        """Create the shared parameters and compile the Theano functions.

        Three scans are built over the same ``recurrence`` step: one reading
        the articles, one producing titles during training, and one producing
        titles at test time. They back ``self.learn``, ``self.test`` and
        ``self.predict``.

        :param hidden_size: width of the hidden state ``h0`` and of the
               hidden-side weight matrices
        :param nclasses: number of output classes (columns of ``W``)
        :param num_embeddings: vocabulary size given to the embedding layer
        :param embedding_dim: embedding width; also the GRU ``num_units``
        :param window_size: stored on ``self``; scales the input side of
               ``Wg_a``, ``Wg_t`` and ``Wx``
        :param memory_size: number of rows of the memories ``M_a`` / ``M_t``
        :param n_memory_slots: total memory slots; half (rounded down) go to
               the article memory, the rest to the title memory
        :param go_code: index used as the first input of the test-time scan
        :param depth: number of ``gru<l>`` initial-state parameters to create
        :param load_dir: if not None, directory holding a ``params.pkl`` whose
               contents are merged into ``self.__dict__``
        """

        articles, titles = T.imatrices('articles', 'titles')
        n_article_slots = int(n_memory_slots /
                              2)  # TODO derive this from an arg
        n_title_slots = n_memory_slots - n_article_slots
        n_instances = articles.shape[0]

        self.window_size = window_size

        randoms = {
            # attr: shape
            # 'emb': (num_embeddings + 1, embedding_dim),
            'M_a': (memory_size, n_article_slots),
            'M_t': (memory_size, n_title_slots),
            'w_a': (n_article_slots, ),
            'w_t': (n_title_slots, ),
            'Wg_a': (window_size * embedding_dim, n_article_slots),
            'Wg_t': (window_size * embedding_dim, n_title_slots),
            'Wk': (hidden_size, memory_size),
            'Wb': (hidden_size, 1),
            'Wv': (hidden_size, memory_size),
            'We_a': (hidden_size, n_article_slots),
            'We_t': (hidden_size, n_title_slots),
            'Wx': (window_size * embedding_dim, hidden_size),
            'Wh': (memory_size, hidden_size),
            'W': (hidden_size, nclasses),
            'h0': hidden_size
        }

        zeros = {
            # attr: shape
            'bg_a': n_article_slots,
            'bg_t': n_title_slots,
            'bk': memory_size,
            'bb': 1,
            'bv': memory_size,
            'be_a': n_article_slots,
            'be_t': n_title_slots,
            'bh': hidden_size,
            'b': nclasses,
        }

        # one initial hidden state per GRU level
        for l in range(depth):
            randoms['gru' + str(l)] = (1, embedding_dim)

        def random_shared(name):
            """ shared variable drawn from N(0, 1), scaled by 0.2 """
            shape = randoms[name]
            return theano.shared(
                0.2 *
                np.random.normal(size=shape).astype(theano.config.floatX),
                name=name)

        def zeros_shared(name):
            """ shared variable filled with zeros """
            shape = zeros[name]
            return theano.shared(np.zeros(shape, dtype=theano.config.floatX),
                                 name=name)

        for key in randoms:
            # create an attribute with associated shape and random values
            setattr(self, key, random_shared(key))

        for key in zeros:
            # create an attribute with associated shape and values equal to 0
            setattr(self, key, zeros_shared(key))

        # list(...) keeps this working on Python 3, where dict views cannot be
        # concatenated with `+` and have no .remove()
        self.names = list(randoms) + list(zeros)
        # self.names.remove('emb')  # no need to save or update embeddings
        scan_vars = 'h0 w_a M_a w_t M_t'.split()

        def repeat_for_each_instance(param):
            """ repeat param along new axis once for each instance """
            return T.repeat(T.shape_padleft(param),
                            repeats=n_instances,
                            axis=0)

        # The scan state variables are replaced by per-instance symbolic
        # copies and dropped from the list of named parameters.
        for key in scan_vars:
            setattr(self, key, repeat_for_each_instance(getattr(self, key)))
            self.names.remove(key)

        if load_dir is not None:
            # NOTE: unpickling executes arbitrary code; only load parameter
            # files from a trusted source. Pickles must be read in binary mode.
            with open(os.path.join(load_dir, 'params.pkl'), 'rb') as handle:
                params = pickle.load(handle)
                self.__dict__.update(params)

        def recurrence(i, h_tm1, w_a, M_a, *args, **kwargs):
            """
            notes
            Headers from paper in all caps
            mem = n_article slots if is_article else n_title_slots

            :param i: center index of sliding window
            :param h_tm1: h_{t-1} (hidden state)
            :param w_a: attention weights for article memory
            :param M_a: article memory
            :param args: gru_weights, maybe w_t, maybe M_t
                   gru_weights: weights with which to initialize GRULayer on each time step
                   w_t: attention weights for titles memory
                   M_t: titles memory
            :param kwargs: is_training, is_article
                   is_training:
                   is_article: we use different parts of memory when working with a article
            :return: [y = model outputs,
                      y_max = argmax of y,
                      next index (i + 1, or y_max at test time),
                      h, w, M (see above)]
            """
            is_training = kwargs['is_training']
            is_article = kwargs['is_article']
            gru_weights = args[:depth]
            # default to None so the assertion below (not a NameError) reports
            # a missing title memory
            w_t = M_t = None
            if len(args) > depth:
                w_t = args[depth]
                M_t = args[depth + 1]

            i_type = T.iscalar if is_article or is_training else T.ivector
            assert i.type == i_type

            if not is_article:
                assert w_t is not None and M_t is not None

            word_idxs = i
            if is_article or is_training:
                # get representation of word window
                document = articles if is_article else titles  # [instances, bucket_width]
                word_idxs = document[:, i:i + 1]  # [instances, 1]
            # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]

            input_layer = InputLayer(shape=(None, 1), input_var=word_idxs)
            embed = EmbeddingLayer(input_layer, num_embeddings, embedding_dim)
            gru = GRULayer(incoming=embed,
                           num_units=embedding_dim,
                           hid_init=self.gru0)
            for weight in gru_weights:
                gru = GRULayer(incoming=gru,
                               num_units=embedding_dim,
                               hid_init=weight)
            x_i = get_output(gru).flatten(ndim=2)
            x_i = Print('x_i')(x_i)  # [instances, embedding_dim]

            if is_article:
                M_read = M_a  # [instances, memory_size, n_article_slots]
                w_read = w_a  # [instances, n_article_slots]
            else:
                M_read = T.concatenate(
                    [M_a, M_t],
                    axis=2)  # [instances, memory_size, n_title_slots]
                w_read = T.concatenate([w_a, w_t],
                                       axis=1)  # [instances, n_title_slots]

            # eqn 15
            c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

            # EXTERNAL MEMORY READ
            def get_attention(Wg, bg, M, w):
                g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

                # eqn 11
                k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

                # eqn 13
                beta = T.dot(h_tm1, self.Wb) + self.bb
                beta = T.nnet.softplus(beta)
                beta = T.addbroadcast(beta, 1)  # [instances, 1]

                # eqn 12
                w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

                # eqn 14
                return (1 - g) * w + g * w_hat  # [instances, mem]

            w_a = get_attention(self.Wg_a, self.bg_a, M_a,
                                w_a)  # [instances, n_article_slots]
            if not is_article:
                w_t = get_attention(self.Wg_t, self.bg_t, M_t,
                                    w_t)  # [instances, n_title_slots]

            # MODEL INPUT AND OUTPUT
            # eqn 9
            h = T.dot(c, self.Wh) + T.dot(
                x_i, self.Wx) + self.bh  # [instances, hidden_size]

            # eqn 10
            y = T.nnet.softmax(T.dot(h, self.W) +
                               self.b)  # [instances, nclasses]

            # EXTERNAL MEMORY UPDATE
            def update_memory(We, be, w_update, M_update):
                # eqn 17
                e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
                f = 1. - w_update * e  # [instances, mem]

                # eqn 16
                v = T.tanh(T.dot(h, self.Wv) +
                           self.bv)  # [instances, memory_size]

                # need to add broadcast layers for memory update
                f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

                # eqn 19
                return M_update * f + T.batched_dot(v, u) * (
                    1 - f)  # [instances, memory_size, mem]

            M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
            attention_and_memory = [w_a, M_a]
            if not is_article:
                M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
                attention_and_memory += [w_t, M_t]

            y_max = y.argmax(axis=1).astype(int32)
            # in training / article mode step to the next position; at test
            # time feed the predicted word back in
            next_idxs = i + 1 if is_training or is_article else y_max
            return [y, y_max, next_idxs, h] + attention_and_memory

        read_article = partial(recurrence, is_training=True, is_article=True)
        # for read_article, it actually doesn't matter whether is_training is true

        i0 = T.constant(0, dtype=int32, name='first_value_of_i')
        # getattr instead of eval: same lookup without evaluating a string
        gru_weights = [getattr(self, 'gru' + str(l)) for l in range(depth)]
        outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a
                        ] + gru_weights

        [_, _, _, h, w, M], _ = theano.scan(fn=read_article,
                                            outputs_info=outputs_info,
                                            n_steps=articles.shape[1],
                                            name='read_scan')

        produce_title = partial(recurrence, is_training=True, is_article=False)
        # start the title scan from the final state of the article scan
        outputs_info[3:6] = [param[-1, :, :] for param in (h, w, M)]
        outputs_info.extend([self.w_t, self.M_t])
        bucket_width = titles.shape[
            1] - 1  # subtract 1 because <go> is omitted in y_true
        [y, y_max, _, _, _, _, _,
         _], _ = theano.scan(fn=produce_title,
                             outputs_info=outputs_info,
                             n_steps=bucket_width,
                             name='train_scan')

        # loss and updates
        y_clip = T.clip(y, .01, .99)
        y_flatten = y_clip.dimshuffle(2, 1, 0).flatten(ndim=2).T
        y_true = titles[:, 1:].ravel()  # [:, 1:] in order to omit <go>
        counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
        # inverse-frequency weights; positions whose target is 0 get weight 0
        weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
        losses = T.nnet.categorical_crossentropy(y_flatten, y_true)
        loss = objectives.aggregate(losses, weights, mode='sum')
        updates = adadelta(loss, self.params())

        self.learn = theano.function(inputs=[articles, titles],
                                     outputs=[y_max.T, loss],
                                     updates=updates,
                                     allow_input_downcast=True,
                                     name='learn')

        produce_title_test = partial(recurrence,
                                     is_training=False,
                                     is_article=False)

        # NOTE(review): self.test is compiled from the train_scan y_max, i.e.
        # before y_max is rebound by test_scan below -- confirm this is intended.
        self.test = theano.function(inputs=[articles, titles],
                                    outputs=[y_max.T],
                                    on_unused_input='ignore')

        # at test time the index slot carries word ids, seeded with go_code
        outputs_info[2] = T.zeros([n_instances], dtype=int32) + go_code
        [_, y_max, _, _, _, _, _,
         _], _ = theano.scan(fn=produce_title_test,
                             outputs_info=outputs_info,
                             n_steps=bucket_width,
                             name='test_scan')

        self.predict = theano.function(inputs=[articles, titles],
                                       outputs=y_max.T,
                                       name='infer')
Ejemplo n.º 17
0
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):
    """Build a 1D-convolution event-span classifier and compile its functions.

    The network is: embedding lookup -> reshape so each sequence position
    carries ``num_feats * wordDim`` values -> Conv1D (tanh) -> axis swap ->
    max-pool -> sigmoid dense layer -> 2-unit softmax. The loss is the mean
    binary cross-entropy plus a weighted L2 penalty.

    Args:
        args: options object; reads ``args.hiddenDim`` (hidden layer width),
            ``args.optimizer`` (one of "sgd", "adagrad", "adadelta",
            "nesterov", "rms", "adam") and ``args.step`` (learning rate).
        input_var: Theano variable of indices, declared to the input layer as
            shape (batch, seqlen, num_feats).
        target_var: Theano variable holding the targets for the loss.
        wordEmbeddings: embedding matrix laid out as (wordDim, vocab_size); it
            is transposed before being handed to the embedding layer.
        seqlen: sequence length the network is built for.
        num_feats: number of index features per sequence position.

    Returns:
        (train_fn, val_fn, network): ``train_fn(inputs, targets)`` applies one
        update and returns the regularised training loss;
        ``val_fn(inputs, targets)`` returns ``[loss, accuracy]`` from the
        deterministic forward pass; ``network`` is the output layer.

    Raises:
        ValueError: if ``args.optimizer`` is not one of the supported names.
    """
    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1

    # the convolution window spans one full embedding width
    filter_size = wordDim

    input_layer = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input_layer.input_var.shape
    emb = EmbeddingLayer(input_layer, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    # emb.params[emb.W].remove('trainable')  # (batchsize, seqlen, wordDim)

    # merge the per-feature embeddings of each position into one vector; the
    # sequence positions then act as the channels of the 1D convolution
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=filter_size, stride=1,
                         nonlinearity=tanh, W=GlorotUniform())
    # output: (batch, num_filters, num_feats * wordDim - filter_size + 1)

    # swap the filter and position axes so pooling runs across the filters
    conv1d = DimshuffleLayer(conv1d, (0, 2, 1))

    pool_size = num_filters

    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size)

    # forward = FlattenLayer(maxpool)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    # NOTE(review): `conv1d` now names the DimshuffleLayer, not the convolution
    # layer itself, so the convolution weights may not be reached by this
    # penalty -- confirm against regularize_layer_params_weighted. Kept as-is
    # to preserve the training objective.
    layers = {emb: lambda_val, conv1d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Raising a bare string is itself a TypeError and hides the real
        # problem; raise a proper exception instead.
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var],
                               loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
Ejemplo n.º 18
0
    def build_chain_trainer(self):
        bs = self.bs
        td = self.td

        wi = T.ivector('wi')  # bs (disamb. word indices)
        nwi = T.ivector('nwi')  # negative samples
        lr = T.dscalar('lr').astype(theano.config.floatX)  # learning rate
        lam = T.dscalar('lam').astype(theano.config.floatX)
        L = self.params['L']
        L1 = self.params['L1']  # hd x td
        #Wt = self.params['Wt']
        if not self.hinge_cost:
            L2 = self.params['L2']
            B = self.params['B']  # td
            B2 = self.params['B2']

        dwe = self.params['dwe']
        df = self.dat[wi, :]  #T.itensor3('df')# bs x mw x ms
        pr = self.sense_priors[wi, :]  # bs x mw x ms
        mk = self.dmask[wi, :]  #T.itensor3('mk')# bs x mw x ms
        pd = self.pd[
            wi, :]  #T.imatrix('pd') # bs x mdw (plain definition sentence)
        pe = self.ex[wi, :]  # plain example sentences bs x mew
        dw = dwe[wi, :]  # bs x td
        msk = self.wmask[wi, :].dimshuffle(0, 1, 'x')  # bs x mw x 1
        ndw = dwe[nwi, :]  # negative words

        def to_vect(d, m, p):
            hid_inp = dwe[d, :]  # mw x ms x hd
            logit = T.exp(T.dot(hid_inp, L0)[:, :, p])  # (mw x ms) x mw
            mk = T.switch(T.lt(p, 0), 0,
                          1)  # mw: word-level mask (different mask from m)
            mask = mk.dimshuffle(0, 'x', 'x')
            l2 = logit * mask  # mw x ms x mw
            l2 = T.sum(l2 * mk.dimshuffle('x', 'x', 0), axis=2) * m  # mw x ms
            w0 = l2 / T.sum(l2, axis=1).dimshuffle(0, 'x')
            w1 = T.switch(T.isnan(w0), 0, w0)
            w = w1.dimshuffle(0, 1, 'x')  # mw x ms x 1
            res = T.sum(w * hid_inp, axis=1)  # mw x hd
            return res  #, logit, weights

        def to_weight(d, m, p, prior):
            logit = T.tensordot(dwe[d, :], dwe.T,
                                axes=1)[:, :, d]  # mw x ms x mw x ms
            cnt = T.sum(m, axis=1).dimshuffle('x', 'x', 0)  # 1 x 1 x mw
            logit = T.sum(logit * m.dimshuffle('x', 'x', 0, 1),
                          axis=3) / cnt  # mw x ms x mw
            logit = T.exp(10 *
                          T.switch(T.isnan(logit), 0, logit))  # mw x ms x mw
            logit = T.prod(logit, axis=2) * prior  # mw x ms
            sm = T.sum(logit * m, axis=1, keepdims=True)  # mw x 1
            #mask = T.switch(T.lt(p, 0), 0, 1).dimshuffle(0, 'x') #
            logit = (logit * m) / sm  # mw x ms
            return T.switch(T.or_(T.isnan(logit), T.isinf(logit)), 0, logit)

        '''def to_weight(d, m, p, prior):
			A = dwe[d, :] # mw x ms x td
			#tmp = T.tensordot(T.dot(A, Wt), A.T, axes=1) # mw x ms x ms x mw
			#B = A * Wt.dimshuffle('x', 'x', 0) # 'diag' setting
			#tmp = T.tensordot(B, B.T, axes = 1)
			tmp = T.tensordot(A, A.T, axes = 1) # 'iden' setting
			tmp = T.exp(1000 * tmp.dimshuffle(0, 1, 3, 2)) # mw x ms x mw x ms
			tmp = tmp * m.dimshuffle('x', 'x', 0, 1)
			nrm = T.sum(tmp, axis=3)
			tmp = tmp / nrm.dimshuffle(0, 1, 2, 'x')
			tmp = T.switch(T.isnan(tmp), 0, tmp)
			mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m)
			tmp = T.max(tmp, axis=3) * mk.dimshuffle('x', 'x', 0) # mw x ms x mw
			tmp = T.exp(T.sum(T.log(T.switch(T.eq(tmp, 0), 1, tmp)), axis=2)) * m # mw x ms
			tmp = tmp * prior
			tmp = tmp / T.sum(tmp, axis=1).dimshuffle(0, 'x')
			return T.switch(T.isnan(tmp), 0, tmp)'''

        def cosim(x, y):
            return T.mean(
                T.sum(x * y, axis=1) / (x.norm(2, axis=1) * y.norm(2, axis=1)))

        #dat, _ = theano.scan(fn=to_vect, sequences=[df, mk, pd]) # bs x mw x td
        #ndat, _ = theano.scan(fn=to_vect_tmp, sequences=[ndf, nmk, npd]) # bs x mw x td
        weights, _ = theano.scan(fn=to_weight, sequences=[df, mk, pd,
                                                          pr])  # bs x mw x ms
        hid_inp = dwe[df, :]  # bs x mw x ms x td
        dat = T.sum(weights.dimshuffle(0, 1, 2, 'x') * hid_inp,
                    axis=2)  # bs x mw x td '''
        inp = dat.astype(theano.config.floatX)
        def_emb = T.sum(T.dot(inp, L) * msk, axis=1)  # bs x hd
        #neg_inp = ndat.astype(theano.config.floatX)
        #def_emb = get_sentence(inp, msk) # bs x hd

        #neg_def_emb = get_sentence(neg_inp, neg_msk)

        #w_cost = T.sum((def_emb - dw) ** 2)
        #w_neg_cost = T.sum((def_emb - ndw) ** 2)
        if self.hinge_cost:
            def_emb = T.dot(def_emb, L1)
            w_cost = -cosim(def_emb, dw)
            rep = nwi.shape[0] / wi.shape[
                0]  # b/c there are more negative samples than pos.
            de = T.extra_ops.repeat(def_emb, rep, axis=0)
            w_neg_cost = -cosim(de, ndw)
            cost = T.mean(T.maximum(0,
                                    0.01 + w_cost - w_neg_cost))  # hingeloss
        else:
            regress = T.dot(T.nnet.sigmoid(T.dot(def_emb, L1) + B),
                            L2) + B2  # bs x td
            cost = T.mean(
                (regress - dw)**
                2) + 0.01 * T.sum(abs(L2))  # only regularize the last

        if self.reg_alpha:
            cost += 0.1 * T.sum(abs(weights))
        #w_cost = get_word_probs(def_emb, wi, L1) #dwe.T) # dwe instead of L1
        #w_neg_cost = get_word_probs(def_emb, nwi, L1) #dwe.T) # dwe instead of L1

        #c_cost = -get_context_probs(def_emb, pe, L0) # negative of the likelihood
        #c_neg_cost = -get_context_probs(def_emb, npe, L0)

        #all_params = [self.params[k] for k in self.params if k != 'dwe' and not k.startswith('L')]
        all_params = [self.params[k] for k in self.params if k != 'dwe']
        #L_params = [L0]
        '''Copy of the same function in Lasagne (with minor changes)'''

        def apply_nesterov_momentum(ups, mom, shape=None):
            """Return a copy of ``ups`` with Nesterov momentum folded in.

            ``ups`` maps each parameter to its plain update expression. The
            input mapping is not modified: a new OrderedDict is built that
            holds the momentum-corrected parameter updates plus one extra
            entry per freshly created velocity shared variable, so callers
            must use the returned mapping.

            ``shape`` optionally lists the velocity shape for every key of
            ``ups`` (in iteration order); when omitted the shapes are read
            from the parameters' current values via ``get_value``.
            """
            targets = ups.keys()
            result = OrderedDict(ups)
            if shape is not None:
                shapes = shape
            else:
                shapes = [t.get_value(borrow=True).shape for t in targets]

            for target, vel_shape in zip(targets, shapes):
                # One zero-initialised velocity buffer per updated parameter.
                zeros = np.zeros(vel_shape, dtype=theano.config.floatX)
                velocity = theano.shared(zeros,
                                         broadcastable=target.broadcastable)
                step = mom * velocity + result[target] - target
                result[velocity] = step
                result[target] = mom * step + result[target]
            return result

        dwe_params = [dw, ndw]
        if self.do_sgd:
            grads = T.grad(cost, all_params)
            updates = OrderedDict()
            for (p, g) in zip(all_params, grads):
                updates[p] = p - lr * g
            apply_nesterov_momentum(updates, mom=0.9)
            if self.no_alt or not self.do_fixedpoint:
                dgrads = T.grad(cost, dwe_params)
                dwe_update = OrderedDict()
                for (p, g) in zip(dwe_params, dgrads):
                    dwe_update[p] = p - lr * g
                    foo = lr * g
                apply_nesterov_momentum(dwe_update,
                                        mom=0.9,
                                        shape=[(bs, td), (bs, td)])
        else:
            updates = adadelta(cost, all_params, learning_rate=lr)
            #L_update = adadelta(cost, L_params, learning_rate = lr)
            if self.no_alt or not self.do_fixedpoint:
                dwe_update = adadelta(cost, dwe_params, learning_rate=lr)

        if not self.no_alt and self.do_fixedpoint:  # because no alternating training means optimization
            if self.do_rw:
                #posword = self.base[wi] + 0.3 * def_emb #0.3 * ((1 - self.lam) * def_emb + self.lam * dw)
                idf = self.idf[wi].dimshuffle(
                    0, 1, 'x')  # bs x mw x 1  (dat is bs x mw x hd)
                rw_term = T.sum(dat * idf, axis=1)  # bs x hd
                disc_fact = 0.9
                if self.init_dwe:
                    #posword = disc_fact * rw_term # + self.base[wi] # truerw
                    posword = (
                        1 - lam
                    ) * dw + lam * disc_fact * rw_term  # + self.base[wi] # truerw
                else:
                    base = self.lam * def_emb + (1 - self.lam) * dw
                    posword = base + disc_fact * rw_term
                word_update = T.set_subtensor(
                    dw, posword.astype(theano.config.floatX))
                dwe_update = {dwe: word_update}
                dwe_ret = T.max(T.abs_(posword -
                                       dw))  # max-norm of the increment
            else:
                posword = (1 - self.lam) * def_emb + self.lam * dw
                word_update = T.set_subtensor(dw, posword - self.lam * ndw)
                dwe_update = {dwe: word_update}
                dwe_ret = word_update
        else:  #elif not self.do_fixedpoint or self.no_alt:
            word_update = dwe_update[dw]
            word_update = T.set_subtensor(dw, word_update)
            nword_update = dwe_update[ndw]
            word_update = T.set_subtensor(word_update[nwi, :], nword_update)
            dwe_update = {dwe: word_update}  #T.set_subtensor(dw, word_update)
            if self.no_alt:
                updates.update({dwe: word_update})
            dwe_ret = word_update
            #updates.update({dwe: dwe_update[dwe]}) #word_update})
        #updates.update({dwe: word_update})

        self.train_step = theano.function([wi, nwi, lr], [cost, weights],
                                          updates=updates)
        if not self.no_alt:
            self.dwe_train_step = theano.function([wi, nwi, lam],
                                                  [cost, dwe_ret, weights],
                                                  updates=dwe_update)
Ejemplo n.º 19
0
def main():
    """Train the end-to-end fusion network described by a config file.

    Reads the CLI options and the config file, loads the image and DCT data
    with ``load_mat_file``, restores the two pickled encoders, builds the
    network through ``adenet_v3.create_model``, trains it with adadelta and
    prints/plots the validation and test results. When
    ``options['write_results']`` is set, one CSV line
    (fusiontype, test CR, best validation loss) is appended to that file.

    Raises:
        ValueError: if ``load_finetune`` or ``load_finetune_diff`` is
            disabled in the config; both encoders are required by
            ``create_model`` and this function has no other way to obtain
            them.
    """
    configure_theano()
    options = parse_options()
    config_file = options['config']
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('CLI options: {}'.format(options.items()))

    print('Reading Config File: {}...'.format(config_file))
    print(config.items('data'))
    print(config.items('models'))
    print(config.items('training'))

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    dct_data = load_mat_file(config.get('data', 'dct'))
    ae_finetuned = config.get('models', 'finetuned')
    ae_finetuned_diff = config.get('models', 'finetuned_diff')
    fusiontype = config.get('models', 'fusiontype')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    load_finetune = config.getboolean('training', 'load_finetune')
    load_finetune_diff = config.getboolean('training', 'load_finetune_diff')

    # Both encoders are passed to create_model below. Fail early with a clear
    # message instead of hitting an unbound local after all the preprocessing.
    if not (load_finetune and load_finetune_diff):
        raise ValueError(
            "both 'load_finetune' and 'load_finetune_diff' must be enabled "
            "in the [training] section: the model cannot be built without "
            "the two finetuned encoders")

    train_vidlens = data['trVideoLengthVec'].astype('int').reshape((-1, ))
    val_vidlens = data['valVideoLengthVec'].astype('int').reshape((-1, ))
    test_vidlens = data['testVideoLengthVec'].astype('int').reshape((-1, ))
    train_X = data['trData'].astype('float32')
    val_X = data['valData'].astype('float32')
    test_X = data['testData'].astype('float32')
    train_dct = dct_data['trDctFeatures'].astype('float32')
    val_dct = dct_data['valDctFeatures'].astype('float32')
    test_dct = dct_data['testDctFeatures'].astype('float32')
    train_X_diff = compute_diff_images(train_X, train_vidlens)
    val_X_diff = compute_diff_images(val_X, val_vidlens)
    test_X_diff = compute_diff_images(test_X, test_vidlens)
    train_y = data['trTargetsVec'].astype('int').reshape(
        (-1, )) + 1  # +1 to handle the -1 introduced in lstm_gendata
    val_y = data['valTargetsVec'].astype('int').reshape((-1, )) + 1
    test_y = data['testTargetsVec'].astype('int').reshape((-1, )) + 1

    # featurewise normalize dct features (validation/test reuse the
    # statistics computed on the training set)
    train_dct, dct_mean, dct_std = featurewise_normalize_sequence(train_dct)
    val_dct = (val_dct - dct_mean) / dct_std
    test_dct = (test_dct - dct_mean) / dct_std

    # NOTE: pickle.load can execute arbitrary code; the paths come from the
    # config file, so only point it at trusted model files.
    print('loading finetuned encoder: {}...'.format(ae_finetuned))
    with open(ae_finetuned, 'rb') as model_file:
        ae = pickle.load(model_file)
    ae.initialize()

    print('loading finetuned encoder: {}...'.format(ae_finetuned_diff))
    with open(ae_finetuned_diff, 'rb') as model_file:
        ae_diff = pickle.load(model_file)
    ae_diff.initialize()

    # IMPT: the encoder was trained with fortan ordered images, so to visualize
    # convert all the images to C order using reshape_images_order()
    # output = dbn.predict(test_X)
    # test_X = reshape_images_order(test_X, (26, 44))
    # output = reshape_images_order(output, (26, 44))
    # visualize_reconstruction(test_X[:36, :], output[:36, :], shape=(26, 44))

    window = T.iscalar('theta')
    dct = T.tensor3('dct', dtype='float32')
    inputs = T.tensor3('inputs', dtype='float32')
    inputs_diff = T.tensor3('inputs_diff', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    # The learning rate lives in a shared variable so it can be decayed
    # between epochs without recompiling the training function.
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX),
                       name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing end to end model...')
    network, l_fuse = adenet_v3.create_model(ae, ae_diff, (None, None, 1500),
                                             inputs, (None, None), mask,
                                             (None, None, 90), dct,
                                             (None, None, 1500), inputs_diff,
                                             250, window, 10, fusiontype)

    print_network(network)
    # draw_to_file(las.layers.get_all_layers(network), 'network.png')
    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(
        predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)
    # updates = adagrad(cost, all_params, learning_rate=lr)

    train = theano.function([inputs, targets, mask, dct, inputs_diff, window],
                            cost,
                            updates=updates,
                            allow_input_downcast=True)
    compute_train_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost,
        allow_input_downcast=True)

    # Evaluation graphs use the deterministic forward pass.
    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(
        las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        test_cost,
        allow_input_downcast=True)

    val_fn = theano.function([inputs, mask, dct, inputs_diff, window],
                             test_predictions,
                             allow_input_downcast=True)

    # Train for at most NUM_EPOCHS epochs of EPOCH_SIZE minibatches each
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 30
    EPOCH_SIZE = 45
    BATCH_SIZE = 20
    WINDOW_SIZE = 9
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE, ))
    best_val = float('inf')
    best_cr = 0.0

    datagen = gen_lstm_batch_random(train_X,
                                    train_y,
                                    train_vidlens,
                                    batchsize=BATCH_SIZE)
    integral_lens = compute_integral_len(train_vidlens)

    val_datagen = gen_lstm_batch_random(val_X,
                                        val_y,
                                        val_vidlens,
                                        batchsize=len(val_vidlens))
    test_datagen = gen_lstm_batch_random(test_X,
                                         test_y,
                                         test_vidlens,
                                         batchsize=len(test_vidlens))

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val, idxs_val = next(val_datagen)
    integral_lens_val = compute_integral_len(val_vidlens)
    dct_val = gen_seq_batch_from_idx(val_dct, idxs_val, val_vidlens,
                                     integral_lens_val, np.max(val_vidlens))
    X_diff_val = gen_seq_batch_from_idx(val_X_diff, idxs_val, val_vidlens,
                                        integral_lens_val, np.max(val_vidlens))

    # we use the test set to check final classification rate
    X_test, y_test, mask_test, idxs_test = next(test_datagen)
    integral_lens_test = compute_integral_len(test_vidlens)
    dct_test = gen_seq_batch_from_idx(test_dct, idxs_test, test_vidlens,
                                      integral_lens_test, np.max(test_vidlens))
    X_diff_test = gen_seq_batch_from_idx(test_X_diff, idxs_test, test_vidlens,
                                         integral_lens_test,
                                         np.max(test_vidlens))

    def early_stop(cost_window):
        """Return True when the costs in the window are strictly increasing."""
        if len(cost_window) < 2:
            return False
        else:
            curr = cost_window[0]
            for idx, cost in enumerate(cost_window):
                if curr < cost or idx == 0:
                    curr = cost
                else:
                    return False
            return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, batch_idxs = next(datagen)
            d = gen_seq_batch_from_idx(train_dct, batch_idxs, train_vidlens,
                                       integral_lens, np.max(train_vidlens))
            X_diff = gen_seq_batch_from_idx(train_X_diff, batch_idxs,
                                            train_vidlens, integral_lens,
                                            np.max(train_vidlens))
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m, d, X_diff, WINDOW_SIZE)
            print('\r', end='')
        # The training cost is measured on the last minibatch of the epoch.
        cost = compute_train_cost(X, y, m, d, X_diff, WINDOW_SIZE)
        val_cost = compute_test_cost(X_val, y_val, mask_val, dct_val,
                                     X_diff_val, WINDOW_SIZE)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # gl: generalization loss relative to the best validation cost so far;
        # pk: progress over the last STRIP_SIZE training costs; pq = gl / pk.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) /
                     (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, dct_val,
                                      X_diff_val, WINDOW_SIZE, val_fn)
        class_rate.append(cr)

        if val_cost < best_val:
            # New best validation cost: record it and score the test set with
            # the current weights.
            best_val = val_cost
            best_cr = cr
            if fusiontype == 'adasum':
                adascale_param = las.layers.get_all_param_values(
                    l_fuse, scaling_param=True)
            test_cr, test_conf = evaluate_model(X_test, y_test, mask_test,
                                                dct_test, X_diff_test,
                                                WINDOW_SIZE, val_fn)
            print(
                "Epoch {} train cost = {}, val cost = {}, "
                "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f}, Test CR= {:.3f} ({:.1f}sec)"
                .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                        test_cr,
                        time.time() - time_start))
        else:
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f} ({:.1f}sec)".
                  format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                         time.time() - time_start))

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch + 1 >= decay_start:
            lr.set_value(lr.get_value() * lr_decay)

    numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

    print('Final Model')
    print('CR: {}, val loss: {}, Test CR: {}'.format(best_cr, best_val,
                                                     test_cr))
    if fusiontype == 'adasum':
        print("final scaling params: {}".format(adascale_param))
    print('confusion matrix: ')
    plot_confusion_matrix(test_conf, numbers, fmt='latex')
    plot_validation_cost(cost_train, cost_val, savefilename='valid_cost')

    if options['write_results']:
        results_file = options['write_results']
        with open(results_file, mode='a') as f:
            f.write('{},{},{}\n'.format(fusiontype, test_cr, best_val))
Ejemplo n.º 20
0
def construct_lstm(input_size, lstm_size, output_size, train_data_gen, val_data_gen):
    """Build, train and evaluate a two-layer bidirectional LSTM classifier.

    The network stacks two bidirectional LSTM layers (forward and backward
    outputs are summed) with dropout in between, keeps the last time step
    and feeds it to a softmax layer. It is trained with adadelta on
    categorical cross-entropy; per-epoch costs and the classification rate on
    a fixed validation batch are printed and finally plotted.

    Args:
        input_size: feature dimension of each time step.
        lstm_size: number of hidden/cell units of every LSTM layer.
        output_size: number of units of the softmax output layer.
        train_data_gen: iterator; each ``next()`` must yield the 3-tuple
            (inputs, targets, mask) for one training minibatch.
        val_data_gen: iterator; a single ``next()`` must yield the 3-tuple
            (inputs, targets, mask) used as the fixed validation batch.
    """
    # All gates have initializers for the input-to-gate and hidden state-to-gate
    # weight matrices, the cell-to-gate weight vector, the bias vector, and the nonlinearity.
    # The convention is that gates use the standard sigmoid nonlinearity,
    # which is the default for the Gate class.
    gate_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    # prepare the input layers
    # By setting the first and second dimensions to None, we allow
    # arbitrary minibatch sizes with arbitrary sequence lengths.
    # The number of feature dimensions is given by input_size.
    l_in = InputLayer(shape=(None, None, input_size))
    # This input will be used to provide the network with masks.
    # Masks are expected to be matrices of shape (n_batch, n_time_steps);
    # both of these dimensions are variable for us so we will use
    # an input shape of (None, None)
    l_mask = InputLayer(shape=(None, None))

    # Every LSTM layer has lstm_size hidden/cell units
    N_HIDDEN = lstm_size
    l_lstm = LSTMLayer(
        l_in, N_HIDDEN,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5.)

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back = LSTMLayer(
        l_in, N_HIDDEN, ingate=gate_parameters,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True)
    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum = ElemwiseSumLayer([l_lstm, l_lstm_back])

    # implement drop-out regularization
    l_dropout = DropoutLayer(l_sum)

    l_lstm2 = LSTMLayer(
        l_dropout, N_HIDDEN,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5.)

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back2 = LSTMLayer(
        l_dropout, N_HIDDEN, ingate=gate_parameters,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True)

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm_back2])

    # The summed LSTM output has dimension (batch_size, SEQ_LENGTH, N_HIDDEN)
    # Since we are only interested in the final prediction, we isolate that quantity and feed it to the next layer.
    # The output of the sliced layer will then be of size (batch_size, N_HIDDEN)
    l_forward_slice = SliceLayer(l_sum2, -1, 1)

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so the output layer has one unit per class.
    l_out = DenseLayer(
        l_forward_slice, num_units=output_size, nonlinearity=las.nonlinearities.softmax)

    # Symbolic variable for the target network output.
    # It will be of shape n_batch, because there's only 1 target value per sequence.
    target_values = T.ivector('target_output')

    # Training-time output: dropout is active (stochastic forward pass).
    network_output = las.layers.get_output(l_out)
    # Evaluation-time output: deterministic=True disables dropout so that
    # validation cost and predictions are not perturbed by random masking.
    eval_output = las.layers.get_output(l_out, deterministic=True)

    # Our cost will be categorical cross entropy error
    cost = T.mean(las.objectives.categorical_crossentropy(network_output, target_values))
    eval_cost = T.mean(las.objectives.categorical_crossentropy(eval_output, target_values))
    # Retrieve all trainable parameters from the network
    all_params = las.layers.get_all_params(l_out, trainable=True)
    # Compute adadelta updates for training
    # updates = las.updates.adam(cost, all_params)
    updates = adadelta(cost, all_params)
    # Theano functions for training and computing cost
    train = theano.function(
        [l_in.input_var, target_values, l_mask.input_var],
        cost, updates=updates, allow_input_downcast=True)

    # Training cost is reported on the stochastic graph, i.e. the objective
    # that is actually being optimised.
    compute_cost = theano.function(
        [l_in.input_var, target_values, l_mask.input_var], cost, allow_input_downcast=True)

    compute_val_cost = theano.function(
        [l_in.input_var, target_values, l_mask.input_var], eval_cost, allow_input_downcast=True)

    probs = theano.function([l_in.input_var, l_mask.input_var], eval_output, allow_input_downcast=True)

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = next(val_data_gen)

    # Train for NUM_EPOCHS epochs of EPOCH_SIZE minibatches each
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 20
    EPOCH_SIZE = 26
    for epoch in range(NUM_EPOCHS):
        for _ in range(EPOCH_SIZE):
            X, y, m = next(train_data_gen)
            train(X, y, m)
        # The training cost is measured on the last minibatch of the epoch.
        cost_train.append(compute_cost(X, y, m))
        cost_val.append(compute_val_cost(X_val, y_val, mask_val))
        cr, _ = evaluate_model(X_val, y_val, mask_val, probs)
        class_rate.append(cr)

        # one good value to early stop using GL technique, alpha = 0.10 (10% worst)
        gl = cost_val[-1] / np.min(cost_val) - 1
        # PQ, GL / Pk(t) where Pk(t) = 1000 * (sum(training strip error) / k * min(training strip error) - 1

        print("Epoch {} train cost = {}, validation cost = {}, generalization loss = {}, classification rate = {}"
              .format(epoch + 1, cost_train[-1], cost_val[-1], gl, cr))
    cr, conf = evaluate_model(X_val, y_val, mask_val, probs)
    print('Final Model')
    print('classification rate: {}'.format(cr))
    print('confusion matrix: ')
    plot_confusion_matrix(conf, fmt='grid')
    plot_validation_cost(cost_train, cost_val, class_rate)
Ejemplo n.º 21
0
def construct_lstm(input_size, lstm_size, output_size, train_data_gen,
                   val_data_gen):
    """Build a one-layer LSTM classifier, train it with adadelta and report the best epoch.

    The network is input -> masked LSTM -> last time step -> softmax. After
    every epoch the training cost (last minibatch), the validation cost and
    the validation classification rate are recorded and printed; training
    stops early when the validation costs in the window are strictly
    increasing. The results of the epoch with the lowest validation cost are
    printed and plotted at the end.

    Args:
        input_size: feature dimension of each time step.
        lstm_size: number of hidden/cell units of the LSTM layer.
        output_size: number of units of the softmax output layer.
        train_data_gen: iterator; each ``next()`` must yield a 4-tuple whose
            first three items are used as (inputs, targets, mask).
        val_data_gen: iterator; a single ``next()`` must yield the 3-tuple
            (inputs, targets, mask) used as the fixed validation batch.
    """
    # Gate configuration shared by the input, forget and output gates; the
    # Gate default nonlinearity is kept.
    sigmoid_gate = Gate(W_in=las.init.Orthogonal(),
                        W_hid=las.init.Orthogonal(),
                        b=las.init.Constant(0.))
    # Cell configuration: no cell-to-gate connection (W_cell=None) and tanh.
    tanh_cell = Gate(W_in=las.init.Orthogonal(),
                     W_hid=las.init.Orthogonal(),
                     W_cell=None,
                     b=las.init.Constant(0.),
                     nonlinearity=tanh)

    # Leaving the first two dimensions as None allows arbitrary minibatch
    # sizes and sequence lengths; only the feature dimension is fixed.
    l_in = InputLayer(shape=(None, None, input_size), name='input')
    # The mask input is a (n_batch, n_time_steps) matrix, both variable.
    l_mask = InputLayer(shape=(None, None), name='mask')

    # Single LSTM layer with lstm_size units, learned initial state and
    # gradient clipping.
    l_lstm = LSTMLayer(l_in,
                       lstm_size,
                       mask_input=l_mask,
                       ingate=sigmoid_gate,
                       forgetgate=sigmoid_gate,
                       cell=tanh_cell,
                       outgate=sigmoid_gate,
                       learn_init=True,
                       grad_clipping=5.,
                       name='lstm1')
    '''
    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back = LSTMLayer(
        l_in, N_HIDDEN, ingate=gate_parameters,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True)
    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum = ElemwiseSumLayer([l_lstm, l_lstm_back])

    # implement drop-out regularization
    l_dropout = DropoutLayer(l_sum)

    l_lstm2 = LSTMLayer(
        l_dropout, N_HIDDEN,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5.)

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back2 = LSTMLayer(
        l_dropout, N_HIDDEN, ingate=gate_parameters,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True)

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm_back2])
    '''
    # Keep only the last time step: (batch, time, hidden) -> (batch, hidden).
    l_last_step = SliceLayer(l_lstm, -1, 1, name='slice')

    # Softmax classifier over the output_size classes.
    l_out = DenseLayer(l_last_step,
                       num_units=output_size,
                       nonlinearity=las.nonlinearities.softmax,
                       name='output')

    print_network(l_out)
    # draw_to_file(las.layers.get_all_layers(l_out), 'network.png')

    # One integer target per sequence.
    target_values = T.ivector('target_output')

    # Symbolic mask variable (not wired into any compiled function below).
    mask = T.matrix('mask')

    # Training graph: cross-entropy on the default forward pass, optimised
    # with adadelta over the trainable parameters.
    prediction = las.layers.get_output(l_out)
    cost = T.mean(
        las.objectives.categorical_crossentropy(prediction, target_values))
    all_params = las.layers.get_all_params(l_out, trainable=True)
    # updates = las.updates.adam(cost, all_params)
    updates = adadelta(cost, all_params)

    supervised_inputs = (l_in.input_var, target_values, l_mask.input_var)
    train = theano.function(list(supervised_inputs),
                            cost,
                            updates=updates,
                            allow_input_downcast=True)
    compute_train_cost = theano.function(list(supervised_inputs),
                                         cost,
                                         allow_input_downcast=True)

    # Evaluation graph: deterministic forward pass.
    test_prediction = las.layers.get_output(l_out, deterministic=True)
    test_cost = T.mean(
        las.objectives.categorical_crossentropy(test_prediction,
                                                target_values))
    compute_val_cost = theano.function(list(supervised_inputs),
                                       test_cost,
                                       allow_input_downcast=True)
    val_fn = theano.function([l_in.input_var, l_mask.input_var],
                             test_prediction,
                             allow_input_downcast=True)

    # A single fixed batch serves as the validation set.
    X_val, y_val, mask_val = next(val_data_gen)

    # Training schedule and bookkeeping.
    NUM_EPOCHS = 30
    EPOCH_SIZE = 26
    STRIP_SIZE = 3
    MAX_LOSS = 0.05  # not referenced below
    VALIDATION_WINDOW = 4
    cost_train, cost_val, class_rate = [], [], []
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE, ))

    def early_stop(cost_window):
        """Return True when the window holds >= 2 strictly increasing costs."""
        if len(cost_window) < 2:
            return False
        costs = iter(cost_window)
        previous = next(costs)
        for current in costs:
            if not previous < current:
                return False
            previous = current
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for _step in range(EPOCH_SIZE):
            X, y, m, _extra = next(train_data_gen)
            train(X, y, m)

        # Costs after this epoch; the training cost uses the last minibatch.
        train_cost = compute_train_cost(X, y, m)
        val_cost = compute_val_cost(X_val, y_val, mask_val)
        cr, conf = evaluate_model(X_val, y_val, mask_val, val_fn)
        cost_train.append(train_cost)
        cost_val.append(val_cost)
        class_rate.append(cr)
        train_strip[epoch % STRIP_SIZE] = train_cost
        val_window.push(val_cost)

        # gl: generalization loss w.r.t. the best validation cost so far;
        # pk: progress over the last STRIP_SIZE training costs; pq = gl / pk.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        strip_floor = STRIP_SIZE * np.min(train_strip)
        pk = 1000 * (np.sum(train_strip) / strip_floor - 1)
        pq = gl / pk

        elapsed = time.time() - time_start
        print(
            "Epoch {} train cost = {}, validation cost = {}, "
            "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)"
            .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                    elapsed))

        if val_cost < best_val:
            best_val, best_cr, best_conf = val_cost, cr, conf

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

    letters = [
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
    ]

    print('Final Model')
    print('classification rate: {}'.format(best_cr))
    print('validation loss: {}'.format(best_val))
    print('confusion matrix: ')
    plot_confusion_matrix(best_conf, letters, fmt='grid')
    plot_validation_cost(cost_train, cost_val, class_rate)
Ejemplo n.º 22
0
    def _prepare(self, X, y, X_valid=None, y_valid=None, sample_weight=None,
                 whole_dataset_in_device=True):
        """Build the network, its loss and the compiled training callables.

        Parameters
        ----------
        X : array
            Training inputs; ``X.shape[1]`` sets the input layer width.
        y : array
            Training targets. They are label-encoded first when
            ``self.is_classification`` is True.
        X_valid, y_valid : array, optional
            Validation data. When both are given, the validation loss (and,
            for classification, accuracy) is recorded every epoch.
        sample_weight : array, optional
            Per-sample weights multiplied into the loss.
        whole_dataset_in_device : bool
            When True the training data is placed in Theano shared variables
            and batches are selected by index; otherwise every batch is
            sliced from ``X``/``y`` on the host and passed in.

        Returns
        -------
        tuple
            ``(iter_update, quitter, monitor, observer)``: ``iter_update``
            runs one epoch and returns its statistics, ``quitter`` implements
            the patience-based early stopping, and the last two are no-ops.

        Raises
        ------
        ValueError
            If ``self.optimization_method`` is not a supported name.
        """

        self._stats = []
        self._class_label_encoder = LabelEncoder()
        # Default to the raw validation targets so that the per-epoch
        # validation loss is also defined for regression, where no label
        # encoding takes place.
        y_valid_transformed = y_valid
        if self.is_classification is True:
            self._class_label_encoder.fit(y)
            self.classes_ = self._class_label_encoder.classes_
            y = self._class_label_encoder.transform(y).astype(y.dtype)
            self.y_train_transformed = y
            if y_valid is not None:
                y_valid_transformed = self._class_label_encoder.transform(
                    y_valid).astype(y_valid.dtype)

        self._l_x_in = layers.InputLayer(shape=(None, X.shape[1]))
        batch_index, X_batch, y_batch, batch_slice = get_theano_batch_variables(
            self.batch_size, y_softmax=self.is_classification)

        # Without explicit weights a scalar placeholder is used; it is
        # replaced by the constant 1 when the training function is compiled.
        if sample_weight is not None:
            t_sample_weight = T.vector('sample_weight')
            sample_weight = sample_weight.astype(theano.config.floatX)
        else:
            t_sample_weight = T.scalar('sample_weight')

        if self.is_classification is True:
            y_dim = len(set(y.flatten().tolist()))
        else:
            y_dim = y.shape[1]

        self._prediction_layer = self._build_model(y_dim)
        self._layers = layers.get_all_layers(self._prediction_layer)
        self._build_prediction_functions(X_batch, self._prediction_layer)

        if self.input_noise_function is None:
            output = layers.get_output(self._prediction_layer, X_batch)

        else:
            X_batch_noisy = self.input_noise_function(X_batch)
            output = layers.get_output(self._prediction_layer, X_batch_noisy)

        if self.is_classification:
            # weighted negative log-likelihood of the true class
            loss = -T.mean(t_sample_weight * T.log(output)
                           [T.arange(y_batch.shape[0]), y_batch])
        else:
            # weighted sum of squared errors per sample
            loss = T.mean(
                t_sample_weight * T.sum((output - y_batch) ** 2, axis=1))

        # kept separately so that reported losses exclude the penalties
        loss_unreg = loss

        all_params = layers.get_all_params(self._prediction_layer)
        if self._output_softener_coefs is not None:
            all_params.append(self._output_softener_coefs)

        # The penalties must be built from the symbolic shared variables:
        # get_all_param_values() returns numpy copies, which would turn the
        # regularization terms into constants with no gradient.
        W_params = layers.get_all_params(
            self._prediction_layer, regularizable=True)

        # regularization
        if self.L1_factor is not None:
            for L1_factor_layer, W in zip(self.L1_factor, W_params):
                loss = loss + L1_factor_layer * T.sum(abs(W))

        if self.L2_factor is not None:
            for L2_factor_layer, W in zip(self.L2_factor, W_params):
                loss = loss + L2_factor_layer * T.sum(W**2)

        if self.optimization_method == 'nesterov_momentum':
            gradient_updates = updates.nesterov_momentum(loss, all_params, learning_rate=self.learning_rate,
                                                         momentum=self.momentum)
        elif self.optimization_method == 'adadelta':
            # don't need momentum there
            gradient_updates = updates.adadelta(
                loss, all_params, learning_rate=self.learning_rate)
        elif self.optimization_method == 'adam':
            gradient_updates = updates.adam(
                loss, all_params, learning_rate=self.learning_rate)
        elif self.optimization_method == 'momentum':
            gradient_updates = updates.momentum(
                loss, all_params, learning_rate=self.learning_rate,
                momentum=self.momentum
            )
        elif self.optimization_method == 'adagrad':
            gradient_updates = updates.adagrad(
                loss, all_params, learning_rate=self.learning_rate)
        elif self.optimization_method == 'rmsprop':
            gradient_updates = updates.rmsprop(
                loss, all_params, learning_rate=self.learning_rate)
        elif self.optimization_method == 'sgd':
            gradient_updates = updates.sgd(
                loss, all_params, learning_rate=self.learning_rate,
            )
        else:
            raise ValueError("wrong optimization method: {0!r}".format(
                self.optimization_method))

        # number of mini-batches per epoch, counting a trailing partial batch
        nb_batches = X.shape[0] // self.batch_size
        if (X.shape[0] % self.batch_size) != 0:
            nb_batches += 1

        X = X.astype(theano.config.floatX)
        if self.is_classification == True:
            y = y.astype(np.int32)
        else:
            y = y.astype(theano.config.floatX)

        if whole_dataset_in_device == True:
            X_shared = theano.shared(X, borrow=True)
            y_shared = theano.shared(y, borrow=True)

            givens = {
                X_batch: X_shared[batch_slice],
                y_batch: y_shared[batch_slice]
            }

            if sample_weight is not None:
                sample_weight_shared = theano.shared(
                    sample_weight, borrow=True)
                givens[t_sample_weight] = sample_weight_shared[batch_slice]
            else:
                givens[t_sample_weight] = T.as_tensor_variable(
                    np.array(1., dtype=theano.config.floatX))

            iter_update_batch = theano.function(
                [batch_index], loss,
                updates=gradient_updates,
                givens=givens,

            )
        else:
            if sample_weight is None:
                iter_update_gradients = theano.function(
                    [X_batch, y_batch],
                    loss,
                    updates=gradient_updates,
                    givens={t_sample_weight: T.as_tensor_variable(
                        np.array(1., dtype=theano.config.floatX))},

                )

                def iter_update_batch(batch_index):
                    sl = slice(batch_index * self.batch_size,
                               (batch_index + 1) * self.batch_size)
                    return iter_update_gradients(X[sl], y[sl])

            else:
                iter_update_gradients = theano.function(
                    [X_batch, y_batch, t_sample_weight],
                    loss,
                    updates=gradient_updates
                )

                def iter_update_batch(batch_index):
                    sl = slice(batch_index * self.batch_size,
                               (batch_index + 1) * self.batch_size)
                    return iter_update_gradients(X[sl], y[sl], sample_weight[sl])
        self._iter_update_batch = iter_update_batch
        # NOTE(review): callers below pass the scalar 1. as the weight; when
        # sample_weight was given t_sample_weight is a vector - confirm that
        # combination is accepted.
        self._get_loss = theano.function(
            [X_batch, y_batch, t_sample_weight], loss_unreg, allow_input_downcast=True)

        def iter_update(epoch):
            """Run one pass over all batches and return the epoch statistics."""
            losses = []
            #self.learning_rate.set_value(self.learning_rate.get_value() * np.array(0.99, dtype=theano.config.floatX))
            for i in range(nb_batches):
                losses.append(self._iter_update_batch(i))
                # max norm
                # NOTE(review): the per-layer value `max_norm_layer` is never
                # used (the whole self.max_norm is passed instead) and the
                # result is assigned to the layer attribute rather than to
                # the compiled updates - confirm the intended behaviour.
                if self.max_norm is not None:
                    for max_norm_layer, layer in zip(self.max_norm, self._layers):
                        layer.W = updates.norm_constraint(
                            layer.W, self.max_norm)

            losses = np.array(losses)

            d = OrderedDict()
            d["epoch"] = epoch
            #d["loss_train_std"] = losses.std()

            #d["loss_train"] = losses.mean()
            d["loss_train"] = self._get_loss(
                self.X_train, self.y_train_transformed, 1.)

            d["accuracy_train"] = (
                self.predict(self.X_train) == self.y_train).mean()

            if X_valid is not None and y_valid is not None:
                d["loss_valid"] = self._get_loss(
                    X_valid, y_valid_transformed, 1.)

                if self.is_classification == True:
                    d["accuracy_valid"] = (
                        self.predict(X_valid) == y_valid).mean()

            if self.verbose > 0:
                if (epoch % self.report_each) == 0:
                    print(tabulate([d], headers="keys"))
            self._stats.append(d)
            return d

        def quitter(update_status):
            """Return True once the monitored statistic stopped improving.

            When patience runs out, the best recorded model state is restored
            and the statistics are truncated to the best epoch.
            """
            cur_epoch = len(self._stats) - 1
            if self.patience_nb_epochs > 0:
                # patience heuristic (for early stopping)
                cur_patience_stat = update_status[self.patience_stat]

                if self.cur_best_patience_stat is None:
                    self.cur_best_patience_stat = cur_patience_stat
                    first_time = True
                else:
                    first_time = False

                thresh = self.patience_progression_rate_threshold
                if cur_patience_stat < self.cur_best_patience_stat * thresh or first_time:

                    if self.verbose >= 2:
                        fmt = "--Early stopping-- good we have a new best value : {0}={1}, last best : epoch {2}, value={3}"
                        print(fmt.format(self.patience_stat, cur_patience_stat,
                                         self.cur_best_epoch, self.cur_best_patience_stat))
                    self.cur_best_epoch = cur_epoch
                    self.cur_best_patience_stat = cur_patience_stat
                    # snapshot the current best model
                    if hasattr(self, "set_state") and hasattr(self, "get_state"):
                        self.cur_best_model = self.get_state()
                    else:
                        self.cur_best_model = pickle.dumps(
                            self.__dict__, protocol=pickle.HIGHEST_PROTOCOL)
                if (cur_epoch - self.cur_best_epoch) >= self.patience_nb_epochs:
                    finish = True
                    # roll back to the snapshot taken at the best epoch
                    if hasattr(self, "set_state") and hasattr(self, "get_state"):
                        self.set_state(self.cur_best_model)
                    else:
                        self.__dict__.update(pickle.loads(self.cur_best_model))

                    self._stats = self._stats[0:self.cur_best_epoch + 1]
                    if self.verbose >= 2:
                        print("out of patience...take the model at epoch {0} and quit".format(
                            self.cur_best_epoch + 1))
                else:
                    finish = False
                return finish
            else:
                return False

        def monitor(update_status):
            pass

        def observer(monitor_output):
            pass

        return (iter_update, quitter, monitor, observer)
def event_span_classifier(args, input_var, input_mask_var, target_var, wordEmbeddings, seqlen):
    """Build a bidirectional-LSTM classifier and compile its Theano functions.

    :param args: options object; ``hiddenDim``, ``optimizer`` and ``step``
        are read from it and ``lstmDim`` is overwritten with 150
    :param input_var: symbolic variable fed to the input layer
    :param input_mask_var: symbolic variable fed to the mask layer
    :param target_var: symbolic target variable used by the loss and accuracy
    :param wordEmbeddings: initial embeddings laid out as
        ``(wordDim, vocab_size)``; the transpose initialises the embedding layer
    :param seqlen: sequence length used in the input layer shape
    :return: ``(train_fn, val_fn, network)`` where ``train_fn`` returns the
        regularised loss and applies the updates, and ``val_fn`` returns
        ``[test_loss, test_acc]``
    :raises ValueError: if ``args.optimizer`` is not one of ``sgd``,
        ``adagrad``, ``adadelta``, ``nesterov``, ``rms`` or ``adam``
    """

    print("Building model with LSTM")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    GRAD_CLIP = wordDim

    args.lstmDim = 150

    l_in = InputLayer((None, seqlen), input_var=input_var)
    # NOTE(review): from here on `seqlen` is the symbolic length taken from
    # the input variable, not the value passed in - confirm this is intended.
    _, seqlen = l_in.input_var.shape
    l_mask = InputLayer((None, seqlen), input_var=input_mask_var)

    emb = EmbeddingLayer(l_in, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb_1.W].remove('trainable')

    lstm = LSTMLayer(emb, num_units=args.lstmDim, mask_input=l_mask, grad_clipping=GRAD_CLIP,
        nonlinearity=tanh)

    lstm_back = LSTMLayer(
        emb, num_units=args.lstmDim, mask_input=l_mask, grad_clipping=GRAD_CLIP,
        nonlinearity=tanh, backwards=True)

    slice_forward = SliceLayer(lstm, indices=-1, axis=1) # out_shape (None, args.lstmDim)
    slice_backward = SliceLayer(lstm_back, indices=0, axis=1) # out_shape (None, args.lstmDim)

    concat = ConcatLayer([slice_forward, slice_backward])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    # L2 penalty with the same coefficient on each listed layer
    layer_penalties = {emb: lambda_val, lstm: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layer_penalties, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Raising a plain string is itself a TypeError; raise a real
        # exception so the caller sees the intended message.
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, input_mask_var, target_var],
        loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))

    val_fn = theano.function([input_var, input_mask_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
Ejemplo n.º 24
0
    layer1_input = T.concatenate(layer1_inputs,1)
    input_dims = feature_maps*len(filter_hs)

    regess = RegressionNeuralNetwork(rng, input=layer1_input,n_in=input_dims,n_hidden=100,n_out=1,activation=[Sigmoid,Sigmoid])

    mse = regess.entropy(Y)

    L2 = sum([conv_layer.L2 for conv_layer in conv_layers]) + regess.L2

    cost = mse + L2

    params = regess.params
    for conv_layer in conv_layers:
        params+=conv_layer.params

    updates = adadelta(cost,params)

    train_model = theano.function([X,Y],[mse, cost],updates=updates)
    valid_model = theano.function([X,Y],[mse, cost])

    showfunction = theano.function(inputs=[X],outputs=regess.regressionlayer.y_pred)

    patience = 0
    best_valid_mse_global = 100
    early_stop = 20
    epoch_i = 0

    train_rand_idxs = list(range(0,X_train.shape[0]))
    valid_rand_idxs = list(range(0,X_valid.shape[0]))

    while patience < early_stop:
Ejemplo n.º 25
0
    def __init__(self,
                 rng,
                 n_in,
                 n_per_base,
                 n_out,
                 n_layer=1,
                 basefuncs1=None,
                 basefuncs2=None,
                 gradient=None,
                 with_shortcuts=False):
        """Initialize the parameters for the multilayer function graph

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_layer: int
        :param n_layer: number of hidden layers

        :type n_per_base: int
        :param n_per_base: number of nodes per basis function see FGLayer

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        :type basefuncs1: [int]
        :param basefuncs1: see FGLayer

        :type basefuncs2: [int]
        :param basefuncs2: see FGLayer

        :type gradient: string
        :param gradient: type of gradient descent algo: None, "sgd" or "sgd+"
        (plain step with the scaled gradient clipped to [-1, 1]), "adam",
        "adagrad", "adadelta", "rmsprop" or "nag"

        :type with_shortcuts: bool
        :param with_shortcuts: whether to use shortcut connections (output is connected to all units)

        :raises ValueError: if ``gradient`` is not one of the names above

        """
        self.input = T.matrix('input')  # the data is presented as vector input
        self.labels = T.matrix(
            'labels')  # the labels are presented as vector of continuous values
        self.rng = rng
        self.n_layers = n_layer
        self.hidden_layers = []
        self.params = []
        self.n_in = n_in
        self.n_out = n_out
        self.with_shortcuts = with_shortcuts
        self.fixL0 = False

        # Stack the hidden layers: the first reads the network input, each
        # later one reads the output of its predecessor.
        for l in range(n_layer):
            if l == 0:
                layer_input = self.input
                n_input = n_in
            else:
                layer_input = self.hidden_layers[l - 1].output
                n_input = self.hidden_layers[l - 1].n_out

            hiddenLayer = FGLayer(
                rng=rng,
                inp=layer_input,
                n_in=n_input,
                n_per_base=n_per_base,
                basefuncs1=basefuncs1,
                basefuncs2=basefuncs2,
                layer_idx=l,
            )
            self.hidden_layers.append(hiddenLayer)
            self.params.extend(hiddenLayer.params)

        div_thresh = T.scalar("div_thresh")

        # The linear output layer, either it gets as input the output of ALL previous layers
        if self.with_shortcuts:
            output_layer_inp = T.concatenate(
                [l.output for l in reversed(self.hidden_layers)], axis=1)
            output_layer_n_in = sum([l.n_out for l in self.hidden_layers])
        else:  # or just of the last hidden layer
            output_layer_inp = self.hidden_layers[-1].output
            output_layer_n_in = self.hidden_layers[-1].n_out
        self.output_layer = DivisionRegression(rng=rng,
                                               inp=output_layer_inp,
                                               n_in=output_layer_n_in,
                                               n_out=n_out,
                                               div_thresh=div_thresh)

        self.params.extend(self.output_layer.params)

        self.evalfun = theano.function(
            inputs=[self.input, In(div_thresh, value=0.0001)],
            outputs=self.output_layer.output)

        L1_reg = T.scalar('L1_reg')
        L2_reg = T.scalar('L2_reg')
        fixL0 = T.bscalar('fixL0')
        self.L1 = self.output_layer.L1 + sum(
            [l.L1 for l in self.hidden_layers])
        self.L2_sqr = self.output_layer.L2_sqr + sum(
            [l.L2_sqr for l in self.hidden_layers])
        self.penalty = self.output_layer.penalty

        self.loss = self.output_layer.loss
        self.errors = self.loss
        self.cost = (self.loss(self.labels) + L1_reg * self.L1 +
                     L2_reg * self.L2_sqr + self.penalty)

        #Extrapol penalty
        self.extrapol_cost = self.output_layer.extrapol_loss

        learning_rate = T.scalar('learning_rate')

        def process_updates(par, newp):
            """Pin near-zero entries of parameters named "W" when fixL0 is set."""
            # print par.name
            if par.name == "W":
                # if fixL0 is True, then keep small weights at 0
                return par, ifelse(
                    fixL0, T.switch(T.abs_(par) < 0.001, par * 0, newp), newp)
            return par, newp

        # Single %-formatted argument so the output is the same under the
        # Python 2 print statement and the Python 3 print function.
        print("Gradient: %s" % (gradient,))
        update = None
        if gradient == 'sgd+' or gradient == 'sgd' or gradient is None:
            gparams = [T.grad(self.cost, param) for param in self.params]
            update = OrderedDict([
                (param, param - (learning_rate * gparam).clip(-1.0, 1.0))
                for param, gparam in zip(self.params, gparams)
            ])
        elif gradient == 'adam':
            update = Lupdates.adam(self.cost,
                                   self.params,
                                   learning_rate,
                                   epsilon=1e-04)
        elif gradient == 'adagrad':
            update = Lupdates.adagrad(self.cost, self.params, learning_rate)
        elif gradient == 'adadelta':
            update = Lupdates.adadelta(self.cost, self.params, learning_rate)
        elif gradient == 'rmsprop':
            update = Lupdates.rmsprop(self.cost, self.params, learning_rate)
        elif gradient == 'nag':
            update = Lupdates.nesterov_momentum(self.cost, self.params,
                                                learning_rate)
        else:
            # An assert on a non-empty string can never fail; reject the
            # unknown name explicitly instead of failing later on None.
            raise ValueError("unknown gradient {0!r}".format(gradient))

        #Extrapol sanity gradient computation:

        extrapol_updates = Lupdates.adam(self.extrapol_cost,
                                         self.params,
                                         learning_rate,
                                         epsilon=1e-04)

        updates = [process_updates(*up) for up in update.items()]
        self.train_model = theano.function(
            inputs=[
                self.input, self.labels, L1_reg, L2_reg, fixL0, learning_rate,
                div_thresh
            ],
            outputs=self.cost,
            updates=updates,
        )
        # avoid too large outputs in extrapolation domain
        self.remove_extrapol_error = theano.function(
            inputs=[self.input, learning_rate, div_thresh],
            outputs=self.extrapol_cost,
            updates=extrapol_updates,
        )

        self.test_model = theano.function(
            inputs=[self.input, self.labels,
                    In(div_thresh, value=0.0001)],
            outputs=self.errors(self.labels),
        )
        self.validate_model = theano.function(
            inputs=[self.input, self.labels,
                    In(div_thresh, value=0.0001)],
            outputs=self.errors(self.labels),
        )
        self.L1_loss = theano.function(
            inputs=[],
            outputs=self.L1,
        )
        self.MSE = theano.function(
            inputs=[self.input, self.labels,
                    In(div_thresh, value=0.0001)],
            outputs=self.errors(self.labels),
        )
Ejemplo n.º 26
0
    def __init__(self, hidden_size=100, nclasses=73, num_embeddings=11359, embedding_dim=100, window_size=1,
                 memory_size=40, n_memory_slots=8, go_code=1, depth=2, load_dir=None):
        """Create the shared parameters and compile learn/test/predict.

        :param hidden_size: width of the hidden state ``h``
        :param nclasses: width of the softmax output ``y``
        :param num_embeddings: input size passed to the embedding layer
        :param embedding_dim: embedding width, also the GRU unit count
        :param window_size: stored on the instance and used in the shapes of
               ``Wg_a``, ``Wg_t`` and ``Wx``
        :param memory_size: number of rows of each memory matrix
        :param n_memory_slots: total memory slots; half (rounded down) go to
               the article memory, the rest to the title memory
        :param go_code: index that seeds every instance in the test scan
        :param depth: number of ``gru<l>`` parameters that are created
        :param load_dir: optional directory holding ``params.pkl``; its
               content is merged into the instance ``__dict__``
        """

        articles, titles = T.imatrices('articles', 'titles')
        n_article_slots = int(n_memory_slots / 2)  # TODO derive this from an arg
        n_title_slots = n_memory_slots - n_article_slots
        n_instances = articles.shape[0]

        self.window_size = window_size

        randoms = {
            # attr: shape
            # 'emb': (num_embeddings + 1, embedding_dim),
            'M_a': (memory_size, n_article_slots),
            'M_t': (memory_size, n_title_slots),
            'w_a': (n_article_slots,),
            'w_t': (n_title_slots,),
            'Wg_a': (window_size * embedding_dim, n_article_slots),
            'Wg_t': (window_size * embedding_dim, n_title_slots),
            'Wk': (hidden_size, memory_size),
            'Wb': (hidden_size, 1),
            'Wv': (hidden_size, memory_size),
            'We_a': (hidden_size, n_article_slots),
            'We_t': (hidden_size, n_title_slots),
            'Wx': (window_size * embedding_dim, hidden_size),
            'Wh': (memory_size, hidden_size),
            'W': (hidden_size, nclasses),
            'h0': hidden_size
        }

        zeros = {
            # attr: shape
            'bg_a': n_article_slots,
            'bg_t': n_title_slots,
            'bk': memory_size,
            'bb': 1,
            'bv': memory_size,
            'be_a': n_article_slots,
            'be_t': n_title_slots,
            'bh': hidden_size,
            'b': nclasses,
        }

        for l in range(depth):
            randoms['gru' + str(l)] = (1, embedding_dim)

        def random_shared(name):
            """Shared variable drawn from a normal scaled by 0.2."""
            shape = randoms[name]
            return theano.shared(
                0.2 * np.random.normal(size=shape).astype(theano.config.floatX),
                name=name)

        def zeros_shared(name):
            """Shared variable filled with zeros."""
            shape = zeros[name]
            return theano.shared(np.zeros(shape, dtype=theano.config.floatX), name=name)

        for key in randoms:
            # create an attribute with associated shape and random values
            setattr(self, key, random_shared(key))

        for key in zeros:
            # create an attribute with associated shape and values equal to 0
            setattr(self, key, zeros_shared(key))

        # Materialise the key views as lists so that the concatenation and
        # the later .remove() calls work on both Python 2 and Python 3.
        self.names = list(randoms.keys()) + list(zeros.keys())
        # self.names.remove('emb')  # no need to save or update embeddings
        scan_vars = 'h0 w_a M_a w_t M_t'.split()

        def repeat_for_each_instance(param):
            """ repeat param along new axis once for each instance """
            return T.repeat(T.shape_padleft(param), repeats=n_instances, axis=0)

        for key in scan_vars:
            setattr(self, key, repeat_for_each_instance(getattr(self, key)))
            self.names.remove(key)

        if load_dir is not None:
            # NOTE: unpickling can execute arbitrary code; only point
            # load_dir at files from a trusted source.
            # Binary mode is required for pickle data on Python 3.
            with open(os.path.join(load_dir, 'params.pkl'), 'rb') as handle:
                params = pickle.load(handle)
                self.__dict__.update(params)

        def recurrence(i,
                       h_tm1,
                       w_a,
                       M_a,
                       *args,
                       **kwargs):
            """
            notes
            Headers from paper in all caps
            mem = n_article slots if is_article else n_title_slots

            :param i: center index of sliding window
            :param h_tm1: h_{t-1} (hidden state)
            :param w_a: attention weights for article memory
            :param M_a: article memory
            :param args: gru_weights, maybe w_t, maybe M_t
                   gru_weights: weights with which to initialize GRULayer on each time step
                   w_t: attention weights for titles memory
                   M_t: titles memory
            :param kwargs: is_training, is_article
                   is_training:
                   is_article: we use different parts of memory when working with a article
            :return: [y = model outputs,
                      y_max = argmax of y per instance,
                      next index (i + 1, or y_max when neither training
                      nor reading an article),
                      h, w_a, M_a] followed by [w_t, M_t] when not
                      is_article
            """
            is_training = kwargs['is_training']
            is_article = kwargs['is_article']
            gru_weights = args[:depth]
            # Default to None so that the assertion below, rather than an
            # unbound-name error, reports missing title memory.
            w_t = M_t = None
            if len(args) > depth:
                w_t = args[depth]
                M_t = args[depth + 1]

            i_type = T.iscalar if is_article or is_training else T.ivector
            assert i.type == i_type

            if not is_article:
                assert w_t is not None and M_t is not None

            word_idxs = i
            if is_article or is_training:
                # get representation of word window
                document = articles if is_article else titles  # [instances, bucket_width]
                word_idxs = document[:, i:i+1]  # [instances, 1]
            # x_i = self.emb[word_idxs].flatten(ndim=2)  # [instances, embedding_dim]

            # NOTE(review): these layers are constructed inside the scan step
            # function; confirm their parameters are the ones returned by
            # self.params().
            input = InputLayer(shape=(None, 1),
                               input_var=word_idxs)
            embed = EmbeddingLayer(input, num_embeddings, embedding_dim)
            gru = GRULayer(incoming=embed, num_units=embedding_dim, hid_init=self.gru0)
            for weight in gru_weights:
                gru = GRULayer(incoming=gru, num_units=embedding_dim,
                               hid_init=weight)
            x_i = get_output(gru).flatten(ndim=2)
            x_i = Print('x_i')(x_i)  # [instances, embedding_dim]

            if is_article:
                M_read = M_a  # [instances, memory_size, n_article_slots]
                w_read = w_a  # [instances, n_article_slots]
            else:
                M_read = T.concatenate([M_a, M_t], axis=2)  # [instances, memory_size, n_memory_slots]
                w_read = T.concatenate([w_a, w_t], axis=1)  # [instances, n_memory_slots]

            # eqn 15
            c = T.batched_dot(M_read, w_read)  # [instances, memory_size]

            # EXTERNAL MEMORY READ
            def get_attention(Wg, bg, M, w):
                g = T.nnet.sigmoid(T.dot(x_i, Wg) + bg)  # [instances, mem]

                # eqn 11
                k = T.dot(h_tm1, self.Wk) + self.bk  # [instances, memory_size]

                # eqn 13
                beta = T.dot(h_tm1, self.Wb) + self.bb
                beta = T.nnet.softplus(beta)
                beta = T.addbroadcast(beta, 1)  # [instances, 1]

                # eqn 12
                w_hat = T.nnet.softmax(beta * cosine_dist(M, k))

                # eqn 14
                return (1 - g) * w + g * w_hat  # [instances, mem]

            w_a = get_attention(self.Wg_a, self.bg_a, M_a, w_a)  # [instances, n_article_slots]
            if not is_article:
                w_t = get_attention(self.Wg_t, self.bg_t, M_t, w_t)  # [instances, n_title_slots]

            # MODEL INPUT AND OUTPUT
            # eqn 9
            h = T.dot(c, self.Wh) + T.dot(x_i, self.Wx) + self.bh  # [instances, hidden_size]

            # eqn 10
            y = T.nnet.softmax(T.dot(h, self.W) + self.b)  # [instances, nclasses]

            # EXTERNAL MEMORY UPDATE
            def update_memory(We, be, w_update, M_update):
                # eqn 17
                e = T.nnet.sigmoid(T.dot(h_tm1, We) + be)  # [instances, mem]
                f = 1. - w_update * e  # [instances, mem]

                # eqn 16
                v = T.tanh(T.dot(h, self.Wv) + self.bv)  # [instances, memory_size]

                # need to add broadcast layers for memory update
                f = f.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                u = w_update.dimshuffle(0, 'x', 1)  # [instances, 1, mem]
                v = v.dimshuffle(0, 1, 'x')  # [instances, memory_size, 1]

                # eqn 19
                return M_update * f + T.batched_dot(v, u) * (1 - f)  # [instances, memory_size, mem]

            M_a = update_memory(self.We_a, self.be_a, w_a, M_a)
            attention_and_memory = [w_a, M_a]
            if not is_article:
                M_t = update_memory(self.We_t, self.be_t, w_t, M_t)
                attention_and_memory += [w_t, M_t]

            y_max = y.argmax(axis=1).astype(int32)
            next_idxs = i + 1 if is_training or is_article else y_max
            return [y, y_max, next_idxs, h] + attention_and_memory

        read_article = partial(recurrence, is_training=True, is_article=True)
        # for read_article, it actually doesn't matter whether is_training is true

        i0 = T.constant(0, dtype=int32, name='first_value_of_i')
        # plain attribute lookup instead of eval() on a built string
        gru_weights = [getattr(self, 'gru' + str(l)) for l in range(depth)]
        # NOTE(review): outputs_info holds 6 + depth entries while recurrence
        # returns 6 values for articles (8 for titles) - confirm that scan
        # accepts this pairing.
        outputs_info = [None, None, i0, self.h0, self.w_a, self.M_a] + gru_weights

        [_, _, _, h, w, M], _ = theano.scan(fn=read_article,
                                            outputs_info=outputs_info,
                                            n_steps=articles.shape[1],
                                            name='read_scan')

        produce_title = partial(recurrence, is_training=True, is_article=False)
        # continue from the final state of the article pass
        outputs_info[3:6] = [param[-1, :, :] for param in (h, w, M)]
        outputs_info.extend([self.w_t, self.M_t])
        bucket_width = titles.shape[1] - 1  # subtract 1 because <go> is omitted in y_true
        [y, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title,
                                                      outputs_info=outputs_info,
                                                      n_steps=bucket_width,
                                                      name='train_scan')

        # loss and updates
        y_clip = T.clip(y, .01, .99)
        y_flatten = y_clip.dimshuffle(2, 1, 0).flatten(ndim=2).T
        y_true = titles[:, 1:].ravel()  # [:, 1:] in order to omit <go>
        counts = T.extra_ops.bincount(y_true, assert_nonneg=True)
        # inverse-frequency weights; targets equal to 0 get weight 0
        weights = 1.0 / (counts[y_true] + 1) * T.neq(y_true, 0)
        losses = T.nnet.categorical_crossentropy(y_flatten, y_true)
        loss = objectives.aggregate(losses, weights, mode='sum')
        updates = adadelta(loss, self.params())

        self.learn = theano.function(inputs=[articles, titles],
                                     outputs=[y_max.T, loss],
                                     updates=updates,
                                     allow_input_downcast=True,
                                     name='learn')

        produce_title_test = partial(recurrence, is_training=False, is_article=False)

        # NOTE(review): self.test is compiled from the train_scan output,
        # before test_scan below rebinds y_max - confirm this is intended.
        self.test = theano.function(inputs=[articles, titles],
                                    outputs=[y_max.T],
                                    on_unused_input='ignore')

        outputs_info[2] = T.zeros([n_instances], dtype=int32) + go_code
        [_, y_max, _, _, _, _, _, _], _ = theano.scan(fn=produce_title_test,
                                                      outputs_info=outputs_info,
                                                      n_steps=bucket_width,
                                                      name='test_scan')

        self.predict = theano.function(inputs=[articles, titles],
                                       outputs=y_max.T,
                                       name='infer')
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):
    """Build a 1D-convolution classifier and compile its Theano functions.

    Layer stack: embedding lookup -> reshape to
    ``(batch, seqlen, num_feats * wordDim)`` -> Conv1D (tanh) -> dimshuffle
    ``(0, 2, 1)`` -> 1D max-pool -> dense (sigmoid) -> dense softmax with
    2 units. The training loss is the mean binary cross-entropy of the
    network output plus a weighted L2 penalty.

    Args:
        args: options object; ``hiddenDim``, ``optimizer`` and ``step`` are
            read from it.
        input_var: Theano variable bound to the input layer.
        target_var: Theano variable holding the targets.
        wordEmbeddings: embedding matrix. ``shape[0]`` is used as the
            embedding dimension and ``shape[1]`` as the vocabulary size; its
            transpose initialises the embedding layer.
        seqlen: second dimension of the input layer shape.
        num_feats: third dimension of the input layer shape.

    Returns:
        Tuple ``(train_fn, val_fn, network)``: ``train_fn`` applies one
        update step and returns the regularised loss, ``val_fn`` returns
        ``[loss, accuracy]`` computed deterministically, ``network`` is the
        output layer.

    Raises:
        ValueError: if ``args.optimizer`` is not one of ``sgd``, ``adagrad``,
            ``adadelta``, ``nesterov``, ``rms`` or ``adam``.
    """
    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1

    input_layer = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input_layer.input_var.shape
    emb = EmbeddingLayer(input_layer, input_size=vocab_size,
                         output_size=wordDim, W=wordEmbeddings.T)

    # Merge the per-feature embeddings of each position into one vector.
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim,
                         stride=1, nonlinearity=tanh, W=GlorotUniform())

    # Swap the last two axes before pooling.
    conv1d = DimshuffleLayer(conv1d, (0, 2, 1))

    maxpool = MaxPool1DLayer(conv1d, pool_size=num_filters)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    # NOTE(review): ``conv1d`` refers to the DimshuffleLayer at this point, so
    # the convolution's own weights may not be covered by this penalty --
    # confirm whether that is intended.
    layers = {emb: lambda_val, conv1d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Raising a plain string is itself a TypeError; raise a real exception
        # that names the offending value instead.
        raise ValueError(
            "Need set optimizer correctly, got {!r}".format(args.optimizer))

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var],
                               loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn, network
Ejemplo n.º 28
0
    regess.set_params(save_params[:4])
    for i in xrange(len(filter_hs)):
        print(4 + i * 2)
        print(4 + i * 2 + 2)
        conv_layers[i].set_params(save_params[4 + i * 2:4 + i * 2 + 2])

    mse = regess.mse(Y)

    L2 = sum([conv_layer.L2 for conv_layer in conv_layers]) + regess.L2

    cost = mse + L2

    params = regess.params
    for conv_layer in conv_layers:
        params += conv_layer.params

    updates = adadelta(cost, params)

    train_model = theano.function([X, Y], [mse, cost], updates=updates)
    valid_model = theano.function([X, Y], [mse, cost])

    showfunction = theano.function(inputs=[X],
                                   outputs=regess.hiddenlayer.output)

    X_mnb = X_valid[:batch_size]
    Y_mnb = Y_valid_rouge2[:batch_size]
    print(X_mnb.shape, X_mnb.dtype, Y_mnb, Y_mnb.dtype)
    pred = showfunction(X_mnb)
    print pred
    print Y_valid_rouge2[:batch_size]
Ejemplo n.º 29
0
def main():
    """Train and evaluate the end-to-end model with one subject held out.

    Settings are read from ``config/leave_one_out.ini``. Image and DCT
    features are loaded, the subject given by ``options['test_subj']`` is
    held out as the test/validation set, the network is built with
    ``adenet_v5.create_model`` and trained with adadelta using a decaying
    learning rate. The classification rate, validation loss and confusion
    matrix of the epoch with the lowest validation cost are reported. When
    ``options`` contains ``'results'``, a summary line is appended to that
    file.

    Raises:
        ValueError: if ``save_finetune`` is set without an encoder to save,
            or if the configuration does not yield both the image encoder
            and the diff-image encoder.
    """
    configure_theano()
    options = parse_options()
    config_file = 'config/leave_one_out.ini'
    print('loading config file: {}'.format(config_file))
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    dct_data = load_mat_file(config.get('data', 'dct'))
    ae_pretrained = config.get('models', 'pretrained')
    ae_finetuned = config.get('models', 'finetuned')
    ae_finetuned_diff = config.get('models', 'finetuned_diff')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    do_finetune = config.getboolean('training', 'do_finetune')
    save_finetune = config.getboolean('training', 'save_finetune')
    load_finetune = config.getboolean('training', 'load_finetune')
    load_finetune_diff = config.getboolean('training', 'load_finetune_diff')

    # 53 subjects, 70 utterances, 5 view angles
    # s[x]_v[y]_u[z].mp4
    # resized, height, width = (26, 44)
    # ['dataMatrix', 'targetH', 'targetsPerVideoVec', 'videoLengthVec', '__header__', 'targetsVec',
    # '__globals__', 'iterVec', 'filenamesVec', 'dataMatrixCells', 'subjectsVec', 'targetW', '__version__']

    print(data.keys())
    X = data['dataMatrix'].astype('float32')
    y = data['targetsVec'].astype('int32')
    y = y.reshape((len(y), ))
    dct_feats = dct_data['dctFeatures'].astype('float32')
    uniques = np.unique(y)
    print('number of classifications: {}'.format(len(uniques)))
    subjects = data['subjectsVec'].astype('int')
    subjects = subjects.reshape((len(subjects), ))
    video_lens = data['videoLengthVec'].astype('int')
    video_lens = video_lens.reshape((len(video_lens), ))

    # X = reorder_data(X, (26, 44), 'f', 'c')
    # print('performing sequencewise mean image removal...')
    # X = sequencewise_mean_image_subtraction(X, video_lens)
    # visualize_images(X[550:650], (26, 44))
    X_diff = compute_diff_images(X, video_lens)

    # mean remove dct features
    dct_feats = sequencewise_mean_image_subtraction(dct_feats, video_lens)

    test_subject_ids = [options['test_subj']]
    # Materialise the range so .remove() works even where range() is lazy.
    train_subject_ids = list(range(1, 54))
    for subj in test_subject_ids:
        train_subject_ids.remove(subj)

    write_results = 'results' in options
    if write_results:
        results_file = options['results']
        # Open (and immediately close) the results file so an unwritable path
        # fails before training starts; the summary is appended at the end.
        with open(results_file, mode='a'):
            pass

    print(train_subject_ids)
    print(test_subject_ids)
    train_X, train_y, train_dct, train_X_diff, train_vidlens, train_subjects, \
    test_X, test_y, test_dct, test_X_diff, test_vidlens, test_subjects = \
        split_data(X, y, dct_feats, X_diff, subjects, video_lens, train_subject_ids, test_subject_ids)

    assert train_X.shape[0] + test_X.shape[0] == len(X)
    assert train_y.shape[0] + test_y.shape[0] == len(y)
    assert train_vidlens.shape[0] + test_vidlens.shape[0] == len(video_lens)
    assert train_subjects.shape[0] + test_subjects.shape[0] == len(subjects)

    train_X = normalize_input(train_X, centralize=True)
    test_X = normalize_input(test_X, centralize=True)

    # featurewise normalize dct features (test set reuses the training statistics)
    train_dct, dct_mean, dct_std = featurewise_normalize_sequence(train_dct)
    test_dct = (test_dct - dct_mean) / dct_std

    # Both encoders are produced by optional config-driven steps; track them
    # explicitly so a missing one is reported clearly below.
    ae = None
    ae_diff = None

    if do_finetune:
        print('performing finetuning on pretrained encoder: {}'.format(
            ae_pretrained))
        ae = load_dbn(ae_pretrained)
        ae.initialize()
        ae.fit(train_X, train_X)

    if save_finetune:
        if ae is None:
            raise ValueError(
                'save_finetune is set but no encoder was finetuned '
                '(enable do_finetune)')
        print('saving finetuned encoder: {}...'.format(ae_finetuned))
        with open(ae_finetuned, 'wb') as model_file:
            pickle.dump(ae, model_file)

    if load_finetune:
        print('loading finetuned encoder: {}...'.format(ae_finetuned))
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load model files from a trusted source.
        with open(ae_finetuned, 'rb') as model_file:
            ae = pickle.load(model_file)
        ae.initialize()

    if load_finetune_diff:
        print('loading finetuned encoder: {}...'.format(ae_finetuned_diff))
        with open(ae_finetuned_diff, 'rb') as model_file:
            ae_diff = pickle.load(model_file)
        ae_diff.initialize()

    if ae is None or ae_diff is None:
        raise ValueError(
            'both encoders are required: enable do_finetune or load_finetune '
            'for the image encoder, and load_finetune_diff for the diff encoder')

    # IMPT: the encoder was trained with fortan ordered images, so to visualize
    # convert all the images to C order using reshape_images_order()
    # output = dbn.predict(test_X)
    # test_X = reshape_images_order(test_X, (26, 44))
    # output = reshape_images_order(output, (26, 44))
    # visualize_reconstruction(test_X[:36, :], output[:36, :], shape=(26, 44))

    window = T.iscalar('theta')
    dct = T.tensor3('dct', dtype='float32')
    inputs = T.tensor3('inputs', dtype='float32')
    inputs_diff = T.tensor3('inputs_diff', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    # The learning rate is a shared variable so it can be decayed between epochs.
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX),
                       name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing end to end model...')
    # Earlier single-stream variant, kept for reference:
    # network = create_end_to_end_model(dbn, (None, None, 1144), inputs,
    #                                   (None, None), mask, 250, window)

    network = adenet_v5.create_model(ae, ae_diff, (None, None, 1144), inputs,
                                     (None, None), mask, (None, None, 90), dct,
                                     (None, None, 1144), inputs_diff, 250,
                                     window, 10)

    print_network(network)
    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(
        predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)
    # updates = adagrad(cost, all_params, learning_rate=lr)

    use_max_constraint = False
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                updates[param] = norm_constraint(
                    param,
                    MAX_NORM *
                    las.utils.compute_norms(param.get_value()).mean())

    train = theano.function([inputs, targets, mask, dct, inputs_diff, window],
                            cost,
                            updates=updates,
                            allow_input_downcast=True)
    compute_train_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost,
        allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(
        las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        test_cost,
        allow_input_downcast=True)

    val_fn = theano.function([inputs, mask, dct, inputs_diff, window],
                             test_predictions,
                             allow_input_downcast=True)

    # Train for NUM_EPOCHS epochs of EPOCH_SIZE minibatches each.
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 10
    EPOCH_SIZE = 120
    BATCH_SIZE = 10
    WINDOW_SIZE = 9
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE, ))
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0

    datagen = gen_lstm_batch_random(train_X,
                                    train_y,
                                    train_vidlens,
                                    batchsize=BATCH_SIZE)
    val_datagen = gen_lstm_batch_random(test_X,
                                        test_y,
                                        test_vidlens,
                                        batchsize=len(test_vidlens))
    integral_lens = compute_integral_len(train_vidlens)

    # We'll use this "validation set" (the held-out subject) to periodically
    # check progress
    X_val, y_val, mask_val, idxs_val = next(val_datagen)
    integral_lens_val = compute_integral_len(test_vidlens)
    dct_val = gen_seq_batch_from_idx(test_dct, idxs_val, test_vidlens,
                                     integral_lens_val, np.max(test_vidlens))
    X_diff_val = gen_seq_batch_from_idx(test_X_diff, idxs_val,
                                        test_vidlens, integral_lens_val,
                                        np.max(test_vidlens))

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds the previous one."""
        if len(cost_window) < 2:
            return False
        else:
            curr = cost_window[0]
            for idx, cost in enumerate(cost_window):
                if curr < cost or idx == 0:
                    curr = cost
                else:
                    return False
            return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, batch_idxs = next(datagen)
            d = gen_seq_batch_from_idx(train_dct, batch_idxs, train_vidlens,
                                       integral_lens, np.max(train_vidlens))
            X_diff = gen_seq_batch_from_idx(train_X_diff, batch_idxs,
                                            train_vidlens, integral_lens,
                                            np.max(train_vidlens))
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m, d, X_diff, WINDOW_SIZE)
            print('\r', end='')
        # The training cost is measured on the last minibatch of the epoch.
        cost = compute_train_cost(X, y, m, d, X_diff, WINDOW_SIZE)
        val_cost = compute_test_cost(X_val, y_val, mask_val, dct_val,
                                     X_diff_val, WINDOW_SIZE)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # gl: relative increase of the current validation cost over the best
        # one seen so far. pk: how much the mean training cost of the strip
        # exceeds the strip minimum. train_strip starts zero-filled, so its
        # minimum can be 0 until STRIP_SIZE epochs have run and pk/pq are not
        # meaningful before then.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) /
                     (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, dct_val,
                                      X_diff_val, WINDOW_SIZE, val_fn)
        class_rate.append(cr)

        print(
            "Epoch {} train cost = {}, validation cost = {}, "
            "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)"
            .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                    time.time() - time_start))

        if val_cost < best_val:
            best_val = val_cost
            best_conf = val_conf
            best_cr = cr

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch >= decay_start - 1:
            lr.set_value(lr.get_value() * lr_decay)

    phrases = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']

    print('Final Model')
    print('classification rate: {}, validation loss: {}'.format(
        best_cr, best_val))
    print('confusion matrix: ')
    plot_confusion_matrix(best_conf, phrases, fmt='grid')
    plot_validation_cost(cost_train,
                         cost_val,
                         class_rate,
                         savefilename='valid_cost')

    if write_results:
        print('writing to results file: {}...'.format(results_file))
        with open(results_file, mode='a') as f:
            f.write('{}, {}, {}\n'.format(test_subject_ids[0], best_cr, best_val))
Ejemplo n.º 30
0
def main():
    """Train the fused model on predefined train/validation/test splits.

    The config file named by ``options['config']`` supplies data paths,
    encoder paths, the fusion type and the learning-rate schedule. The
    network is built with ``adenet_v3.create_model`` and trained with
    adadelta. Whenever the validation cost improves, the test set is
    evaluated; the most recent such test result is reported at the end and,
    if ``options['write_results']`` is set, appended to that file.

    Raises:
        ValueError: if the configuration does not load both the image
            encoder and the diff-image encoder.
    """
    configure_theano()
    options = parse_options()
    config_file = options['config']
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('CLI options: {}'.format(options.items()))

    print('Reading Config File: {}...'.format(config_file))
    print(config.items('data'))
    print(config.items('models'))
    print(config.items('training'))

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    dct_data = load_mat_file(config.get('data', 'dct'))
    ae_finetuned = config.get('models', 'finetuned')
    ae_finetuned_diff = config.get('models', 'finetuned_diff')
    fusiontype = config.get('models', 'fusiontype')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    load_finetune = config.getboolean('training', 'load_finetune')
    load_finetune_diff = config.getboolean('training', 'load_finetune_diff')

    train_vidlens = data['trVideoLengthVec'].astype('int').reshape((-1,))
    val_vidlens = data['valVideoLengthVec'].astype('int').reshape((-1,))
    test_vidlens = data['testVideoLengthVec'].astype('int').reshape((-1,))
    train_X = data['trData'].astype('float32')
    val_X = data['valData'].astype('float32')
    test_X = data['testData'].astype('float32')
    train_dct = dct_data['trDctFeatures'].astype('float32')
    val_dct = dct_data['valDctFeatures'].astype('float32')
    test_dct = dct_data['testDctFeatures'].astype('float32')
    train_X_diff = compute_diff_images(train_X, train_vidlens)
    val_X_diff = compute_diff_images(val_X, val_vidlens)
    test_X_diff = compute_diff_images(test_X, test_vidlens)
    train_y = data['trTargetsVec'].astype('int').reshape((-1,)) + 1  # +1 to handle the -1 introduced in lstm_gendata
    val_y = data['valTargetsVec'].astype('int').reshape((-1,)) + 1
    test_y = data['testTargetsVec'].astype('int').reshape((-1,)) + 1

    # featurewise normalize dct features (val/test reuse the training statistics)
    train_dct, dct_mean, dct_std = featurewise_normalize_sequence(train_dct)
    val_dct = (val_dct - dct_mean) / dct_std
    test_dct = (test_dct - dct_mean) / dct_std

    # Both encoders come from optional config-driven steps; track them
    # explicitly so a missing one is reported clearly below.
    ae = None
    ae_diff = None

    if load_finetune:
        print('loading finetuned encoder: {}...'.format(ae_finetuned))
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load model files from a trusted source.
        with open(ae_finetuned, 'rb') as model_file:
            ae = pickle.load(model_file)
        ae.initialize()

    if load_finetune_diff:
        print('loading finetuned encoder: {}...'.format(ae_finetuned_diff))
        with open(ae_finetuned_diff, 'rb') as model_file:
            ae_diff = pickle.load(model_file)
        ae_diff.initialize()

    if ae is None or ae_diff is None:
        raise ValueError(
            'both encoders are required: enable load_finetune and '
            'load_finetune_diff in the [training] section')

    # IMPT: the encoder was trained with fortan ordered images, so to visualize
    # convert all the images to C order using reshape_images_order()
    # output = dbn.predict(test_X)
    # test_X = reshape_images_order(test_X, (26, 44))
    # output = reshape_images_order(output, (26, 44))
    # visualize_reconstruction(test_X[:36, :], output[:36, :], shape=(26, 44))

    window = T.iscalar('theta')
    dct = T.tensor3('dct', dtype='float32')
    inputs = T.tensor3('inputs', dtype='float32')
    inputs_diff = T.tensor3('inputs_diff', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    # The learning rate is a shared variable so it can be decayed between epochs.
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX), name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing end to end model...')
    network, l_fuse = adenet_v3.create_model(ae, ae_diff, (None, None, 1500), inputs,
                                             (None, None), mask,
                                             (None, None, 90), dct,
                                             (None, None, 1500), inputs_diff,
                                             250, window, 10, fusiontype)

    print_network(network)
    # draw_to_file(las.layers.get_all_layers(network), 'network.png')
    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)
    # updates = adagrad(cost, all_params, learning_rate=lr)

    train = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function([inputs, targets, mask, dct, inputs_diff, window],
                                         cost, allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window], test_cost, allow_input_downcast=True)

    val_fn = theano.function([inputs, mask, dct, inputs_diff, window], test_predictions, allow_input_downcast=True)

    # Train for NUM_EPOCHS epochs of EPOCH_SIZE minibatches each.
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 30
    EPOCH_SIZE = 45
    BATCH_SIZE = 20
    WINDOW_SIZE = 9
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE,))
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0

    datagen = gen_lstm_batch_random(train_X, train_y, train_vidlens, batchsize=BATCH_SIZE)
    integral_lens = compute_integral_len(train_vidlens)

    val_datagen = gen_lstm_batch_random(val_X, val_y, val_vidlens, batchsize=len(val_vidlens))
    test_datagen = gen_lstm_batch_random(test_X, test_y, test_vidlens, batchsize=len(test_vidlens))

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val, idxs_val = next(val_datagen)
    integral_lens_val = compute_integral_len(val_vidlens)
    dct_val = gen_seq_batch_from_idx(val_dct, idxs_val, val_vidlens, integral_lens_val, np.max(val_vidlens))
    X_diff_val = gen_seq_batch_from_idx(val_X_diff, idxs_val, val_vidlens, integral_lens_val, np.max(val_vidlens))

    # we use the test set to check final classification rate
    X_test, y_test, mask_test, idxs_test = next(test_datagen)
    integral_lens_test = compute_integral_len(test_vidlens)
    dct_test = gen_seq_batch_from_idx(test_dct, idxs_test, test_vidlens, integral_lens_test, np.max(test_vidlens))
    X_diff_test = gen_seq_batch_from_idx(test_X_diff, idxs_test, test_vidlens, integral_lens_test, np.max(test_vidlens))

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds the previous one."""
        if len(cost_window) < 2:
            return False
        else:
            curr = cost_window[0]
            for idx, cost in enumerate(cost_window):
                if curr < cost or idx == 0:
                    curr = cost
                else:
                    return False
            return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, batch_idxs = next(datagen)
            d = gen_seq_batch_from_idx(train_dct, batch_idxs,
                                       train_vidlens, integral_lens, np.max(train_vidlens))
            X_diff = gen_seq_batch_from_idx(train_X_diff, batch_idxs,
                                            train_vidlens, integral_lens, np.max(train_vidlens))
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m, d, X_diff, WINDOW_SIZE)
            print('\r', end='')
        # The training cost is measured on the last minibatch of the epoch.
        cost = compute_train_cost(X, y, m, d, X_diff, WINDOW_SIZE)
        val_cost = compute_test_cost(X_val, y_val, mask_val, dct_val, X_diff_val, WINDOW_SIZE)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # gl: relative increase of the current validation cost over the best
        # one seen so far. pk: how much the mean training cost of the strip
        # exceeds the strip minimum. train_strip starts zero-filled, so its
        # minimum can be 0 until STRIP_SIZE epochs have run and pk/pq are not
        # meaningful before then.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, dct_val, X_diff_val, WINDOW_SIZE, val_fn)
        class_rate.append(cr)

        # best_val starts at +inf, so this branch runs on the first epoch with
        # a finite validation cost; it is what defines test_cr / test_conf
        # (and adascale_param for 'adasum') used after the loop.
        if val_cost < best_val:
            best_val = val_cost
            best_cr = cr
            if fusiontype == 'adasum':
                adascale_param = las.layers.get_all_param_values(l_fuse, scaling_param=True)
            test_cr, test_conf = evaluate_model(X_test, y_test, mask_test,
                                                dct_test, X_diff_test, WINDOW_SIZE, val_fn)
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f}, Test CR= {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, test_cr, time.time() - time_start))
        else:
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f} ({:.1f}sec)"
                  .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, time.time() - time_start))

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch + 1 >= decay_start:
            lr.set_value(lr.get_value() * lr_decay)

    numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

    print('Final Model')
    print('CR: {}, val loss: {}, Test CR: {}'.format(best_cr, best_val, test_cr))
    if fusiontype == 'adasum':
        print("final scaling params: {}".format(adascale_param))
    print('confusion matrix: ')
    plot_confusion_matrix(test_conf, numbers, fmt='latex')
    plot_validation_cost(cost_train, cost_val, savefilename='valid_cost')

    if options['write_results']:
        results_file = options['write_results']
        with open(results_file, mode='a') as f:
            f.write('{},{},{}\n'.format(fusiontype, test_cr, best_val))
Ejemplo n.º 31
0
def main():
    """Train a convolutional autoencoder and save the encoder and full model.

    Installs a SIGINT handler that sets the module-level ``terminate`` flag,
    builds the autoencoder variant selected by ``options['MODEL']``, trains
    it with adadelta on a mean-squared reconstruction error, decays the
    learning rate after epoch 10, then writes reconstruction and cost plots
    and saves both models under ``models/``.

    NOTE(review): ``terminate`` is read here as a module-level global and is
    presumably initialised to False at import time -- confirm.

    Raises:
        ValueError: if ``options['MODEL']`` is not one of ``'normal'``,
            ``'batchnorm'``, ``'dropout'`` or ``'bn+dropout'``.
    """
    def signal_handler(signum, frame):
        # Only request termination; the training loop polls the flag.
        global terminate
        terminate = True
        print('terminating...')

    signal.signal(signal.SIGINT, signal_handler)
    configure_theano()
    options = parse_options()
    X, X_val = generate_data()

    # X = np.reshape(X, (-1, 1, 30, 40))[:-5]
    print('X type and shape:', X.dtype, X.shape)
    print('X.min():', X.min())
    print('X.max():', X.max())

    # X_val = np.reshape(X_val, (-1, 1, 30, 40))[:-1]
    print('X_val type and shape:', X_val.dtype, X_val.shape)
    print('X_val.min():', X_val.min())
    print('X_val.max():', X_val.max())

    # we need our target to be 1 dimensional
    X_out = X.reshape((X.shape[0], -1))
    X_val_out = X_val.reshape((X_val.shape[0], -1))
    print('X_out:', X_out.dtype, X_out.shape)
    print('X_val_out', X_val_out.dtype, X_val_out.shape)

    # X_noisy = apply_gaussian_noise(X_out)
    # visualize_reconstruction(X_noisy[0:25], X_out[0:25], shape=(28, 28))
    # X = np.reshape(X_noisy, (-1, 1, 28, 28))

    print('constructing and compiling model...')
    # input_var = T.tensor4('input', dtype='float32')
    input_var = T.tensor3('input', dtype='float32')
    target_var = T.matrix('output', dtype='float32')
    # The learning rate is a shared variable so it can be decayed between epochs.
    lr = theano.shared(np.array(0.8, dtype=theano.config.floatX),
                       name='learning_rate')
    lr_decay = np.array(0.9, dtype=theano.config.floatX)

    # try building a reshaping layer
    # network = create_model(input_var, (None, 1, 30, 40), options)
    l_input = InputLayer((None, None, 1200), input_var, name='input')
    l_input = ReshapeLayer(l_input, (-1, 1, 30, 40), name='reshape_input')
    # l_input = InputLayer((None, 1, 30, 40), input_var, name='input')
    model_name = options['MODEL']
    if model_name == 'normal':
        network, encoder = avletters_convae.create_model(l_input, options)
    elif model_name == 'batchnorm':
        network, encoder = avletters_convae_bn.create_model(l_input, options)
    elif model_name == 'dropout':
        network, encoder = avletters_convae_drop.create_model(l_input, options)
    elif model_name == 'bn+dropout':
        network, encoder = avletters_convae_bndrop.create_model(
            l_input, options)
    else:
        # Without this branch an unknown name left `network` unbound and
        # failed later with an unrelated-looking error.
        raise ValueError('unknown MODEL option: {!r}'.format(model_name))

    print('AE Network architecture: {}'.format(options['MODEL']))
    print_network(network)

    recon = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(squared_error(recon, target_var))
    updates = adadelta(cost, all_params, lr)
    # updates = las.updates.apply_nesterov_momentum(updates, all_params, momentum=0.90)

    use_max_constraint = False
    print('apply max norm constraint: {}'.format(use_max_constraint))
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                # updates[param] = norm_constraint(param, MAX_NORM * las.utils.compute_norms(param.get_value()).mean())
                updates[param] = norm_constraint(param, MAX_NORM)

    train = theano.function([input_var, target_var],
                            recon,
                            updates=updates,
                            allow_input_downcast=True)
    train_cost_fn = theano.function([input_var, target_var],
                                    cost,
                                    allow_input_downcast=True)

    eval_recon = las.layers.get_output(network, deterministic=True)
    eval_cost = T.mean(las.objectives.squared_error(eval_recon, target_var))
    eval_cost_fn = theano.function([input_var, target_var],
                                   eval_cost,
                                   allow_input_downcast=True)
    recon_fn = theano.function([input_var],
                               eval_recon,
                               allow_input_downcast=True)

    # Honour a Ctrl-C received while the model was compiling.
    if terminate:
        sys.exit()

    NUM_EPOCHS = options['NUM_EPOCHS']
    EPOCH_SIZE = options['EPOCH_SIZE']
    NO_STRIDES = options['NO_STRIDES']
    VAL_NO_STRIDES = options['VAL_NO_STRIDES']

    print('begin training for {} epochs...'.format(NUM_EPOCHS))
    datagen = batch_iterator(X, X_out, 128)

    costs = []
    val_costs = []
    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            batch_X, batch_y = next(datagen)
            # float() converts the 0-d array from get_value() so the '.4f'
            # format spec applies to a plain Python float.
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(batch_X),
                float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            batch_X = batch_X.reshape((-1, 1, 1200))
            train(batch_X, batch_y)
            print('\r', end='')
            if terminate:
                break
        # Leave the epoch loop as well; plots and models are still written below.
        if terminate:
            break

        cost = batch_compute_cost(X, X_out, NO_STRIDES, train_cost_fn)
        val_cost = batch_compute_cost(X_val, X_val_out, VAL_NO_STRIDES,
                                      eval_cost_fn)
        costs.append(cost)
        val_costs.append(val_cost)

        print("Epoch {} train cost = {}, validation cost = {} ({:.1f}sec) ".
              format(epoch + 1, cost, val_cost,
                     time.time() - time_start))
        # learning rate decay
        if epoch > 10:
            lr.set_value(lr.get_value() * lr_decay)

    X_val_recon = recon_fn(X_val)
    visualize_reconstruction(X_val_out[450:550],
                             X_val_recon[450:550],
                             shape=(30, 40),
                             savefilename='avletters')
    plot_validation_cost(costs, val_costs, None, savefilename='valid_cost')

    conv2d1 = las.layers.get_all_layers(network)[2]
    visualize.plot_conv_weights(conv2d1, (15, 14)).savefig('conv2d1.png')

    print('saving encoder...')
    save_model(encoder, 'models/conv_encoder.dat')
    save_model(network, 'models/conv_ae.dat')
def build_network_2dconv(
    args, input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var, wordEmbeddings, maxlen=36
):
    """Build and compile a two-branch 2D-convolutional sentence-pair classifier.

    Each sentence goes through a frozen embedding lookup, one convolution
    spanning three tokens by the full embedding width, and a max-pool over
    the whole sequence.  The two pooled vectors are concatenated, passed
    through a sigmoid hidden layer and a softmax output layer.

    Args:
        args: Options object.  Reads ``task`` ("sts" -> 5 classes,
            "ent" -> 3 classes), ``optimizer`` (one of "sgd", "adagrad",
            "adadelta", "nesterov", "rms", "adam"), ``step`` (learning rate)
            and ``hiddenDim`` (hidden layer width).
        input1_var: Symbolic variable feeding the first sentence input layer.
        input1_mask_var: Unused; kept for interface compatibility.
        input2_var: Symbolic variable feeding the second sentence input layer.
        intut2_mask_var: Unused; kept for interface compatibility (the
            misspelling is part of the public signature).
        target_var: Symbolic variable holding the target labels.
        wordEmbeddings: Array indexed as (embedding dim, vocabulary size); it
            is transposed before being used as the embedding matrix.
        maxlen: Fixed sequence length of both inputs.

    Returns:
        ``(train_fn, val_fn)``.  Both take ``(input1, input2, target)``.
        ``train_fn`` applies one optimizer step and returns the regularised
        loss.  ``val_fn`` returns ``[loss, prediction]`` for "sts" and
        ``[loss, accuracy]`` for "ent", evaluated deterministically.

    Raises:
        ValueError: If ``args.task`` or ``args.optimizer`` is not supported.
    """
    supported_tasks = ("sts", "ent")
    supported_optimizers = ("sgd", "adagrad", "adadelta", "nesterov", "rms", "adam")

    # Fail fast, before any graph construction.  Previously a bad optimizer hit
    # `raise "<string>"` (itself a TypeError) and a bad task surfaced as a
    # NameError on `network` much further down.
    if args.task not in supported_tasks:
        raise ValueError(
            "Unsupported task {!r}; expected one of {}".format(args.task, supported_tasks)
        )
    if args.optimizer not in supported_optimizers:
        raise ValueError(
            "Need set optimizer correctly: got {!r}, expected one of {}".format(
                args.optimizer, supported_optimizers
            )
        )

    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config: the filter covers 3 tokens x the whole embedding,
    # and pooling collapses every remaining position into a single value.
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    def sentence_branch(input_layer):
        """Embed, convolve and max-pool one sentence; return (conv, flat)."""
        emb = EmbeddingLayer(input_layer, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
        # Keep the supplied word vectors frozen during training.
        emb.params[emb.W].remove("trainable")  # (batchsize, maxlen, wordDim)

        # Add a singleton channel axis so the sentence looks like an image.
        reshaped = ReshapeLayer(emb, (batchsize, 1, maxlen, wordDim))
        conv = Conv2DLayer(
            reshaped,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            nonlinearity=rectify,
            W=GlorotUniform(),
        )  # (None, 100, 34, 1) for the default maxlen=36
        pooled = MaxPool2DLayer(conv, pool_size=pool_size)  # (None, 100, 1, 1)
        return conv, FlattenLayer(pooled)  # flat: (None, 100)

    input_1 = InputLayer((None, maxlen), input_var=input1_var)
    # The symbolic batch size of the first input is reused for both branches.
    batchsize, _ = input_1.input_var.shape
    conv2d_1, forward_1 = sentence_branch(input_1)

    input_2 = InputLayer((None, maxlen), input_var=input2_var)
    _, forward_2 = sentence_branch(input_2)

    # NOTE: an ElemwiseMergeLayer (T.mul) / AbsSubLayer (T.sub) merge of the two
    # branches used to be constructed here but was immediately overwritten, so
    # plain concatenation is what has always fed the classifier.
    concat = ConcatLayer([forward_1, forward_2])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    num_classes = 5 if args.task == "sts" else 3
    network = DenseLayer(hid, num_units=num_classes, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(categorical_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    # L2 penalty on the first branch's convolution and on both dense layers.
    # NOTE(review): the second branch's convolution is not penalised -- this
    # mirrors the original behaviour; confirm whether it is intentional.
    layers = {conv2d_1: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    # An if/elif chain (rather than a dict of callables) so that only the
    # selected optimizer's name has to be importable.
    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    else:  # "adam" -- guaranteed by the validation at the top
        updates = adam(loss, params, learning_rate=args.step)

    # Deterministic pass for evaluation; the test loss omits the L2 penalty.
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input1_var, input2_var, target_var], loss, updates=updates, allow_input_downcast=True)

    if args.task == "sts":
        # Return the class distribution itself for "sts".
        second_output = test_prediction
    else:  # "ent"
        second_output = T.mean(categorical_accuracy(test_prediction, target_var))

    val_fn = theano.function(
        [input1_var, input2_var, target_var], [test_loss, second_output], allow_input_downcast=True
    )

    return train_fn, val_fn
Ejemplo n.º 33
0
def main():
    """Train and evaluate the trimodal (image / DCT / diff-image) classifier.

    Reads ``config/trimodal.ini``, loads the image and DCT feature files,
    splits the data by subject id, optionally finetunes / saves / loads the
    autoencoders, builds the end-to-end model via ``adenet_v5.create_model``,
    trains it with adadelta and reports the best validation result.

    Raises:
        RuntimeError: If the configuration flags leave one of the encoders
            unavailable at the point where it is needed.
    """
    configure_theano()
    config_file = 'config/trimodal.ini'
    print('loading config file: {}'.format(config_file))
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('Reading Config File: {}...'.format(config_file))
    print(config.items('data'))
    print(config.items('models'))
    print(config.items('training'))

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    dct_data = load_mat_file(config.get('data', 'dct'))
    ae_pretrained = config.get('models', 'pretrained')
    ae_finetuned = config.get('models', 'finetuned')
    ae_finetuned_diff = config.get('models', 'finetuned_diff')
    use_adascale = config.getboolean('models', 'use_adascale')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    do_finetune = config.getboolean('training', 'do_finetune')
    save_finetune = config.getboolean('training', 'save_finetune')
    load_finetune = config.getboolean('training', 'load_finetune')
    load_finetune_diff = config.getboolean('training', 'load_finetune_diff')

    # 53 subjects, 70 utterances, 5 view angles
    # s[x]_v[y]_u[z].mp4
    # resized, height, width = (26, 44)
    # ['dataMatrix', 'targetH', 'targetsPerVideoVec', 'videoLengthVec', '__header__', 'targetsVec',
    # '__globals__', 'iterVec', 'filenamesVec', 'dataMatrixCells', 'subjectsVec', 'targetW', '__version__']

    print(data.keys())
    X = data['dataMatrix'].astype('float32')
    y = data['targetsVec'].astype('int32')
    y = y.reshape((len(y),))
    dct_feats = dct_data['dctFeatures'].astype('float32')
    uniques = np.unique(y)
    print('number of classifications: {}'.format(len(uniques)))
    subjects = data['subjectsVec'].astype('int')
    subjects = subjects.reshape((len(subjects),))
    video_lens = data['videoLengthVec'].astype('int')
    # The original had the closing parenthesis misplaced
    # (`len(video_lens,)`), which only worked because reshape accepts an int.
    video_lens = video_lens.reshape((len(video_lens),))

    X_diff = compute_diff_images(X, video_lens)

    # mean remove dct features
    dct_feats = sequencewise_mean_image_subtraction(dct_feats, video_lens)

    train_subject_ids = read_data_split_file('data/train_val.txt')
    test_subject_ids = read_data_split_file('data/test.txt')
    print(train_subject_ids)
    print(test_subject_ids)
    train_X, train_y, train_dct, train_X_diff, train_vidlens, train_subjects, \
    test_X, test_y, test_dct, test_X_diff, test_vidlens, test_subjects = \
        split_data(X, y, dct_feats, X_diff, subjects, video_lens, train_subject_ids, test_subject_ids)

    # Sanity check: the split must neither drop nor duplicate rows.
    assert train_X.shape[0] + test_X.shape[0] == len(X)
    assert train_y.shape[0] + test_y.shape[0] == len(y)
    assert train_vidlens.shape[0] + test_vidlens.shape[0] == len(video_lens)
    assert train_subjects.shape[0] + test_subjects.shape[0] == len(subjects)

    train_X = normalize_input(train_X, centralize=True)
    test_X = normalize_input(test_X, centralize=True)

    # featurewise normalize dct features (test set uses the training statistics)
    train_dct, dct_mean, dct_std = featurewise_normalize_sequence(train_dct)
    test_dct = (test_dct - dct_mean) / dct_std

    # Both encoders start out unset so that an inconsistent flag combination
    # produces a clear error instead of an UnboundLocalError.
    ae = None
    ae_diff = None

    if do_finetune:
        print('performing finetuning on pretrained encoder: {}'.format(ae_pretrained))
        ae = load_dbn(ae_pretrained)
        ae.initialize()
        ae.fit(train_X, train_X)

    if save_finetune:
        if ae is None:
            raise RuntimeError('save_finetune is set but no encoder was finetuned (do_finetune is off)')
        print('saving finetuned encoder: {}...'.format(ae_finetuned))
        with open(ae_finetuned, 'wb') as model_file:
            pickle.dump(ae, model_file)

    # NOTE: pickle.load executes arbitrary code from the file; only point the
    # config at model files from a trusted source.
    if load_finetune:
        print('loading finetuned encoder: {}...'.format(ae_finetuned))
        with open(ae_finetuned, 'rb') as model_file:
            ae = pickle.load(model_file)
        ae.initialize()

    if load_finetune_diff:
        print('loading finetuned encoder: {}...'.format(ae_finetuned_diff))
        with open(ae_finetuned_diff, 'rb') as model_file:
            ae_diff = pickle.load(model_file)
        ae_diff.initialize()

    if ae is None or ae_diff is None:
        raise RuntimeError(
            'both encoders are required: enable do_finetune or load_finetune '
            'for the image encoder and load_finetune_diff for the diff encoder')

    # IMPT: the encoder was trained with fortran ordered images, so to visualize
    # convert all the images to C order using reshape_images_order()

    window = T.iscalar('theta')
    dct = T.tensor3('dct', dtype='float32')
    inputs = T.tensor3('inputs', dtype='float32')
    inputs_diff = T.tensor3('inputs_diff', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    # The learning rate is a shared variable so it can be decayed in place.
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX), name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing end to end model...')
    network, adascale = adenet_v5.create_model(ae, ae_diff, (None, None, 1144), inputs,
                                               (None, None), mask,
                                               (None, None, 90), dct,
                                               (None, None, 1144), inputs_diff,
                                               250, window, 10, use_adascale)

    print_network(network)
    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)
    # updates = adagrad(cost, all_params, learning_rate=lr)

    use_max_constraint = False
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                updates[param] = norm_constraint(param, MAX_NORM * las.utils.compute_norms(param.get_value()).mean())

    train = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window],
        cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function([inputs, targets, mask, dct, inputs_diff, window],
                                         cost, allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function(
        [inputs, targets, mask, dct, inputs_diff, window], test_cost, allow_input_downcast=True)

    val_fn = theano.function([inputs, mask, dct, inputs_diff, window], test_predictions, allow_input_downcast=True)

    # Train for up to NUM_EPOCHS epochs of EPOCH_SIZE minibatches each.
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 30
    EPOCH_SIZE = 120
    BATCH_SIZE = 10
    WINDOW_SIZE = 9
    STRIP_SIZE = 3
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE,))
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0
    # Stays None if no epoch ever improves on best_val (e.g. NaN costs).
    adascale_param = None

    datagen = gen_lstm_batch_random(train_X, train_y, train_vidlens, batchsize=BATCH_SIZE)
    val_datagen = gen_lstm_batch_random(test_X, test_y, test_vidlens,
                                        batchsize=len(test_vidlens))
    integral_lens = compute_integral_len(train_vidlens)

    # We'll use this "validation set" to periodically check progress.
    # NOTE: it is drawn from the test split, in one batch of all test videos.
    X_val, y_val, mask_val, idxs_val = next(val_datagen)
    integral_lens_val = compute_integral_len(test_vidlens)
    dct_val = gen_seq_batch_from_idx(test_dct, idxs_val, test_vidlens, integral_lens_val, np.max(test_vidlens))
    X_diff_val = gen_seq_batch_from_idx(test_X_diff, idxs_val, test_vidlens, integral_lens_val, np.max(test_vidlens))

    def early_stop(cost_window):
        """Return True when the costs in the window are strictly increasing."""
        if len(cost_window) < 2:
            return False
        else:
            curr = cost_window[0]
            for idx, cost in enumerate(cost_window):
                if curr < cost or idx == 0:
                    curr = cost
                else:
                    return False
            return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, batch_idxs = next(datagen)
            d = gen_seq_batch_from_idx(train_dct, batch_idxs,
                                       train_vidlens, integral_lens, np.max(train_vidlens))
            X_diff = gen_seq_batch_from_idx(train_X_diff, batch_idxs,
                                            train_vidlens, integral_lens, np.max(train_vidlens))
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m, d, X_diff, WINDOW_SIZE)
            print('\r', end='')
        # The reported training cost is measured on the last minibatch only.
        cost = compute_train_cost(X, y, m, d, X_diff, WINDOW_SIZE)
        val_cost = compute_test_cost(X_val, y_val, mask_val, dct_val, X_diff_val, WINDOW_SIZE)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        # gl: relative increase of the current validation cost over the best
        # one so far; pk: how much the recent training strip exceeds its own
        # minimum; pq is their ratio.
        # NOTE(review): during the first STRIP_SIZE - 1 epochs train_strip
        # still holds its initial zeros, so pk's denominator is zero.
        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, dct_val, X_diff_val, WINDOW_SIZE, val_fn)
        class_rate.append(cr)

        print("Epoch {} train cost = {}, validation cost = {}, "
              "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)"
              .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, time.time() - time_start))

        if val_cost < best_val:
            best_val = val_cost
            best_conf = val_conf
            best_cr = cr
            if use_adascale:
                adascale_param = las.layers.get_all_param_values(adascale, scaling_param=True)

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch >= decay_start - 1:
            lr.set_value(lr.get_value() * lr_decay)

    phrases = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']

    print('Final Model')
    print('classification rate: {}, validation loss: {}'.format(best_cr, best_val))
    if use_adascale:
        print("final scaling params: {}".format(adascale_param))
    print('confusion matrix: ')
    plot_confusion_matrix(best_conf, phrases, fmt='grid')
    plot_validation_cost(cost_train, cost_val, class_rate, savefilename='valid_cost')
Ejemplo n.º 34
0
def construct_lstm(input_size, lstm_size, output_size, train_data_gen, val_data_gen):
    """Build a single-layer masked LSTM classifier, train it and report results.

    The network is input -> LSTM -> last time step -> softmax.  It is trained
    with adadelta on batches drawn from ``train_data_gen`` and monitored on
    one batch taken from ``val_data_gen``.  Nothing is returned; the best
    classification rate, validation loss and confusion matrix are printed
    and plotted.

    Args:
        input_size: Feature dimension of every time step.
        lstm_size: Number of hidden/cell units in the LSTM layer.
        output_size: Number of softmax output units.
        train_data_gen: Iterator yielding 4-tuples whose first three items are
            passed to the training function as (inputs, targets, mask).
        val_data_gen: Iterator whose first item is unpacked as a 3-tuple
            (inputs, targets, mask).
    """
    # Input, forget and output gates share one set of initializers; the gate
    # nonlinearity is left at the Gate default.
    gate_cfg = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    # The cell update has no cell-to-gate connection (W_cell=None) and uses tanh.
    cell_cfg = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        W_cell=None, b=las.init.Constant(0.),
        nonlinearity=tanh)

    # Batch size and sequence length are left open (None); only the feature
    # dimension is fixed.
    l_in = InputLayer(shape=(None, None, input_size), name='input')
    # Mask input, declared with both dimensions open as well.
    l_mask = InputLayer(shape=(None, None), name='mask')

    N_HIDDEN = lstm_size
    l_lstm = LSTMLayer(
        l_in, N_HIDDEN,
        mask_input=l_mask,
        ingate=gate_cfg, forgetgate=gate_cfg,
        cell=cell_cfg, outgate=gate_cfg,
        # Learn the initial state and clip gradients.
        learn_init=True, grad_clipping=5., name='lstm1')

    # (A bidirectional / stacked variant with dropout used to live here as
    # disabled code; only the single forward LSTM is active.)

    # Keep only index -1 along axis 1 of the LSTM output.
    l_last_step = SliceLayer(l_lstm, -1, 1, name='slice')

    # Softmax classifier on top of the sliced LSTM output.
    l_out = DenseLayer(
        l_last_step, num_units=output_size, nonlinearity=las.nonlinearities.softmax, name='output')

    print_network(l_out)

    # One integer class label per sequence.
    target_values = T.ivector('target_output')

    # Never wired into the graph: the mask reaches the network through
    # l_mask.input_var.  Retained so the construction sequence is unchanged.
    mask = T.matrix('mask')

    # Training graph: mean categorical cross-entropy.
    prediction = las.layers.get_output(l_out)
    cost = T.mean(las.objectives.categorical_crossentropy(prediction, target_values))
    all_params = las.layers.get_all_params(l_out, trainable=True)
    updates = adadelta(cost, all_params)

    labelled_inputs = (l_in.input_var, target_values, l_mask.input_var)
    train = theano.function(
        list(labelled_inputs),
        cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function(
        list(labelled_inputs), cost, allow_input_downcast=True)

    # Evaluation graph: deterministic forward pass.
    test_prediction = las.layers.get_output(l_out, deterministic=True)
    test_cost = T.mean(las.objectives.categorical_crossentropy(test_prediction, target_values))
    compute_val_cost = theano.function(list(labelled_inputs),
                                       test_cost, allow_input_downcast=True)
    val_fn = theano.function([l_in.input_var, l_mask.input_var], test_prediction, allow_input_downcast=True)

    # A single validation batch is reused after every epoch.
    X_val, y_val, mask_val = next(val_data_gen)

    # Per-epoch history and best-so-far bookkeeping.
    cost_train, cost_val, class_rate = [], [], []
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0
    NUM_EPOCHS = 30
    EPOCH_SIZE = 26
    STRIP_SIZE = 3
    MAX_LOSS = 0.05  # not referenced below
    VALIDATION_WINDOW = 4
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE,))

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds its predecessor."""
        if len(cost_window) < 2:
            return False
        previous = cost_window[0]
        for position, current in enumerate(cost_window):
            if position > 0 and not previous < current:
                return False
            previous = current
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for _ in range(EPOCH_SIZE):
            X, y, m, _ = next(train_data_gen)
            train(X, y, m)

        # Training cost is measured on the last minibatch of the epoch.
        train_cost = compute_train_cost(X, y, m)
        val_cost = compute_val_cost(X_val, y_val, mask_val)
        cr, conf = evaluate_model(X_val, y_val, mask_val, val_fn)

        cost_train.append(train_cost)
        cost_val.append(val_cost)
        class_rate.append(cr)
        train_strip[epoch % STRIP_SIZE] = train_cost
        val_window.push(val_cost)

        # gl: current validation cost relative to the best seen so far.
        # pk: mean of the recent training strip relative to its minimum.
        lowest_val = np.min(cost_val)
        gl = 100 * (cost_val[-1] / lowest_val - 1)
        strip_floor = STRIP_SIZE * np.min(train_strip)
        pk = 1000 * (np.sum(train_strip) / strip_floor - 1)
        pq = gl / pk

        elapsed = time.time() - time_start
        print("Epoch {} train cost = {}, validation cost = {}, "
              "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)"
              .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, elapsed))

        if val_cost < best_val:
            best_val, best_cr, best_conf = val_cost, cr, conf

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

    # Class labels for the confusion matrix: 'a' through 'z'.
    letters = list('abcdefghijklmnopqrstuvwxyz')

    print('Final Model')
    print('classification rate: {}'.format(best_cr))
    print('validation loss: {}'.format(best_val))
    print('confusion matrix: ')
    plot_confusion_matrix(best_conf, letters, fmt='grid')
    plot_validation_cost(cost_train, cost_val, class_rate)
Ejemplo n.º 35
0
def main():
    configure_theano()
    config_file = 'config/separate_train.ini'
    print('loading config file: {}'.format(config_file))
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('preprocessing dataset...')
    data = load_mat_file(config.get('data', 'images'))
    ae_pretrained = config.get('models', 'pretrained')
    ae_finetuned = config.get('models', 'finetuned')
    learning_rate = float(config.get('training', 'learning_rate'))
    decay_rate = float(config.get('training', 'decay_rate'))
    decay_start = int(config.get('training', 'decay_start'))
    lstm_units = int(config.get('training', 'lstm_units'))
    output_units = int(config.get('training', 'output_units'))
    do_finetune = config.getboolean('training', 'do_finetune')
    save_finetune = config.getboolean('training', 'save_finetune')
    load_finetune = config.getboolean('training', 'load_finetune')

    # 53 subjects, 70 utterances, 5 view angles
    # s[x]_v[y]_u[z].mp4
    # resized, height, width = (26, 44)
    # ['dataMatrix', 'targetH', 'targetsPerVideoVec', 'videoLengthVec', '__header__', 'targetsVec',
    # '__globals__', 'iterVec', 'filenamesVec', 'dataMatrixCells', 'subjectsVec', 'targetW', '__version__']

    print(data.keys())
    X = data['dataMatrix'].astype(
        'float32')  # .reshape((-1, 26, 44), order='f').reshape((-1, 26 * 44))
    y = data['targetsVec'].astype('int32')
    y = y.reshape((len(y), ))
    uniques = np.unique(y)
    print('number of classifications: {}'.format(len(uniques)))
    subjects = data['subjectsVec'].astype('int')
    subjects = subjects.reshape((len(subjects), ))
    video_lens = data['videoLengthVec'].astype('int')
    video_lens = video_lens.reshape((len(video_lens, )))

    train_subject_ids = read_data_split_file('data/train.txt')
    val_subject_ids = read_data_split_file('data/val.txt')
    test_subject_ids = read_data_split_file('data/test.txt')
    print('Train: {}'.format(train_subject_ids))
    print('Validation: {}'.format(val_subject_ids))
    print('Test: {}'.format(test_subject_ids))

    train_X, train_y, train_vidlens, train_subjects, \
    val_X, val_y, val_vidlens, val_subjects, \
    test_X, test_y, test_vidlens, test_subjects = \
        split_data(X, y, subjects, video_lens, train_subject_ids, val_subject_ids, test_subject_ids)

    assert train_X.shape[0] + val_X.shape[0] + test_X.shape[0] == len(X)
    assert train_y.shape[0] + val_y.shape[0] + test_y.shape[0] == len(y)
    assert train_vidlens.shape[0] + val_vidlens.shape[0] + test_vidlens.shape[
        0] == len(video_lens)
    assert train_subjects.shape[0] + val_subjects.shape[
        0] + test_subjects.shape[0] == len(subjects)

    train_X = normalize_input(train_X, centralize=True)
    test_X = normalize_input(test_X, centralize=True)

    if do_finetune:
        dbn = load_dbn(ae_pretrained)
        dbn.initialize()
        dbn.fit(train_X, train_X)
        recon = dbn.predict(test_X)
        visualize_reconstruction(reorder_data(test_X[800:864], (26, 44)),
                                 reorder_data(recon[800:864], (26, 44)),
                                 shape=(26, 44))

    if save_finetune:
        pickle.dump(dbn, open(ae_finetuned, 'wb'))

    if load_finetune:
        print('loading pre-trained encoding layers...')
        dbn = pickle.load(open(ae_finetuned, 'rb'))
        dbn.initialize()
        # recon = dbn.predict(test_X)
        # visualize_reconstruction(reorder_data(test_X[800:864], (26, 44)),
        #                         reorder_data(recon[800:864], (26, 44)),
        #                         shape=(26, 44))

    encoder = extract_encoder(dbn)
    train_X = encoder.predict(train_X)
    val_X = encoder.predict(val_X)
    test_X = encoder.predict(test_X)

    # train_X = concat_first_second_deltas(train_X, train_vidlens)
    # val_X = concat_first_second_deltas(val_X, val_vidlens)
    # test_X = concat_first_second_deltas(test_X, test_vidlens)

    # featurewise normalize
    train_X, mean, std = featurewise_normalize_sequence(train_X)
    val_X = (val_X - mean) / std
    test_X = (test_X - mean) / std

    # recon = dbn.predict(test_X)
    # visualize_reconstruction(test_X[550:650], recon[550:650], (26, 44))
    # exit()

    # IMPT: the encoder was trained with fortan ordered images, so to visualize
    # convert all the images to C order using reshape_images_order()
    # output = dbn.predict(test_X)
    # test_X = reshape_images_order(test_X, (26, 44))
    # output = reshape_images_order(output, (26, 44))
    # visualize_reconstruction(test_X[:36, :], output[:36, :], shape=(26, 44))

    inputs = T.tensor3('inputs', dtype='float32')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.ivector('targets')
    lr = theano.shared(np.array(learning_rate, dtype=theano.config.floatX),
                       name='learning_rate')
    lr_decay = np.array(decay_rate, dtype=theano.config.floatX)

    print('constructing lstm classifier...')
    network = lstm_classifier_baseline.create_model(
        (None, None, 50), inputs, (None, None), mask, lstm_units, output_units)

    print_network(network)
    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = T.mean(las.objectives.categorical_crossentropy(
        predictions, targets))
    updates = adadelta(cost, all_params, learning_rate=lr)
    # updates = las.updates.apply_momentum(sgd(cost, all_params, learning_rate=lr), all_params, 0.1)

    use_max_constraint = False
    if use_max_constraint:
        MAX_NORM = 4
        for param in las.layers.get_all_params(network, regularizable=True):
            if param.ndim > 1:  # only apply to dimensions larger than 1, exclude biases
                updates[param] = norm_constraint(
                    param,
                    MAX_NORM *
                    las.utils.compute_norms(param.get_value()).mean())

    train = theano.function([inputs, targets, mask],
                            cost,
                            updates=updates,
                            allow_input_downcast=True)
    compute_train_cost = theano.function([inputs, targets, mask],
                                         cost,
                                         allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = T.mean(
        las.objectives.categorical_crossentropy(test_predictions, targets))
    compute_test_cost = theano.function([inputs, targets, mask],
                                        test_cost,
                                        allow_input_downcast=True)

    val_fn = theano.function([inputs, mask],
                             test_predictions,
                             allow_input_downcast=True)

    # Train for up to NUM_EPOCHS epochs of EPOCH_SIZE minibatches each (may stop early)
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    NUM_EPOCHS = 30
    EPOCH_SIZE = 120
    BATCH_SIZE = 10
    STRIP_SIZE = 3
    MAX_LOSS = 0.2
    VALIDATION_WINDOW = 10
    val_window = circular_list(VALIDATION_WINDOW)
    train_strip = np.zeros((STRIP_SIZE, ))
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0

    datagen = gen_lstm_batch_random(train_X,
                                    train_y,
                                    train_vidlens,
                                    batchsize=BATCH_SIZE)
    val_datagen = gen_lstm_batch_random(val_X,
                                        val_y,
                                        val_vidlens,
                                        batchsize=len(val_vidlens))
    test_datagen = gen_lstm_batch_random(test_X,
                                         test_y,
                                         test_vidlens,
                                         batchsize=len(test_vidlens))

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val, _ = next(val_datagen)
    X_test, y_test, mask_test, _ = next(test_datagen)

    def early_stop(cost_window):
        """Return True when every cost in the window exceeds the one before it.

        Windows holding fewer than two costs never trigger a stop.
        """
        if len(cost_window) < 2:
            return False
        previous = cost_window[0]
        for position, current in enumerate(cost_window):
            # The first entry has nothing to be compared against.
            if position > 0 and not previous < current:
                return False
            previous = current
        return True

    for epoch in range(NUM_EPOCHS):
        time_start = time.time()
        for i in range(EPOCH_SIZE):
            X, y, m, _ = next(datagen)
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, EPOCH_SIZE, len(X), float(lr.get_value()))
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m)
            print('\r', end='')
        cost = compute_train_cost(X, y, m)
        val_cost = compute_test_cost(X_val, y_val, mask_val)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) /
                     (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model(X_val, y_val, mask_val, val_fn)
        class_rate.append(cr)

        if val_cost < best_val:
            best_val = val_cost
            best_conf = val_conf
            best_cr = cr
            test_cr, test_conf = evaluate_model(X_test, y_test, mask_test,
                                                val_fn)
            print(
                "Epoch {} train cost = {}, val cost = {}, "
                "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f}, Test CR= {:.3f} ({:.1f}sec)"
                .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                        test_cr,
                        time.time() - time_start))
        else:
            print("Epoch {} train cost = {}, val cost = {}, "
                  "GL loss = {:.3f}, GQ = {:.3f}, CR = {:.3f} ({:.1f}sec)".
                  format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr,
                         time.time() - time_start))

        if epoch >= VALIDATION_WINDOW and early_stop(val_window):
            break

        # learning rate decay
        if epoch > decay_start:
            lr.set_value(lr.get_value() * lr_decay)

    phrases = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10']

    print('Final Model')
    print('CR: {}, val loss: {}, Test CR: {}'.format(best_cr, best_val,
                                                     test_cr))
    print('confusion matrix: ')
    plot_confusion_matrix(test_conf, phrases, fmt='grid')
    plot_validation_cost(cost_train, cost_val, savefilename='valid_cost')
Ejemplo n.º 36
0
def model_class(ds, paths, param_arch, param_cost, param_updates, param_train):
    """Build the classification network and compile its Theano functions.

    A configuration log file is created in ``paths['exp']``; the chosen
    cost-function, update and training settings are written to it.

    Args:
        ds: dataset object; ``descs_train``, ``labels_train`` and
            ``epoch_size`` are read from it.
        paths: mapping with an ``'exp'`` entry, the directory that receives
            the configuration log.
        param_arch: architecture settings; ``'type'`` (0, 1 or 2) selects the
            builder and ``'non_linearity'`` selects the matching loss variant.
        param_cost: cost settings; ``'cost_func'`` names the data term and
            ``'mu'`` weights the L2 regularisation term.
        param_updates: optimiser settings; ``'method'`` is one of
            ``'momentum'``, ``'adagrad'``, ``'adadelta'`` or ``'adam'``, plus
            the hyper-parameters that method requires.
        param_train: training settings; an optional ``'start_from_epoch'``
            entry changes the name of the log file.

    Returns:
        A tuple ``(funcs, cla, updates)`` where ``funcs`` maps ``'train'``,
        ``'fwd'``, ``'pred'`` and ``'feat'`` to compiled Theano functions.

    Raises:
        Exception: if the log file already exists, if the architecture type,
            cost function, non-linearity or update method is not supported,
            or if a hyper-parameter required by the update method is missing.
    """

    def _required_update_param(key):
        # Fetch a mandatory optimiser hyper-parameter, naming it on failure.
        value = param_updates.get(key)
        if value is None:
            raise Exception('[e] missing %s parameter!' % key)
        return value

    # create a log file containing the architecture configuration
    formatter = logging.Formatter('%(message)s')
    logger = logging.getLogger('log_config')
    if 'start_from_epoch' in param_train:
        name_tmp = 'config_from_epoch=%04d.log' % (
            param_train['start_from_epoch'])
    else:
        name_tmp = 'config.log'
    path_tmp = os.path.join(paths['exp'], name_tmp)
    # Refuse to overwrite the configuration of a previous run.
    if os.path.isfile(path_tmp):
        raise Exception('[e] the log file %s already exists!' % name_tmp)
    handler = logging.FileHandler(
        path_tmp,
        mode='w')  # to append at the end of the file use: mode='a'
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    # input dimensions
    dim_desc = ds.descs_train[0].shape[1]
    dim_labels = ds.labels_train[0].shape[0]
    print(dim_labels)

    # architecture definition:
    print(("[i] architecture definition... "), end=' ')
    tic = time.time()
    if param_arch['type'] == 0:
        desc, patch_op, cla, net, logger = arch_class_00(
            dim_desc, dim_labels, param_arch, logger)
    elif param_arch['type'] == 1:
        desc, patch_op, cla, net, logger = arch_class_01(
            dim_desc, dim_labels, param_arch, logger)
    elif param_arch['type'] == 2:
        desc, patch_op, cla, net, logger = arch_class_02(
            dim_desc, dim_labels, param_arch, logger)
    else:
        raise Exception('[e] architecture not supported!')
    print(("%02.2fs" % (time.time() - tic)))

    # cost function definition:
    print(("[i] cost function definition... "), end=' ')
    tic = time.time()
    # NOTE(review): the training cost below is built from this deterministic
    # output as well - confirm that is intended when dropout is in use.
    pred = LL.get_output(cla, deterministic=True)  # in case we use dropout
    feat = LL.get_output(net)
    target = T.ivector('target')
    # data term
    if param_cost['cost_func'] == 'cross_entropy':
        if param_arch['non_linearity'] == 'softmax':
            cost_dataterm = T.mean(
                LO.categorical_crossentropy(pred, target)
            )  # in the original code we were using *.mean() instead of T.mean(*)
        elif param_arch['non_linearity'] == 'log_softmax':
            cost_dataterm = T.mean(
                categorical_crossentropy_logdomain(pred, target))
        else:
            # Fail here instead of leaving cost_dataterm unbound.
            raise Exception(
                '[e] the chosen cost function is not implemented for the chosen non-linearity!'
            )
    elif param_cost['cost_func'] == 'cross_entropy_stable':
        if param_arch['non_linearity'] == 'softmax':
            cost_dataterm = T.mean(
                categorical_crossentropy_stable(pred, target))
        else:
            raise Exception(
                '[e] the chosen cost function is not implemented for the chosen non-linearity!'
            )
    else:
        raise Exception('[e] the chosen cost function is not supported!')
    # classification accuracy
    acc = LO.categorical_accuracy(pred, target).mean()
    # regularization
    cost_reg = param_cost['mu'] * LR.regularize_network_params(cla, LR.l2)
    # cost function
    cost = cost_dataterm + cost_reg
    # get params
    params = LL.get_all_params(cla)
    # gradient definition
    grad = T.grad(cost, params)
    grad_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grad]), 2)
    print(("%02.2fs" % (time.time() - tic)))

    # updates definition:
    print(("[i] gradient updates definition... "), end=' ')
    tic = time.time()
    method = param_updates['method']
    if method == 'momentum':
        learning_rate = _required_update_param('learning_rate')  # default: 1.0
        momentum = _required_update_param('momentum')  # default: 0.9
        updates = LU.momentum(grad, params, learning_rate, momentum)
    elif method == 'adagrad':
        learning_rate = _required_update_param('learning_rate')  # default: 1.0
        updates = LU.adagrad(grad, params, learning_rate)
    elif method == 'adadelta':
        learning_rate = _required_update_param('learning_rate')  # default: 1.0
        updates = LU.adadelta(grad, params, learning_rate)
    elif method == 'adam':
        learning_rate = _required_update_param('learning_rate')  # default: 1e-03
        beta1 = _required_update_param('beta1')  # default: 0.9
        beta2 = _required_update_param('beta2')  # default: 0.999
        epsilon = _required_update_param('epsilon')  # default: 1e-08
        updates = LU.adam(grad, params, learning_rate, beta1, beta2, epsilon)
    else:
        raise Exception('[e] updates method not supported!')
    print(("%02.2fs" % (time.time() - tic)))

    # train / test functions:
    funcs = dict()
    # Every compiled function takes the same three inputs; 'target' is unused
    # by some of them, hence the on_unused_input settings.
    fn_inputs = [desc.input_var, patch_op.input_var, target]
    print(("[i] compiling function 'train'... "), end=' ')
    tic = time.time()
    funcs['train'] = theano.function(
        fn_inputs,
        [cost, cost_dataterm, cost_reg, grad_norm, acc],
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='warn')
    print(("%02.2fs" % (time.time() - tic)))
    print(("[i] compiling function 'fwd'... "), end=' ')
    tic = time.time()
    funcs['fwd'] = theano.function(
        fn_inputs, [cost, grad_norm, acc],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print(("%02.2fs" % (time.time() - tic)))
    print(("[i] compiling function 'pred'... "), end=' ')
    tic = time.time()
    funcs['pred'] = theano.function(
        fn_inputs, [pred],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print(("%02.2fs" % (time.time() - tic)))
    print(("[i] compiling function 'feat'... "), end=' ')
    tic = time.time()
    funcs['feat'] = theano.function(
        fn_inputs, [feat],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print(("%02.2fs" % (time.time() - tic)))

    # save cost function parameters to a config file
    logger.info('\nCost function parameters:')
    logger.info('   cost function = %s' % param_cost['cost_func'])
    logger.info('   mu            = %e' % param_cost['mu'])

    # save updates parameters to a config file
    logger.info('\nUpdates parameters:')
    logger.info('   method        = %s' % param_updates['method'])
    logger.info('   learning rate = %e' % param_updates['learning_rate'])
    if param_updates['method'] == 'momentum':
        logger.info('   momentum      = %e' % param_updates['momentum'])
    if param_updates['method'] == 'adam':
        logger.info('   beta1         = %e' % param_updates['beta1'])
        logger.info('   beta2         = %e' % param_updates['beta2'])
        logger.info('   epsilon       = %e' % param_updates['epsilon'])

    # save training parameters to a config file
    logger.info('\nTraining parameters:')
    logger.info('   epoch size = %d' % ds.epoch_size)

    return funcs, cla, updates
Ejemplo n.º 37
0
    def __init__(self,
                 retina_model,
                 seeder_model,
                 n_seeds,
                 n_steps,
                 n_units=100,
                 normalization_coefs=None,
                 loss_coefs=None,
                 alpha=1.0,
                 threshold=1.0):
        """Unroll ``n_steps`` learned update steps and compile the functions.

        Each step feeds the current parameters, the retina response and its
        gradients to the network built by ``self.build_nn`` and applies the
        network's output as an update to the parameters.

        Args:
            retina_model: model object providing ``get_event_variables``,
                ``alloc_model_params``, ``grad_for``, ``parameter_response``,
                ``model_nparams`` and ``model_params_names``.
            seeder_model: callable invoked with ``retina_model``; its result
                is stored as ``self.seeder``.
            n_seeds: stored as ``self.n_seeds``.
            n_steps: number of update steps unrolled into the graph.
            n_units: forwarded to ``self.build_nn``.
            normalization_coefs: per-parameter multipliers applied to the
                parameters and gradients before they reach the network;
                defaults to ones of length ``retina_model.model_nparams``.
            loss_coefs: forwarded to ``retina_model.parameter_response``.
            alpha: scale applied to the predicted parameter updates.
            threshold: stored as ``self.threshold``.
        """
        self.seeder_model = seeder_model
        self.n_seeds = n_seeds
        self.n_steps = n_steps

        self.threshold = threshold

        self.retina = retina_model

        event_shareds = retina_model.get_event_variables()

        self.seeder = self.seeder_model(retina_model)

        if normalization_coefs is None:
            normalization_coefs = np.ones(shape=retina_model.model_nparams,
                                          dtype='float32')
        else:
            normalization_coefs = np.array(normalization_coefs,
                                           dtype='float32')

        ### params + sigma
        self.inputs = retina_model.alloc_model_params()

        self.input_layer, self.out_layer, self.reg = self.build_nn(
            retina_model.model_nparams, n_units=n_units)

        print('Linking to Retina Model')

        iterations = [self.inputs]
        responses = []

        for step in range(self.n_steps):
            print('Iteration %d' % step)

            prev = iterations[step]
            r, grads = retina_model.grad_for(*(event_shareds + prev))

            normed_params = [p * c for p, c in zip(prev, normalization_coefs)]

            normed_grads = [g * c for g, c in zip(grads, normalization_coefs)]

            out = self.get_update_for(normed_params, r, normed_grads)

            # A separate index name keeps the step counter intact on
            # Python 2, where comprehension variables leak into this scope.
            param_updates = [out[:, k] for k in range(len(self.inputs))]

            track_param_updates = param_updates[:-1]
            sigma_update = param_updates[-1]

            ### sigma (last parameter) is updated simply by replacing
            ### previous variable
            update = [
                var + upd * alpha
                for var, upd in zip(prev[:-1], track_param_updates)
            ] + [T.exp(-sigma_update)]

            for var, upd, new in zip(prev[:-1], track_param_updates, update):
                print('  - %s = %s + %.2e %s' % (new, var, alpha, upd))

            iterations.append(update)
            responses.append(r)

        prediction = iterations[-1]

        sigma_train = T.fscalar('sigma_train')

        ### Except sigma
        self.true_parameters_shareds = [
            theano.shared(np.ndarray(shape=(0, ), dtype='float32'), name=name)
            for name in retina_model.model_params_names[:-1]
        ]

        ### predictions without sigma
        print('Constucting loss:')
        print('  - Loss coefs: %s' % (loss_coefs, ))
        print('  - True params shared: %s' % (self.true_parameters_shareds, ))
        print('  - Predictions: %s' % (prediction[:-1], ))
        print('  - Sigma: %s' % (sigma_train, ))

        pure_response, rmse = retina_model.parameter_response(
            loss_coefs,
            *(self.true_parameters_shareds + prediction[:-1] + [sigma_train]))

        pure_loss = 1.0 - pure_response

        initial_response, initial_rmse = retina_model.parameter_response(
            loss_coefs,
            *(self.true_parameters_shareds + self.inputs[:-1] +
              [sigma_train]))

        initial_loss = 1.0 - initial_response

        reg_c = T.fscalar('reg_c')
        # Give this input its own name; it previously reused 'reg_c'.
        alpha_rmse = T.fscalar('alpha_rmse')

        loss = ((1.0 - alpha_rmse) * pure_loss + alpha_rmse * rmse +
                reg_c * self.reg)

        params = layers.get_all_params(self.out_layer)
        learning_rate = T.fscalar('learning rate')

        net_updates = updates.adadelta(loss,
                                       params,
                                       learning_rate=learning_rate)

        self._train = theano.function(
            self.inputs + [sigma_train, learning_rate, reg_c, alpha_rmse],
            [pure_loss, rmse, self.reg, loss, initial_loss, initial_rmse],
            updates=net_updates)

        self._loss = theano.function(self.inputs + [sigma_train], pure_loss)

        # Flatten the per-step parameter lists into a single output list.
        outputs = [v for it in iterations for v in it]

        self.ndim = len(self.inputs)

        self.predictions = theano.function(self.inputs, responses + outputs)

        self.responses = None
        self.traces = None
        self.seeds = None
Ejemplo n.º 38
0
# Layer-3 weights: weight each basis element by its alpha coefficient and sum
# over the shared (third) axis.
w_L3 = T.sum(alphas_L3[:, :, :, None, None] * basis_L3[None, None, :, :, :],
             axis=2)
w_L4 = init_weights((3136, 10))

#-------------------------
# Set up function
#-------------------------

# Two passes through the same weights: the first with non-zero values for the
# last two arguments of model() (presumably dropout rates - TODO confirm),
# the second with both set to zero for prediction.
noise_l1, noise_l2, noise_l3, noise_py_x = model(X, w_L1, w_L2, w_L3, w_L4,
                                                 0.2, 0.7)
l1, l2, l3, py_x = model(X, w_L1, w_L2, w_L3, w_L4, 0., 0.)
# Predicted class index: argmax over axis 1 of the noise-free output.
y_x = T.argmax(py_x, axis=1)

# The cost is computed on the noisy pass. Only the alpha coefficients and the
# last weight matrix are listed as parameters to update.
cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [alphas_L1, alphas_L2, alphas_L3, w_L4]
updates = adadelta(cost, params, learning_rate=lr, rho=0.95, epsilon=1e-6)

# lr is a function input, so the learning rate is passed on every train call.
train = theano.function(inputs=[X, Y, lr],
                        outputs=cost,
                        updates=updates,
                        allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

#-------------------------
# Train model
#-------------------------

d = {}
batch_size = 25
# args.epochs is indexed like a sequence: keep its first entry separately and
# an integer array of all entries.
epochs = args.epochs[0]
epoch_count = np.array(args.epochs).astype(int)
Ejemplo n.º 39
0
    def build_treatment_model(self, n_vars, **kwargs):
        """Build the treatment network and compile its Theano functions.

        The network is stored in ``self.treatment_output``; the compiled
        functions are stored in ``self._train_fn``, ``self._loss_fn`` and
        ``self._output_fn``.

        Args:
            n_vars: number of input features per sample.
            **kwargs: must contain ``dense_size`` (width of the hidden dense
                layers; the first one is twice as wide) and
                ``n_dense_layers`` (total number of hidden dense layers).

        Returns:
            The parameter values of the freshly built network, as returned by
            ``layers.get_all_param_values``.

        Raises:
            KeyError: if ``dense_size`` or ``n_dense_layers`` is missing.
        """
        # Read the required settings first so a missing key fails before any
        # graph construction.
        dense_size = kwargs['dense_size']
        n_dense_layers = kwargs['n_dense_layers']

        input_vars = TT.matrix()
        instrument_vars = TT.matrix()
        targets = TT.vector()

        inputs = layers.InputLayer((None, n_vars), input_vars)
        inputs = layers.DropoutLayer(inputs, p=0.2)

        dense_layer = layers.DenseLayer(inputs,
                                        2 * dense_size,
                                        nonlinearity=nonlinearities.rectify)
        dense_layer = layers.batch_norm(dense_layer)
        dense_layer = layers.DropoutLayer(dense_layer, p=0.2)

        # range works on both Python 2 and 3 (xrange is Python 2 only).
        for _ in range(n_dense_layers - 1):
            dense_layer = layers.DenseLayer(
                dense_layer,
                dense_size,
                nonlinearity=nonlinearities.rectify)
            dense_layer = layers.batch_norm(dense_layer)

        self.treatment_output = layers.DenseLayer(
            dense_layer, 1, nonlinearity=nonlinearities.linear)
        init_params = layers.get_all_param_values(self.treatment_output)

        prediction = layers.get_output(self.treatment_output,
                                       deterministic=False)
        test_prediction = layers.get_output(self.treatment_output,
                                            deterministic=True)

        l2_cost = regularization.regularize_network_params(
            self.treatment_output, regularization.l2)
        loss = gmm_loss(prediction, targets, instrument_vars) + 1e-4 * l2_cost

        params = layers.get_all_params(self.treatment_output, trainable=True)
        param_updates = updates.adadelta(loss, params)

        # Training and loss evaluation take the same inputs; only the
        # training function applies the parameter updates.
        loss_inputs = [
            input_vars,
            targets,
            instrument_vars,
        ]

        self._train_fn = theano.function(loss_inputs,
                                         loss,
                                         updates=param_updates)

        self._loss_fn = theano.function(loss_inputs, loss)

        self._output_fn = theano.function([input_vars], test_prediction)

        return init_params
Ejemplo n.º 40
0
    def __init__(self, full_length, output_size, meta_size, depth=2, encoder_size=64, decoder_size=64):
        """Build the recurrent encoder / dense decoder and compile functions.

        The compiled functions are stored in ``self._train_fn``,
        ``self._loss_fn`` and ``self._predict_fn``.

        Args:
            full_length: width of the mean and log-variance outputs; also the
                middle dimension of their final reshape.
            output_size: feature size of the sequence input.
            meta_size: feature size of the meta input.
            depth: number of dense + batch-norm blocks in the decoder.
            encoder_size: twice the unit count of each recurrent layer.
            decoder_size: unit count of each decoder dense layer.
        """
        latent_size = 16
        # Floor division keeps the unit count an integer on Python 3 as well.
        recurrent_units = encoder_size // 2

        input_var = TT.tensor3(dtype='float32')
        meta_var = TT.tensor3(dtype='float32')
        target_var = TT.matrix()
        cut_weights = TT.vector(dtype='float32')

        input_layer = layers.InputLayer((None, None, output_size), input_var=input_var)
        meta_layer = layers.InputLayer((None, None, meta_size), input_var=meta_var)
        meta_layer = layers.DropoutLayer(meta_layer, p=0.2)
        concat_input_layer = layers.ConcatLayer([input_layer, meta_layer], axis=-1)

        # encoder
        lstm_layer = layers.RecurrentLayer(concat_input_layer, recurrent_units, learn_init=True)
        lstm_layer = layers.RecurrentLayer(lstm_layer, recurrent_units, learn_init=True)

        # Collapse the leading dimensions so the dense layers see 2-D input.
        lstm_layer = layers.ReshapeLayer(lstm_layer, (-1, recurrent_units))

        encoded = layers.DenseLayer(lstm_layer, latent_size)
        encoded = layers.batch_norm(encoded)

        # decoder
        dense = encoded
        for _ in range(depth):
            dense = layers.DenseLayer(dense, decoder_size)
            dense = layers.batch_norm(dense)

        mu_and_logvar_x_layer = layers.DenseLayer(dense, full_length * 2, nonlinearity=nonlinearities.linear)

        # First half of the output is the mean, second half the log-variance.
        mu_x_layer = layers.SliceLayer(mu_and_logvar_x_layer, slice(0, full_length), axis=1)
        mu_x_layer = layers.ReshapeLayer(mu_x_layer, (-1, full_length, full_length))
        logvar_x_layer = layers.SliceLayer(mu_and_logvar_x_layer, slice(full_length, None), axis=1)
        logvar_x_layer = layers.ReshapeLayer(logvar_x_layer, (-1, full_length, full_length))

        l2_norm = regularization.regularize_network_params(mu_and_logvar_x_layer, regularization.l2)

        loss = neg_log_likelihood(
            target_var,
            layers.get_output(mu_x_layer, deterministic=False),
            layers.get_output(logvar_x_layer, deterministic=False),
            cut_weights
        ) + 1e-4 * l2_norm

        # Deterministic outputs, shared by the evaluation loss and prediction.
        eval_mu = layers.get_output(mu_x_layer, deterministic=True)
        eval_logvar = layers.get_output(logvar_x_layer, deterministic=True)

        # The evaluation loss previously used deterministic=False, which made
        # it identical to the training loss.
        test_loss = neg_log_likelihood(
            target_var,
            eval_mu,
            eval_logvar,
            cut_weights
        ) + 1e-4 * l2_norm

        params = layers.get_all_params(mu_and_logvar_x_layer, trainable=True)
        param_updates = updates.adadelta(loss.mean(), params)

        self._train_fn = theano.function(
            [input_var, meta_var, target_var, cut_weights],
            updates=param_updates,
            outputs=loss.mean()
        )

        self._loss_fn = theano.function(
            [input_var, meta_var, target_var, cut_weights],
            outputs=test_loss.mean()
        )

        self._predict_fn = theano.function(
            [input_var, meta_var],
            outputs=[eval_mu, eval_logvar]
        )