Example #1
def get_updates(nnet, train_obj, trainable_params):

    implemented_solvers = ("nesterov", "adagrad", "adadelta", "adam")

    if not hasattr(nnet, "solver") or nnet.solver not in implemented_solvers:
        nnet.sgd_solver = "nesterov"
    else:
        nnet.sgd_solver = nnet.solver

    if nnet.sgd_solver == "nesterov":
        updates = l_updates.nesterov_momentum(train_obj,
                                              trainable_params,
                                              learning_rate=Cfg.learning_rate,
                                              momentum=0.9)

    elif nnet.sgd_solver == "adagrad":
        updates = l_updates.adagrad(train_obj,
                                    trainable_params,
                                    learning_rate=Cfg.learning_rate)

    elif nnet.sgd_solver == "adadelta":
        updates = l_updates.adadelta(train_obj,
                                     trainable_params,
                                     learning_rate=Cfg.learning_rate)

    elif nnet.sgd_solver == "adam":
        updates = l_updates.adam(train_obj,
                                 trainable_params,
                                 learning_rate=Cfg.learning_rate)

    return updates
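A minimal usage sketch (not part of the original example): assuming the nnet object and the Cfg configuration referenced above are defined, the update dictionary returned by get_updates() plugs straight into theano.function.

import theano
import theano.tensor as T

# toy objective over a single shared parameter (illustrative only)
w = theano.shared(0.0, name='w')
x = T.scalar('x')
train_obj = T.sqr(w * x - 1.0)

updates = get_updates(nnet, train_obj, [w])            # nesterov/adagrad/... update dict
train_step = theano.function([x], train_obj, updates=updates)
for _ in range(10):
    train_step(2.0)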
Example #2
def create_train_func(layers, lr=0.01):
    # dims: batch, sequence, vocabulary
    X = T.tensor3('X')
    X_batch = T.tensor3('X_batch')

    # dims: target
    y = T.ivector('y')
    y_batch = T.ivector('y_batch')

    y_hat = get_output(layers['l_out'], X, deterministic=False)

    train_loss = T.mean(categorical_crossentropy(y_hat, y), axis=0)
    params = get_all_params(layers['l_out'], trainable=True)

    updates = adagrad(train_loss, params, lr)

    train_func = theano.function(
        inputs=[theano.In(X_batch), theano.In(y_batch)],
        outputs=train_loss,
        updates=updates,
        givens={
            X: X_batch,
            y: y_batch,
        },
    )

    return train_func
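A sketch (not from the original project) of a layers dict that create_train_func() accepts; the layer sizes are illustrative.

import lasagne
import lasagne.layers as L

l_in = L.InputLayer((None, 10, 50))                    # batch, sequence, vocabulary
l_out = L.DenseLayer(L.FlattenLayer(l_in), num_units=50,
                     nonlinearity=lasagne.nonlinearities.softmax)
layers = {'l_in': l_in, 'l_out': l_out}

train_func = create_train_func(layers, lr=0.01)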
Example #3
def generate_theano_func(args, network, penalty, input_dict, target_var):

    prediction = get_output(network, input_dict)

    # loss = T.mean( target_var * ( T.log(target_var) - prediction ))
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    # loss += 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(network) )
    # penalty = sum ( T.sum(lstm_param**2) for lstm_param in lstm_params )
    # penalty = regularize_layer_params(l_forward_1_lstm, l2)
    # penalty = T.sum(lstm_param**2 for lstm_param in lstm_params)
    # penalty = 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(l_forward_1) )

    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise "Need set optimizer correctly"

    test_prediction = get_output(network, input_dict, deterministic=True)
    # test_prediction = get_output(network, deterministic=True)
    # test_loss = T.mean( target_var * ( T.log(target_var) - test_prediction))
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    # input1_var, input1_mask_var, etc. are assumed to be defined in the enclosing scope
    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )

    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )

    return train_fn, val_fn
Example #4
def build_updates(grad, params, optimization, learning_rate):
    """ setup optimization algorithm """

    if optimization['optimizer'] == 'sgd':
        update_op = updates.sgd(grad, params, learning_rate=learning_rate)

    elif optimization['optimizer'] == 'nesterov_momentum':
        if 'momentum' in optimization:
            momentum = optimization['momentum']
        else:
            momentum = 0.9
        update_op = updates.nesterov_momentum(grad, params, learning_rate=learning_rate, momentum=momentum)

    elif optimization['optimizer'] == 'adagrad':
        update_op = updates.adagrad(grad, params, learning_rate=learning_rate)

    elif optimization['optimizer'] == 'rmsprop':
        if 'rho' in optimization:
            rho = optimization['rho']
        else:
            rho = 0.9
        update_op = updates.rmsprop(grad, params, learning_rate=learning_rate, rho=rho)

    elif optimization['optimizer'] == 'adam':
        if 'beta1' in optimization:
            beta1 = optimization['beta1']
        else:
            beta1 = 0.9
        if 'beta2' in optimization:
            beta2 = optimization['beta2']
        else:
            beta2 = 0.999
        update_op = updates.adam(grad, params, learning_rate=learning_rate, beta1=beta1, beta2=beta2)

    return update_op
Example #5
 def optimize(grads, params):
     if state['optim_method'] == 'adam':
         updates = adam(grads, params, lrt, state['momentum'])  # note: adam's fourth positional argument is beta1
     elif state['optim_method'] == 'adagrad':
         updates = adagrad(grads, params, lrt)
     elif state['optim_method'] == 'sgd':
         updates = sgd(grads, params, lrt)
     return updates
Example #6
def adagrad_momentum(grads,
                     params,
                     learning_rate=1.0,
                     momentum=0.9,
                     epsilon=1e-06):
    return apply_nesterov_momentum(adagrad(grads, params, learning_rate,
                                           epsilon),
                                   params=params,
                                   momentum=momentum)
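A usage sketch (illustrative, not from the source): adagrad_momentum() is called like any other Lasagne update rule, and its first argument can be a loss expression because lasagne.updates.adagrad accepts either a loss or a list of gradients; adagrad and apply_nesterov_momentum are assumed to be imported from lasagne.updates as in the snippet above.

import theano
import theano.tensor as T

w = theano.shared(0.0, name='w')
x = T.scalar('x')
loss = T.sqr(w * x - 1.0)

updates = adagrad_momentum(loss, [w], learning_rate=0.5, momentum=0.9)
step = theano.function([x], loss, updates=updates)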
Example #7
    def create_iter_functions(self,
                              dataset,
                              output_layer,
                              X_tensor_type=T.matrix):
        batch_index = T.iscalar('batch_index')
        X_batch = X_tensor_type('x')
        y_batch = T.ivector('y')

        batch_slice = slice(batch_index * self.batch_size,
                            (batch_index + 1) * self.batch_size)

        objective = Objective(output_layer,
                              loss_function=categorical_crossentropy)

        loss_train = objective.get_loss(X_batch, target=y_batch)
        loss_eval = objective.get_loss(X_batch,
                                       target=y_batch,
                                       deterministic=True)

        pred = T.argmax(output_layer.get_output(X_batch, deterministic=True),
                        axis=1)
        proba = output_layer.get_output(X_batch, deterministic=True)
        accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX)

        all_params = get_all_params(output_layer)
        updates = adagrad(loss_train, all_params, self.lr, self.epsilon)

        iter_train = theano.function(
            [batch_index],
            loss_train,
            updates=updates,
            givens={
                X_batch: dataset['X_train'][batch_slice],
                y_batch: dataset['y_train'][batch_slice],
            },
            on_unused_input='ignore',
        )

        iter_valid = None
        if self.use_valid:
            iter_valid = theano.function(
                [batch_index],
                [loss_eval, accuracy, proba],
                givens={
                    X_batch: dataset['X_valid'][batch_slice],
                    y_batch: dataset['y_valid'][batch_slice],
                },
            )

        return dict(train=iter_train, valid=iter_valid)
Example #8
 def test_workflow(self):
     input_var = theano.shared(self.x)
     with pm.Model() as model:
         inp = layers.InputLayer(self.x.shape, input_var=input_var)
         out = BayesDenseLayer(inp, 1, nonlinearity=to.identity)
         pm.Normal('y', mu=get_output(out), sd=self.sd, observed=self.y)
     elbo, _, upd_rng, vp = sample_elbo(model, samples=1)
     upd_adam = updates.adagrad(-elbo, vp.params)
     upd_rng.update(upd_adam)
     step = theano.function([], elbo, updates=upd_rng)
     for i in range(100):
         step()
     self.assertRaises(ValueError, get_output, out, deterministic=True)
     preds = get_output(out, vp=vp, deterministic=True)
     np.testing.assert_allclose(preds.eval(), self.y, rtol=0, atol=1)
Example #9
    def get_cost_updates(self, corruption_level, learning_rate, noise = 0.0, momentum=0):
        """ This function computes the cost and the updates for one trainng
        step of the dA """

        tilde_x = self.get_corrupted_input(self.x, corruption_level, noise)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        L = - T.sum(self.desired * T.log(z) + (1 - self.desired) * T.log(1 - z), axis=1)
        cost = T.mean(L)

        # adagrad with momentum on cost
        updates_ada = adagrad(cost, self.params, learning_rate=learning_rate)
        updates = apply_momentum(updates_ada, self.params, momentum=momentum)

        return (cost, updates)
Example #10
def get_updates(nnet, train_obj, trainable_params, solver=None):

    implemented_solvers = ("sgd", "momentum", "nesterov", "adagrad", "rmsprop",
                           "adadelta", "adam", "adamax")

    if solver not in implemented_solvers:
        nnet.sgd_solver = "adam"
    else:
        nnet.sgd_solver = solver

    if nnet.sgd_solver == "sgd":
        updates = l_updates.sgd(train_obj,
                                trainable_params,
                                learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "momentum":
        updates = l_updates.momentum(train_obj,
                                     trainable_params,
                                     learning_rate=Cfg.learning_rate,
                                     momentum=Cfg.momentum)
    elif nnet.sgd_solver == "nesterov":
        updates = l_updates.nesterov_momentum(train_obj,
                                              trainable_params,
                                              learning_rate=Cfg.learning_rate,
                                              momentum=Cfg.momentum)
    elif nnet.sgd_solver == "adagrad":
        updates = l_updates.adagrad(train_obj,
                                    trainable_params,
                                    learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "rmsprop":
        updates = l_updates.rmsprop(train_obj,
                                    trainable_params,
                                    learning_rate=Cfg.learning_rate,
                                    rho=Cfg.rho)
    elif nnet.sgd_solver == "adadelta":
        updates = l_updates.adadelta(train_obj,
                                     trainable_params,
                                     learning_rate=Cfg.learning_rate,
                                     rho=Cfg.rho)
    elif nnet.sgd_solver == "adam":
        updates = l_updates.adam(train_obj,
                                 trainable_params,
                                 learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "adamax":
        updates = l_updates.adamax(train_obj,
                                   trainable_params,
                                   learning_rate=Cfg.learning_rate)

    return updates
Example #11
    def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix):
        batch_index = T.iscalar('batch_index')
        X_batch = X_tensor_type('x')
        y_batch = T.ivector('y')

        batch_slice = slice(batch_index * self.batch_size, (batch_index + 1) * self.batch_size)

        objective = Objective(output_layer, loss_function=categorical_crossentropy)

        loss_train = objective.get_loss(X_batch, target=y_batch)
        loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True)

        pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1)
        proba = output_layer.get_output(X_batch, deterministic=True)
        accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX)

        all_params = get_all_params(output_layer)
        updates = adagrad(loss_train, all_params, self.lr, self.rho)  # note: adagrad's fourth positional argument is epsilon, not rho

        iter_train = theano.function(
            [batch_index], loss_train,
            updates=updates,
            givens={
                X_batch: dataset['X_train'][batch_slice],
                y_batch: dataset['y_train'][batch_slice],
            },
            on_unused_input='ignore',
        )

        iter_valid = None
        if self.use_valid:
            iter_valid = theano.function(
                [batch_index], [loss_eval, accuracy, proba],
                givens={
                    X_batch: dataset['X_valid'][batch_slice],
                    y_batch: dataset['y_valid'][batch_slice],
                },
            )

        return dict(train=iter_train, valid=iter_valid)
Example #12
File: model.py Project: xenx/speech
    def __init__(self, num_units=2):
        """
        :param num_units: output vector dimension
        """

        def loss_func(all_predicted):
            def distance_sq(x1, x2):
                return T.sum(T.sqr(x1 - x2))

            d1 = distance_sq(all_predicted[:, 0], all_predicted[:, 1])
            d2 = distance_sq(all_predicted[:, 0], all_predicted[:, 2])
            alpha = 1e-2

            return T.maximum(d1 + alpha, 0) - T.maximum(d2 + alpha, 0)

        super().__init__(
            sound_shape=(20, 100),
            num_units=num_units,
            main_layer_class=Speech2VecLayer,
            loss_func=loss_func,
            updates_func=lambda loss, weights: adagrad(loss, weights, learning_rate=0.001)
        )
Example #13
def model_class(ds, paths, param_arch, param_cost, param_updates, param_train):

    # create a log file containing the architecture configuration
    formatter = logging.Formatter('%(message)s')
    logger = logging.getLogger('log_config')
    if 'start_from_epoch' in param_train:
        name_tmp = 'config_from_epoch=%04d.log' % (
            param_train['start_from_epoch'])
    else:
        name_tmp = 'config.log'
    path_tmp = os.path.join(paths['exp'], name_tmp)
    if not os.path.isfile(path_tmp):
        handler = logging.FileHandler(
            path_tmp,
            mode='w')  # to append at the end of the file use: mode='a'
    else:
        raise Exception('[e] the log file %s already exists!' % name_tmp)
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    # input dimensions
    dim_desc = ds.descs_train[0].shape[1]
    dim_labels = ds.labels_train[0].shape[0]
    print(dim_labels)

    # architecture definition:
    print(("[i] architecture definition... "), end=' ')
    tic = time.time()
    if param_arch['type'] == 0:
        desc, patch_op, cla, net, logger = arch_class_00(
            dim_desc, dim_labels, param_arch, logger)
    elif param_arch['type'] == 1:
        desc, patch_op, cla, net, logger = arch_class_01(
            dim_desc, dim_labels, param_arch, logger)
    elif param_arch['type'] == 2:
        desc, patch_op, cla, net, logger = arch_class_02(
            dim_desc, dim_labels, param_arch, logger)
    else:
        raise Exception('[e] architecture not supported!')
    print(("%02.2fs" % (time.time() - tic)))

    # cost function definition:
    print(("[i] cost function definition... "), end=' ')
    tic = time.time()
    pred = LL.get_output(cla, deterministic=True)  # in case we use dropout
    feat = LL.get_output(net)
    target = T.ivector('target')
    # data term
    if param_cost['cost_func'] == 'cross_entropy':
        if param_arch['non_linearity'] == 'softmax':
            cost_dataterm = T.mean(
                LO.categorical_crossentropy(pred, target)
            )  # in the original code we were using *.mean() instead of T.mean(*)
        elif param_arch['non_linearity'] == 'log_softmax':
            cost_dataterm = T.mean(
                categorical_crossentropy_logdomain(pred, target))
    elif param_cost['cost_func'] == 'cross_entropy_stable':
        if param_arch['non_linearity'] == 'softmax':
            cost_dataterm = T.mean(
                categorical_crossentropy_stable(pred, target))
        else:
            raise Exception(
                '[e] the chosen cost function is not implemented for the chosen non-linearity!'
            )
    else:
        raise Exception('[e] the chosen cost function is not supported!')
    # classification accuracy
    acc = LO.categorical_accuracy(pred, target).mean()
    # regularization
    cost_reg = param_cost['mu'] * LR.regularize_network_params(cla, LR.l2)
    # cost function
    cost = cost_dataterm + cost_reg
    # get params
    params = LL.get_all_params(cla)
    # gradient definition
    grad = T.grad(cost, params)
    grad_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grad]), 2)
    print(("%02.2fs" % (time.time() - tic)))

    # updates definition:
    print(("[i] gradient updates definition... "), end=' ')
    tic = time.time()
    if param_updates['method'] == 'momentum':
        if param_updates.get('learning_rate') is not None:
            learning_rate = param_updates['learning_rate']  # default: 1.0
        else:
            raise Exception('[e] missing learning_rate parameter!')
        if param_updates.get('momentum') is not None:
            momentum = param_updates['momentum']  # default: 0.9
        else:
            raise Exception('[e] missing momentum parameter!')
        updates = LU.momentum(grad, params, learning_rate, momentum)
    elif param_updates['method'] == 'adagrad':
        if param_updates.get('learning_rate') is not None:
            learning_rate = param_updates['learning_rate']  # default: 1.0
        else:
            raise Exception('[e] missing learning_rate parameter!')
        updates = LU.adagrad(grad, params, learning_rate)
    elif param_updates['method'] == 'adadelta':
        if param_updates.get('learning_rate') is not None:
            learning_rate = param_updates['learning_rate']  # default: 1.0
        else:
            raise Exception('[e] missing learning_rate parameter!')
        updates = LU.adadelta(grad, params, learning_rate)
    elif param_updates['method'] == 'adam':
        if param_updates.get('learning_rate') is not None:
            learning_rate = param_updates['learning_rate']  # default: 1e-03
        else:
            raise Exception('[e] missing learning_rate parameter!')
        if param_updates.get('beta1') is not None:
            beta1 = param_updates['beta1']  # default: 0.9
        else:
            raise Exception('[e] missing beta1 parameter!')
        if param_updates.get('beta2') is not None:
            beta2 = param_updates['beta2']  # default: 0.999
        else:
            raise Exception('[e] missing beta2 parameter!')
        if param_updates.get('epsilon') is not None:
            epsilon = param_updates['epsilon']  # default: 1e-08
        else:
            raise Exception('[e] missing epsilon parameter!')
        updates = LU.adam(grad, params, learning_rate, beta1, beta2, epsilon)
    else:
        raise Exception('[e] updates method not supported!')
    print(("%02.2fs" % (time.time() - tic)))

    # train / test functions:
    funcs = dict()
    print(("[i] compiling function 'train'... "), end=' ')
    tic = time.time()
    funcs['train'] = theano.function(
        [desc.input_var, patch_op.input_var, target],
        [cost, cost_dataterm, cost_reg, grad_norm, acc],
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='warn')
    print(("%02.2fs" % (time.time() - tic)))
    print(("[i] compiling function 'fwd'... "), end=' ')
    tic = time.time()
    funcs['fwd'] = theano.function(
        [desc.input_var, patch_op.input_var, target], [cost, grad_norm, acc],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print(("%02.2fs" % (time.time() - tic)))
    print(("[i] compiling function 'pred'... "), end=' ')
    tic = time.time()
    funcs['pred'] = theano.function(
        [desc.input_var, patch_op.input_var, target], [pred],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print(("%02.2fs" % (time.time() - tic)))
    print(("[i] compiling function 'feat'... "), end=' ')
    tic = time.time()
    funcs['feat'] = theano.function(
        [desc.input_var, patch_op.input_var, target], [feat],
        allow_input_downcast=True,
        on_unused_input='ignore')
    print(("%02.2fs" % (time.time() - tic)))

    # save cost function parameters to a config file
    logger.info('\nCost function parameters:')
    logger.info('   cost function = %s' % param_cost['cost_func'])
    logger.info('   mu            = %e' % param_cost['mu'])

    # save updates parameters to a config file
    logger.info('\nUpdates parameters:')
    logger.info('   method        = %s' % param_updates['method'])
    logger.info('   learning rate = %e' % param_updates['learning_rate'])
    if param_updates['method'] == 'momentum':
        logger.info('   momentum      = %e' % param_updates['momentum'])
    if param_updates['method'] == 'adam':
        logger.info('   beta1         = %e' % param_updates['beta1'])
        logger.info('   beta2         = %e' % param_updates['beta2'])
        logger.info('   epsilon       = %e' % param_updates['epsilon'])

    # save training parameters to a config file
    logger.info('\nTraining parameters:')
    logger.info('   epoch size = %d' % ds.epoch_size)

    return funcs, cla, updates
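A sketch (values purely illustrative) of the param_updates dict that drives the update selection above; every branch raises if learning_rate is missing, and the adam branch additionally requires beta1, beta2 and epsilon.

param_updates = {'method': 'adagrad', 'learning_rate': 1.0}
# for the adam branch:
# param_updates = {'method': 'adam', 'learning_rate': 1e-03,
#                  'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-08}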
Example #14
def run_dnn(learning_rate=0.001, dnn_strategy='mix', possitive_punishment=1):
    #input_var = T.TensorType('float32', ((False,) * 3))()        # Notice the () at the end
    input_var = T.ftensor3('X')
    target_var = T.imatrix('y')

    features_type = 16
    perioid = 20
    features_dim = features_type * perioid
    network = build_mix(input_var, 1, features_type, features_dim, perioid, activity=sigmoid)
    if dnn_strategy == 'dnn':
        build_dnn(input_var, 1, features_type, features_dim, perioid, activity=sigmoid)
    elif dnn_strategy == 'conv1d':
        build_conv1d(input_var, 1, features_type, features_dim, perioid, activity=sigmoid)
    elif dnn_strategy == 'cascade':
        build_cascade(input_var, 1, features_type, features_dim, perioid, activity=sigmoid)
    elif dnn_strategy == 'lstm':
        build_lstm(input_var, 1, features_type, features_dim, perioid, activity=sigmoid)
    elif dnn_strategy == 'partitioned':
        build_partitioned(input_var, 1, features_type, features_dim, perioid, activity=sigmoid)
    elif dnn_strategy == 'mix':
        pass
    else:
        raise AttributeError("This dnn_strategy is not supported!")

    l_output = get_output(network)
    loss = self_binary_crossentropy(l_output, target_var, possitive_punishment=possitive_punishment).mean()
    train_acc = binary_accuracy(l_output, target_var).mean()
    all_params = get_all_params(network, trainable=True)
    updates = adagrad(loss, all_params, learning_rate=learning_rate)
    train = theano.function([input_var, target_var], [loss, train_acc], updates=updates)

    test_prediction = get_output(network, deterministic=True)
    test_loss = self_binary_crossentropy(test_prediction, target_var, possitive_punishment=possitive_punishment).mean()
    test_acc = binary_accuracy(test_prediction, target_var).mean()

    #calculate win rate
    win_rate_result1 = []
    win_rate_result2 = []
    for win_rate_threhold in [0.5, 0.6, 0.7, 0.8, 0.9]:
        tmp1 = T.sum(T.switch(T.and_(T.gt(test_prediction, win_rate_threhold), T.eq(target_var, 1)), 1, 0), dtype=theano.config.floatX)
        tmp2 = T.sum(T.switch(T.gt(test_prediction, win_rate_threhold), 1, 0), dtype=theano.config.floatX)
        test_win_rate = (tmp1 + 0.00001) / (tmp2 + 0.00001)
        win_rate_result1.append(test_win_rate)
        win_rate_result2.append(tmp1)

    val = theano.function([input_var, target_var], [test_prediction, test_loss, test_acc, T.as_tensor_variable(win_rate_result1), T.as_tensor_variable(win_rate_result2)])

    _, _, _, _, X_train, y_train, X_val, y_val, _, _ = load_dataset('../../data/800core')
    '''
    test_data_list = []
    test_label_list = []
    for ix in range(103):
        file_name = '../../data/test_dis/data_' + str(ix) + '.txt'
        tmp_test_data, tmp_test_label, _, _, _, _, _, _ = load_dataset(file_name)
        test_data_list.append(tmp_test_data)
        test_label_list.append(tmp_test_label)
    '''

    num_epochs = 150
    batch_size = 128
    for epoch in xrange(num_epochs):
        train_err = 0
        train_acc = 0
        train_batches = 0
        start_time = time.time()

        #train
        for batch in iterate_minibatches(X_train, y_train, batch_size):
            inputs, targets = batch
            err, acc= train(inputs, targets)
            train_err += err
            train_acc += acc
            train_batches += 1

        #validate
        _, val_err, val_acc, val_wr1, val_wr2 = val(X_val, y_val)

        # Then we print the results for this epoch:
        for ix in range(len([0.5, 0.6, 0.7, 0.8, 0.9])):
            sys.stdout.write("  validation win rate :\t\t{}\n".format(val_wr1[ix]))
            sys.stdout.write("  validation possitive num:\t\t{}\n".format(val_wr2[ix]))
        sys.stdout.write("Epoch {} of {} took {:.3f}s\n".format(
            epoch + 1, num_epochs, time.time() - start_time))
        sys.stdout.write("  training loss:\t\t{}\n".format(train_err / train_batches))
        sys.stdout.write("  training accuracy:\t\t{}\n".format(train_acc / train_batches))
        sys.stdout.write("  validation loss:\t\t{}\n".format(val_err/1))
        sys.stdout.write("  validation accuracy:\t\t{} %\n".format(val_acc * 100))
        sys.stdout.write('\n')
        sys.stdout.flush()

        #store for gpu
        with open('../../model/' + dnn_strategy + '/' + 'learning_rate' + str(learning_rate) + '_punishment' + str(possitive_punishment) + '_epoch' + str(epoch) + '.model', 'w') as f:
            cPickle.dump(network, f, protocol=cPickle.HIGHEST_PROTOCOL)

    print 'Done!'
Example #15
cost_d = -T.mean(T.log(prob_T)) - T.mean(T.log(1. - prob_F))

#### generator
cost_g = -T.mean(T.log(prob_F))

#### cost variables to be considered
cost_str = ['E_T', 'E_F', 'P_T', 'P_F']
cost_var = [E_T, E_F, P_T, P_F]

############################
# Gradient & Optimization
############################
#### Parameter updates
# updates_d = sgd(cost_d, params_d, lr)
# updates_g = sgd(cost_g, params_g, lr)
updates_d = adagrad(cost_d, params_d, lr)
updates_g = adagrad(cost_g, params_g, lr)

#### Batchnorm state updates
batchnorm_updates_d = disc_model.get_updates()
print '# D batchnorm updates: %d' % (len(batchnorm_updates_d))

batchnorm_updates_g = gen_model.get_updates()
print '# G batchnorm updates: %d' % (len(batchnorm_updates_g))

#### Collect all updates
updates_d.update(batchnorm_updates_d)
updates_g.update(batchnorm_updates_g)

updates_all = OrderedDict()
updates_all.update(updates_d)
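A reduced sketch (toy shared variables, not from the source) of the pattern above: Lasagne update rules return an OrderedDict, so extra updates such as batch-norm statistics can be merged in with dict.update() before compiling.

from collections import OrderedDict
import theano

w = theano.shared(1.0)
counter = theano.shared(0.0)

updates = OrderedDict([(w, w - 0.1)])        # stand-in for adagrad(cost_d, params_d, lr)
updates.update({counter: counter + 1.0})     # stand-in for the batchnorm updates
step = theano.function([], [], updates=updates)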
Example #16
def multi_task_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats, lambda_val = 0.5 * 1e-4):

    print("Building multi task model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen-kw+1
    stride = 1 
    filter_size=wordDim
    pool_size=num_filters

    input = InputLayer((None, seqlen, num_feats),input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats*wordDim))


    conv1d_1 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_1 = MaxPool1DLayer(conv1d_1, pool_size=pool_size)  
    hid_1 = DenseLayer(maxpool_1, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_1 = DenseLayer(hid_1, num_units=2, nonlinearity=softmax)


    conv1d_2 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_2 = MaxPool1DLayer(conv1d_2, pool_size=pool_size)  
    hid_2 = DenseLayer(maxpool_2, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_2 = DenseLayer(hid_2, num_units=4, nonlinearity=softmax)

    conv1d_3 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_3 = MaxPool1DLayer(conv1d_3, pool_size=pool_size)  
    hid_3 = DenseLayer(maxpool_3, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_3 = DenseLayer(hid_3, num_units=3, nonlinearity=softmax)

    conv1d_4 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_4 = MaxPool1DLayer(conv1d_4, pool_size=pool_size)  
    hid_4 = DenseLayer(maxpool_4, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_4 = DenseLayer(hid_4, num_units=3, nonlinearity=softmax)

    conv1d_5 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_5 = MaxPool1DLayer(conv1d_5, pool_size=pool_size)  
    hid_5 = DenseLayer(maxpool_5, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_5 = DenseLayer(hid_5, num_units=2, nonlinearity=softmax)

    conv1d_6 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_6 = MaxPool1DLayer(conv1d_6, pool_size=pool_size)  
    hid_6 = DenseLayer(maxpool_6, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_6 = DenseLayer(hid_6, num_units=4, nonlinearity=softmax)


    conv1d_7 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_7 = MaxPool1DLayer(conv1d_7, pool_size=pool_size)  
    hid_7 = DenseLayer(maxpool_7, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_7 = DenseLayer(hid_7, num_units=3, nonlinearity=softmax)

    conv1d_8 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_8 = MaxPool1DLayer(conv1d_8, pool_size=pool_size)  
    hid_8 = DenseLayer(maxpool_8, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_8 = DenseLayer(hid_8, num_units=3, nonlinearity=softmax)


    # Is this important?
    network_1_out, network_2_out, network_3_out, network_4_out, \
    network_5_out, network_6_out, network_7_out, network_8_out = \
    get_output([network_1, network_2, network_3, network_4, network_5, network_6, network_7, network_8])

    loss_1 = T.mean(binary_crossentropy(network_1_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_1:lambda_val, 
                hid_1:lambda_val, network_1:lambda_val} , l2)
    updates_1 = adagrad(loss_1, get_all_params(network_1, trainable=True), learning_rate=args.step)
    train_fn_1 = theano.function([input_var, target_var], loss_1, updates=updates_1, allow_input_downcast=True)
    val_acc_1 =  T.mean(binary_accuracy(get_output(network_1, deterministic=True), target_var))
    val_fn_1 = theano.function([input_var, target_var], val_acc_1, allow_input_downcast=True)


    loss_2 = T.mean(categorical_crossentropy(network_2_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_2:lambda_val, 
                hid_2:lambda_val, network_2:lambda_val} , l2)
    updates_2 = adagrad(loss_2, get_all_params(network_2, trainable=True), learning_rate=args.step)
    train_fn_2 = theano.function([input_var, target_var], loss_2, updates=updates_2, allow_input_downcast=True)
    val_acc_2 =  T.mean(categorical_accuracy(get_output(network_2, deterministic=True), target_var))
    val_fn_2 = theano.function([input_var, target_var], val_acc_2, allow_input_downcast=True)


    loss_3 = T.mean(categorical_crossentropy(network_3_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_3:lambda_val, 
                hid_3:lambda_val, network_3:lambda_val} , l2)
    updates_3 = adagrad(loss_3, get_all_params(network_3, trainable=True), learning_rate=args.step)
    train_fn_3 = theano.function([input_var, target_var], loss_3, updates=updates_3, allow_input_downcast=True)
    val_acc_3 =  T.mean(categorical_accuracy(get_output(network_3, deterministic=True), target_var))
    val_fn_3 = theano.function([input_var, target_var], val_acc_3, allow_input_downcast=True)


    loss_4 = T.mean(categorical_crossentropy(network_4_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_4:lambda_val, 
                hid_4:lambda_val, network_4:lambda_val} , l2)
    updates_4 = adagrad(loss_4, get_all_params(network_4, trainable=True), learning_rate=args.step)
    train_fn_4 = theano.function([input_var, target_var], loss_4, updates=updates_4, allow_input_downcast=True)
    val_acc_4 =  T.mean(categorical_accuracy(get_output(network_4, deterministic=True), target_var))
    val_fn_4 = theano.function([input_var, target_var], val_acc_4, allow_input_downcast=True)

    loss_5 = T.mean(binary_crossentropy(network_5_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_5:lambda_val, 
                hid_5:lambda_val, network_5:lambda_val} , l2)
    updates_5 = adagrad(loss_5, get_all_params(network_5, trainable=True), learning_rate=args.step)
    train_fn_5 = theano.function([input_var, target_var], loss_5, updates=updates_5, allow_input_downcast=True)
    val_acc_5 =  T.mean(binary_accuracy(get_output(network_5, deterministic=True), target_var))
    val_fn_5 = theano.function([input_var, target_var], val_acc_5, allow_input_downcast=True)

    loss_6 = T.mean(categorical_crossentropy(network_6_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_6:lambda_val, 
                hid_6:lambda_val, network_6:lambda_val} , l2)
    updates_6 = adagrad(loss_6, get_all_params(network_6, trainable=True), learning_rate=args.step)
    train_fn_6 = theano.function([input_var, target_var], loss_6, updates=updates_6, allow_input_downcast=True)
    val_acc_6 =  T.mean(categorical_accuracy(get_output(network_6, deterministic=True), target_var))
    val_fn_6 = theano.function([input_var, target_var], val_acc_6, allow_input_downcast=True)

    loss_7 = T.mean(categorical_crossentropy(network_7_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_7:lambda_val, 
                hid_7:lambda_val, network_7:lambda_val} , l2)
    updates_7 = adagrad(loss_7, get_all_params(network_7, trainable=True), learning_rate=args.step)
    train_fn_7 = theano.function([input_var, target_var], loss_7, updates=updates_7, allow_input_downcast=True)
    val_acc_7 =  T.mean(categorical_accuracy(get_output(network_7, deterministic=True), target_var))
    val_fn_7 = theano.function([input_var, target_var], val_acc_7, allow_input_downcast=True)

    loss_8 = T.mean(categorical_crossentropy(network_8_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_8:lambda_val, 
                hid_8:lambda_val, network_8:lambda_val} , l2)
    updates_8 = adagrad(loss_8, get_all_params(network_8, trainable=True), learning_rate=args.step)
    train_fn_8 = theano.function([input_var, target_var], loss_8, updates=updates_8, allow_input_downcast=True)
    val_acc_8 =  T.mean(categorical_accuracy(get_output(network_8, deterministic=True), target_var))
    val_fn_8 = theano.function([input_var, target_var], val_acc_8, allow_input_downcast=True)


    return train_fn_1, val_fn_1, network_1, train_fn_2, val_fn_2, network_2, train_fn_3, val_fn_3, \
            network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \
            train_fn_6, val_fn_6, network_6, train_fn_7, val_fn_7, network_7, train_fn_8, val_fn_8, network_8
Example #17
def event_span_classifier(args, input_var, input_mask_var, target_var, wordEmbeddings, seqlen):

    print("Building model with LSTM")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    GRAD_CLIP = wordDim

    args.lstmDim = 150

    input = InputLayer((None, seqlen),input_var=input_var)
    batchsize, seqlen = input.input_var.shape
    input_mask = InputLayer((None, seqlen),input_var=input_mask_var)
    
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb_1.W].remove('trainable')

    lstm = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask, grad_clipping=GRAD_CLIP,
        nonlinearity=tanh)

    lstm_back = LSTMLayer(
        emb, num_units=args.lstmDim, mask_input=input_mask, grad_clipping=GRAD_CLIP,
        nonlinearity=tanh, backwards=True)

    slice_forward = SliceLayer(lstm, indices=-1, axis=1) # out_shape (None, args.lstmDim)
    slice_backward = SliceLayer(lstm_back, indices=0, axis=1) # out_shape (None, args.lstmDim)

    concat = ConcatLayer([slice_forward, slice_backward])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    
    loss = T.mean(binary_crossentropy(prediction,target_var))
    lambda_val = 0.5 * 1e-4

    layers = {emb:lambda_val, lstm:lambda_val, hid:lambda_val, network:lambda_val} 
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty


    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise "Need set optimizer correctly"
 
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction,target_var))


    train_fn = theano.function([input_var, input_mask_var,target_var], 
        loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))

    val_fn = theano.function([input_var, input_mask_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
Example #18
l4=MaxPool2DLayer(l3,(2,2))
l5=Conv2DLayer(l4,96,(5,5),nonlinearity=very_leaky_rectify,W=GlorotUniform('relu'))
l6=MaxPool2DLayer(l5,(3,3))
l7=DenseLayer(l6,512,nonlinearity=very_leaky_rectify,W=lasagne.init.GlorotNormal())
#l7_5=cyclicpool(l7)
#l7_5=lasagne.layers.DropoutLayer(l7)
l8=DenseLayer(l7,2,nonlinearity=softmax)



rate=theano.shared(.0002)
params = lasagne.layers.get_all_params(l8)
prediction = lasagne.layers.get_output(l8)
loss = lasagne.objectives.categorical_crossentropy(prediction,y1)
loss = loss.mean()
updates_sgd = adagrad(loss, params, learning_rate=rate)
updates = apply_nesterov_momentum(updates_sgd, params, momentum=0.9)


train_model = theano.function([x1,y1],outputs=loss,updates=updates)

pred = theano.function([x1,y1],outputs=lasagne.objectives.categorical_crossentropy(prediction,y1))

# pred=theano.function([x1,y1],outputs=prediction,on_unused_input='ignore')

### begin to train
renewtrain=len(train_x)/batchsize
renewtest=len(test_x)/batchsize
for i in range(15000):
    if i>325 and i<3000:
        rate.set_value(.001)
Example #19
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):

    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen-kw+1
    stride = 1 

    #important context words as channels
 
    #CNN_sentence config
    filter_size=wordDim
    pool_size=seqlen-filter_size+1

    input = InputLayer((None, seqlen, num_feats),input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb.W].remove('trainable') #(batchsize, seqlen, wordDim)

    #print get_output_shape(emb)
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats*wordDim))
    #print get_output_shape(reshape)

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()) #nOutputFrame = num_flters, 
                                            #nOutputFrameSize = (num_feats*wordDim-filter_size)/stride +1

    #print get_output_shape(conv1d)

    conv1d = DimshuffleLayer(conv1d, (0,2,1))

    #print get_output_shape(conv1d)

    pool_size=num_filters

    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size) 

    #print get_output_shape(maxpool)
  
    #forward = FlattenLayer(maxpool) 

    #print get_output_shape(forward)
 
    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    
    loss = T.mean(binary_crossentropy(prediction,target_var))
    lambda_val = 0.5 * 1e-4

    layers = {emb:lambda_val, conv1d:lambda_val, hid:lambda_val, network:lambda_val} 
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty


    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise "Need set optimizer correctly"
 
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction,target_var))

    train_fn = theano.function([input_var, target_var], 
        loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
Example #20
def build_network_2dconv(args, input_var, target_var, wordEmbeddings, maxlen=60):

    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    input = InputLayer((None, maxlen), input_var=input_var)
    batchsize, seqlen = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb.params[emb.W].remove("trainable")  # (batchsize, maxlen, wordDim)

    reshape = ReshapeLayer(emb, (batchsize, 1, maxlen, wordDim))

    conv2d = Conv2DLayer(
        reshape,
        num_filters=num_filters,
        filter_size=(filter_size),
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, 34, 1)
    maxpool = MaxPool2DLayer(conv2d, pool_size=pool_size)  # (None, 100, 1, 1)

    forward = FlattenLayer(maxpool)  # (None, 100) #(None, 50400)

    hid = DenseLayer(forward, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    layers = {conv2d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise "Need set optimizer correctly"

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn
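A sketch (hypothetical argparse-style args and random embeddings) of how the 2D-convolution builder above might be instantiated; all names and sizes here are assumptions, not part of the source.

import numpy as np
import theano.tensor as T
from argparse import Namespace

args = Namespace(hiddenDim=50, optimizer="adagrad", step=0.01)
wordEmbeddings = np.random.randn(300, 5000).astype("float32")   # (wordDim, vocab_size)
input_var = T.imatrix("inputs")
target_var = T.fmatrix("targets")                               # one-hot targets, shape (batch, 2)

train_fn, val_fn = build_network_2dconv(args, input_var, target_var, wordEmbeddings, maxlen=60)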
Example #21
def marginal_likelihood(x, y, n_epochs=100, lrate=.1):
    gp_params = np.array([2., 10.])
    indep_noise = 0.3
    #gp_params = np.array([.4966, 148.41])
    #indep_noise = .0821

    t_log_gp_params = theano.shared(np.log(gp_params))
    t_log_indep_noise = theano.shared(np.log(indep_noise))

    def print_params():
        print t_log_gp_params.get_value()
        print t_log_indep_noise.get_value()

    print_params()

    t_x = T.vector('x')
    t_y = T.vector('y')
    t_gp_params = T.exp(t_log_gp_params)
    t_indep_noise = T.exp(t_log_indep_noise)
    x_col = t_x.dimshuffle(0, 'x')
    x_row = t_x.dimshuffle('x', 0)
    K = t_gp_params[0] * T.exp(-t_gp_params[1] * T.sqr(x_col - x_row))
    K = K + t_indep_noise * T.identity_like(K)
    y_Kinv_y = t_y.dot(T.nlinalg.matrix_inverse(K)).dot(t_y)
    #logdetK = T.log(T.nlinalg.det(K))
    logdetK = logabsdet(K)
    marginal_ll = -0.5 * (y_Kinv_y + logdetK)
    loss = -marginal_ll

    if batch_update:
        loss_fn = theano.function([t_x, t_y], loss)
        grad_loss_fn = theano.function(
            [t_x, t_y], theano.grad(loss,
                                    [t_log_gp_params, t_log_indep_noise]))

        n_params = len(gp_params) + 1

        def f_df(params):
            a, b, c = params
            t_log_gp_params.set_value([a, b])
            t_log_indep_noise.set_value(c)
            total_loss = 0
            grad = np.zeros(n_params)
            for each_x, each_y in izip(x, y):
                total_loss += loss_fn(each_x, each_y)
                grad += np.append(*grad_loss_fn(each_x, each_y))
            return total_loss, grad

        init_params = np.r_[gp_params, indep_noise]
        opt = fmin_l_bfgs_b(
            f_df,
            init_params,
            factr=1e3,
            pgtol=1e-07,
            disp=1,
        )[0]
        print opt
        opt_gp_params = opt[:-1]
        opt_indep_noise = opt[-1]
    else:
        # Stochastic optimization
        updates = adagrad(loss, [t_log_gp_params, t_log_indep_noise],
                          learning_rate=lrate)
        loss_fn = theano.function([t_x, t_y], loss, updates=updates)
        grad_loss_fn = theano.function(
            [t_x, t_y], theano.grad(loss,
                                    [t_log_gp_params, t_log_indep_noise]))

        for i in xrange(n_epochs):
            count = 1
            trace = []
            sys.stderr.write('%4d  ' % i)
            for each_x, each_y in izip(x, y):
                val = loss_fn(each_x, each_y)
                trace.append(val)
                count += 1
                if count % 20 == 0:
                    sys.stderr.write('.')
            print
            print np.mean(trace), np.std(trace)
            print_params()
        opt_gp_params = t_log_gp_params.get_value()
        opt_indep_noise = t_log_indep_noise.get_value()
    return opt_gp_params, opt_indep_noise
Example #22
def multi_task_classifier(args,
                          input_var,
                          target_var,
                          wordEmbeddings,
                          seqlen,
                          num_feats,
                          lambda_val=0.5 * 1e-4):

    print("Building multi task model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1
    stride = 1
    filter_size = wordDim
    pool_size = num_filters

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape

    #span
    emb1 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape1 = ReshapeLayer(emb1, (batchsize, seqlen, num_feats * wordDim))
    conv1d_1 = DimshuffleLayer(
        Conv1DLayer(reshape1,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_1 = MaxPool1DLayer(conv1d_1, pool_size=pool_size)
    hid_1 = DenseLayer(maxpool_1,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_1 = DenseLayer(hid_1, num_units=2, nonlinearity=softmax)
    """
    #DocTimeRel
    emb2 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape2 = ReshapeLayer(emb2, (batchsize, seqlen, num_feats*wordDim))
    conv1d_2 = DimshuffleLayer(Conv1DLayer(reshape2, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_2 = MaxPool1DLayer(conv1d_2, pool_size=pool_size)  
    hid_2 = DenseLayer(maxpool_2, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_2 = DenseLayer(hid_2, num_units=5, nonlinearity=softmax)
    """

    #Type
    emb3 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape3 = ReshapeLayer(emb3, (batchsize, seqlen, num_feats * wordDim))
    conv1d_3 = DimshuffleLayer(
        Conv1DLayer(reshape3,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_3 = MaxPool1DLayer(conv1d_3, pool_size=pool_size)
    hid_3 = DenseLayer(maxpool_3,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_3 = DenseLayer(hid_3, num_units=4, nonlinearity=softmax)

    #Degree
    emb4 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape4 = ReshapeLayer(emb4, (batchsize, seqlen, num_feats * wordDim))
    conv1d_4 = DimshuffleLayer(
        Conv1DLayer(reshape4,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_4 = MaxPool1DLayer(conv1d_4, pool_size=pool_size)
    hid_4 = DenseLayer(maxpool_4,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_4 = DenseLayer(hid_4, num_units=4, nonlinearity=softmax)

    #Polarity
    emb5 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape5 = ReshapeLayer(emb5, (batchsize, seqlen, num_feats * wordDim))
    conv1d_5 = DimshuffleLayer(
        Conv1DLayer(reshape5,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_5 = MaxPool1DLayer(conv1d_5, pool_size=pool_size)
    hid_5 = DenseLayer(maxpool_5,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_5 = DenseLayer(hid_5, num_units=3, nonlinearity=softmax)

    #ContextualModality
    emb6 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape6 = ReshapeLayer(emb6, (batchsize, seqlen, num_feats * wordDim))
    conv1d_6 = DimshuffleLayer(
        Conv1DLayer(reshape6,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_6 = MaxPool1DLayer(conv1d_6, pool_size=pool_size)
    hid_6 = DenseLayer(maxpool_6,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_6 = DenseLayer(hid_6, num_units=5, nonlinearity=softmax)
    """
    #ContextualAspect
    emb7 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape7 = ReshapeLayer(emb7, (batchsize, seqlen, num_feats*wordDim))
    conv1d_7 = DimshuffleLayer(Conv1DLayer(reshape7, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_7 = MaxPool1DLayer(conv1d_7, pool_size=pool_size)  
    hid_7 = DenseLayer(maxpool_7, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_7 = DenseLayer(hid_7, num_units=4, nonlinearity=softmax)
    """
    """
    #Permanence
    emb8 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape8 = ReshapeLayer(emb8, (batchsize, seqlen, num_feats*wordDim))
    conv1d_8 = DimshuffleLayer(Conv1DLayer(reshape8, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_8 = MaxPool1DLayer(conv1d_8, pool_size=pool_size)  
    hid_8 = DenseLayer(maxpool_8, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_8 = DenseLayer(hid_8, num_units=4, nonlinearity=softmax)
    """

    # Is this important?
    """
    network_1_out, network_2_out, network_3_out, network_4_out, \
    network_5_out, network_6_out, network_7_out, network_8_out = \
    get_output([network_1, network_2, network_3, network_4, network_5, network_6, network_7, network_8])
    """
    network_1_out = get_output(network_1)
    network_3_out = get_output(network_3)
    network_4_out = get_output(network_4)
    network_5_out = get_output(network_5)
    network_6_out = get_output(network_6)

    loss_1 = T.mean(binary_crossentropy(
        network_1_out, target_var)) + regularize_layer_params_weighted(
            {
                emb1: lambda_val,
                conv1d_1: lambda_val,
                hid_1: lambda_val,
                network_1: lambda_val
            }, l2)
    updates_1 = adagrad(loss_1,
                        get_all_params(network_1, trainable=True),
                        learning_rate=args.step)
    train_fn_1 = theano.function([input_var, target_var],
                                 loss_1,
                                 updates=updates_1,
                                 allow_input_downcast=True)
    val_acc_1 = T.mean(
        binary_accuracy(get_output(network_1, deterministic=True), target_var))
    val_fn_1 = theano.function([input_var, target_var],
                               val_acc_1,
                               allow_input_downcast=True)
    """
    loss_2 = T.mean(categorical_crossentropy(network_2_out,target_var)) + regularize_layer_params_weighted({emb2:lambda_val, conv1d_2:lambda_val, 
                hid_2:lambda_val, network_2:lambda_val} , l2)
    updates_2 = adagrad(loss_2, get_all_params(network_2, trainable=True), learning_rate=args.step)
    train_fn_2 = theano.function([input_var, target_var], loss_2, updates=updates_2, allow_input_downcast=True)
    val_acc_2 =  T.mean(categorical_accuracy(get_output(network_2, deterministic=True), target_var))
    val_fn_2 = theano.function([input_var, target_var], val_acc_2, allow_input_downcast=True)
    """

    loss_3 = T.mean(categorical_crossentropy(
        network_3_out, target_var)) + regularize_layer_params_weighted(
            {
                emb3: lambda_val,
                conv1d_3: lambda_val,
                hid_3: lambda_val,
                network_3: lambda_val
            }, l2)
    updates_3 = adagrad(loss_3,
                        get_all_params(network_3, trainable=True),
                        learning_rate=args.step)
    train_fn_3 = theano.function([input_var, target_var],
                                 loss_3,
                                 updates=updates_3,
                                 allow_input_downcast=True)
    val_acc_3 = T.mean(
        categorical_accuracy(get_output(network_3, deterministic=True),
                             target_var))
    val_fn_3 = theano.function([input_var, target_var],
                               val_acc_3,
                               allow_input_downcast=True)

    loss_4 = T.mean(categorical_crossentropy(
        network_4_out, target_var)) + regularize_layer_params_weighted(
            {
                emb4: lambda_val,
                conv1d_4: lambda_val,
                hid_4: lambda_val,
                network_4: lambda_val
            }, l2)
    updates_4 = adagrad(loss_4,
                        get_all_params(network_4, trainable=True),
                        learning_rate=args.step)
    train_fn_4 = theano.function([input_var, target_var],
                                 loss_4,
                                 updates=updates_4,
                                 allow_input_downcast=True)
    val_acc_4 = T.mean(
        categorical_accuracy(get_output(network_4, deterministic=True),
                             target_var))
    val_fn_4 = theano.function([input_var, target_var],
                               val_acc_4,
                               allow_input_downcast=True)

    loss_5 = T.mean(categorical_crossentropy(
        network_5_out, target_var)) + regularize_layer_params_weighted(
            {
                emb5: lambda_val,
                conv1d_5: lambda_val,
                hid_5: lambda_val,
                network_5: lambda_val
            }, l2)
    updates_5 = adagrad(loss_5,
                        get_all_params(network_5, trainable=True),
                        learning_rate=args.step)
    train_fn_5 = theano.function([input_var, target_var],
                                 loss_5,
                                 updates=updates_5,
                                 allow_input_downcast=True)
    val_acc_5 = T.mean(
        categorical_accuracy(get_output(network_5, deterministic=True),
                             target_var))
    val_fn_5 = theano.function([input_var, target_var],
                               val_acc_5,
                               allow_input_downcast=True)

    loss_6 = T.mean(categorical_crossentropy(
        network_6_out, target_var)) + regularize_layer_params_weighted(
            {
                emb6: lambda_val,
                conv1d_6: lambda_val,
                hid_6: lambda_val,
                network_6: lambda_val
            }, l2)
    updates_6 = adagrad(loss_6,
                        get_all_params(network_6, trainable=True),
                        learning_rate=args.step)
    train_fn_6 = theano.function([input_var, target_var],
                                 loss_6,
                                 updates=updates_6,
                                 allow_input_downcast=True)
    val_acc_6 = T.mean(
        categorical_accuracy(get_output(network_6, deterministic=True),
                             target_var))
    val_fn_6 = theano.function([input_var, target_var],
                               val_acc_6,
                               allow_input_downcast=True)
    """
    loss_7 = T.mean(categorical_crossentropy(network_7_out,target_var)) + regularize_layer_params_weighted({emb7:lambda_val, conv1d_7:lambda_val, 
                hid_7:lambda_val, network_7:lambda_val} , l2)
    updates_7 = adagrad(loss_7, get_all_params(network_7, trainable=True), learning_rate=args.step)
    train_fn_7 = theano.function([input_var, target_var], loss_7, updates=updates_7, allow_input_downcast=True)
    val_acc_7 =  T.mean(categorical_accuracy(get_output(network_7, deterministic=True), target_var))
    val_fn_7 = theano.function([input_var, target_var], val_acc_7, allow_input_downcast=True)

    loss_8 = T.mean(categorical_crossentropy(network_8_out,target_var)) + regularize_layer_params_weighted({emb8:lambda_val, conv1d_8:lambda_val, 
                hid_8:lambda_val, network_8:lambda_val} , l2)
    updates_8 = adagrad(loss_8, get_all_params(network_8, trainable=True), learning_rate=args.step)
    train_fn_8 = theano.function([input_var, target_var], loss_8, updates=updates_8, allow_input_downcast=True)
    val_acc_8 =  T.mean(categorical_accuracy(get_output(network_8, deterministic=True), target_var))
    val_fn_8 = theano.function([input_var, target_var], val_acc_8, allow_input_downcast=True)
    """
    """
    return train_fn_1, val_fn_1, network_1, train_fn_2, val_fn_2, network_2, train_fn_3, val_fn_3, \
            network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \
            train_fn_6, val_fn_6, network_6, train_fn_7, val_fn_7, network_7, train_fn_8, val_fn_8, network_8
    """
    return train_fn_1, val_fn_1, network_1, train_fn_3, val_fn_3, \
            network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \
            train_fn_6, val_fn_6, network_6
Example #23
0
    def update_params(self, loss, all_params):

        return adagrad(loss, all_params, self.learning_rate)
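To give this fragment some context, here is a minimal sketch of a class that owns update_params and uses it to compile an adagrad training function; the class name, constructor and compile_train_fn are assumptions, only the update_params body comes from the example.

import theano
import theano.tensor as T
from lasagne.layers import get_all_params, get_output
from lasagne.objectives import categorical_crossentropy
from lasagne.updates import adagrad

class AdagradTrainer(object):
    # hypothetical owner of update_params: holds the learning rate and compiles a training function
    def __init__(self, learning_rate=0.01):
        self.learning_rate = learning_rate

    def update_params(self, loss, all_params):
        return adagrad(loss, all_params, self.learning_rate)

    def compile_train_fn(self, network, input_var, target_var):
        prediction = get_output(network)
        loss = T.mean(categorical_crossentropy(prediction, target_var))
        params = get_all_params(network, trainable=True)
        updates = self.update_params(loss, params)
        return theano.function([input_var, target_var], loss, updates=updates,
                               allow_input_downcast=True)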
def build_network_2dconv(
    args, input1_var, input1_mask_var, input2_var, input2_mask_var, target_var, wordEmbeddings, maxlen=36
):

    print ("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100

    stride = 1

    # CNN_sentence config
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    # two conv pool layer
    # filter_size=(10, 100)
    # pool_size=(4,4)

    input_1 = InputLayer((None, maxlen), input_var=input1_var)
    batchsize, seqlen = input_1.input_var.shape
    # input_1_mask = InputLayer((None, maxlen),input_var=input1_mask_var)
    emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_1.params[emb_1.W].remove("trainable")  # (batchsize, maxlen, wordDim)

    reshape_1 = ReshapeLayer(emb_1, (batchsize, 1, maxlen, wordDim))

    conv2d_1 = Conv2DLayer(
        reshape_1,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, 34, 1)
    maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size)  # (None, 100, 1, 1)

    """
    filter_size_2=(4, 10)
    pool_size_2=(2,2)
    conv2d_1 = Conv2DLayer(maxpool_1, num_filters=num_filters, filter_size=filter_size_2, stride=stride, 
        nonlinearity=rectify,W=GlorotUniform()) #(None, 100, 34, 1)
    maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size_2) #(None, 100, 1, 1) (None, 100, 1, 20)
    """

    forward_1 = FlattenLayer(maxpool_1)  # (None, 100) #(None, 50400)

    input_2 = InputLayer((None, maxlen), input_var=input2_var)
    # input_2_mask = InputLayer((None, maxlen),input_var=input2_mask_var)
    emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_2.params[emb_2.W].remove("trainable")

    reshape_2 = ReshapeLayer(emb_2, (batchsize, 1, maxlen, wordDim))
    conv2d_2 = Conv2DLayer(
        reshape_2,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, 34, 1)
    maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size)  # (None, 100, 1, 1)

    """
    conv2d_2 = Conv2DLayer(maxpool_2, num_filters=num_filters, filter_size=filter_size_2, stride=stride, 
        nonlinearity=rectify,W=GlorotUniform()) #(None, 100, 34, 1)
    maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size_2) #(None, 100, 1, 1)
    """

    forward_2 = FlattenLayer(maxpool_2)  # (None, 100)

    # elementwise merge requires a fixed sequence length
    mul = ElemwiseMergeLayer([forward_1, forward_2], merge_function=T.mul)
    sub = AbsSubLayer([forward_1, forward_2], merge_function=T.sub)
    concat = ConcatLayer([mul, sub])

    # NOTE: this reassignment discards the mul/sub features above and feeds the
    # raw sentence vectors straight into the hidden layer
    concat = ConcatLayer([forward_1, forward_2])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    if args.task == "sts":
        network = DenseLayer(hid, num_units=5, nonlinearity=softmax)

    elif args.task == "ent":
        network = DenseLayer(hid, num_units=3, nonlinearity=softmax)

    # prediction = get_output(network, {input_1:input1_var, input_2:input2_var})
    prediction = get_output(network)

    loss = T.mean(categorical_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    layers = {conv2d_1: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise "Need set optimizer correctly"

    # test_prediction = get_output(network, {input_1:input1_var, input_2:input2_var}, deterministic=True)
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    """
    train_fn = theano.function([input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss, updates=updates, allow_input_downcast=True)
    """
    train_fn = theano.function([input1_var, input2_var, target_var], loss, updates=updates, allow_input_downcast=True)

    if args.task == "sts":
        """
        val_fn = theano.function([input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction], allow_input_downcast=True)
        """
        val_fn = theano.function(
            [input1_var, input2_var, target_var], [test_loss, test_prediction], allow_input_downcast=True
        )

    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))

        """
        val_fn = theano.function([input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc], allow_input_downcast=True)
        """
        val_fn = theano.function([input1_var, input2_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn
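A minimal sketch of how build_network_2dconv might be called; the args namespace, the embedding matrix, and the symbolic variable types are illustrative assumptions (the mask variables are accepted but unused by the compiled functions).

import argparse
import numpy as np
import theano.tensor as T

# hypothetical hyper-parameters covering the fields the builder reads from args
args = argparse.Namespace(hiddenDim=50, task="sts", optimizer="adagrad", step=0.01)

# assumed to be (wordDim, vocab_size); the builder transposes it for the embedding W
wordEmbeddings = np.random.randn(300, 10000).astype("float32")

input1_var = T.imatrix("input1")
input1_mask_var = T.matrix("input1_mask")
input2_var = T.imatrix("input2")
input2_mask_var = T.matrix("input2_mask")
target_var = T.fmatrix("targets")  # e.g. a distribution over 5 similarity bins for "sts"

train_fn, val_fn = build_network_2dconv(args, input1_var, input1_mask_var,
                                        input2_var, input2_mask_var, target_var,
                                        wordEmbeddings, maxlen=36)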
Example #25
0
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):

    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen-kw+1
    stride = 1 

    #important context words as channels
 
    #CNN_sentence config
    filter_size=wordDim
    pool_size=seqlen-filter_size+1

    input = InputLayer((None, seqlen, num_feats),input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb.W].remove('trainable') #(batchsize, seqlen, wordDim)

    #print get_output_shape(emb)
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats*wordDim))
    #print get_output_shape(reshape)

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()) #nOutputFrame = num_flters, 
                                            #nOutputFrameSize = (num_feats*wordDim-filter_size)/stride +1

    #print get_output_shape(conv1d)

    conv1d = DimshuffleLayer(conv1d, (0,2,1))

    #print get_output_shape(conv1d)

    pool_size=num_filters

    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size) 

    #print get_output_shape(maxpool)
  
    #forward = FlattenLayer(maxpool) 

    #print get_output_shape(forward)
 
    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    
    loss = T.mean(binary_crossentropy(prediction,target_var))
    lambda_val = 0.5 * 1e-4

    layers = {emb:lambda_val, conv1d:lambda_val, hid:lambda_val, network:lambda_val} 
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty


    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise "Need set optimizer correctly"
 
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction,target_var))

    train_fn = theano.function([input_var, target_var], 
        loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
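Likewise, a small sketch of calling event_span_classifier on synthetic word indices; shapes, dtypes, and hyper-parameters are assumptions chosen only for illustration.

import argparse
import numpy as np
import theano.tensor as T

# hypothetical configuration; only hiddenDim, optimizer and step are read from args here
args = argparse.Namespace(hiddenDim=50, optimizer="adagrad", step=0.01)
wordEmbeddings = np.random.randn(300, 10000).astype("float32")  # assumed (wordDim, vocab_size)
seqlen, num_feats = 16, 2

input_var = T.itensor3("input")    # (batch, seqlen, num_feats) word indices
target_var = T.fmatrix("targets")  # two-column binary targets, matching the 2-unit softmax

train_fn, val_fn, network = event_span_classifier(args, input_var, target_var,
                                                  wordEmbeddings, seqlen, num_feats)

X = np.random.randint(0, 10000, size=(8, seqlen, num_feats)).astype("int32")
y = np.eye(2, dtype="float32")[np.random.randint(0, 2, size=8)]  # one-hot target rows
loss = train_fn(X, y)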
Example #26
0
def logisticRegression(X, Y, l, **options):
    #%% imports
    from theano import function
    from theano import shared
    import numpy as np
    from numpy import random as rng
    import matplotlib.pyplot as plt
    from theano import tensor as Tn
    from lasagne.updates import adagrad
    plotFigure = options.get('plotFigure', False)
    verbose = options.get('verbose', False)
    #%% load data
    l = np.array(l).astype(float)
    numObservations = len(Y)
    numFeatures = len(X) // numObservations  # integer division so the reshape below gets an int
    Y = np.squeeze(np.array(Y).astype('int'))
    X = np.reshape(np.array(X).astype('float'), (numObservations, numFeatures),
                   order='F')
    # remove any nans
    mskOutNans = (np.sum(np.isnan(X), axis=1) + np.squeeze(np.isnan(Y))) < 1
    X = X[mskOutNans, :]
    Y = Y[mskOutNans]
    numObservations, numFeatures = X.shape
    Data = (X, Y)
    # data tuple

    #%% declaration of symbolic variables and initializations
    # Declare theano symbolic input/output variables
    x = Tn.matrix('x')
    # symbolic feature variable
    y = Tn.vector('y')
    # symbolic label variable
    lambda_l2 = Tn.dscalar('lambda_l2')
    # symbolic l2-regularization parameter variable
    lambda_l1 = Tn.dscalar('lambda_l1')
    # symbolic l1-regularization parameter variable
    # Declare and initialize theano shared optimization variables
    w = shared(rng.randn(numFeatures), name='w')
    # initialize the weight vector (w) randomly
    b = shared(np.random.randn(), name='b')
    # initialize the bias variable (b) randomly
    clps = shared(0.5 * np.random.rand(), name='clps')
    # initialize the unconstrained lapse-rate variable (clps) randomly in (0, 0.5);
    # the actual lapse rate lps is derived from it below

    #%% functions expressions and compilations
    # function sympolic expressions
    XW = Tn.dot(x, w)
    L_Exp = Tn.shape(y)
    lps = 0.5 / (1. + Tn.exp(-10. * clps))
    prob1Expression = lps + (1 - 2 * lps) / (1.0 + Tn.exp(-(XW + b)))
    # expression of logistic function (prob of 1)
    prob0Expression = 1.0 - prob1Expression
    # expression of logistic function (prob of 0)
    predictExpression = prob1Expression > 0.5
    logLikelihood_Exp = (y * Tn.log(prob1Expression) +
                         (1.0 - y) * Tn.log(prob0Expression)).sum()
    # loglikelihood expression
    costExpression = -logLikelihood_Exp / (L_Exp[0]) + lambda_l2 * (
        w**2).sum() + lambda_l1 * abs(w).sum()
    # mean cost across all samples with the regularization
    perClassErExpression = abs(predictExpression -
                               y).sum() / Tn.shape(y)[0] * 100
    # percent classification error expression

    # compiling function expressions for speed
    prob1Fn = function(inputs=[x], outputs=prob1Expression)
    prob0Fn = function(inputs=[x], outputs=prob0Expression)
    predictFn = function(inputs=[x], outputs=predictExpression)
    costFn = function(inputs=[x, y, lambda_l2, lambda_l1],
                      outputs=costExpression)
    perClassErFn = function(inputs=[x, y], outputs=perClassErExpression)

    # cost gradient with respect to all parameters
    grad_w, grad_b, grad_clps = Tn.grad(costExpression, [w, b, clps])

    # adagrad is given the precomputed gradient list here instead of a scalar loss;
    # lasagne.updates accepts either form for its first argument
    updates = adagrad([grad_w, grad_b, grad_clps], [w, b, clps],
                      learning_rate=0.2,
                      epsilon=0.0001)

    # training function
    trainFn = function(
        inputs=[x, y, lambda_l2, lambda_l1],
        outputs=[costExpression, perClassErExpression, logLikelihood_Exp],
        updates=updates
        #updates = ((w, w - learnRate*grad_w), (b, b - learnRate*grad_b), (lps, Tn.clip((lps - learnRate*grad_lps) , 0.0, 0.4999999)))
    )

    #%% Training the model
    maxIter = 10000
    numRepetitions = 4
    w_0 = []
    b_0 = []
    clps_0 = []
    minCost = np.inf
    rbest = 0
    for r in range(numRepetitions):
        w_0.append(rng.randn(numFeatures))
        b_0.append(np.random.rand())
        clps_0.append(np.random.randn())

        w.set_value(w_0[r])
        b.set_value(b_0[r])
        clps.set_value(clps_0[r])
        b_i = [b.get_value()]
        lps_i = [lps.eval()]
        cost = []
        perClassEr = []
        lklhood_i = []
        for i in range(int(maxIter)):
            Er1, Er2, lklhood = trainFn(Data[0], Data[1], l[0], l[1])
            #lps.set_value(np.clip(lps.get_value(), 0., 0.5)) # enforce constraint
            lklhood_i.append(lklhood)
            cost.append(Er1)
            perClassEr.append(Er2)
            b_i.append(b.get_value())
            lps_i.append(lps.eval())
            if i > 500:
                if abs(cost[i - 100] - cost[i]) < (10.0**-6):
                    break
            if verbose:
                print('iteration %d, objective value %.5f, initial conditions %d out of %d' % (
                    i + 1, cost[i], r + 1, numRepetitions))
        if cost[-1] < minCost:
            rbest = r
            minCost = cost[-1]
            costbest = cost
            perClassErbest = perClassEr
            wbest = w.get_value()
            clpsbest = clps.get_value()
            bbest = b.get_value()
            bbest_i = b_i
            lpsbest_i = lps_i
            lklhoodBest_i = lklhood_i

    w_0 = w_0[rbest]
    cost = costbest
    perClassEr = perClassErbest
    w.set_value(wbest)
    clps.set_value(clpsbest)
    b.set_value(bbest)
    b_i = bbest_i
    lps_i = lpsbest_i
    lklhood_i = lklhoodBest_i

    #%% plot results
    #%%
    msk1 = Data[1] > 0
    msk0 = Data[1] < 1
    Y1 = Data[1][msk1]
    Y0 = Data[1][msk0]
    Yhat = predictFn(Data[0])
    Yhat1 = Yhat[msk1]
    Yhat0 = Yhat[msk0]
    prob1 = prob1Fn(Data[0])
    prob1_1 = prob1[msk1]
    prob1_0 = prob1[msk0]

    if plotFigure:

        plt.figure('cost')
        plt.subplot(3, 1, 1)
        plt.plot(cost)
        plt.xlabel('iteration')
        plt.ylabel('cross-entropy loss')
        plt.subplot(3, 1, 2)
        plt.plot(perClassEr)
        plt.ylim(0, 100)
        plt.xlabel('iteration')
        plt.ylabel('classification error (%)')
        plt.subplot(3, 1, 3)
        plt.plot(lklhood_i)
        plt.xlabel('iteration')
        plt.ylabel('log likelihood')

        plt.figure('prediction')
        plt.plot(np.arange(1, len(Y0) + 1), Yhat0, 'ro', label='prediction')
        plt.plot(np.arange(1,
                           len(Y0) + 1),
                 prob1_0,
                 'b.',
                 label='likelihood of success')
        plt.plot(np.arange(len(Y0) + 1, len(Yhat) + 1), Yhat1, 'ro')
        plt.plot(np.arange(len(Y0) + 1, len(Yhat) + 1), prob1_1, 'b.')
        plt.plot(np.arange(1, len(Y0) + 1), Y0, 'k.', label='data')
        plt.plot(np.arange(len(Y0) + 1, len(Yhat) + 1), Y1, 'k.')

        plt.ylim(-0.1, 1.1)
        plt.xticks([0, 1])
        plt.xlabel('observation')
        plt.ylabel('class label')
        plt.legend()

        plt.figure('weights')
        plt.subplot(3, 1, 1)
        plt.plot(w_0, 'g')
        plt.plot(w.get_value(), 'r')
        plt.xlabel('feature number')
        plt.ylabel('feature weight')
        plt.legend(('before optimization', 'after optimization'))
        plt.subplot(3, 1, 2)
        plt.plot(b_i)
        plt.xlabel('iteration')
        plt.ylabel('bias')
        plt.subplot(3, 1, 3)
        plt.plot(lps_i)
        plt.ylim(0., 1.)
        plt.xlabel('iteration')
        plt.ylabel('lapse rate')

    #%% save optimization parameters to class
    class optParamsClass:
        cost_per_iter = np.inf
        perClassEr_per_iter = 100.
        loglikelihood = -np.inf
        prediction = []
        likelihood_trial = []
        predictFn = []
        perClassErFn = []
        probSucessFn = []

        def description(self):
            return 'object contains optimization parameters'

    optParams = optParamsClass()
    optParams.cost_per_iter = np.array(cost)
    optParams.perClassEr_per_iter = np.array(perClassEr)
    optParams.loglikelihood = lklhood_i
    optParams.prediction = Yhat
    optParams.likelihood_trial = prob1
    optParams.predictFn = predictFn
    optParams.perClassErFn = perClassErFn
    optParams.probSucessFn = prob1Fn
    #%% return parameters
    return (w.get_value(), b.get_value(), lps.eval(), perClassEr[-1], cost[-1],
            optParams)
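Finally, a hedged usage sketch for logisticRegression on synthetic data; the column-major flattening mirrors the order='F' reshape the function performs internally, and the regularization strengths and data are arbitrary placeholders.

import numpy as np

rng = np.random.RandomState(0)
n_obs, n_feat = 200, 2
Xmat = rng.randn(n_obs, n_feat)
Y = (Xmat[:, 0] + 0.5 * Xmat[:, 1] + 0.1 * rng.randn(n_obs) > 0).astype(int)
X = Xmat.flatten(order='F')  # flat, column-major, as logisticRegression expects

w, b, lapse, perClassEr, finalCost, optParams = logisticRegression(
    X, Y, [0.01, 0.001], plotFigure=False, verbose=False)
print("final classification error: %.2f%%" % perClassEr)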