def get_updates(nnet, train_obj, trainable_params):

    implemented_solvers = ("nesterov", "adagrad", "adadelta", "adam")

    if not hasattr(nnet, "solver") or nnet.solver not in implemented_solvers:
        nnet.sgd_solver = "nesterov"
    else:
        nnet.sgd_solver = nnet.solver

    if nnet.sgd_solver == "nesterov":
        updates = l_updates.nesterov_momentum(train_obj, trainable_params,
                                              learning_rate=Cfg.learning_rate,
                                              momentum=0.9)
    elif nnet.sgd_solver == "adagrad":
        updates = l_updates.adagrad(train_obj, trainable_params,
                                    learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "adadelta":
        updates = l_updates.adadelta(train_obj, trainable_params,
                                     learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "adam":
        updates = l_updates.adam(train_obj, trainable_params,
                                 learning_rate=Cfg.learning_rate)

    return updates
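# --- Illustrative sketch (not part of the original code): what the adagrad()
# calls used throughout these snippets compute. Adagrad keeps a per-parameter
# accumulator of squared gradients and divides each step by its square root,
# so frequently-updated parameters get smaller effective learning rates.
# Minimal numpy version; the toy objective below is hypothetical.
import numpy as np

def adagrad_step(param, grad, accu, learning_rate=1.0, epsilon=1e-6):
    """One Adagrad update: accu += grad**2; param -= lr * grad / sqrt(accu + eps)."""
    accu = accu + grad ** 2
    param = param - learning_rate * grad / np.sqrt(accu + epsilon)
    return param, accu

# toy usage: minimise f(w) = (w - 3)^2, whose gradient is 2 * (w - 3)
w, accu = 0.0, 0.0
for _ in range(100):
    w, accu = adagrad_step(w, 2.0 * (w - 3.0), accu, learning_rate=1.0)
# w is now close to 3.0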
def create_train_func(layers, lr=0.01):
    # dims: batch, sequence, vocabulary
    X = T.tensor3('X')
    X_batch = T.tensor3('X_batch')

    # dims: target
    y = T.ivector('y')
    y_batch = T.ivector('y_batch')

    y_hat = get_output(layers['l_out'], X, deterministic=False)
    train_loss = T.mean(categorical_crossentropy(y_hat, y), axis=0)

    params = get_all_params(layers['l_out'], trainable=True)
    updates = adagrad(train_loss, params, lr)

    train_func = theano.function(
        inputs=[theano.In(X_batch), theano.In(y_batch)],
        outputs=train_loss,
        updates=updates,
        givens={
            X: X_batch,
            y: y_batch,
        },
    )

    return train_func
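# --- Illustrative sketch (assumes only theano is installed; not part of the
# original code): the givens= argument used above substitutes one symbolic
# variable for another at compile time, so a graph built on X/y is actually
# fed through X_batch/y_batch.
import theano
import theano.tensor as T

x = T.dvector('x')
x_batch = T.dvector('x_batch')
f = theano.function([x_batch], (x ** 2).sum(), givens={x: x_batch})
print(f([1.0, 2.0, 3.0]))  # 14.0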
def generate_theano_func(args, network, penalty, input_dict, target_var):

    prediction = get_output(network, input_dict)

    # loss = T.mean( target_var * ( T.log(target_var) - prediction ))
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    # loss += 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(network) )
    # penalty = sum ( T.sum(lstm_param**2) for lstm_param in lstm_params )
    # penalty = regularize_layer_params(l_forward_1_lstm, l2)
    # penalty = T.sum(lstm_param**2 for lstm_param in lstm_params)
    # penalty = 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(l_forward_1) )

    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Optimizer is not set correctly")

    test_prediction = get_output(network, input_dict, deterministic=True)
    # test_prediction = get_output(network, deterministic=True)
    # test_loss = T.mean( target_var * ( T.log(target_var) - test_prediction))
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )
    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )

    return train_fn, val_fn
def build_updates(grad, params, optimization, learning_rate):
    """ setup optimization algorithm """

    if optimization['optimizer'] == 'sgd':
        update_op = updates.sgd(grad, params, learning_rate=learning_rate)

    elif optimization['optimizer'] == 'nesterov_momentum':
        if 'momentum' in optimization:
            momentum = optimization['momentum']
        else:
            momentum = 0.9
        update_op = updates.nesterov_momentum(grad, params,
                                              learning_rate=learning_rate,
                                              momentum=momentum)

    elif optimization['optimizer'] == 'adagrad':
        update_op = updates.adagrad(grad, params, learning_rate=learning_rate)

    elif optimization['optimizer'] == 'rmsprop':
        if 'rho' in optimization:
            rho = optimization['rho']
        else:
            rho = 0.9
        update_op = updates.rmsprop(grad, params, learning_rate=learning_rate, rho=rho)

    elif optimization['optimizer'] == 'adam':
        if 'beta1' in optimization:
            beta1 = optimization['beta1']
        else:
            beta1 = 0.9
        if 'beta2' in optimization:
            beta2 = optimization['beta2']
        else:
            beta2 = 0.999
        update_op = updates.adam(grad, params, learning_rate=learning_rate,
                                 beta1=beta1, beta2=beta2)

    return update_op
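# --- Hypothetical usage sketch for build_updates() above (not part of the
# original code; assumes theano, lasagne and numpy are installed and that the
# function is in scope together with its `from lasagne import updates` import;
# the shared variable and loss below are toy stand-ins).
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.zeros(3), name='w')
x = T.dvector('x')
loss = T.sum((w - x) ** 2)
grad = T.grad(loss, [w])

optimization = {'optimizer': 'rmsprop', 'rho': 0.95}
update_op = build_updates(grad, [w], optimization, learning_rate=0.01)
step = theano.function([x], loss, updates=update_op)
step([1.0, 2.0, 3.0])  # one optimisation step on the toy loss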
def optimize(grads, params):
    if state['optim_method'] == 'adam':
        updates = adam(grads, params, lrt, state['momentum'])
    elif state['optim_method'] == 'adagrad':
        updates = adagrad(grads, params, lrt)
    elif state['optim_method'] == 'sgd':
        updates = sgd(grads, params, lrt)
    return updates
def adagrad_momentum(grads, params, learning_rate=1.0, momentum=0.9, epsilon=1e-06):
    return apply_nesterov_momentum(adagrad(grads, params, learning_rate, epsilon),
                                   params=params, momentum=momentum)
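# --- Hypothetical usage sketch for adagrad_momentum() above (not part of the
# original code; assumes theano and lasagne are installed and the helper plus
# its lasagne.updates imports are in scope; the objective is a toy example).
import theano

w = theano.shared(5.0, name='w')
loss = (w - 2.0) ** 2
updates = adagrad_momentum(loss, [w], learning_rate=0.5, momentum=0.9)
step = theano.function([], loss, updates=updates)
for _ in range(200):
    step()
# w.get_value() should have moved from 5.0 toward the optimum at 2.0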
def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix):
    batch_index = T.iscalar('batch_index')
    X_batch = X_tensor_type('x')
    y_batch = T.ivector('y')

    batch_slice = slice(batch_index * self.batch_size,
                        (batch_index + 1) * self.batch_size)

    objective = Objective(output_layer, loss_function=categorical_crossentropy)

    loss_train = objective.get_loss(X_batch, target=y_batch)
    loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True)

    pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1)
    proba = output_layer.get_output(X_batch, deterministic=True)
    accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX)

    all_params = get_all_params(output_layer)
    updates = adagrad(loss_train, all_params, self.lr, self.epsilon)

    iter_train = theano.function(
        [batch_index], loss_train,
        updates=updates,
        givens={
            X_batch: dataset['X_train'][batch_slice],
            y_batch: dataset['y_train'][batch_slice],
        },
        on_unused_input='ignore',
    )

    iter_valid = None
    if self.use_valid:
        iter_valid = theano.function(
            [batch_index], [loss_eval, accuracy, proba],
            givens={
                X_batch: dataset['X_valid'][batch_slice],
                y_batch: dataset['y_valid'][batch_slice],
            },
        )

    return dict(train=iter_train, valid=iter_valid)
def test_workflow(self):
    input_var = theano.shared(self.x)
    with pm.Model() as model:
        inp = layers.InputLayer(self.x.shape, input_var=input_var)
        out = BayesDenseLayer(inp, 1, nonlinearity=to.identity)
        pm.Normal('y', mu=get_output(out), sd=self.sd, observed=self.y)
    elbo, _, upd_rng, vp = sample_elbo(model, samples=1)
    upd_adam = updates.adagrad(-elbo, vp.params)
    upd_rng.update(upd_adam)
    step = theano.function([], elbo, updates=upd_rng)
    for i in range(100):
        step()
    self.assertRaises(ValueError, get_output, out, deterministic=True)
    preds = get_output(out, vp=vp, deterministic=True)
    np.testing.assert_allclose(preds.eval(), self.y, rtol=0, atol=1)
def get_cost_updates(self, corruption_level, learning_rate, noise=0.0, momentum=0):
    """ This function computes the cost and the updates for one training step of the dA """

    tilde_x = self.get_corrupted_input(self.x, corruption_level, noise)
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)

    # cross-entropy reconstruction cost, summed over features and averaged over the minibatch
    L = - T.sum(self.desired * T.log(z) + (1 - self.desired) * T.log(1 - z), axis=1)
    cost = T.mean(L)

    # adagrad with momentum on cost
    updates_ada = adagrad(cost, self.params, learning_rate=learning_rate)
    updates = apply_momentum(updates_ada, self.params, momentum=momentum)

    return (cost, updates)
def get_updates(nnet, train_obj, trainable_params, solver=None):

    implemented_solvers = ("sgd", "momentum", "nesterov", "adagrad", "rmsprop",
                           "adadelta", "adam", "adamax")

    if solver not in implemented_solvers:
        nnet.sgd_solver = "adam"
    else:
        nnet.sgd_solver = solver

    if nnet.sgd_solver == "sgd":
        updates = l_updates.sgd(train_obj, trainable_params,
                                learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "momentum":
        updates = l_updates.momentum(train_obj, trainable_params,
                                     learning_rate=Cfg.learning_rate,
                                     momentum=Cfg.momentum)
    elif nnet.sgd_solver == "nesterov":
        updates = l_updates.nesterov_momentum(train_obj, trainable_params,
                                              learning_rate=Cfg.learning_rate,
                                              momentum=Cfg.momentum)
    elif nnet.sgd_solver == "adagrad":
        updates = l_updates.adagrad(train_obj, trainable_params,
                                    learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "rmsprop":
        updates = l_updates.rmsprop(train_obj, trainable_params,
                                    learning_rate=Cfg.learning_rate, rho=Cfg.rho)
    elif nnet.sgd_solver == "adadelta":
        updates = l_updates.adadelta(train_obj, trainable_params,
                                     learning_rate=Cfg.learning_rate, rho=Cfg.rho)
    elif nnet.sgd_solver == "adam":
        updates = l_updates.adam(train_obj, trainable_params,
                                 learning_rate=Cfg.learning_rate)
    elif nnet.sgd_solver == "adamax":
        updates = l_updates.adamax(train_obj, trainable_params,
                                   learning_rate=Cfg.learning_rate)

    return updates
def create_iter_functions(self, dataset, output_layer, X_tensor_type=T.matrix):
    batch_index = T.iscalar('batch_index')
    X_batch = X_tensor_type('x')
    y_batch = T.ivector('y')

    batch_slice = slice(batch_index * self.batch_size,
                        (batch_index + 1) * self.batch_size)

    objective = Objective(output_layer, loss_function=categorical_crossentropy)

    loss_train = objective.get_loss(X_batch, target=y_batch)
    loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True)

    pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1)
    proba = output_layer.get_output(X_batch, deterministic=True)
    accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX)

    all_params = get_all_params(output_layer)
    updates = adagrad(loss_train, all_params, self.lr, self.rho)

    iter_train = theano.function(
        [batch_index], loss_train,
        updates=updates,
        givens={
            X_batch: dataset['X_train'][batch_slice],
            y_batch: dataset['y_train'][batch_slice],
        },
        on_unused_input='ignore',
    )

    iter_valid = None
    if self.use_valid:
        iter_valid = theano.function(
            [batch_index], [loss_eval, accuracy, proba],
            givens={
                X_batch: dataset['X_valid'][batch_slice],
                y_batch: dataset['y_valid'][batch_slice],
            },
        )

    return dict(train=iter_train, valid=iter_valid)
def __init__(self, num_units=2):
    """
    :param num_units: output vector dimension
    """
    def loss_func(all_predicted):
        def distance_sq(x1, x2):
            return T.sum(T.sqr(x1 - x2))

        d1 = distance_sq(all_predicted[:, 0], all_predicted[:, 1])
        d2 = distance_sq(all_predicted[:, 0], all_predicted[:, 2])
        alpha = 1e-2
        return T.maximum(d1 + alpha, 0) - T.maximum(d2 + alpha, 0)

    super().__init__(
        sound_shape=(20, 100),
        num_units=num_units,
        main_layer_class=Speech2VecLayer,
        loss_func=loss_func,
        updates_func=lambda loss, weights: adagrad(loss, weights, learning_rate=0.001)
    )
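# --- Illustrative sketch (numpy only; not part of the original code): the loss
# above sees predictions shaped (batch, 3, num_units); index 0 is compared with
# indices 1 and 2, which presumably hold a matching and a non-matching example
# (an assumption, the class itself does not document this). The quantities it
# works with are the two summed squared distances:
import numpy as np

all_predicted = np.random.randn(4, 3, 2)                        # (batch, triplet, num_units)
d1 = np.sum((all_predicted[:, 0] - all_predicted[:, 1]) ** 2)   # index 0 vs. index 1
d2 = np.sum((all_predicted[:, 0] - all_predicted[:, 2]) ** 2)   # index 0 vs. index 2
alpha = 1e-2
loss = max(d1 + alpha, 0) - max(d2 + alpha, 0)                  # mirrors loss_func above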
def model_class(ds, paths, param_arch, param_cost, param_updates, param_train): # create a log file containing the architecture configuration formatter = logging.Formatter('%(message)s') logger = logging.getLogger('log_config') if 'start_from_epoch' in param_train: name_tmp = 'config_from_epoch=%04d.log' % ( param_train['start_from_epoch']) else: name_tmp = 'config.log' path_tmp = os.path.join(paths['exp'], name_tmp) if not os.path.isfile(path_tmp): handler = logging.FileHandler( path_tmp, mode='w') # to append at the end of the file use: mode='a' else: raise Exception('[e] the log file ', name_tmp, ' already exists!') handler.setFormatter(formatter) handler.setLevel(logging.INFO) logger.addHandler(handler) logger.setLevel(logging.INFO) # input dimensions dim_desc = ds.descs_train[0].shape[1] dim_labels = ds.labels_train[0].shape[0] print(dim_labels) # architecture definition: print(("[i] architecture definition... "), end=' ') tic = time.time() if param_arch['type'] == 0: desc, patch_op, cla, net, logger = arch_class_00( dim_desc, dim_labels, param_arch, logger) elif param_arch['type'] == 1: desc, patch_op, cla, net, logger = arch_class_01( dim_desc, dim_labels, param_arch, logger) elif param_arch['type'] == 2: desc, patch_op, cla, net, logger = arch_class_02( dim_desc, dim_labels, param_arch, logger) else: raise Exception('[e] architecture not supported!') print(("%02.2fs" % (time.time() - tic))) # cost function definition: print(("[i] cost function definition... "), end=' ') tic = time.time() pred = LL.get_output(cla, deterministic=True) # in case we use dropout feat = LL.get_output(net) target = T.ivector('target') # data term if param_cost['cost_func'] == 'cross_entropy': if param_arch['non_linearity'] == 'softmax': cost_dataterm = T.mean( LO.categorical_crossentropy(pred, target) ) # in the original code we were using *.mean() instead of T.mean(*) elif param_arch['non_linearity'] == 'log_softmax': cost_dataterm = T.mean( categorical_crossentropy_logdomain(pred, target)) elif param_cost['cost_func'] == 'cross_entropy_stable': if param_arch['non_linearity'] == 'softmax': cost_dataterm = T.mean( categorical_crossentropy_stable(pred, target)) else: raise Exception( '[e] the chosen cost function is not implemented for the chosen non-linearity!' ) else: raise Exception('[e] the chosen cost function is not supported!') # classification accuracy acc = LO.categorical_accuracy(pred, target).mean() # regularization cost_reg = param_cost['mu'] * LR.regularize_network_params(cla, LR.l2) # cost function cost = cost_dataterm + cost_reg # get params params = LL.get_all_params(cla) # gradient definition grad = T.grad(cost, params) grad_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grad]), 2) print(("%02.2fs" % (time.time() - tic))) # updates definition: print(("[i] gradient updates definition... 
"), end=' ') tic = time.time() if param_updates['method'] == 'momentum': if param_updates.get('learning_rate') is not None: learning_rate = param_updates['learning_rate'] # default: 1.0 else: raise Exception('[e] missing learning_rate parameter!') if param_updates.get('momentum') is not None: momentum = param_updates['momentum'] # default: 0.9 else: raise Exception('[e] missing learning_rate parameter!') updates = LU.momentum(grad, params, learning_rate, momentum) elif param_updates['method'] == 'adagrad': if param_updates.get('learning_rate') is not None: learning_rate = param_updates['learning_rate'] # default: 1.0 else: raise Exception('[e] missing learning_rate parameter!') updates = LU.adagrad(grad, params, learning_rate) elif param_updates['method'] == 'adadelta': if param_updates.get('learning_rate') is not None: learning_rate = param_updates['learning_rate'] # default: 1.0 else: raise Exception('[e] missing learning_rate parameter!') updates = LU.adadelta(grad, params, learning_rate) elif param_updates['method'] == 'adam': if param_updates.get('learning_rate') is not None: learning_rate = param_updates['learning_rate'] # default: 1e-03 else: raise Exception('[e] missing learning_rate parameter!') if param_updates.get('beta1') is not None: beta1 = param_updates['beta1'] # default: 0.9 else: raise Exception('[e] missing beta1 parameter!') if param_updates.get('beta2') is not None: beta2 = param_updates['beta2'] # default: 0.999 else: raise Exception('[e] missing beta2 parameter!') if param_updates.get('epsilon') is not None: epsilon = param_updates['epsilon'] # default: 1e-08 else: raise Exception('[e] missing epsilon parameter!') updates = LU.adam(grad, params, learning_rate, beta1, beta2, epsilon) else: raise Exception('[e] updates method not supported!') print(("%02.2fs" % (time.time() - tic))) # train / test functions: funcs = dict() print(("[i] compiling function 'train'... "), end=' ') tic = time.time() funcs['train'] = theano.function( [desc.input_var, patch_op.input_var, target], [cost, cost_dataterm, cost_reg, grad_norm, acc], updates=updates, allow_input_downcast=True, on_unused_input='warn') print(("%02.2fs" % (time.time() - tic))) print(("[i] compiling function 'fwd'... "), end=' ') tic = time.time() funcs['fwd'] = theano.function( [desc.input_var, patch_op.input_var, target], [cost, grad_norm, acc], allow_input_downcast=True, on_unused_input='ignore') print(("%02.2fs" % (time.time() - tic))) print(("[i] compiling function 'pred'... "), end=' ') tic = time.time() funcs['pred'] = theano.function( [desc.input_var, patch_op.input_var, target], [pred], allow_input_downcast=True, on_unused_input='ignore') print(("%02.2fs" % (time.time() - tic))) print(("[i] compiling function 'feat'... 
"), end=' ') tic = time.time() funcs['feat'] = theano.function( [desc.input_var, patch_op.input_var, target], [feat], allow_input_downcast=True, on_unused_input='ignore') print(("%02.2fs" % (time.time() - tic))) # save cost function parameters to a config file logger.info('\nCost function parameters:') logger.info(' cost function = %s' % param_cost['cost_func']) logger.info(' mu = %e' % param_cost['mu']) # save updates parameters to a config file logger.info('\nUpdates parameters:') logger.info(' method = %s' % param_updates['method']) logger.info(' learning rate = %e' % param_updates['learning_rate']) if param_updates['method'] == 'momentum': logger.info(' momentum = %e' % param_updates['momentum']) if param_updates['method'] == 'adam': logger.info(' beta1 = %e' % param_updates['beta1']) logger.info(' beta2 = %e' % param_updates['beta2']) logger.info(' epsilon = %e' % param_updates['epsilon']) # save training parameters to a config file logger.info('\nTraining parameters:') logger.info(' epoch size = %d' % ds.epoch_size) return funcs, cla, updates
def run_dnn(learning_rate=0.001, dnn_strategy='mix', possitive_punishment=1): #input_var = T.TensorType('float32', ((False,) * 3))() # Notice the () at the end input_var = T.ftensor3('X') target_var = T.imatrix('y') features_type = 16 perioid = 20 features_dim = features_type * perioid network = build_mix(input_var, 1, features_type, features_dim, perioid, activity=sigmoid) if dnn_strategy == 'dnn': build_dnn(input_var, 1, features_type, features_dim, perioid, activity=sigmoid) elif dnn_strategy == 'conv1d': build_conv1d(input_var, 1, features_type, features_dim, perioid, activity=sigmoid) elif dnn_strategy == 'cascade': build_cascade(input_var, 1, features_type, features_dim, perioid, activity=sigmoid) elif dnn_strategy == 'lstm': build_lstm(input_var, 1, features_type, features_dim, perioid, activity=sigmoid) elif dnn_strategy == 'partitioned': build_partitioned(input_var, 1, features_type, features_dim, perioid, activity=sigmoid) elif dnn_strategy == 'mix': pass else: raise AttributeError("This dnn_strategy is not supported!") l_output = get_output(network) loss = self_binary_crossentropy(l_output, target_var, possitive_punishment=possitive_punishment).mean() train_acc = binary_accuracy(l_output, target_var).mean() all_params = get_all_params(network, trainable=True) updates = adagrad(loss, all_params, learning_rate=learning_rate) train = theano.function([input_var, target_var], [loss, train_acc], updates=updates) test_prediction = get_output(network, deterministic=True) test_loss = self_binary_crossentropy(test_prediction, target_var, possitive_punishment=possitive_punishment).mean() test_acc = binary_accuracy(test_prediction, target_var).mean() #calculate win rate win_rate_result1 = [] win_rate_result2 = [] for win_rate_threhold in [0.5, 0.6, 0.7, 0.8, 0.9]: tmp1 = T.sum(T.switch(T.and_(T.gt(test_prediction, win_rate_threhold), T.eq(target_var, 1)), 1, 0), dtype=theano.config.floatX) tmp2 = T.sum(T.switch(T.gt(test_prediction, win_rate_threhold), 1, 0), dtype=theano.config.floatX) test_win_rate = (tmp1 + 0.00001) / (tmp2 + 0.00001) win_rate_result1.append(test_win_rate) win_rate_result2.append(tmp1) val = theano.function([input_var, target_var], [test_prediction, test_loss, test_acc, T.as_tensor_variable(win_rate_result1), T.as_tensor_variable(win_rate_result2)]) _, _, _, _, X_train, y_train, X_val, y_val, _, _ = load_dataset('../../data/800core') ''' test_data_list = [] test_label_list = [] for ix in range(103): file_name = '../../data/test_dis/data_' + str(ix) + '.txt' tmp_test_data, tmp_test_label, _, _, _, _, _, _ = load_dataset(file_name) test_data_list.append(tmp_test_data) test_label_list.append(tmp_test_label) ''' num_epochs = 150 batch_size = 128 for epoch in xrange(num_epochs): train_err = 0 train_acc = 0 train_batches = 0 start_time = time.time() #train for batch in iterate_minibatches(X_train, y_train, batch_size): inputs, targets = batch err, acc= train(inputs, targets) train_err += err train_acc += acc train_batches += 1 #validate _, val_err, val_acc, val_wr1, val_wr2 = val(X_val, y_val) # Then we print the results for this epoch: for ix in range(len([0.5, 0.6, 0.7, 0.8, 0.9])): sys.stdout.write(" validation win rate :\t\t{}\n".format(val_wr1[ix])) sys.stdout.write(" validation possitive num:\t\t{}\n".format(val_wr2[ix])) sys.stdout.write("Epoch {} of {} took {:.3f}s\n".format( epoch + 1, num_epochs, time.time() - start_time)) sys.stdout.write(" training loss:\t\t{}\n".format(train_err / train_batches)) sys.stdout.write(" training accuracy:\t\t{}\n".format(train_acc / 
train_batches)) sys.stdout.write(" validation loss:\t\t{}\n".format(val_err/1)) sys.stdout.write(" validation accuracy:\t\t{} %\n".format(val_acc * 100)) sys.stdout.write('\n') sys.stdout.flush() #sotre for gpu with open('../../model/' + dnn_strategy + '/' + 'learning_rate' + str(learning_rate) + '_punishment' + str(possitive_punishment) + '_epoch' + str(epoch) + '.model', 'w') as f: cPickle.dump(network, f, protocol=cPickle.HIGHEST_PROTOCOL) print 'Done!'
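# --- Illustrative sketch (numpy only; not part of the original code): the
# "win rate" reported above is the precision of predictions that exceed a
# confidence threshold, with a small constant guarding against division by zero.
import numpy as np

def win_rate(pred, target, threshold):
    hits = np.sum((pred > threshold) & (target == 1))   # confident and correct
    picks = np.sum(pred > threshold)                     # confident overall
    return (hits + 1e-5) / (picks + 1e-5)

pred = np.array([0.95, 0.7, 0.55, 0.2])
target = np.array([1, 0, 1, 1])
print(win_rate(pred, target, 0.5))  # 2 of the 3 confident picks are correct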
cost_d = -T.mean(T.log(prob_T)) - T.mean(T.log(1. - prob_F))

#### generator
cost_g = -T.mean(T.log(prob_F))

#### cost variables to be considered
cost_str = ['E_T', 'E_F', 'P_T', 'P_F']
cost_var = [E_T, E_F, P_T, P_F]

############################
# Gradient & Optimization
############################

#### Parameter updates
# updates_d = sgd(cost_d, params_d, lr)
# updates_g = sgd(cost_g, params_g, lr)
updates_d = adagrad(cost_d, params_d, lr)
updates_g = adagrad(cost_g, params_g, lr)

#### Batchnorm state updates
batchnorm_updates_d = disc_model.get_updates()
print '# D batchnorm updates: %d' % (len(batchnorm_updates_d))
batchnorm_updates_g = gen_model.get_updates()
print '# G batchnorm updates: %d' % (len(batchnorm_updates_g))

#### Collect all updates
updates_d.update(batchnorm_updates_d)
updates_g.update(batchnorm_updates_g)

updates_all = OrderedDict()
updates_all.update(updates_d)
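# --- Illustrative sketch (standard library only; not part of the original
# code): the update dictionaries returned by adagrad() and the batch-norm
# state updates are both ordinary (Ordered)Dicts, so .update() simply merges
# them; a later entry with the same key would overwrite an earlier one.
from collections import OrderedDict

updates_params = OrderedDict([('W', 'W_new')])            # stand-in for adagrad updates
updates_bn = OrderedDict([('running_mean', 'mean_new')])  # stand-in for batch-norm state
updates_params.update(updates_bn)
print(list(updates_params.keys()))  # ['W', 'running_mean']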
def multi_task_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats, lambda_val = 0.5 * 1e-4): print("Building multi task model with 1D Convolution") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] kw = 2 num_filters = seqlen-kw+1 stride = 1 filter_size=wordDim pool_size=num_filters input = InputLayer((None, seqlen, num_feats),input_var=input_var) batchsize, _, _ = input.input_var.shape emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats*wordDim)) conv1d_1 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_1 = MaxPool1DLayer(conv1d_1, pool_size=pool_size) hid_1 = DenseLayer(maxpool_1, num_units=args.hiddenDim, nonlinearity=sigmoid) network_1 = DenseLayer(hid_1, num_units=2, nonlinearity=softmax) conv1d_2 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_2 = MaxPool1DLayer(conv1d_2, pool_size=pool_size) hid_2 = DenseLayer(maxpool_2, num_units=args.hiddenDim, nonlinearity=sigmoid) network_2 = DenseLayer(hid_2, num_units=4, nonlinearity=softmax) conv1d_3 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_3 = MaxPool1DLayer(conv1d_3, pool_size=pool_size) hid_3 = DenseLayer(maxpool_3, num_units=args.hiddenDim, nonlinearity=sigmoid) network_3 = DenseLayer(hid_3, num_units=3, nonlinearity=softmax) conv1d_4 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_4 = MaxPool1DLayer(conv1d_4, pool_size=pool_size) hid_4 = DenseLayer(maxpool_4, num_units=args.hiddenDim, nonlinearity=sigmoid) network_4 = DenseLayer(hid_4, num_units=3, nonlinearity=softmax) conv1d_5 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_5 = MaxPool1DLayer(conv1d_5, pool_size=pool_size) hid_5 = DenseLayer(maxpool_5, num_units=args.hiddenDim, nonlinearity=sigmoid) network_5 = DenseLayer(hid_5, num_units=2, nonlinearity=softmax) conv1d_6 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_6 = MaxPool1DLayer(conv1d_6, pool_size=pool_size) hid_6 = DenseLayer(maxpool_6, num_units=args.hiddenDim, nonlinearity=sigmoid) network_6 = DenseLayer(hid_6, num_units=4, nonlinearity=softmax) conv1d_7 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_7 = MaxPool1DLayer(conv1d_7, pool_size=pool_size) hid_7 = DenseLayer(maxpool_7, num_units=args.hiddenDim, nonlinearity=sigmoid) network_7 = DenseLayer(hid_7, num_units=3, nonlinearity=softmax) conv1d_8 = DimshuffleLayer(Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_8 = MaxPool1DLayer(conv1d_8, pool_size=pool_size) hid_8 = DenseLayer(maxpool_8, num_units=args.hiddenDim, nonlinearity=sigmoid) network_8 = DenseLayer(hid_8, num_units=3, nonlinearity=softmax) # Is this important? 
network_1_out, network_2_out, network_3_out, network_4_out, \ network_5_out, network_6_out, network_7_out, network_8_out = \ get_output([network_1, network_2, network_3, network_4, network_5, network_6, network_7, network_8]) loss_1 = T.mean(binary_crossentropy(network_1_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_1:lambda_val, hid_1:lambda_val, network_1:lambda_val} , l2) updates_1 = adagrad(loss_1, get_all_params(network_1, trainable=True), learning_rate=args.step) train_fn_1 = theano.function([input_var, target_var], loss_1, updates=updates_1, allow_input_downcast=True) val_acc_1 = T.mean(binary_accuracy(get_output(network_1, deterministic=True), target_var)) val_fn_1 = theano.function([input_var, target_var], val_acc_1, allow_input_downcast=True) loss_2 = T.mean(categorical_crossentropy(network_2_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_2:lambda_val, hid_2:lambda_val, network_2:lambda_val} , l2) updates_2 = adagrad(loss_2, get_all_params(network_2, trainable=True), learning_rate=args.step) train_fn_2 = theano.function([input_var, target_var], loss_2, updates=updates_2, allow_input_downcast=True) val_acc_2 = T.mean(categorical_accuracy(get_output(network_2, deterministic=True), target_var)) val_fn_2 = theano.function([input_var, target_var], val_acc_2, allow_input_downcast=True) loss_3 = T.mean(categorical_crossentropy(network_3_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_3:lambda_val, hid_3:lambda_val, network_3:lambda_val} , l2) updates_3 = adagrad(loss_3, get_all_params(network_3, trainable=True), learning_rate=args.step) train_fn_3 = theano.function([input_var, target_var], loss_3, updates=updates_3, allow_input_downcast=True) val_acc_3 = T.mean(categorical_accuracy(get_output(network_3, deterministic=True), target_var)) val_fn_3 = theano.function([input_var, target_var], val_acc_3, allow_input_downcast=True) loss_4 = T.mean(categorical_crossentropy(network_4_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_4:lambda_val, hid_4:lambda_val, network_4:lambda_val} , l2) updates_4 = adagrad(loss_4, get_all_params(network_4, trainable=True), learning_rate=args.step) train_fn_4 = theano.function([input_var, target_var], loss_4, updates=updates_4, allow_input_downcast=True) val_acc_4 = T.mean(categorical_accuracy(get_output(network_4, deterministic=True), target_var)) val_fn_4 = theano.function([input_var, target_var], val_acc_4, allow_input_downcast=True) loss_5 = T.mean(binary_crossentropy(network_5_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_5:lambda_val, hid_5:lambda_val, network_5:lambda_val} , l2) updates_5 = adagrad(loss_5, get_all_params(network_5, trainable=True), learning_rate=args.step) train_fn_5 = theano.function([input_var, target_var], loss_5, updates=updates_5, allow_input_downcast=True) val_acc_5 = T.mean(binary_accuracy(get_output(network_5, deterministic=True), target_var)) val_fn_5 = theano.function([input_var, target_var], val_acc_5, allow_input_downcast=True) loss_6 = T.mean(categorical_crossentropy(network_6_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_6:lambda_val, hid_6:lambda_val, network_6:lambda_val} , l2) updates_6 = adagrad(loss_6, get_all_params(network_6, trainable=True), learning_rate=args.step) train_fn_6 = theano.function([input_var, target_var], loss_6, updates=updates_6, allow_input_downcast=True) val_acc_6 = T.mean(categorical_accuracy(get_output(network_6, 
deterministic=True), target_var)) val_fn_6 = theano.function([input_var, target_var], val_acc_6, allow_input_downcast=True) loss_7 = T.mean(categorical_crossentropy(network_7_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_7:lambda_val, hid_7:lambda_val, network_7:lambda_val} , l2) updates_7 = adagrad(loss_7, get_all_params(network_7, trainable=True), learning_rate=args.step) train_fn_7 = theano.function([input_var, target_var], loss_7, updates=updates_7, allow_input_downcast=True) val_acc_7 = T.mean(categorical_accuracy(get_output(network_7, deterministic=True), target_var)) val_fn_7 = theano.function([input_var, target_var], val_acc_7, allow_input_downcast=True) loss_8 = T.mean(categorical_crossentropy(network_8_out,target_var)) + regularize_layer_params_weighted({emb:lambda_val, conv1d_8:lambda_val, hid_8:lambda_val, network_8:lambda_val} , l2) updates_8 = adagrad(loss_8, get_all_params(network_8, trainable=True), learning_rate=args.step) train_fn_8 = theano.function([input_var, target_var], loss_8, updates=updates_8, allow_input_downcast=True) val_acc_8 = T.mean(categorical_accuracy(get_output(network_8, deterministic=True), target_var)) val_fn_8 = theano.function([input_var, target_var], val_acc_8, allow_input_downcast=True) return train_fn_1, val_fn_1, network_1, train_fn_2, val_fn_2, network_2, train_fn_3, val_fn_3, \ network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \ train_fn_6, val_fn_6, network_6, train_fn_7, val_fn_7, network_7, train_fn_8, val_fn_8, network_8
def event_span_classifier(args, input_var, input_mask_var, target_var, wordEmbeddings, seqlen):

    print("Building model with LSTM")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]
    GRAD_CLIP = wordDim

    args.lstmDim = 150

    input = InputLayer((None, seqlen), input_var=input_var)
    batchsize, seqlen = input.input_var.shape
    input_mask = InputLayer((None, seqlen), input_var=input_mask_var)

    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb_1.W].remove('trainable')

    lstm = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask,
                     grad_clipping=GRAD_CLIP, nonlinearity=tanh)
    lstm_back = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask,
                          grad_clipping=GRAD_CLIP, nonlinearity=tanh, backwards=True)

    slice_forward = SliceLayer(lstm, indices=-1, axis=1)  # out_shape (None, args.lstmDim)
    slice_backward = SliceLayer(lstm_back, indices=0, axis=1)  # out_shape (None, args.lstmDim)

    concat = ConcatLayer([slice_forward, slice_backward])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    layers = {emb: lambda_val, lstm: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Optimizer is not set correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, input_mask_var, target_var], loss,
                               updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, input_mask_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn, network
l4 = MaxPool2DLayer(l3, (2, 2))
l5 = Conv2DLayer(l4, 96, (5, 5), nonlinearity=very_leaky_rectify, W=GlorotUniform('relu'))
l6 = MaxPool2DLayer(l5, (3, 3))
l7 = DenseLayer(l6, 512, nonlinearity=very_leaky_rectify, W=lasagne.init.GlorotNormal())
#l7_5=cyclicpool(l7)
#l7_5=lasagne.layers.DropoutLayer(l7)
l8 = DenseLayer(l7, 2, nonlinearity=softmax)

rate = theano.shared(.0002)
params = lasagne.layers.get_all_params(l8)
prediction = lasagne.layers.get_output(l8)
loss = lasagne.objectives.categorical_crossentropy(prediction, y1)
loss = loss.mean()
updates_sgd = adagrad(loss, params, learning_rate=rate)
updates = apply_nesterov_momentum(updates_sgd, params, momentum=0.9)

train_model = theano.function([x1, y1], outputs=loss, updates=updates)
pred = theano.function([x1, y1], outputs=lasagne.objectives.categorical_crossentropy(prediction, y1))
# pred=theano.function([x1,y1],outputs=prediction,on_unused_input='ignore')

### begin to train
renewtrain = len(train_x) / batchsize
renewtest = len(test_x) / batchsize
for i in range(15000):
    if i > 325 and i < 3000:
        rate.set_value(.001)
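# --- Illustrative sketch (assumes theano and lasagne; not part of the original
# code): because the learning rate above is a theano shared variable, it can be
# changed with set_value() after the training function has been compiled, which
# is how the schedule in the loop works. Toy example on a single parameter:
import theano
from lasagne.updates import adagrad

w = theano.shared(1.0, name='w')
rate = theano.shared(0.1)
loss = (w - 4.0) ** 2
step = theano.function([], loss, updates=adagrad(loss, [w], learning_rate=rate))
step()                 # this call uses learning rate 0.1
rate.set_value(0.01)   # later calls use 0.01 without recompiling
step()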
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):

    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1
    stride = 1

    #important context words as channels
    #CNN_sentence config
    filter_size = wordDim
    pool_size = seqlen - filter_size + 1

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb.W].remove('trainable') #(batchsize, seqlen, wordDim)
    #print get_output_shape(emb)
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))
    #print get_output_shape(reshape)

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1,
                         nonlinearity=tanh, W=GlorotUniform())
    #nOutputFrame = num_filters,
    #nOutputFrameSize = (num_feats*wordDim-filter_size)/stride +1
    #print get_output_shape(conv1d)

    conv1d = DimshuffleLayer(conv1d, (0, 2, 1))
    #print get_output_shape(conv1d)

    pool_size = num_filters
    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size)
    #print get_output_shape(maxpool)

    #forward = FlattenLayer(maxpool)
    #print get_output_shape(forward)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    layers = {emb: lambda_val, conv1d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Optimizer is not set correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss,
                               updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn, network
def build_network_2dconv(args, input_var, target_var, wordEmbeddings, maxlen=60):

    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    input = InputLayer((None, maxlen), input_var=input_var)
    batchsize, seqlen = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb.params[emb.W].remove("trainable")  # (batchsize, maxlen, wordDim)

    reshape = ReshapeLayer(emb, (batchsize, 1, maxlen, wordDim))

    conv2d = Conv2DLayer(
        reshape,
        num_filters=num_filters,
        filter_size=(filter_size),
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, 34, 1)
    maxpool = MaxPool2DLayer(conv2d, pool_size=pool_size)  # (None, 100, 1, 1)

    forward = FlattenLayer(maxpool)  # (None, 100) #(None, 50400)

    hid = DenseLayer(forward, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    layers = {conv2d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise ValueError("Optimizer is not set correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss,
                               updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn
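# --- Illustrative sketch (plain Python; not part of the original code): the
# filter and pool sizes above are chosen so that each filter spans the full
# word embedding and max-pooling then collapses the whole time axis, giving one
# value per filter ("valid" convolution output length = input - filter + 1).
# wordDim below is a hypothetical embedding size.
maxlen, wordDim, num_filters = 60, 300, 100
conv_out_rows = maxlen - 3 + 1                       # 58 time positions after the (3, wordDim) filter
conv_out_cols = wordDim - wordDim + 1                # 1, the filter covers the whole embedding
pooled_rows = conv_out_rows - (maxlen - 3 + 1) + 1   # 1, the pool spans all remaining positions
print((num_filters, conv_out_rows, conv_out_cols), pooled_rows)  # (100, 58, 1) 1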
def marginal_likelihood(x, y, n_epochs=100, lrate=.1): gp_params = np.array([2., 10.]) indep_noise = 0.3 #gp_params = np.array([.4966, 148.41]) #indep_noise = .0821 t_log_gp_params = theano.shared(np.log(gp_params)) t_log_indep_noise = theano.shared(np.log(indep_noise)) def print_params(): print t_log_gp_params.get_value() print t_log_indep_noise.get_value() print_params() t_x = T.vector('x') t_y = T.vector('y') t_gp_params = T.exp(t_log_gp_params) t_indep_noise = T.exp(t_log_indep_noise) x_col = t_x.dimshuffle(0, 'x') x_row = t_x.dimshuffle('x', 0) K = t_gp_params[0] * T.exp(-t_gp_params[1] * T.sqr(x_col - x_row)) K = K + t_indep_noise * T.identity_like(K) y_Kinv_y = t_y.dot(T.nlinalg.matrix_inverse(K)).dot(t_y) #logdetK = T.log(T.nlinalg.det(K)) logdetK = logabsdet(K) marginal_ll = -0.5 * (y_Kinv_y + logdetK) loss = -marginal_ll if batch_update: loss_fn = theano.function([t_x, t_y], loss) grad_loss_fn = theano.function( [t_x, t_y], theano.grad(loss, [t_log_gp_params, t_log_indep_noise])) n_params = len(gp_params) + 1 def f_df(params): a, b, c = params t_log_gp_params.set_value([a, b]) t_log_indep_noise.set_value(c) total_loss = 0 grad = np.zeros(n_params) for each_x, each_y in izip(x, y): total_loss += loss_fn(each_x, each_y) grad += np.append(*grad_loss_fn(each_x, each_y)) return total_loss, grad init_params = np.r_[gp_params, indep_noise] opt = fmin_l_bfgs_b( f_df, init_params, factr=1e3, pgtol=1e-07, disp=1, )[0] print opt opt_gp_params = opt[:-1] opt_indep_noise = opt[-1] else: # Stochastic optimization updates = adagrad(loss, [t_log_gp_params, t_log_indep_noise], learning_rate=lrate) loss_fn = theano.function([t_x, t_y], loss, updates=updates) grad_loss_fn = theano.function( [t_x, t_y], theano.grad(loss, [t_log_gp_params, t_log_indep_noise])) for i in xrange(n_epochs): count = 1 trace = [] sys.stderr.write('%4d ' % i) for each_x, each_y in izip(x, y): val = loss_fn(each_x, each_y) trace.append(val) count += 1 if count % 20 == 0: sys.stderr.write('.') print print np.mean(trace), np.std(trace) print_params() opt_gp_params = t_log_gp_params.get_value() opt_indep_noise = t_log_indep_noise.get_value() return opt_gp_params, opt_indep_noise
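# --- Illustrative sketch (numpy only; not part of the original code): the loss
# optimised above is the negative Gaussian-process log marginal likelihood, up
# to the constant (n/2) * log(2*pi):
#     -log p(y | x) = 0.5 * (y^T K^{-1} y + log |K|) + const,
# with K[i, j] = a * exp(-b * (x_i - x_j)^2) + noise * [i == j].
import numpy as np

def neg_log_marginal_likelihood(x, y, a, b, noise):
    K = a * np.exp(-b * (x[:, None] - x[None, :]) ** 2) + noise * np.eye(len(x))
    sign, logdetK = np.linalg.slogdet(K)          # K is positive definite, sign == 1
    return 0.5 * (y.dot(np.linalg.solve(K, y)) + logdetK)

x = np.linspace(0.0, 1.0, 5)
y = np.sin(x)
print(neg_log_marginal_likelihood(x, y, a=2.0, b=10.0, noise=0.3))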
def multi_task_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats, lambda_val=0.5 * 1e-4): print("Building multi task model with 1D Convolution") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] kw = 2 num_filters = seqlen - kw + 1 stride = 1 filter_size = wordDim pool_size = num_filters input = InputLayer((None, seqlen, num_feats), input_var=input_var) batchsize, _, _ = input.input_var.shape #span emb1 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape1 = ReshapeLayer(emb1, (batchsize, seqlen, num_feats * wordDim)) conv1d_1 = DimshuffleLayer( Conv1DLayer(reshape1, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_1 = MaxPool1DLayer(conv1d_1, pool_size=pool_size) hid_1 = DenseLayer(maxpool_1, num_units=args.hiddenDim, nonlinearity=sigmoid) network_1 = DenseLayer(hid_1, num_units=2, nonlinearity=softmax) """ #DocTimeRel emb2 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape2 = ReshapeLayer(emb2, (batchsize, seqlen, num_feats*wordDim)) conv1d_2 = DimshuffleLayer(Conv1DLayer(reshape2, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_2 = MaxPool1DLayer(conv1d_2, pool_size=pool_size) hid_2 = DenseLayer(maxpool_2, num_units=args.hiddenDim, nonlinearity=sigmoid) network_2 = DenseLayer(hid_2, num_units=5, nonlinearity=softmax) """ #Type emb3 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape3 = ReshapeLayer(emb3, (batchsize, seqlen, num_feats * wordDim)) conv1d_3 = DimshuffleLayer( Conv1DLayer(reshape3, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_3 = MaxPool1DLayer(conv1d_3, pool_size=pool_size) hid_3 = DenseLayer(maxpool_3, num_units=args.hiddenDim, nonlinearity=sigmoid) network_3 = DenseLayer(hid_3, num_units=4, nonlinearity=softmax) #Degree emb4 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape4 = ReshapeLayer(emb4, (batchsize, seqlen, num_feats * wordDim)) conv1d_4 = DimshuffleLayer( Conv1DLayer(reshape4, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_4 = MaxPool1DLayer(conv1d_4, pool_size=pool_size) hid_4 = DenseLayer(maxpool_4, num_units=args.hiddenDim, nonlinearity=sigmoid) network_4 = DenseLayer(hid_4, num_units=4, nonlinearity=softmax) #Polarity emb5 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape5 = ReshapeLayer(emb5, (batchsize, seqlen, num_feats * wordDim)) conv1d_5 = DimshuffleLayer( Conv1DLayer(reshape5, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_5 = MaxPool1DLayer(conv1d_5, pool_size=pool_size) hid_5 = DenseLayer(maxpool_5, num_units=args.hiddenDim, nonlinearity=sigmoid) network_5 = DenseLayer(hid_5, num_units=3, nonlinearity=softmax) #ContextualModality emb6 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape6 = ReshapeLayer(emb6, (batchsize, seqlen, num_feats * wordDim)) conv1d_6 = DimshuffleLayer( Conv1DLayer(reshape6, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_6 = MaxPool1DLayer(conv1d_6, pool_size=pool_size) hid_6 = DenseLayer(maxpool_6, num_units=args.hiddenDim, 
nonlinearity=sigmoid) network_6 = DenseLayer(hid_6, num_units=5, nonlinearity=softmax) """ #ContextualAspect emb7 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape7 = ReshapeLayer(emb7, (batchsize, seqlen, num_feats*wordDim)) conv1d_7 = DimshuffleLayer(Conv1DLayer(reshape7, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_7 = MaxPool1DLayer(conv1d_7, pool_size=pool_size) hid_7 = DenseLayer(maxpool_7, num_units=args.hiddenDim, nonlinearity=sigmoid) network_7 = DenseLayer(hid_7, num_units=4, nonlinearity=softmax) """ """ #Permanence emb8 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape8 = ReshapeLayer(emb8, (batchsize, seqlen, num_feats*wordDim)) conv1d_8 = DimshuffleLayer(Conv1DLayer(reshape8, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_8 = MaxPool1DLayer(conv1d_8, pool_size=pool_size) hid_8 = DenseLayer(maxpool_8, num_units=args.hiddenDim, nonlinearity=sigmoid) network_8 = DenseLayer(hid_8, num_units=4, nonlinearity=softmax) """ # Is this important? """ network_1_out, network_2_out, network_3_out, network_4_out, \ network_5_out, network_6_out, network_7_out, network_8_out = \ get_output([network_1, network_2, network_3, network_4, network_5, network_6, network_7, network_8]) """ network_1_out = get_output(network_1) network_3_out = get_output(network_3) network_4_out = get_output(network_4) network_5_out = get_output(network_5) network_6_out = get_output(network_6) loss_1 = T.mean(binary_crossentropy( network_1_out, target_var)) + regularize_layer_params_weighted( { emb1: lambda_val, conv1d_1: lambda_val, hid_1: lambda_val, network_1: lambda_val }, l2) updates_1 = adagrad(loss_1, get_all_params(network_1, trainable=True), learning_rate=args.step) train_fn_1 = theano.function([input_var, target_var], loss_1, updates=updates_1, allow_input_downcast=True) val_acc_1 = T.mean( binary_accuracy(get_output(network_1, deterministic=True), target_var)) val_fn_1 = theano.function([input_var, target_var], val_acc_1, allow_input_downcast=True) """ loss_2 = T.mean(categorical_crossentropy(network_2_out,target_var)) + regularize_layer_params_weighted({emb2:lambda_val, conv1d_2:lambda_val, hid_2:lambda_val, network_2:lambda_val} , l2) updates_2 = adagrad(loss_2, get_all_params(network_2, trainable=True), learning_rate=args.step) train_fn_2 = theano.function([input_var, target_var], loss_2, updates=updates_2, allow_input_downcast=True) val_acc_2 = T.mean(categorical_accuracy(get_output(network_2, deterministic=True), target_var)) val_fn_2 = theano.function([input_var, target_var], val_acc_2, allow_input_downcast=True) """ loss_3 = T.mean(categorical_crossentropy( network_3_out, target_var)) + regularize_layer_params_weighted( { emb3: lambda_val, conv1d_3: lambda_val, hid_3: lambda_val, network_3: lambda_val }, l2) updates_3 = adagrad(loss_3, get_all_params(network_3, trainable=True), learning_rate=args.step) train_fn_3 = theano.function([input_var, target_var], loss_3, updates=updates_3, allow_input_downcast=True) val_acc_3 = T.mean( categorical_accuracy(get_output(network_3, deterministic=True), target_var)) val_fn_3 = theano.function([input_var, target_var], val_acc_3, allow_input_downcast=True) loss_4 = T.mean(categorical_crossentropy( network_4_out, target_var)) + regularize_layer_params_weighted( { emb4: lambda_val, conv1d_4: lambda_val, hid_4: lambda_val, network_4: lambda_val }, 
l2) updates_4 = adagrad(loss_4, get_all_params(network_4, trainable=True), learning_rate=args.step) train_fn_4 = theano.function([input_var, target_var], loss_4, updates=updates_4, allow_input_downcast=True) val_acc_4 = T.mean( categorical_accuracy(get_output(network_4, deterministic=True), target_var)) val_fn_4 = theano.function([input_var, target_var], val_acc_4, allow_input_downcast=True) loss_5 = T.mean(categorical_crossentropy( network_5_out, target_var)) + regularize_layer_params_weighted( { emb5: lambda_val, conv1d_5: lambda_val, hid_5: lambda_val, network_5: lambda_val }, l2) updates_5 = adagrad(loss_5, get_all_params(network_5, trainable=True), learning_rate=args.step) train_fn_5 = theano.function([input_var, target_var], loss_5, updates=updates_5, allow_input_downcast=True) val_acc_5 = T.mean( categorical_accuracy(get_output(network_5, deterministic=True), target_var)) val_fn_5 = theano.function([input_var, target_var], val_acc_5, allow_input_downcast=True) loss_6 = T.mean(categorical_crossentropy( network_6_out, target_var)) + regularize_layer_params_weighted( { emb6: lambda_val, conv1d_6: lambda_val, hid_6: lambda_val, network_6: lambda_val }, l2) updates_6 = adagrad(loss_6, get_all_params(network_6, trainable=True), learning_rate=args.step) train_fn_6 = theano.function([input_var, target_var], loss_6, updates=updates_6, allow_input_downcast=True) val_acc_6 = T.mean( categorical_accuracy(get_output(network_6, deterministic=True), target_var)) val_fn_6 = theano.function([input_var, target_var], val_acc_6, allow_input_downcast=True) """ loss_7 = T.mean(categorical_crossentropy(network_7_out,target_var)) + regularize_layer_params_weighted({emb7:lambda_val, conv1d_7:lambda_val, hid_7:lambda_val, network_7:lambda_val} , l2) updates_7 = adagrad(loss_7, get_all_params(network_7, trainable=True), learning_rate=args.step) train_fn_7 = theano.function([input_var, target_var], loss_7, updates=updates_7, allow_input_downcast=True) val_acc_7 = T.mean(categorical_accuracy(get_output(network_7, deterministic=True), target_var)) val_fn_7 = theano.function([input_var, target_var], val_acc_7, allow_input_downcast=True) loss_8 = T.mean(categorical_crossentropy(network_8_out,target_var)) + regularize_layer_params_weighted({emb8:lambda_val, conv1d_8:lambda_val, hid_8:lambda_val, network_8:lambda_val} , l2) updates_8 = adagrad(loss_8, get_all_params(network_8, trainable=True), learning_rate=args.step) train_fn_8 = theano.function([input_var, target_var], loss_8, updates=updates_8, allow_input_downcast=True) val_acc_8 = T.mean(categorical_accuracy(get_output(network_8, deterministic=True), target_var)) val_fn_8 = theano.function([input_var, target_var], val_acc_8, allow_input_downcast=True) """ """ return train_fn_1, val_fn_1, network_1, train_fn_2, val_fn_2, network_2, train_fn_3, val_fn_3, \ network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \ train_fn_6, val_fn_6, network_6, train_fn_7, val_fn_7, network_7, train_fn_8, val_fn_8, network_8 """ return train_fn_1, val_fn_1, network_1, train_fn_3, val_fn_3, \ network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \ train_fn_6, val_fn_6, network_6
def update_params(self, loss, all_params):
    return adagrad(loss, all_params, self.learning_rate)
def build_network_2dconv( args, input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var, wordEmbeddings, maxlen=36 ): print ("Building model with 2D Convolution") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] num_filters = 100 stride = 1 # CNN_sentence config filter_size = (3, wordDim) pool_size = (maxlen - 3 + 1, 1) # two conv pool layer # filter_size=(10, 100) # pool_size=(4,4) input_1 = InputLayer((None, maxlen), input_var=input1_var) batchsize, seqlen = input_1.input_var.shape # input_1_mask = InputLayer((None, maxlen),input_var=input1_mask_var) emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) emb_1.params[emb_1.W].remove("trainable") # (batchsize, maxlen, wordDim) reshape_1 = ReshapeLayer(emb_1, (batchsize, 1, maxlen, wordDim)) conv2d_1 = Conv2DLayer( reshape_1, num_filters=num_filters, filter_size=(filter_size), stride=stride, nonlinearity=rectify, W=GlorotUniform(), ) # (None, 100, 34, 1) maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size) # (None, 100, 1, 1) """ filter_size_2=(4, 10) pool_size_2=(2,2) conv2d_1 = Conv2DLayer(maxpool_1, num_filters=num_filters, filter_size=filter_size_2, stride=stride, nonlinearity=rectify,W=GlorotUniform()) #(None, 100, 34, 1) maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size_2) #(None, 100, 1, 1) (None, 100, 1, 20) """ forward_1 = FlattenLayer(maxpool_1) # (None, 100) #(None, 50400) input_2 = InputLayer((None, maxlen), input_var=input2_var) # input_2_mask = InputLayer((None, maxlen),input_var=input2_mask_var) emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) emb_2.params[emb_2.W].remove("trainable") reshape_2 = ReshapeLayer(emb_2, (batchsize, 1, maxlen, wordDim)) conv2d_2 = Conv2DLayer( reshape_2, num_filters=num_filters, filter_size=filter_size, stride=stride, nonlinearity=rectify, W=GlorotUniform(), ) # (None, 100, 34, 1) maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size) # (None, 100, 1, 1) """ conv2d_2 = Conv2DLayer(maxpool_2, num_filters=num_filters, filter_size=filter_size_2, stride=stride, nonlinearity=rectify,W=GlorotUniform()) #(None, 100, 34, 1) maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size_2) #(None, 100, 1, 1) """ forward_2 = FlattenLayer(maxpool_2) # (None, 100) # elementwisemerge need fix the sequence length mul = ElemwiseMergeLayer([forward_1, forward_2], merge_function=T.mul) sub = AbsSubLayer([forward_1, forward_2], merge_function=T.sub) concat = ConcatLayer([mul, sub]) concat = ConcatLayer([forward_1, forward_2]) hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid) if args.task == "sts": network = DenseLayer(hid, num_units=5, nonlinearity=softmax) elif args.task == "ent": network = DenseLayer(hid, num_units=3, nonlinearity=softmax) # prediction = get_output(network, {input_1:input1_var, input_2:input2_var}) prediction = get_output(network) loss = T.mean(categorical_crossentropy(prediction, target_var)) lambda_val = 0.5 * 1e-4 layers = {conv2d_1: lambda_val, hid: lambda_val, network: lambda_val} penalty = regularize_layer_params_weighted(layers, l2) loss = loss + penalty params = get_all_params(network, trainable=True) if args.optimizer == "sgd": updates = sgd(loss, params, learning_rate=args.step) elif args.optimizer == "adagrad": updates = adagrad(loss, params, learning_rate=args.step) elif args.optimizer == "adadelta": updates = adadelta(loss, params, learning_rate=args.step) elif args.optimizer == "nesterov": updates = nesterov_momentum(loss, 
params, learning_rate=args.step) elif args.optimizer == "rms": updates = rmsprop(loss, params, learning_rate=args.step) elif args.optimizer == "adam": updates = adam(loss, params, learning_rate=args.step) else: raise "Need set optimizer correctly" # test_prediction = get_output(network, {input_1:input1_var, input_2:input2_var}, deterministic=True) test_prediction = get_output(network, deterministic=True) test_loss = T.mean(categorical_crossentropy(test_prediction, target_var)) """ train_fn = theano.function([input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var], loss, updates=updates, allow_input_downcast=True) """ train_fn = theano.function([input1_var, input2_var, target_var], loss, updates=updates, allow_input_downcast=True) if args.task == "sts": """ val_fn = theano.function([input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var], [test_loss, test_prediction], allow_input_downcast=True) """ val_fn = theano.function( [input1_var, input2_var, target_var], [test_loss, test_prediction], allow_input_downcast=True ) elif args.task == "ent": # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) test_acc = T.mean(categorical_accuracy(test_prediction, target_var)) """ val_fn = theano.function([input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var], [test_loss, test_acc], allow_input_downcast=True) """ val_fn = theano.function([input1_var, input2_var, target_var], [test_loss, test_acc], allow_input_downcast=True) return train_fn, val_fn
def logisticRegression(X, Y, l, **options): #%% imports from theano import function from theano import shared import numpy as np from numpy import random as rng import matplotlib.pyplot as plt from theano import tensor as Tn from lasagne.updates import adagrad if options.get('plotFigure'): plotFigure = options.get('plotFigure') else: plotFigure = False if options.get('verbose'): verbose = options.get('verbose') else: verbose = False #%% load data l = np.array(l).astype(float) numObservations = len(Y) numFeatures = len(X) / numObservations Y = np.squeeze(np.array(Y).astype('int')) X = np.reshape(np.array(X).astype('float'), (numObservations, numFeatures), order='F') # remove any nans mskOutNans = (np.sum(np.isnan(X), axis=1) + np.squeeze(np.isnan(Y))) < 1 X = X[mskOutNans, :] Y = Y[mskOutNans] numObservations, numFeatures = X.shape Data = (X, Y) # data tuple #%% declaration of sympolic variables and initializations # Declare theano symbolic input/ output variables x = Tn.matrix('x') # sympolic feature variable y = Tn.vector('y') # sympolic label variable lambda_l2 = Tn.dscalar('lambda_l2') # sympolic l2-regularization parameter variable lambda_l1 = Tn.dscalar('lambda_l1') # sympolic l1-regularization parameter variable # Declare and initialize theano shared optimization variables w = shared(rng.randn(numFeatures), name='w') # initialize the weight vector (w) randomly b = shared(np.random.randn(), name='b') # initialize the bias variable (b) randomly clps = shared(0.5 * np.random.rand(), name='clps') # initialize the lapse rate variable (lps) randomly between 0 and 1 #%% functions expressions and compilations # function sympolic expressions XW = Tn.dot(x, w) L_Exp = Tn.shape(y) lps = 0.5 / (1. + Tn.exp(-10. * clps)) prob1Expression = lps + (1 - 2 * lps) / (1.0 + Tn.exp(-(XW + b))) # expression of logistic function (prob of 1) prob0Expression = 1.0 - prob1Expression # expression of logistic function (prob of 0) predictExpression = prob1Expression > 0.5 logLikelihood_Exp = (y * Tn.log(prob1Expression) + (1.0 - y) * Tn.log(prob0Expression)).sum() # loglikelihood expression costExpression = -logLikelihood_Exp / (L_Exp[0]) + lambda_l2 * ( w**2).sum() + lambda_l1 * abs(w).sum() # mean cost across all samples with the regularization perClassErExpression = abs(predictExpression - y).sum() / Tn.shape(y)[0] * 100 # percent classification error expression # compiling function expressions for speed prob1Fn = function(inputs=[x], outputs=prob1Expression) prob0Fn = function(inputs=[x], outputs=prob0Expression) predictFn = function(inputs=[x], outputs=predictExpression) costFn = function(inputs=[x, y, lambda_l2, lambda_l1], outputs=costExpression) perClassErFn = function(inputs=[x, y], outputs=perClassErExpression) # cost gradient with respect to all parameters grad_w, grad_b, grad_clps = Tn.grad(costExpression, [w, b, clps]) updates = adagrad([grad_w, grad_b, grad_clps], [w, b, clps], learning_rate=0.2, epsilon=0.0001) # training function trainFn = function( inputs=[x, y, lambda_l2, lambda_l1], outputs=[costExpression, perClassErExpression, logLikelihood_Exp], updates=updates #updates = ((w, w - learnRate*grad_w), (b, b - learnRate*grad_b), (lps, Tn.clip((lps - learnRate*grad_lps) , 0.0, 0.4999999))) ) #%% Training the model maxIter = 10000 numRepetitions = 4 w_0 = [] b_0 = [] clps_0 = [] minCost = np.inf rbest = 0 for r in range(numRepetitions): w_0.append(rng.randn(numFeatures)) b_0.append(np.random.rand()) clps_0.append(np.random.randn()) w.set_value(w_0[r]) b.set_value(b_0[r]) 
clps.set_value(clps_0[r]) b_i = [b.get_value()] lps_i = [lps.eval()] cost = [] perClassEr = [] lklhood_i = [] for i in range(int(maxIter)): Er1, Er2, lklhood = trainFn(Data[0], Data[1], l[0], l[1]) #lps.set_value(np.clip(lps.get_value(), 0., 0.5)) # enforce constraint lklhood_i.append(lklhood) cost.append(Er1) perClassEr.append(Er2) b_i.append(b.get_value()) lps_i.append(lps.eval()) if i > 500: if abs(cost[i - 100] - cost[i]) < (10.0**-6): break if verbose: print 'iteration %d , objective value %.5f, initial conditions %d out of %d' % ( i + 1, cost[i], r + 1, numRepetitions) if cost[-1] < minCost: rbest = r minCost = cost[-1] costbest = cost perClassErbest = perClassEr wbest = w.get_value() clpsbest = clps.get_value() bbest = b.get_value() bbest_i = b_i lpsbest_i = lps_i lklhoodBest_i = lklhood_i w_0 = w_0[rbest] cost = costbest perClassEr = perClassErbest w.set_value(wbest) clps.set_value(clpsbest) b.set_value(bbest) b_i = bbest_i lps_i = lpsbest_i lklhood_i = lklhoodBest_i #%% plot results #%% msk1 = Data[1] > 0 msk0 = Data[1] < 1 Y1 = Data[1][msk1] Y0 = Data[1][msk0] Yhat = predictFn(Data[0]) Yhat1 = Yhat[msk1] Yhat0 = Yhat[msk0] prob1 = prob1Fn(Data[0]) prob1_1 = prob1[msk1] prob1_0 = prob1[msk0] if plotFigure: plt.figure('cost') plt.subplot(3, 1, 1) plt.plot(cost) plt.xlabel('iteration') plt.ylabel('cross-entropy loss') plt.subplot(3, 1, 2) plt.plot(perClassEr) plt.ylim(0, 100) plt.xlabel('iteration') plt.ylabel('classification error (%)') plt.subplot(3, 1, 3) plt.plot(lklhood_i) plt.xlabel('iteration') plt.ylabel('log likelihood') plt.figure('prediction') plt.plot(np.arange(1, len(Y0) + 1), Yhat0, 'ro', label='prediction') plt.plot(np.arange(1, len(Y0) + 1), prob1_0, 'b.', label='likelihood of success') plt.plot(np.arange(len(Y0) + 1, len(Yhat) + 1), Yhat1, 'ro') plt.plot(np.arange(len(Y0) + 1, len(Yhat) + 1), prob1_1, 'b.') plt.plot(np.arange(1, len(Y0) + 1), Y0, 'k.', label='data') plt.plot(np.arange(len(Y0) + 1, len(Yhat) + 1), Y1, 'k.') plt.ylim(-0.1, 1.1) plt.xticks([0, 1]) plt.xlabel('observation') plt.ylabel('class label') plt.legend() plt.figure('weights') plt.subplot(3, 1, 1) plt.plot(w_0, 'g') plt.plot(w.get_value(), 'r') plt.xlabel('feature number') plt.ylabel('feature weight') plt.legend(('before optimization', 'after optimization')) plt.subplot(3, 1, 2) plt.plot(b_i) plt.xlabel('iteration') plt.ylabel('bias') plt.subplot(3, 1, 3) plt.plot(lps_i) plt.ylim(0., 1.) plt.xlabel('iteration') plt.ylabel('lapse rate') #%% save optimization parameters to class class optParamsClass: cost_per_iter = np.inf perClassEr_per_iter = 100. loglikelihood = -np.inf prediction = [] likelihood_trial = [] predictFn = [] perClassErFn = [] probSucessFn = [] def description(self): return 'object contains optimization parameters' optParams = optParamsClass() optParams.cost_per_iter = np.array(cost) optParams.perClassEr_per_iter = np.array(perClassEr) optParams.loglikelihood = lklhood_i optParams.prediction = Yhat optParams.likelihood_trial = prob1 optParams.predictFn = predictFn optParams.perClassErFn = perClassErFn optParams.probSucessFn = prob1Fn #%% return parameters return w.get_value(), b.get_value(), lps.eval( ), perClassEr[-1], cost[-1], optParams
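# --- Illustrative sketch (numpy only; not part of the original code): the
# model fitted above is a logistic regression with a symmetric lapse rate;
# clps is squashed so the lapse rate lps stays in (0, 0.5), which bounds the
# predicted probabilities away from 0 and 1:
#     p(y = 1 | x) = lps + (1 - 2 * lps) * sigmoid(w . x + b)
import numpy as np

def prob1(X, w, b, clps):
    lps = 0.5 / (1.0 + np.exp(-10.0 * clps))   # lapse rate in (0, 0.5)
    return lps + (1.0 - 2.0 * lps) / (1.0 + np.exp(-(X.dot(w) + b)))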