def create_encoder_decoder_func(layers, apply_updates=False):
    """Compile a Theano function computing the autoencoder reconstruction loss.

    When `apply_updates` is True, the returned function also performs one
    Nesterov-momentum step (lr=0.01, momentum=0.9) on every trainable
    parameter of the decoder path; otherwise it only evaluates the loss.
    """
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    # Forward pass through encoder + decoder (stochastic layers active).
    X_hat = get_output(layers['l_decoder_out'], X, deterministic=False)

    # Mean squared reconstruction error: per-feature mean, then batch mean.
    encoder_decoder_loss = T.mean(T.mean(T.sqr(X - X_hat), axis=1))

    if apply_updates:
        # all layers that participate in the forward pass should be updated
        encoder_decoder_params = get_all_params(
            layers['l_decoder_out'], trainable=True)
        encoder_decoder_updates = nesterov_momentum(
            encoder_decoder_loss, encoder_decoder_params, 0.01, 0.9)
    else:
        encoder_decoder_updates = None

    return theano.function(
        inputs=[theano.In(X_batch)],
        outputs=encoder_decoder_loss,
        updates=encoder_decoder_updates,
        givens={X: X_batch},
    )
def create_updates(loss, network, opt, learning_rate, momentum, beta1, beta2):
    """Build Lasagne update rules for `network`'s trainable parameters.

    loss          -- scalar Theano expression to minimize
    network       -- output layer (or list of layers) of the model
    opt           -- 'adam' or 'momentum' (Nesterov)
    learning_rate -- step size for either optimizer
    momentum      -- momentum coefficient (used by 'momentum' only)
    beta1, beta2  -- Adam moment-decay rates (used by 'adam' only)

    Returns an OrderedDict of parameter updates.
    Raises ValueError for an unrecognized optimizer name.
    """
    params = lasagne.layers.get_all_params(network, trainable=True)
    # Gradients are computed explicitly so callers could post-process them
    # (e.g. norm clipping) before they are handed to the optimizer.
    grads = theano.grad(loss, params)

    if opt == 'adam':
        updates = adam(grads, params=params, learning_rate=learning_rate,
                       beta1=beta1, beta2=beta2)
    elif opt == 'momentum':
        updates = nesterov_momentum(grads, params=params,
                                    learning_rate=learning_rate,
                                    momentum=momentum)
    else:
        # Bug fix: error message previously read 'unkown'.
        raise ValueError('unknown optimization algorithm: %s' % opt)
    return updates
def __init__(self, C, lr):
    """Build the network and compile flatten / predict / score / train functions.

    C  -- L2 regularization coefficient
    lr -- SGD learning rate for Nesterov momentum (momentum fixed at 0.9)
    """
    self.C = C
    self.X = T.ftensor4()
    self.Y = T.fmatrix()
    self.net = self._forward()

    # Only the feature-extractor ('flatten') parameters are trained here.
    params = layers.get_all_params(self.net['flatten'], trainable=True)
    netout = layers.get_output(self.net['out'])
    flattenout = layers.get_output(self.net['flatten'])

    # L2 penalty over the feature extractor, normalized by parameter count.
    reg = regularization.regularize_network_params(self.net['flatten'],
                                                   regularization.l2)
    reg /= layers.helper.count_params(self.net['flatten'])

    self.flattenfn = theano.function([self.X], flattenout,
                                     allow_input_downcast=True)
    self.predictfn = theano.function([self.X], netout,
                                     allow_input_downcast=True)

    acc = myUtils.basic.accuracy(netout, self.Y)
    self.scorefn = theano.function([self.X, self.Y], acc,
                                   allow_input_downcast=True)

    # First parameter of the output layer (its weight matrix).
    self.sharedBeta = self.net['out'].get_params()[0]

    xent = objectives.categorical_crossentropy(netout, self.Y)
    cost = T.mean(xent) + C * reg
    sgd_updates = updates.nesterov_momentum(cost, params, lr, 0.9)
    # Train the randomly-initialized parameters.
    self.trainfn = theano.function([self.X, self.Y], [cost, acc],
                                   updates=sgd_updates,
                                   allow_input_downcast=True)
def __build_loss_train__fn__(self):
    """Create the loss and compile the train / predict / validation functions."""
    # Training-time forward pass (stochastic layers active).
    probs = layers.get_output(self.net)
    # Cross-entropy plus a small L2 weight penalty.
    ce = objectives.categorical_crossentropy(probs, self.__target_var__)
    loss = ce.mean() + 1e-4 * regularization.regularize_network_params(
        self.net, regularization.l2)
    # Top-1 accuracy of the (stochastic) forward pass against the targets.
    val_acc = T.mean(T.eq(T.argmax(probs, axis=1), self.__target_var__),
                     dtype=theano.config.floatX)

    # Nesterov-momentum updates; eta is shared so it can be annealed later.
    trainable = layers.get_all_params(self.net, trainable=True)
    self.eta = theano.shared(sp.array(sp.float32(0.05), dtype=sp.float32))
    sgd_updates = updates.nesterov_momentum(loss, trainable,
                                            learning_rate=self.eta,
                                            momentum=0.9)

    # Training function: applies updates and returns the training loss.
    self.__train_fn__ = theano.function(
        [self.__input_var__, self.__target_var__], loss, updates=sgd_updates)
    # Deterministic prediction function (dropout/noise disabled).
    self.__predict_fn__ = theano.function(
        [self.__input_var__], layers.get_output(self.net, deterministic=True))
    # Validation function: loss and accuracy, no updates.
    self.__val_fn__ = theano.function(
        [self.__input_var__, self.__target_var__], [loss, val_acc])
def build_network(nkwargs):
    """Build the denoising conv-autoencoder and compile its Theano functions.

    nkwargs must contain at least 'learning_rate' and 'momentum' plus
    whatever build_denoising_convae needs.
    Returns (train_fn, val_fn, pred_fn, hlayer_fn, network).
    """
    sgd_kwargs = {k: nkwargs[k] for k in ('learning_rate', 'momentum')}
    input_var = T.tensor4('input_var')
    target_var = T.tensor4('target_var')
    network, hid_layer = build_denoising_convae(input_var, nkwargs)

    # Stochastic forward pass for training, deterministic for evaluation.
    prediction = get_output(network, deterministic=False)
    test_prediction = get_output(network, deterministic=True)
    hid_layer_output = get_output(hid_layer, deterministic=True)

    loss = squared_error(prediction, target_var).mean()
    test_loss = squared_error(test_prediction, target_var).mean()
    # Autoencoders have no accuracy metric; the test loss is reported twice.
    test_acc = test_loss

    params = get_all_params(network, trainable=True)
    updates = nesterov_momentum(loss, params, **sgd_kwargs)

    train_fn = function([input_var, target_var], loss, updates=updates)
    val_fn = function([input_var, target_var], [test_loss, test_acc])
    pred_fn = function([input_var], test_prediction)
    hlayer_fn = function([input_var], hid_layer_output)
    return train_fn, val_fn, pred_fn, hlayer_fn, network
def __init__(self, istrained, name=None, args=None):
    """Build the CNN; either load pretrained weights or compile training fns.

    istrained -- if True, restore parameters from plain_cnn.pkl and expose
                 only a deterministic prediction function; otherwise
                 args = (lr, C, momentum) configures training.
    """
    self.istrained = istrained
    self.X = T.tensor4('X')
    self.y = T.ivector('y')
    self.outprob = build_model(self.X)

    if self.istrained:
        # Restore saved parameter values and build a predictor only.
        params = cPickle.load(open(dataset_path + 'plain_cnn.pkl', 'r'))
        layers.set_all_param_values(self.outprob, params)
        self.yFullProb = layers.get_output(self.outprob, deterministic=True)
        self.predfn = makeFunc([self.X, ], [self.yFullProb, ], None)
    else:
        self.lr, self.C, self.momentum = args
        self.params = layers.get_all_params(self.outprob, trainable=True)
        # L2 penalty normalized by the total number of parameters.
        l2_term = regularization.regularize_network_params(self.outprob,
                                                           regularization.l2)
        l2_term /= layers.helper.count_params(self.outprob)
        # Training graph (dropout/noise active).
        self.yDropProb = layers.get_output(self.outprob)
        train_xent = objectives.categorical_crossentropy(self.yDropProb, self.y)
        self.trCost = train_xent.mean() + self.C * l2_term
        # Validation/test graph (deterministic forward pass).
        self.yFullProb = layers.get_output(self.outprob, deterministic=True)
        eval_xent = objectives.categorical_crossentropy(self.yFullProb, self.y)
        self.vateCost = eval_xent.mean() + self.C * l2_term
        # Training function: takes a batch, returns cost and outputs,
        # applies Nesterov-momentum updates.
        sgd_updates = updates.nesterov_momentum(self.trCost, self.params,
                                                self.lr, self.momentum)
        self.trainfn = makeFunc([self.X, self.y],
                                [self.trCost, self.yDropProb], sgd_updates)
        # Validation/test function: same outputs, no parameter updates.
        self.vatefn = makeFunc([self.X, self.y],
                               [self.vateCost, self.yFullProb], None)
def __init__(self, lr, C, momentum):
    """Build the network and compile the train and validate/test functions.

    lr -- learning rate; C -- L2 coefficient; momentum -- Nesterov momentum.
    """
    self.lr = lr
    self.C = C
    self.momentum = momentum
    self.X = T.tensor4('X')
    self.y = T.ivector('y')
    self.network = self._build()
    self.params = layers.get_all_params(self.network, trainable=True)

    # Parameter-count-normalized L2 penalty.
    l2_term = regularization.regularize_network_params(self.network,
                                                       regularization.l2)
    l2_term /= layers.helper.count_params(self.network)

    # Training graph (dropout/noise active).
    train_probs = layers.get_output(self.network)
    self.trEqs = myUtils.basic.eqs(train_probs, self.y)
    train_xent = objectives.categorical_crossentropy(train_probs, self.y)
    self.trCost = train_xent.mean() + C * l2_term

    # Validation/test graph (deterministic forward pass).
    eval_probs = layers.get_output(self.network, deterministic=True)
    self.vateEqs = myUtils.basic.eqs(eval_probs, self.y)
    eval_xent = objectives.categorical_crossentropy(eval_probs, self.y)
    self.vateCost = eval_xent.mean() + C * l2_term
    self.yPred = eval_probs

    # Training function: returns cost and correct-count, applies updates.
    sgd_updates = updates.nesterov_momentum(self.trCost, self.params, lr, momentum)
    self.trainfn = myUtils.basic.makeFunc([self.X, self.y],
                                          [self.trCost, self.trEqs], sgd_updates)
    # Validation/test function: same outputs, no parameter updates.
    self.vatefn = myUtils.basic.makeFunc([self.X, self.y],
                                         [self.vateCost, self.vateEqs], None)
def create_encoder_decoder_func(layers, apply_updates=False):
    """Return a Theano function that evaluates (and, when requested, trains
    on) the encoder/decoder reconstruction loss."""
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    reconstruction = get_output(layers['l_decoder_out'], X, deterministic=False)
    # Per-sample mean squared error, averaged over the batch.
    per_sample_mse = T.mean(T.sqr(X - reconstruction), axis=1)
    encoder_decoder_loss = T.mean(per_sample_mse)

    if not apply_updates:
        encoder_decoder_updates = None
    else:
        # all layers that participate in the forward pass should be updated
        encoder_decoder_updates = nesterov_momentum(
            encoder_decoder_loss,
            get_all_params(layers['l_decoder_out'], trainable=True),
            0.01, 0.9)

    encoder_decoder_func = theano.function(
        inputs=[theano.In(X_batch)],
        outputs=encoder_decoder_loss,
        updates=encoder_decoder_updates,
        givens={X: X_batch},
    )
    return encoder_decoder_func
def build_updates(grad, params, optimization, learning_rate):
    """Set up the optimization algorithm.

    grad          -- gradient expressions (passed straight to Lasagne)
    params        -- list of shared parameter variables
    optimization  -- dict with key 'optimizer' and optional hyper-parameter
                     keys ('momentum', 'rho', 'beta1', 'beta2')
    learning_rate -- step size for every optimizer

    Returns the Lasagne update dictionary.
    Raises ValueError for an unrecognized optimizer name.
    """
    optimizer = optimization['optimizer']
    if optimizer == 'sgd':
        update_op = updates.sgd(grad, params, learning_rate=learning_rate)
    elif optimizer == 'nesterov_momentum':
        # Bug fix: the original tested the bare (misspelled) name `momenum`,
        # which raised NameError; look the key up in the options dict instead.
        momentum = optimization.get('momentum', 0.9)
        update_op = updates.nesterov_momentum(grad, params,
                                              learning_rate=learning_rate,
                                              momentum=momentum)
    elif optimizer == 'adagrad':
        update_op = updates.adagrad(grad, params, learning_rate=learning_rate)
    elif optimizer == 'rmsprop':
        update_op = updates.rmsprop(grad, params, learning_rate=learning_rate,
                                    rho=optimization.get('rho', 0.9))
    elif optimizer == 'adam':
        update_op = updates.adam(grad, params, learning_rate=learning_rate,
                                 beta1=optimization.get('beta1', 0.9),
                                 beta2=optimization.get('beta2', 0.999))
    else:
        # Previously an unknown name fell through and `update_op` was
        # referenced before assignment; fail loudly instead.
        raise ValueError('unknown optimizer: %s' % optimizer)
    return update_op
def generate_theano_func(args, network, penalty, input_dict, target_var):
    """Compile train/validation Theano functions for the given network.

    args       -- namespace with .optimizer, .step (learning rate) and .task
    network    -- Lasagne output layer
    penalty    -- precomputed regularization term added to the loss
    input_dict -- mapping of input layers to symbolic inputs
    target_var -- symbolic target variable

    Relies on module-level symbolic inputs (input1_var, input1_mask_var,
    input2_var, input2_mask_var) for the compiled functions' input lists.
    Returns (train_fn, val_fn).
    Raises ValueError for an unknown optimizer or task.
    """
    prediction = get_output(network, input_dict)
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    # Add the (precomputed) regularization penalty to the training loss.
    loss = loss + penalty

    params = get_all_params(network, trainable=True)
    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # Bug fix: `raise "..."` (raising a plain string) is itself a
        # TypeError; raise a proper exception instead.
        raise ValueError("Need set optimizer correctly")

    # Deterministic pass (dropout/noise disabled) for evaluation.
    test_prediction = get_output(network, input_dict, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        # Regression-style task: return raw predictions for scoring.
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )
    elif args.task == "ent":
        # Classification task: report accuracy instead of raw predictions.
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )
    else:
        # Previously an unknown task fell through to an UnboundLocalError
        # on `val_fn`; fail with a clear message instead.
        raise ValueError("unknown task: %s" % args.task)

    return train_fn, val_fn
def get_updates(nnet, train_obj, trainable_params):
    """Pick the SGD solver named by nnet.solver (defaulting to Nesterov
    momentum) and build the corresponding Lasagne update dictionary."""
    implemented_solvers = ("nesterov", "adagrad", "adadelta", "adam")

    # Fall back to Nesterov momentum when no (valid) solver is configured.
    if not hasattr(nnet, "solver") or nnet.solver not in implemented_solvers:
        nnet.sgd_solver = "nesterov"
    else:
        nnet.sgd_solver = nnet.solver

    chosen = nnet.sgd_solver
    if chosen == "nesterov":
        return l_updates.nesterov_momentum(train_obj, trainable_params,
                                           learning_rate=Cfg.learning_rate,
                                           momentum=0.9)
    if chosen == "adagrad":
        return l_updates.adagrad(train_obj, trainable_params,
                                 learning_rate=Cfg.learning_rate)
    if chosen == "adadelta":
        return l_updates.adadelta(train_obj, trainable_params,
                                  learning_rate=Cfg.learning_rate)
    # The only remaining possibility is "adam".
    return l_updates.adam(train_obj, trainable_params,
                          learning_rate=Cfg.learning_rate)
def __init__(self, conf):
    """Build the two-input autoencoder and compile its train, reconstruct
    and encode functions from the given configuration."""
    self.conf = conf
    # Resolve the configured activation name to a Lasagne nonlinearity.
    act_table = {"linear": linear, "sigmoid": sigmoid,
                 "relu": rectify, "tanh": tanh}
    if self.conf.act not in act_table:
        raise ValueError("Unknown activation function", self.conf.act)
    self.conf.act = act_table[self.conf.act]

    input_var_first = T.matrix('inputs1')
    input_var_second = T.matrix('inputs2')
    target_var = T.matrix('targets')

    # create network
    self.autoencoder, encoder_first, encoder_second = self.__create_toplogy__(
        input_var_first, input_var_second)

    self.out = get_output(self.autoencoder)
    loss = squared_error(self.out, target_var).mean()

    params = get_all_params(self.autoencoder, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate=self.conf.lr,
                                momentum=self.conf.momentum)

    # training function
    self.train_fn = theano.function(
        [input_var_first, input_var_second, target_var], loss, updates=updates)
    # reconstruction function (deterministic forward pass)
    test_reconstruction = get_output(self.autoencoder, deterministic=True)
    self.reconstruction_fn = theano.function(
        [input_var_first, input_var_second], test_reconstruction)
    # encoding function
    test_encode = get_output([encoder_first, encoder_second], deterministic=True)
    self.encoding_fn = theano.function(
        [input_var_first, input_var_second], test_encode)

    # BLAS routines used elsewhere for fast vector norm / scaling.
    blas = lambda name, ndarray: scipy.linalg.get_blas_funcs(
        (name, ), (ndarray, ))[0]
    self.blas_nrm2 = blas('nrm2', np.array([], dtype=float))
    self.blas_scal = blas('scal', np.array([], dtype=float))

    # load weights if necessary
    if self.conf.load_model is not None:
        self.load_model()
def set_update(self):
    """Create Nesterov-momentum updates with an epoch-indexed LR schedule."""
    trainable = get_all_params(self.model, trainable=True)
    # Epoch -> learning rate; the shared variable lets the training loop
    # switch rates without recompiling.
    self.lr_schedule = {0: 0.01, 6: 0.001, NUM_EPOCHS - 2: 0.0001}
    self.lr = theano.shared(np.float32(self.lr_schedule[0]))
    self.updates = nesterov_momentum(self.train_loss, trainable,
                                     learning_rate=self.lr, momentum=0.9)
def create_discriminator_func(layers, apply_updates=False):
    """Compile the discriminator's loss (and, optionally, training) function.

    The discriminator receives samples from q(z|x) and from the prior p(z)
    and must predict which distribution each sample came from.
    """
    X = T.fmatrix('X')
    pz = T.fmatrix('pz')
    X_batch = T.fmatrix('X_batch')
    pz_batch = T.fmatrix('pz_batch')

    # Feed encoder inputs and prior samples through the discriminator at once.
    discriminator_outputs = get_output(
        layers['l_discriminator_out'],
        inputs={
            layers['l_prior_in']: pz,
            layers['l_encoder_in']: X,
        },
        deterministic=False,
    )

    # Targets: 1 for q(z|x) samples (stacked first), 0 for p(z) samples.
    discriminator_targets = T.vertical_stack(
        T.ones((X_batch.shape[0], 1)),
        T.zeros((pz_batch.shape[0], 1))
    )

    discriminator_loss = T.mean(
        T.nnet.binary_crossentropy(discriminator_outputs,
                                   discriminator_targets)
    )

    if not apply_updates:
        discriminator_updates = None
    else:
        # only layers that are part of the discriminator should be updated
        discriminator_params = get_all_params(
            layers['l_discriminator_out'], trainable=True, discriminator=True)
        discriminator_updates = nesterov_momentum(
            discriminator_loss, discriminator_params, 0.1, 0.0)

    return theano.function(
        inputs=[theano.In(X_batch), theano.In(pz_batch)],
        outputs=discriminator_loss,
        updates=discriminator_updates,
        givens={X: X_batch, pz: pz_batch},
    )
def trainModel(self, tr_X, va_X, tr_y, va_y, batchSize=64, maxIter=300,
               verbose=True, start=10, period=2, threshold=10, earlyStopTol=2, totalStopTol=2):
    """Mini-batch training loop with early stopping and LR annealing.

    Python 2 code (print statement, xrange, generator .next()).
    tr_X/tr_y, va_X/va_y -- training and validation sets; presumably numpy
    arrays consumable by myUtils.basic.miniBatchGen -- TODO confirm.
    start/period/threshold/earlyStopTol parameterize the early-stop
    generator; totalStopTol bounds how many times the LR may be lowered.
    """
    trainfn = self.trainfn
    validatefn = self.vatefn
    lr = self.lr
    # Generator that, fed (train cost, val cost), decides when to early-stop.
    earlyStop = myUtils.basic.earlyStopGen(start, period, threshold, earlyStopTol)
    earlyStop.next()  # initialize (prime) the generator
    totalStopCount = 0
    for epoch in xrange(maxIter):  # every epoch
        # In each epoch, we do a full pass over the training data:
        trAllPred = None
        trRandy = None
        trCostSum = 0.
        startTime = time.time()
        for batch in myUtils.basic.miniBatchGen(tr_X, tr_y, batchSize, shuffle=True):
            Xb, yb = batch
            trCost, trPred = trainfn(Xb, yb)
            trCostSum += trCost
            # Accumulate predictions and the shuffled labels for epoch metrics.
            trAllPred = np.concatenate((trAllPred, trPred), axis=0) \
                if trAllPred is not None else trPred
            trRandy = np.concatenate((trRandy, yb)) if trRandy is not None else yb
        # Number of mini-batches (round up for the final partial batch).
        trIter = len(tr_X) // batchSize
        if len(tr_X) % batchSize != 0: trIter += 1
        trCostMean = trCostSum / trIter
        trAcc = myUtils.basic.accuracy(trAllPred, trRandy)
        trP, trR = myUtils.basic.precision_recall(trAllPred, trRandy)
        # And a full pass over the validation data:
        vaAllPred = None
        vaCostSum = 0.
        for batch in myUtils.basic.miniBatchGen(va_X, va_y, batchSize, shuffle=False):
            Xb, yb = batch
            vaCost, vaPred = validatefn(Xb, yb)
            vaCostSum += vaCost
            vaAllPred = np.concatenate((vaAllPred, vaPred), axis=0) \
                if vaAllPred is not None else vaPred
        vaIter = len(va_X) // batchSize
        if len(va_X) % batchSize != 0: vaIter += 1
        vaCostMean = vaCostSum / vaIter
        vaAcc = myUtils.basic.accuracy(vaAllPred, va_y)
        vaP, vaR = myUtils.basic.precision_recall(vaAllPred, va_y)
        if verbose:
            print 'epoch ', epoch, ' time: %.3f' % (time.time() - startTime),
            print 'trcost: %.5f tracc: %.5f trp: %.5f trr: %.5f' % (trCostMean, trAcc, trP, trR),
            print 'vacost: %.5f vaacc: %.5f vap: %.5f var: %.5f' % (vaCostMean, vaAcc, vaP, vaR)
        # Then we decide whether to early stop:
        if earlyStop.send((trCostMean, vaCostMean)):
            lr /= 10  # on an early-stop event, lower the LR and keep iterating
            # Recompile the training function with the reduced learning rate.
            updatesDict = updates.nesterov_momentum(self.trCost, self.params, lr, self.momentum)
            trainfn = myUtils.basic.makeFunc([self.X, self.y], [self.trCost, self.yDropProb], updatesDict)
            totalStopCount += 1
            if totalStopCount > totalStopTol:  # still early-stopping after annealing: give up
                print 'stop'
                break
            if verbose: print 'learning rate decreases to ', lr
def build_train_fn(self):
    """Compile the mean-squared-error training function for self.network."""
    # Stochastic forward pass; squared error against the target variable.
    net_out = get_output(self.network, deterministic=False)
    mse = squared_error(net_out, self.target_var).mean()
    trainable = get_all_params(self.network, trainable=True)
    sgd_updates = nesterov_momentum(mse, trainable,
                                    learning_rate=self.learning_rate,
                                    momentum=self.momentum)
    self.train_fn = theano.function([self.input_var, self.target_var],
                                    mse, updates=sgd_updates)
def test_stochastic_layer_network():
    """Smoke-test a tiny network containing a StochasticLayer.

    Builds input -> dense(softmax) -> StochasticLayer('ST') -> dense,
    trains on a constant 1x10 input/target pair for `num_epoch` epochs,
    printing progress, and plots the running mean loss.
    Python 2 code (print statement).
    """
    learning_rate = 0.1
    momentum = 0.9
    num_epoch = 1000
    input = T.fmatrix('input')
    output = T.fmatrix('output')
    print 'FF-Layer: (Batch_size, n_features)'
    print 'Building stochastic layer model'
    l_in = L.InputLayer(shape=(1, 10), input_var=input)
    # Zero-initialized weights: softmax output starts uniform.
    l_2 = L.DenseLayer(l_in, num_units=10,
                       nonlinearity=lasagne.nonlinearities.softmax,
                       W=lasagne.init.Constant(0.))
    print 'Input Layer shape: ', L.get_output_shape(l_in)
    print 'Dense Layer shape: ', L.get_output_shape(l_2)
    # 'ST' = straight-through estimator variant of the stochastic layer.
    l_stochastic_layer = StochasticLayer(l_2, estimator='ST')
    print 'Stochastic Layer shape: ', L.get_output_shape(l_stochastic_layer)
    l_out = L.DenseLayer(l_stochastic_layer, num_units=10,
                         b=lasagne.init.Constant(0.))
    print 'Final Dense Layer shape: ', L.get_output_shape(l_out)
    network_output = L.get_output(l_out)
    print 'Building loss function...'
    loss = lasagne.objectives.squared_error(network_output, output)
    loss = loss.mean()
    params = L.get_all_params(l_out, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate, momentum)
    train = theano.function([input, output], loss, updates=updates,
                            allow_input_downcast=True)
    output_fn = theano.function([input], network_output,
                                allow_input_downcast=True)
    # Constant toy data: a single all-ones example and target.
    test_X = np.ones((1, 10))
    test_Y = np.ones((1, 10))
    losses = []
    mean_losses = []
    for epoch in range(num_epoch):
        print 'Epoch number: ', epoch
        losses.append(train(test_X, test_Y))
        print 'epoch {} mean loss {}'.format(epoch, np.mean(losses))
        print 'Current Output: ', output_fn(test_X)
        mean_losses.append(np.mean(losses))
    # Plot the running mean loss (draw only; caller controls showing/saving).
    plt.title("Mean loss")
    plt.xlabel("Training examples")
    plt.ylabel("Loss")
    plt.plot(mean_losses, label="train")
    plt.grid()
    plt.legend()
    plt.draw()
def contrastive_loss_iter(embedder, update_params={}):
    """Compile train/valid functions for a siamese embedder with contrastive loss.

    embedder -- list of layers; the last one is the final embedding layer.
    update_params -- must supply 'l_r' and 'momentum' for nesterov_momentum.
    NOTE(review): the mutable default {} is only read here, so it is
    harmless, but calling without update_params raises KeyError -- presumably
    callers always pass it; confirm.
    Returns {'train': fn, 'valid': fn, 'gradnames': [...]}.
    """
    # assume it's the 1d version
    X_pairs = {
        'te1': T.tensor3(),
        'te2': T.tensor3(),
    }
    y = T.ivector()  # basically class labels
    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made)
    # the last prediction layer is a) the end of the network and b) what we ultimately care about
    # however the other prediction layers will be incorporated into the training loss
    predicted_embeds_train = {k: ll.get_output(embedder, X)[-1] for k, X in X_pairs.items()}
    predicted_embeds_valid = {k: ll.get_output(final_emb_layer, X, deterministic=True) for k, X in X_pairs.items()}
    margin = 1

    # if distance is 0 that's bad
    distance = lambda pred: (pred['te1'] - pred['te2'] + 1e-7).norm(2, axis=1)
    # Contrastive loss: pull matches (y=1) together, push non-matches apart
    # up to `margin`.
    contrastive_loss = lambda pred: T.mean(y*(distance(pred)) + (1 - y)*(margin - distance(pred)).clip(0, np.inf))
    failed_matches = lambda pred: T.switch(T.eq(T.sum(y), 0), 0, T.sum((y*distance(pred)) > margin) / T.sum(y))
    # NOTE(review): `(1-y*distance(pred))` is parenthesized differently from
    # failed_matches above -- possibly `((1-y)*distance(pred))` was intended;
    # confirm before relying on this diagnostic.
    failed_nonmatches = lambda pred: T.switch(T.eq(T.sum(1-y), 0), 0, T.sum((1-y*distance(pred)) < margin) / T.sum(1-y))
    failed_pairs = lambda pred: 0.5*failed_matches(pred) + 0.5*failed_nonmatches(pred)

    # Small L2 penalty over the whole network.
    decay = 0.001
    reg = regularize_network_params(final_emb_layer, l2) * decay
    losses_reg = lambda pred: contrastive_loss(pred) + reg
    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'CL'  # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder]))
    all_params = ll.get_all_params(embedder, trainable=True)  # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    print("Compiling network for training")
    tic = time.time()
    # Training function also returns every named gradient for monitoring.
    train_iter = theano.function([X_pairs['te1'], X_pairs['te2'], y], [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X_pairs['te1'], X_pairs['te2'], y], [
        contrastive_loss(predicted_embeds_valid),
        losses_reg(predicted_embeds_valid),
        failed_pairs(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    return {'train': train_iter, 'valid': valid_iter, 'gradnames': [g.name for g in grads]}
def create_generator2_func(layers, apply_updates=False):
    """Compile the second generator's (encoder's) adversarial loss function."""
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    # no need to pass an input to l_prior_in here
    encoder_out = get_output(layers['l_encoder_out2'], X, deterministic=False)

    # Inject the generator output directly as the concat layer's output so
    # the discriminator scores only generator samples.
    critic_out = get_output(
        layers['l_discriminator_out2'],
        inputs={layers['l_prior_encoder_concat2']: encoder_out},
        deterministic=False)

    # The discriminator is trained to output 1 for q(z|x); the generator
    # wins by driving that output toward 0.  NOTE(review): zeros_like of a
    # scalar yields a 0-d zero that broadcasts against the discriminator
    # output -- presumably intended; confirm the target shape.
    generator_targets = T.zeros_like(X_batch.shape[0])
    generator_loss = T.mean(
        T.nnet.binary_crossentropy(critic_out, generator_targets))

    if not apply_updates:
        generator_updates = None
    else:
        # only layers that are part of the generator (i.e., encoder)
        # should be updated
        generator_params = get_all_params(
            layers['l_discriminator_out2'], trainable=True, generator2=True)
        generator_updates = nesterov_momentum(
            generator_loss, generator_params, 0.1, 0.0)

    return theano.function(
        inputs=[theano.In(X_batch)],
        outputs=generator_loss,
        updates=generator_updates,
        givens={X: X_batch},
    )
def create_generator_func(layers, apply_updates=False):
    """Compile the generator's (encoder's) adversarial loss/training function."""
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    # no need to pass an input to l_prior_in here
    generator_outputs = get_output(layers['l_encoder_out'], X,
                                   deterministic=False)

    # Substitute the encoder output for the prior/encoder concat layer so
    # the discriminator scores generator samples only.
    discriminator_outputs = get_output(
        layers['l_discriminator_out'],
        inputs={layers['l_prior_encoder_concat']: generator_outputs},
        deterministic=False,
    )

    # The discriminator learns to predict 1 for q(z|x); the generator should
    # fool it into predicting 0.  NOTE(review): zeros_like of a scalar gives
    # a 0-d target that broadcasts -- presumably intended; confirm.
    generator_targets = T.zeros_like(X_batch.shape[0])
    generator_loss = T.mean(
        T.nnet.binary_crossentropy(discriminator_outputs, generator_targets))

    if apply_updates:
        # only layers that are part of the generator (i.e., encoder)
        # should be updated
        generator_params = get_all_params(
            layers['l_discriminator_out'], trainable=True, generator=True)
        generator_updates = nesterov_momentum(
            generator_loss, generator_params, 0.1, 0.0)
    else:
        generator_updates = None

    generator_func = theano.function(
        inputs=[theano.In(X_batch)],
        outputs=generator_loss,
        updates=generator_updates,
        givens={X: X_batch},
    )
    return generator_func
def define_updates(output_layer, X, Y):
    """Build train/validation Theano functions for a classifier network.

    X, Y -- symbolic input and integer-label variables.
    Returns (train_fn, valid_fn, l_r) where l_r is the shared learning-rate
    variable so the caller can anneal it between epochs.
    """
    output_train = lasagne.layers.get_output(output_layer)
    output_test = lasagne.layers.get_output(output_layer, deterministic=True)

    # Categorical cross-entropy; outputs are clipped away from exact 0/1 so
    # the log never overflows.  Y holds integer class ids, not one-hot rows.
    loss = lasagne.objectives.categorical_crossentropy(
        T.clip(output_train, 0.000001, 0.999999), Y).mean()
    acc = T.mean(T.eq(T.argmax(output_train, axis=1), Y),
                 dtype=theano.config.floatX)

    # L2 regularization over every layer (used e.g. with ResNet).
    all_layers = lasagne.layers.get_all_layers(output_layer)
    l2_penalty = lasagne.regularization.regularize_layer_params(
        all_layers, lasagne.regularization.l2) * P.L2_LAMBDA
    loss = loss + l2_penalty

    # Validation-side loss/accuracy on the deterministic pass.
    test_loss = lasagne.objectives.categorical_crossentropy(
        T.clip(output_test, 0.000001, 0.999999), Y).mean() + l2_penalty
    test_acc = T.mean(T.eq(T.argmax(output_test, axis=1), Y),
                      dtype=theano.config.floatX)

    # Shared learning rate so it can be changed during training.
    l_r = theano.shared(np.array(LR_SCHEDULE[0], dtype=theano.config.floatX))
    params = lasagne.layers.get_all_params(output_layer, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate=l_r,
                                momentum=P.MOMENTUM)
    #updates = adam(loss, params, learning_rate=l_r)

    prediction_binary = T.argmax(output_train, axis=1)
    test_prediction_binary = T.argmax(output_test, axis=1)

    # Both functions also expose the probability of class 1 for AUC-style metrics.
    train_fn = theano.function(
        inputs=[X, Y],
        outputs=[loss, l2_penalty, acc, prediction_binary, output_train[:, 1]],
        updates=updates)
    valid_fn = theano.function(
        inputs=[X, Y],
        outputs=[test_loss, l2_penalty, test_acc, test_prediction_binary,
                 output_test[:, 1]])
    return train_fn, valid_fn, l_r
def trainModel(self, tr_X, va_X, tr_y, va_y, batchSize=128, maxIter=100,
               verbose=True, start=5, period=2, threshold=10, earlyStopTol=2, totalStopTol=2):
    """Mini-batch training loop with early stopping and LR annealing.

    Python 2 code (print statement, xrange, generator .next()).
    The train function returns (cost, #correct-in-batch); accuracy is the
    summed correct count over the set size.
    """
    trainfn = self.trainfn
    validatefn = self.vatefn
    lr = self.lr
    # Generator that, fed (train cost, val cost), decides when to early-stop.
    earlyStop = myUtils.basic.earlyStopGen(start, period, threshold, earlyStopTol)
    earlyStop.next()  # initialize (prime) the generator
    totalStopCount = 0
    for epoch in xrange(maxIter):  # every epoch
        # In each epoch, we do a full pass over the training data:
        trEqCount = 0
        trCostSum = 0.
        startTime = time.time()
        for batch in myUtils.basic.miniBatchGen(tr_X, tr_y, batchSize, shuffle=True):
            Xb, yb = batch
            trCost, trEqs = trainfn(Xb, yb)
            trCostSum += trCost
            trEqCount += trEqs
        # Number of mini-batches (round up for the final partial batch).
        trIter = len(tr_X) // batchSize
        if len(tr_X) % batchSize != 0: trIter += 1
        trCostMean = trCostSum / trIter
        trAccuracy = float(trEqCount) / len(tr_X)
        # And a full pass over the validation data:
        vaEqCount = 0
        vaCostSum = 0.
        for batch in myUtils.basic.miniBatchGen(va_X, va_y, batchSize, shuffle=False):
            Xb, yb = batch
            vaCost, vaEqs = validatefn(Xb, yb)
            vaCostSum += vaCost
            vaEqCount += vaEqs
        vaIter = len(va_X) // batchSize
        if len(va_X) % batchSize != 0: vaIter += 1
        vaCostMean = vaCostSum / vaIter
        vaAccuracy = float(vaEqCount) / len(va_X)
        if verbose:
            print 'epoch ', epoch, ' time: %.3f' % (time.time() - startTime),
            print 'trcost: %.5f tracc: %.5f' % (trCostMean, trAccuracy),
            print 'vacost: %.5f vaacc: %.5f' % (vaCostMean, vaAccuracy)
        # Then we decide whether to early stop:
        if earlyStop.send((trCostMean, vaCostMean)):
            lr /= 10  # on an early-stop event, lower the LR and keep iterating
            # Recompile the training function with the reduced learning rate.
            updatesDict = updates.nesterov_momentum(self.trCost, self.params, lr, self.momentum)
            trainfn = myUtils.basic.makeFunc([self.X, self.y], [self.trCost, self.trEqs], updatesDict)
            totalStopCount += 1
            if totalStopCount > totalStopTol:  # still early-stopping after annealing: give up
                print 'stop'
                break
            if verbose: print 'learning rate decreases to ', lr
def __init__(self, conf):
    """Construct the paired-input autoencoder and compile its Theano functions."""
    self.conf = conf
    # Map the configured activation name onto a Lasagne nonlinearity.
    if self.conf.act == "linear":
        self.conf.act = linear
    elif self.conf.act == "sigmoid":
        self.conf.act = sigmoid
    elif self.conf.act == "relu":
        self.conf.act = rectify
    elif self.conf.act == "tanh":
        self.conf.act = tanh
    else:
        raise ValueError("Unknown activation function", self.conf.act)

    in_first = T.matrix('inputs1')
    in_second = T.matrix('inputs2')
    targets = T.matrix('targets')

    # create network
    self.autoencoder, enc_first, enc_second = self.__create_toplogy__(in_first, in_second)

    self.out = get_output(self.autoencoder)
    loss = squared_error(self.out, targets)
    loss = loss.mean()

    trainable = get_all_params(self.autoencoder, trainable=True)
    sgd_updates = nesterov_momentum(loss, trainable,
                                    learning_rate=self.conf.lr,
                                    momentum=self.conf.momentum)

    # training function
    self.train_fn = theano.function([in_first, in_second, targets], loss,
                                    updates=sgd_updates)
    # function to reconstruct (deterministic forward pass)
    recon = get_output(self.autoencoder, deterministic=True)
    self.reconstruction_fn = theano.function([in_first, in_second], recon)
    # encoding function
    codes = get_output([enc_first, enc_second], deterministic=True)
    self.encoding_fn = theano.function([in_first, in_second], codes)

    # BLAS routines for fast vector norm / scaling.
    def blas(name, ndarray):
        return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
    self.blas_nrm2 = blas('nrm2', np.array([], dtype=float))
    self.blas_scal = blas('scal', np.array([], dtype=float))

    # load weights if necessary
    if self.conf.load_model is not None:
        self.load_model()
def get_updates(nnet, train_obj, trainable_params, solver=None):
    """Build a Lasagne update dict for the requested solver (default: adam).

    The chosen solver name is recorded on nnet.sgd_solver; hyper-parameters
    come from the global Cfg object.
    """
    implemented_solvers = ("sgd", "momentum", "nesterov", "adagrad",
                           "rmsprop", "adadelta", "adam", "adamax")
    # Unknown or unspecified solvers fall back to adam.
    nnet.sgd_solver = solver if solver in implemented_solvers else "adam"

    chosen = nnet.sgd_solver
    if chosen == "sgd":
        return l_updates.sgd(train_obj, trainable_params,
                             learning_rate=Cfg.learning_rate)
    if chosen == "momentum":
        return l_updates.momentum(train_obj, trainable_params,
                                  learning_rate=Cfg.learning_rate,
                                  momentum=Cfg.momentum)
    if chosen == "nesterov":
        return l_updates.nesterov_momentum(train_obj, trainable_params,
                                           learning_rate=Cfg.learning_rate,
                                           momentum=Cfg.momentum)
    if chosen == "adagrad":
        return l_updates.adagrad(train_obj, trainable_params,
                                 learning_rate=Cfg.learning_rate)
    if chosen == "rmsprop":
        return l_updates.rmsprop(train_obj, trainable_params,
                                 learning_rate=Cfg.learning_rate, rho=Cfg.rho)
    if chosen == "adadelta":
        return l_updates.adadelta(train_obj, trainable_params,
                                  learning_rate=Cfg.learning_rate, rho=Cfg.rho)
    if chosen == "adamax":
        return l_updates.adamax(train_obj, trainable_params,
                                learning_rate=Cfg.learning_rate)
    # The only remaining possibility is "adam".
    return l_updates.adam(train_obj, trainable_params,
                          learning_rate=Cfg.learning_rate)
def net_updates(net, loss, lr):
    """Return the parameter update dictionary for `net` per cfg.OPTIMIZER.

    net  -- Lasagne output layer
    loss -- scalar Theano loss expression
    lr   -- learning rate (scalar or shared variable)

    Raises ValueError for an unsupported cfg.OPTIMIZER (the original code
    fell through to an UnboundLocalError at the return statement).
    """
    # Get all trainable parameters (weights) of our net
    params = l.get_all_params(net, trainable=True)

    # We use the adam update, other options are available
    if cfg.OPTIMIZER == 'adam':
        param_updates = updates.adam(loss, params, learning_rate=lr, beta1=0.9)
    elif cfg.OPTIMIZER == 'nesterov':
        param_updates = updates.nesterov_momentum(loss, params,
                                                  learning_rate=lr,
                                                  momentum=0.9)
    elif cfg.OPTIMIZER == 'sgd':
        param_updates = updates.sgd(loss, params, learning_rate=lr)
    else:
        raise ValueError('unsupported optimizer: %s' % cfg.OPTIMIZER)
    return param_updates
def __build_loss_train__fn__(self):
    """Compile the training, prediction and validation Theano functions."""
    # create loss function
    net_out = layers.get_output(self.net)
    xent = objectives.categorical_crossentropy(net_out, self.__target_var__)
    l2_term = regularization.regularize_network_params(self.net,
                                                       regularization.l2)
    loss = xent.mean() + 1e-4 * l2_term
    # Accuracy: fraction of arg-max predictions equal to the targets.
    val_acc = T.mean(T.eq(T.argmax(net_out, axis=1), self.__target_var__),
                     dtype=theano.config.floatX)

    # create parameter update expressions; eta is shared so it can be annealed
    params = layers.get_all_params(self.net, trainable=True)
    self.eta = theano.shared(sp.array(sp.float32(0.05), dtype=sp.float32))
    update_rule = updates.nesterov_momentum(loss, params,
                                            learning_rate=self.eta,
                                            momentum=0.9)

    # compile training function that updates parameters and returns training loss
    self.__train_fn__ = theano.function(
        [self.__input_var__, self.__target_var__], loss, updates=update_rule)
    # deterministic prediction (dropout/noise disabled)
    self.__predict_fn__ = theano.function(
        [self.__input_var__],
        layers.get_output(self.net, deterministic=True))
    # validation: loss and accuracy, no updates
    self.__val_fn__ = theano.function(
        [self.__input_var__, self.__target_var__], [loss, val_acc])
def similarity_iter(output_layer, match_layer, update_params, match_layer_w=0):
    """Compile train/valid iteration functions for a siamese matching net.

    The loss mixes a contrastive term on descriptor distances with a binary
    cross-entropy matching term, weighted by ``match_layer_w``.  Returns a
    dict with 'train' and 'valid' Theano functions plus gradient names.
    """
    X1 = T.tensor4()
    X2 = T.tensor4()
    y = T.ivector()

    # find the input layers
    # TODO this better
    all_layers = ll.get_all_layers(match_layer)
    # make image of all layers
    imwrite_architecture(all_layers, './layer_rep.png')
    # locate the two siamese branches by layer name (Python 2 `filter`)
    input_1 = filter(lambda x: x.name == 'input1', all_layers)[0]
    input_2 = filter(lambda x: x.name == 'input2', all_layers)[0]

    # stochastic (train) and deterministic (eval) forward passes
    descriptors_train, match_prob_train = ll.get_output([output_layer, match_layer], {input_1: X1, input_2: X2})
    descriptors_eval, match_prob_eval = ll.get_output([output_layer, match_layer], {input_1: X1, input_2: X2}, deterministic=True)
    #descriptor_shape = ll.get_output_shape(output_layer, {input_1: X1, input_2: X2})
    #print("Network output shape: %r" % (descriptor_shape,))

    # distance minimization: L2 distance between the pair's descriptors
    # (1e-7 keeps the norm's gradient finite at zero)
    distance = lambda x: (x[:,0,:] - x[:,1,:] + 1e-7).norm(2, axis=1)
    #distance_eval = (descriptors_eval[:,0,:] - descriptors_eval[:,1,:] + 1e-7).norm(2, axis=1)
    # 9/21 squaring the loss seems to prevent it from getting to 0.5 really quickly (i.e. w/in 3 epochs)
    # let's see if it will learn something good
    margin = 1
    decay = 0
    # NOTE: decay is 0, so this regularization term is currently disabled
    reg = regularize_network_params(match_layer, l2) * decay
    loss = lambda x, z: ((1-match_layer_w)*T.mean(y*(distance(x)) + (1 - y)*(T.maximum(0, margin - distance(x))))/2 # constrastive loss
            + match_layer_w*T.mean(binary_crossentropy(z.T + 1e-7,y))) # matching loss
    loss_reg = lambda x, z: (loss(x,z) + reg)
    # this loss doesn't work since it just pushes all the descriptors near each other and then predicts 0 all the time for tha matching
    #jason_loss = lambda x, z: T.mean(distance(x)*y + (1-y)*binary_crossentropy(z.T + 1e-7,y))
    #loss_eval = T.mean(y*(distance_eval**2) + (1 - y)*(T.maximum(0, 1 - distance_eval)**2))

    # NOTE(review): no trainable=True filter here, so non-trainable params
    # (if any) are included -- verify this is intended
    all_params = ll.get_all_params(match_layer) # unsure how I would do this if there were truly two trainable branches...
    loss_train = loss_reg(descriptors_train, match_prob_train)
    loss_train.name = 'combined_loss' # for the names
    grads = T.grad(loss_train, all_params, add_names=True)
    #updates = adam(grads, all_params, **update_params)
    updates = nesterov_momentum(grads, all_params, **update_params)
    # train returns [regularized loss, raw loss] followed by all gradients
    train_iter = theano.function([X1, X2, y], [loss_train, loss(descriptors_train, match_prob_train)] + grads, updates=updates)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    valid_iter = theano.function([X1, X2, y], loss(descriptors_eval, match_prob_eval))

    return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
def get_train_fn(self, last_only=False):
    """Compile a Theano training step for this network.

    Minimizes cross-entropy + L2 penalty with Nesterov momentum; the
    compiled function maps (inputs, int targets) -> (mean loss, error rate).
    When ``last_only`` is true, only the output layer's own parameters are
    updated (fine-tuning of the head).
    """
    in_var = self.net['input'].input_var
    labels = T.ivector('targets')

    probs = lasagne.layers.get_output(self.output_layer)
    xent = categorical_crossentropy(probs, labels).mean()
    # misclassification rate of the argmax prediction
    err_rate = T.mean(T.neq(T.argmax(probs, axis=1), labels),
                      dtype=theano.config.floatX)
    l2_term = self.regularizer_amount * regularize_network_params(
        self.output_layer, l2)

    if last_only:
        weights = self.output_layer.get_params(trainable=True)
    else:
        weights = lasagne.layers.get_all_params(self.output_layer,
                                                trainable=True)

    # the L2 term is part of the minimized objective but not of the
    # reported loss
    step = nesterov_momentum(xent + l2_term, weights, learning_rate=self.lr)
    return theano.function([in_var, labels], (xent, err_rate), updates=step)
def create_train_func(layers):
    """Compile a training step for the transform-prediction network.

    Feeds two image batches through ``layers['trans']``, reshapes both the
    ground-truth and predicted affine parameters to (N, 2, 3), applies them
    to a fixed 20x20 mesh grid, and minimizes the mean squared error between
    the two warped grids with Nesterov momentum (lr=1e-3, momentum=0.9).
    Returns a function (Xa, Xb, Tg) -> [predicted transform, loss].
    """
    img_a = T.tensor4('Xa')
    img_b = T.tensor4('Xb')
    img_a_in = T.tensor4('Xa_batch')
    img_b_in = T.tensor4('Xb_batch')

    # predicted transform parameters for the image pair
    pred_t = get_output(
        layers['trans'],
        inputs={
            layers['inputa']: img_a,
            layers['inputb']: img_b,
        },
        deterministic=False,
    )

    # transforms: ground-truth, predicted
    gt_t = T.fmatrix('Tg')
    gt_t_in = T.fmatrix('Tg_batch')
    theta_true = gt_t.reshape((-1, 2, 3))
    theta_pred = pred_t.reshape((-1, 2, 3))

    # grids: ground-truth, predicted -- compare transforms by the grids
    # they produce rather than by raw parameters
    grid_true = T.dot(theta_true, _meshgrid(20, 20))
    grid_pred = T.dot(theta_pred, _meshgrid(20, 20))
    mse = T.mean(T.sqr(grid_true - grid_pred))

    weights = get_all_params(layers['trans'], trainable=True)
    step = nesterov_momentum(mse, weights, 1e-3, 0.9)

    return theano.function(
        inputs=[theano.In(img_a_in), theano.In(img_b_in), theano.In(gt_t_in)],
        outputs=[pred_t, mse],
        updates=step,
        givens={
            img_a: img_a_in,
            img_b: img_b_in,   # Ia, Ib
            gt_t: gt_t_in,     # transform Ia --> Ib
        })
def build_train_func(self, lr=0.01, mntm=0.9):
    """Compile the training function for the classifier.

    Minimizes mean categorical cross-entropy on the stochastic forward
    pass with Nesterov momentum.  The returned function takes a batch of
    inputs and integer labels and returns (loss, accuracy).

    Args:
        lr: learning rate for nesterov_momentum.
        mntm: momentum coefficient.
    """
    probs = get_output(self.layers['out'], self.x, deterministic=False)

    xent = T.mean(T.nnet.categorical_crossentropy(probs, self.y))
    acc = T.mean(T.eq(probs.argmax(axis=1), self.y))

    weights = get_all_params(self.layers['out'], trainable=True)
    step = nesterov_momentum(xent, weights, lr, mntm)

    return theano.function(
        inputs=[theano.In(self.x_batch), theano.In(self.y_batch)],
        outputs=[xent, acc],
        updates=step,
        givens={
            self.x: self.x_batch,
            self.y: self.y_batch,
        },
    )
def define_updates(output_layer, X, Y):
    """Build train/validation functions for ``output_layer``.

    Returns (train_fn, valid_fn, l_r) where l_r is a shared learning-rate
    variable so the schedule can be changed without recompiling.
    """
    output_train = lasagne.layers.get_output(output_layer)
    output_test = lasagne.layers.get_output(output_layer, deterministic=True)

    # set up the loss that we aim to minimize when using cat cross entropy our Y should be ints not one-hot
    # clipping keeps log() finite for saturated probabilities
    loss = lasagne.objectives.categorical_crossentropy(T.clip(output_train,0.000001,0.999999), Y)
    loss = loss.mean()

    acc = T.mean(T.eq(T.argmax(output_train, axis=1), Y), dtype=theano.config.floatX)

    # if using ResNet use L2 regularization
    all_layers = lasagne.layers.get_all_layers(output_layer)
    l2_penalty = lasagne.regularization.regularize_layer_params(all_layers, lasagne.regularization.l2) * P.L2_LAMBDA
    loss = loss + l2_penalty

    # set up loss functions for validation dataset
    test_loss = lasagne.objectives.categorical_crossentropy(T.clip(output_test,0.000001,0.999999), Y)
    test_loss = test_loss.mean()
    test_loss = test_loss + l2_penalty

    test_acc = T.mean(T.eq(T.argmax(output_test, axis=1), Y), dtype=theano.config.floatX)

    # get parameters from network and set up sgd with nesterov momentum to update parameters, l_r is shared var so it can be changed
    l_r = theano.shared(np.array(LR_SCHEDULE[0], dtype=theano.config.floatX))
    params = lasagne.layers.get_all_params(output_layer, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate=l_r, momentum=P.MOMENTUM)
    #updates = adam(loss, params, learning_rate=l_r)

    prediction_binary = T.argmax(output_train, axis=1)
    test_prediction_binary = T.argmax(output_test, axis=1)

    # set up training and prediction functions; outputs include the
    # positive-class probability (column 1) for downstream metrics
    train_fn = theano.function(inputs=[X,Y], outputs=[loss, l2_penalty, acc, prediction_binary, output_train[:,1]], updates=updates)
    valid_fn = theano.function(inputs=[X,Y], outputs=[test_loss, l2_penalty, test_acc, test_prediction_binary, output_test[:,1]])

    return train_fn, valid_fn, l_r
def __init__(self, lr, C, momentum):
    """Build the model and compile train / eval / feature functions.

    Args:
        lr: learning rate for nesterov_momentum.
        C: L2 regularization strength (scaled by 1/param_count).
        momentum: momentum coefficient.
    """
    self.lr = lr
    self.C = C
    self.momentum = momentum
    self.X = T.tensor4('X')
    self.y = T.ivector('y')
    self.network = build_model(self.X)
    self.prob = self.network['prob']
    # deterministic output of the last pooling layer, used as features
    feat = self.network['pool5/7x7_s1']
    featout = layers.get_output(feat, deterministic=True)
    self.params = layers.get_all_params(self.prob, trainable=True)
    # L2 penalty normalized by the number of parameters
    reg = regularization.regularize_network_params(self.prob, regularization.l2)
    reg /= layers.helper.count_params(self.prob)
    # training set (stochastic forward pass)
    self.yDropProb = layers.get_output(self.prob)
    trCrossentropy = objectives.categorical_crossentropy(
        self.yDropProb, self.y)
    self.trCost = trCrossentropy.mean() + C * reg
    # validation / test sets (deterministic forward pass)
    self.yFullProb = layers.get_output(self.prob, deterministic=True)
    vateCrossentropy = objectives.categorical_crossentropy(
        self.yFullProb, self.y)
    self.vateCost = vateCrossentropy.mean() + C * reg
    # training function: consumes training data, returns cost and probabilities
    updatesDict = updates.nesterov_momentum(self.trCost, self.params, lr, momentum)
    self.trainfn = myUtils.basic.makeFunc([self.X, self.y], [self.trCost, self.yDropProb], updatesDict)
    # validation/test function: returns cost and probabilities, no updates
    self.vatefn = myUtils.basic.makeFunc([self.X, self.y], [self.vateCost, self.yFullProb], None)
    # feature extraction function
    self.featfn = myUtils.basic.makeFunc([self.X], [featout], None)
def train(files, batch_size, emb_dim_size, save_dir, load_dir, skip_window, num_skips):
    """Train a skip-gram word2vec model and report nearest neighbours.

    Python 2 code (print statements, iteritems).  Periodically saves the
    embedding to ``save_dir`` and finally plots the mean loss per epoch.
    """
    learning_rate = 0.1
    momentum = 0.9
    num_epochs = 1000

    dataset = Dataset(files, load_dir=load_dir, skip_window=skip_window, num_skips=num_skips)
    data_stream = dataset.data_stream
    if save_dir:
        dataset.save_dictionary(save_dir)
    dictionary = dataset.dictionary
    reverse_dictionary = dict((v, k) for k, v in dictionary.iteritems())
    print 'Dictionary size: ', len(dictionary)

    query_input = T.ivector('query')
    context_target = T.ivector('context')
    word2vec = Word2VecDiscrete(batch_size=batch_size,
                                context_vocab_size=dataset.vocab_size,
                                query_vocab_size=dataset.vocab_size,
                                emb_dim_size=emb_dim_size)
    word2vec.build_model(query_input)

    prediction = word2vec.get_output()
    loss = lasagne.objectives.categorical_crossentropy(prediction, context_target)
    loss = loss.mean()
    params = word2vec.get_all_params()
    updates = nesterov_momentum(loss, params, learning_rate, momentum)
    train = theano.function([query_input, context_target], loss, updates=updates)

    losses = []
    mean_losses = []
    start = time.time()
    for epoch in range(num_epochs):
        mean_loss = 0
        for i, batch in enumerate(data_stream.get_batches(batch_size)):
            queries, contexts = batch
            losses.append(train(queries, contexts))
            if save_dir and i % 10000 == 0:
                word2vec.save(save_dir)
        # NOTE(review): `losses` is never cleared, so this is the running
        # mean over ALL epochs so far, not just the current one -- confirm
        # whether that is intended
        mean_loss = np.mean(losses)
        mean_losses.append(mean_loss)
        if epoch % 1 == 0:
            print('epoch {} mean loss {}'.format(epoch, mean_loss))
        #print 'Embedding for king is: ', word2vec.embed([dictionary['king']])
    if save_dir:
        word2vec.save(save_dir)
    end = time.time()
    print("Time: ", end - start)

    # rank all vocabulary words by euclidean distance to 'king'
    print 'Top similar words: '
    results = [
        (word,
         spatial.distance.euclidean(word2vec.embed([dictionary['king']]),
                                    word2vec.embed([dictionary[word]])))
        for (word, _) in dictionary.iteritems()
    ]
    results.sort(key=operator.itemgetter(1))
    out = [r[0] for r in results]
    print 'closest to {} : {}'.format('king', out)

    # same ranking relative to 'queen'
    print 'Top similar words: '
    results = [
        (word,
         spatial.distance.euclidean(word2vec.embed([dictionary['queen']]),
                                    word2vec.embed([dictionary[word]])))
        for (word, _) in dictionary.iteritems()
    ]
    results.sort(key=operator.itemgetter(1))
    out = [r[0] for r in results]
    print 'closest to {} : {}'.format('queen', out)

    plt.title("Mean loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.plot(mean_losses, label="train")
    plt.grid()
    plt.legend()
    plt.draw()
    plt.show()
def main():
    """Train the CNN on the Kaggle-style CSV data and plot loss/accuracy.

    Python 2 code.  Saves learned weights to data/weights.pklz (gzip pickle)
    even when interrupted with Ctrl-C.
    """
    # load the training and validation data sets
    train_X, test_X, train_y, test_y = load_data_cv('data/train.csv')

    X = T.ftensor4()
    Y = T.fmatrix()

    # set up theano functions to generate output by feeding data through network
    output_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    # NOTE(review): no trainable=True filter here, unlike sibling code in
    # this file -- verify all params should receive updates
    params = lasagne.layers.get_all_params(output_layer)
    updates = nesterov_momentum(loss_train, params, learning_rate=0.003, momentum=0.9)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=loss_train, updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=loss_valid, allow_input_downcast=True)
    predict_valid = theano.function(inputs=[X], outputs=pred_valid, allow_input_downcast=True)

    # loop over training functions for however many iterations, print information while training
    train_eval = []
    valid_eval = []
    valid_acc = []
    try:
        for i in range(45):
            train_loss = batch_iterator(train_X, train_y, BATCHSIZE, train)
            train_eval.append(train_loss)
            valid_loss = valid(test_X, test_y)
            valid_eval.append(valid_loss)
            acc = np.mean(np.argmax(test_y, axis=1) == predict_valid(test_X))
            valid_acc.append(acc)
            print 'iter:', i, '| Tloss:', train_loss, '| Vloss:', valid_loss, '| valid acc:', acc
    except KeyboardInterrupt:
        # fall through so partially-trained weights still get saved
        pass

    # save weights
    all_params = helper.get_all_param_values(output_layer)
    f = gzip.open('data/weights.pklz', 'wb')
    pickle.dump(all_params, f)
    f.close()

    # plot loss and accuracy
    train_eval = np.array(train_eval)
    valid_eval = np.array(valid_eval)
    valid_acc = np.array(valid_acc)
    sns.set_style("whitegrid")
    pyplot.plot(train_eval, linewidth = 3, label = 'train loss')
    pyplot.plot(valid_eval, linewidth = 3, label = 'valid loss')
    pyplot.legend(loc = 2)
    pyplot.twinx()
    pyplot.plot(valid_acc, linewidth = 3, label = 'valid accuracy', color = 'r')
    pyplot.grid()
    pyplot.ylim([.9,1])
    pyplot.legend(loc = 1)
    pyplot.savefig('data/training_plot.png')
def main():
    """Train the speech CNN, validating periodically and checkpointing.

    Python 2 code.  Builds one-hot labels from the global ``speech`` data
    dict, shuffles, splits at ``size``, then trains with Nesterov momentum,
    saving parameters whenever validation accuracy improves.
    """
    # load the training and validation data sets
    # labels=int(0.7*speech.all_count)
    global valid_best
    data=speech.data_dict
    labels=data.keys()
    # assert (PIXELS,PIXELS)==speechSize
    train_X=np.zeros([num_speechs,1,speechSize[0],speechSize[1]])
    train_y=np.zeros([num_speechs,len(labels)])
    i=0
    # fill train_X/train_y with at most num_speechs examples, one-hot labels
    for label in (data.keys()):
        for im in data[label]:
            train_X[i,0]=im
            train_y[i]=label_binarize([label],labels)[0]
            i+=1
            if i>=num_speechs:
                break
            if i%500==0:
                print 'idx of speechs:',i
        if i>=num_speechs:
            break
    # shuffle examples and labels together, then split train/valid at `size`
    zipp=zip(train_X,train_y)
    random.shuffle(zipp)
    xx=np.array([one[0] for one in zipp])
    yy=np.array([one[1] for one in zipp])
    del train_X,train_y
    train_X=xx[:size]
    train_y=yy[:size]
    valid_X=xx[size:]
    valid_y=yy[size:]
    del xx,yy
    print 'Shuffle finish. Begin to build model.'

    X = T.tensor4()
    Y = T.matrix()

    # set up theano functions to generate output by feeding data through network
    output_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    params = lasagne.layers.get_all_params(output_layer,trainable=True)
    updates = nesterov_momentum(loss_train, params, learning_rate=0.003, momentum=0.9)
    # updates =lasagne.updates.sgd(loss_train, params, learning_rate=0.01)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=[loss_train,pred], updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=[loss_valid,pred_valid], allow_input_downcast=True)
    # predict_valid = theano.function(inputs=[X], outputs=[pred_valid],
    #                                 allow_input_downcast=True)

    # loop over training functions for however many iterations, print information while training
    train_eval = []
    valid_eval = []
    valid_acc = []
    for i in range(450):
        batch_total_number = len(train_X) / BATCHSIZE
        for idx_batch in range (batch_total_number):
            xx_batch = np.float32(train_X[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE])
            yy_batch = np.float32(train_y[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE])
            train_loss ,pred = train(xx_batch,yy_batch)
            count=np.count_nonzero(np.int32(pred ==np.argmax(yy_batch,axis=1)))
            print i,idx_batch,'| Tloss:', train_loss,'| Count:',count,'| Acc:',float(count)/(BATCHSIZE)
            print pred
            print np.argmax(yy_batch,axis=1)
            print "time:",time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # run validation every 15 training batches
            if 1 and idx_batch%15==0:
                acc=0
                valid_batch_number=len(valid_X)/BATCHSIZE
                for j in tqdm(range(valid_batch_number)):
                    x_batch = np.float32(valid_X[j* BATCHSIZE:(j+ 1) * BATCHSIZE])
                    y_batch = np.float32(valid_y[j* BATCHSIZE:(j+ 1) * BATCHSIZE])
                    print len(x_batch),len(y_batch)
                    valid_loss,pred = valid(x_batch,y_batch)
                    # pred = predict_valid(x_batch)[0]
                    acc += np.count_nonzero(np.int32(pred ==np.argmax(y_batch,axis=1)))
                acc=float(acc)/(valid_batch_number*BATCHSIZE)
                print 'iter:', i,idx_batch, '| Vloss:',valid_loss,'|Acc:',acc
                # checkpoint whenever validation accuracy improves
                if acc>valid_best:
                    print 'new valid_best:',valid_best,'-->',acc
                    valid_best=acc
                    all_params = helper.get_all_param_values(output_layer)
                    f = gzip.open('speech_params/validbest_cnn_allbatchnorm_{}_{}_{}.pklz'.format(i,valid_best,time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
                    pickle.dump(all_params, f)
                    f.close()
        # save weights
        # NOTE(review): `i%5` is truthy for every epoch NOT divisible by 5;
        # this looks like it was meant to be `i%5==0` -- confirm intent
        if i%5:
            all_params = helper.get_all_param_values(output_layer)
            f = gzip.open('speech_params/validbest_cnn_allbatchnorm_{}_{}_{}.pklz'.format(i,acc,time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
            pickle.dump(all_params, f)
            f.close()
def event_span_classifier(args, input_var, input_mask_var, target_var, wordEmbeddings, seqlen):
    """Build a bidirectional-LSTM span classifier and compile its functions.

    The forward LSTM's last step and the backward LSTM's first step are
    concatenated, passed through a sigmoid hidden layer, and classified
    into 2 classes with softmax.  Binary cross-entropy plus weighted L2
    regularization is minimized with the optimizer named in
    ``args.optimizer``.

    Returns:
        (train_fn, val_fn, network): training function (returns loss),
        validation function (returns [loss, accuracy]) and the Lasagne
        output layer.

    Raises:
        ValueError: if ``args.optimizer`` is not a recognized name.
    """
    print("Building model with LSTM")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]
    GRAD_CLIP = wordDim
    args.lstmDim = 150

    # renamed from `input` to avoid shadowing the builtin
    input_layer = InputLayer((None, seqlen), input_var=input_var)
    batchsize, seqlen = input_layer.input_var.shape
    input_mask = InputLayer((None, seqlen), input_var=input_mask_var)

    emb = EmbeddingLayer(input_layer, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb_1.W].remove('trainable')

    lstm = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask,
                     grad_clipping=GRAD_CLIP, nonlinearity=tanh)
    lstm_back = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask,
                          grad_clipping=GRAD_CLIP, nonlinearity=tanh, backwards=True)

    slice_forward = SliceLayer(lstm, indices=-1, axis=1)  # out_shape (None, args.lstmDim)
    slice_backward = SliceLayer(lstm_back, indices=0, axis=1)  # out_shape (None, args.lstmDim)

    concat = ConcatLayer([slice_forward, slice_backward])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4
    layers = {emb: lambda_val, lstm: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # was `raise "Need set optimizer correctly"` -- raising a string is
        # a TypeError in Python 3 (and a removed feature in Python 2.6+)
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, input_mask_var, target_var],
                               loss, updates=updates, allow_input_downcast=True)
    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, input_mask_var, target_var],
                             [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
def main():
    """Fine-tune a pretrained CNN and produce Kaggle test predictions.

    Python 2 code.  Loads weights from data/weights.pklz, trains briefly at
    a low learning rate, then writes predictions to preds/convnet_preds.csv.
    """
    # load model and parameters
    output_layer = lasagne_model()
    f = gzip.open('data/weights.pklz', 'rb')
    all_params = pickle.load(f)
    f.close()

    X = T.ftensor4()
    Y = T.fmatrix()

    # set up theano functions to generate output by feeding data through network
    # NOTE(review): lasagne_model() is called a second time here, discarding
    # the instance built above -- likely redundant
    output_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    helper.set_all_param_values(output_layer, all_params)
    params = lasagne.layers.get_all_params(output_layer)
    # low learning rate for fine-tuning
    updates = nesterov_momentum(loss_train, params, learning_rate=0.0001, momentum=0.9)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=loss_train, updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=loss_valid, allow_input_downcast=True)
    predict_valid = theano.function(inputs=[X], outputs=pred_valid, allow_input_downcast=True)

    # fine tune network
    train_X, test_X, train_y, test_y = load_data_cv('data/train.csv')
    train_eval = []
    valid_eval = []
    valid_acc = []
    try:
        for i in range(5):
            train_loss = batch_iterator_no_aug(train_X, train_y, BATCHSIZE, train)
            train_eval.append(train_loss)
            valid_loss = valid(test_X, test_y)
            valid_eval.append(valid_loss)
            acc = np.mean(np.argmax(test_y, axis=1) == predict_valid(test_X))
            valid_acc.append(acc)
            print 'iter:', i, '| Tloss:', train_loss, '| Vloss:', valid_loss, '| valid acc:', acc
    except KeyboardInterrupt:
        pass

    # after training create output for kaggle
    testing_inputs = load_test_data('data/test.csv')
    predictions = []
    # predict in batches; ceil division covers the final partial batch
    for j in range((testing_inputs.shape[0] + BATCHSIZE - 1) // BATCHSIZE):
        sl = slice(j * BATCHSIZE, (j + 1) * BATCHSIZE)
        X_batch = testing_inputs[sl]
        predictions.extend(predict_valid(X_batch))
    out = pd.read_csv('data/convnet_preds.csv')
    out['Label'] = predictions
    out.to_csv('preds/convnet_preds.csv', index=False)
####################### UPDATES ######################### #we use dynamic learning rates which change after some epochs lr_dynamic = T.scalar(name='learning_rate') #get all trainable parameters (weights) of our net params = l.get_all_params(NET, trainable=True) #we use the adam update if OPTIMIZER == 'adam': param_updates = updates.adam(loss, params, learning_rate=lr_dynamic, beta1=0.5) elif OPTIMIZER == 'nesterov': param_updates = updates.nesterov_momentum(loss, params, learning_rate=lr_dynamic, momentum=0.9) #################### TRAIN FUNCTION ###################### #the theano train functions takes images and class targets as input print "COMPILING THEANO TRAIN FUNCTION...", start = time.time() train_net = theano.function( [l.get_all_layers(NET)[0].input_var, targets, lr_dynamic], loss, updates=param_updates) print "DONE! (", int(time.time() - start), "s )" ################# PREDICTION FUNCTION #################### #we need the prediction function to calculate the validation accuracy #this way we can test the net during/after training
def build_network_2dconv(args, input_var, target_var, wordEmbeddings, maxlen=60):
    """Build a CNN-sentence style 2D-convolution classifier and compile it.

    A frozen embedding layer feeds one (3 x wordDim) convolution, max-pooled
    over time, then a sigmoid hidden layer and a 2-way softmax.  Binary
    cross-entropy plus weighted L2 is minimized with ``args.optimizer``.

    Returns:
        (train_fn, val_fn): training function (returns loss) and validation
        function (returns [loss, accuracy]).

    Raises:
        ValueError: if ``args.optimizer`` is not a recognized name.
    """
    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    # renamed from `input` to avoid shadowing the builtin
    input_layer = InputLayer((None, maxlen), input_var=input_var)
    batchsize, seqlen = input_layer.input_var.shape
    emb = EmbeddingLayer(input_layer, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb.params[emb.W].remove("trainable")  # freeze the pretrained embeddings

    # (batchsize, maxlen, wordDim) -> add channel dim for 2D convolution
    reshape = ReshapeLayer(emb, (batchsize, 1, maxlen, wordDim))

    conv2d = Conv2DLayer(
        reshape,
        num_filters=num_filters,
        filter_size=(filter_size),
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, 34, 1)
    maxpool = MaxPool2DLayer(conv2d, pool_size=pool_size)  # (None, 100, 1, 1)

    forward = FlattenLayer(maxpool)  # (None, 100)

    hid = DenseLayer(forward, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4
    layers = {conv2d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # was `raise "Need set optimizer correctly"` -- raising a string is
        # a TypeError in Python 3 (and a removed feature in Python 2.6+)
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss,
                               updates=updates, allow_input_downcast=True)
    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn
output_test = lasagne.layers.get_output(output_layer, deterministic=True) # set up the loss that we aim to minimize, when using cat cross entropy our Y should be ints not one-hot loss = lasagne.objectives.categorical_crossentropy(output_train, Y) loss = loss.mean() # set up loss functions for validation dataset valid_loss = lasagne.objectives.categorical_crossentropy(output_test, Y) valid_loss = valid_loss.mean() valid_acc = T.mean(T.eq(T.argmax(output_test, axis=1), Y), dtype=theano.config.floatX) # get parameters from network and set up sgd with nesterov momentum to update parameters l_r = theano.shared(np.array(LR_SCHEDULE[0], dtype=theano.config.floatX)) params = lasagne.layers.get_all_params(output_layer, trainable=True) updates = nesterov_momentum(loss, params, learning_rate=l_r) # set up training and prediction functions train_fn = theano.function(inputs=[X,Y], outputs=loss, updates=updates) valid_fn = theano.function(inputs=[X,Y], outputs=[valid_loss, valid_acc]) # set up prediction function predict_proba = theano.function(inputs=[X], outputs=output_test) ''' load training data and start training ''' encoder = LabelEncoder() # load data X_train = np.load('data/cache/X_train_%d_f32_clean.npy'%PIXELS)
def build_finetune_functions(self, datasets, batch_size, learning_rate):
    """Generates a function `train_function` that implements one step of
    finetuning, a function `valid_score` that computes the cost on the
    validation set and a function 'test_score' that computes the
    reconstruction cost on the test set.

    :type datasets: list of pairs of theano.tensor.TensorType
    :param datasets: It is a list that contain all the datasets; the has
        to contain three pairs, `train`, `valid`, `test` in this order,
        where each pair is formed of two Theano variables, one for the
        datapoints, the other for the labels
    :type batch_size: int
    :param batch_size: size of a minibatch
    :type learning_rate: float (usually a shared variable so it can be updated)
    :param learning_rate: learning rate used during finetune stage
    """
    (train_set_x, train_set_y) = datasets[0]
    (valid_set_x, valid_set_y) = datasets[1]
    (test_set_x, test_set_y) = datasets[2]

    index = T.lscalar('index')  # index to a [mini]batch

    updates = nesterov_momentum(self.fine_tune_cost, self.params, learning_rate=learning_rate, momentum=0.9)

    # NOTE(review): only self.x is fed via givens (no labels), which is
    # consistent with reconstruction-style fine-tuning -- confirm
    # fine_tune_cost does not depend on a label variable
    train_function = theano.function(
        inputs=[index],
        outputs=self.fine_tune_cost,
        updates=updates,
        givens={
            self.x: train_set_x[index * batch_size: (index + 1) * batch_size]
        },
        name='train')

    # per-minibatch validation cost
    valid_score_i = theano.function(
        [index],
        outputs=self.test_cost,
        givens={
            self.x: valid_set_x[index * batch_size: (index + 1) * batch_size],
        },
        name='valid'
    )

    # per-minibatch test cost
    test_score_i = theano.function(
        [index],
        outputs=self.test_cost,
        givens={
            self.x: test_set_x[index * batch_size: (index + 1) * batch_size],
        },
        name='test'
    )

    # Create a function that scans the entire validation set
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    def valid_score():
        return [valid_score_i(i) for i in range(n_valid_batches)]

    # Create a function that scans the entire test set
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size
    def test_score():
        return [test_score_i(i) for i in range(n_test_batches)]

    return train_function, valid_score, test_score
def get_model():
    """Build a 3-stage conv net (30-channel 64x64 input) and compile
    train / validation / prediction functions.

    Returns [output_layer, train_fn, val_fn, predict_fn].
    """
    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.matrix('targets')

    # input layer with unspecified batch size
    layer_0 = InputLayer(shape=(None, 30, 64, 64), input_var=input_var)  # Z-score?

    # Convolution then batchNormalisation then activation layer, twice, then zero padding layer followed by a dropout layer
    layer_1 = batch_norm(Conv2DLayer(layer_0, 64, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_2 = batch_norm(Conv2DLayer(layer_1, 64, (3, 3), pad='valid', nonlinearity=leaky_rectify))
    layer_3 = MaxPool2DLayer(layer_2, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_4 = DropoutLayer(layer_3, p=0.25)

    # Convolution then batchNormalisation then activation layer, twice, then zero padding layer followed by a dropout layer
    layer_5 = batch_norm(Conv2DLayer(layer_4, 96, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_6 = batch_norm(Conv2DLayer(layer_5, 96, (3, 3), pad='valid', nonlinearity=leaky_rectify))
    layer_7 = MaxPool2DLayer(layer_6, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_8 = DropoutLayer(layer_7, p=0.25)

    # Convolution then batchNormalisation then activation layer, twice, then zero padding layer followed by a dropout layer
    layer_9 = batch_norm(Conv2DLayer(layer_8, 128, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_10 = batch_norm(Conv2DLayer(layer_9, 128, (3, 3), pad='valid', nonlinearity=leaky_rectify))
    layer_11 = MaxPool2DLayer(layer_10, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_12 = DropoutLayer(layer_11, p=0.25)

    # Last layers
    layer_13 = FlattenLayer(layer_12)
    layer_14 = DenseLayer(layer_13, 1024, nonlinearity=leaky_rectify)
    layer_15 = DropoutLayer(layer_14, p=0.5)
    layer_16 = DenseLayer(layer_15, 600, nonlinearity=softmax)

    # Loss
    # NOTE(review): squared error on a softmax output against a matrix
    # target (600-bin distribution?) -- unusual pairing; confirm intended
    prediction = get_output(layer_16)
    loss = squared_error(prediction, target_var)
    loss = loss.mean() + regularize_layer_params(layer_14, l2)

    # Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params = get_all_params(layer_16, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction = get_output(layer_16, deterministic=True)
    test_loss = squared_error(test_prediction, target_var)
    test_loss = test_loss.mean()

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], test_loss)

    # Compule a third function computing the prediction
    predict_fn = theano.function([input_var], test_prediction)

    return [layer_16, train_fn, val_fn, predict_fn]
def get_model():
    """Build a two-branch (systole/diastole) CNN sharing one conv stem.

    A common conv stage processes the input, then two identical but
    independently-parameterized branches produce 600-way softmax outputs
    that are concatenated before the loss.

    Returns ``[layer_both_5, train_fn, val_fn, predict_fn]``:
    the merged output layer, a Nesterov-momentum training function, a
    deterministic validation-loss function and a deterministic prediction
    function.
    """
    # Prepare Theano variables for inputs and targets.
    # NOTE(review): the concatenated output is 1200-dim (2 x 600), so
    # target_var is presumably 1200 columns — confirm against the caller.
    input_var = T.tensor4('inputs')
    target_var = T.matrix('targets')

    # Shared stem: input layer with unspecified batch size, then
    # conv('same') -> conv('valid'), each batch-normalized with leaky ReLU,
    # followed by padded 2x2 max pooling and 25% dropout.
    layer_both_0 = InputLayer(shape=(None, 30, 64, 64), input_var=input_var)
    layer_both_1 = batch_norm(Conv2DLayer(layer_both_0, 64, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_2 = batch_norm(Conv2DLayer(layer_both_1, 64, (3, 3), pad='valid', nonlinearity=leaky_rectify))
    layer_both_3 = MaxPool2DLayer(layer_both_2, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_both_4 = DropoutLayer(layer_both_3, p=0.25)

    # Systole branch, stage 1 (96 filters), branching off the shared stem.
    layer_systole_0 = batch_norm(Conv2DLayer(layer_both_4, 96, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_systole_1 = batch_norm(Conv2DLayer(layer_systole_0, 96, (3, 3), pad='valid', nonlinearity=leaky_rectify))
    layer_systole_2 = MaxPool2DLayer(layer_systole_1, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_systole_3 = DropoutLayer(layer_systole_2, p=0.25)

    # Diastole branch, stage 1 (96 filters), also fed from the shared stem.
    layer_diastole_0 = batch_norm(Conv2DLayer(layer_both_4, 96, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_diastole_1 = batch_norm(Conv2DLayer(layer_diastole_0, 96, (3, 3), pad='valid', nonlinearity=leaky_rectify))
    layer_diastole_2 = MaxPool2DLayer(layer_diastole_1, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_diastole_3 = DropoutLayer(layer_diastole_2, p=0.25)

    # Systole branch, stage 2 (128 filters).
    layer_systole_4 = batch_norm(Conv2DLayer(layer_systole_3, 128, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_systole_5 = batch_norm(Conv2DLayer(layer_systole_4, 128, (3, 3), pad='valid', nonlinearity=leaky_rectify))
    layer_systole_6 = MaxPool2DLayer(layer_systole_5, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_systole_7 = DropoutLayer(layer_systole_6, p=0.25)

    # Diastole branch, stage 2 (128 filters).
    layer_diastole_4 = batch_norm(Conv2DLayer(layer_diastole_3, 128, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_diastole_5 = batch_norm(Conv2DLayer(layer_diastole_4, 128, (3, 3), pad='valid', nonlinearity=leaky_rectify))
    layer_diastole_6 = MaxPool2DLayer(layer_diastole_5, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_diastole_7 = DropoutLayer(layer_diastole_6, p=0.25)

    # Systole head: flatten -> dense(1024) -> dropout(0.5) -> softmax(600).
    layer_systole_8 = FlattenLayer(layer_systole_7)
    layer_systole_9 = DenseLayer(layer_systole_8, 1024, nonlinearity=leaky_rectify)
    layer_systole_10 = DropoutLayer(layer_systole_9, p=0.5)
    layer_systole_11 = DenseLayer(layer_systole_10, 600, nonlinearity=softmax)

    # Diastole head: same shape as the systole head.
    layer_diastole_8 = FlattenLayer(layer_diastole_7)
    layer_diastole_9 = DenseLayer(layer_diastole_8, 1024, nonlinearity=leaky_rectify)
    layer_diastole_10 = DropoutLayer(layer_diastole_9, p=0.5)
    layer_diastole_11 = DenseLayer(layer_diastole_10, 600, nonlinearity=softmax)

    # Merge the two branch outputs (systole first, then diastole).
    layer_both_5 = ConcatLayer([layer_systole_11, layer_diastole_11])

    # Loss: mean squared error plus L2 on each branch's hidden dense layer.
    prediction = get_output(layer_both_5)
    loss = squared_error(prediction, target_var)
    loss = loss.mean() + regularize_layer_params(layer_systole_9, l2) + regularize_layer_params(layer_diastole_9, l2)

    # Updates: Stochastic Gradient Descent (SGD) with Nesterov momentum.
    params = get_all_params(layer_both_5, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)

    # Validation/testing loss: deterministic forward pass (dropout disabled).
    test_prediction = get_output(layer_both_5, deterministic=True)
    test_loss = squared_error(test_prediction, target_var)
    test_loss = test_loss.mean()

    # Compile a training step (applies the updates and returns the loss),
    # a validation-loss function, and a deterministic prediction function.
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], test_loss)
    predict_fn = theano.function([input_var], test_prediction)
    return [layer_both_5, train_fn, val_fn, predict_fn]
def main(): # load the training and validation data sets labels=int(0.7*image.all_count) data=image.ddd train_X=np.zeros([size,1,PIXELS,PIXELS]) train_y=np.zeros([size,labels]) for i in range(size): label=random.sample(range(labels),1)[0] train_X[i,0]=0.01*random.sample(data[label],1)[0] train_y[i]=label_binarize([label],range(labels))[0] X = T.tensor4() Y = T.matrix() # set up theano functions to generate output by feeding data through network output_layer = lasagne_model() output_train = lasagne.layers.get_output(output_layer, X) output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True) # set up the loss that we aim to minimize loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y)) loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y)) # prediction functions for classifications pred = T.argmax(output_train, axis=1) pred_valid = T.argmax(output_valid, axis=1) # get parameters from network and set up sgd with nesterov momentum to update parameters params = lasagne.layers.get_all_params(output_layer) updates = nesterov_momentum(loss_train, params, learning_rate=0.03, momentum=0.9) # updates =lasagne.updates.adagrad(loss_train, params, learning_rate=0.003) # set up training and prediction functions train = theano.function(inputs=[X, Y], outputs=[loss_train,pred_valid], updates=updates, allow_input_downcast=True) valid = theano.function(inputs=[X, Y], outputs=loss_valid, allow_input_downcast=True) predict_valid = theano.function(inputs=[X], outputs=pred_valid, allow_input_downcast=True) # loop over training functions for however many iterations, print information while training train_eval = [] valid_eval = [] valid_acc = [] for i in range(450): batch_total_number = len(train_X) / BATCHSIZE for idx_batch in range (batch_total_number): xx_batch = np.float32(train_X[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE]) yy_batch = np.float32(train_y[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE]) train_loss ,pred = 
train(xx_batch,yy_batch) print i,idx_batch,'| Tloss:', train_loss,'| Count:',np.count_nonzero(np.int32(pred ==np.argmax(yy_batch,axis=1))) print pred print np.argmax(yy_batch,axis=1) acc=0 for j in range(batch_total_number): x_batch = np.float32(train_X[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE]) y_batch = np.float32(train_y[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE]) pred = predict_valid(x_batch) acc += np.count_nonzero(np.int32(pred ==np.argmax(y_batch,axis=1))) acc=float(acc)/(BATCHSIZE*batch_total_number) print 'iter:', i,idx_batch, '| Tloss:', train_loss,'|Acc:',acc # save weights all_params = helper.get_all_param_values(output_layer) f = gzip.open('params/weights.pklz', 'wb') pickle.dump(all_params, f) f.close()
def main(): parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF') parser.add_argument('--num_epochs', type=int, default=1000, help='Number of training epochs') parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch') parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM') parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN') parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate') parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping') parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization') parser.add_argument('--delta', type=float, default=0.0, help='weight for expectation-linear regularization') parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True) parser.add_argument('--dropout', choices=['std', 'recurrent'], help='dropout patten') parser.add_argument('--schedule', nargs='+', type=int, help='schedule for learning rate decay') parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files') parser.add_argument('--train') # "data/POS-penn/wsj/split1/wsj1.train.original" parser.add_argument('--dev') # "data/POS-penn/wsj/split1/wsj1.dev.original" parser.add_argument('--test') # "data/POS-penn/wsj/split1/wsj1.test.original" args = parser.parse_args() logger = get_logger("Sequence Labeling") train_path = args.train dev_path = args.dev test_path = args.test num_epochs = args.num_epochs batch_size = args.batch_size num_units = args.num_units num_filters = args.num_filters regular = args.regular grad_clipping = args.grad_clipping gamma = args.gamma delta = args.delta learning_rate = args.learning_rate momentum = 0.9 decay_rate = 
args.decay_rate schedule = args.schedule output_predict = args.output_prediction dropout = args.dropout p = 0.5 logger.info("Creating Alphabets") word_alphabet, char_alphabet, pos_alphabet, type_alphabet = data_utils.create_alphabets("data/alphabets/", [train_path, dev_path, test_path], 40000) logger.info("Word Alphabet Size: %d" % word_alphabet.size()) logger.info("Character Alphabet Size: %d" % char_alphabet.size()) logger.info("POS Alphabet Size: %d" % pos_alphabet.size()) num_labels = pos_alphabet.size() - 1 logger.info("Reading Data") data_train = data_utils.read_data(train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) data_dev = data_utils.read_data(dev_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) data_test = data_utils.read_data(test_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) num_data = sum([len(bucket) for bucket in data_train]) logger.info("constructing network...") # create variables target_var = T.imatrix(name='targets') mask_var = T.matrix(name='masks', dtype=theano.config.floatX) mask_nr_var = T.matrix(name='masks_nr', dtype=theano.config.floatX) word_var = T.imatrix(name='inputs') char_var = T.itensor3(name='char-inputs') network = build_network(word_var, char_var, mask_var, word_alphabet, char_alphabet, dropout, num_units, num_labels, grad_clipping, num_filters, p) logger.info("Network structure: hidden=%d, filter=%d, dropout=%s" % (num_units, num_filters, dropout)) # compute loss num_tokens = mask_var.sum(dtype=theano.config.floatX) num_tokens_nr = mask_nr_var.sum(dtype=theano.config.floatX) # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels] energies_train = lasagne.layers.get_output(network) energies_train_det = lasagne.layers.get_output(network, deterministic=True) energies_eval = lasagne.layers.get_output(network, deterministic=True) loss_train_org = chain_crf_loss(energies_train, target_var, mask_var).mean() energy_shape = energies_train.shape # [batch, 
length, num_labels, num_labels] --> [batch*length, num_labels*num_labels] energies = T.reshape(energies_train, (energy_shape[0] * energy_shape[1], energy_shape[2] * energy_shape[3])) energies = nonlinearities.softmax(energies) energies_det = T.reshape(energies_train_det, (energy_shape[0] * energy_shape[1], energy_shape[2] * energy_shape[3])) energies_det = nonlinearities.softmax(energies_det) # [batch*length, num_labels*num_labels] --> [batch, length*num_labels*num_labels] energies = T.reshape(energies, (energy_shape[0], energy_shape[1] * energy_shape[2] * energy_shape[3])) energies_det = T.reshape(energies_det, (energy_shape[0], energy_shape[1] * energy_shape[2] * energy_shape[3])) loss_train_expect_linear = lasagne.objectives.squared_error(energies, energies_det) loss_train_expect_linear = loss_train_expect_linear.sum(axis=1) loss_train_expect_linear = loss_train_expect_linear.mean() loss_train = loss_train_org + delta * loss_train_expect_linear # l2 regularization? if regular == 'l2': l2_penalty = lasagne.regularization.regularize_network_params(network, lasagne.regularization.l2) loss_train = loss_train + gamma * l2_penalty _, corr_train = chain_crf_accuracy(energies_train, target_var) corr_nr_train = (corr_train * mask_nr_var).sum(dtype=theano.config.floatX) corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX) prediction_eval, corr_eval = chain_crf_accuracy(energies_eval, target_var) corr_nr_eval = (corr_eval * mask_nr_var).sum(dtype=theano.config.floatX) corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX) params = lasagne.layers.get_all_params(network, trainable=True) updates = nesterov_momentum(loss_train, params=params, learning_rate=learning_rate, momentum=momentum) # Compile a function performing a training step on a mini-batch train_fn = theano.function([word_var, char_var, target_var, mask_var, mask_nr_var], [loss_train, loss_train_org, loss_train_expect_linear, corr_train, corr_nr_train, num_tokens, num_tokens_nr], 
updates=updates) # Compile a second function evaluating the loss and accuracy of network eval_fn = theano.function([word_var, char_var, target_var, mask_var, mask_nr_var], [corr_eval, corr_nr_eval, num_tokens, num_tokens_nr, prediction_eval]) # Finally, launch the training loop. logger.info( "Start training: regularization: %s(%f), dropout: %s, delta: %.2f (#training data: %d, batch size: %d, clip: %.1f)..." \ % (regular, (0.0 if regular == 'none' else gamma), dropout, delta, num_data, batch_size, grad_clipping)) num_batches = num_data / batch_size + 1 dev_correct = 0.0 dev_correct_nr = 0.0 best_epoch = 0 test_correct = 0.0 test_correct_nr = 0.0 test_total = 0 test_total_nr = 0 test_inst = 0 lr = learning_rate for epoch in range(1, num_epochs + 1): print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate) train_err = 0.0 train_err_org = 0.0 train_err_linear = 0.0 train_corr = 0.0 train_corr_nr = 0.0 train_total = 0 train_total_nr = 0 train_inst = 0 start_time = time.time() num_back = 0 for batch in xrange(1, num_batches + 1): wids, cids, pids, _, _, masks = data_utils.get_batch(data_train, batch_size) masks_nr = np.copy(masks) masks_nr[:, 0] = 0 err, err_org, err_linear, corr, corr_nr, num, num_nr = train_fn(wids, cids, pids, masks, masks_nr) train_err += err * wids.shape[0] train_err_org += err_org * wids.shape[0] train_err_linear += err_linear * wids.shape[0] train_corr += corr train_corr_nr += corr_nr train_total += num train_total_nr += num_nr train_inst += wids.shape[0] time_ave = (time.time() - start_time) / batch time_left = (num_batches - batch) * time_ave # update log sys.stdout.write("\b" * num_back) log_info = 'train: %d/%d loss: %.4f, loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, acc(no root): %.2f%%, time left (estimated): %.2fs' % ( batch, num_batches, train_err / train_inst, train_err_org / train_inst, train_err_linear / train_inst, train_corr * 100 / train_total, train_corr_nr * 100 / train_total_nr, time_left) 
sys.stdout.write(log_info) num_back = len(log_info) # update training log after each epoch assert train_inst == num_batches * batch_size assert train_total == train_total_nr + train_inst sys.stdout.write("\b" * num_back) print 'train: %d/%d loss: %.4f, loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, acc(no root): %.2f%%, time: %.2fs' % ( train_inst, train_inst, train_err / train_inst, train_err_org / train_inst, train_err_linear / train_inst, train_corr * 100 / train_total, train_corr_nr * 100 / train_total_nr, time.time() - start_time) # evaluate performance on dev data dev_corr = 0.0 dev_corr_nr = 0.0 dev_total = 0 dev_total_nr = 0 dev_inst = 0 for batch in data_utils.iterate_batch(data_dev, batch_size): wids, cids, pids, _, _, masks = batch masks_nr = np.copy(masks) masks_nr[:, 0] = 0 corr, corr_nr, num, num_nr, predictions = eval_fn(wids, cids, pids, masks, masks_nr) dev_corr += corr dev_corr_nr += corr_nr dev_total += num dev_total_nr += num_nr dev_inst += wids.shape[0] assert dev_total == dev_total_nr + dev_inst print 'dev corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%%' % ( dev_corr, dev_total, dev_corr * 100 / dev_total, dev_corr_nr, dev_total_nr, dev_corr_nr * 100 / dev_total_nr) if dev_correct_nr < dev_corr_nr: dev_correct = dev_corr dev_correct_nr = dev_corr_nr best_epoch = epoch # evaluate on test data when better performance detected test_corr = 0.0 test_corr_nr = 0.0 test_total = 0 test_total_nr = 0 test_inst = 0 for batch in data_utils.iterate_batch(data_test, batch_size): wids, cids, pids, _, _, masks = batch masks_nr = np.copy(masks) masks_nr[:, 0] = 0 corr, corr_nr, num, num_nr, predictions = eval_fn(wids, cids, pids, masks, masks_nr) test_corr += corr test_corr_nr += corr_nr test_total += num test_total_nr += num_nr test_inst += wids.shape[0] assert test_total + test_total_nr + test_inst test_correct = test_corr test_correct_nr = test_corr_nr print "best dev corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, 
acc: %.2f%% (epoch: %d)" % ( dev_correct, dev_total, dev_correct * 100 / dev_total, dev_correct_nr, dev_total_nr, dev_correct_nr * 100 / dev_total_nr, best_epoch) print "best test corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % ( test_correct, test_total, test_correct * 100 / test_total, test_correct_nr, test_total_nr, test_correct_nr * 100 / test_total_nr, best_epoch) if epoch in schedule: lr = lr * decay_rate updates = nesterov_momentum(loss_train, params=params, learning_rate=lr, momentum=momentum) train_fn = theano.function([word_var, char_var, target_var, mask_var, mask_nr_var], [loss_train, loss_train_org, loss_train_expect_linear, corr_train, corr_nr_train, num_tokens, num_tokens_nr], updates=updates)
def train_setup():
    """Build the CNN, its Nesterov-momentum updates and compiled functions.

    Returns ``(network, train_fn, val_fn)`` where ``train_fn(X, y)`` runs one
    training step and returns the (unregularized) cross-entropy loss, and
    ``val_fn(X, y)`` returns the deterministic loss, raw predictions and a
    per-example correctness vector. Configuration (image geometry, learning
    rate, regularization weights, optional initial weights) comes from the
    module-level ``config`` object.
    """
    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    print(" with input dimension {0},{1},{2}".format(config.image_height, \
                                                     config.image_width, \
                                                     config.image_channel))
    network = cnn_archi(input_var, \
                        config.image_channel, \
                        config.image_height, config.image_width, \
                        config.output_length)
    print('Number of parameters : {0}'.format(count_params(network)))

    # Optionally warm-start from a saved .npz parameter dump.
    if (config.init_model is not None):
        with np.load(config.init_model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        set_all_param_values(network, param_values)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    ent_loss = categorical_crossentropy(prediction, target_var)
    ent_loss = ent_loss.mean()

    # L1/L2 penalties over the whole network, weighted from config.
    l1_regu = config.l1_regu * regularize_network_params(network, l1)
    l2_regu = config.l2_regu * regularize_network_params(network, l2)
    loss = ent_loss + l1_regu + l2_regu

    # Create update expressions for training: Stochastic Gradient Descent
    # (SGD) with Nesterov momentum.
    params = get_all_params(network, trainable=True)
    #grads = T.grad( loss, params )
    #scaled_grads = norm_constraint( grads, 5. )
    updates = nesterov_momentum(loss, params, \
                                learning_rate=config.learning_rate, \
                                momentum=config.momentum)
    #updates = rmsprop( loss , params, learning_rate = config.learning_rate )

    # Post-process each parameter's update with a max-norm constraint scaled
    # by 5x the mean of that parameter's current norms; 1-D parameters are
    # constrained along axis 0.
    # NOTE(review): this mutates the `updates` dict in place for every
    # *regularizable* parameter — order/identity of keys matters here.
    for param in get_all_params(network, regularizable=True):
        norm_axis = None
        if param.ndim == 1:
            norm_axis = [0]
        updates[param] = norm_constraint(updates[param], \
                                         5. * compute_norms(param.get_value()).mean(),
                                         norm_axes=norm_axis)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = get_output(network, deterministic=True)
    test_classes = T.argmax(test_prediction, axis=1)
    test_loss = categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()

    # As a bonus, also create an expression for the classification accuracy
    # (a per-example 0/1 vector, not averaged).
    test_acc = T.eq(test_classes, target_var)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], \
                               ent_loss, \
                               updates=updates, \
                               allow_input_downcast=True)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], \
                             [test_loss, test_prediction, test_acc], \
                             allow_input_downcast=True)
    return network, train_fn, val_fn
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):
    """Build a 1D-convolution event-span classifier and compile its functions.

    Args:
        args: parsed options; uses ``args.hiddenDim``, ``args.optimizer``
            and ``args.step`` (learning rate).
        input_var: symbolic int matrix of word ids, shape (batch, seqlen, num_feats).
        target_var: symbolic target for the binary objective.
        wordEmbeddings: embedding matrix, shape (wordDim, vocab_size).
        seqlen: sentence length.
        num_feats: number of word-id features per position.

    Returns:
        (train_fn, val_fn, network): training step, evaluation function
        returning (loss, accuracy), and the output layer.

    Raises:
        ValueError: if ``args.optimizer`` is not a recognized name.
    """
    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]
    kw = 2
    # one filter per valid window of width kw over the sequence
    num_filters = seqlen - kw + 1

    # renamed from `input`/`layers` to avoid shadowing the builtin and the
    # lasagne.layers module alias used elsewhere in this file
    input_layer = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input_layer.input_var.shape
    emb = EmbeddingLayer(input_layer, input_size=vocab_size, output_size=wordDim,
                         W=wordEmbeddings.T)
    # (batchsize, seqlen, num_feats, wordDim) -> treat features as one channel dim
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))

    # nOutputFrame = num_filters,
    # nOutputFrameSize = (num_feats*wordDim - filter_size)/stride + 1
    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim,
                         stride=1, nonlinearity=tanh, W=GlorotUniform())
    conv1d = DimshuffleLayer(conv1d, (0, 2, 1))

    pool_size = num_filters
    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    # training loss: binary cross-entropy plus weighted L2 on all major layers
    prediction = get_output(network)
    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4
    penalized_layers = {emb: lambda_val, conv1d: lambda_val,
                        hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(penalized_layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)
    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # BUG FIX: raising a bare string is illegal (TypeError); raise a
        # proper exception, consistent with create_updates() above.
        raise ValueError("Need set optimizer correctly")

    # deterministic pass for evaluation
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var],
                               loss, updates=updates, allow_input_downcast=True)
    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var],
                             [test_loss, test_acc], allow_input_downcast=True)
    return train_fn, val_fn, network
def _prepare(self, X, y, X_valid=None, y_valid=None, sample_weight=None,
             whole_dataset_in_device=True):
    """Compile the training machinery and return the epoch-loop callbacks.

    Encodes labels (for classification), builds the model and its loss
    (optionally sample-weighted and L1/L2-regularized), compiles per-batch
    update functions (optionally with the whole dataset on the device), and
    returns ``(iter_update, quitter, monitor, observer)``:

    - iter_update(epoch): runs one epoch and returns an OrderedDict of stats;
    - quitter(update_status): early-stopping ("patience") check;
    - monitor / observer: no-op hooks.
    """
    self._stats = []
    self._class_label_encoder = LabelEncoder()
    if self.is_classification is True:
        self._class_label_encoder.fit(y)
        self.classes_ = self._class_label_encoder.classes_
        y = self._class_label_encoder.transform(y).astype(y.dtype)
    self.y_train_transformed = y

    if y_valid is not None:
        # BUG FIX: the encoder is only fitted for classification; for
        # regression the validation targets are used as-is (previously this
        # called transform() on an unfitted encoder).
        if self.is_classification is True:
            y_valid_transformed = self._class_label_encoder.transform(
                y_valid).astype(y_valid.dtype)
        else:
            y_valid_transformed = y_valid

    self._l_x_in = layers.InputLayer(shape=(None, X.shape[1]))

    batch_index, X_batch, y_batch, batch_slice = get_theano_batch_variables(
        self.batch_size, y_softmax=self.is_classification)

    if sample_weight is not None:
        t_sample_weight = T.vector('sample_weight')
        sample_weight = sample_weight.astype(theano.config.floatX)
    else:
        t_sample_weight = T.scalar('sample_weight')

    if self.is_classification is True:
        y_dim = len(set(y.flatten().tolist()))
    else:
        y_dim = y.shape[1]

    self._prediction_layer = self._build_model(y_dim)
    self._layers = layers.get_all_layers(self._prediction_layer)
    self._build_prediction_functions(X_batch, self._prediction_layer)

    if self.input_noise_function is None:
        output = layers.get_output(self._prediction_layer, X_batch)
    else:
        X_batch_noisy = self.input_noise_function(X_batch)
        output = layers.get_output(self._prediction_layer, X_batch_noisy)

    # negative log-likelihood for classification, weighted MSE otherwise
    if self.is_classification:
        loss = -T.mean(t_sample_weight *
                       T.log(output)[T.arange(y_batch.shape[0]), y_batch])
    else:
        loss = T.mean(t_sample_weight * T.sum((output - y_batch) ** 2, axis=1))
    loss_unreg = loss

    all_params = layers.get_all_params(self._prediction_layer)
    if self._output_softener_coefs is not None:
        all_params.append(self._output_softener_coefs)

    # regularization
    # BUG FIX: was get_all_param_values(), which returns *numpy arrays* —
    # the penalty was a constant with zero gradient. Use the symbolic
    # regularizable parameters so L1/L2 actually affect training.
    W_params = layers.get_all_params(
        self._prediction_layer, regularizable=True)
    if self.L1_factor is not None:
        for L1_factor_layer, W in zip(self.L1_factor, W_params):
            loss = loss + L1_factor_layer * T.sum(abs(W))
    if self.L2_factor is not None:
        for L2_factor_layer, W in zip(self.L2_factor, W_params):
            loss = loss + L2_factor_layer * T.sum(W ** 2)

    if self.optimization_method == 'nesterov_momentum':
        gradient_updates = updates.nesterov_momentum(
            loss, all_params,
            learning_rate=self.learning_rate, momentum=self.momentum)
    elif self.optimization_method == 'adadelta':
        # don't need momentum there
        gradient_updates = updates.adadelta(
            loss, all_params, learning_rate=self.learning_rate)
    elif self.optimization_method == 'adam':
        # BUG FIX: lasagne exposes `adam`, not `Adam`
        gradient_updates = updates.adam(
            loss, all_params, learning_rate=self.learning_rate)
    elif self.optimization_method == 'momentum':
        gradient_updates = updates.momentum(
            loss, all_params,
            learning_rate=self.learning_rate, momentum=self.momentum)
    elif self.optimization_method == 'adagrad':
        # BUG FIX: was updates.adadelta (copy-paste)
        gradient_updates = updates.adagrad(
            loss, all_params, learning_rate=self.learning_rate)
    elif self.optimization_method == 'rmsprop':
        # BUG FIX: was updates.adadelta (copy-paste)
        gradient_updates = updates.rmsprop(
            loss, all_params, learning_rate=self.learning_rate)
    elif self.optimization_method == 'sgd':
        gradient_updates = updates.sgd(
            loss, all_params, learning_rate=self.learning_rate)
    else:
        raise Exception("wrong optimization method")

    nb_batches = X.shape[0] // self.batch_size
    if (X.shape[0] % self.batch_size) != 0:
        nb_batches += 1

    X = X.astype(theano.config.floatX)
    if self.is_classification == True:
        y = y.astype(np.int32)
    else:
        y = y.astype(theano.config.floatX)

    if whole_dataset_in_device == True:
        # keep the full dataset on the device and index it with batch_slice
        X_shared = theano.shared(X, borrow=True)
        y_shared = theano.shared(y, borrow=True)
        givens = {
            X_batch: X_shared[batch_slice],
            y_batch: y_shared[batch_slice]
        }
        if sample_weight is not None:
            sample_weight_shared = theano.shared(sample_weight, borrow=True)
            givens[t_sample_weight] = sample_weight_shared[batch_slice]
        else:
            givens[t_sample_weight] = T.as_tensor_variable(
                np.array(1., dtype=theano.config.floatX))
        iter_update_batch = theano.function(
            [batch_index],
            loss,
            updates=gradient_updates,
            givens=givens,
        )
    else:
        # feed each batch from host memory
        if sample_weight is None:
            iter_update_gradients = theano.function(
                [X_batch, y_batch],
                loss,
                updates=gradient_updates,
                givens={t_sample_weight: T.as_tensor_variable(
                    np.array(1., dtype=theano.config.floatX))},
            )

            def iter_update_batch(batch_index):
                sl = slice(batch_index * self.batch_size,
                           (batch_index + 1) * self.batch_size)
                return iter_update_gradients(X[sl], y[sl])
        else:
            iter_update_gradients = theano.function(
                [X_batch, y_batch, t_sample_weight],
                loss,
                updates=gradient_updates
            )

            def iter_update_batch(batch_index):
                sl = slice(batch_index * self.batch_size,
                           (batch_index + 1) * self.batch_size)
                return iter_update_gradients(X[sl], y[sl], sample_weight[sl])

    self._iter_update_batch = iter_update_batch
    self._get_loss = theano.function(
        [X_batch, y_batch, t_sample_weight], loss_unreg,
        allow_input_downcast=True)

    def iter_update(epoch):
        # run all batches once, then (optionally) apply max-norm and report
        losses = []
        for i in xrange(nb_batches):
            losses.append(self._iter_update_batch(i))
        # max norm
        if self.max_norm is not None:
            for max_norm_layer, layer in zip(self.max_norm, self._layers):
                layer.W = updates.norm_constraint(layer.W, self.max_norm)
        losses = np.array(losses)

        d = OrderedDict()
        d["epoch"] = epoch
        d["loss_train"] = self._get_loss(
            self.X_train, self.y_train_transformed, 1.)
        d["accuracy_train"] = (
            self.predict(self.X_train) == self.y_train).mean()
        if X_valid is not None and y_valid is not None:
            d["loss_valid"] = self._get_loss(X_valid, y_valid_transformed, 1.)
            if self.is_classification == True:
                d["accuracy_valid"] = (
                    self.predict(X_valid) == y_valid).mean()
        if self.verbose > 0:
            if (epoch % self.report_each) == 0:
                print(tabulate([d], headers="keys"))
        self._stats.append(d)
        return d

    def quitter(update_status):
        cur_epoch = len(self._stats) - 1
        if self.patience_nb_epochs > 0:
            # patience heuristic (for early stopping)
            cur_patience_stat = update_status[self.patience_stat]
            if self.cur_best_patience_stat is None:
                self.cur_best_patience_stat = cur_patience_stat
                first_time = True
            else:
                first_time = False
            thresh = self.patience_progression_rate_threshold
            if cur_patience_stat < self.cur_best_patience_stat * thresh or first_time:
                if self.verbose >= 2:
                    fmt = "--Early stopping-- good we have a new best value : {0}={1}, last best : epoch {2}, value={3}"
                    print(fmt.format(self.patience_stat, cur_patience_stat,
                                     self.cur_best_epoch, self.cur_best_patience_stat))
                self.cur_best_epoch = cur_epoch
                self.cur_best_patience_stat = cur_patience_stat
                # snapshot the best model (get_state if available, else pickle)
                if hasattr(self, "set_state") and hasattr(self, "get_state"):
                    self.cur_best_model = self.get_state()
                else:
                    self.cur_best_model = pickle.dumps(
                        self.__dict__, protocol=pickle.HIGHEST_PROTOCOL)
            if (cur_epoch - self.cur_best_epoch) >= self.patience_nb_epochs:
                finish = True
                # restore the best snapshot before quitting
                if hasattr(self, "set_state") and hasattr(self, "get_state"):
                    self.set_state(self.cur_best_model)
                else:
                    self.__dict__.update(pickle.loads(self.cur_best_model))
                self._stats = self._stats[0:self.cur_best_epoch + 1]
                if self.verbose >= 2:
                    print("out of patience...take the model at epoch {0} and quit".format(
                        self.cur_best_epoch + 1))
            else:
                finish = False
            return finish
        else:
            return False

    def monitor(update_status):
        pass

    def observer(monitor_output):
        pass

    return (iter_update, quitter, monitor, observer)
def main():
    """Train a speaker-classification CNN on batches of speech features.

    Builds the network, compiles Theano train/valid functions with Nesterov
    momentum, then loops over epochs loading .npy feature files per batch.
    Periodically validates and snapshots parameters to gzip'd pickles under
    speech_params/. Reads module-level config (BATCHSIZE, speechSize,
    train_files, num_labels_train, train_load_path, ...) and updates the
    module-level `valid_best`.

    NOTE(review): the source for this function had its line breaks mangled;
    the nesting below is a careful reconstruction — confirm against VCS.
    """
    # load the training and validation data sets
    # labels=int(0.7*speech.all_count)
    global valid_best
    X = T.tensor4()
    Y = T.matrix()
    # set up theano functions to generate output by feeding data through network
    output_layer, hidden_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    # deterministic=True disables stochastic layers (e.g. dropout) for validation
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)
    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))
    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)
    # get parameters from network and set up sgd with nesterov momentum to update parameters
    params = lasagne.layers.get_all_params(output_layer, trainable=True)
    print params
    updates = nesterov_momentum(loss_train, params, learning_rate=0.2, momentum=0.9)
    # updates =lasagne.updates.sgd(loss_train, params, learning_rate=0.1)
    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=[loss_train, pred], updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=[loss_valid, pred_valid], allow_input_downcast=True)
    # predict_valid = theano.function(inputs=[X], outputs=[pred_valid], allow_input_downcast=True)
    # loop over training functions for however many iterations, print information while training
    train_eval = []
    valid_eval = []
    valid_acc = []
    # disabled warm-start path: flip to `if 1:` to resume from a saved snapshot
    if 0:
        # pre_params = 'speech_params/validbest_cnn_spkall_22_0.95458984375_2017-06-24 00:50:39.pklz'
        pre_params = 'speech_params/validbest_cnn_fisher_0_idxbatch20000_0.65625_2017-08-17 16:14:04.pklz'
        # 1 shot: 99.0 after ~2000 iterations (translated from Chinese comment)
        load_params = pickle.load(gzip.open(pre_params))
        # load_params=load_params[:-2]
        lasagne.layers.set_all_param_values(output_layer, load_params)
        print 'load params: ', pre_params, '\n'
    for i in range(450):
        # fresh shuffled train/valid split every epoch
        train_list, valid_list = get_data(train_files, num_speechs, size, valid_size)
        # Python 2 integer division: partial trailing batch is dropped
        batch_total_number = size / BATCHSIZE
        # cap validation cost by truncating the validation list
        if 1:
            valid_list_limit = 3000
            valid_list = valid_list[:valid_list_limit]
        print 'batch_total_number:', batch_total_number
        train_acc_aver = 0.0
        for idx_batch in range(batch_total_number):
            batch_list = train_list[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE]
            xx_batch = np.zeros((BATCHSIZE, 1, speechSize[0], speechSize[1]))
            yy_batch = np.zeros((BATCHSIZE, num_labels_train))
            for ind, one_name in enumerate(batch_list):
                # entries look like "<file>.<frame_index>"; the .npy holds an
                # array of feature frames indexed by aim_idx — TODO confirm
                aim_file_name, aim_idx = one_name.split('.')
                aim_file_name += '.npy'
                aim_file_name_full = train_load_path + aim_file_name
                xx_batch[ind, 0] = np.load(aim_file_name_full)[int(aim_idx)]
                # one-hot label derived from the file's position in train_files
                yy_batch[ind] = label_binarize([train_files.index(aim_file_name)], range(num_labels_train))[0]
            train_loss, pred = train(xx_batch, yy_batch)
            count = np.count_nonzero(np.int32(pred == np.argmax(yy_batch, axis=1)))
            train_acc = float(count) / (BATCHSIZE)
            print i, idx_batch, '| Tloss:', train_loss, '| Count:', count, '| Acc:', train_acc
            print pred
            print np.argmax(yy_batch, axis=1)
            print "time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            train_acc_aver += train_acc
            # raise EOFError
            # validate only once training looks good, and only every 1000 batches
            if train_acc > 0.9 and idx_batch % 1000 == 0 and idx_batch >= 0:
                acc = 0
                valid_batch_number = len(valid_list) / BATCHSIZE
                for j in tqdm(range(valid_batch_number)):
                    batch_list = valid_list[j * BATCHSIZE:(j + 1) * BATCHSIZE]
                    x_batch = np.zeros((BATCHSIZE, 1, speechSize[0], speechSize[1]))
                    y_batch = np.zeros((BATCHSIZE, num_labels_train))
                    for ind, one_name in enumerate(batch_list):
                        aim_file_name, aim_idx = one_name.split('.')
                        aim_file_name += '.npy'
                        aim_file_name_full = train_load_path + aim_file_name
                        x_batch[ind, 0] = np.load(aim_file_name_full)[int(aim_idx)]
                        y_batch[ind] = label_binarize([train_files.index(aim_file_name)], range(num_labels_train))[0]
                    valid_loss, pred = valid(x_batch, y_batch)
                    acc += np.count_nonzero(np.int32(pred == np.argmax(y_batch, axis=1)))
                acc = float(acc) / (valid_batch_number * BATCHSIZE)
                print 'iter:', i, idx_batch, '| Vloss:', valid_loss, '|Acc:', acc
                # snapshot whenever validation accuracy improves
                if acc > valid_best:
                    print 'new valid_best:', valid_best, '-->', acc
                    valid_best = acc
                    all_params = helper.get_all_param_values(output_layer)
                    f = gzip.open('speech_params/validbest_cnn_fisher_{}_validacc{}_{}.pklz'.format(i, valid_best, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
                    pickle.dump(all_params, f)
                    f.close()
            # unconditional periodic snapshot every 3000 batches
            if idx_batch % 3000 == 0 and idx_batch > 0:
                all_params = helper.get_all_param_values(output_layer)
                f = gzip.open('speech_params/validbest_cnn_fisher_{}_idxbatch{}_{}.pklz'.format(i, idx_batch, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
                pickle.dump(all_params, f)
                f.close()
        # save weights (every epoch; `i % 1 == 0` is always true)
        if i % 1 == 0:
            all_params = helper.get_all_param_values(output_layer)
            f = gzip.open('speech_params/validbest_cnn_fisher_averacc{}_{}_{}.pklz'.format(i, train_acc_aver / batch_total_number, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
            pickle.dump(all_params, f)
            f.close()
# --- training setup: L2-regularized loss, schedulable learning rate, and
# compiled train/valid functions. Relies on `output_layer`, `output_test`,
# `loss`, `X`, `Y`, and `LR_SCHEDULE` defined earlier in the file.
all_layers = lasagne.layers.get_all_layers(output_layer)
# small weight decay over every layer's parameters
l2_penalty = lasagne.regularization.regularize_layer_params(
    all_layers, lasagne.regularization.l2) * 0.0001
loss = loss + l2_penalty

# set up loss functions for validation dataset
test_loss = lasagne.objectives.categorical_crossentropy(output_test, Y)
test_loss = test_loss.mean()
# NOTE(review): comparing argmax indices against Y directly implies Y holds
# integer class ids here (not one-hot) — confirm against the data loader
test_acc = T.mean(T.eq(T.argmax(output_test, axis=1), Y), dtype=theano.config.floatX)

# get parameters from network and set up sgd with nesterov momentum to update
# parameters; l_r is a shared variable so the schedule can change it in-place
l_r = theano.shared(np.array(LR_SCHEDULE[0], dtype=theano.config.floatX))
params = lasagne.layers.get_all_params(output_layer, trainable=True)
updates = nesterov_momentum(loss, params, learning_rate=l_r, momentum=0.9)
#updates = adam(loss, params, learning_rate=l_r)

# set up training and prediction functions
train_fn = theano.function(inputs=[X, Y], outputs=loss, updates=updates)
valid_fn = theano.function(inputs=[X, Y], outputs=[test_loss, test_acc])

''' load training data and start training '''
# load the training and validation data sets
train_X, test_X, train_y, test_y = load_pickle_data_cv()
print 'Train shape:', train_X.shape, 'Test shape:', test_X.shape
print 'Train y shape:', train_y.shape, 'Test y shape:', test_y.shape
# quick sanity check of the input value range / centering
print np.amax(train_X), np.amin(train_X), np.mean(train_X)
def main(): # load model and parameters output_layer = lasagne_model() f = gzip.open('data/weights.pklz', 'rb') all_params = pickle.load(f) f.close() X = T.ftensor4() Y = T.fmatrix() # set up theano functions to generate output by feeding data through network output_layer = lasagne_model() output_train = lasagne.layers.get_output(output_layer, X) output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True) # set up the loss that we aim to minimize loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y)) loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y)) # prediction functions for classifications pred = T.argmax(output_train, axis=1) pred_valid = T.argmax(output_valid, axis=1) # get parameters from network and set up sgd with nesterov momentum to update parameters helper.set_all_param_values(output_layer, all_params) params = lasagne.layers.get_all_params(output_layer) updates = nesterov_momentum(loss_train, params, learning_rate=0.0001, momentum=0.9) # set up training and prediction functions train = theano.function(inputs=[X, Y], outputs=loss_train, updates=updates, allow_input_downcast=True) valid = theano.function(inputs=[X, Y], outputs=loss_valid, allow_input_downcast=True) predict_valid = theano.function(inputs=[X], outputs=pred_valid, allow_input_downcast=True) # fine tune network train_X, test_X, train_y, test_y = load_data_cv('data/train.csv') train_eval = [] valid_eval = [] valid_acc = [] try: for i in range(5): train_loss = batch_iterator_no_aug(train_X, train_y, BATCHSIZE, train) train_eval.append(train_loss) valid_loss = valid(test_X, test_y) valid_eval.append(valid_loss) acc = np.mean(np.argmax(test_y, axis=1) == predict_valid(test_X)) valid_acc.append(acc) print 'iter:', i, '| Tloss:', train_loss, '| Vloss:', valid_loss, '| valid acc:', acc except KeyboardInterrupt: pass # after training create output for kaggle testing_inputs = load_test_data('data/test.csv') predictions = [] for j in 
range((testing_inputs.shape[0] + BATCHSIZE -1) // BATCHSIZE): sl = slice(j * BATCHSIZE, (j + 1) * BATCHSIZE) X_batch = testing_inputs[sl] predictions.extend(predict_valid(X_batch)) out = pd.read_csv('data/convnet_preds.csv') out['Label'] = predictions out.to_csv('preds/convnet_preds.csv', index = False)
def build_network_2dconv(
    args, input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var, wordEmbeddings, maxlen=36
):
    """Build a siamese 2D-convolution sentence-pair model and compile
    train/validation functions.

    Each sentence goes through embedding -> single 2D conv over the full word
    dimension -> max-pool over time -> flatten; the two sentence vectors are
    concatenated and fed to a sigmoid hidden layer and a softmax output
    (5-way for 'sts', 3-way for 'ent').

    Parameters: `args` supplies hiddenDim, task, optimizer, step; the
    *_mask_var parameters are accepted but unused (kept for interface
    compatibility). Returns (train_fn, val_fn).

    Fixes vs. original: `raise "..."` raised a string (a TypeError at
    runtime) and is now a proper ValueError; a ConcatLayer built from
    mul/sub similarity features was dead code (immediately overwritten)
    and has been removed.
    """
    print ("Building model with 2D Convolution")

    # NOTE(review): rows of wordEmbeddings are dimensions, columns are vocab
    # entries (it is transposed below) — confirm against the embedding loader
    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config: conv spans the whole word dimension, so output is
    # (batch, filters, maxlen - 3 + 1, 1); pool collapses the time axis
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    # --- sentence 1 branch ---
    input_1 = InputLayer((None, maxlen), input_var=input1_var)
    batchsize, seqlen = input_1.input_var.shape
    emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    # embeddings are frozen: remove them from the trainable set
    emb_1.params[emb_1.W].remove("trainable")
    reshape_1 = ReshapeLayer(emb_1, (batchsize, 1, maxlen, wordDim))
    conv2d_1 = Conv2DLayer(
        reshape_1,
        num_filters=num_filters,
        filter_size=(filter_size),
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, 34, 1)
    maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size)  # (None, 100, 1, 1)
    forward_1 = FlattenLayer(maxpool_1)  # (None, 100)

    # --- sentence 2 branch (same architecture, separate weights) ---
    input_2 = InputLayer((None, maxlen), input_var=input2_var)
    emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_2.params[emb_2.W].remove("trainable")
    reshape_2 = ReshapeLayer(emb_2, (batchsize, 1, maxlen, wordDim))
    conv2d_2 = Conv2DLayer(
        reshape_2,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, 34, 1)
    maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size)  # (None, 100, 1, 1)
    forward_2 = FlattenLayer(maxpool_2)  # (None, 100)

    # NOTE(review): the original also built elementwise-mul and abs-sub
    # similarity features here but discarded them; plain concatenation is
    # the effective behavior and is kept
    concat = ConcatLayer([forward_1, forward_2])
    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    if args.task == "sts":
        network = DenseLayer(hid, num_units=5, nonlinearity=softmax)
    elif args.task == "ent":
        network = DenseLayer(hid, num_units=3, nonlinearity=softmax)

    # training loss with weighted L2 on conv, hidden, and output layers
    prediction = get_output(network)
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4
    layers = {conv2d_1: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)
    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # was `raise "Need set optimizer correctly"` — raising a string is
        # itself a TypeError; raise a real exception instead
        raise ValueError("Need set optimizer correctly")

    # deterministic pass (no dropout/noise) for evaluation
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input1_var, input2_var, target_var], loss, updates=updates, allow_input_downcast=True)

    if args.task == "sts":
        # regression-style eval: return raw predictions for correlation scoring
        val_fn = theano.function(
            [input1_var, input2_var, target_var], [test_loss, test_prediction], allow_input_downcast=True
        )
    elif args.task == "ent":
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function([input1_var, input2_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn
def word2vec( files=[], directories=[], skip=[], save_dir=None, num_epochs=5, unigram_dictionary=None, noise_ratio=15, kernel=[1,2,3,4,5,5,4,3,2,1], t = 1.0e-5, batch_size = 1000, # Number of *signal* examples per batch num_embedding_dimensions=500, word_embedding_init=Normal(), context_embedding_init=Normal(), learning_rate=0.1, momentum=0.9, num_processes=3, load_dictionary_dir=None, min_frequency=10, macrobatch_size = 100000, max_queue_size=0, verbose=True ): ''' Helper function that handles all concerns involved in training A word2vec model using the approach of Mikolov et al. It surfaces all of the options. For customizations going beyond simply tweeking existing options and hyperparameters, substitute this function by writing your own training routine using the provided classes. This function would be a starting point for you. ''' # Make a Word2VecMinibatcher, pass through parameters sent by caller reader = DatasetReader( files=files, directories=directories, skip=skip, noise_ratio=noise_ratio, t=t, num_processes=num_processes, unigram_dictionary=unigram_dictionary, kernel=kernel, max_queue_size=max_queue_size, macrobatch_size=macrobatch_size, verbose=verbose ) # Prepare the minibatch generator # (this produces the counter_sampler stats) if load_dictionary_dir is None and unigram_dictionary is None: if verbose: print 'preparing dictionaries...' reader.prepare(save_dir=save_dir) # If min_frequency was specified, prune the dictionaries if min_frequency is not None: if verbose: print 'prunning dictionaries...' reader.prune(min_frequency) # Make a symbolic minibatcher minibatcher = NoiseContrastiveTheanoMinibatcher( batch_size=batch_size, noise_ratio=noise_ratio, dtype="int32", num_dims=2 ) # Make a Word2VecEmbedder object, feed it the combined input. # Note that the full batch includes noise examples and signal_examples # so is larger than batch_size, which is the number of signal_examples # only per batch. 
full_batch_size = batch_size * (1 + noise_ratio) embedder = Word2VecEmbedder( input_var=minibatcher.get_batch(), batch_size=full_batch_size, vocabulary_size=reader.get_vocab_size(), num_embedding_dimensions=num_embedding_dimensions, word_embedding_init=word_embedding_init, context_embedding_init=context_embedding_init ) # Architectue is ready. Make the loss function, and use it to create # the parameter updates responsible for learning loss = get_noise_contrastive_loss(embedder.get_output(), batch_size) updates = nesterov_momentum( loss, embedder.get_params(), learning_rate, momentum ) # Include minibatcher updates, which cause the symbolic batch to move # through the dataset like a sliding window updates.update(minibatcher.get_updates()) # Use the loss function and the updates to compile a training function. # Note that it takes no inputs because the dataset is fully loaded using # theano shared variables train = function([], loss, updates=updates) # Iterate through the dataset, training the embeddings for epoch in range(num_epochs): if verbose: print 'starting epoch %d' % epoch macrobatches = reader.generate_dataset_serial() macrobatch_num = 0 for signal_macrobatch, noise_macrobatch in macrobatches: macrobatch_num += 1 if verbose: print 'running macrobatch %d' % (macrobatch_num - 1) minibatcher.load_dataset(signal_macrobatch, noise_macrobatch) losses = [] for batch_num in range(minibatcher.get_num_batches()): if verbose: print 'running minibatch', batch_num losses.append(train()) if verbose: print '\taverage loss: %f' % np.mean(losses) # Save the model (the embeddings) if save_dir was provided if save_dir is not None: embedder.save(save_dir) # Return the trained embedder and the dictionary mapping tokens # to ids return embedder, reader
# load datasets X_train_fc7 = theano.shared(np.load('/root/proj/MIT_dumped/X_train_fc7.npy').astype(theano.config.floatX)) X_test_fc7 = theano.shared(np.load('/root/proj/MIT_dumped/X_test_fc7.npy').astype(theano.config.floatX)) all_params = layers.get_all_params(output) objective = objectives.Objective(output,loss_function=objectives.multinomial_nll) loss_train = objective.get_loss([X_batch_one, X_batch_two], target=y_batch) LEARNING_RATE =0.0122 MOMENTUM=0.9 REG = .0009 reg_loss = regularization.l2(output) * REG total_loss = loss_train + reg_loss upds = updates.nesterov_momentum(total_loss, all_params, LEARNING_RATE, MOMENTUM) pred = T.argmax( output.get_output([X_batch_one, X_batch_two], deterministic=True), axis=1) accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) print "begin compiling" givens = {X_batch_one: X_train_fc6[batch_index*batch_size:(batch_index+1)*batch_size], X_batch_two: X_train_fc7[batch_index*batch_size:(batch_index+1)*batch_size], y_batch: y_train[batch_index*batch_size:(batch_index+1)*batch_size]} train = theano.function([batch_index], loss_train, updates=upds, givens=givens) test = theano.function([], accuracy, givens={X_batch_one:X_test_fc6, X_batch_two:X_test_fc7, y_batch:y_test}) num_epochs = 1000 for epoch in range(num_epochs): print "epoch %s" % epoch for batch in range(total/batch_size): loss = train(batch)
def update(all_grads, all_params, learning_rate, momentum=None):
    """Build Nesterov-momentum update rules for the given gradients.

    Parameters:
        all_grads: list of gradient expressions (one per parameter).
        all_params: list of shared-variable parameters to update.
        learning_rate: scalar (or shared) learning rate.
        momentum: momentum coefficient; when None (the default) falls back
            to the module-level ``m``, preserving the original behavior.

    Returns the OrderedDict of updates produced by ``nesterov_momentum``.
    """
    if momentum is None:
        # original implementation read the implicit global `m`; kept as the
        # fallback for backward compatibility
        momentum = m
    return nesterov_momentum(all_grads, all_params, learning_rate, momentum=momentum)