def build_fn(self):
    all_inputs = []
    for i in range(len(self.objective.examples)):
        example = self.objective.examples[i]
        for j in range(len(example.const_list)):
            for f_name in sorted(self.fea_vecs.keys()):
                all_inputs.append(self.input_vars[(i, j, f_name)])
    for label in sorted(self.output_targets.keys()):
        all_inputs.append(self.output_targets[label])
    for e_name in sorted(self.graph_targets.keys()):
        all_inputs.append(self.graph_targets[e_name])
    params = self.parameters.values()
    updates = lasagne.updates.sgd(self.loss, params, learning_rate=1e0)
    self.train_fn = theano.function(all_inputs, self.loss, updates=updates, on_unused_input='ignore')

    test_inputs = []
    for i in range(len(self.objective.examples)):
        example = self.objective.examples[i]
        if len(example.const_list) != 1 or len(example.var_list) != 1:
            continue
        for f_name in sorted(self.fea_vecs.keys()):
            test_inputs.append(self.input_vars[(i, 0, f_name)])
    for label in sorted(self.output_targets.keys()):
        test_inputs.append(self.output_targets[label])
    py_sym = T.concatenate([self.output_vars[k].dimshuffle(0, 'x')
                            for k in sorted(self.output_vars.keys())], axis=1)
    y_sym = T.concatenate([self.output_targets[k].dimshuffle(0, 'x')
                           for k in sorted(self.output_targets.keys())], axis=1)
    acc = T.mean(T.eq(T.argmax(py_sym, axis=1), T.argmax(y_sym, axis=1)))
    self.test_fn = theano.function(test_inputs, acc, on_unused_input='ignore')
def compile_model(self, weightMatrix=None):
    x = T.vector('x')    # Features
    y = T.iscalar('y')   # (Gold) Label
    params = self.hidden_layers_params[:]

    # Creating the first hidden layer with the x symbolic vector
    n_in, n_out = params.pop(0)
    self.hidden_layers.append(HL.HiddenLayer(x, n_in, n_out))
    if weightMatrix:
        self.hidden_layers[0].setW(weightMatrix[0][0], weightMatrix[0][1])
        weightMatrix.pop(0)

    # Creating the remaining hidden layers
    # Each layer's input is the previous layer's output
    for i in xrange(len(params)):
        n_in, n_out = params[i]
        self.hidden_layers.append(HL.HiddenLayer(self.hidden_layers[-1].output, n_in, n_out))
        if weightMatrix:
            self.hidden_layers[-1].setW(weightMatrix[i][0], weightMatrix[i][1])

    # Creating the logistic regression layer
    self.logreg_layer = LL.LogRegLayer(self.hidden_layers[-1].output,
                                       self.hidden_layers[-1].n_out,
                                       len(self.classes))
    if weightMatrix:
        self.logreg_layer.setW(weightMatrix[-1][0], weightMatrix[-1][1])

    # Calculating the cost of the network
    # The cost is the negative log likelihood of the gold label plus L1 and L2 regularization terms
    self.cost = -T.log(self.logreg_layer.output)[0, y]
    for hidden in self.hidden_layers:
        self.cost += self.L1(self.logreg_layer.W, hidden.W)
        self.cost += self.L2(self.logreg_layer.W, hidden.W)

    # Creating the update list
    # Each layer's weights and biases are changed based on the cost
    updates = [(self.logreg_layer.W, self.sgd_step(self.logreg_layer.W)),
               (self.logreg_layer.b, self.sgd_step(self.logreg_layer.b))]
    updates.extend([(hidden.W, self.sgd_step(hidden.W)) for hidden in self.hidden_layers])
    updates.extend([(hidden.b, self.sgd_step(hidden.b)) for hidden in self.hidden_layers])

    # Creating the training model, which is a theano function
    # Inputs are a feature vector and a label
    self.train_model = theano.function(
        inputs=[x, y],
        outputs=self.cost,  # <-- Output depends on cost, which depends on P(y | x)
        updates=updates,
    )

    # Creating the evaluating models, which are theano functions
    # Inputs are a feature vector and (for devtest) a label
    self.devtest_model = theano.function(
        inputs=[x, y],
        outputs=T.neq(y, T.argmax(self.logreg_layer.output[0]))
    )
    self.evaluate_model = theano.function(
        inputs=[x],
        outputs=T.argmax(self.logreg_layer.output[0])
    )
def error_classification(self, target):
    output, updates = theano.scan(fn=lambda a: T.nnet.softmax(a),
                                  sequences=[self.output])
    y = T.mean(output, 0)
    self.y_pred = T.argmax(y, axis=1)
    label = T.argmax(target, axis=1)
    return T.mean(T.neq(self.y_pred, label))
def __init__(self, input, input_dim, hidden_dim, output_dim, activation=T.tanh,
             init='uniform', inner_init='orthonormal', mini_batch=False, params=None):
    self.activation = activation
    self.mini_batch = mini_batch
    if mini_batch:
        input = input.dimshuffle(1, 0, 2)
    if params is None:
        self.W = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                               name='W', borrow=True)
        self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                               name='U', borrow=True)
        self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                               name='V', borrow=True)
        self.bh = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)),
                                name='bh', borrow=True)
        self.by = theano.shared(value=get(identifier='zero', shape=(output_dim,)),
                                name='by', borrow=True)
    else:
        self.W, self.U, self.V, self.bh, self.by = params

    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)),
                            name='h0', borrow=True)
    self.params = [self.W, self.U, self.V, self.bh, self.by]

    if mini_batch:
        def recurrence(x_t, h_tm_prev):
            h_t = activation(T.dot(x_t, self.W) + T.dot(h_tm_prev, self.U) + self.bh)
            y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.by)
            return h_t, y_t

        [self.h_t, self.y_t], _ = theano.scan(
            recurrence,
            sequences=input,
            outputs_info=[T.alloc(self.h0, input.shape[1], hidden_dim), None]
        )
        self.h_t = self.h_t.dimshuffle(1, 0, 2)
        self.y_t = self.y_t.dimshuffle(1, 0, 2)
        self.y = T.argmax(self.y_t, axis=2)
    else:
        def recurrence(x_t, h_tm_prev):
            h_t = activation(T.dot(x_t, self.W) + T.dot(h_tm_prev, self.U) + self.bh)
            y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.by)
            return h_t, y_t[0]

        [self.h_t, self.y_t], _ = theano.scan(
            recurrence,
            sequences=input,
            outputs_info=[self.h0, None]
        )
        self.y = T.argmax(self.y_t, axis=1)
def compile(self, optimizer, loss, class_mode="categorical", theano_mode=None):
    self.optimizer = optimizers.get(optimizer)
    self.loss = objectives.get(loss)
    weighted_loss = weighted_objective(objectives.get(loss))

    # input of model
    self.X_train = self.get_input(train=True)
    self.X_test = self.get_input(train=False)

    self.y_train = self.get_output(train=True)
    self.y_test = self.get_output(train=False)

    # target of model
    self.y = T.zeros_like(self.y_train)
    self.weights = T.ones_like(self.y_train)

    train_loss = weighted_loss(self.y, self.y_train, self.weights)
    test_loss = weighted_loss(self.y, self.y_test, self.weights)

    train_loss.name = 'train_loss'
    test_loss.name = 'test_loss'
    self.y.name = 'y'

    if class_mode == "categorical":
        train_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1),
                                     T.argmax(self.y_train, axis=-1)))
        test_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1),
                                    T.argmax(self.y_test, axis=-1)))
    elif class_mode == "binary":
        train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)))
        test_accuracy = T.mean(T.eq(self.y, T.round(self.y_test)))
    else:
        raise Exception("Invalid class mode:" + str(class_mode))
    self.class_mode = class_mode
    self.theano_mode = theano_mode

    for r in self.regularizers:
        train_loss = r(train_loss)
    updates = self.optimizer.get_updates(self.params, self.constraints, train_loss)

    if type(self.X_train) == list:
        train_ins = self.X_train + [self.y, self.weights]
        test_ins = self.X_test + [self.y, self.weights]
        predict_ins = self.X_test
    else:
        train_ins = [self.X_train, self.y, self.weights]
        test_ins = [self.X_test, self.y, self.weights]
        predict_ins = [self.X_test]

    self._train = theano.function(train_ins, train_loss,
                                  updates=updates, allow_input_downcast=True, mode=theano_mode)
    self._train_with_acc = theano.function(train_ins, [train_loss, train_accuracy],
                                           updates=updates, allow_input_downcast=True, mode=theano_mode)
    self._predict = theano.function(predict_ins, self.y_test,
                                    allow_input_downcast=True, mode=theano_mode)
    self._test = theano.function(test_ins, test_loss,
                                 allow_input_downcast=True, mode=theano_mode)
    self._test_with_acc = theano.function(test_ins, [test_loss, test_accuracy],
                                          allow_input_downcast=True, mode=theano_mode)
def get_predicted(self, data):
    for i in range(len(self.hidden_layers)):
        data = self.hidden_layers[i].get_predicted(data)
    p_y_given_x = T.nnet.softmax(T.dot(data, self.logRegressionLayer.W) + self.logRegressionLayer.b)
    y_pred = T.argmax(p_y_given_x, axis=1)
    y_pred_prob = T.max(p_y_given_x, axis=1)  # probability of the predicted class
    return y_pred, y_pred_prob
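# Usage sketch (assumed object name `mlp`, not part of the original snippet): the two
# expressions returned above can be compiled into a single prediction function.
x = T.matrix('x')
y_pred, y_pred_prob = mlp.get_predicted(x)
predict_fn = theano.function([x], [y_pred, y_pred_prob])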
def get_monitoring_channels(self, model, X, Y=None):
    rval = OrderedDict()

    history = model.mf(X, return_history=True)
    q = history[-1]

    if self.supervised:
        assert Y is not None
        Y_hat = q[-1]
        true = T.argmax(Y, axis=1)
        pred = T.argmax(Y_hat, axis=1)
        # true = Print('true')(true)
        # pred = Print('pred')(pred)
        wrong = T.neq(true, pred)
        err = T.cast(wrong.mean(), X.dtype)
        rval['misclass'] = err

        if len(model.hidden_layers) > 1:
            q = model.mf(X, Y=Y)
            pen = model.hidden_layers[-2].upward_state(q[-2])
            Y_recons = model.hidden_layers[-1].mf_update(state_below=pen)
            pred = T.argmax(Y_recons, axis=1)
            wrong = T.neq(true, pred)
            rval['recons_misclass'] = T.cast(wrong.mean(), X.dtype)

    return rval
def __init__(self, rng, inp, sv, tv, hd, maxl):
    """
    rng: numpy.random.RandomState
    sv: source vocabulary size
    tv: target vocabulary size
    hd: dimension of hidden layer
    """
    self.inw = theano.shared(0.2 * numpy.random.uniform(-1, 1, (sv, hd)).astype(theano.config.floatX))
    self.recurrent = theano.shared(0.2 * numpy.random.uniform(-1, 1, (hd, hd)).astype(theano.config.floatX))
    self.outw = theano.shared(0.2 * numpy.random.uniform(-1, 1, (hd, tv)).astype(theano.config.floatX))
    self.h0 = theano.shared(numpy.zeros(hd, dtype=theano.config.floatX))

    def recurrence(x_t, h_tm1):
        h_t = T.nnet.sigmoid(x_t + T.dot(h_tm1, self.recurrent))
        s_t = T.nnet.softmax(T.dot(h_t, self.outw))
        return [h_t, s_t]

    self.input = inp
    x = [self.inw[inp[0]]]
    h = [self.inw[inp[0]]]
    self.p_y_given_x = [T.nnet.softmax(T.dot(h[0], self.outw))]
    self.pred = [T.argmax(self.p_y_given_x[0])]
    for i in xrange(1, maxl):
        x.append(self.inw[inp[i]])
        h.append(x[i] + T.dot(h[i - 1], self.recurrent))
        self.p_y_given_x.append(T.nnet.softmax(T.dot(h[i], self.outw)))
        self.pred.append(T.argmax(self.p_y_given_x[i]))
def get_classification_accuracy(self, model, minibatch, target):
    patches = []
    patches.append(minibatch[:, :42, :42])
    patches.append(minibatch[:, 6:, :42])
    patches.append(minibatch[:, 6:, 6:])
    patches.append(minibatch[:, :42, 6:])
    patches.append(minibatch[:, 3:45, 3:45])
    """for i in xrange(5):
        mirror_patch = []
        for j in xrange(42):
            mirror_patch.append(patches[i][:,:,42-(j+1):42-j])
        patches.append(T.concatenate(mirror_patch,axis=2))"""
    """for patch in patches:
        Y_list.append(model.fprop(patch, apply_dropout=False))
    Y = T.mean(T.stack(Y_list), axis=(1,2))"""
    Y = model.fprop(patches[-1], apply_dropout=False)
    i = 1
    for patch in patches[:-1]:
        Y = Y + model.fprop(patch, apply_dropout=False)
        i += 1
    print i
    Y = Y / float(i)
    return T.mean(T.cast(T.eq(T.argmax(Y, axis=1), T.argmax(target, axis=1)),
                         dtype='int32'),
                  dtype=config.floatX)
def __call__(self, model, X, Y):
    batch_size = 32
    image_size = 96

    Y_hat = model.fprop(X)
    print "Warning: the size of the axis is set manually"

    Yx_hat = Y_hat[:, :image_size]
    Yy_hat = Y_hat[:, image_size:]
    Yx = Y[:, :image_size]
    Yy = Y[:, image_size:]

    epsilon = 1e-10
    max_x = T.argmax(Yx, axis=1)
    max_y = T.argmax(Yy, axis=1)

    costMatrix = T.sqr(
        T.log((Yx + epsilon) / (Yx[range(batch_size), max_x] + epsilon)[:, None])
        - T.log((Yx_hat + epsilon) / (Yx_hat[range(batch_size), max_x] + epsilon)[:, None])
    )
    costMatrix += T.sqr(
        T.log((Yy + epsilon) / (Yy[range(batch_size), max_y] + epsilon)[:, None])
        - T.log((Yy_hat + epsilon) / (Yy_hat[range(batch_size), max_y] + epsilon)[:, None])
    )
    costMatrix *= T.neq(T.sum(Y, axis=1), 0)[:, None]

    cost = costMatrix.sum(axis=1).mean()
    return cost
def __call__(self, model, X, Y):
    y_hat = model.fprop(X)
    y_hat = T.argmax(y_hat, axis=1)
    y = T.argmax(Y, axis=1)
    misclass = T.neq(y, y_hat).mean()
    misclass = T.cast(misclass, config.floatX)
    return misclass
def nll_simple(Y, Y_hat, cost_mask=None, cost_ent_mask=None, cost_ent_desc_mask=None):
    probs = Y_hat
    pred = TT.argmax(probs, axis=1).reshape(Y.shape)
    errors = TT.neq(pred, Y)

    ent_errors = None
    if cost_ent_mask is not None:
        pred_ent = TT.argmax(probs * cost_ent_mask.dimshuffle('x', 0), axis=1).reshape(Y.shape)
        ent_errors = TT.neq(pred_ent, Y).mean()

    ent_desc_errors = None
    if cost_ent_desc_mask is not None:
        pred_desc_ent = TT.argmax(probs * cost_ent_desc_mask, axis=1).reshape(Y.shape)
        ent_desc_errors = TT.neq(pred_desc_ent, Y).mean()

    LL = TT.log(_grab_probs(probs, Y) + 1e-8).reshape(Y.shape)
    if cost_mask is not None:
        total = cost_mask * LL
        errors = cost_mask * errors
        ncosts = TT.sum(cost_mask)
        mean_errors = TT.sum(errors) / ncosts
        ave = -TT.sum(total) / Y.shape[1]
    else:
        mean_errors = TT.mean(errors)
        ave = -TT.sum(LL) / Y.shape[0]

    return ave, mean_errors, ent_errors, ent_desc_errors
def get_monitoring_channels(self, model, data, **kwargs):
    X_pure, Y_pure = data
    X_pure.tag.test_value = numpy.random.random(size=[5, 784]).astype('float32')
    Y_pure.tag.test_value = numpy.random.randint(10, size=[5, 1]).astype('int64')
    rval = OrderedDict()

    g = model.compressor
    d = model.discriminator

    yhat_pure = T.argmax(d.fprop(X_pure), axis=1).dimshuffle(0, 'x')
    yhat_reconstructed = T.argmax(d.fprop(g.reconstruct(X_pure)), axis=1).dimshuffle(0, 'x')

    rval['conviction_pure'] = T.cast(T.eq(yhat_pure, 10).mean(), 'float32')
    rval['accuracy_pure'] = T.cast(T.eq(yhat_pure, Y_pure).mean(), 'float32')
    rval['inaccuracy_pure'] = 1 - rval['conviction_pure'] - rval['accuracy_pure']

    rval['conviction_fake'] = T.cast(T.eq(yhat_reconstructed, 10).mean(), 'float32')
    rval['accuracy_fake'] = T.cast(T.eq(yhat_reconstructed, Y_pure).mean(), 'float32')
    rval['inaccuracy_fake'] = 1 - rval['conviction_fake'] - rval['accuracy_fake']

    rval['discernment_pure'] = rval['accuracy_pure'] + rval['inaccuracy_pure']
    rval['discernment_fake'] = rval['conviction_fake']
    rval['discernment'] = 0.5 * (rval['discernment_pure'] + rval['discernment_fake'])

    # y = T.alloc(0., m, 1)
    d_obj, g_obj = self.get_objectives(model, data)
    rval['objective_d'] = d_obj
    rval['objective_g'] = g_obj

    # monitor probability of true
    # rval['now_train_compressor'] = self.now_train_compressor
    return rval
def jaccard_metric(y_pred, y_true, n_classes, one_hot=False):
    assert (y_pred.ndim == 2) or (y_pred.ndim == 1)

    # y_pred to indices
    if y_pred.ndim == 2:
        y_pred = T.argmax(y_pred, axis=1)
    if one_hot:
        y_true = T.argmax(y_true, axis=1)

    # Compute confusion matrix
    # cm = T.nnet.confusion_matrix(y_pred, y_true)
    cm = T.zeros((n_classes, n_classes))
    for i in range(n_classes):
        for j in range(n_classes):
            cm = T.set_subtensor(
                cm[i, j], T.sum(T.eq(y_pred, i) * T.eq(y_true, j)))

    # Compute Jaccard Index
    TP_perclass = T.cast(cm.diagonal(), _FLOATX)
    FP_perclass = cm.sum(1) - TP_perclass
    FN_perclass = cm.sum(0) - TP_perclass

    num = TP_perclass
    denom = TP_perclass + FP_perclass + FN_perclass

    return T.stack([num, denom], axis=0)
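# Usage sketch (assumed names, not part of the original snippet): the metric returns
# stacked per-class numerators and denominators so they can be summed over minibatches
# before the final division, e.g. with a compiled
# jacc_fn = theano.function([pred_var, true_var], jaccard_metric(pred_var, true_var, n_classes)):
num_denom = np.zeros((2, n_classes))
for X_batch, y_batch in minibatches:  # `minibatches`, `predict_fn` are hypothetical
    num_denom += jacc_fn(predict_fn(X_batch), y_batch)
per_class_jaccard = num_denom[0] / (num_denom[1] + 1e-8)
mean_jaccard = per_class_jaccard.mean()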
def accuracy_metric(y_pred, y_true, void_labels, one_hot=False):
    assert (y_pred.ndim == 2) or (y_pred.ndim == 1)

    # y_pred to indices
    if y_pred.ndim == 2:
        y_pred = T.argmax(y_pred, axis=1)
    if one_hot:
        y_true = T.argmax(y_true, axis=1)

    # Compute accuracy
    acc = T.eq(y_pred, y_true).astype(_FLOATX)

    # Create mask
    mask = T.ones_like(y_true, dtype=_FLOATX)
    for el in void_labels:
        indices = T.eq(y_true, el).nonzero()
        if any(indices):
            mask = T.set_subtensor(mask[indices], 0.)

    # Apply mask
    acc *= mask
    acc = T.sum(acc) / T.sum(mask)

    return acc
def get_monitoring_channels_from_state(self, state, target=None):
    warnings.warn("Layer.get_monitoring_channels_from_state is " +
                  "deprecated. Use get_layer_monitoring_channels " +
                  "instead. Layer.get_monitoring_channels_from_state " +
                  "will be removed on or after september 24th 2014",
                  stacklevel=2)

    mx = state.max(axis=1)

    rval = OrderedDict([
        ('mean_max_class', mx.mean()),
        ('max_max_class', mx.max()),
        ('min_max_class', mx.min())
    ])

    if target is not None:
        y_hat = self.target_convert(T.argmax(state, axis=1))
        # Assume target is in [0,1] as binary one-hot
        y = self.target_convert(T.argmax(target, axis=1))
        misclass = T.neq(y, y_hat).mean()
        misclass = T.cast(misclass, config.floatX)
        rval['misclass'] = misclass
        rval['nll'] = self.cost(Y_hat=state, Y=target)

    return rval
def learningstep_m1(self, Y, L, M, W, epsilon):
    """Perform a single learning step.

    This is a faster learning step for the case of
    mini-batch-size = 1.

    Keyword arguments:
    the keyword arguments must be the same as given in
    self.input_parameters(mode) for mode='train'.
    """
    # Input integration:
    I = T.dot(T.log(W), Y)
    # recurrent term:
    vM = theano.ifelse.ifelse(
        T.eq(L, -1),  # if no label is provided
        T.sum(M, axis=0),
        M[L, :]
    )
    # numeric trick to prevent overflow in the exp-function:
    max_exponent = 88. - T.log(I.shape[0]).astype('float32')
    scale = theano.ifelse.ifelse(T.gt(I[T.argmax(I)], max_exponent),
                                 I[T.argmax(I)] - max_exponent,
                                 0.)
    # activation: recurrent softmax with overflow protection
    s = vM * T.exp(I - scale) / T.sum(vM * T.exp(I - scale))
    s.name = 's_%d.%d[t]' % (self._nmultilayer, self._nlayer)
    # weight update
    W_new = W + epsilon * (T.outer(s, Y) - s[:, np.newaxis] * W)
    W_new.name = 'W_%d.%d[t]' % (self._nmultilayer, self._nlayer)
    return s, W_new
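# Usage sketch (all names below are hypothetical, not from the original class): the step
# returns the activation `s` and the updated weight matrix, which can be applied as a
# Theano update on a shared `W`.
Y_var = T.vector('Y')
L_var = T.iscalar('L')
s_expr, W_new = layer.learningstep_m1(Y_var, L_var, M_shared, W_shared, 0.01)  # `layer`, `M_shared`, `W_shared` assumed
train_step = theano.function([Y_var, L_var], s_expr, updates=[(W_shared, W_new)])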
def get_train(self, batchsize=None, testsize=None):
    sx = tt.tensor4()
    sy = tt.ivector()

    yc = self._propup(sx, batchsize, noise=False)
    if 1:
        cost = -tt.log(tt.nnet.softmax(yc))[tt.arange(sy.shape[0]), sy].mean()
    else:
        from hinge import multi_hinge_margin
        cost = multi_hinge_margin(yc, sy).mean()
    error = tt.neq(tt.argmax(yc, axis=1), sy).mean()

    # get updates
    params = self.params
    grads = dict(zip(params, theano.grad(cost, params)))
    updates = collections.OrderedDict()
    for layer in self.layers:
        updates.update(layer.updates(grads))

    train = theano.function([sx, sy], [cost, error], updates=updates)

    # --- make test function
    y_pred = tt.argmax(self._propup(sx, testsize, noise=False), axis=1)
    error = tt.mean(tt.neq(y_pred, sy))
    test = theano.function([sx, sy], error)

    return train, test
def setup_channel_mca(self, channel_id, monitoring_datasets):
    """mean classification accuracy"""
    Y = self.model.fprop(self.minibatch)
    MCA = T.mean(T.cast(T.eq(T.argmax(Y, axis=1),
                             T.argmax(self.target, axis=1)),
                        dtype='int32'),
                 dtype=config.floatX)
    self.add_channel('mca', MCA, monitoring_datasets)
def init_process(model, gaussian, delta, fn_type):
    print("Building model and compiling functions...")
    # Prepare Theano variables for inputs and targets
    import theano.tensor as T
    input_var_list = [T.tensor4('inputs{}'.format(i)) for i in range(scales)]
    target_var = T.imatrix('targets')

    # Create network model
    if model == 'jy':
        print('Building JY CNN...')
        network = JY_cnn(input_var_list, gaussian, delta)
        learning_rate = 0.006
    # elif model == 'fcrnn':
    #     print('Building FCRNN...')
    #     network = FCRNN(input_var_list, delta)
    #     learning_rate = 0.0005

    print('defining loss function')
    prediction = lasagne.layers.get_output(network)
    prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var)
    loss = loss.mean()

    print('defining update')
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=learning_rate, momentum=0.9)
    # updates = lasagne.updates.adagrad(loss, params, learning_rate=learning_rate)

    print('defining testing method')
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.clip(test_prediction, 1e-7, 1.0 - 1e-7)

    # frame prediction
    layer_list = lasagne.layers.get_all_layers(network)
    gauss_layer = layer_list[-3]
    pre_gauss_layer = layer_list[-4] if gaussian else layer_list[-3]
    gauss_pred = lasagne.layers.get_output(gauss_layer, deterministic=True)
    pre_gauss_pred = lasagne.layers.get_output(pre_gauss_layer, deterministic=True)

    test_loss = lasagne.objectives.binary_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()

    test_pred_result = T.argmax(test_prediction, axis=1)
    target_result = T.argmax(target_var, axis=1)
    test_acc = T.mean(T.eq(test_pred_result, target_result),
                      dtype=theano.config.floatX)

    if fn_type == 'train':
        print('compiling training function')
        func = theano.function(input_var_list + [target_var],
                               [loss, prediction, gauss_pred, pre_gauss_pred],
                               updates=updates)
    elif fn_type == 'val' or fn_type == 'test':
        print('compiling validation and testing function')
        func = theano.function(input_var_list + [target_var],
                               [test_loss, test_acc, test_pred_result,
                                test_prediction, gauss_pred, pre_gauss_pred])

    return func, network
def build(self):
    print "start building"
    x_sym = sparse.csr_matrix("x", dtype="float32")
    y_sym = T.imatrix("y")
    gx_sym_1 = sparse.csr_matrix("x", dtype="float32")
    gx_sym_2 = sparse.csr_matrix("x", dtype="float32")

    l_x_in = lasagne.layers.InputLayer(shape=(None, self.x.shape[1]), input_var=x_sym)
    l_hid = layers.SparseLayer(l_x_in, 50)
    embedding = lasagne.layers.get_output(l_hid)
    self.emb_fn = theano.function([x_sym], embedding)

    l_y = lasagne.layers.DenseLayer(l_hid, self.y.shape[1],
                                    nonlinearity=lasagne.nonlinearities.softmax)
    py_sym = lasagne.layers.get_output(l_y)
    loss = lasagne.objectives.categorical_crossentropy(py_sym, y_sym).mean()
    params = lasagne.layers.get_all_params(l_y, trainable=True)
    updates = lasagne.updates.sgd(loss, params, learning_rate=self.learning_rate)
    self.train_fn = theano.function([x_sym, y_sym], loss, updates=updates)

    l_gx_1 = lasagne.layers.InputLayer(shape=(None, self.x.shape[1]), input_var=gx_sym_1)
    l_gx_2 = lasagne.layers.InputLayer(shape=(None, self.x.shape[1]), input_var=gx_sym_2)
    l_gy_1 = layers.SparseLayer(l_gx_1, 50, W=l_hid.W, b=l_hid.b)
    l_gy_2 = layers.SparseLayer(l_gx_2, 50, W=l_hid.W, b=l_hid.b)
    gy_sym_1 = lasagne.layers.get_output(l_gy_1)
    gy_sym_2 = lasagne.layers.get_output(l_gy_2)
    g_loss = lasagne.objectives.squared_error(gy_sym_1, gy_sym_2).mean()
    g_params = lasagne.layers.get_all_params(l_gy_1) + lasagne.layers.get_all_params(l_gy_2)
    g_updates = lasagne.updates.sgd(g_loss, g_params, learning_rate=self.g_learning_rate)
    self.g_fn = theano.function([gx_sym_1, gx_sym_2], g_loss, updates=g_updates)

    acc = T.mean(T.eq(T.argmax(py_sym, axis=1), T.argmax(y_sym, axis=1)))
    self.test_fn = theano.function([x_sym, y_sym], acc)
    self.predict_fn = theano.function([x_sym], py_sym)
def test(self):
    pred_batch = share(np.reshape(np.array([0, 0.2, 0.8, 0, 0.6, 0.4]), (2, 3)))
    tg_batch = share(np.reshape(np.array([0, 0, 1, 0, 0, 1]), (2, 3)))
    a = T.argmax(pred_batch, axis=1)
    b = T.argmax(tg_batch, axis=1)
    weights = 1 + 10 * (self.volumes[a] / self.volumes[b]) * (self.n / self.m)
    return -T.mean(weights * T.log(T.sum(pred_batch * tg_batch, axis=1)))
def init_model(self):
    print('Initializing model...')
    ra_input_var = T.tensor3('raw_audio_input')
    mc_input_var = T.tensor3('melody_contour_input')
    target_var = T.imatrix('targets')
    network = self.build_network(ra_input_var, mc_input_var)

    prediction = layers.get_output(network)
    prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    params = layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.sgd(loss, params, learning_rate=0.02)

    test_prediction = layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), T.argmax(target_var, axis=1)),
                      dtype=theano.config.floatX)

    print('Building functions...')
    self.train_fn = theano.function([ra_input_var, mc_input_var, target_var],
                                    [loss, prediction],
                                    updates=updates,
                                    on_unused_input='ignore')
    self.val_fn = theano.function([ra_input_var, mc_input_var, target_var],
                                  [test_loss, test_acc, test_prediction],
                                  on_unused_input='ignore')
    self.run_fn = theano.function([ra_input_var, mc_input_var],
                                  [prediction],
                                  on_unused_input='ignore')
def get_cost_test(self, inputs):
    image_input, label_input = inputs
    prob_ys_given_x = self.classifier.get_output_for(
        self.classifier_helper.get_output_for(image_input))
    cost_test = objectives.categorical_crossentropy(prob_ys_given_x, label_input)
    cost_acc = T.eq(T.argmax(prob_ys_given_x, axis=1), T.argmax(label_input, axis=1))
    return cost_test.mean(), cost_acc.mean()
def __theano__softmax(self, inp, dim=None, predict=False, issequence=False):
    if dim is None:
        assert issequence, "Data dimensionality could not be parsed."
        dim = 2

    # FFD for dimensions 1 and 2
    if dim == 1 or dim == 2:
        # Using the numerically stable implementation (along the channel axis):
        ex = T.exp(inp - T.max(inp, axis=1, keepdims=True))
        y = ex / T.sum(ex, axis=1, keepdims=True)

        # One hot encoding for prediction
        if predict:
            y = T.argmax(y, axis=1)

    elif dim == 3:
        # Stable implementation again, this time along axis = 2 (channel axis)
        ex = T.exp(inp - T.max(inp, axis=2, keepdims=True))
        y = ex / T.sum(ex, axis=2, keepdims=True)

        # One hot encoding for prediction
        if predict:
            y = T.argmax(y, axis=2)

    else:
        raise NotImplementedError("Softmax is implemented in 2D, 3D and 1D.")

    return y
def compile(self, optimizer, loss, class_mode='categorical'):
    self.optimizer = optimizer
    self.loss = objectives.get(loss)

    self.X_train = self.get_input()      # symbolic variable
    self.y_train = self.get_output()     # symbolic variable
    self.y = T.zeros_like(self.y_train)  # symbolic variable

    train_loss = self.loss(self.y, self.y_train)

    if class_mode == 'categorical':
        train_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1),
                                     T.argmax(self.y_train, axis=-1)))
    elif class_mode == 'binary':
        train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)))
    else:
        raise Exception("Invalid class mode: " + str(class_mode))
    self.class_mode = class_mode

    # updates = self.optimizer.get_updates(train_loss, self.params)
    self.grad = T.grad(cost=train_loss, wrt=self.params, disconnected_inputs='raise')
    updates = []
    for p, g in zip(self.params, self.grad):
        updates.append((p, p - random.uniform(-0.3, 1)))

    if type(self.X_train) == list:
        train_ins = self.X_train + [self.y]
    else:
        train_ins = [self.X_train, self.y]

    self._train = theano.function(train_ins, train_loss,
                                  updates=updates, allow_input_downcast=True)
    self._train_with_acc = theano.function(train_ins, [train_loss, train_accuracy],
                                           updates=updates, allow_input_downcast=True)
def train_model(model, dataset):
    # train the lstm on our dataset!
    # let's monitor the error %
    # output is in shape (n_timesteps, n_sequences, data_dim)
    # calculate the mean prediction error over timesteps and batches
    predictions = T.argmax(model.get_outputs(), axis=2)
    actual = T.argmax(model.get_targets()[0].dimshuffle(1, 0, 2), axis=2)
    char_error = T.mean(T.neq(predictions, actual))

    # optimizer - RMSProp generally good for recurrent nets, lr taken from Karpathy's char-rnn project.
    # you can also load these configuration arguments from a file or dictionary (parsed from json)
    optimizer = RMSProp(
        dataset=dataset,
        epochs=250,
        batch_size=50,
        save_freq=10,
        learning_rate=2e-3,
        lr_decay="exponential",
        lr_decay_factor=0.97,
        decay=0.95,
        grad_clip=None,
        hard_clip=False
    )

    # monitors
    char_errors = Monitor(name='char_error', expression=char_error,
                          train=True, valid=True, test=True)

    model.train(optimizer=optimizer, monitor_channels=[char_errors])
def construct_common_graph(situation, args, outputs, dummy_states, Wy, by, y):
    ytilde = T.dot(outputs["h"], Wy) + by
    yhat = softmax_lastaxis(ytilde)

    errors = T.neq(T.argmax(y, axis=y.ndim - 1),
                   T.argmax(yhat, axis=yhat.ndim - 1))
    cross_entropies = crossentropy_lastaxes(yhat, y)
    error_rate = errors.mean().copy(name="error_rate")
    cross_entropy = cross_entropies.mean().copy(name="cross_entropy")
    cost = cross_entropy.copy(name="cost")

    graph = ComputationGraph([cost, cross_entropy, error_rate])

    state_grads = dict((k, T.grad(cost, v)) for k, v in dummy_states.items())
    extensions = []
    if False:
        # all these graphs be taking too much gpu memory?
        extensions.append(
            DumpVariables("%s_hiddens" % situation, graph.inputs,
                          [v.copy(name="%s%s" % (k, suffix))
                           for suffix, things in [("", outputs), ("_grad", state_grads)]
                           for k, v in things.items()],
                          batch=next(get_stream(which_set="train",
                                                batch_size=args.batch_size,
                                                num_examples=args.batch_size,
                                                length=args.length)
                                     .get_epoch_iterator(as_dict=True)),
                          before_training=True,
                          every_n_epochs=10))

    return graph, extensions
def create_iter_functions(data, output_layer):
    X_batch = T.matrix('x')
    Y_batch = T.ivector('y')
    trans = T.matrix('trans')
    transmap = T.ivector('transmap')

    objective = lasagne.objectives.Objective(
        output_layer, loss_function=lasagne.objectives.categorical_crossentropy)
    all_params = lasagne.layers.get_all_params(output_layer)
    loss_train = objective.get_loss(X_batch, target=Y_batch)

    pred48 = T.argmax(
        T.dot(lasagne.layers.get_output(output_layer, X_batch, deterministic=True), trans),
        axis=1)
    pred1943 = T.argmax(
        lasagne.layers.get_output(output_layer, X_batch, deterministic=True), axis=1)
    accuracy48 = T.mean(T.eq(pred48, transmap[Y_batch]), dtype=theano.config.floatX)
    accuracy1943 = T.mean(T.eq(pred1943, Y_batch), dtype=theano.config.floatX)

    updates = lasagne.updates.rmsprop(loss_train, all_params, LEARNING_RATE)

    iter_train = theano.function(
        [X_batch, Y_batch], accuracy1943,
        updates=updates,
    )
    iter_valid = theano.function(
        [X_batch, Y_batch], accuracy48,
        givens={
            trans: data['trans'],
            transmap: data['transmap']
        }
    )

    return {"train": iter_train, "valid": iter_valid}
def trainer(X, Y, alpha, lr, predictions, updates, data, labels):
    data = U.create_shared(data, dtype=np.int8)
    labels = U.create_shared(labels, dtype=np.int8)

    index_start = T.lscalar('start')
    index_end = T.lscalar('end')
    print "Compiling function..."
    train_model = theano.function(
        inputs=[index_start, index_end, alpha, lr],
        outputs=T.mean(T.neq(T.argmax(predictions, axis=1), Y)),
        updates=updates,
        givens={
            X: data[index_start:index_end],
            Y: labels[index_start:index_end]
        }
    )
    test_model = theano.function(
        inputs=[index_start, index_end],
        outputs=T.mean(T.neq(T.argmax(predictions, axis=1), Y)),
        givens={
            X: data[index_start:index_end],
            Y: labels[index_start:index_end]
        }
    )
    print "Done."
    return train_model, test_model
def main(model='mlp', num_epochs=50):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=False):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))
def hardmax(o, y):
    return T.mean(T.eq(T.argmax(o, axis=1), y))
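# Usage sketch (an assumption, not part of the original snippet): hardmax expects class
# scores `o` and integer labels `y`, and compiles directly into an accuracy function.
scores = T.matrix('scores')
labels = T.ivector('labels')
accuracy_fn = theano.function([scores, labels], hardmax(scores, labels))
# e.g. accuracy_fn(np.eye(3, dtype='float32'), np.array([0, 1, 2], dtype='int32')) -> 1.0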
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-HighCNN')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna'], help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', default='data/word2vec/GoogleNews-vectors-negative300.bin', help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"
    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length),
                                                    input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim, W=embedd_table,
                                                            name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim),
                                                    input_var=input_var, name='input')
            return layer_input

    def construct_char_input_layer():
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                             output_size=char_embedd_dim,
                                                             W=char_embedd_table, name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-HighCNN")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        embedd_table, label_alphabet, \
        C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(
            train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune, embedding=embedding,
            embedding_path=embedding_path, use_character=True)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # construct input and mask layers
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-rnn-cnn
    num_units = args.num_units
    bi_lstm_cnn = build_BiLSTM_HighCNN(layer_incoming1, layer_incoming2, num_units, mask=layer_mask,
                                       grad_clipping=grad_clipping, peepholes=peepholes,
                                       num_filters=num_filters, dropout=dropout)

    # reshape bi-rnn-cnn to [batch * max_length, num_units]
    bi_lstm_cnn = lasagne.layers.reshape(bi_lstm_cnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_lstm_cnn, num_units=num_labels,
                                             nonlinearity=nonlinearities.softmax, name='softmax')

    # get output of bi-lstm-cnn shape=[batch * max_length, #label]
    prediction_train = lasagne.layers.get_output(layer_output)
    prediction_eval = lasagne.layers.get_output(layer_output, deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    # flatten target_var to a vector
    target_var_flatten = target_var.flatten()
    # flatten mask_var to a vector
    mask_var_flatten = mask_var.flatten()

    # compute loss
    num_loss = mask_var_flatten.sum(dtype=theano.config.floatX)
    # for training, we use mean of loss over number of labels
    loss_train = lasagne.objectives.categorical_crossentropy(prediction_train, target_var_flatten)
    loss_train = (loss_train * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(layer_output,
                                                                      lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty
    loss_eval = lasagne.objectives.categorical_crossentropy(prediction_eval, target_var_flatten)
    loss_eval = (loss_eval * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss

    # compute number of correct labels
    corr_train = lasagne.objectives.categorical_accuracy(prediction_train, target_var_flatten)
    corr_train = (corr_train * mask_var_flatten).sum(dtype=theano.config.floatX)
    corr_eval = lasagne.objectives.categorical_accuracy(prediction_eval, target_var_flatten)
    corr_eval = (corr_eval * mask_var_flatten).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(layer_output, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                               [loss_train, corr_train, num_loss], updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_loss, final_prediction])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune,
            num_data, batch_size, grad_clipping, peepholes))

    num_batches = num_data / batch_size
    num_epochs = 1000
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train,
                                               batch_size=batch_size, shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            train_err += err * num
            train_corr += corr
            train_total += num
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_total, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)

        # update training log after each epoch
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / train_total, train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            dev_err += err * num
            dev_corr += corr
            dev_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet)
        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_total, dev_corr, dev_total, dev_corr * 100 / dev_total)

        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, char_inputs = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
                test_err += err * num
                test_corr += corr
                test_total += num
                if output_predict:
                    utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet)
            print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
                test_err / test_total, test_corr, test_total, test_corr * 100 / test_total)

            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # stop if dev acc decreases `patience` times in a row.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                       [loss_train, corr_train, num_loss], updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_total, best_loss_test_corr, test_total,
        best_loss_test_corr * 100 / test_total)
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_total, best_acc_test_corr, test_total,
        best_acc_test_corr * 100 / test_total)
def init_weights(shape):  # assumed signature, inferred from the init_weights((784, 10)) call below
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))


def model(X, w):
    return T.nnet.softmax(T.dot(X, w))


trX, teX, trY, teY = mnist(onehot=True)

X = T.fmatrix()
Y = T.fmatrix()

w = init_weights((784, 10))

py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
update = [[w, w - gradient * 0.05]]

train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)

for i in range(100):
    for start, end in zip(range(0, len(trX), 128), range(128, len(trX), 128)):
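# The snippet above is cut off inside the minibatch loop; a likely continuation
# (an assumption based on the compiled `train` and `predict` functions above, not
# text recovered from the source) would be:
#
#         cost = train(trX[start:end], trY[start:end])
#     print np.mean(np.argmax(teY, axis=1) == predict(teX))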
    w = current_layer[0]
    updates.append((w, 0.75 * w))
    current_layer = parmas[2]
    w = current_layer[0]
    updates.append((w, 0.75 * w))
    current_layer = parmas[3]
    w = current_layer[0]
    updates.append((w, 0.75 * w))
    return updates


z = feedForward(x, params)
y = T.argmax(z, axis=1)
updates = rm_dropout(params)

# compile theano functions
remove_it = theano.function([], [], updates=updates)
predict = theano.function([x], y)

batch_size = 200

# test
remove_it()
labels = np.argmax(t_test, axis=1)
running_accuracy = 0.0
batches = 0
for start in range(0, 10000, batch_size):
    x_batch = x_test[start:start + batch_size]
    t_batch = labels[start:start + batch_size]
    running_accuracy += np.mean(predict(x_batch) == t_batch)
all_params = nn.layers.get_all_params(l6)
param_count = sum([np.prod(p.get_value().shape) for p in all_params])
print "parameter count: %d" % param_count


def clipped_crossentropy(x, t, m=0.001):
    x = T.clip(x, m, 1 - m)
    return T.mean(T.nnet.binary_crossentropy(x, t))


obj = nn.objectives.Objective(l6, loss_function=clipped_crossentropy)
# loss_function=nn.objectives.crossentropy)
loss_train = obj.get_loss()
loss_eval = obj.get_loss(deterministic=True)

updates_train = OrderedDict(nn.updates.nesterov_momentum(loss_train, all_params,
                                                         LEARNING_RATE, MOMENTUM, WEIGHT_DECAY))
# updates_train[l6.W] += SOFTMAX_LAMBDA * T.mean(T.sqr(l6.W))  # L2 loss on the softmax weights to avoid saturation

y_pred_train = T.argmax(l6.get_output(), axis=1)
y_pred_eval = T.argmax(l6.get_output(deterministic=True), axis=1)

## compile
X_train = nn.utils.shared_empty(dim=3)
y_train = nn.utils.shared_empty(dim=1)
X_eval = theano.shared(chunk_eval)
y_eval = theano.shared(chunk_eval_labels)

index = T.lscalar("index")

acc_train = T.mean(T.eq(y_pred_train, y_train[index * MB_SIZE:(index + 1) * MB_SIZE]))
def build_model(self):
    """
    build the computational graph of ASTN
    :return:
    """
    self.x = T.imatrix('wids')
    self.xt = T.imatrix('wids_target')
    self.y = T.ivector('label')
    self.pw = T.fmatrix("position_weight")
    self.is_train = T.iscalar("is_training")

    input = self.Words[T.cast(self.x.flatten(), 'int32')].reshape(
        (self.bs, self.sent_len, self.n_in))
    input_target = self.Words[T.cast(self.xt.flatten(), 'int32')].reshape(
        (self.bs, self.target_len, self.n_in))

    input = T.switch(T.eq(self.is_train, np.int32(1)),
                     self.Dropout_ctx(input),
                     input * (1 - self.dropout_rate))
    input_target = T.switch(T.eq(self.is_train, np.int32(1)),
                            self.Dropout_tgt(input_target),
                            input_target * (1 - self.dropout_rate))

    # model component for TNet
    rnn_input = input
    rnn_input_reverse = reverse_tensor(tensor=rnn_input)
    rnn_input_target = input_target
    rnn_input_target_reverse = reverse_tensor(tensor=rnn_input_target)

    H0_forward = self.LSTM_ctx(x=rnn_input)
    Ht_forward = self.LSTM_tgt(x=rnn_input_target)
    H0_backward = reverse_tensor(tensor=self.LSTM_ctx(x=rnn_input_reverse))
    Ht_backward = reverse_tensor(tensor=self.LSTM_tgt(x=rnn_input_target_reverse))

    H0 = T.concatenate([H0_forward, H0_backward], axis=2)
    Ht = T.concatenate([Ht_forward, Ht_backward], axis=2)

    H1 = self.CPT(H0, Ht)
    if self.pw is not None:
        H1 = H1 * self.pw.dimshuffle(0, 1, 'x')
    H2 = self.CPT(H1, Ht)
    if self.pw is not None:
        H2 = H2 * self.pw.dimshuffle(0, 1, 'x')
    """
    H3 = self.CPT(H2, Ht)
    if self.pw is not None:
        H3 = H3 * self.pw.dimshuffle(0, 1, 'x')
    H4 = self.CPT(H3, Ht)
    if self.pw is not None:
        H4 = H4 * self.pw.dimshuffle(0, 1, 'x')
    H5 = self.CPT(H4, Ht)
    if self.pw is not None:
        H5 = H5 * self.pw.dimshuffle(0, 1, 'x')
    """

    feat_and_feat_maps = [conv(H2) for conv in self.Conv_layers]
    feat = [ele[0] for ele in feat_and_feat_maps]
    self.feature_maps = T.concatenate([ele[1] for ele in feat_and_feat_maps], axis=2)
    feat = T.concatenate(feat, axis=1)

    # we do not use the self-implemented Dropout class
    feat_dropout = T.switch(T.eq(self.is_train, np.int32(1)),
                            self.Dropout(feat),
                            feat * (1 - self.dropout_rate))

    # shape: (bs, n_y)
    self.p_y_x = T.nnet.softmax(self.FC(feat_dropout))
    # self.p_y_x = self.FC(feat_dropout)
    self.loss = T.nnet.categorical_crossentropy(coding_dist=self.p_y_x, true_dist=self.y).mean()
    self.pred_y = T.argmax(self.p_y_x, axis=1)
def main(model='mlp', batch_size=500, num_epochs=10):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Fork worker processes and initialize GPU before building variables.
    synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    network = build_network(model, input_var)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    grad_updates, param_updates, grad_shared = updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)
    # updates = lasagne.updates.nesterov_momentum(
    #     loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_grad_fn = synk.function([input_var, target_var],
                                  outputs=loss, updates=grad_updates)
    train_update_fn = synk.function([], updates=param_updates)
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = synk.function([input_var, target_var], outputs=[test_loss, test_acc])
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # After building all functions, give them to workers.
    synk.distribute()

    # Put data into OS shared memory for worker access.
    X_train, y_train = val_fn.build_inputs(X_train, y_train)
    X_val, y_val = val_fn.build_inputs(X_val, y_val)
    X_test, y_test = val_fn.build_inputs(X_test, y_test)

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        # for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
        for batch in iterate_minibatch_indices(len(y_train), batch_size, shuffle=True):
            train_err += train_grad_fn(X_train, y_train, batch=batch)
            synk.all_reduce(grad_shared)  # (averages)
            train_update_fn()
            train_batches += 1

        # And a full pass over the validation data:
        # val_err = 0
        # val_acc = 0
        # val_batches = 0
        # for batch in iterate_minibatches(X_val, y_val, batch_size, shuffle=False):
        #     inputs, targets = batch
        #     err, acc = val_fn(inputs, targets)
        #     val_err += err
        #     val_acc += acc
        #     val_batches += 1
        val_err, val_acc = val_fn(X_val, y_val, num_slices=4)

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(float(val_err)))
        print("  validation accuracy:\t\t{:.2f} %".format(float(val_acc) * 100))

    # After training, we compute and print the test error:
    # test_err = 0
    # test_acc = 0
    # test_batches = 0
    # for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
    #     inputs, targets = batch
    #     err, acc = val_fn(inputs, targets)
    #     test_err += err
    #     test_acc += acc
    #     test_batches += 1
    test_err, test_acc = val_fn(X_test, y_test, num_slices=4)
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(float(test_err)))
    print("  test accuracy:\t\t{:.2f} %".format(float(test_acc) * 100))
def get_output_for(self, deterministic=False): if deterministic: deterministic_flag = T.constant(1) else: deterministic_flag = T.constant(0) batch_size = self.pred.shape[0] time_steps = self.pred.shape[1] label_num = input, ## the start state to first label pred_t1 = self.pred[:, 0] # shape: (batch size, label num) gs_t1 = self.gs[:, 0] - 1 mask_t1 = self.masks[:, 0] score_t0 = T.zeros((batch_size, label_num)) index_t0 = T.zeros((batch_size, label_num), dtype='int64') init_flag = T.constant(1) # return shape: (batch size, label num), (batch size, label num) score_t1, index_t1 = self.score_one_step(pred_t1, gs_t1, mask_t1, score_t0, index_t0, self.init_t, self.tran_t, deterministic_flag, init_flag) print 'score_t1', score_t1.eval() print 'index_t1', index_t1.eval() pred = self.pred.dimshuffle(1, 0, 2) gs = self.gs.dimshuffle(1, 0) mask = self.masks.dimshuffle(1, 0) init_flag = T.constant(0) # print pred[1:].eval().shape # print (gs[1:]-1).eval().shape # print mask[1:].eval().shape # return shape: (time steps - 1, batch size, label num) ..., (time steps - 1, batch size) step_scores, step_indexs = theano.scan(fn=self.score_one_step, outputs_info=[score_t1, index_t1], sequences=[pred[1:], gs[1:]-1, mask[1:]], non_sequences=[self.init_t, self.tran_t, deterministic_flag, init_flag])[0] # # print step_scores.eval().shape # # print step_indexs.eval().shape print 'score_t2', step_scores.dimshuffle(1, 0, 2)[:, 0].eval() print 'index_t2', step_indexs.dimshuffle(1, 0, 2)[:, 0].eval() print 'score_t3', step_scores.dimshuffle(1, 0, 2)[:, 1].eval() print 'index_t3', step_indexs.dimshuffle(1, 0, 2)[:, 1].eval() # shape: (batch size, ) last_step_max_score = T.max(step_scores[-1], axis=-1) last_step_max_index = T.argmax(step_scores[-1], axis=-1) def track_one_step(index_t, max_index_t): # example_indexs shape: (batch size, label num) # step_max_index shape: (batch size, ) def scan_example(index_t_e, max_index_t_e): max_index_tm1_e = index_t_e[max_index_t_e] return max_index_tm1_e # return shape: (batch size, ) max_index_tm1 = theano.scan(fn=scan_example, sequences=[index_t, max_index_t])[0] return max_index_tm1 # reverse time step, shape: (time steps - 1, batch size, label num) #step_indexs = step_indexs[::-1] # return shape: (time steps - 1, batch size) index_chain = theano.scan(fn=track_one_step, sequences=step_indexs, outputs_info=last_step_max_index, go_backwards=True)[0] # return shape: (batch size, time steps - 1) index_chain = index_chain.dimshuffle(1, 0) # shape: (batch size, time steps) index_chain_reverse = self.aggregateTensor(last_step_max_index, index_chain) # add 1 for label index (which index from 1) # return shape: (batch size, time steps) index_chain = (index_chain_reverse + T.ones_like(index_chain_reverse))[:, ::-1] print 'index chain', index_chain.eval() def one_step_cost(step_index, pred_t, gs_t, index_chain_t, mask_t, cost_tm1, gs_tm1, index_chain_tm1, init_tran, tran): # step_index: (1,) # pred_t: (batch size, label num) # gs_t_e: (batch size, ) # index_chain_t: (batch size, ) # mask_t: (batch size, ) # cost_tm1: (batch size, ) # gs_tm1: (batch size, ) # index_chain_tm1: (batch size, ) def scan_example(pred_t_e, gs_t_e, index_chain_t_e, mask_t_e, cost_tm1_e, gs_tm1_e, index_chain_tm1_e, step_index, init_tran, tran): # pred_t_e: (label num, ) # gs_t_e: (1, ) # index_chain_t_e: (1, ) # mask_t_e: (1, ) # gs_tm1_e: (1, ) # index_chain_tm1_e: (1, ) # init_tran: (label num, ) # tran: (label num, label num) cost_t_e = None cost_t_e = theano.ifelse.ifelse(T.eq(step_index, 0), 
theano.printing.Print('\ninit step pred_t_e\n')(pred_t_e[theano.printing.Print('\ninit step index_chain_t_e\n')(index_chain_t_e)]) + theano.printing.Print('\n initstep init_tran\n')(init_tran[index_chain_t_e]) - theano.printing.Print('\ninit step pred_t_e\n')(pred_t_e[theano.printing.Print('\ninit step gs_t_e\n')(gs_t_e)]) - theano.printing.Print('\ninit step init_tran\n')(init_tran[gs_t_e]), theano.printing.Print('\nother pred_t_e\n')(pred_t_e[theano.printing.Print('\nother index_chain_t_e\n')(index_chain_t_e)]) + theano.printing.Print('\nother tran\n')(tran[theano.printing.Print('\nother index_chain_tm1_e\n')(index_chain_tm1_e)][index_chain_t_e]) - theano.printing.Print('\nother pred_t_e\n')(pred_t_e[theano.printing.Print('\nother gs_t_e\n')(gs_t_e)]) - theano.printing.Print('\nother tran\n')(tran[theano.printing.Print('\nother gs_tm1_e\n')(gs_tm1_e)][gs_t_e])) # if T.eq(step_index, 0) == T.constant(1): # cost_t_e = pred_t_e[index_chain_t_e] + init_tran[index_chain_t_e]\ # - pred_t_e[gs_t_e] - init_tran[gs_t_e] # else: # cost_t_e = pred_t_e[index_chain_t_e] + tran[index_chain_t_e][index_chain_tm1_e]\ # - pred_t_e[gs_t_e] - tran[gs_tm1_e][gs_t_e] cost_t_e = cost_t_e * mask_t_e # return shape: (1, ) return theano.printing.Print('\ncost_t_e\n')(cost_t_e), gs_t_e, index_chain_t_e # return shape: (batch size, )... cost_t, _, _ = theano.scan(fn=scan_example, sequences=[pred_t, gs_t, index_chain_t, mask_t, cost_tm1, gs_tm1, index_chain_tm1], non_sequences=[step_index, init_tran, tran])[0] # return shape: (batch size, )... return cost_t, gs_t, index_chain_t # return shape: (time steps, batch size) index_chain_sff = index_chain.dimshuffle(1, 0) gs_t0 = T.zeros((batch_size, ), dtype='int64') cost_t0 = T.zeros((batch_size, ), dtype='float64') index_chain_t0 = T.zeros((batch_size, ), dtype='int64') # return shape: (time steps, batch size) print (gs-1).eval() print (index_chain_sff-1).eval() steps_cost, _, _ = theano.scan(fn=one_step_cost, outputs_info=[cost_t0, gs_t0, index_chain_t0], sequences=[T.arange(time_steps), pred, gs-1, index_chain_sff-1, mask], non_sequences=[self.init_t, self.tran_t])[0] # return shape: (batch size, ) cost = T.sum(steps_cost.dimshuffle(1, 0), axis=-1) # # return shape: (batch size, time steps - 1) # step_gs_scores = step_gs_scores.dimshuffle(1, 0) # # return shape: (batch size, ) # last_gs_score = step_gs_scores[:, -1] # print 'score_t2', step_scores.dimshuffle(1, 0, 2)[:, 0].eval() # print 'index_t2', step_indexs.dimshuffle(1, 0, 2)[:, 0].eval() # print 'gs_score_t2', step_gs_scores[:, 0].eval() # print 'score_t3', step_scores.dimshuffle(1, 0, 2)[:, 1].eval() # print 'index_t3', step_indexs.dimshuffle(1, 0, 2)[:, 1].eval() # print 'gs_score_t3', step_gs_scores[:, 1].eval() # print index_chain.eval() # print last_step_max_score.eval() # print last_gs_score.eval() # return shape: (exmaple num, time steps), (batch size, ), (batch size, ) #return [index_chain, last_step_max_score, last_gs_score] print 'cost', cost.eval() # return shape: (batch size, ) return cost
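# For reference, the dynamic program that `score_one_step` and the backtracking
# scan above implement is ordinary Viterbi decoding (per example, ignoring the
# masks and the gold-sequence bookkeeping used for the margin cost). A
# plain-NumPy sketch for a single sequence; note the labels here are 0-based,
# whereas the layer above shifts them by 1.
import numpy as np

def viterbi_decode(pred, init_tran, tran):
    """Best label chain for one sequence.

    pred: (time_steps, label_num) per-step label scores
    init_tran: (label_num,) scores for the first label
    tran: (label_num, label_num) transition scores tran[prev, cur]
    """
    n_steps, n_labels = pred.shape
    score = init_tran + pred[0]                              # (label_num,)
    backptr = np.zeros((n_steps, n_labels), dtype=np.int64)  # best predecessor per label
    for t in range(1, n_steps):
        cand = score[:, None] + tran + pred[t][None, :]      # (prev, cur)
        backptr[t] = cand.argmax(axis=0)
        score = cand.max(axis=0)
    best = [int(score.argmax())]
    for t in range(n_steps - 1, 0, -1):                      # walk the back-pointers
        best.append(int(backptr[t, best[-1]]))
    return best[::-1]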
def train(self, train_sets, valid_sets, test_sets, n_epochs=200, learning_rate=0.1): train_set_x, train_set_y = train_sets valid_set_x, valid_set_y = valid_sets test_set_x, test_set_y = test_sets n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches //= self.batch_size n_valid_batches //= self.batch_size n_test_batches //= self.batch_size cost = -T.mean( T.log(self.final_output[T.arange(self.y.shape[0]), self.y])) error = T.mean(T.neq(T.argmax(self.final_output, axis=1), self.y)) # find all the parameters and update them using gradient descent params = self.params grads = T.grad(cost, params) updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] index = self.index batch_size = self.batch_size x = self.x y = self.y test_model = theano.function( [index], error, givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], error, givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is found improvement_threshold = 0.995 validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs): epoch = epoch + 1 for minibatch_index in range(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print('training @ iter = ', iter, flush=True) cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch {}, minibatch {}/{}, validation error {}%'. 
format(epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) with open('model_{}.mod'.format(iter), 'wb') as f: pickle.dump(self.dump(), f) # if we got the best validation score until now if this_validation_loss < best_validation_loss: if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch {}, minibatch {}/{}, test error of ' 'best model {}%').format(epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) with open('test_{}.res'.format(iter), 'w') as f: print(network.predict(test_set_x), file=f) end_time = timeit.default_timer() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print(('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
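# The early-stopping bookkeeping above follows the classic Deep Learning
# Tutorials pattern (patience, patience_increase, improvement_threshold).
# Condensed into a small helper for reference (a sketch, not part of the
# original code):
class Patience(object):
    """Track the best validation loss and extend patience on clear improvements."""

    def __init__(self, patience=10000, increase=2, threshold=0.995):
        self.patience = patience
        self.increase = increase
        self.threshold = threshold
        self.best = float('inf')

    def update(self, val_loss, iteration):
        """Return True if val_loss is a new best; extend patience if the improvement is large enough."""
        improved = val_loss < self.best
        if val_loss < self.best * self.threshold:
            self.patience = max(self.patience, iteration * self.increase)
        if improved:
            self.best = val_loss
        return improved

    def exhausted(self, iteration):
        """True once the iteration counter has outrun the (possibly extended) patience."""
        return iteration >= self.patience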
# Define loss function and metrics, and get an updates dictionary X_sym = T.tensor4() y_sym = T.ivector() # We'll connect our output classifier to the last fully connected layer of the network net['new_output'] = DenseLayer(net['pool5'], num_units=8, nonlinearity=softmax, W=lasagne.init.Normal(0.01)) prediction = lasagne.layers.get_output(net['new_output'], X_sym) loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym) loss = loss.mean() acc = T.mean(T.eq(T.argmax(prediction, axis=1), y_sym), dtype=theano.config.floatX) learning_rate = theano.shared(np.array(0.001, dtype=theano.config.floatX)) learning_rate_decay = np.array(0.3, dtype=theano.config.floatX) updates = OrderedDict() print("Setting learning rates...") for name, layer in net.items(): print(name) layer_params = layer.get_params(trainable=True) if name in ['new_output', 'fc1000']: layer_lr = learning_rate else: layer_lr = learning_rate / 10 if name != 'fc1000':
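# The snippet above breaks off while the per-layer update dictionary is being
# filled in. One plausible continuation, sketched here as an assumption rather
# than the original code: plain SGD per parameter, the full learning rate only
# for the new classifier, a 10x smaller rate for the pretrained layers, and
# 'fc1000' skipped entirely, as the final `if` hints.
from collections import OrderedDict
import theano.tensor as T

def per_layer_sgd_updates(net, loss, base_lr, full_rate=('new_output',), skip=('fc1000',)):
    """Build an OrderedDict of SGD updates with layer-dependent learning rates."""
    updates = OrderedDict()
    for name, layer in net.items():
        if name in skip:
            continue  # pretrained classifier that was replaced; leave it untouched
        layer_lr = base_lr if name in full_rate else base_lr / 10
        for param in layer.get_params(trainable=True):
            updates[param] = param - layer_lr * T.grad(loss, param)
    return updates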
def fit(self, X_task, y): DEBUG_FLAG = True # self.max_epochs = 333 self.batch_size = 100 rng = np.random.RandomState(42) self.input_taskdata = T.matrix(dtype='float32', name='input_taskdata') self.input_restdata = T.matrix(dtype='float32', name='input_restdata') self.params_from_last_iters = [] n_input = X_task.shape[1] index = T.iscalar(name='index') # prepare data for theano computation if not DEBUG_FLAG: X_train_s = theano.shared(value=np.float32(X_task), name='X_train_s') y_train_s = theano.shared(value=np.int32(y), name='y_train_s') lr_train_samples = len(X_task) else: from sklearn.cross_validation import StratifiedShuffleSplit folder = StratifiedShuffleSplit(y, n_iter=1, test_size=0.20) new_trains, inds_val = iter(folder).next() X_train, X_val = X_task[new_trains], X_task[inds_val] y_train, y_val = y[new_trains], y[inds_val] X_train_s = theano.shared(value=np.float32(X_train), name='X_train_s', borrow=False) y_train_s = theano.shared(value=np.int32(y_train), name='y_train_s', borrow=False) # X_val_s = theano.shared(value=np.float32(X_val), # name='X_train_s', borrow=False) # y_val_s = theano.shared(value=np.int32(y_val), # name='y_cal_s', borrow=False) lr_train_samples = len(X_train) self.dbg_epochs_ = list() self.dbg_acc_train_ = list() self.dbg_acc_val_ = list() self.dbg_ae_cost_ = list() self.dbg_lr_cost_ = list() self.dbg_ae_nonimprovesteps = list() self.dbg_acc_other_ds_ = list() self.dbg_prfs_ = list() self.dbg_prfs_other_ds_ = list() # computation graph: logistic regression clf_n_output = 18 # number of labels my_y = T.ivector(name='y') bV0_vals = np.zeros(clf_n_output).astype(np.float32) self.bV0 = theano.shared(value=bV0_vals, name='bV0') V0_vals = rng.randn(n_input, clf_n_output).astype( np.float32) * self.gain1 self.V0s = theano.shared(V0_vals) self.p_y_given_x = T.nnet.softmax( T.dot(self.input_taskdata, self.V0s) + self.bV0) self.lr_cost = -T.mean( T.log(self.p_y_given_x)[T.arange(my_y.shape[0]), my_y]) self.lr_cost = (self.lr_cost + T.mean(abs(self.V0s)) * self.penalty_l1 + T.mean(abs(self.bV0)) * self.penalty_l1 + T.mean( (self.V0s**np.float32(2))) * self.penalty_l2 + T.mean((self.bV0**np.float32(2))) * self.penalty_l2) self.y_pred = T.argmax(self.p_y_given_x, axis=1) givens_lr = { self.input_taskdata: X_train_s[index * self.batch_size:(index + 1) * self.batch_size], my_y: y_train_s[index * self.batch_size:(index + 1) * self.batch_size] } params = [self.V0s, self.bV0] updates = self.RMSprop(cost=self.lr_cost, params=params, lr=self.learning_rate) f_train_lr = theano.function([index], [self.lr_cost], givens=givens_lr, updates=updates) # optimization loop start_time = time.time() lr_last_cost = np.inf ae_cur_cost = np.inf no_improve_steps = 0 acc_train, acc_val = 0., 0. 
for i_epoch in range(self.max_epochs): if i_epoch == 1: epoch_dur = time.time() - start_time total_mins = (epoch_dur * self.max_epochs) / 60 hs, mins = divmod(total_mins, 60) print("Max estimated duration: %i hours and %i minutes" % (hs, mins)) lr_n_batches = lr_train_samples // self.batch_size for i in range(lr_n_batches): lr_cur_cost = f_train_lr(i)[0] # evaluate epoch cost if lr_last_cost - lr_cur_cost < 0.1: no_improve_steps += 1 else: lr_last_cost = lr_cur_cost no_improve_steps = 0 # logistic lr_last_cost = lr_cur_cost acc_train = self.score(X_train, y_train) acc_val, prfs_val = self.score(X_val, y_val, return_prfs=True) print( 'E:%i, ae_cost:%.4f, lr_cost:%.4f, train_score:%.2f, vald_score:%.2f, ae_badsteps:%i' % (i_epoch + 1, ae_cur_cost, lr_cur_cost, acc_train, acc_val, no_improve_steps)) if (i_epoch % 10 == 0): self.dbg_ae_cost_.append(ae_cur_cost) self.dbg_lr_cost_.append(lr_cur_cost) self.dbg_epochs_.append(i_epoch + 1) self.dbg_ae_nonimprovesteps.append(no_improve_steps) self.dbg_acc_train_.append(acc_train) self.dbg_acc_val_.append(acc_val) self.dbg_prfs_.append(prfs_val) # if i_epoch > (self.max_epochs - 100): param_pool = self.get_param_pool() self.params_from_last_iters.append(param_pool) total_mins = (time.time() - start_time) / 60 hs, mins = divmod(total_mins, 60) print("Final duration: %i hours and %i minutes" % (hs, mins)) return self
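# `self.RMSprop` is called earlier in this fit() method but not shown in the
# excerpt. A standard formulation it presumably resembles (an assumption): keep
# a running average of squared gradients per parameter and scale each gradient
# by its RMS.
import theano
import theano.tensor as T

def rmsprop_updates(cost, params, lr=1e-4, rho=0.9, epsilon=1e-6):
    """Return a list of (shared_variable, new_value) pairs implementing RMSprop."""
    updates = []
    for p in params:
        g = T.grad(cost, p)
        acc = theano.shared(p.get_value() * 0.)          # running mean of squared gradients
        acc_new = rho * acc + (1.0 - rho) * g ** 2
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g / T.sqrt(acc_new + epsilon)))
    return updates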
def main(): # step 1: get the data and define all the usual variables X, Y = get_normalized_data() max_iter = 20 print_period = 10 lr = 0.00004 reg = 0.01 X = X.astype(np.float32) Y = Y.astype(np.float32) Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain).astype(np.float32) Ytest_ind = y2indicator(Ytest).astype(np.float32) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1_init = np.random.randn(D, M) / 28 b1_init = np.zeros(M) W2_init = np.random.randn(M, K) / np.sqrt(M) b2_init = np.zeros(K) # step 2: define theano variables and expressions thX = T.matrix('X') thT = T.matrix('T') W1 = theano.shared(W1_init.astype(np.float32), 'W1') b1 = theano.shared(b1_init.astype(np.float32), 'b1') W2 = theano.shared(W2_init.astype(np.float32), 'W2') b2 = theano.shared(b2_init.astype(np.float32), 'b2') # we can use the built-in theano functions to do relu and softmax thZ = relu( thX.dot(W1) + b1 ) # relu is new in version 0.7.1 but just in case you don't have it thY = T.nnet.softmax( thZ.dot(W2) + b2 ) # define the cost function and prediction cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum()) prediction = T.argmax(thY, axis=1) # step 3: training expressions and functions # we can just include regularization as part of the cost because it is also automatically differentiated! # update_W1 = W1 - lr*(T.grad(cost, W1) + reg*W1) # update_b1 = b1 - lr*(T.grad(cost, b1) + reg*b1) # update_W2 = W2 - lr*(T.grad(cost, W2) + reg*W2) # update_b2 = b2 - lr*(T.grad(cost, b2) + reg*b2) update_W1 = W1 - lr*T.grad(cost, W1) update_b1 = b1 - lr*T.grad(cost, b1) update_W2 = W2 - lr*T.grad(cost, W2) update_b2 = b2 - lr*T.grad(cost, b2) train = theano.function( inputs=[thX, thT], updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)], ) # create another function for this because we want it over the whole dataset get_prediction = theano.function( inputs=[thX, thT], outputs=[cost, prediction], ) t0 = datetime.now() for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] train(Xbatch, Ybatch) if j % print_period == 0: cost_val, prediction_val = get_prediction(Xtest, Ytest_ind) err = error_rate(prediction_val, Ytest) print "Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err) print "Training time:", datetime.now() - t0
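# `y2indicator` and `error_rate` are imported helpers not shown in this
# excerpt; minimal stand-ins (assumptions) with the behaviour the loop above
# relies on.
import numpy as np

def y2indicator(y, n_classes=10):
    """One-hot encode an integer label vector as an (N, n_classes) float32 matrix."""
    ind = np.zeros((len(y), n_classes), dtype=np.float32)
    ind[np.arange(len(y)), y.astype(np.int32)] = 1.0
    return ind

def error_rate(predictions, targets):
    """Fraction of predictions that do not match the integer targets."""
    return np.mean(predictions != targets)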
from models import ResNet_FullPreActivation, ResNet_BottleNeck_FullPreActivation from utils import load_pickle_data_test BATCHSIZE = 1 ''' Set up all theano functions ''' X = T.tensor4('X') Y = T.ivector('y') # set up theano functions to generate output by feeding data through network, any test outputs should be deterministic output_layer = ResNet_BottleNeck_FullPreActivation(X, n=18) output_test = lasagne.layers.get_output(output_layer, deterministic=True) output_class = T.argmax(output_test, axis=1) # set up training and prediction functions predict_proba = theano.function(inputs=[X], outputs=output_test) predict_class = theano.function(inputs=[X], outputs=output_class) ''' Load data and make predictions ''' test_X, test_y = load_pickle_data_test() # load network weights f = gzip.open('data/weights/resnet164_fullpreactivation.pklz', 'rb') all_params = pickle.load(f) f.close() helper.set_all_param_values(output_layer, all_params)
print ipt.ndim, 'dimensions' except: print 'no ndim' print min_informative_str(ipt) if found > 0: print type(node.op), found try: print '\t', type(node.op.scalar_op) except: pass print count test = CIFAR10(which_set='test', one_hot=True, gcn=55.) yl = T.argmax(yb, axis=1) mf1acc = 1. - T.neq(yl, T.argmax(ymf1, axis=1)).mean() #mfnacc = 1.-T.neq(yl , T.argmax(mfny,axis=1)).mean() batch_acc = function([Xb, yb], [mf1acc]) def accs(): mf1_accs = [] for i in xrange(10000 / batch_size): mf1_accs.append( batch_acc( test.get_topological_view(test.X[i * batch_size:(i + 1) * batch_size, :]), test.y[i * batch_size:(i + 1) * batch_size, :])[0])
def main(argv): ###-------------------------------- Get files to proccess ---------------------------------------------- # just read header to get layer dimensions #files = glob.glob("B:\\NN_data\\THEANO_DATA\\tmp\\*.tmp") context = 5 in_frame_num = context * 2 + 1 feature_per_frame = 41 output_vec_len = 214 files = glob.glob(argv[0] + "\\*.mfsc") #inputs, in_frame_num, feature_per_frame, output_vec_len = read_data_only_inputs(files[0]) print("\feature_per_frame: {}".format(feature_per_frame)) print("\output_vec_len: {}".format(output_vec_len)) ###-------------------------------- Read apriori state ppb --------------------------------------------- apriori_ppb_input = read_state_apriori_ppb(argv[2], output_vec_len) #print(apriori_ppb) #exit(0) ###-------------------------------- BUILD Theano functions ---------------------------------------------- network = None layer2reg = None total_batch = 0 # Prepare Theano variables for inputs and targets input_var = T.tensor4('inputs') apriori_ppb = T.fmatrix('apriori') target_var = T.ivector('targets') lrate_var = theano.tensor.scalar('lrate', dtype='float32') momentum_var = theano.tensor.scalar('momentum', dtype='float32') #create network #network, layer2reg = build_cnn(input_var, in_frame_num, feature_per_frame, output_vec_len) with open(argv[1], "rb") as f2: context = int(f2.readline()) in_frame_num = context * 2 + 1 filter_num = int(f2.readline()) neuron_num = int(f2.readline()) network, layer2reg = build_cnn(input_var, in_frame_num, feature_per_frame, output_vec_len, filter_num, neuron_num) lasagne.layers.set_all_param_values(network, np.load(f2)) # Create a loss expression for training, i.e., a scalar objective we want # to minimize (for our multi-class problem, it is the cross-entropy loss): prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean() # SGB with momentm and changing learning rate params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=lrate_var, momentum=momentum_var) test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var) test_loss = test_loss.mean() test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile a training function train_fn = theano.function( [input_var, target_var, lrate_var, momentum_var], loss, updates=updates) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) test_prediction_1 = lasagne.layers.get_output(network, deterministic=True) test_prediction = T.dot(test_prediction_1, apriori_ppb) test_result = T.argmax(test_prediction_1, axis=1) # Compile a second function computing the validation loss and accuracy: predict_fn = theano.function([input_var], [test_result]) predict_fn_j = theano.function([input_var, apriori_ppb], [test_prediction]) print("theano functions compiled") ###-------------------------------- Final lunch ---------------------------------------------- file_counter = 0 print("") for one_file in files: inputs_j = load_mfsc_data_with_context(one_file, context) dot_index = one_file.find('.mfsc') output_file_name = one_file[0:dot_index] + ".bin" print(inputs_j[0][0]) sample_num = len(inputs_j) base_file_name = output_file_name[output_file_name.rfind('\\') + 1:] file_counter += 1 if file_counter % 5 
== 0: print(base_file_name) else: print(base_file_name + " ", end="") with open(output_file_name, 'wb') as f: f.write(struct.pack('I', swap32(len(inputs_j)))) f.write(struct.pack('I', swap32(100000))) f.write(struct.pack('H', output_vec_len * 4)[::-1]) f.write(struct.pack('H', 9)[::-1]) begin = 0 batch = 8 while begin < sample_num: batch_size = np.minimum(batch, sample_num - begin) result = predict_fn_j( inputs_j[begin:begin + batch_size].reshape( batch_size, 3, in_frame_num, feature_per_frame), apriori_ppb_input) res = np.log(result)[0] for aaa in res: for ppb in aaa: f.write(ppb) #f.write(struct.pack('f',ppb)[::-1]) begin += batch_size print("")
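# `swap32` is used above to write the header words in big-endian byte order but
# is not defined in this excerpt; a typical implementation (an assumption):
def swap32(x):
    """Byte-swap a 32-bit unsigned integer (little-endian <-> big-endian)."""
    return (((x << 24) & 0xFF000000) |
            ((x << 8) & 0x00FF0000) |
            ((x >> 8) & 0x0000FF00) |
            ((x >> 24) & 0x000000FF))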
def generator_step_sm(x_tm1, h_tm1, m_tm1, s_tm1, tau, eps): """One step of the generative decoder version.""" # x_tm1 is `BxT` one-hot, h_tm1 is `batch x ...` # m_tm1 is `batch`, tau, eps are scalars # collect the inputs inputs = {l_decoder_embed: x_tm1.dimshuffle(0, "x", 1), l_decoder_mask: m_tm1.dimshuffle(0, "x")} # Connect the prev variables to the hidden and stack state feeds j = 0 for layer in dec_rnn_layers: inputs[layer.hid_init] = slice_(h_tm1, j, layer.num_units) j += layer.num_units j = 0 for layer in dec_rnn_layers: layer = layer.input_layers[1] dep, wid = layer.output_shape[-2:] stack_slice_ = slice_(s_tm1, j, dep * wid) inputs[layer] = stack_slice_.reshape((-1, dep, wid)) j += dep * wid # Get the outputs outputs = [l_decoder_reembedder] for pair in zip(dec_rnn_layers_sliced, dec_rnn_layers_stack): outputs.extend(pair) # propagate through the decoder column logit_t, *rest = lasagne.layers.get_output(outputs, inputs, deterministic=True) h_t_list, s_t_list = rest[::2], rest[1::2] # Pack the hidden and flattened stack states h_t = tt.concatenate(h_t_list, axis=-1) s_t = tt.concatenate([v.flatten(ndim=2) for v in s_t_list], axis=-1) # Generate the next symbol: logit_t is `Bx1xV` logit_t = logit_t[:, 0] prob_t = tt.nnet.softmax(logit_t) # Gumbel-softmax sampling: Gumbel (e^{-e^{-x}}) distributed random noise gumbel = -tt.log(-tt.log(theano_random_state.uniform(size=logit_t.shape) + eps) + eps) # logit_t = theano.ifelse.ifelse(tt.gt(tau, 0), gumbel + logit_t, logit_t) # inv_temp = theano.ifelse.ifelse(tt.gt(tau, 0), 1.0 / tau, tt.constant(1.0)) logit_t = tt.switch(tt.gt(tau, 0), gumbel + logit_t, logit_t) inv_temp = tt.switch(tt.gt(tau, 0), 1.0 / tau, tt.constant(1.0)) # Get the softmax: x_t is `BxV` x_t = tt.nnet.softmax(logit_t * inv_temp) # Get the best symbol c_t = tt.cast(tt.argmax(x_t, axis=-1), "int8") # Get the estimated probability of the picked symbol. p_t = prob_t[tt.arange(c_t.shape[0]), c_t] # Compute the mask and inhibit the propagation on a stop symbol. # Recurrent layers return the previous state if m_tm1 is False m_t = m_tm1 & tt.gt(c_t, vocab.index("\x03")) c_t = tt.switch(m_t, c_t, vocab.index("\x03")) # There is no need to freeze the states as they will be frozen by # the RNN passthrough according to the mask `m_t`. # Embed the current character. x_t = tt.dot(x_t, l_embed_char.W) return x_t, h_t, m_t, s_t, p_t, c_t
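# The sampling trick above in isolation: adding Gumbel(0, 1) noise to the
# logits and taking an argmax draws an exact categorical sample, and replacing
# the argmax with a temperature-scaled softmax gives the differentiable
# Gumbel-softmax relaxation. A standalone NumPy sketch of the same idea:
import numpy as np

def gumbel_softmax_sample(logits, tau=1.0, eps=1e-20, rng=np.random):
    """Return a relaxed one-hot sample: softmax((logits + Gumbel noise) / tau)."""
    gumbel = -np.log(-np.log(rng.uniform(size=logits.shape) + eps) + eps)
    z = (logits + gumbel) / tau
    z = z - z.max(axis=-1, keepdims=True)   # subtract max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)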
loss_remember = (I0*lasagne.objectives.squared_error(W0,W_n0)).mean()+ \ (I1*lasagne.objectives.squared_error(W1,W_n1)).mean()+ \ (I2*lasagne.objectives.squared_error(W2,W_n2)).mean() loss = loss_class + 100 * loss_remember # Get network params, with specifications of manually updated ones params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.sgd(loss, params, learning_rate=0.0001) #updates = lasagne.updates.adam(loss,params,learning_rate=0.00001) #updates = lasagne.updates.nesterov_momentum(loss,params,learning_rate=0.01) test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var) test_loss = test_loss.mean() test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile theano function computing the training validation loss and accuracy: train_fn = theano.function([input_var, target_var, W0, W1, W2, I0, I1, I2], [loss, loss_class, loss_remember], updates=updates) #train_fn = theano.function([input_var, target_var], loss, updates=updates) val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) #gradient_fn = theano.function([input_var, target_var], gradient) # The training loop print("Starting training...") num_epochs = 250 for epoch in range(num_epochs):
def train(self, Xs, Ys, Xv, Yv, mdl, data_folder='data/', out_folder='out/'): data_folder = os.path.join(data_folder, 'imgs/', 'train/') input_var = mdl.input_var net = mdl.get_output_layer() target_var = T.ivector('targets') prediction = lasagne.layers.get_output(net) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean() params = lasagne.layers.get_all_params(net, trainable=True) grads = T.grad(loss, params) test_prediction = lasagne.layers.get_output(net, deterministic=True) test_loss = lasagne.objectives. \ categorical_crossentropy(test_prediction, target_var) test_loss = test_loss.mean() test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) logger.info("Compiling network functions...") grads_fn = theano.function([input_var, target_var], grads) train_fn = theano.function([input_var, target_var], loss) val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) predict_proba = theano.function([input_var], test_prediction) logger.info("Training...") logger.info('GPU Free Mem: %.3f' % gpu_free_mem('gb')) # TODO change to steps epochs = self.max_iter / len(Xs) best_val_loss, best_epoch = None, None best_mdl_path = os.path.join(out_folder, 'best_model.npz') if not os.path.exists(out_folder): os.makedirs(out_folder) steps = 0 for epoch in range(epochs): start_time = time.time() train_err, train_batches = 0, 0 data_s = FileSystemData(Xs, Ys, data_folder, self.batch_size, infinite=False, augment=True, shuffle=True) step_err, step_g = 0, None for batch in tqdm(data_s, total=data_s.steps, leave=False): inputs, targets = batch inputs = floatX(np.array([mdl.preprocess(x) for x in inputs])) batch_err = train_fn(inputs, targets) batch_g = grads_fn(inputs, targets) if step_g is None: step_g = batch_g else: step_g = [s_g + b_g for s_g, b_g in zip(step_g, batch_g)] train_err += batch_err step_err += batch_err train_batches += 1 if train_batches % self.iter_size == 0: step_g = [g / np.array(self.iter_size) for g in step_g] if steps == 0: t_prev, m_prev, u_prev = \ init_adam(batch_g, params) updates = step_adam(step_g, params, t_prev, m_prev, u_prev, learning_rate=self.base_lr) for p, new_val in updates.items(): p.set_value(new_val) steps += 1 step_err, step_g = 0, None data_v = FileSystemData(Xv, Yv, data_folder, self.batch_size, infinite=False, augment=False, shuffle=False) val_err, val_acc, val_batches = 0, 0, 0 for batch in tqdm(data_v, total=data_v.steps, leave=False): inputs, targets = batch inputs = floatX(np.array([mdl.preprocess(x) for x in inputs])) err, acc = val_fn(inputs, targets) val_err += err val_acc += acc val_batches += 1 train_loss = train_err / train_batches val_loss = val_err / val_batches val_acc = val_acc / val_batches * 100 end_time = time.time() - start_time if not best_val_loss or val_loss < best_val_loss: best_val_loss = val_loss best_epoch = epoch np.savez(best_mdl_path, *lasagne.layers.get_all_param_values(net)) snapshot_path = os.path.join(out_folder, 'snapshot_epoch_%d.npz' % epoch) np.savez(snapshot_path, *lasagne.layers.get_all_param_values(net)) logger.info("epoch[%d] -- Ls: %.3f | Lv: %.3f | ACCv: %.3f | Ts: %.3f" % (epoch, train_loss, val_loss, val_acc, end_time)) logger.info("loading best model: epoch[%d]" % best_epoch) with np.load(best_mdl_path) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values(net, param_values) return predict_proba
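# `init_adam` and `step_adam` are project helpers not included in this excerpt.
# A plain-NumPy Adam of the kind they presumably implement, applied to Theano
# shared parameters; this is an assumption with a different interface from the
# original helpers, shown only to clarify the gradient-accumulation step above.
import numpy as np

class AdamState(object):
    """Minimal NumPy Adam, updating Theano shared variables via set_value()."""

    def __init__(self, params, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
        self.params, self.lr = params, lr
        self.beta1, self.beta2, self.eps = beta1, beta2, eps
        self.t = 0
        self.m = [np.zeros_like(p.get_value()) for p in params]  # first moments
        self.v = [np.zeros_like(p.get_value()) for p in params]  # second moments

    def step(self, grads):
        """Apply one Adam update given averaged NumPy gradients (one per parameter)."""
        self.t += 1
        for i, (p, g) in enumerate(zip(self.params, grads)):
            g = np.asarray(g)
            self.m[i] = self.beta1 * self.m[i] + (1.0 - self.beta1) * g
            self.v[i] = self.beta2 * self.v[i] + (1.0 - self.beta2) * g ** 2
            m_hat = self.m[i] / (1.0 - self.beta1 ** self.t)
            v_hat = self.v[i] / (1.0 - self.beta2 ** self.t)
            new_value = p.get_value() - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
            p.set_value(new_value.astype(p.get_value().dtype))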
def __init__( self, input_shape, output_dim, prob_network=None, hidden_sizes=(32, 32), hidden_nonlinearity=NL.rectify, optimizer=None, use_trust_region=True, step_size=0.01, normalize_inputs=True, name=None, ): """ :param input_shape: Shape of the input data. :param output_dim: Dimension of output. :param hidden_sizes: Number of hidden units of each layer of the mean network. :param hidden_nonlinearity: Non-linearity used for each layer of the mean network. :param optimizer: Optimizer for minimizing the negative log-likelihood. :param use_trust_region: Whether to use trust region constraint. :param step_size: KL divergence constraint for each iteration """ Serializable.quick_init(self, locals()) if optimizer is None: if use_trust_region: optimizer = PenaltyLbfgsOptimizer() else: optimizer = LbfgsOptimizer() self.output_dim = output_dim self._optimizer = optimizer if prob_network is None: prob_network = MLP( input_shape=input_shape, output_dim=output_dim, hidden_sizes=hidden_sizes, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=NL.softmax, ) l_prob = prob_network.output_layer LasagnePowered.__init__(self, [l_prob]) xs_var = prob_network.input_layer.input_var ys_var = TT.imatrix("ys") old_prob_var = TT.matrix("old_prob") x_mean_var = theano.shared(np.zeros((1, ) + input_shape), name="x_mean", broadcastable=(True, ) + (False, ) * len(input_shape)) x_std_var = theano.shared(np.ones((1, ) + input_shape), name="x_std", broadcastable=(True, ) + (False, ) * len(input_shape)) normalized_xs_var = (xs_var - x_mean_var) / x_std_var prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var}) old_info_vars = dict(prob=old_prob_var) info_vars = dict(prob=prob_var) dist = self._dist = Categorical(output_dim) mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars)) loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars)) predicted = tensor_utils.to_onehot_sym(TT.argmax(prob_var, axis=1), output_dim) self._f_predict = tensor_utils.compile_function([xs_var], predicted) self._f_prob = tensor_utils.compile_function([xs_var], prob_var) self._prob_network = prob_network self._l_prob = l_prob optimizer_args = dict( loss=loss, target=self, network_outputs=[prob_var], ) if use_trust_region: optimizer_args["leq_constraint"] = (mean_kl, step_size) optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var] else: optimizer_args["inputs"] = [xs_var, ys_var] self._optimizer.update_opt(**optimizer_args) self._use_trust_region = use_trust_region self._name = name self._normalize_inputs = normalize_inputs self._x_mean_var = x_mean_var self._x_std_var = x_std_var
def fit(self, X, learning_rate=1e-5, mu=0.99, epochs=10, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU, normalize=True): D = self.D V = self.V N = len(X) We = init_weight(V, D) self.hidden_layers = [] Mi = D for Mo in self.hidden_layer_sizes: ru = RecurrentUnit(Mi, Mo, activation) self.hidden_layers.append(ru) Mi = Mo Wo = init_weight(Mi, V) bo = np.zeros(V) self.We = theano.shared(We) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wo, self.bo] for ru in self.hidden_layers: self.params += ru.params thX = T.ivector('X') thY = T.ivector('Y') Z = self.We[thX] for ru in self.hidden_layers: Z = ru.output(Z) py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) # let's return py_x too so we can draw a sample instead self.predict_op = theano.function( inputs=[thX], outputs=[py_x, prediction], allow_input_downcast=True, ) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value()*0) for p in self.params] dWe = theano.shared(self.We.get_value()*0) gWe = T.grad(cost, self.We) dWe_update = mu*dWe - learning_rate*gWe We_update = self.We + dWe_update if normalize: We_update /= We_update.norm(2) updates = [ (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads) ] + [ (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads) ] + [ (self.We, We_update), (dWe, dWe_update) ] self.train_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction], updates=updates ) costs = [] for i in range(epochs): t0 = datetime.now() X = shuffle(X) n_correct = 0 n_total = 0 cost = 0 for j in range(N): if np.random.random() < 0.01 or len(X[j]) <= 1: input_sequence = [0] + X[j] output_sequence = X[j] + [1] else: input_sequence = [0] + X[j][:-1] output_sequence = X[j] n_total += len(output_sequence) # test: try: # we set 0 to start and 1 to end c, p = self.train_op(input_sequence, output_sequence) except Exception as e: PYX, pred = self.predict_op(input_sequence) print("input_sequence len:", len(input_sequence)) print("PYX.shape:",PYX.shape) print("pred.shape:", pred.shape) raise e # print "p:", p cost += c # print "j:", j, "c:", c/len(X[j]+1) for pj, xj in zip(p, output_sequence): if pj == xj: n_correct += 1 if j % 200 == 0: sys.stdout.write("j/N: %d/%d correct rate so far: %f\r" % (j, N, float(n_correct)/n_total)) sys.stdout.flush() print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0)) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
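# `init_weight` comes from an accompanying util module that is not shown; the
# convention reproduced here is an assumption (Gaussian scaled by fan-in plus
# fan-out), matching how the other weights in this snippet are initialized.
import numpy as np

def init_weight(Mi, Mo):
    """Random (Mi x Mo) weight matrix with variance shrunk by the layer sizes."""
    return np.random.randn(Mi, Mo) / np.sqrt(Mi + Mo)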
def main(num_epochs=NEPOCH): print("Loading data ...") snli = SNLI(batch_size=BSIZE) train_batches = list(snli.train_minibatch_generator()) dev_batches = list(snli.dev_minibatch_generator()) test_batches = list(snli.test_minibatch_generator()) W_word_embedding = snli.weight # W shape: (# vocab size, WE_DIM) W_word_embedding = snli.weight / \ (numpy.linalg.norm(snli.weight, axis=1).reshape(snli.weight.shape[0], 1) + \ 0.00001) del snli print("Building network ...") ########### input layers ########### # hypothesis input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector') input_var_h.tag.test_value = numpy.hstack( (numpy.random.randint(1, 10000, (BSIZE, 18), 'int32'), numpy.zeros( (BSIZE, 6)).astype('int32'))) l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var_h) input_mask_h = T.TensorType('int32', [False, False])('hypo_mask') input_mask_h.tag.test_value = numpy.hstack((numpy.ones( (BSIZE, 18), dtype='int32'), numpy.zeros((BSIZE, 6), dtype='int32'))) input_mask_h.tag.test_value[1, 18:22] = 1 l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask_h) # premise input_var_p = T.TensorType('int32', [False, False])('premise_vector') input_var_p.tag.test_value = numpy.hstack( (numpy.random.randint(1, 10000, (BSIZE, 16), 'int32'), numpy.zeros( (BSIZE, 3)).astype('int32'))) l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var_p) input_mask_p = T.TensorType('int32', [False, False])('premise_mask') input_mask_p.tag.test_value = numpy.hstack((numpy.ones( (BSIZE, 16), dtype='int32'), numpy.zeros((BSIZE, 3), dtype='int32'))) input_mask_p.tag.test_value[1, 16:18] = 1 l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask_p) ################################### # output shape (BSIZE, None, WEDIM) l_hypo_embed = lasagne.layers.EmbeddingLayer( l_in_h, input_size=W_word_embedding.shape[0], output_size=W_word_embedding.shape[1], W=W_word_embedding) l_prem_embed = lasagne.layers.EmbeddingLayer( l_in_p, input_size=W_word_embedding.shape[0], output_size=W_word_embedding.shape[1], W=l_hypo_embed.W) # EMBEDING MAPPING: output shape (BSIZE, None, WEMAP) l_hypo_reduced_embed = DenseLayer3DInput(l_hypo_embed, num_units=WEMAP, b=None, nonlinearity=None) l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_reduced_embed, p=DPOUT, rescale=True) l_prem_reduced_embed = DenseLayer3DInput(l_prem_embed, num_units=WEMAP, W=l_hypo_reduced_embed.W, b=None, nonlinearity=None) l_prem_embed_dpout = lasagne.layers.DropoutLayer(l_prem_reduced_embed, p=DPOUT, rescale=True) # ATTEND l_hypo_embed_hid1 = DenseLayer3DInput( l_hypo_embed_dpout, num_units=EMBDHIDA, b=None, nonlinearity=lasagne.nonlinearities.rectify) l_hypo_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_hypo_embed_hid1, p=DPOUT, rescale=True) l_hypo_embed_hid2 = DenseLayer3DInput( l_hypo_embed_hid1_dpout, num_units=EMBDHIDB, b=None, nonlinearity=lasagne.nonlinearities.rectify) l_prem_embed_hid1 = DenseLayer3DInput( l_prem_embed_dpout, num_units=EMBDHIDA, W=l_hypo_embed_hid1.W, b=None, nonlinearity=lasagne.nonlinearities.rectify) l_prem_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_prem_embed_hid1, p=DPOUT, rescale=True) l_prem_embed_hid2 = DenseLayer3DInput( l_prem_embed_hid1_dpout, num_units=EMBDHIDB, W=l_hypo_embed_hid2.W, b=None, nonlinearity=lasagne.nonlinearities.rectify) # output dim: (BSIZE, NROWx, NROWy) l_e = ComputeEmbeddingPool([l_hypo_embed_hid1, l_prem_embed_hid2]) # output dim: (BSIZE, NROWy, DIM) l_hypo_weighted = 
AttendOnEmbedding([l_hypo_reduced_embed, l_e], masks=[l_mask_h, l_mask_p], direction='col') # output dim: (BSIZE, NROWx, DIM) l_prem_weighted = AttendOnEmbedding([l_prem_reduced_embed, l_e], masks=[l_mask_h, l_mask_p], direction='row') # COMPARE # output dim: (BSIZE, NROW, 4*LSTMHID) l_hypo_premwtd = lasagne.layers.ConcatLayer( [l_hypo_reduced_embed, l_prem_weighted], axis=2) l_prem_hypowtd = lasagne.layers.ConcatLayer( [l_prem_reduced_embed, l_hypo_weighted], axis=2) l_hypo_premwtd_dpout = lasagne.layers.DropoutLayer(l_hypo_premwtd, p=DPOUT, rescale=True) l_hypo_comphid1 = DenseLayer3DInput( l_hypo_premwtd_dpout, num_units=COMPHIDA, b=None, nonlinearity=lasagne.nonlinearities.rectify) l_hypo_comphid1_dpout = lasagne.layers.DropoutLayer(l_hypo_comphid1, p=DPOUT, rescale=True) l_hypo_comphid2 = DenseLayer3DInput( l_hypo_comphid1_dpout, num_units=COMPHIDB, b=None, nonlinearity=lasagne.nonlinearities.rectify) l_prem_hypowtd_dpout = lasagne.layers.DropoutLayer(l_prem_hypowtd, p=DPOUT, rescale=True) l_prem_comphid1 = DenseLayer3DInput( l_prem_hypowtd_dpout, num_units=COMPHIDA, W=l_hypo_comphid1.W, b=None, nonlinearity=lasagne.nonlinearities.rectify) l_prem_comphid1_dpout = lasagne.layers.DropoutLayer(l_prem_comphid1, p=DPOUT, rescale=True) l_prem_comphid2 = DenseLayer3DInput( l_prem_comphid1_dpout, num_units=COMPHIDB, W=l_hypo_comphid2.W, b=None, nonlinearity=lasagne.nonlinearities.rectify) # AGGREGATE # output dim: (BSIZE, 4*LSTMHID) l_hypo_mean = MeanOverDim(l_hypo_comphid2, mask=l_mask_h, dim=1) l_prem_mean = MeanOverDim(l_prem_comphid2, mask=l_mask_p, dim=1) l_v1v2 = lasagne.layers.ConcatLayer([l_hypo_mean, l_prem_mean], axis=1) l_v1v2_dpout = lasagne.layers.DropoutLayer(l_v1v2, p=DPOUT, rescale=True) l_outhid1 = lasagne.layers.DenseLayer( l_v1v2_dpout, num_units=OUTHID, b=None, nonlinearity=lasagne.nonlinearities.rectify) l_outhid1_dpout = lasagne.layers.DropoutLayer(l_outhid1, p=DPOUT, rescale=True) l_outhid2 = lasagne.layers.DenseLayer( l_outhid1_dpout, num_units=OUTHID, b=None, nonlinearity=lasagne.nonlinearities.rectify) # l_outhid2_dpout = lasagne.layers.DropoutLayer(l_outhid2, p=DPOUT, rescale=True) l_output = lasagne.layers.DenseLayer( l_outhid2, num_units=3, b=None, nonlinearity=lasagne.nonlinearities.softmax) ########### target, cost, validation, etc. 
########## target_values = T.ivector('target_output') target_values.tag.test_value = numpy.asarray([ 1, ] * BSIZE, dtype='int32') network_output = lasagne.layers.get_output(l_output) network_prediction = T.argmax(network_output, axis=1) error_rate = T.mean(T.neq(network_prediction, target_values)) network_output_clean = lasagne.layers.get_output(l_output, deterministic=True) network_prediction_clean = T.argmax(network_output_clean, axis=1) error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values)) cost = T.mean( T.nnet.categorical_crossentropy(network_output, target_values)) cost_clean = T.mean( T.nnet.categorical_crossentropy(network_output_clean, target_values)) # Retrieve all parameters from the network all_params = lasagne.layers.get_all_params(l_output) if not UPDATEWE: all_params.remove(l_hypo_embed.W) numparams = sum( [numpy.prod(i) for i in [i.shape.eval() for i in all_params]]) print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams)) print("-----------------------------------------------------------------") for item in all_params: print("{0:24}{1:24}{2}".format(item, item.shape.eval(), numpy.prod(item.shape.eval()))) # if exist param file then load params look_for = 'params' + os.sep + 'params_' + filename + '.pkl' if os.path.isfile(look_for): print("Resuming from file: " + look_for) all_param_values = cPickle.load(open(look_for, 'rb')) for p, v in zip(all_params, all_param_values): p.set_value(v) # Compute SGD updates for training print("Computing updates ...") updates = lasagne.updates.adagrad(cost, all_params, LR) # Theano functions for training and computing cost print("Compiling functions ...") train = theano.function([ l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var, l_mask_p.input_var, target_values ], [cost, error_rate], updates=updates) # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) compute_cost = theano.function([ l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var, l_mask_p.input_var, target_values ], [cost_clean, error_rate_clean]) # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) def evaluate(mode): if mode == 'dev': data = dev_batches if mode == 'test': data = test_batches set_cost = 0. set_error_rate = 0. for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1): _cost, _error = compute_cost(hypo, hm, premise, pm, truth) set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \ 1.0 / batches_seen * _cost set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \ 1.0 / batches_seen * _error return set_cost, set_error_rate print("Done. Evaluating scratch model ...") dev_set_cost, dev_set_error = evaluate('dev') print("BEFORE TRAINING: dev cost %f, error %f" % (dev_set_cost, dev_set_error)) print("Training ...") try: for epoch in range(num_epochs): train_set_cost = 0. train_set_error = 0. 
start = time.time() for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(train_batches, 1): _cost, _error = train(hypo, hm, premise, pm, truth) train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \ 1.0 / batches_seen * _cost train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \ 1.0 / batches_seen * _error if (batches_seen * BSIZE) % 5000 == 0: end = time.time() print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" % (batches_seen * BSIZE, end - start, LR, train_set_cost, train_set_error)) start = end if (batches_seen * BSIZE) % 100000 == 0: dev_set_cost, dev_set_error = evaluate('dev') print("***dev cost %f, error %f" % (dev_set_cost, dev_set_error)) # save parameters all_param_values = [p.get_value() for p in all_params] cPickle.dump( all_param_values, open('params' + os.sep + 'params_' + filename + '.pkl', 'wb')) dev_set_cost, dev_set_error = evaluate('dev') test_set_cost, test_set_error = evaluate('test') print("epoch %d, cost: train %f dev %f test %f;\n" " error train %f dev %f test %f" % (epoch, train_set_cost, dev_set_cost, test_set_cost, train_set_error, dev_set_error, test_set_error)) except KeyboardInterrupt: pdb.set_trace() pass
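# The running averages in `evaluate` and in the epoch loop above use the
# standard incremental-mean update; as a tiny standalone helper for reference:
def update_running_mean(mean, new_value, n_seen):
    """Mean of n_seen items, given the mean of the first n_seen - 1 items and the new one."""
    return (1.0 - 1.0 / n_seen) * mean + (1.0 / n_seen) * new_value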
def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): """ Initialize environment """ QNetwork.__init__(self, environment, batch_size) self._rho = rho self._rms_epsilon = rms_epsilon self._momentum = momentum self._clip_delta = clip_delta self._freeze_interval = freeze_interval self._double_Q = double_Q self._random_state = random_state self.update_counter = 0 states = [ ] # list of symbolic variables for each of the k element in the belief state # --> [ T.tensor4 if observation of element=matrix, T.tensor3 if vector, T.tensor 2 if scalar ] next_states = [] # idem than states at t+1 self.states_shared = [ ] # list of shared variable for each of the k element in the belief state self.next_states_shared = [] # idem that self.states_shared at t+1 for i, dim in enumerate(self._input_dimensions): if len(dim) == 3: states.append(T.tensor4("%s_%s" % ("state", i))) next_states.append(T.tensor4("%s_%s" % ("next_state", i))) elif len(dim) == 2: states.append(T.tensor3("%s_%s" % ("state", i))) next_states.append(T.tensor3("%s_%s" % ("next_state", i))) elif len(dim) == 1: states.append(T.matrix("%s_%s" % ("state", i))) next_states.append(T.matrix("%s_%s" % ("next_state", i))) self.states_shared.append( theano.shared(np.zeros((batch_size, ) + dim, dtype=theano.config.floatX), borrow=False)) self.next_states_shared.append( theano.shared(np.zeros((batch_size, ) + dim, dtype=theano.config.floatX), borrow=False)) print("Number of observations per state: {}".format( len(self.states_shared))) print("For each observation, historySize + ponctualObs_i.shape: {}". format(self._input_dimensions)) rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX) thelr = T.scalar(name='thelr', dtype=theano.config.floatX) Q_net = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) self.q_vals, self.params, shape_after_conv = Q_net._buildDQN(states) print( "Number of neurons after spatial and temporal convolution layers: {}" .format(shape_after_conv)) self.next_q_vals, self.next_params, shape_after_conv = Q_net._buildDQN( next_states) self._resetQHat() self.rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) if (self._double_Q == True): givens_next = {} for i, x in enumerate(self.next_states_shared): givens_next[states[i]] = x self.next_q_vals_current_qnet = theano.function([], self.q_vals, givens=givens_next) next_q_curr_qnet = theano.clone(self.next_q_vals) argmax_next_q_vals = T.argmax(next_q_curr_qnet, axis=1, keepdims=True) max_next_q_vals = self.next_q_vals[T.arange(batch_size), argmax_next_q_vals.reshape( (-1, ))].reshape((-1, 1)) else: max_next_q_vals = T.max(self.next_q_vals, axis=1, keepdims=True) not_terminals = T.ones_like(terminals) - terminals target = rewards + not_terminals * thediscount * max_next_q_vals q_val = self.q_vals[T.arange(batch_size), actions.reshape((-1, ))].reshape((-1, 1)) # Note : Strangely (target - q_val) lead to problems with python 3.5, theano 0.8.0rc and floatX=float32... 
diff = -q_val + target if self._clip_delta > 0: # This loss function implementation is taken from # https://github.com/spragunr/deep_q_rl # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self._clip_delta) linear_part = abs(diff) - quadratic_part loss_ind = 0.5 * quadratic_part**2 + self._clip_delta * linear_part else: loss_ind = 0.5 * diff**2 loss = T.mean(loss_ind) givens = { rewards: self.rewards_shared, actions: self.actions_shared, ## actions not needed! terminals: self.terminals_shared } for i, x in enumerate(self.states_shared): givens[states[i]] = x for i, x in enumerate(self.next_states_shared): givens[next_states[i]] = x gparams = [] for p in self.params: gparam = T.grad(loss, p) gparams.append(gparam) updates = [] if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, self.params, gparams, thelr, self._rho, self._rms_epsilon) elif update_rule == 'rmsprop': for i, (p, g) in enumerate(zip(self.params, gparams)): acc = theano.shared(p.get_value() * 0.) acc_new = rho * acc + (1 - self._rho) * g**2 gradient_scaling = T.sqrt(acc_new + self._rms_epsilon) g = g / gradient_scaling updates.append((acc, acc_new)) updates.append((p, p - thelr * g)) elif update_rule == 'sgd': for i, (param, gparam) in enumerate(zip(self.params, gparams)): updates.append((param, param - thelr * gparam)) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if (self._double_Q == True): self._train = theano.function( [thediscount, thelr, next_q_curr_qnet], [loss, loss_ind, self.q_vals], updates=updates, givens=givens, on_unused_input='warn') else: self._train = theano.function([thediscount, thelr], [loss, loss_ind, self.q_vals], updates=updates, givens=givens, on_unused_input='warn') givens2 = {} for i, x in enumerate(self.states_shared): givens2[states[i]] = x self._q_vals = theano.function([], self.q_vals, givens=givens2, on_unused_input='warn')
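# The `clip_delta` branch above is the Huber loss, written so that the gradient
# stays equal to the clipped TD error. The same expression, factored out as a
# standalone helper for reference:
import theano.tensor as T

def huber_loss(diff, delta):
    """Quadratic for |diff| <= delta, linear beyond it (gradient magnitude capped at delta)."""
    quadratic_part = T.minimum(abs(diff), delta)
    linear_part = abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + delta * linear_part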
def compile(self, optimizer, loss, class_mode="categorical", theano_mode=None): self.optimizer = optimizers.get(optimizer) self.loss = objectives.get(loss) weighted_loss = weighted_objective(objectives.get(loss)) # input of model self.X_train = self.get_input(train=True) self.X_test = self.get_input(train=False) self.y_train = self.get_output(train=True) self.y_test = self.get_output(train=False) # target of model self.y = T.zeros_like(self.y_train) self.weights = T.ones_like(self.y_train) if hasattr(self.layers[-1], "get_output_mask"): mask = self.layers[-1].get_output_mask() else: mask = None train_loss = weighted_loss(self.y, self.y_train, self.weights, mask) test_loss = weighted_loss(self.y, self.y_test, self.weights, mask) train_loss.name = 'train_loss' test_loss.name = 'test_loss' self.y.name = 'y' if class_mode == "categorical": train_accuracy = T.mean( T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_train, axis=-1))) test_accuracy = T.mean( T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_test, axis=-1))) elif class_mode == "binary": train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)), dtype='float32') test_accuracy = T.mean(T.eq(self.y, T.round(self.y_test)), dtype='float32') else: raise Exception("Invalid class mode:" + str(class_mode)) self.class_mode = class_mode self.theano_mode = theano_mode for r in self.regularizers: train_loss = r(train_loss) updates = self.optimizer.get_updates(self.params, self.constraints, train_loss) updates += self.updates if type(self.X_train) == list: train_ins = self.X_train + [self.y, self.weights] test_ins = self.X_test + [self.y, self.weights] predict_ins = self.X_test else: train_ins = [self.X_train, self.y, self.weights] test_ins = [self.X_test, self.y, self.weights] predict_ins = [self.X_test] self._train = theano.function(train_ins, train_loss, updates=updates, allow_input_downcast=True, mode=theano_mode) self._train_with_acc = theano.function(train_ins, [train_loss, train_accuracy], updates=updates, allow_input_downcast=True, mode=theano_mode) self._predict = theano.function(predict_ins, self.y_test, allow_input_downcast=True, mode=theano_mode) self._test = theano.function(test_ins, test_loss, allow_input_downcast=True, mode=theano_mode) self._test_with_acc = theano.function(test_ins, [test_loss, test_accuracy], allow_input_downcast=True, mode=theano_mode)
def main(): print("Loading Data") X_train, y_train, X_valid, y_valid, X_test, y_test = load_data.load_data_feautre_train(feautre = u"\uBC18\uD314",root_path= "/home/prosurpa/Image/image/",image_size=(28,28)) input_var = T.tensor4('inputs') target_var = T.ivector('targets') print("Bulding Model") batch_size = 20 network = build_f_cnn(batch_size ,input_var) prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean() params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate=0.01, momentum=0.9 ) test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var) test_loss = test_loss.mean() test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) train_fn = theano.function([input_var, target_var], loss, updates=updates) val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) #model_rw.read_model_data(network, "75.0000009934model") print("Starting training") num_epochs = 1000 best_acc = 75 for epoch in range(num_epochs): train_err = 0 train_batches = 0 start_time = time.time() print((len(X_train)/batch_size)) for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True): inputs, targets = batch train_err += train_fn(inputs, targets) train_batches += 1 if train_batches%20 == 0: print(train_batches) val_err = 0 val_acc = 0 val_batches = 0 print((len(X_valid) / batch_size)) for batch in iterate_minibatches(X_valid, y_valid, batch_size, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) val_err += err val_acc += acc val_batches += 1 if train_batches % 20 == 0: print(val_batches) print("Epoch {} of {} took {:.3f}s".format( epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) print(" validation accuracy:\t\t{:.2f} %".format( val_acc / val_batches * 100)) test_err = 0 test_acc = 0 test_batches = 0 print((len(X_test) / batch_size)) for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) test_err += err test_acc += acc test_batches += 1 if train_batches % 20 == 0: print(test_batches) print("Final results:") print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches)) print(" test accuracy:\t\t{:.2f} %".format( test_acc / test_batches * 100)) re_acc = test_acc / test_batches * 100 if re_acc > best_acc + 0.5: best_acc = re_acc model_rw.write_model_data(network, str(best_acc) + "model")
lasagne.objectives.squared_error(prediction, train_prediction_b)) # loss=loss+pi_loss elif model.network_type == "tempens": # Tempens model loss: loss = T.mean(loss * mask_train, dtype=theano.config.floatX) loss += unsup_weight_var * T.mean( lasagne.objectives.squared_error(prediction, z_target_var)) else: loss = T.mean(loss, dtype=theano.config.floatX) # regularization:L1,L2 l2_penalty = lasagne.regularization.regularize_network_params( gru_network, lasagne.regularization.l2) * model.l2_loss loss = loss + l2_penalty train_acc = T.mean(T.eq(T.argmax(prediction, axis=1), T.argmax(target_var, axis=1)), dtype=theano.config.floatX) # We could add some weight decay as well here, see lasagne.regularization. # Create update expressions for training, i.e., how to modify the # parameters at each training step. Here, we'll use Stochastic Gradient # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more. params = lasagne.layers.get_all_params(gru_network, trainable=True) updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate_var, beta1=adam_beta1_var) """ 3.test loss and accuracy """
def fit(self, trees, learning_rate=10e-4, mu=0.99, reg=10e-3, epochs=15, activation=T.nnet.relu, train_inner_nodes=False): D = self.D V = self.V K = self.K self.f = activation N = len(trees) We = init_weight(V, D) W11 = np.random.randn(D, D, D) / np.sqrt(3*D) W22 = np.random.randn(D, D, D) / np.sqrt(3*D) W12 = np.random.randn(D, D, D) / np.sqrt(3*D) W1 = init_weight(D, D) W2 = init_weight(D, D) bh = np.zeros(D) Wo = init_weight(D, K) bo = np.zeros(K) self.We = theano.shared(We) self.W11 = theano.shared(W11) self.W22 = theano.shared(W22) self.W12 = theano.shared(W12) self.W1 = theano.shared(W1) self.W2 = theano.shared(W2) self.bh = theano.shared(bh) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh, self.Wo, self.bo] words = T.ivector('words') left_children = T.ivector('left_children') right_children = T.ivector('right_children') labels = T.ivector('labels') def recurrence(n, hiddens, words, left, right): w = words[n] # any non-word will have index -1 hiddens = T.switch( T.ge(w, 0), T.set_subtensor(hiddens[n], self.We[w]), T.set_subtensor(hiddens[n], self.f( hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) + hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) + hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) + hiddens[left[n]].dot(self.W1) + hiddens[right[n]].dot(self.W2) + self.bh ) ) ) return hiddens hiddens = T.zeros((words.shape[0], D)) h, _ = theano.scan( fn=recurrence, outputs_info=[hiddens], n_steps=words.shape[0], sequences=T.arange(words.shape[0]), non_sequences=[words, left_children, right_children], ) py_x = T.nnet.softmax(h[:,0,:].dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) rcost = T.mean([(p*p).sum() for p in self.params]) if train_inner_nodes: cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost else: cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value()*0) for p in self.params] updates = [ (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads) ] + [ (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads) ] self.cost_predict_op = theano.function( inputs=[words, left_children, right_children, labels], outputs=[cost, prediction], allow_input_downcast=True, ) self.train_op = theano.function( inputs=[words, left_children, right_children, labels], outputs=[cost, prediction], updates=updates ) costs = [] sequence_indexes = range(N) if train_inner_nodes: n_total = sum(len(words) for words, _, _, _ in trees) else: n_total = N for i in xrange(epochs): t0 = datetime.now() sequence_indexes = shuffle(sequence_indexes) n_correct = 0 cost = 0 it = 0 for j in sequence_indexes: words, left, right, lab = trees[j] c, p = self.train_op(words, left, right, lab) if np.isnan(c): print "Cost is nan! Let's stop here. Why don't you try decreasing the learning rate?" exit() cost += c if train_inner_nodes: n_correct += np.sum(p == lab) else: n_correct += (p[-1] == lab[-1]) it += 1 if it % 1 == 0: sys.stdout.write("j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (it, N, float(n_correct)/n_total, cost)) sys.stdout.flush() print "i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0) costs.append(cost) plt.plot(costs) plt.show()
output = architecture.buildDCNN() dcnnParams = lasagne.layers.get_all_params(output) # SYMBOLIC INPUTS x = T.imatrix() y = T.ivector() # Without L2 Regularization loss = lasagne.objectives.aggregate( lasagne.objectives.categorical_crossentropy( lasagne.layers.get_output(output, x), y), mode = 'mean') updates = lasagne.updates.adagrad(loss, dcnnParams, learning_rate = 0.1) # ACCURACY FOR PREDICTIONS prediction = T.argmax(lasagne.layers.get_output(output, x, deterministic=True), axis=1) score = T.eq(prediction, y).mean() # SYMBOLIC FUNCTIONS trainDCNN = theano.function([x,y], outputs = loss, updates = updates) validateDCNN = theano.function([x,y], outputs = score) testDCNN = theano.function([x,y], outputs = score) # LOAD THE DATA trainingSentences = loader.loadData('myDataset/train.txt') trainingLabels = loader.loadData('myDataset/train_label.txt') validationSentences = loader.loadData('myDataset/dev.txt') validationLabels = loader.loadData('myDataset/dev_label.txt') testSentences = loader.loadData('myDataset/test.txt') testLabels = loader.loadData('myDataset/test_label.txt')
lasagne.layers.set_all_param_values(net['prob'], params) n_batches_per_epoch = np.floor(n_training_samples/float(BATCH_SIZE)) n_test_batches = np.floor(n_val_samples/float(BATCH_SIZE)) x_sym = T.tensor4() y_sym = T.ivector() l2_loss = lasagne.regularization.regularize_network_params(net['prob'], lasagne.regularization.l2) * 5e-4 prediction_train = lasagne.layers.get_output(net['prob'], x_sym, deterministic=False) loss = lasagne.objectives.categorical_crossentropy(prediction_train, y_sym) loss = loss.mean() loss += l2_loss acc_train = T.mean(T.eq(T.argmax(prediction_train, axis=1), y_sym), dtype=theano.config.floatX) prediction_test = lasagne.layers.get_output(net['prob'], x_sym, deterministic=True) loss_val = lasagne.objectives.categorical_crossentropy(prediction_test, y_sym) loss_val = loss_val.mean() loss_val += l2_loss acc = T.mean(T.eq(T.argmax(prediction_test, axis=1), y_sym), dtype=theano.config.floatX) params = lasagne.layers.get_all_params(net['prob'], trainable=True) learning_rate = theano.shared(np.float32(0.001)) updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate) train_fn = theano.function([x_sym, y_sym], [loss, acc_train], updates=updates) val_fn = theano.function([x_sym, y_sym], [loss_val, acc]) pred_fn = theano.function([x_sym], prediction_test)
return p_y_given_x w_c1 = init_weights((4, 1, 3, 3)) b_c1 = init_weights((4, )) w_c2 = init_weights((8, 4, 3, 3)) b_c2 = init_weights((8, )) w_h3 = init_weights((8 * 4 * 4, 100)) b_h3 = init_weights((100, )) w_o = init_weights((100, 10)) b_o = init_weights((10, )) params = [w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o] p_y_given_x = model(x, *params) y = T.argmax(p_y_given_x, axis=1) cost = T.mean(T.nnet.categorical_crossentropy(p_y_given_x, t)) updates = momentum(cost, params, learning_rate=0.01, momentum=0.9) # compile theano functions train = theano.function([x, t], cost, updates=updates) predict = theano.function([x], y) # train model batch_size = 50 for i in range(50): print "iteration %d" % (i + 1) for start in range(0, len(x_train), batch_size):