def build(self):
    """Compile the Theano functions used for training and evaluation.

    Creates ``self.train_batch(index, LR, M)``, which applies
    ``self.model.updates`` on mini-batch ``index``, and
    ``self.test_batch(index)``, which returns ``self.model.errors`` for that
    mini-batch.  When ``self.format`` equals ``"DFXP"`` the function
    ``self.update_range(batch_count)`` is compiled as well from
    ``self.model.range_updates``.
    """
    # Symbolic placeholders: they only describe the graph.  The compiled
    # functions created below are what operate on actual numbers.
    x = T.matrix('x')
    y = T.matrix('y')
    index = T.lscalar()
    batch_count = T.lscalar()
    LR = T.scalar('LR', dtype=theano.config.floatX)
    M = T.scalar('M', dtype=theano.config.floatX)

    def batch_givens():
        # Substitute x / y by the rows of the shared data set that belong
        # to mini-batch `index`.
        rows = slice(index * self.batch_size, (index + 1) * self.batch_size)
        return {x: self.shared_x[rows], y: self.shared_y[rows]}

    train_updates = self.model.updates(x, y, LR, M)
    self.train_batch = theano.function(
        inputs=[index, LR, M],
        updates=train_updates,
        givens=batch_givens(),
        name="train_batch",
        on_unused_input='warn',
    )

    test_errors = self.model.errors(x, y)
    self.test_batch = theano.function(
        inputs=[index],
        outputs=test_errors,
        givens=batch_givens(),
        name="test_batch",
    )

    if self.format == "DFXP":
        range_updates = self.model.range_updates(batch_count)
        self.update_range = theano.function(
            inputs=[batch_count],
            updates=range_updates,
            name="update_range",
        )
def train_function_momentum(self, x1, x2, i1, i2, l1, l2, y, z):
    """Compile the momentum-based training function.

    The arguments are substituted (through ``givens``) for the model's
    symbolic inputs ``self.x1``, ``self.x2``, ``self.indices1``,
    ``self.indices2``, ``self.l1``, ``self.l2``, ``self.y`` and ``self.z``
    respectively.

    Returns
    -------
    A Theano function that applies the updates produced by
    ``self.get_cost_updates_momentum`` and returns the cost.  Its inputs are
    all optional: learning rate ``lr`` (default 0.1), regularization ``reg``
    (default 0.0) and momentum ``mom`` (default 0.9).
    """
    learning_rate = T.scalar('lr')      # learning rate to use
    regularization = T.scalar('reg')    # regularization to use
    momentum = T.scalar('mom')          # momentum to use

    cost, updates = self.get_cost_updates_momentum(
        learning_rate, regularization, momentum)

    # `theano.In(..., value=...)` replaces the deprecated `theano.Param(...,
    # default=...)`, which newer Theano releases no longer provide.
    train_fn = theano.function(
        inputs=[
            theano.In(learning_rate, value=0.1),
            theano.In(regularization, value=0.0),
            theano.In(momentum, value=0.9),
        ],
        outputs=cost,
        updates=updates,
        givens={
            self.x1: x1,
            self.x2: x2,
            self.indices1: i1,
            self.indices2: i2,
            self.l1: l1,
            self.l2: l2,
            self.y: y,
            self.z: z,
        },
        name='train_momentum',
        on_unused_input='warn',
    )
    return train_fn
def get_train_fn(self, dataX, batch_size=1, k=1):
    """Compile the mini-batch training function.

    Parameters
    ----------
    dataX : Theano shared variable
        Training data; mini-batch ``index`` is the slice
        ``dataX[index * batch_size:(index + 1) * batch_size]``.
    batch_size : int
        Number of rows per mini-batch.
    k : int
        Forwarded unchanged to ``self._get_cost_update``.

    Returns
    -------
    A Theano function ``fn(index, lr=0.01, beta=0.1, gamma=0.0001,
    sparseness=0.05)`` that applies the updates returned by
    ``self._get_cost_update`` and returns the cost.
    """
    learning_rate = T.scalar('lr')
    Beta = T.scalar('beta')
    Gamma = T.scalar('gamma')
    Sparseness = T.scalar('sparseness')

    cost, updates = self._get_cost_update(lr=learning_rate,
                                          beta=Beta,
                                          gamma=Gamma,
                                          s_constrain=Sparseness,
                                          k=k)

    index = T.lscalar('index')
    # `theano.In(..., value=...)` replaces the deprecated `theano.Param(...,
    # default=...)`, which newer Theano releases no longer provide.
    fn = theano.function(
        inputs=[
            index,
            theano.In(learning_rate, value=0.01),
            theano.In(Beta, value=0.1),
            theano.In(Gamma, value=0.0001),
            theano.In(Sparseness, value=0.05),
        ],
        outputs=cost,
        updates=updates,
        givens={self.x: dataX[index * batch_size:(index + 1) * batch_size]},
        name='train_rbm_S_L2',
    )
    return fn
def get_training_functions(self, x_lab_np=None, y_lab_np=None, x_unlab_np=None):
    """Compile one pre-training function per entry of ``self.layers``.

    Parameters
    ----------
    x_lab_np, y_lab_np, x_unlab_np : array-like, optional
        Labelled inputs, their labels and the unlabelled inputs.  When given
        they replace ``self.x_lab_np`` / ``self.y_lab_np`` /
        ``self.x_unlab_np``; when omitted the arrays already stored on the
        instance are used.

    Returns
    -------
    list
        One compiled Theano function per layer.  Each takes the index
        vectors ``(index_lab, index_unlab)``, applies that layer's updates
        and returns ``[cost]``.

    Raises
    ------
    ValueError
        If the number of labelled samples differs from the number of labels.
    """
    # Honour explicitly passed data; previously the arguments were ignored.
    if x_lab_np is not None:
        self.x_lab_np = x_lab_np
    if y_lab_np is not None:
        self.y_lab_np = y_lab_np
    if x_unlab_np is not None:
        self.x_unlab_np = x_unlab_np

    self.num_labels = self.x_lab_np.shape[0]
    self.num_unlabels = self.x_unlab_np.shape[0]
    self.num_samples = self.num_labels + self.num_unlabels
    if self.num_labels != len(self.y_lab_np):
        raise ValueError(
            "x_lab_np has %d rows but y_lab_np has %d labels"
            % (self.num_labels, len(self.y_lab_np)))

    self.x_lab = self._shared_dataset(self.x_lab_np)
    self.y_lab = self._shared_dataset(self.y_lab_np)
    self.x_unlab = self._shared_dataset(self.x_unlab_np)

    # Ratio of labelled to unlabelled samples.  True division: integer
    # division would truncate the ratio to 0 under Python 2.
    # NOTE(review): batch_size_unlab goes negative when there are more
    # labelled than unlabelled samples -- confirm the intended definition.
    self.alpha = self.num_labels / float(self.num_unlabels)
    self.batch_size_lab = self.batch_size * self.alpha
    self.batch_size_unlab = self.batch_size * (1 - self.alpha)

    index_unlab = T.ivector('index_unlab')
    index_lab = T.ivector('index_lab')

    pretraining_fns = []
    for ssda in self.layers:
        cost, updates = ssda.get_cost_updates(self.x_lab, self.x_unlab,
                                              self.y_lab)
        # NOTE(review): the givens replace the shared data sets by the rows
        # selected through the index vectors; this keeps the original graph
        # construction -- confirm it matches what get_cost_updates expects.
        train_fn = theano.function(
            inputs=[index_lab, index_unlab],
            updates=updates,
            outputs=[cost],
            givens={self.x_lab: self.x_lab[index_lab],
                    self.x_unlab: self.x_unlab[index_unlab],
                    self.y_lab: self.y_lab[index_lab]})
        pretraining_fns.append(train_fn)
    return pretraining_fns
def pretraining_functions(self, train_set_x, batch_size):
    """Compile one pre-training function per layer in ``self.dA_layers``.

    Each returned function has the signature
    ``fn(index, corruption=0.1, lr=0.1)``: it runs the layer's updates on
    mini-batch ``index`` of ``train_set_x`` and returns the layer's cost.
    """
    index = T.lscalar('index')
    corruption_level = T.scalar('corruption')
    learning_rate = T.scalar('lr')

    # Rows of the training set that make up mini-batch `index`.
    first_row = index * batch_size
    minibatch = train_set_x[first_row:first_row + batch_size]

    def compile_layer(layer):
        cost, updates = layer.get_cost_updates(corruption_level,
                                               learning_rate)
        return theano.function(
            inputs=[
                index,
                theano.In(corruption_level, value=0.1),
                theano.In(learning_rate, value=0.1),
            ],
            outputs=cost,
            updates=updates,
            givens={self.x: minibatch},
        )

    return [compile_layer(dA) for dA in self.dA_layers]
def __init__(self, dnodex,dim):
    """Build a two-layer LSTM with a softmax output and compile it.

    The input and output widths both equal the number of binary digits of
    ``dnodex.npoi``; ``dim`` is the width of each LSTM layer.

    Sets ``self.layers``, ``self.train(X, Y, eta, temperature)`` (one
    momentum step on the mean categorical cross-entropy, returns the cost)
    and ``self.predict_char(X, temperature)`` (returns the softmax output
    and applies the one-step updates of the layers).
    """
    X = T.matrix()
    Y = T.matrix()
    eta = T.scalar()
    temperature = T.scalar()

    # Width of the binary representation of npoi, used on both ends.
    code_width = len(format(dnodex.npoi, 'b'))
    num_input = code_width
    num_hidden = dim
    num_output = code_width

    inputs = InputLayer(X, name="inputs")
    lstm1 = LSTMLayer(num_input, num_hidden,
                      input_layer=inputs, name="lstm1")
    lstm2 = LSTMLayer(num_hidden, num_hidden,
                      input_layer=lstm1, name="lstm2")
    softmax = SoftmaxLayer(num_hidden, num_output,
                           input_layer=lstm2, name="yhat",
                           temperature=temperature)
    Y_hat = softmax.output()

    self.layers = inputs, lstm1, lstm2, softmax
    params = get_params(self.layers)
    caches = make_caches(params)

    cost = T.mean(T.nnet.categorical_crossentropy(Y_hat, Y))
    updates = momentum(cost, params, caches, eta)
    self.train = theano.function(
        [X, Y, eta, temperature],
        cost,
        updates=updates,
        allow_input_downcast=True,
    )

    predict_updates = one_step_updates(self.layers)
    self.predict_char = theano.function(
        [X, temperature],
        Y_hat,
        updates=predict_updates,
        allow_input_downcast=True,
    )
def _compile_func():
    """Compile the Theano functions for L2-regularised logistic regression.

    Returns
    -------
    (cost_grad, log_predict)
        ``cost_grad(beta, b, X, y, C)`` returns the cost
        ``0.5 * (beta . beta + b**2) + C * sum(softplus(-y * (X beta + b)))``
        together with its gradients with respect to ``beta`` and ``b``.
        ``log_predict(beta, b, X)`` returns ``sigmoid(b + X beta)``.
    """
    beta = T.vector('beta')
    b = T.scalar('b')
    X = T.matrix('X')
    y = T.vector('y')
    C = T.scalar('C')
    params = [beta, b, X, y, C]

    # Element-wise product instead of dot(diag(y), .): same values, without
    # materialising an n x n diagonal matrix.
    margins = y * (T.dot(X, beta) + b)
    cost = (0.5 * (T.dot(beta, beta) + b * b)
            + C * T.sum(T.nnet.softplus(-margins)))

    # Function computing in one go the cost, its gradient with regard to
    # beta and with regard to the bias.
    cost_grad = theano.function(params, [
        cost,
        T.grad(cost, beta),
        T.grad(cost, b),
    ])

    # Function for computing the element-wise sigmoid, used for prediction.
    log_predict = theano.function(
        [beta, b, X],
        T.nnet.sigmoid(b + T.dot(X, beta)),
        on_unused_input='warn',
    )
    return (cost_grad, log_predict)
def build_model(self):
    """Build the recurrent graph and compile the model's Theano functions.

    Sets ``self.cost`` (cross-entropy plus the L1 / L2 penalty terms),
    ``self.forward_propagation(x)`` (last hidden state),
    ``self.predict(x)``, ``self.ce_error(x, y)`` and the pair
    ``self.bptt`` / ``self.f_update`` returned by ``self.Momentum``.
    """
    logger.info('... building the model')

    U, W, V, bh, by = self.U, self.W, self.V, self.bh, self.by
    x = T.matrix('x')
    y = T.matrix('y')

    def forward_prop_step(x_t, s_tm1, U, W, bh):
        # One recurrence step: new state from the input and previous state.
        return self.activation(T.dot(U, x_t) + T.dot(W, s_tm1) + bh)

    initial_state = T.zeros(self.hidden_dim)
    # NOTE(review): the scan is compiled with mode='DebugMode' -- confirm
    # this is intended outside of debugging.
    s, _ = theano.scan(
        forward_prop_step,
        sequences=x,
        outputs_info=[dict(initial=initial_state)],
        non_sequences=[U, W, bh],
        mode='DebugMode',
    )
    last_state = s[-1]

    p_y = T.nnet.softmax(T.dot(V, last_state) + by)
    prediction = T.argmax(p_y, axis=1)
    o_error = T.sum(T.nnet.categorical_crossentropy(p_y, y))
    self.cost = o_error + self.L1_reg * self.L1 + self.L2_reg * self.L2_sqr

    # Compiled entry points.
    self.forward_propagation = theano.function([x], last_state)
    self.predict = theano.function([x], prediction)
    self.ce_error = theano.function([x, y], o_error)

    l_r = T.scalar('l_r', dtype=theano.config.floatX)   # learning rate (may change)
    mom = T.scalar('mom', dtype=theano.config.floatX)   # momentum
    self.bptt, self.f_update = self.Momentum(x, y, l_r, mom)
def build_pretraining_function(self, train_set_x, batch_size):
    """Compile one pre-training function per layer in ``self.pretrain_layers``.

    Each returned function takes ``(index, corruption, lr)``: it runs the
    layer's updates on mini-batch ``index`` of ``train_set_x`` and returns
    the layer's cost.
    """
    index = T.lscalar('index')
    corruption_level = T.scalar('corruption')   # % of corruption to use
    learning_rate = T.scalar('lr')              # learning rate to use

    # The number of batches was computed here but never used; dropping it
    # avoids fetching the value of the whole shared data set.
    pretrain_fns = []
    for pretrain in self.pretrain_layers:
        cost, updates = pretrain.get_cost_updates(corruption_level,
                                                  learning_rate)
        fn = theano.function(
            inputs=[index, corruption_level, learning_rate],
            outputs=cost,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size:
                                    (index + 1) * batch_size],
            },
        )
        pretrain_fns.append(fn)
    return pretrain_fns
def more_complex_test():
    """Evaluate a nested ``ifelse`` graph compiled with the 'vm' linker.

    One branch of the graph goes through ``NotImplementedOp``.  When
    ``theano.config.vm.lazy`` is ``False`` calling the function is expected
    to raise ``NotImplementedOp.E``; otherwise the result must be 20.5.
    """
    notimpl = NotImplementedOp()
    ifelseifelseif = IfElseIfElseIf()

    x1, x2, c1, c2 = [T.scalar(label) for label in ('x1', 'x2', 'c1', 'c2')]

    t1 = ifelse(c1, x1, notimpl(x2))
    t1.name = 't1'
    t2 = t1 * 10
    t2.name = 't2'
    t3 = ifelse(c2, t2, x1 + t1)
    t3.name = 't3'
    t4 = ifelseifelseif(T.eq(x1, x2), x1, T.eq(x1, 5), x2, c2, t3, t3 + 0.5)
    t4.name = 't4'

    f = function([c1, c2, x1, x2], t4,
                 mode=Mode(linker='vm', optimizer='fast_run'))

    def run():
        # c1=1, c2=0, x1=10, x2=0
        return f(1, 0, numpy.array(10, dtype=x1.dtype), 0)

    if theano.config.vm.lazy is False:
        try:
            run()
            assert False
        except NotImplementedOp.E:
            pass
    else:
        print(run())
        assert run() == 20.5
    print('... passed')
def __form_input_tensor(self, name):
    """Return three stacked int32 scalars named after ``name``.

    The scalars are called ``'le_' + name``, ``'re_' + name`` and
    ``'rel_' + name`` and are stacked in that order.
    """
    prefixes = ('le_', 're_', 'rel_')
    components = [T.scalar(name=prefix + name, dtype='int32')
                  for prefix in prefixes]
    return T.stack(components)
def build_train_fn(self,):
    """Build the SGD update graph and compile the cost/training functions.

    Sets ``self.calc_cost`` (evaluates ``self.costs`` from ``self.inputs``)
    and ``self.f``, which takes ``self.inputs`` followed by the learning
    rate (and the momentum when ``self.momentum`` is set), returns
    ``self.costs`` and applies the parameter updates merged into
    ``self.updates_old``.
    """
    self.lr_theano = T.scalar('lr')
    self.grad_inputs = self.inputs + [self.lr_theano]
    if self.momentum:
        self.mom_theano = T.scalar('mom')
        self.grad_inputs = self.grad_inputs + [self.mom_theano]

    self.gparams = T.grad(self.costs[0], self.params,
                          consider_constant=self.consider_constant)

    if not self.momentum:
        print('Building SGD optimization graph without momentum')
        updates = OrderedDict(
            (i, i - self.lr_theano * j)
            for i, j in zip(self.params, self.gparams))
    else:
        print('Building SGD optimization graph with momentum')
        updates = OrderedDict()
        for param, param_mom, gparam in zip(self.params, self.params_mom,
                                            self.gparams):
            param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
            updates[param_mom] = param_inc
            updates[param] = param + param_inc

    self.calc_cost = theano.function(self.inputs, self.costs)

    if self.updates_old:
        # Work on a copy so a dict owned by the model is not modified.  The
        # original copied an unbound local name here, which raised
        # UnboundLocalError whenever pre-existing updates were present.
        self.updates_old = copy.copy(self.updates_old)
    else:
        self.updates_old = OrderedDict()
    self.updates_old.update(updates)

    self.f = theano.function(self.grad_inputs, self.costs,
                             updates=self.updates_old)
def test_reallocation():
    """Check that the non-lazy, non-C VM reuses storage between variables.

    For ``allow_gc`` both off and on, the compiled function must return a
    truthy value, at least one non-constant variable must share its storage
    object with another variable, and the storage map must hold fewer
    distinct storage cells than entries.
    """
    x = tensor.scalar('x')
    y = tensor.scalar('y')
    z = tensor.tanh(3 * x + y) + tensor.cosh(x + 5 * y)

    def check_storage(storage_map):
        from theano.tensor.var import TensorConstant
        for var in storage_map:
            if isinstance(var, TensorConstant):
                continue
            others = list(storage_map)
            others.remove(var)
            for other in others:
                if (storage_map[var][0] and
                        storage_map[var][0] is storage_map[other][0]):
                    return [True, storage_map[other][0]]
        return [False, None]

    # The functionality is currently implemented for the non-lazy,
    # non-C VM only.
    linkers = [vm.VM_Linker(allow_gc=gc, lazy=False, use_cloop=False)
               for gc in (False, True)]
    for linker in linkers:
        mode = theano.compile.get_mode(theano.Mode(linker=linker))
        mode = mode.excluding('fusion', 'inplace')

        f = theano.function([x, y], z, name="test_reduce_memory", mode=mode)
        output = f(1, 2)
        assert output

        storage_map = f.fn.storage_map
        assert check_storage(storage_map)[0]

        distinct_cells = set(id(v) for v in itervalues(storage_map))
        assert len(distinct_cells) < len(storage_map)
def get_bivariate_normal_spec():
    """Return a ``FunctionSpec`` for a bivariate Gaussian quadratic form.

    The variables are the scalars ``X1`` and ``X2``, the vector ``mu`` and
    the matrix ``sigma``.  With ``d = [X1, X2] - mu`` the output expression
    is ``-0.5 * d^T sigma^{-1} d``.
    """
    X1 = T.scalar('X1')
    X2 = T.scalar('X2')
    mu = T.vector('mu')
    sigma = T.matrix('sigma')

    # Join the two scalars into a length-2 vector and centre it.
    point = T.concatenate([X1.dimshuffle('x'), X2.dimshuffle('x')])
    centered = point - mu

    quadratic_form = -0.5 * T.dot(
        T.dot(centered.T, nlinalg.matrix_inverse(sigma)),
        centered)

    GaussianDensitySpec = FunctionSpec(variables=[X1, X2, mu, sigma],
                                       output_expression=quadratic_form)
    return GaussianDensitySpec
def adam(loss, param_list):
    """Build the Adam update rules for ``loss`` with respect to ``param_list``.

    The hyper-parameters are symbolic scalars that the caller must feed to
    the compiled function: ``alpha``, ``beta1``, ``beta2``, ``eps`` and the
    time step ``t``.  The settings recommended for Adam are alpha = 0.001,
    beta1 = 0.9, beta2 = 0.999 and eps = 1e-8.

    Returns
    -------
    (updates, opt_params)
        ``updates`` is an ``OrderedDict`` covering every parameter and its
        first / second moment accumulators; ``opt_params`` is the list
        ``[alpha, beta1, beta2, eps, t]``.
    """
    alpha = T.scalar("alpha")
    beta1 = T.scalar("beta1")
    beta2 = T.scalar("beta2")
    eps = T.scalar("eps")
    t = T.scalar("t")

    gradients = [T.grad(loss, p) for p in param_list]
    # One zero-initialised accumulator per parameter and per moment.
    first_moments = [zero_shared(p.shape.eval()) for p in param_list]
    second_moments = [zero_shared(p.shape.eval()) for p in param_list]

    updates = OrderedDict()
    per_param = zip(param_list, gradients, first_moments, second_moments)
    for param, grad, m_prev, v_prev in per_param:
        m = beta1 * m_prev + (1. - beta1) * grad
        v = beta2 * v_prev + (1. - beta2) * grad * grad
        # Bias-corrected moment estimates.
        m_hat = m / (1. - beta1 ** t)
        v_hat = v / (1. - beta2 ** t)
        updates[param] = param - alpha * m_hat / (T.sqrt(v_hat) + eps)
        updates[m_prev] = m
        updates[v_prev] = v

    opt_params = [alpha, beta1, beta2, eps, t]
    return updates, opt_params
def pretraining_functions(self, train_set_x, batch_size):
    """Compile one pre-training function per layer in ``self.dA_layers``.

    Each returned function has the signature
    ``fn(index, corruption=0.2, lr=0.1)``: it runs the layer's updates on
    mini-batch ``index`` of ``train_set_x`` and returns the layer's cost.
    """
    index = T.lscalar('index')                  # index to a minibatch
    corruption_level = T.scalar('corruption')   # % of corruption to use
    learning_rate = T.scalar('lr')              # learning rate to use

    # beginning of a batch, given `index`
    batch_begin = index * batch_size
    # ending of a batch given `index`
    batch_end = batch_begin + batch_size

    pretrain_fns = []
    for dA in self.dA_layers:
        # get the cost and the updates list
        cost, updates = dA.get_cost_updates(corruption_level,
                                            learning_rate)
        # `theano.In(..., value=...)` replaces the deprecated
        # `theano.Param(..., default=...)`, which newer Theano releases no
        # longer provide.
        fn = theano.function(
            inputs=[
                index,
                theano.In(corruption_level, value=0.2),
                theano.In(learning_rate, value=0.1),
            ],
            outputs=cost,
            updates=updates,
            givens={
                self.x: train_set_x[batch_begin: batch_end],
            },
        )
        pretrain_fns.append(fn)
    return pretrain_fns
def create_TrainFunc_tranPES(simfn, embeddings, marge=0.5, alpha=1., beta=1.):
    """Compile the training function for the tranPES model.

    ``embeddings[0]`` and ``embeddings[1]`` are the two embedding objects
    whose ``.E`` matrices are trained.  The compiled function takes

        lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR

    (two learning rates, five sparse CSR matrices for the positive and
    negative triples, two index vectors), performs one gradient-descent
    step on both ``.E`` matrices and returns
    ``[cost, mean(out), mean(outc), embreg[0], lembreg]``.
    """
    # parse the embedding data
    embedding, lembedding = embeddings[0], embeddings[1]

    # symbolic variables for the training triples
    hp = S.csr_matrix('head positive')   # N x batchsize matrix
    rp = S.csr_matrix('relation')
    tp = S.csr_matrix('tail positive')
    hn = S.csr_matrix('head negative')
    tn = S.csr_matrix('tail negative')

    lemb = T.scalar('embedding learning rate')
    lremb = T.scalar('relation learning rate')
    subtensorE = T.ivector('batch entities set')
    subtensorR = T.ivector('batch link set')

    # dense batchsize x D matrices for the positive and negative triples
    hpmat = S.dot(embedding.E, hp).T
    rpmat = S.dot(lembedding.E, rp).T
    tpmat = S.dot(embedding.E, tp).T
    hnmat = S.dot(embedding.E, hn).T
    tnmat = S.dot(embedding.E, tn).T

    def triple_score(head, tail):
        # Pair head and tail into a (batch, D, 2) tensor and score the
        # triple formed with the shared relation matrix.
        paired = T.concatenate([head, tail], axis=1)
        paired = paired.reshape((head.shape[0], 2, head.shape[1]))
        paired = paired.dimshuffle(0, 2, 1)
        return tranPES3(simfn, paired, head, rpmat, tail)

    pos = triple_score(hpmat, tpmat)
    negh = triple_score(hnmat, tpmat)
    negt = triple_score(hpmat, tnmat)

    costh, outh = margeCost(pos, negh, marge)
    costt, outt = margeCost(pos, negt, marge)

    embreg = regEmb(embedding, subtensorE, alpha)
    lembreg = regLink(lembedding, subtensorR, beta)

    cost = costh + costt + embreg[0] + lembreg
    out = T.concatenate([outh, outt])
    outc = embreg[1]

    # list of inputs to the function
    list_in = [lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR]

    # gradient-descent step on both embedding matrices
    emb_grad = T.grad(cost, embedding.E)
    New_embedding = embedding.E - lemb * emb_grad
    remb_grad = T.grad(cost, lembedding.E)
    New_rembedding = lembedding.E - lremb * remb_grad
    updates = OrderedDict({embedding.E: New_embedding,
                           lembedding.E: New_rembedding})

    outputs = [cost, T.mean(out), T.mean(outc), embreg[0], lembreg]
    return theano.function(list_in, outputs,
                           updates=updates, on_unused_input='ignore')
def directRNN():
    """Print the same trajectory computed three ways, for comparison.

    The values at the generations in ``times`` are computed with the NumPy
    helper ``direct``, with a closed-form Theano expression
    (``sigmoid(S * t / 2 - c)`` where ``c = log((1 - x0) / x0)``) and with
    a Theano ``scan`` applying the one-step recurrence.  The first step of
    the recurrence is printed as well.
    """
    # ---------------------------------------------------------------- NumPy
    x0 = 0.5
    s = 0.5
    times = [1, 10, 20, 30, 40, 50]
    yhat = direct(x0, s, times)

    # ------------------------------------------------------------- Symbolic
    # The symbolic inputs and the shared parameter are defined once and
    # used by both compiled functions.
    x0_ = T.scalar("x0")
    times_ = T.ivector("times")
    S__ = theano.shared(np.asarray(s, dtype=theano.config.floatX), 'S')

    c_ = T.log((1 - x0_) / x0_)
    yhat_ = T.nnet.sigmoid(S__ * times_ / 2 - c_)
    Predict_ = theano.function(inputs=[x0_, times_], outputs=yhat_)

    # --------------------------------------------------- Symbolic recursive
    predall_, updatesRecurrence_ = theano.scan(
        lambda x_prev, s: x_prev + (s * x_prev * (1 - x_prev)) / (2 * s * x_prev + 2),
        outputs_info=x0_,
        non_sequences=S__,
        n_steps=times_[-1])
    # we only have targets at some generations, e.g. 10, 20, ...
    pred_ = predall_[times_ - 1]
    Feedforward_ = theano.function(inputs=[x0_, times_], outputs=pred_,
                                   updates=updatesRecurrence_)

    # ----------------------------------------------------------- Comparison
    x_0 = 0.5
    x_1 = x_0 + (s * x_0 * (1 - x_0)) / (2 * s * x_0 + 2)
    # print() with one argument gives identical output on Python 2 and 3.
    print('{:20s}{}'.format('NumPy', yhat))
    print('{:20s}{}'.format('Symbolic Direct', Predict_(x0, list(times))))
    print('{:20s}{}'.format('Symbolic Recursive', Feedforward_(x0, list(times))))
    print('{:20s}[ {} ]'.format('x_1', x_1))
def build_nnet(layer_sizes, normalize_layers=False):
    """Build a feed-forward classifier and compile its Theano functions.

    Parameters
    ----------
    layer_sizes : sequence of int
        Widths of the hidden layers, in order.  The network maps the
        784-wide input to ``layer_sizes[0]``, chains the hidden layers and
        ends in a 10-way softmax layer.
    normalize_layers : bool
        When true, ``normalize`` is applied to the input of every layer
        (the input of the first layer is always normalized).

    Returns
    -------
    (eval_nnet, train_nnet)
        ``eval_nnet(X)`` returns ``[prediction, confidence]`` (arg-max and
        max of the output).  ``train_nnet(X, t, alpha)`` performs one
        gradient step of size ``alpha`` on every weight and returns the
        mean squared error against the one-hot target.
    """
    X = T.vector(dtype='float32')
    t = T.scalar(dtype='int32')
    alpha = T.scalar(dtype='float32')

    t_onehot = extra.to_one_hot(t.reshape((1, 1)), 10)

    weights = []
    # We always want to normalize the inputs to the first layer
    Y, W = layer(normalize(X), 784, layer_sizes[0])
    weights.append(W)

    # Chain consecutive hidden sizes.  The previous pairing
    # (layer_sizes[1:-1], layer_sizes[2:]) skipped the
    # layer_sizes[0] -> layer_sizes[1] layer, so layer widths only matched
    # when all hidden sizes were equal.
    for l1, l2 in zip(layer_sizes[:-1], layer_sizes[1:]):
        if normalize_layers:
            Y = normalize(Y)
        Y, W = layer(Y, l1, l2)
        weights.append(W)

    if normalize_layers:
        Y = normalize(Y)
    Y, W = layer(Y, layer_sizes[-1], 10, activation=nnet.softmax)
    weights.append(W)

    mse = T.mean(T.sqr(Y - t_onehot))
    updates = [(W, W - alpha * T.grad(cost=mse, wrt=W)) for W in weights]

    prediction = T.argmax(Y)
    confidence = T.max(Y)

    eval_nnet = theano.function(inputs=[X], outputs=[prediction, confidence])
    train_nnet = theano.function(inputs=[X, t, alpha], outputs=mse,
                                 updates=updates)
    return eval_nnet, train_nnet
def pretraining_functions(self, train_set_x, train_set_y, batch_size):
    """Compile one pre-training function per layer in ``self.sugar_layers``.

    Each returned function has the signature
    ``fn(index, corruption=0.2, lr=0.1, switch=1)``: it runs the layer's
    updates on mini-batch ``index`` of ``train_set_x`` / ``train_set_y``
    and returns ``[cost]``.
    """
    # Every symbolic input is created once; the duplicated assignments and
    # the unused batch count of the earlier version were dropped.
    index = tensor.lscalar('index')
    corruption_level = tensor.scalar('corruption')
    learning_rate = tensor.scalar('lr')
    switch = tensor.iscalar('switch')

    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size

    pretrain_fns = []
    for sugar in self.sugar_layers:
        cost, updates = sugar.get_cost_updates(corruption_level,
                                               learning_rate, switch)
        fn = function(
            inputs=[index,
                    Param(corruption_level, default=0.2),
                    Param(learning_rate, default=0.1),
                    Param(switch, default=1)],
            outputs=[cost],
            updates=updates,
            givens={self.x: train_set_x[batch_begin:batch_end],
                    self.y: train_set_y[batch_begin:batch_end]},
            on_unused_input='ignore')
        pretrain_fns.append(fn)
    return pretrain_fns
def __init__(self, dnodex,inputdim,dim):
    """Build a three-layer LSTM over learned embeddings and compile it.

    Two randomly initialised shared tensors are attached to ``dnodex``:
    ``umatrix`` of shape ``(nuser, inputdim, inputdim)`` and ``pmatrix`` of
    shape ``(npoi, inputdim)``, together with their squared L2 norms
    (``u_l2_norm`` / ``p_l2_norm``).

    Sets ``self.dnodex``, ``self.layers``,
    ``self.train(X, Y, Z, eta, temperature)`` (applies the ``PerSGD``
    updates, returns the cost) and ``self.predict_char(X, Z, temperature)``
    (returns the softmax output and applies the one-step updates).
    """
    X = T.ivector()
    Y = T.ivector()
    Z = T.lscalar()
    eta = T.scalar()
    temperature = T.scalar()

    self.dnodex = dnodex

    # Random initialisation; umatrix is drawn before pmatrix.
    user_shape = (dnodex.nuser, inputdim, inputdim)
    poi_shape = (dnodex.npoi, inputdim)
    dnodex.umatrix = theano.shared(floatX(np.random.randn(*user_shape)))
    dnodex.pmatrix = theano.shared(floatX(np.random.randn(*poi_shape)))
    dnodex.p_l2_norm = (dnodex.pmatrix ** 2).sum()
    dnodex.u_l2_norm = (dnodex.umatrix ** 2).sum()

    num_input = inputdim
    num_hidden = dim
    num_output = inputdim

    # Slice of umatrix selected by Z, used by the input layer, the softmax
    # layer and the training target.
    selected_u = dnodex.umatrix[Z, :, :]

    inputs = InputPLayer(dnodex.pmatrix[X, :], selected_u, name="inputs")
    lstm1 = LSTMLayer(num_input, num_hidden,
                      input_layer=inputs, name="lstm1")
    lstm2 = LSTMLayer(num_hidden, num_hidden,
                      input_layer=lstm1, name="lstm2")
    lstm3 = LSTMLayer(num_hidden, num_hidden,
                      input_layer=lstm2, name="lstm3")
    softmax = SoftmaxPLayer(num_hidden, num_output, selected_u,
                            input_layer=lstm3, name="yhat",
                            temperature=temperature)
    Y_hat = softmax.output()

    self.layers = inputs, lstm1, lstm2, lstm3, softmax
    params = get_params(self.layers)

    target = T.dot(dnodex.pmatrix[Y, :], selected_u)
    data_term = T.mean(T.nnet.categorical_crossentropy(Y_hat, target))
    cost = data_term + eta * dnodex.p_l2_norm + eta * dnodex.u_l2_norm

    updates = PerSGD(cost, params, eta, X, Z, dnodex)
    self.train = theano.function(
        [X, Y, Z, eta, temperature],
        cost,
        updates=updates,
        allow_input_downcast=True,
    )

    predict_updates = one_step_updates(self.layers)
    self.predict_char = theano.function(
        [X, Z, temperature],
        Y_hat,
        updates=predict_updates,
        allow_input_downcast=True,
    )
def __init__(self, *args, learning_rate=0.001, decay=0.9, epsilon=1e-8, **kwargs):
    """Set up the RMSProp optimiser and compile its update step.

    Parameters
    ----------
    learning_rate, decay, epsilon : float
        Numeric hyper-parameters, stored on the instance.
    *args, **kwargs
        Forwarded to the parent constructor.

    ``self.step1`` takes ``self.inputs + self.outputs`` followed by the
    learning rate and the decay, updates the parameters and the running
    averages of the squared gradients, and returns ``self.loss``.
    """
    super().__init__(*args, **kwargs)
    # The original assigned the undefined names `learning_rate0`, `deay0`
    # and `epsilon0` here, raising NameError on construction.
    self.learning_rate = learning_rate
    self.decay = decay
    self.epsilon = epsilon

    # Symbolic counterparts fed to the compiled step at call time.
    lr_sym = T.scalar('learning_rate')
    decay_sym = T.scalar('decay')

    squares = self.create_shadows('squares')
    # Exponential moving average of the squared gradients.
    new_squares = [decay_sym * square + (1.0 - decay_sym) * T.sqr(g)
                   for g, square in zip(self.grad.values(), squares.values())]
    ds = [-g * lr_sym / T.sqrt(square + self.epsilon)
          for g, square in zip(self.grad.values(), new_squares)]
    updates = ([(p, p + d) for p, d in zip(self.params.values(), ds)]
               + list(zip(squares.values(), new_squares)))

    self.step1 = function(
        inputs=self.inputs + self.outputs + [lr_sym, decay_sym],
        default_mode=1,
        outputs=self.loss,
        name='RMSProp_step1',
        updates=updates)
def __init__(self, final_momentum=0.9, initial_momentum=0.5, momentum_switchover=5,
             times=(10, 20, 30, 40, 50), S=3, lr=1e-2, maxIter=10000, initS=0.0,
             numReplicates=3, theta=20, n=2000):
    """Build the recurrent trajectory model and compile its Theano functions.

    Parameters
    ----------
    times : sequence of int
        Generations at which targets exist; zeros are dropped.  Any
        sequence is accepted (it is converted to a NumPy array).
    initS : float
        Initial value of the shared parameter ``S`` that is optimised.
    numReplicates : int
        Number of target columns; with 1 the target is a vector, otherwise
        a matrix whose columns are compared to the prediction one by one.
    final_momentum, initial_momentum, momentum_switchover, lr, maxIter,
    theta, n
        Stored on the instance unchanged.
    S
        Accepted but not used by this constructor.

    Sets ``self.Feedforward_`` (prediction), ``self.Loss_`` (cost from a
    target and a prediction) and ``self.Objective_`` (cost plus one
    momentum update of ``S``).
    """
    # Boolean-mask indexing needs an array; a plain list (including the
    # former default) failed here.
    times = np.asarray(times)
    self.times = times[times != 0].astype(np.float32)

    self.momentum_ = T.scalar('momentum', dtype=floatX)
    self.final_momentum = final_momentum
    self.initial_momentum = initial_momentum
    self.momentum_switchover = momentum_switchover
    self.W = 3
    self.lr = lr
    self.maxIter = maxIter
    self.numReplicates = numReplicates
    self.initS = initS
    self.n = n
    self.theta = theta

    self.lr_ = T.scalar()
    self.target_ = T.vector() if self.numReplicates == 1 else T.matrix()
    self.times_ = T.ivector("times")
    self.x0_ = T.scalar("x0 ")
    self.n_ = T.scalar("n")
    self.theta_ = T.scalar("theta")
    self.S__ = theano.shared(np.asarray(self.initS, dtype=floatX), 'S')

    # Iterate the one-generation recurrence up to the last requested time.
    self.predall_, self.updatesRecurrence_ = theano.scan(
        lambda x_prev, s: (s * x_prev * x_prev + s * x_prev + 2 * x_prev) / (2 * s * x_prev + 2),
        outputs_info=self.x0_,
        non_sequences=self.S__,
        n_steps=self.times_[-1])
    # we only have targets at some generations, e.g. 10, 20, ...
    self.pred_ = Z(self.predall_[self.times_ - 1], self.n_, self.theta_)
    self.Feedforward_ = theano.function(
        inputs=[self.x0_, self.times_, self.n_, self.theta_],
        outputs=self.pred_,
        updates=self.updatesRecurrence_)

    if self.numReplicates == 1:
        self.cost_ = 0.5 * ((self.target_ - self.pred_) ** 2).mean(axis=0).sum()
    else:
        self.cost_ = 0
        for j in range(self.numReplicates):
            self.cost_ += 0.5 * ((self.target_[:, j] - self.pred_) ** 2).mean(axis=0).sum()
    self.Loss_ = theano.function(inputs=[self.target_, self.pred_],
                                 outputs=self.cost_)

    # Gradient step on S with momentum.
    self.gW_ = T.grad(self.cost_, [self.S__])[0]
    self.weightUpdate__ = theano.shared(np.asarray(0, dtype=floatX))
    upd = self.momentum_ * self.weightUpdate__ - self.lr_ * self.gW_
    self.updatesW = [(self.weightUpdate__, upd), (self.S__, self.S__ + upd)]
    self.Objective_ = theano.function(
        [self.x0_, self.target_, self.lr_, self.times_, self.momentum_,
         self.n_, self.theta_],
        self.cost_,
        on_unused_input='warn',
        updates=self.updatesW,
        allow_input_downcast=True)
def pretraining_functions(self, train_set_x, batch_size):
    """Compile one pre-training function per layer in ``self.dA_layers``.

    Each returned function takes ``(index, corruption, lr)`` -- all three
    are required -- runs the layer's updates on mini-batch ``index`` of
    ``train_set_x`` and returns the layer's cost.
    """
    index = T.lscalar('index')                  # which mini-batch
    corruption_level = T.scalar('corruption')   # % of corruption to use
    learning_rate = T.scalar('lr')              # learning rate to use

    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size

    # The inputs are passed as plain variables, without default values; see
    # http://stackoverflow.com/questions/35622784/what-is-the-right-way-to-pass-inputs-parameters-to-a-theano-function
    fn_inputs = [index, corruption_level, learning_rate]

    def compile_layer(layer):
        cost, updates = layer.get_cost_updates(corruption_level,
                                               learning_rate)
        return theano.function(
            inputs=fn_inputs,
            outputs=cost,
            updates=updates,
            givens={self.x: train_set_x[batch_begin: batch_end]},
        )

    return [compile_layer(dA) for dA in self.dA_layers]
def __theano_build__(self):
    """Build the two-sentence RNN graph and compile its Theano functions.

    Sets
    * ``self.bptt(x1, x2, x1_mask, x2_mask, y, y_c)`` -- the gradients
      returned by ``rms_prop``;
    * ``self.loss(...)`` -- the weighted cross-entropy (without the L2
      term), same inputs as ``bptt``;
    * ``self.weights(x1, x2, x1_mask, x2_mask)`` -- the softmax output;
    * ``self.predictions(x1, x2, x1_mask, x2_mask)`` -- its arg-max;
    * ``self.sgd_step(..., learning_rate, decay)`` -- applies the
      ``rms_prop`` updates.
    """
    params = self.params
    param_names = self.param_names
    hidden_dim = self.hidden_dim

    x1 = T.imatrix('x1')            # first sentence
    x2 = T.imatrix('x2')            # second sentence
    x1_mask = T.fmatrix('x1_mask')  # mask
    x2_mask = T.fmatrix('x2_mask')
    y = T.ivector('y')              # label
    y_c = T.ivector('y_c')          # class weights

    # Embedding words
    _E1 = params["E"].dot(params["W"][0]) + params["B"][0]
    _E2 = params["E"].dot(params["W"][1]) + params["B"][1]
    statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim])
    statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim])

    def rnn_cell(x, mx, ph, Wh):
        h = T.tanh(ph.dot(Wh) + x)
        # Where the mask is 0 the previous state is carried over unchanged.
        h = mx[:, None] * h + (1 - mx[:, None]) * ph
        return [h]

    def single_output(outputs):
        # theano.scan returns a bare variable, not a one-element list, when
        # the step function has a single output; unpacking it as `[h]`
        # fails.  Accept either form.
        if isinstance(outputs, (list, tuple)):
            return outputs[0]
        return outputs

    h1, _ = theano.scan(
        fn=rnn_cell,
        sequences=[statex1, x1_mask],
        truncate_gradient=self.truncate,
        outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))],
        non_sequences=params["W"][2])
    h1 = single_output(h1)

    # The second sentence starts from the final state of the first one.
    h2, _ = theano.scan(
        fn=rnn_cell,
        sequences=[statex2, x2_mask],
        truncate_gradient=self.truncate,
        outputs_info=[dict(initial=h1[-1])],
        non_sequences=params["W"][3])
    h2 = single_output(h2)

    # predict
    _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0])
                        + h2[-1].dot(params["lrW"][1])
                        + params["lrb"])
    _p = T.argmax(_s, axis=1)
    _c = T.nnet.categorical_crossentropy(_s, y)
    _c = T.sum(_c * y_c)
    _l = T.sum(params["lrW"] ** 2)
    _cost = _c + 0.01 * _l

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # Gradients and updates
    _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay)

    # Assign functions
    self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads)
    self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c)
    self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s)
    self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p)
    self.sgd_step = theano.function(
        [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay],
        updates=_updates)
def pretraining_functions(self, train_set_x, batch_size, k , weight_cost):
    """Compile one pre-training function per RBM in ``self.rbm_layers``.

    ``batch_size``, ``weight_cost`` and ``k`` are forwarded to each layer's
    ``get_cost_updates`` (with ``persistent=None``).

    Each returned function has the signature
    ``fn(index, lr=0.0001, momentum=0.5)``: it runs the layer's updates on
    mini-batch ``index`` of ``train_set_x`` and returns
    ``[r_cost, fe_cost]``.
    """
    index = T.lscalar('index')
    momentum = T.scalar('momentum')
    learning_rate = T.scalar('lr')

    # start and end index of this mini-batch (the unused batch count, which
    # fetched the value of the whole shared data set, was dropped)
    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size

    pretrain_fns = []
    for rbm in self.rbm_layers:
        r_cost, fe_cost, updates = rbm.get_cost_updates(
            batch_size, learning_rate, momentum, weight_cost,
            persistent=None, k=k)

        # `theano.In(..., value=...)` replaces the deprecated
        # `theano.Param(..., default=...)`, which newer Theano releases no
        # longer provide.
        fn = theano.function(
            inputs=[index,
                    theano.In(learning_rate, value=0.0001),
                    theano.In(momentum, value=0.5)],
            outputs=[r_cost, fe_cost],
            updates=updates,
            givens={self.x: train_set_x[batch_begin:batch_end]})
        pretrain_fns.append(fn)
    return pretrain_fns
def __init__(self, num_input, num_cells=50, num_output=1, lr=0.01, rho=0.95):
    """Build an LSTM followed by a fully connected layer and compile it.

    The prediction is the mean of the fully connected output over axis 2.
    The cost mixes the mean squared error over all rows with that of the
    last row: ``alpha * mean_cost + (1 - alpha) * last_cost``.

    Sets ``self.train(X, Y, alpha)`` (applies the "adadelta" updates built
    with ``lr`` and ``rho``), ``self.costfn(X, Y, alpha)`` (no updates) --
    both return ``[cost, last_cost]`` -- and ``self.predict(X)``.
    """
    X = T.matrix('x')
    Y = T.matrix('y')
    eta = T.scalar('eta')
    alpha = T.scalar('alpha')

    self.num_input = num_input
    self.num_output = num_output
    self.num_cells = num_cells
    self.eta = eta

    inputs = InputLayer(X, name="inputs")
    lstm = LSTMLayer(num_input, num_cells, input_layer=inputs, name="lstm")
    fc = FullyConnectedLayer(num_cells, num_output, input_layer=lstm)
    Y_hat = T.mean(fc.output(), axis=2)

    layer = inputs, lstm, fc
    self.params = get_params(layer)
    self.caches = make_caches(self.params)
    self.layers = layer

    mean_cost = T.mean((Y - Y_hat) ** 2)
    last_cost = T.mean((Y[-1] - Y_hat[-1]) ** 2)
    self.cost = alpha * mean_cost + (1 - alpha) * last_cost

    # Only the updates are kept from the optimiser's five return values.
    self.updates, _, _, _, _ = create_optimization_updates(
        self.cost, self.params, method="adadelta", lr=lr, rho=rho)

    cost_outputs = [self.cost, last_cost]
    self.train = theano.function([X, Y, alpha], cost_outputs,
                                 updates=self.updates,
                                 allow_input_downcast=True)
    self.costfn = theano.function([X, Y, alpha], [self.cost, last_cost],
                                  allow_input_downcast=True)
    self.predict = theano.function([X], [Y_hat], allow_input_downcast=True)
def __init__(self, *args, learning_rate=0.01, momentum=0.9, **kwargs):
    """Set up the Nesterov-momentum optimiser and compile its two steps.

    ``self.step1(momentum)`` moves every parameter by ``momentum * v``.
    ``self.step2(*inputs, *outputs, learning_rate, momentum)`` sets each
    velocity to ``momentum * v - learning_rate * grad``, moves each
    parameter by ``-learning_rate * grad`` and returns ``self.loss``.

    The numeric ``learning_rate`` and ``momentum`` are stored on the
    instance; remaining arguments go to the parent constructor.
    """
    super().__init__(*args, **kwargs)
    self.learning_rate = learning_rate
    self.momentum = momentum

    # Symbolic counterparts fed to the compiled steps at call time.
    lr_sym = T.scalar('learning_rate')
    mom_sym = T.scalar('momentum')

    vs = self.create_shadows('v')

    look_ahead = [(p, p + mom_sym * v)
                  for p, v in zip(self.params.values(), vs.values())]

    velocity_updates = [(v, mom_sym * v - lr_sym * grad)
                        for v, grad in zip(vs.values(), self.grad.values())]
    param_updates = [(p, p - lr_sym * grad)
                     for p, grad in zip(self.params.values(),
                                        self.grad.values())]
    correction = velocity_updates + param_updates

    self.step1 = theano.function(
        inputs=[mom_sym],
        outputs=[],
        name='Nesterov_step1',
        updates=look_ahead)

    self.step2 = function(
        inputs=self.inputs + self.outputs + [lr_sym, mom_sym],
        default_mode=1,
        outputs=self.loss,
        name='Nesterov_step2',
        updates=correction)
def get_update(Ws_s, bs_s):
    """Compile the training step for the model built from ``Ws_s`` / ``bs_s``.

    Returns a Theano function ``f_update(x, y, learning_rate, momentum)``
    that applies ``train.nesterov_updates`` to ``Ws_s + bs_s`` and returns
    ``[loss, frac_correct]``: the mean binary cross-entropy of
    ``sigmoid(fx)`` against ``y`` and the fraction of samples whose sign of
    ``fx`` agrees with ``y``.
    """
    x, fx = train.get_model(Ws_s, bs_s)

    # Ground truth (who won)
    y = T.vector('y')

    # Negative mean log-likelihood of a sigmoid fit.
    y_pred = sigmoid(fx)
    log_likelihood = y * T.log(y_pred) + (1 - y) * T.log(1 - y_pred)
    loss = -log_likelihood.mean()

    # A sample counts as correct when fx is positive for y = 1 and
    # negative for y = 0.
    hits = (fx > 0) * y + (fx < 0) * (1 - y)
    frac_correct = hits.mean()

    learning_rate_s = T.scalar(dtype=theano.config.floatX)
    momentum_s = T.scalar(dtype=theano.config.floatX)
    updates = train.nesterov_updates(loss, Ws_s + bs_s,
                                     learning_rate_s, momentum_s)

    return theano.function(
        inputs=[x, y, learning_rate_s, momentum_s],
        outputs=[loss, frac_correct],
        updates=updates,
    )
def compile_functions(self, x, y):
    """Compile the forward, training and error functions of the model.

    Sets ``self.feed_forward(x)``, ``self.cost`` (symbolic cross-entropy),
    ``self.train_model(index, lr, mb)`` -- one SGD step of size ``lr`` on
    rows ``index*mb`` to ``(index+1)*mb`` of the training data, returning
    the cost -- and ``self.error(x, y)``.  The compilation time is printed.
    """
    mb = T.scalar('mb', dtype='int64')
    lr = T.scalar('lr')
    index = T.scalar('index', dtype='int64')

    print("Compiling theano functions...\n")
    t0 = time.time()

    self.feed_forward = theano.function([x], self.model.out)

    self.cost = self.model.cross_entropy_SGD(y)
    self.error = self.model.error_SGD(y)

    # Plain gradient descent on every model parameter.
    updates = []
    for param in self.model.params:
        gparam = T.grad(self.cost, param)
        updates.append((param, param - lr * gparam))

    rows = slice(index * mb, (index + 1) * mb)
    self.train_model = theano.function(
        inputs=[index, lr, mb],
        outputs=self.cost,
        updates=updates,
        givens={
            x: self.dataset.in_train[rows],
            y: self.dataset.obs_train[rows],
        },
    )

    # The symbolic error expression is replaced by its compiled function.
    error_expr = self.error
    self.error = theano.function(inputs=[x, y], outputs=error_expr)

    print("Functions compiled. Took {:.2f} seconds".format(time.time() - t0))
def test_aggregation_buffer_name_uniqueness():
    """Two variables with the same name must be rejected with ValueError."""
    same_name = [tensor.scalar('x'), tensor.scalar('x')]
    assert_raises_regex(ValueError, 'unique', AggregationBuffer, same_name)
def augment_system(ode_func, n_states, n_theta):
    """Create the augmented (state + sensitivity) system of an ODE.

    Takes a function that specifies a set of differential equations and
    returns a compiled function that also evaluates the time derivative of
    the sensitivities, i.e. of the gradients of the solution with respect
    to the parameters.

    Uses float64 even if floatX=float32, because the scipy integrator
    always uses float64.

    Parameters
    ----------
    ode_func: function
        Differential equation.  Called as ``ode_func(y, t, theta)``;
        returns array-like.
    n_states: int
        Number of rows of the sensitivity matrix. (n_states)
    n_theta: int
        Number of ODE parameters

    Returns
    -------
    system: function
        Augmented system of differential equations.  Called as
        ``system(y, t, p, dydp)``; returns ``[dy/dt, d(dydp)/dt]`` with the
        second entry flattened.
    """
    # The parameter vector holds the initial conditions followed by the
    # ODE parameters.
    n_params = n_states + n_theta

    # Present state of the system
    state = tt.vector("y", dtype="float64")
    state.tag.test_value = np.ones((n_states,), dtype="float64")

    # Parameter(s).  A vector, to allow for generalization to
    # multi-parameter systems of ODEs.
    params = tt.vector("p", dtype="float64")
    params.tag.test_value = np.ones((n_params,), dtype="float64")

    # Time.  Allows non-autonomous systems of ODEs to be analyzed.
    time_point = tt.scalar("t", dtype="float64")
    time_point.tag.test_value = 2.459

    # Present state of the gradients: entry i, j is the partial derivative
    # of y[i] with respect to p[j].
    sens_flat = tt.vector("dydp", dtype="float64")
    sens_flat.tag.test_value = make_sens_ic(n_states, n_theta, "float64")
    sens = sens_flat.reshape((n_states, n_params))

    # Symbolic right-hand side; only the trailing n_theta entries of the
    # parameter vector are handed to the ODE function.
    rhs = ode_func(state, time_point, params[n_states:])
    if not isinstance(rhs, (list, tuple)):
        rhs = (rhs,)
    # Stack the results of the ode_func into a single tensor variable
    rhs_vec = tt.stack(rhs, axis=0)

    # Time derivative of the sensitivities: df/dy . dydp + df/dp
    jac_state = tt.jacobian(rhs_vec, state)
    chain_term = tt.dot(jac_state, sens)
    jac_params = tt.jacobian(rhs_vec, params)
    sens_dot = (chain_term + jac_params).flatten()

    system = theano.function(
        inputs=[state, time_point, params, sens_flat],
        outputs=[rhs_vec, sens_dot],
        on_unused_input="ignore",
    )
    return system
def __init__(self,
             n_dim,
             n_out,
             n_chan=1,
             n_superbatch=12800,
             opt_alg='adam',
             opt_params=None):
    """Allocate the shared data buffers and compile the training functions.

    Parameters
    ----------
    n_dim : int
        Side length of the input images; the image buffers are shaped
        ``(n_superbatch, n_chan, n_dim, n_dim)``.
    n_out : int
        Forwarded to ``create_rbm_model`` and ``create_dadgm_model``.
    n_chan : int
        Number of image channels.
    n_superbatch : int
        Number of examples held in each shared buffer.
    opt_alg : str
        Optimiser name, forwarded to ``create_dadgm_updates``.
    opt_params : dict or None
        Optimiser settings.  ``'lr'`` and ``'nb'`` are read here and the
        whole dict is forwarded to ``create_dadgm_updates``.  ``None``
        selects ``{'lr': 1e-3, 'b1': 0.9, 'b2': 0.99}``.
    """
    if opt_params is None:
        # Built per call: a dict literal in the signature would be one
        # object shared by every instance.
        opt_params = {'lr': 1e-3, 'b1': 0.9, 'b2': 0.99}

    self.numpy_rng = np.random.RandomState(1234)
    self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))
    self.n_dim = n_dim
    self.n_out = n_out
    self.n_superbatch = n_superbatch
    self.alg = opt_alg
    self.n_class = 10

    lr = opt_params.get('lr')
    # NOTE(review): 'nb' is not part of the default opt_params, so n_batch
    # is None unless the caller supplies it; persistent_chain below needs
    # an integer here.
    n_batch = opt_params.get('nb')

    def empty_shared(*shape):
        # Uninitialised floatX buffer; filled later when data is loaded.
        return theano.shared(
            np.empty(shape, dtype=theano.config.floatX),
            borrow=False,
        )

    train_set_x = empty_shared(n_superbatch, n_chan, n_dim, n_dim)
    val_set_x = empty_shared(n_superbatch, n_chan, n_dim, n_dim)
    train_set_y = empty_shared(n_superbatch)
    val_set_y = empty_shared(n_superbatch)
    # Labels are stored as floatX and cast to int32 for use as targets.
    train_set_y_int = T.cast(train_set_y, 'int32')

    # NOTE(review): self.n_aux (here) and self.n_hidden (below) are read but
    # not assigned in this method; presumably class attributes - confirm.
    train_rbm_px_mu = empty_shared(n_superbatch, self.n_aux)

    X = T.tensor4(dtype=theano.config.floatX)
    S = T.tensor3(dtype=theano.config.floatX)
    Y = T.ivector()
    # px_mu is substituted by train_rbm_px_mu (a floatX matrix) through
    # `givens` in dadgm_train, so it has to be a floatX matrix itself.
    # The previous `T.lscalar(dtype=...)` is not a valid constructor call.
    px_mu = T.matrix(dtype=theano.config.floatX)
    idx1, idx2 = T.lscalar(), T.lscalar()
    alpha = T.scalar(dtype=theano.config.floatX)  # learning rate
    self.inputs = (X, Y, idx1, idx2, S, px_mu)

    # ----------------------------
    # Begin RBM-only
    self.rbm_network = self.create_rbm_model(n_dim, n_out, n_chan)
    persistent_chain = theano.shared(
        np.zeros((n_batch, self.n_hidden), dtype=theano.config.floatX),
        borrow=True,
    )
    rbm_cost, rbm_acc, rbm_updates = self.get_rbm_objective_and_updates(
        alpha,
        lr=lr,
        persistent=persistent_chain,
    )
    self.rbm_objectives = (rbm_cost, rbm_acc)
    # Trains on the slice [idx1:idx2] of the training buffers.
    self.rbm_train = theano.function(
        [idx1, idx2, alpha],
        [rbm_cost, rbm_acc],
        updates=rbm_updates,
        givens={
            X: train_set_x[idx1:idx2],
            Y: train_set_y_int[idx1:idx2]
        },
        on_unused_input='warn',
    )
    # End RBM-only
    # ----------------------------
    # Begin DADGM-only
    tau = theano.shared(
        np.float32(5.0),
        name='temperature',
        allow_downcast=True,
        borrow=False,
    )
    self.tau = tau
    self.dadgm_network = self.create_dadgm_model(
        X,
        Y,
        n_dim,
        n_out,
        n_chan,
    )
    dadgm_loss, dadgm_acc = self.create_dadgm_objectives(False)
    self.dadgm_objectives = (dadgm_loss, dadgm_acc)
    dadgm_params = self.get_dadgm_params()
    dadgm_grads = self.create_dadgm_gradients(dadgm_loss, False)
    dadgm_updates = self.create_dadgm_updates(
        dadgm_grads,
        dadgm_params,
        alpha,
        opt_alg,
        opt_params,
    )
    self.dadgm_train = theano.function(
        [idx1, idx2, alpha],
        [dadgm_loss, dadgm_acc],
        updates=dadgm_updates,
        givens={
            X: train_set_x[idx1:idx2],
            Y: train_set_y_int[idx1:idx2],
            px_mu: train_rbm_px_mu,
        },
        on_unused_input='warn',
    )
    # Evaluation-only variant: takes data directly, applies no updates.
    self.dadgm_loss = theano.function(
        [X, Y],
        [dadgm_loss, dadgm_acc],
        on_unused_input='warn',
    )
    # End DADGM-only
    # ----------------------------

    self.n_batch = n_batch

    # parameters for sampling
    self.n_chain = 100

    # save data variables
    self.train_set_x = train_set_x
    self.train_set_y = train_set_y
    self.val_set_x = val_set_x
    self.val_set_y = val_set_y
    self.train_rbm_px_mu = train_rbm_px_mu
    self.data_loaded = False
def single_layer_lstm(n_in, n_out):
    """Build a single-layer LSTM and compile its predict and train functions.

    The recurrence is scanned over the rows of the input matrix.  Training
    minimises the mean negative log-probability of the target classes plus
    ``0.001 *`` the sum of squared parameters, using SGD with momentum.

    Parameters
    ----------
    n_in : int
        Width of each input row.
    n_out : int
        Size of the hidden/cell state and of the output layer.

    Returns
    -------
    funcy : compiled function
        ``funcy(X, h0, c0)`` returns the pre-softmax outputs for every step.
    trainer : compiled function
        ``trainer(X, h0, c0, yt, lr, mom)`` applies one momentum update and
        returns ``[cost]``, the unregularised loss.
    """
    from collections import OrderedDict

    floatX = theano.config.floatX

    def shared_normal(*shape):
        # Stored as floatX so parameters share one dtype with the inputs,
        # the initial states (h0/c0) and the momentum buffers; float64
        # parameters would clash with those when floatX is float32.
        return theano.shared(np.random.randn(*shape).astype(floatX))

    # Creation order is kept fixed so a seeded np.random gives the same
    # initial values as before.
    Wxb = shared_normal(n_in, n_out)
    Whb = shared_normal(n_out, n_out)
    bb = shared_normal(n_out)
    Wxi = shared_normal(n_in, n_out)
    Whi = shared_normal(n_out, n_out)
    bi = shared_normal(n_out)
    Wxf = shared_normal(n_in, n_out)
    Whf = shared_normal(n_out, n_out)
    bf = shared_normal(n_out)
    Wxo = shared_normal(n_in, n_out)
    Who = shared_normal(n_out, n_out)
    bo = shared_normal(n_out)
    Wo = shared_normal(n_out, n_out)
    bout = shared_normal(n_out)
    params = [Wxb, Whb, bb, Wxi, Whi, bi, Wxf, Whf, bf, Wxo, Who, bo, Wo, bout]

    def step(x, htm1, ctm1,
             Wxb, Whb, bb,
             Wxi, Whi, bi,
             Wxf, Whf, bf,
             Wxo, Who, bo, Wo, bout):
        # One time step: candidate (z), input/forget/output gates (i, f, o),
        # new cell state (c), new hidden state (h) and linear read-out (y).
        z = T.tanh(T.dot(x, Wxb) + T.dot(htm1, Whb) + bb)
        i = T.nnet.sigmoid(T.dot(x, Wxi) + T.dot(htm1, Whi) + bi)
        f = T.nnet.sigmoid(T.dot(x, Wxf) + T.dot(htm1, Whf) + bf)
        c = i * z + f * ctm1
        o = T.nnet.sigmoid(T.dot(x, Wxo) + T.dot(htm1, Who) + bo)
        h = o * T.tanh(c)
        y = T.dot(h, Wo) + bout
        return [h, c, y]

    X = T.matrix()
    h0 = T.vector()
    c0 = T.vector()
    yt = T.ivector()
    lr = T.scalar()
    mom = T.scalar()

    # h and c are recurrent (seeded with h0/c0); y is output-only.
    [h, c, y], _ = theano.scan(step,
                               sequences=X,
                               outputs_info=[h0, c0, None],
                               non_sequences=params)
    yout = T.nnet.softmax(y)

    l2_penalty = 0.001 * sum((param ** 2).sum() for param in params)

    def loss(y_pred, y_true):
        # Mean negative log-probability assigned to the true class.
        return -T.mean(T.log(y_pred)[T.arange(y_true.shape[0]), y_true])

    funcy = theano.function([X, h0, c0], y)

    # Gradients use the regularised objective; the reported cost does not
    # include the penalty.
    oloss = loss(yout, yt) + l2_penalty
    cost = loss(yout, yt)
    gparams = T.grad(oloss, params)

    # OrderedDict keeps the update order deterministic; Theano warns when
    # given a plain dict.
    updates = OrderedDict()
    for param, gparam in zip(params, gparams):
        velocity = theano.shared(value=np.zeros(
            param.get_value(borrow=True).shape, dtype=floatX),
                                 name='updates')
        upd = mom * velocity - lr * gparam
        updates[velocity] = upd
        updates[param] = param + upd

    trainer = theano.function([X, h0, c0, yt, lr, mom], [cost],
                              updates=updates)
    return funcy, trainer
def func_update_policy(self, Tmax, use_x0=False, accumulators=None):
    """
    Compile the Theano function that applies one update to the policy net.

    Parameters
    ----------
    Tmax : int
        The causality mask is built with shape ``(Tmax - 1, Tmax - 1)``.
    use_x0 : bool
        If True, the initial state ``x0_`` is an explicit first argument of
        the compiled function; otherwise it is built from the policy
        network's ``'x0'`` parameter.
    accumulators : optional
        Passed through to ``Adam``.

    Returns
    -------
    Compiled function taking ``([x0_,] U, Q, A, R, b, M, lr)``.  It returns
    the ``norm`` value produced by the optimizer and applies its updates.
    The optimizer is also stored on ``self.policy_sgd``.
    """
    # Symbolic arguments of the compiled function
    U = tensor.tensor3('U')   # Inputs
    Q = tensor.tensor3('Q')   # Noise
    lr = tensor.scalar('lr')  # Learning rate
    A = tensor.tensor3('A')
    R = tensor.matrix('R')
    b = tensor.matrix('b')
    M = tensor.matrix('M')

    # Initial state: either supplied by the caller, or the 'x0' parameter
    # repeated once per entry along axis 1 of U.
    if use_x0:
        x0_ = tensor.matrix('x0_')
        leading_args = [x0_]
    else:
        x0_param = self.policy_net.params['x0']
        x0_ = tensor.alloc(x0_param, U.shape[1], x0_param.shape[0])
        leading_args = []

    log_z_0 = self.policy_net.get_outputs_0(x0_, log=True)
    r, log_z = self.policy_net.get_outputs(U, Q, x0_, log=True)

    # Masked log-policy terms: first step, then all remaining steps
    logpi_0 = tensor.sum(log_z_0 * A[0], axis=-1) * M[0]
    logpi_t = tensor.sum(log_z * A[1:], axis=-1) * M[1:]

    # Enforce causality: row i of the mask is one from column i onwards
    causal_mask = theanotools.zeros((Tmax - 1, Tmax - 1))
    for row in xrange(causal_mask.shape[0]):
        causal_mask[row, row:] = 1
    causal_mask = theanotools.shared(causal_mask, 'Mcausal')

    # First term
    J_init = tensor.mean(logpi_0 * R[0])
    J_later = (logpi_t.T).dot(causal_mask).dot(R[1:] * M[1:])
    J = tensor.nlinalg.trace(J_later) / J_later.shape[0] + J_init

    # Second term
    Jb_init = tensor.mean(logpi_0 * b[0])
    Jb_later = tensor.mean(tensor.sum(logpi_t * b[1:], axis=0))
    J = J - (Jb_init + Jb_later)

    # Objective function
    obj = -J + self.policy_net.get_regs(x0_, r, M)

    # SGD
    self.policy_sgd = Adam(self.policy_net.trainables,
                           accumulators=accumulators)
    if self.policy_net.type != 'simple':
        norm, grads, updates = self.policy_sgd.get_updates(obj, lr)
    else:
        # 'simple' networks get an extra term added to the Wrec gradient.
        wrec_pos = self.policy_net.index('Wrec')
        grads = tensor.grad(obj, self.policy_net.trainables)
        grads[wrec_pos] += self.policy_net.get_dOmega_dWrec(-J, r)
        norm, grads, updates = self.policy_sgd.get_updates(obj, lr,
                                                           grads=grads)

    return theano.function(leading_args + [U, Q, A, R, b, M, lr],
                           norm,
                           updates=updates)
def fit(self,
        X_train,
        Y_train,
        X_test=None,
        Y_test=None,
        validation_frequency=100):
    """ Fit model

    Pass in X_test, Y_test to compute test error and report during
    training.

    X_train : ndarray (n_seq x n_steps x n_in)
    Y_train : ndarray (n_seq x n_steps x n_out)

    validation_frequency : int
        in terms of number of sequences (or number of weight updates)

    Training is one momentum-SGD update per sequence.  The momentum is
    ``initial_momentum`` until ``momentum_switchover`` epochs have passed
    and ``final_momentum`` afterwards; ``self.learning_rate`` is multiplied
    by ``learning_rate_decay`` at the end of every epoch.
    """
    from collections import OrderedDict

    if X_test is not None:
        assert (Y_test is not None)
        self.interactive = True
        test_set_x, test_set_y = self.shared_dataset((X_test, Y_test))
    else:
        self.interactive = False

    train_set_x, train_set_y = self.shared_dataset((X_train, Y_train))

    n_train = train_set_x.get_value(borrow=True).shape[0]
    if self.interactive:
        n_test = test_set_x.get_value(borrow=True).shape[0]

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    logger.info('... building the model')

    index = T.lscalar('index')  # index to a case
    # learning rate (may change)
    l_r = T.scalar('l_r', dtype=theano.config.floatX)
    mom = T.scalar('mom', dtype=theano.config.floatX)  # momentum

    # Training objective: data loss plus L1 and L2 penalties.
    cost = self.rnn.loss(self.y) \
        + self.L1_reg * self.rnn.L1 \
        + self.L2_reg * self.rnn.L2_sqr

    # Monitoring functions report the unregularised loss.
    compute_train_error = theano.function(
        inputs=[index, ],
        outputs=self.rnn.loss(self.y),
        givens={
            self.x: train_set_x[index],
            self.y: train_set_y[index]
        },
        mode=theano.compile.MonitorMode(post_func=self.detect_nan))

    if self.interactive:
        compute_test_error = theano.function(
            inputs=[index, ],
            outputs=self.rnn.loss(self.y),
            givens={
                self.x: test_set_x[index],
                self.y: test_set_y[index]
            },
            mode=mode)

    # compute the gradient of cost with respect to theta = (W, W_in, W_out)
    # gradients on the weights using BPTT
    gparams = [T.grad(cost, param) for param in self.rnn.params]

    # OrderedDict keeps the update order deterministic; Theano warns when
    # given a plain dict.
    updates = OrderedDict()
    for param, gparam in zip(self.rnn.params, gparams):
        weight_update = self.rnn.updates[param]
        upd = mom * weight_update - l_r * gparam
        updates[weight_update] = upd
        updates[param] = param + upd

    # compiling a Theano function `train_model` that returns the
    # cost, but in the same time updates the parameter of the
    # model based on the rules defined in `updates`
    train_model = theano.function(inputs=[index, l_r, mom],
                                  outputs=cost,
                                  updates=updates,
                                  givens={
                                      self.x: train_set_x[index],
                                      self.y: train_set_y[index]
                                  },
                                  mode=mode)

    ###############
    # TRAIN MODEL #
    ###############
    logger.info('... training')
    epoch = 0

    while epoch < self.n_epochs:
        epoch = epoch + 1

        # The momentum depends only on the epoch, so choose it once here.
        if epoch > self.momentum_switchover:
            effective_momentum = self.final_momentum
        else:
            effective_momentum = self.initial_momentum

        for idx in range(n_train):
            train_model(idx, self.learning_rate, effective_momentum)

            # number of weight updates made so far
            # (epoch is 1-based, idx is 0-based)
            n_updates = (epoch - 1) * n_train + idx + 1
            if n_updates % validation_frequency != 0:
                continue

            # compute loss on training set
            train_losses = [compute_train_error(i) for i in range(n_train)]
            this_train_loss = np.mean(train_losses)

            if self.interactive:
                test_losses = [compute_test_error(i)
                               for i in range(n_test)]
                this_test_loss = np.mean(test_losses)

                logger.info('epoch %i, seq %i/%i, tr loss %f '
                            'te loss %f lr: %f',
                            epoch, idx + 1, n_train,
                            this_train_loss, this_test_loss,
                            self.learning_rate)
            else:
                logger.info('epoch %i, seq %i/%i, train loss %f '
                            'lr: %f',
                            epoch, idx + 1, n_train, this_train_loss,
                            self.learning_rate)

        self.learning_rate *= self.learning_rate_decay
def ready(self):
    """Create the symbolic variables, build the RNN and compile predictors.

    Sets ``self.x``, ``self.y``, ``self.h0``, ``self.lr`` and ``self.rnn``,
    then compiles ``self.predict`` (and ``self.predict_proba`` for the
    'binary' and 'softmax' output types).

    Raises NotImplementedError for an unknown ``self.output_type`` or
    ``self.activation``.
    """
    # input (where first dimension is time)
    self.x = T.matrix()

    # target (where first dimension is time); built lazily so only the
    # variable for the configured output type is created
    target_factories = {
        'real': lambda: T.matrix(name='y', dtype=theano.config.floatX),
        'binary': lambda: T.matrix(name='y', dtype='int32'),
        # only vector labels supported
        'softmax': lambda: T.vector(name='y', dtype='int32'),
    }
    if self.output_type not in target_factories:
        raise NotImplementedError
    self.y = target_factories[self.output_type]()

    # initial hidden state of the RNN
    self.h0 = T.vector()
    # learning rate
    self.lr = T.scalar()

    def relu(v):
        return v * (v > 0)

    def capped_relu(v):
        return T.minimum(v * (v > 0), 6)

    activations = {
        'tanh': T.tanh,
        'sigmoid': T.nnet.sigmoid,
        'relu': relu,
        'cappedrelu': capped_relu,
    }
    if self.activation not in activations:
        raise NotImplementedError
    activation = activations[self.activation]

    self.rnn = RNN(input=self.x,
                   n_in=self.n_in,
                   n_hidden=self.n_hidden,
                   n_out=self.n_out,
                   activation=activation,
                   output_type=self.output_type,
                   use_symbolic_softmax=self.use_symbolic_softmax)

    def compile_predictor(output):
        # Every predictor maps the input sequence to one output expression.
        return theano.function(inputs=[self.x, ],
                               outputs=output,
                               mode=mode)

    if self.output_type == 'real':
        self.predict = compile_predictor(self.rnn.y_pred)
    elif self.output_type == 'binary':
        self.predict_proba = compile_predictor(self.rnn.p_y_given_x)
        self.predict = compile_predictor(T.round(self.rnn.p_y_given_x))
    elif self.output_type == 'softmax':
        self.predict_proba = compile_predictor(self.rnn.p_y_given_x)
        self.predict = compile_predictor(self.rnn.y_out)
    else:
        raise NotImplementedError
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=200,
                    dataset=DataSet,
                    nkerns=[cls1, cls2],
                    batch_size=100):
    """ Train and evaluate a convolutional + fully-connected classifier

    The first ``iFMs * nFB * nFs`` columns of every input row are reshaped
    to an image and fed to one convolution/pooling layer; the remaining
    columns go through a small tanh layer.  Both are concatenated and
    followed by three tanh hidden layers and a logistic-regression output.

    Training is minibatch SGD.  After every validation pass:

    * if the validation error did not increase relative to the last
      accepted value, the parameters are snapshotted and the test set is
      evaluated;
    * otherwise training stops once the step size is at or below 1e-5, and
      when more than 40 epochs have passed since the last halving the step
      size is halved and the snapshot restored.

    Layer sizes, the number of classes (``n_out``) and the results file
    (``outFile``) are read from module-level names.  One line of results is
    appended to ``outFile`` at the end; nothing is returned.

    :type learning_rate: float
    :param learning_rate: not used; the step size always starts at 0.1.
                          Kept so existing callers keep working.

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: passed to ``load_data``

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer (only ``nkerns[0]`` is
                   used)

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print(type(train_set_x))

    # compute number of minibatches for training, validation and testing.
    # Floor division keeps these integers under true division as well; a
    # trailing partial batch is dropped.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size
    print(n_test_batches)

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    Alr = T.scalar('Alr')  # step size, supplied on every training call
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Split every row into the image part (reshaped to a 4D tensor for the
    # convolution layer) and the remaining extra features.
    dFeatureV = iFMs * nFB * nFs
    layer0_input = x[:, :dFeatureV].reshape((batch_size, iFMs, nFB, nFs))
    layer1H_input = x[:, dFeatureV:]

    # Convolution + pooling over the image part
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, iFMs, nFB, nFs),
                                filter_shape=(nkerns[0], iFMs, fsx, fsy),
                                poolsize=(1, p))
    # Feature-map size after the convolution and the (1, p) pooling
    cl2x = nFB - fsx + 1
    cl2y = (nFs - fsy + 1) // p
    hl1 = cl2x * cl2y

    # Small hidden layer over the extra features
    # NOTE(review): n_in=27 is hard-coded - presumably the number of extra
    # columns; confirm against the data layout.
    layer1H = HiddenLayer(rng,
                          input=layer1H_input,
                          n_in=27,
                          n_out=nhu1 // 4,
                          activation=T.tanh)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_features)
    layer2_input = layer0.output.flatten(2)
    layer2_inputT = T.concatenate([layer2_input, layer1H.output], axis=1)

    layer2 = HiddenLayer(rng,
                         input=layer2_inputT,
                         n_in=(nkerns[0] * hl1 * 1) + nhu1 // 4,
                         n_out=nhu1 * 2,
                         activation=T.tanh)
    layer22 = HiddenLayer(rng,
                          input=layer2.output,
                          n_in=nhu1 * 2,
                          n_out=nhu1,
                          activation=T.tanh)
    layer23 = HiddenLayer(rng,
                          input=layer22.output,
                          n_in=nhu1,
                          n_out=nhu1,
                          activation=T.tanh)

    # classify the values of the last fully-connected layer
    layer3 = LogisticRegression(input=layer23.output, n_in=nhu1, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model;
    # it also returns the predicted labels
    test_model = theano.function(
        [index], [layer3.errors(y), layer3.y_pred],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = (layer3.params + layer23.params + layer22.params +
              layer2.params + layer1H.params + layer0.params)

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # plain SGD: one (param, param - Alr * grad) pair per parameter
    updates = [(param_i, param_i - Alr * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index, Alr],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size][:],
            y: train_set_y[index * batch_size:(index + 1) * batch_size][:]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    prev_validation_loss = 200
    best_iter = 0
    test_score = 0.

    Alrc = 0.1  # current step size
    AlrE = 0.00001  # stop once the step size has decayed to this
    epochC = 0  # epochs since the step size was last halved

    epoch = 0
    done_looping = False

    # Defaults so the results line can still be written if no test pass
    # is ever run (e.g. n_epochs == 0).
    acc1 = acc2 = acc3 = acc4 = 0
    confusion = [[0] * 4 for _ in range(4)]

    best_params = [param.get_value() for param in params]

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        epochC = epochC + 1
        for minibatch_index in range(n_train_batches):

            n_iter = (epoch - 1) * n_train_batches + minibatch_index

            if n_iter % 100 == 0:
                print('training @ iter = ', n_iter)
            train_model(minibatch_index, Alrc)

            if (n_iter + 1) % validation_frequency != 0:
                continue

            # compute zero-one loss on validation set
            validation_losses = [
                validate_model(i) for i in range(n_valid_batches)
            ]
            this_validation_loss = numpy.mean(validation_losses)
            lossratio = (this_validation_loss -
                         prev_validation_loss) / (prev_validation_loss + 1)
            print(lossratio)
            print('epoch %i, minibatch %i/%i, validation error %f, lr %f %%' % \
                  (epoch, minibatch_index + 1, n_train_batches, \
                   this_validation_loss * 100., Alrc))

            if lossratio <= 0.0:
                # validation error did not increase: snapshot and test
                best_params = [param.get_value() for param in params]

                # improve patience if loss improvement is good enough
                if this_validation_loss < best_validation_loss * \
                        improvement_threshold:
                    patience = max(patience, n_iter * patience_increase)

                # save best validation score and iteration number
                best_validation_loss = this_validation_loss
                prev_validation_loss = this_validation_loss
                best_iter = n_iter

                # test it on the test set: one pass yields both the error
                # and the predictions of every batch
                test_losses = []
                # the leading empty array keeps the float dtype the
                # concatenated predictions had before
                pred_chunks = [numpy.asarray([])]
                for i in range(n_test_batches):
                    batch_error, batch_pred = test_model(i)
                    test_losses.append(batch_error)
                    pred_chunks.append(batch_pred)
                yP = numpy.concatenate(pred_chunks)
                print(yP.shape)
                test_score = numpy.mean(test_losses)

                # NOTE(review): the reference labels are the predictions
                # themselves (the lookup of the true test labels was
                # commented out), so the table below is diagonal by
                # construction - confirm this is intended.
                labels = yP
                print(yP.shape)
                print(labels.shape)

                # rows_of[a]: positions whose reference label is class a
                rows_of = [
                    numpy.nonzero(labels == float(cls))[0]
                    for cls in range(4)
                ]
                for rows in rows_of:
                    print(rows.shape)

                # confusion[a][b]: reference class a predicted as class b
                confusion = [[
                    numpy.nonzero(yP[rows] == cls)[0].size
                    for cls in range(4)
                ] for rows in rows_of]

                def class_accuracy(cls):
                    # 0.0 instead of ZeroDivisionError for an empty class
                    total = rows_of[cls].size
                    if total == 0:
                        return 0.0
                    return float(confusion[cls][cls]) / float(total)

                acc1 = 100
                acc2 = 100
                if n_out == 3:
                    acc3 = 100
                    acc4 = 0
                elif n_out == 4:
                    acc3 = class_accuracy(2)
                    acc4 = class_accuracy(3)
                else:
                    acc3 = 0
                    acc4 = 0

                print((
                    ' epoch %i, minibatch %i/%i, test error of '
                    'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, '
                    'acc4 = %f, I11 = %i, I12 = %i, I13 = %i, I14 = %i, '
                    'I21 = %i, I22 = %i, I23 = %i, I24 = %i, '
                    'I31 = %i, I32 = %i, I33 = %i, I34 = %i, '
                    'I41 = %i, I42 = %i, I43 = %i, I44 = %i %%'
                ) % ((epoch, minibatch_index + 1, n_train_batches,
                      test_score * 100., acc1 * 100., acc2 * 100.,
                      acc3 * 100, acc4 * 100) +
                     tuple(count for row in confusion for count in row)))
            else:
                if Alrc <= AlrE:
                    done_looping = True
                    break
                elif epochC > 40:
                    # halve the step size and go back to the snapshot
                    Alrc = Alrc / 2
                    for param, best_param in zip(params, best_params):
                        param.set_value(best_param)
                    epochC = 0

    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))

    # Append one line with the configuration and the final scores.
    with open(outFile, 'a') as OF:
        print(DataSet, n_out, fsx, fsy, p, cls1, cls2, nhu1, nFB, nFs,
              iFMs, nhus, batch_size,
              test_score * 100., acc1 * 100., acc2 * 100.,
              acc3 * 100, acc4 * 100,
              *[count for row in confusion for count in row],
              file=OF)
def train( dim_word=100, dim_word_src=200, enc_dim=1000, dec_dim=1000, # the number of LSTM units patience=-1, # early stopping patience max_epochs=5000, finish_after=-1, # finish after this many updates decay_c=0., # L2 regularization penalty alpha_c=0., # alignment regularization clip_c=-1., # gradient clipping threshold lrate=0.01, # learning rate n_words_src=100000, # source vocabulary size n_words=100000, # target vocabulary size maxlen=1000, # maximum length of the description maxlen_trg=1000, # maximum length of the description maxlen_sample=1000, optimizer='rmsprop', batch_size=[1, 2, 3, 4], valid_batch_size=16, sort_size=20, save_path=None, save_file_name='model', save_best_models=0, dispFreq=100, validFreq=100, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=-1, pbatchFreq=-1, verboseFreq=10000, datasets=[ 'data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok' ], valid_datasets=[ '../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok' ], dictionaries=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl' ], source_word_level=0, target_word_level=0, use_dropout=False, re_load=False, re_load_old_setting=False, uidx=None, eidx=None, cidx=None, layers=None, save_every_saveFreq=0, save_burn_in=20000, use_bpe=0, init_params=None, build_model=None, build_sampler=None, gen_sample=None, **kwargs): # Model options model_options = locals().copy() del model_options['init_params'] del model_options['build_model'] del model_options['build_sampler'] del model_options['gen_sample'] # load dictionaries and invert them # dictionaries[0] : src # dictionaries[1] : trg worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) # ii, dd : 0 = source, 1 = target for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = cPickle.load(f) worddicts_r[ii] = dict() 
for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk print 'Building model' if not os.path.exists(save_path): os.makedirs(save_path) file_name = '%s%s.npz' % (save_path, save_file_name) best_file_name = '%s%s.best.npz' % (save_path, save_file_name) opt_file_name = '%s%s%s.npz' % (save_path, save_file_name, '.grads') best_opt_file_name = '%s%s%s.best.npz' % (save_path, save_file_name, '.grads') model_name = '%s%s.pkl' % (save_path, save_file_name) params = init_params(model_options) cPickle.dump(model_options, open(model_name, 'wb')) history_errs = [[], [], [], []] # reload options # reload : False if re_load and os.path.exists(file_name): print 'You are reloading your experiment.. do not panic dude..' if re_load_old_setting: with open(model_name, 'rb') as f: models_options = cPickle.load(f) params = load_params(file_name, params) # reload history model = numpy.load(file_name) history_errs = list(lst.tolist() for lst in model['history_errs']) if uidx is None: uidx = model['uidx'] if eidx is None: eidx = model['eidx'] if cidx is None: try: cidx = model['cidx'] except: cidx = 0 else: if uidx is None: uidx = 0 if eidx is None: eidx = 0 if cidx is None: cidx = 0 print 'Loading data' train = MultiTextIterator(source=datasets[0], target=datasets[1], source_dict=dictionaries[0], target_dict=dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=batch_size, sort_size=sort_size) valid = [ TextIterator(source=valid_dataset[0], target=valid_dataset[1], source_dict=dictionaries[0], target_dict=dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=valid_batch_size, sort_size=sort_size) for valid_dataset in valid_datasets ] # create shared variables for parameters tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ 
build_model(tparams, model_options) # NOTE : this is where we build the model inps = [x, x_mask, y, y_mask] print 'Building sampler...\n', f_init, f_next = build_sampler(tparams, model_options, trng, use_noise) #print 'Done' # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) # NOTE : f_log_probs : [x, x_mask, y, y_mask], cost print 'Done' if re_load: # NOTE : this whole thing is False use_noise.set_value(0.) valid_scores = [] for ii, vv in enumerate(valid): valid_errs = pred_probs(f_log_probs, prepare_data, model_options, vv, verboseFreq=verboseFreq) valid_err = valid_errs.mean() if numpy.isnan(valid_err): import ipdb ipdb.set_trace() print 'Reload sanity check: Valid ', valid_err cost = cost.mean() # apply L2 regularization on weights # decay_c : 0 if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # regularize the alpha weights # alpha_c : 0 if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) # NOTE : why is this not referenced somewhere later? 
print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' if clip_c > 0: grads, not_finite, clipped = gradient_clipping(grads, tparams, clip_c) else: not_finite = 0 clipped = 0 # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', if re_load and os.path.exists(file_name): if clip_c > 0: f_grad_shared, f_update, toptparams = eval(optimizer)( lr, tparams, grads, inps, cost=cost, not_finite=not_finite, clipped=clipped, file_name=opt_file_name) else: f_grad_shared, f_update, toptparams = eval(optimizer)( lr, tparams, grads, inps, cost=cost, file_name=opt_file_name) else: # re_load = False, clip_c = 1 if clip_c > 0: f_grad_shared, f_update, toptparams = eval(optimizer)( lr, tparams, grads, inps, cost=cost, not_finite=not_finite, clipped=clipped) else: f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost) # f_grad_shared = theano.function(inp, [cost, not_finite, clipped], updates=gsup, profile=profile) # f_update = theano.function([lr], [], updates=updates, # on_unused_input='ignore', profile=profile) # toptparams print 'Done' print 'Optimization' best_p = None bad_counter = 0 # will never be true if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size # Training loop ud_start = time.time() estop = False if re_load: # IndexError: index 14 is out of bounds for axis 1 with size 13 print "Checkpointed minibatch number: %d" % cidx for cc in xrange(cidx): if numpy.mod(cc, 1000) == 0: print "Jumping [%d / %d] examples" % (cc, cidx) train.next() for epoch in xrange(max_epochs): time0 = time.time() n_samples = 0 NaN_grad_cnt = 0 NaN_cost_cnt = 0 clipped_cnt = 0 update_idx = 0 if re_load: re_load = 0 else: cidx = 0 for x, y in train: # NOTE : x, y are [sen1, sen2, sen3 ...] 
where sen_i are of different length update_idx += 1 cidx += 1 uidx += 1 use_noise.set_value(1.) # NOTE : n_x <= batch_size x, x_mask, y, y_mask, n_x = prepare_data(x, y, maxlen=maxlen, maxlen_trg=maxlen_trg, n_words_src=n_words_src, n_words=n_words) n_samples += n_x if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 uidx = max(uidx, 0) continue # compute cost, grads and copy grads to shared variables if clip_c > 0: cost, not_finite, clipped = f_grad_shared(x, x_mask, y, y_mask) else: cost = f_grad_shared(x, x_mask, y, y_mask) if clipped: clipped_cnt += 1 # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): import ipdb ipdb.set_trace() NaN_cost_cnt += 1 if not_finite: import ipdb ipdb.set_trace() NaN_grad_cnt += 1 continue # do the update on parameters f_update(lrate) if numpy.isnan(cost) or numpy.isinf(cost): continue if float(NaN_grad_cnt) > max_epochs * 0.5 or float( NaN_cost_cnt) > max_epochs * 0.5: print 'Too many NaNs, abort training' return 1., 1., 1. # verbose if numpy.mod(uidx, dispFreq) == 0: ud = time.time() - ud_start wps = n_samples / float(time.time() - time0) print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'NaN_in_grad', NaN_grad_cnt,\ 'NaN_in_cost', NaN_cost_cnt, 'Gradient_clipped', clipped_cnt, 'UD ', ud, "%.2f sentence/s" % wps ud_start = time.time() if numpy.mod(uidx, pbatchFreq) == 0 and pbatchFreq != -1: pbatch(x, worddicts_r[0]) # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0 and sampleFreq != -1: gen_list = [ 0, batch_size[0], batch_size[0] + batch_size[1], batch_size[0] + batch_size[1] + batch_size[2] ] gen_list = [ii for ii in gen_list if ii < n_x] for jj in gen_list: # jj = min(5, n_samples) stochastic = True use_noise.set_value(0.) 
# x : maxlen X n_samples sample, score = gen_sample(tparams, f_init, f_next, x[:, jj][:, None], model_options, trng=trng, k=1, maxlen=maxlen_sample, stochastic=stochastic, argmax=False) print print 'Source ', jj, ': ', if source_word_level: for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: if use_bpe: print(worddicts_r[0][vv]).replace( '@@', ''), else: print worddicts_r[0][vv], else: print 'UNK', print else: source_ = [] for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: source_.append(worddicts_r[0][vv]) else: source_.append('UNK') print "".join(source_) print 'Truth ', jj, ' : ', if target_word_level: for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: if use_bpe: print(worddicts_r[1][vv]).replace( '@@', ''), else: print worddicts_r[1][vv], else: print 'UNK', print else: truth_ = [] for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: truth_.append(worddicts_r[1][vv]) else: truth_.append('UNK') print "".join(truth_) print 'Sample ', jj, ': ', if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] if target_word_level: for vv in ss: if vv == 0: break if vv in worddicts_r[1]: if use_bpe: print(worddicts_r[1][vv]).replace( '@@', ''), else: print worddicts_r[1][vv], else: print 'UNK', print else: sample_ = [] for vv in ss: if vv == 0: break if vv in worddicts_r[1]: sample_.append(worddicts_r[1][vv]) else: sample_.append('UNK') print "".join(sample_) print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: valid_scores = [] for ii, vv in enumerate(valid): use_noise.set_value(0.) # NOTE : when validation, don't pass maxlen, maxlen_trg # meaning, don't limit sentence lengths... # sort of makes sense i suppose? 
valid_errs = pred_probs( f_log_probs, prepare_data, model_options, vv, verboseFreq=verboseFreq, ) valid_err = valid_errs.mean() valid_scores.append(valid_err) history_errs[ii].append(valid_err) # patience == -1, never happens if len(history_errs[ii]) > patience and valid_err >= \ numpy.array(history_errs[ii])[:-patience].min() and patience != -1: bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): import ipdb ipdb.set_trace() cnt = 0 for ii in xrange(4): if uidx == 0 or valid_scores[ii] <= numpy.array( history_errs[ii]).min(): cnt += 1 if len(history_errs[0]) > 1: if numpy.sum(valid_scores) <= numpy.sum( [aa[:-2] for aa in history_errs]): less_sum = True else: less_sum = False else: less_sum = True if cnt >= 2 and less_sum: best_p = unzip(tparams) best_optp = unzip(toptparams) bad_counter = 0 if saveFreq != validFreq and save_best_models: numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cdix, **best_p) numpy.savez(best_opt_file_name, **best_optp) print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format( valid_scores[0], valid_scores[1], valid_scores[2], valid_scores[3]) # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if not os.path.exists(save_path): os.mkdir(save_path) params = unzip(tparams) optparams = unzip(toptparams) numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(opt_file_name, **optparams) if save_every_saveFreq and (uidx >= save_burn_in): this_file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx) this_opt_file_name = '%s%s%s.%d.npz' % ( save_path, save_file_name, '.grads', uidx) numpy.savez(this_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(this_opt_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) if best_p is not None and saveFreq != validFreq: this_best_file_name = '%s%s.%d.best.npz' % ( 
save_path, save_file_name, uidx) numpy.savez(this_best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p) print 'Done...', print 'Saved to %s' % file_name # finish after this many updates if uidx >= finish_after and finish_after != -1: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples lang_nos = (4535523, 12122376, 1926115, 2326893) lang_done = [x * update_idx for x in batch_size] lang_rem = [x - y for x, y in zip(lang_nos, lang_done)] print "Remaining : DE({}), CS({}), FI({}), RU({})".format( lang_rem[0], lang_rem[1], lang_rem[2], lang_rem[3]) eidx += 1 if estop: break use_noise.set_value(0.) valid_scores = [] for ii, vv in enumerate(valid): valid_err = pred_probs(f_log_probs, prepare_data, model_options, vv).mean() valid_scores.append(valid_err) print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format( valid_scores[0], valid_scores[1], valid_scores[2], valid_scores[3]) params = unzip(tparams) optparams = unzip(toptparams) file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx) opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads', uidx) numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(opt_file_name, **optparams) if best_p is not None and saveFreq != validFreq: best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx) best_opt_file_name = '%s%s%s.%d.best.npz' % (save_path, save_file_name, '.grads', uidx) numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p) numpy.savez(best_opt_file_name, **best_optp) return valid_err
def fit(self, X, y):
    """Fit a logistic-regression classifier to (X, y) with minibatch SGD.

    The trailing rows of ``X``/``y`` are held out for validation; the
    validation count is rounded up, and the training count rounded down,
    to a multiple of ``self.batchsize``.  When
    ``self.center_and_normalize`` is set, per-feature mean and standard
    deviation are estimated on the training rows, stored as Theano shared
    variables in ``self.X_mean_`` / ``self.X_std_``, and applied to ``X``
    in place (to a copy of ``X`` when ``self.copy_X`` is also set).

    On return ``self.predict_fn_`` is a compiled Theano function mapping a
    feature matrix to the classifier's ``argmax`` output.

    :param X: feature matrix; must support slicing, ``.shape``, ``.dtype``
        and in-place arithmetic (presumably a 2-D numpy array -- confirm
        against callers).
    :param y: labels aligned with the rows of ``X``.
    :raises NotImplementedError: if ``self.loss_fn`` is neither ``'log'``
        nor ``'hinge'``.
    """
    bs = self.batchsize
    n_rows = X.shape[0]

    # Validation size: grow until it is a whole number of batches.
    n_valid = int(min(self.validset_max_examples,
                      self.validset_fraction * n_rows))
    while n_valid % bs != 0:
        n_valid += 1

    # Training size: shrink until it is a whole number of batches.
    n_train = n_rows - n_valid
    while n_train % bs != 0:
        n_train -= 1

    if self.center_and_normalize and self.copy_X:
        X = X.copy()

    # These are slices of X, so the in-place normalisation below is also
    # visible through them.
    train_features, valid_features = X[:n_train], X[n_train:]
    train_labels, valid_labels = y[:n_train], y[n_train:]

    if self.center_and_normalize:
        print("Computing mean and std.dev")

        # Running averages of x and x*x, one row at a time (the original
        # author found this lighter on memory than a whole-array reduction).
        n_features = train_features.shape[1]
        mean_acc = np.zeros(n_features)
        sq_acc = np.zeros(n_features)
        for count, row in enumerate(train_features, 1):
            weight = 1.0 / count
            mean_acc = weight * row + (1 - weight) * mean_acc
            sq_acc = weight * row * row + (1 - weight) * sq_acc

        std = np.maximum(self.min_feature_std,
                         np.sqrt(sq_acc - mean_acc * mean_acc))
        self.X_mean_ = theano.shared(mean_acc.astype(X.dtype))
        self.X_std_ = theano.shared(std.astype(X.dtype))

        X -= self.X_mean_.get_value()
        X /= self.X_std_.get_value()

    # Symbolic inputs of the compiled functions.
    sym_x = tensor.matrix(dtype=X.dtype)
    sym_y = tensor.vector(dtype=y.dtype)
    sym_lr = tensor.scalar(dtype=X.dtype)

    logreg = LogisticRegression.new(
        sym_x,
        n_in=train_features.shape[1],
        n_out=self.n_classes,
        dtype=sym_x.dtype)

    if self.loss_fn == 'log':
        traincost = logreg.nll(sym_y).sum()
    elif self.loss_fn == 'hinge':
        scores = tensor.dot(logreg.input, logreg.w) + logreg.b
        traincost = multi_hinge_margin(scores, sym_y).sum()
    else:
        raise NotImplementedError(self.loss_fn)
    traincost = traincost + abs(logreg.w).sum() * self.l1_regularization
    traincost = traincost + (logreg.w**2).sum() * self.l2_regularization

    # The cost is a sum over the batch, so the division by the batch size
    # is folded into the step sizes.
    sgd_updates = pylearn.gd.sgd.sgd_updates(
        params=logreg.params,
        grads=tensor.grad(traincost, logreg.params),
        stepsizes=[sym_lr / bs, sym_lr / (10 * bs)])
    train_outputs = [logreg.nll(sym_y).mean(), logreg.errors(sym_y).mean()]
    train_logreg_fn = theano.function([sym_x, sym_y, sym_lr],
                                      train_outputs,
                                      updates=sgd_updates)

    test_logreg_fn = theano.function([sym_x, sym_y], logreg.errors(sym_y))

    if self.center_and_normalize:
        # Prediction takes raw features, so normalisation is built into
        # the prediction graph.
        normalized_logreg = LogisticRegression(
            (sym_x - self.X_mean_) / self.X_std_,
            logreg.w,
            logreg.b)
        self.predict_fn_ = theano.function([sym_x], normalized_logreg.argmax)
    else:
        self.predict_fn_ = theano.function([sym_x], logreg.argmax)

    best_epoch = -1
    best_epoch_valid = -1
    best_epoch_train = -1
    best_epoch_test = -1
    valid_rate = -1
    test_rate = -1
    train_rate = -1

    for epoch in xrange(self.n_epochs):
        # Annealed learning rate for this epoch.
        anneal = np.floor(
            max(1., (epoch + 1) / float(self.anneal_epoch)) - 2)
        e_lr = np.float32(self.learnrate / max(1.0, anneal))

        # --- validate (before this epoch's training pass) ---
        if n_valid:
            batch_errors = []
            for b in xrange(n_valid // bs):
                lo, hi = b * bs, (b + 1) * bs
                batch_errors.append(
                    test_logreg_fn(valid_features[lo:hi],
                                   valid_labels[lo:hi]))
            valid_rate = 1 - np.mean(batch_errors)

            if valid_rate > best_epoch_valid:
                best_epoch = epoch
                best_epoch_test = test_rate
                best_epoch_valid = valid_rate
                best_epoch_train = train_rate

            print('Epoch=%i best epoch %i valid %f test %f best train %f current train %f'%(
                epoch, best_epoch, best_epoch_valid, best_epoch_test,
                best_epoch_train, train_rate))
            # Early stop once past the annealing point with no new best
            # for as many epochs as it took to reach the best one.
            if epoch > self.anneal_epoch and epoch > 2 * best_epoch:
                break
        else:
            print('Epoch=%i current train %f'%(
                epoch, train_rate))

        # --- train ---
        batch_errors = []
        for b in xrange(n_train // bs):
            lo, hi = b * bs, (b + 1) * bs
            _nll, err = train_logreg_fn(train_features[lo:hi],
                                        train_labels[lo:hi],
                                        e_lr)
            batch_errors.append(err)
        train_rate = 1 - np.mean(batch_errors)
def __init__(self, feature_count, transformer, k=8, stdev=0.1, X_format="dense"):
    """Build the symbolic factorization-machine model and its predictor.

    :param feature_count: number of input features ``d``; sets the length
        of ``self.w1`` and the second dimension of ``self.v``.
    :param transformer: object whose ``transform(y_hat)`` is applied to
        the raw model output.
    :param k: number of rows of the interaction-factor matrix ``self.v``.
    :param stdev: scale applied to the standard-normal initialisation of
        ``self.v``.
    :param X_format: design-matrix format, matched case-insensitively
        against ``_SUPPORTED_FORMATS``.  ``"dense"``, ``"csr"`` and
        ``"csc"`` are the formats handled below.
    :raises ValueError: if ``X_format`` is not a supported format.
    """
    # ************************************************************
    # * Option Processing
    # ************************************************************

    self.X_format = str(X_format).lower()
    if self.X_format not in _SUPPORTED_FORMATS:
        # Format the *message*; the original called .format() on the
        # exception object, which raised AttributeError instead.
        raise ValueError("Unsupported format: {}".format(X_format))
    d = feature_count

    # ************************************************************
    # * Symbolic Variables
    # ************************************************************

    # design matrix -- branch on the normalised format so that e.g. "CSR"
    # (which passes validation above) actually gets a design matrix
    if self.X_format == "dense":
        self.X = T.matrix()
    elif self.X_format == "csr":
        self.X = S.csr_matrix()
    elif self.X_format == "csc":
        self.X = S.csc_matrix()
    self.y = T.vector()  # response
    self.s = T.vector()  # sample weights
    self.e = T.scalar()  # current epoch

    # ************************************************************
    # * Model Parameters
    # ************************************************************

    # bias term (intercept)
    w0_init = np.zeros(1)
    self.w0 = theano.shared(w0_init, allow_downcast=True)
    # first order coefficients
    w1_init = np.zeros(d)
    self.w1 = theano.shared(w1_init, allow_downcast=True)
    # interaction factors
    v_init = stdev * np.random.randn(k, d)
    self.v = theano.shared(v_init, allow_downcast=True)

    # ************************************************************
    # * The Model
    # ************************************************************

    # Sparse design matrices need the sparse dot / element-wise product.
    dot = T.dot
    mul = T.mul
    if self.X_format in ("csc", "csr"):
        dot = S.dot
        mul = S.mul

    # The formula for pairwise interactions is from the bottom left
    # of page 997 of Rendle 2010, "Factorization Machines."
    # This version scales linearly in k and d, as opposed to O(d^2).
    interactions = 0.5 * T.sum((dot(self.X, T.transpose(self.v)) ** 2) \
                               - dot(mul(self.X, self.X), T.transpose(self.v ** 2)), axis=1)
    self.y_hat = self.w0[0] + dot(self.X, self.w1) + interactions
    self.y_hat = transformer.transform(self.y_hat)

    # ************************************************************
    # * Prediction
    # ************************************************************

    self.theano_predict = theano.function(
        inputs=[self.X], outputs=self.y_hat, allow_input_downcast=True)
h1 *= castx(srng.binomial(n=1, p=0.5, size=h1.shape)) else: h1 *= 0.5 h2 = activation(T.dot(h1, params["W2_d"]) + params["b2_d"]) if trainMode: h2 *= castx(srng.binomial(n=1, p=0.5, size=h2.shape)) else: h2 *= 0.5 y = T.dot(T.concatenate([h2], axis = 1), params["W3_d"]) + params["b3_d"] return T.nnet.sigmoid(y) learning_rate = T.scalar() x = T.matrix() #z = T.matrix() #z = srng.normal(avg = 0,std = 1, size = (100, var_dimensionality)) z2 = srng.binomial(size = (100,var_dimensionality / 4), n = 1, p = 0.5, dtype = 'float32') z3 = srng.multinomial(size = (100,), n = 1, pvals = [1.0 / (var_dimensionality / 4)] * (var_dimensionality / 4), dtype = 'float32') z4 = srng.multinomial(size = (100,), n = 1, pvals = [1.0 / (var_dimensionality / 4)] * (var_dimensionality / 4), dtype = 'float32') z5 = srng.multinomial(size = (100,), n = 1, pvals = [1.0 / (var_dimensionality / 4)] * (var_dimensionality / 4), dtype = 'float32') z = T.concatenate([z2,z3,z4,z5], axis = 1) #z = T.erfinv(z) #Value between 0 and 1 corresponding to the probability that a point belongs to the true data distribution discriminator_true_value = discriminator_network(x, discriminator_params, trainMode = True)
import theano
from theano import tensor as T
import numpy as np

# Toy data: y = 2x plus Gaussian noise, on 101 evenly spaced points.
trX = np.linspace(-1, 1, 101)
print(trX)
trY = 2 * trX + np.random.randn(*trX.shape) * 0.33
print(trY)

# Symbolic scalar input and target.
X = T.scalar()
Y = T.scalar()


def model(X, w):
    """Linear model without intercept: the prediction is ``X * w``."""
    return X * w


# Single weight, deliberately initialised far from the true slope.
w = theano.shared(np.asarray(-1000., dtype=theano.config.floatX))
y = model(X, w)

# Squared error and one plain gradient-descent step with a fixed step size.
cost = T.mean(T.sqr(y - Y))
gradient = T.grad(cost=cost, wrt=w)
updates = [[w, w - gradient * 0.01]]

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates,
                        allow_input_downcast=True)

# 100 passes over the data, one sample per update.  The loop variables are
# named so that they no longer overwrite the symbolic output `y` above.
for _epoch in range(100):
    for sample_x, sample_y in zip(trX, trY):
        train(sample_x, sample_y)
def train_mlp_probe(train_labels, train_samples, test_labels, test_samples,
                    hyperparams):
    """Train a two-hidden-layer MLP probe and return its best parameters.

    Training samples are shuffled once, then the model is trained by
    minibatch gradient descent on ``classifier.euclidean_loss`` plus an
    L2 penalty.  After every epoch the model is scored with ``test_mlp``
    on the test data.  The parameters with the best 10-epoch moving
    average of the mean F1 are kept.  The learning rate is divided by 1.5
    after more than 10 epochs without improvement, and training stops
    after more than 60 such epochs, or as soon as the score or the first
    parameter contains NaN.

    :param train_labels: array of training targets.  A 1-D array is
        treated as a single output column; the caller's array is left
        untouched.
    :param train_samples: 2-D array of training inputs, one row per sample.
    :param test_labels: targets passed through to ``test_mlp``.
    :param test_samples: inputs passed through to ``test_mlp``.
    :param hyperparams: mapping with keys ``'batch_size'``,
        ``'learning_rate'``, ``'n_epochs'``, ``'lambda_reg'``,
        ``'num_hidden'`` and ``'num_hidden_2'``.
    :returns: tuple of the evaluated values of ``classifier.params[0:6]``
        (named W1, b1, W2, b2, W3, b3 by this code) at the best epoch.  If
        no epoch completes, the initial parameter values are returned.
    """
    batch_size = hyperparams['batch_size']
    learning_rate = hyperparams['learning_rate']
    n_epochs = hyperparams['n_epochs']
    lambda_reg = hyperparams['lambda_reg']
    num_hidden = hyperparams['num_hidden']
    num_hidden_2 = hyperparams['num_hidden_2']

    # Shuffle samples and labels with the same permutation.
    order = np.arange(train_labels.shape[0])
    np.random.shuffle(order)
    train_samples_x = train_samples[order, :]
    train_samples_y = train_labels
    if len(train_labels.shape) == 1:
        # reshape() returns a new view; assigning to .shape (as the
        # original did) silently reshaped the caller's array.
        train_samples_y = train_labels.reshape(train_labels.shape[0], 1)
    train_samples_y = train_samples_y[order, :]

    train_set_x = theano.shared(
        np.asarray(train_samples_x, dtype=theano.config.floatX), borrow=True)
    train_set_y = theano.shared(
        np.asarray(train_samples_y, dtype=theano.config.floatX), borrow=True)

    # Number of whole minibatches; a trailing partial batch is dropped.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # minibatch of inputs
    y = T.matrix('y')  # minibatch of targets
    learning_rate_t = T.scalar('learning_rate')

    num_out = train_set_y.shape[1].eval()
    num_in = train_samples_x.shape[1]

    # Fixed seed for the random weight initialisation.
    rng = np.random.RandomState(1234)
    classifier = MLP(rng=rng, input=x, n_in=num_in, n_hidden=num_hidden,
                     n_hidden_2=num_hidden_2, n_out=num_out)

    # Cost minimised during training: Euclidean loss plus L2 penalty.
    cost = classifier.euclidean_loss(y) + lambda_reg * classifier.L2_sqr

    # Plain SGD: one (variable, update expression) pair per parameter.
    gparams = [T.grad(cost, param) for param in classifier.params]
    updates = [(param, param - learning_rate_t * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # Returns the minibatch cost and applies `updates` as a side effect.
    train_model = theano.function(
        inputs=[index, learning_rate_t],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    def snapshot_params():
        """Evaluate the first six model parameters to concrete values."""
        return tuple(classifier.params[i].eval() for i in range(6))

    ###############
    # TRAIN MODEL #
    ###############
    start_time = time.clock()

    validation_scores = np.array([])
    costs = np.array([])
    moving_scores = np.array([])
    moving_costs = np.array([])

    epoch = 0
    best_validation_score = -np.inf
    best_cost = np.inf
    validation_improved_in = 0
    cost_improved_in = 0
    # Previously uninitialised: `+= 1` could be reached before any assignment.
    score_improved_in = 0
    # Previously unbound at `return` when training aborted in the first epoch.
    best_params = snapshot_params()

    while epoch < n_epochs:
        epoch += 1
        costs_epoch = np.array([])
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index, learning_rate)
            costs_epoch = np.hstack([costs_epoch, minibatch_avg_cost])

        # Evaluate the model on the current epoch.
        current_params = snapshot_params()
        _, _, _, _, f1, _, _ = test_mlp(test_labels, test_samples,
                                        current_params)
        curr_f1 = np.mean(f1)

        # Abort on a diverged model (NaN score or NaN first-layer weights).
        if np.isnan(curr_f1):
            best_validation_score = 0
            break
        if np.isnan(np.sum(current_params[0])):
            best_validation_score = 0
            break

        validation_scores = np.hstack([validation_scores, curr_f1])
        epoch_cost = np.mean(costs_epoch)
        costs = np.hstack([costs, epoch_cost])

        # Raw values for the first 10 epochs, then a 10-epoch moving average.
        if epoch <= 10:
            moving_costs = np.hstack([moving_costs, epoch_cost])
            moving_scores = np.hstack([moving_scores, curr_f1])
        else:
            moving_costs = np.hstack([moving_costs, np.mean(costs[-10:])])
            moving_scores = np.hstack(
                [moving_scores, np.mean(validation_scores[-10:])])

        if moving_costs[-1] < best_cost:
            best_cost = moving_costs[-1]
            cost_improved_in = 0
        else:
            cost_improved_in += 1

        if moving_scores[-1] > best_validation_score:
            best_params = snapshot_params()
            best_validation_score = moving_scores[-1]
            score_improved_in = 0
            validation_improved_in = 0
        else:
            score_improved_in += 1
            validation_improved_in += 1

        if score_improved_in > 10:
            print('Rate reduced')
            learning_rate /= 1.5
            score_improved_in = 0

        # If the score has not improved in some time terminate early.
        if validation_improved_in > 60:
            print('Early termination')
            break

        print('Epoch - %d, cost - %f (%f, %d), F1 - %f (%f, %d)' %
              (epoch, epoch_cost, moving_costs[-1], cost_improved_in,
               curr_f1, moving_scores[-1], score_improved_in))

    end_time = time.clock()

    # np.max raises on an empty array (no epoch completed); report NaN then.
    if validation_scores.size:
        final_best = np.max(validation_scores)
    else:
        final_best = float('nan')
    print('Optimization complete with best validation score of %f ' %
          final_best)

    elapsed = end_time - start_time
    epochs_per_sec = 1. * epoch / elapsed if elapsed > 0 else 0.
    print('The code run for %d epochs, with %f epochs/sec' %
          (epoch, epochs_per_sec))

    return best_params
code and theano.py""" #### Libraries # Third Party Libraries import matplotlib.pyplot as plt import numpy as np import theano import theano.tensor as T rng = np.random.RandomState(23455) # Paramater iniitilization # Symbolic variable X = T.matrix(name='X', dtype=theano.config.floatX) y = T.vector(name='y', dtype=theano.config.floatX) lr = T.scalar(name='learn_rate', dtype=theano.config.floatX) # Variables that will be updated, hence are declared as `theano.share` theta = theano.shared(name='theta', value=rng.uniform(-1.0, 1.0, size=(3)).astype(theano.config.floatX)) bias = theano.shared(name='bias', value=rng.uniform(13, 17, size=(1, 1)).astype(theano.config.floatX), broadcastable=(True, True)) # ADAM Parameters beta1 = T.scalar(name='beta1', dtype=theano.config.floatX) beta2 = T.scalar(name='beta2', dtype=theano.config.floatX) eps = T.scalar(name='eps', dtype=theano.config.floatX)
def train( dim_word=100, # word vector dimensionality dim=100, # the number of GRU units encoder='tree_lstm', # encoder model decoder='tree_lstm', # decoder model patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates decay_c=0., # L2 regularization penalty clip_c=-1., # gradient clipping threshold lrate=0.01, # learning rate n_words=100000, # vocabulary size maxlen=100, # maximum length of the description optimizer='adadelta', batch_size=16, valid_batch_size=16, saveto='model.npz', dispFreq=100, validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates use_dropout=False, reload_=False, verbose=False, # print verbose information for debug but slow speed datasets=[], valid_datasets=[], test_datasets=[], dictionary='', embedding='', # pretrain embedding file, such as word2vec, GLOVE ): logging.basicConfig( level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") # Model options model_options = locals().copy() # load dictionary and invert them with open(dictionary, 'rb') as f: worddicts = pkl.load(f) # reload options if reload_ and os.path.exists(saveto): print 'Reload options' with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) logger.debug(pprint.pformat(model_options)) print 'Loading data' train = TextIterator(datasets[0], datasets[1], datasets[2], dictionary, n_words=n_words, batch_size=batch_size, maxlen=maxlen) train_valid = TextIterator(datasets[0], datasets[1], datasets[2], dictionary, n_words=n_words, batch_size=valid_batch_size, shuffle=False) valid = TextIterator(valid_datasets[0], valid_datasets[1], valid_datasets[2], dictionary, n_words=n_words, batch_size=valid_batch_size, shuffle=False) test = TextIterator(test_datasets[0], test_datasets[1], test_datasets[2], dictionary, n_words=n_words, batch_size=valid_batch_size, shuffle=False) # Initialize (or reload) the parameters using 'model_options' # then build the Theano graph print 
'Building model' params = init_params(model_options, worddicts) # reload parameters if reload_ and os.path.exists(saveto): print 'Reload parameters' params = load_params(saveto, params) # numpy arrays -> theano shared variables tparams = init_tparams(params) trng, use_noise, \ x1, x1_mask, x1_left_mask, x1_right_mask, \ x2, x2_mask, x2_left_mask, x2_right_mask, \ y, \ opt_ret, \ cost, \ f_pred, f_prods = \ build_model(tparams, model_options) inps = [x1, x1_mask, x1_left_mask, x1_right_mask, \ x2, x2_mask, x2_left_mask, x2_right_mask, \ y] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' # apply gradient clipping here if clip_c > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads if verbose: print 'Building function of gradient\'s norm' f_norm_g = theano.function(inps, tensor.sqrt(g2)) # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): print 'Reload history error' history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_counter = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size uidx = 0 estop = False valid_acc_record = [] test_acc_record = [] best_epoch_num = 0 lr_change_list = [] wait_counter = 0 wait_N = 1 for eidx in xrange(max_epochs): n_samples = 0 for x1, x2, y in train: n_samples += len(x1) uidx += 1 use_noise.set_value(1.) 
x1, x2, y = prepare_data(x1, x2, y) inps = [x1[0], x1[1], x1[2], x1[3], x2[0], x2[1], x2[2], x2[3], y] if x1 is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() # compute cost, grads and copy grads to shared variables cost = f_grad_shared(*inps) if verbose: if clip_c > 0.: norm_g = f_norm_g(*inps) # do the update on parameters f_update(lrate) ud = time.time() - ud_start # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return None # verbose if numpy.mod(uidx, dispFreq) == 0: logger.debug('Epoch {0} Update {1} Cost {2} UD {3}'.format( eidx, uidx, cost, ud)) if verbose: if clip_c > 0.: logger.debug('Grad {0}'.format(norm_g)) # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) 
valid_cost = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() valid_acc = pred_acc(f_pred, prepare_data, model_options, valid) valid_err = 1.0 - valid_acc history_errs.append(valid_err) test_cost = pred_probs(f_log_probs, prepare_data, model_options, test).mean() test_acc = pred_acc(f_pred, prepare_data, model_options, test) print 'Valid cost', valid_cost print 'Valid accuracy', valid_acc print 'Test cost', test_cost print 'Test accuracy', test_acc print 'lrate:', lrate valid_acc_record.append(valid_acc) test_acc_record.append(test_acc) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) best_epoch_num = eidx wait_counter = 0 if valid_err > numpy.array(history_errs).min(): wait_counter += 1 if wait_counter >= wait_N: print 'wait_counter max, need to half the lr' bad_counter += 1 wait_counter = 0 print 'bad_counter: ' + str(bad_counter) lrate = lrate * 0.5 lr_change_list.append(eidx) print 'lrate change to: ' + str(lrate) zipp(best_p, tparams) if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): ipdb.set_trace() # finish after this many updates if uidx >= finish_after: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) with open('record.csv', 'w') as f: f.write(str(best_epoch_num) + '\n') f.write(','.join(map(str, lr_change_list)) + '\n') f.write(','.join(map(str, valid_acc_record)) + '\n') f.write(','.join(map(str, test_acc_record)) + '\n') use_noise.set_value(0.) 
print '=' * 80 print 'Final Result' print '=' * 80 train_cost = pred_probs(f_log_probs, prepare_data, model_options, train_valid).mean() train_acc = pred_acc(f_pred, prepare_data, model_options, train_valid) print 'Train cost', train_cost print 'Train accuracy', train_acc valid_cost = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() valid_acc = pred_acc(f_pred, prepare_data, model_options, valid) print 'Valid cost', valid_cost print 'Valid accuracy', valid_acc test_cost = pred_probs(f_log_probs, prepare_data, model_options, test).mean() test_acc = pred_acc(f_pred, prepare_data, model_options, test) print 'Test cost', test_cost print 'Test accuracy', test_acc params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) logger.debug('Done') return None
def make_scalar():
    """Create a fresh symbolic scalar with ``T.scalar()`` and return it."""
    scalar_var = T.scalar()
    return scalar_var
def build_nlm_model(rng,
                    model_params,
                    self_norm_coeff,
                    act_func,
                    dropout,
                    is_test=0):
    """Build the symbolic NLM classifier and its SGD training pieces.

    Adapted from http://deeplearning.net/tutorial/mlp.html

    ``act_func`` selects the activation: ``'tanh'``, ``'relu'`` or
    ``'leakyrelu'``.  Any other value writes an error to stderr and
    terminates the process with exit status 1.

    Returns, depending on the arguments:

    * ``is_test == 1``: ``(classifier, x, y)``
    * ``self_norm_coeff > 0``:
      ``(classifier, x, y, lr, cost, grad_norm, mean_abs_log_norm, updates)``
    * otherwise: ``(classifier, x, y, lr, cost, grad_norm, updates)``

    ``cost`` is ``classifier.nll(y)``, plus ``self_norm_coeff`` times
    ``classifier.mean_square_log_norm`` when the coefficient is positive.
    ``grad_norm`` is the L2 norm over all parameter gradients, and
    ``updates`` holds one plain gradient-descent step per parameter.
    """
    # symbolic variables
    x = T.matrix('x')
    # int32 labels (the original note: the GPU stores values in float32,
    # so they have to be converted to int32)
    y = T.ivector('y')
    lr = T.scalar('lr')

    # Reject unknown activations up front.
    known_activations = {
        'tanh': 'tanh',
        'relu': 'rectifier',
        'leakyrelu': 'leaky rectifier',
    }
    if act_func not in ('tanh', 'relu', 'leakyrelu'):
        sys.stderr.write(
            '! Unknown activation function %s, not tanh or relu\n' %
            (act_func))
        sys.exit(1)

    sys.stderr.write('# act_func=%s\n' % known_activations[act_func])
    if act_func == 'tanh':
        activation = T.tanh
    elif act_func == 'relu':
        activation = rectifier
    else:
        activation = leaky_rect

    sys.stderr.write('# self_norm_coeff=%f\n' % self_norm_coeff)

    # classifier
    classifier = NLM(rng, x, model_params, self_norm_coeff, activation,
                     dropout, is_test)

    # At test time nothing beyond the classifier graph is needed.
    if is_test == 1:
        return (classifier, x, y)

    # cost
    cost = classifier.nll(y)
    if self_norm_coeff > 0:
        cost = cost + self_norm_coeff * classifier.mean_square_log_norm
    mean_abs_log_norm = classifier.mean_abs_log_norm

    # gradients and their global L2 norm
    gparams = [T.grad(cost, param) for param in classifier.params]
    grad_norm = T.sqrt(sum(((g**2).sum() for g in gparams), 0.0))

    # plain SGD step for every parameter
    updates = [(param, param - lr * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    if self_norm_coeff > 0:
        return (classifier, x, y, lr, cost, grad_norm, mean_abs_log_norm,
                updates)
    return (classifier, x, y, lr, cost, grad_norm, updates)
which_sources=('sp', )) data_stream = ScaleAndShift(data_stream, scale=1 / f0_std, shift=-f0_mean / f0_std, which_sources=('f0', )) data_stream = Mapping(data_stream, _zero_for_unvoiced) data_stream = Mapping(data_stream, _transpose) data_stream = SegmentSequence(data_stream, 8 * seq_size, add_flag=True) data_stream = ForceFloatX(data_stream) valid_stream = data_stream ################# # Model ################# start_flag = tensor.scalar('start_flag') x = tensor.tensor3('sp') #x = tensor.tensor3('features') f0 = tensor.matrix('f0') voiced = tensor.matrix('voiced') f0s = f0.dimshuffle(0, 1, 'x') voiceds = voiced.dimshuffle(0, 1, 'x') context = tensor.concatenate([f0s, voiceds], 2) activations_x = [Rectifier()] * depth_x dims_x = [frame_size] + [hidden_size_mlp_x]*(depth_x-1) + \ [hidden_size_recurrent]
def __init__(self, objective, params, inputs=None,
             param_constrainers=None, max_iter=-1,
             lr_scalers=None, verbose=0, tol=None,
             init_alpha=None, min_init_alpha=1e-3,
             reset_alpha=True, conjugate=False,
             reset_conjugate=True, gradients=None,
             gradient_updates=None, line_search_mode=None,
             accumulate=False, theano_function_mode=None):
    """
    Compile the functions used for batch gradient descent on `objective`.

    Every argument is first stored on the instance under its own name
    (via ``self.__dict__.update(locals())``); some are then overwritten
    below (``init_alpha``, ``verbose``, ``params``, ``tol``).

    Parameters read by this constructor:

    objective : symbolic expression differentiated w.r.t. each parameter
        and compiled into ``self.obj``.
    params : iterable of shared variables (``get_value`` and ``name`` are
        used on each one).
    inputs : inputs of the compiled gradient/objective functions;
        ``None`` is treated as ``[]``.
    param_constrainers : list or tuple of callables, each called with the
        dict of parameter updates before ``_goto_alpha`` is compiled;
        ``None`` is treated as ``[]``.
    lr_scalers : optional mapping param -> multiplier applied to the step
        size for that parameter.
    verbose : when > 0 the module logger is switched to DEBUG.
    tol : stored as ``self.tol``; when ``None`` it is 1e-6 if
        ``objective.dtype == "float32"`` and 3e-7 otherwise.
    init_alpha : sequence of step sizes, stored as a tuple of floats.
        Defaults to (.001, .005, .01, .05, .1) when `line_search_mode` is
        ``None`` and to (.5, 1.) when it is 'exhaustive'.
    conjugate : when true, the conjugate-gradient helper functions
        ``_store_old_grad`` and ``_make_conjugate`` are also compiled.
    gradients : optional mapping param -> gradient expression used
        instead of ``grad(objective, param)``.
    gradient_updates : optional updates merged into the gradient
        computation function.
    line_search_mode : ``None`` or 'exhaustive' (asserted).
    accumulate : when true, ``Accumulator`` is used in place of
        ``function`` for the gradient and objective functions.
    theano_function_mode : ``mode`` passed to every ``function`` call.

    `max_iter`, `min_init_alpha`, `reset_alpha` and `reset_conjugate` are
    only stored on the instance here.
    """

    # Store every argument as an attribute, then drop the spurious
    # `self.self` entry that locals() brings along.
    self.__dict__.update(locals())
    del self.self

    # Pick the default grid of candidate step sizes.
    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)
    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    # Only the local names are replaced here; self.inputs and
    # self.param_constrainers keep the value that was passed in.
    if inputs is None:
        inputs = []

    if param_constrainers is None:
        param_constrainers = []

    obj = objective

    self.verbose = verbose
    # TODO: remove verbose statements (handled by logging)
    if self.verbose > 0:
        logger.setLevel(logging.DEBUG)

    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [param for param in params]

    # For each parameter: obtain its symbolic gradient and allocate a
    # zero-filled shared variable that the gradient function writes into.
    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX(param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    if self.verbose:
        logger.debug('batch gradient class compiling gradient function')
    t1 = time.time()
    # _compute_grad has no outputs; it only stores the gradients into the
    # shared variables created above.
    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates=updates)
    else:
        self._compute_grad = function(
            inputs,
            updates=updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        logger.debug('done. Took {0}'.format(t2 - t1))

    if self.verbose:
        logger.debug('batch gradient class compiling objective function')
    # self.obj evaluates the objective for the given inputs.
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode,
                            name='BatchGradientDescent.obj')

    if self.verbose:
        logger.debug('done')

    self.param_to_cache = OrderedDict()
    # Symbolic step size taken by _goto_alpha.
    alpha = T.scalar(name='alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()
    # For each parameter: a cache holding a copy of its value, an update
    # that refreshes the cache, and an update that moves the parameter to
    # `cached - scaled_alpha * stored_gradient`.
    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                             name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff
    # _cache_values copies the current parameter values into the caches.
    self._cache_values = function(
        [],
        updates=cache_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._cache_values')
    assert isinstance(param_constrainers, (list, tuple))
    # Constrainers receive the update dict before it is compiled.
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)
    # _goto_alpha(alpha) applies the (possibly constrained) parameter updates.
    self._goto_alpha = function(
        [alpha],
        updates=goto_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._goto_alpha')

    # Square root of the summed squared entries of all stored gradients.
    norm = T.sqrt(sum([T.sqr(elem).sum() for elem in
                       self.param_to_grad_shared.values()]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    # Weighted average of the gradient norm, mixed with weight new_weight.
    normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (
        1. - self.new_weight) * self.ave_grad_size

    # _normalize_grad returns the norm and divides every stored gradient
    # by it in place.
    self._normalize_grad = function(
        [],
        norm,
        updates=normalize_grad_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        # One "old gradient" shared variable per stored gradient.
        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = sharedX(elem.get_value(),
                                             'old_' + elem.name)

        # _store_old_grad(norm) saves each stored gradient multiplied by
        # the value passed in for `norm`.
        self._store_old_grad = function(
            [norm],
            updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                 for g_ in grad_to_old_grad]),
            mode=self.theano_function_mode,
            name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

        def dot_product(x, y):
            # Sum of elementwise products over two equal-length lists of
            # tensors.
            return sum([(x_elem * y_elem).sum()
                        for x_elem, y_elem in safe_zip(x, y)])

        # The 1e-7 in the denominator keeps the division finite when the
        # old gradient is all zeros.
        beta_pr = (dot_product(grad_ordered, grad_ordered) -
                   dot_product(grad_ordered, old_grad_ordered)) / \
            (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0

        beta = T.maximum(beta_pr, 0.)

        # beta_pr is the Polak-Ribiere formula for beta.
        # According to wikipedia, the beta to use for NCG is "a matter of
        # heuristics or taste" but max(0, beta_pr) is "a popular choice...
        # which provides direction reset automatically." (ie, it is meant
        # to revert to steepest descent when you have traveled far enough
        # that the objective function is behaving non-quadratically enough
        # that the conjugate gradient formulas aren't working anymore)

        # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

        # NOTE(review): `grad` is the name called above as
        # grad(objective, param), not a local gradient variable, so this
        # assertion looks vestigial - confirm before relying on it.
        assert grad not in grad_to_old_grad

        make_conjugate_updates = [(g_, g_ + beta * grad_to_old_grad[g_])
                                  for g_ in grad_ordered]

        # If the compilation mode exposes a `record`, log a description of
        # every updated variable and its update expression.
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                        + var_descriptor(v) + '\n')
                mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                        + var_descriptor(u) + '\n')

        # _make_conjugate adds beta times the old gradient to each stored
        # gradient.
        self._make_conjugate = function(
            [],
            updates=make_conjugate_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._make_conjugate')

        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                        + var_descriptor(output) + '\n')

    # Default tolerance depends on the dtype of the objective.
    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    # Shared scalars initialised to zero; nothing in this constructor
    # updates them.
    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
def __init__(self,
        window_size,
        n_quadratic_filters,
        activation_function,
        reconstruction_cost_function,
        tie_weights=False,
#        _input,
#        _targ
        ):
    """
    Build a windowed model: one QDAA per input sharing parameters, a QDAA
    on the concatenated hidden codes, and a Module_Nclass output layer.

    :param window_size: number of symbolic input matrices to create; the
        first one is always used, so it must be at least 1
    :param n_quadratic_filters: forwarded to every QDAA
    :param activation_function: forwarded to every QDAA
    :param reconstruction_cost_function: forwarded to every QDAA
    :param tie_weights: forwarded to every QDAA

    Attributes created: ``lr``, ``inputs``, ``targ``,
    ``input_representations``, ``input_representation``, ``hidden``,
    ``output``, ``pretraining_update`` and ``finetuning_update``.
    """
    super(ConvolutionalMLP, self).__init__()

    # Symbolic learning rate used by both update methods below.
    #self.lr = module.Member(T.scalar())
    self.lr = (T.scalar())

    self.inputs = [T.dmatrix() for i in range(window_size)]
    self.targ = T.lvector()

    # The first QDAA creates the parameters...
    self.input_representations = []
    self.input_representations.append(QDAA(
        input=self.inputs[0],
        tie_weights=tie_weights,
        n_quadratic_filters=n_quadratic_filters,
        activation_function=activation_function,
        reconstruction_cost_function=reconstruction_cost_function
    ))

    # ...and every further window position reuses them.
    for i in self.inputs[1:]:
        self.input_representations.append(
            QDAA(
                input=i,
                tie_weights=tie_weights,
                n_quadratic_filters=n_quadratic_filters,
                activation_function=activation_function,
                reconstruction_cost_function=reconstruction_cost_function,
                _w1=self.input_representations[0].w1,
                _w2=self.input_representations[0].w2,
                _b1=self.input_representations[0].b1,
                _b2=self.input_representations[0].b2,
                _qfilters=self.input_representations[0].qfilters
            )
        )
        # Sanity check that the weights really are the same object.
        assert self.input_representations[-1].w1 is \
            self.input_representations[0].w1

    # Hidden codes of all window positions, joined along axis 1.
    self.input_representation = T.concatenate(
        [i.hidden for i in self.input_representations], axis=1)
    self.hidden = QDAA(
        input=self.input_representation,
        tie_weights=tie_weights,
        n_quadratic_filters=n_quadratic_filters,
        activation_function=activation_function,
        reconstruction_cost_function=reconstruction_cost_function
    )
    self.output = Module_Nclass(x=self.hidden.hidden, targ=self.targ)

    # Pretraining: each layer follows the gradient of its own `ncost`.
    input_pretraining_params = [
        self.input_representations[0].w1,
        self.input_representations[0].w2,
        self.input_representations[0].b1,
        self.input_representations[0].b2
    ] + self.input_representations[0].qfilters
    hidden_pretraining_params = [
        self.hidden.w1,
        self.hidden.w2,
        self.hidden.b1,
        self.hidden.b2
    ] + self.hidden.qfilters
    input_pretraining_cost = sum(i.ncost for i in self.input_representations)
    hidden_pretraining_cost = self.hidden.ncost
    input_pretraining_gradients = T.grad(input_pretraining_cost,
                                         input_pretraining_params)
    hidden_pretraining_gradients = T.grad(hidden_pretraining_cost,
                                          hidden_pretraining_params)
    # list(...) is required because zip() returns an iterator on Python 3
    # and iterators cannot be concatenated with `+`; on Python 2 the result
    # is the same list as before.
    pretraining_updates = \
        dict((p, p - self.lr * g) for p, g in
             list(zip(input_pretraining_params, input_pretraining_gradients))
             + list(zip(hidden_pretraining_params, hidden_pretraining_gradients)))

    self.pretraining_update = module.Method(
        self.inputs,
        [input_pretraining_cost, hidden_pretraining_cost],
        pretraining_updates)

    # Finetuning: w1, b1 and the quadratic filters of both QDAAs plus the
    # output layer follow the gradient of the output cost (w2 and b2 are
    # not included).
    finetuning_params = \
        [self.input_representations[0].w1, self.input_representations[0].b1] + \
        self.input_representations[0].qfilters + \
        [self.hidden.w1, self.hidden.b1] + self.hidden.qfilters + \
        [self.output.w, self.output.b]
    finetuning_cost = self.output.cost
    finetuning_gradients = T.grad(finetuning_cost, finetuning_params)
    finetuning_updates = dict((p, p - self.lr * g) for p, g in
                              zip(finetuning_params, finetuning_gradients))
    self.finetuning_update = module.Method(self.inputs + [self.targ],
                                           self.output.cost,
                                           finetuning_updates)
# = arg_i - log sum_j exp(arg_j) return example_costs.mean() confidence = ymf1_arg - ((1 - yb) * ymf1_arg).max(axis=1).dimshuffle(0, 'x') misclass_cost = -(confidence * yb).sum(axis=1).mean() mf1_cost = - log_p_yb ( ymf1_arg) + \ l1wd * T.sqr(mf1mod.W1).sum() +\ l2wd * T.sqr(mf1mod.W2).sum() +\ l3wd * T.sqr(mf1mod.W3).sum() updates = {} alpha = T.scalar() alpha.tag.test_value = 1e-4 tv = T.scalar() momentum = 1. - 1. / tv for cost, params in [(mf1_cost, mf1mod.params())]: for param in params: inc = sharedX(np.zeros(param.get_value().shape)) updates[inc] = momentum * inc - alpha * T.grad(cost, param) updates[param] = param + updates[inc] from theano import function func = function([idx, alpha, tv], [mf1_cost], updates=updates)
# Test value: a 1000 x 1000 identity matrix in Theano's configured float type.
x = np.eye(1000, dtype=theano.config.floatX)

# Time the compiled function (compute_norm_lines is defined earlier in the
# script).
tic = time.time()
A = compute_norm_lines(x)
print('It took %f seconds' % (time.time() - tic))

# Comparison with numpy: square root of the sum of squares along axis 1.
tic = time.time()
B = np.sqrt((x**2).sum(1))
print('It took %f seconds' % (time.time() - tic))

# Written as a call, like the other prints here, so the line parses on
# Python 3 as well; on Python 2 it prints the same text.
print('-' * 50)

coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x")
max_coefficients_supported = 10000

# Generate the components of the polynomial: each step multiplies one
# coefficient by x raised to the matching value of the arange sequence.
components, updates = theano.scan(
    fn=lambda coefficient, power, free_variable:
        coefficient * (free_variable**power),
    sequences=[coefficients,
               theano.tensor.arange(max_coefficients_supported)],
    non_sequences=x)
# Sum them up
polynomial = components.sum()

# Compile a function
def __init__(self,
        input=None,
#        regularize = False,
        tie_weights=False,
        n_quadratic_filters=1,
        _w1=None,
        _w2=None,
        _b1=None,
        _b2=None,
        _qfilters=None,
        activation_function=NN.sigmoid,
        reconstruction_cost_function=cross_entropy):
    """
    Build the symbolic graph of a quadratic denoising auto-associator.

    :param input: symbolic input; when None a new ``T.matrix('input')`` is
        created
    :param tie_weights: when true and `_w2` is None, ``w2`` is ``w1.T`` and
        is left out of ``self.params``
    :param n_quadratic_filters: number of ``T.dmatrix`` quadratic filters to
        create when `_qfilters` is None
    :param _w1: existing variable to use as ``w1`` instead of a new one
    :param _w2: existing variable to use as ``w2`` instead of a new one
    :param _b1: existing variable to use as ``b1`` instead of a new one
    :param _b2: existing variable to use as ``b2`` instead of a new one
    :param _qfilters: existing quadratic filters to use instead of new ones
    :param activation_function: stored as ``self.activation_function``
    :param reconstruction_cost_function: stored as
        ``self.reconstruction_cost_function``. Should return one cost per
        example (row)

    The constructor also relies on ``self.hid_activation_function``,
    ``self.out_activation_function``, ``self.build_reconstruction_costs``
    and ``self.build_corrupted_input``, which are defined outside this
    method.

    :todo: Default noise level for all daa levels
    """
    super(QuadraticDenoisingAA, self).__init__()

    self.random = T.RandomStreams()

    # MODEL CONFIGURATION
#    self.regularize = regularize
    self.tie_weights = tie_weights
    self.activation_function = activation_function
    self.reconstruction_cost_function = reconstruction_cost_function

    # ACQUIRE/MAKE INPUT
    # Compare against None explicitly: `input` is normally a symbolic
    # variable, and truth-testing it is not a reliable "was it supplied?"
    # check.
    if input is None:
        input = T.matrix('input')
    #self.input = theano.External(input)
    self.input = (input)

    # HYPER-PARAMETERS
    #self.lr = theano.Member(T.scalar())
    self.lr = (T.scalar())

    # PARAMETERS
    if _qfilters is None:
        #self.qfilters = [theano.Member(T.dmatrix('q%i'%i)) for i in xrange(n_quadratic_filters)]
        self.qfilters = [(T.dmatrix('q%i' % i))
                         for i in xrange(n_quadratic_filters)]
    else:
        #self.qfilters = [theano.Member(q) for q in _qfilters]
        self.qfilters = [(q) for q in _qfilters]

    #self.w1 = theano.Member(T.matrix('w1')) if _w1 is None else theano.Member(_w1)
    if _w1 is None:
        self.w1 = (T.matrix('w1'))
    else:
        self.w1 = (_w1)
    if _w2 is None:
        if not tie_weights:
            #self.w2 = theano.Member(T.matrix())
            self.w2 = (T.matrix())
        else:
            # Tied weights: the decoder matrix is the encoder's transpose.
            self.w2 = self.w1.T
    else:
        #self.w2 = theano.Member(_w2)
        self.w2 = (_w2)
    #self.b1 = theano.Member(T.vector('b1')) if _b1 is None else theano.Member(_b1)
    if _b1 is None:
        self.b1 = (T.vector('b1'))
    else:
        self.b1 = (_b1)
    #self.b2 = theano.Member(T.vector('b2')) if _b2 is None else theano.Member(_b2)
    if _b2 is None:
        self.b2 = (T.vector('b2'))
    else:
        self.b2 = (_b2)

#    # REGULARIZATION COST
#    self.regularization = self.build_regularization()

    ### NOISELESS ###
    # HIDDEN LAYER
    def _act(x):
        # Linear term plus, when quadratic filters exist, the square root
        # of the summed squared filter responses.
        if len(self.qfilters) > 0:
            qsum = 10e-10   # helps to control the gradient in the square-root below
            for qf in self.qfilters:
                qsum = qsum + T.dot(x, qf) ** 2

            return T.dot(x, self.w1) + self.b1 + T.sqrt(qsum)
        else:
            return T.dot(x, self.w1) + self.b1

    self.hidden_activation = _act(self.input)  # noise-free hidden
    self.hidden = self.hid_activation_function(self.hidden_activation)

    # RECONSTRUCTION LAYER
    self.output_activation = T.dot(self.hidden, self.w2) + self.b2
    self.output = self.out_activation_function(self.output_activation)

    # RECONSTRUCTION COST
    self.reconstruction_costs = self.build_reconstruction_costs(self.output)
    self.reconstruction_cost = T.mean(self.reconstruction_costs)

    # TOTAL COST
    self.cost = self.reconstruction_cost
#    if self.regularize:
#        self.cost = self.cost + self.regularization

    ### WITH NOISE ###
    # Same pipeline as above, applied to the corrupted input.
    self.corrupted_input = self.build_corrupted_input()

    # HIDDEN LAYER
    self.nhidden_activation = _act(self.corrupted_input)
    self.nhidden = self.hid_activation_function(self.nhidden_activation)

    # RECONSTRUCTION LAYER
    self.noutput_activation = T.dot(self.nhidden, self.w2) + self.b2
    self.noutput = self.out_activation_function(self.noutput_activation)

    # RECONSTRUCTION COST
    self.nreconstruction_costs = self.build_reconstruction_costs(self.noutput)
    self.nreconstruction_cost = T.mean(self.nreconstruction_costs)

    # TOTAL COST
    self.ncost = self.nreconstruction_cost
#    if self.regularize:
#        self.ncost = self.ncost + self.regularization

    # GRADIENTS AND UPDATES
    # With tied weights w2 is derived from w1, so it is not a parameter.
    if self.tie_weights:
        self.params = [self.w1, self.b1, self.b2] + self.qfilters
    else:
        self.params = [self.w1, self.w2, self.b1, self.b2] + self.qfilters
    gradients = T.grad(self.ncost, self.params)
    # Plain gradient step on the noisy cost. The dict is only built here;
    # nothing in this method consumes it.
    updates = dict((p, p - self.lr * g)
                   for p, g in zip(self.params, gradients))
def build(self, initial_stepsize, n_steps, target_acceptance_rate=.65,
          stepsize_dec=0.98, stepsize_min=0.0001, stepsize_max=0.5,
          stepsize_inc=1.02, avg_acceptance_slowness=0.9, seed=12345,
          init_state=None):
    """
    Compile the Theano functions that evaluate and advance the HMC chains.

    Sets ``self.h`` (shared chain states), ``self.generated``,
    ``self.eval_lld`` and ``self.step``. Both compiled functions take one
    scalar argument, which is substituted for ``self.t``; ``self.obs`` is
    substituted by ``self.obs_val``.

    ``initial_stepsize`` and ``target_acceptance_rate`` initialise shared
    variables; ``n_steps`` is passed to ``hmc_move``; the ``stepsize_*``
    arguments, ``target_acceptance_rate`` and ``avg_acceptance_slowness``
    are forwarded to ``hmc_updates`` (the latter is used in a geometric
    average; 1.0 would be not moving at all). ``seed`` seeds the random
    stream. ``init_state``, when given, is used as the initial value of
    ``self.h`` instead of a standard-normal draw.
    """
    def observed_givens(time_var):
        # A fresh substitution dict for each compiled function.
        return {self.obs: self.obs_val, self.t: time_var}

    if init_state is not None:
        init_h = init_state
        print('load init_state')
    else:
        init_h = np.random.normal(
            0, 1,
            size=[self.n_sam * self.batch_size,
                  self.hdim]).astype(np.float32)
    # Not used below; the draw is kept so the global NumPy RNG is consumed
    # exactly as before.
    init_m = np.random.randn(self.n_sam * self.batch_size,
                             self.hdim).astype(np.float32)

    # For HMC: h holds the current states.
    self.h = sharedX(init_h)
    t_sym = T.scalar()

    self.generated = self.generate(self.h)

    # Negated energy, arranged as (n_sam, batch_size).
    neg_energy = -self.energy_fn(self.h)
    lld = T.reshape(neg_energy, [self.n_sam, self.batch_size])
    self.eval_lld = theano.function([t_sym], lld,
                                    givens=observed_givens(t_sym))

    # Shared variables adapted across `step` calls.
    stepsize = sharedX(initial_stepsize)
    avg_acceptance_rate = sharedX(target_acceptance_rate)
    s_rng = TT.shared_randomstreams.RandomStreams(seed)

    # Graph for an `n_steps` HMC simulation.
    accept, final_pos = hmc_move(s_rng, self.h, self.energy_fn, stepsize,
                                 n_steps)

    # Updates applied on every `step` call.
    adaptation = dict(
        final_pos=final_pos,
        accept=accept,
        stepsize_min=stepsize_min,
        stepsize_max=stepsize_max,
        stepsize_inc=stepsize_inc,
        stepsize_dec=stepsize_dec,
        target_acceptance_rate=target_acceptance_rate,
        avg_acceptance_slowness=avg_acceptance_slowness,
    )
    simulate_updates = hmc_updates(self.h, stepsize, avg_acceptance_rate,
                                   **adaptation)

    self.step = theano.function([t_sym], [accept],
                                updates=simulate_updates,
                                givens=observed_givens(t_sym))
def build_corrupted_input(self):
    """
    Create ``self.noise_level`` and return the corrupted version of the input.

    A new symbolic scalar is stored as ``self.noise_level``. The result is
    ``self.input`` multiplied by a draw from ``self.random.binomial`` called
    with ``(T.shape(self.input), 1, 1 - self.noise_level)``.
    """
    #self.noise_level = theano.Member(T.scalar())
    noise_level = T.scalar()
    self.noise_level = noise_level

    input_shape = T.shape(self.input)
    mask = self.random.binomial(input_shape, 1, 1 - noise_level)
    return mask * self.input
def __init__(self, n_x, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid,
             nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli',
             batchnorm=False, seed=1234):
    """
    Initialize a skip deep generative model consisting of
    discriminative classifier q(y|a,x),
    generative model P p(a|z,y) and p(x|a,z,y),
    inference model Q q(a|x) and q(z|a,x,y).
    Weights are initialized using the Bengio and Glorot (2010) initialization scheme.
    :param n_x: Number of inputs.
    :param n_a: Number of auxiliary.
    :param n_z: Number of latent.
    :param n_y: Number of classes.
    :param qa_hid: List of number of deterministic hidden q(a|x).
    :param qz_hid: List of number of deterministic hidden q(z|a,x,y). Must be non-empty (element 0 is indexed).
    :param qy_hid: List of number of deterministic hidden q(y|a,x). Must be non-empty (element 0 is indexed).
    :param px_hid: List of number of deterministic hidden p(x|a,z,y). Must be non-empty (element 0 is indexed).
    :param pa_hid: List of number of deterministic hidden p(a|z,y). Must be non-empty (element 0 is indexed).
    :param nonlinearity: The transfer function used in the deterministic layers.
    :param px_nonlinearity: Nonlinearity passed to the mu/logvar layers of p(x|a,z,y); only used when x_dist is 'gaussian'.
    :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'.
    :param batchnorm: Boolean value for batch normalization.
    :param seed: The random seed.
    """
    super(SDGM, self).__init__(n_x, qz_hid + px_hid, n_a + n_z, nonlinearity)
    self.x_dist = x_dist
    self.n_y = n_y
    self.n_x = n_x
    self.n_a = n_a
    self.n_z = n_z
    self.batchnorm = batchnorm
    self._srng = RandomStreams(seed)

    # Decide Glorot initialization of weights: "relu" is passed to
    # init.GlorotNormal for rectify/softplus, an empty string otherwise.
    init_w = 1e-3
    hid_w = ""
    if nonlinearity == rectify or nonlinearity == softplus:
        hid_w = "relu"

    # Define symbolic variables for theano functions.
    self.sym_beta = T.scalar('beta')  # scaling constant beta
    self.sym_x_l = T.matrix('x')  # labeled inputs
    self.sym_t_l = T.matrix('t')  # labeled targets
    self.sym_x_u = T.matrix('x')  # unlabeled inputs (same variable name 'x' as the labeled inputs)
    self.sym_bs_l = T.iscalar('bs_l')  # number of labeled data
    self.sym_samples = T.iscalar('samples')  # MC samples
    self.sym_z = T.matrix('z')  # latent variable z
    self.sym_a = T.matrix('a')  # auxiliary variable a

    # Assist methods for collecting the layers
    def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
        # Linear dense layer, optional batch normalization, then the
        # transfer function self.transf.
        dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
        if batchnorm:
            dense = BatchNormLayer(dense)
        return NonlinearityLayer(dense, self.transf)

    def stochastic_layer(layer_in, n, samples, nonlin=None):
        # Two dense heads (mu and logvar) feeding a SampleLayer; returns
        # the sample layer together with both heads.
        mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar

    # Input layers
    l_x_in = InputLayer((None, n_x))
    l_y_in = InputLayer((None, n_y))

    # Auxiliary q(a|x)
    l_qa_x = l_x_in
    for hid in qa_hid:
        l_qa_x = dense_layer(l_qa_x, hid)
    l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(l_qa_x, n_a, self.sym_samples)

    # Classifier q(y|a,x)
    # The a and x branches are projected separately, given two extra axes
    # so they broadcast against each other, summed, and flattened again.
    l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0]))
    l_x_to_qy = DenseLayer(l_x_in, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1))
    l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0]))
    if batchnorm:
        l_qy_xa = BatchNormLayer(l_qy_xa)
    l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf)
    if len(qy_hid) > 1:
        for hid in qy_hid[1:]:
            l_qy_xa = dense_layer(l_qy_xa, hid)
    l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax)

    # Recognition q(z|x,a,y)
    l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0]))
    l_x_to_qz = DenseLayer(l_x_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1))
    l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1))
    l_qz_axy = ReshapeLayer(
        ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0]))
    if batchnorm:
        l_qz_axy = BatchNormLayer(l_qz_axy)
    l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf)
    if len(qz_hid) > 1:
        for hid in qz_hid[1:]:
            l_qz_axy = dense_layer(l_qz_axy, hid)
    l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(
        l_qz_axy, n_z, 1)

    # Generative p(a|z,y)
    l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1))
    l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0]))
    l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]])
    if batchnorm:
        l_pa_zy = BatchNormLayer(l_pa_zy)
    l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf)
    if len(pa_hid) > 1:
        for hid in pa_hid[1:]:
            l_pa_zy = dense_layer(l_pa_zy, hid)
    l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1)

    # Generative p(x|a,z,y)
    l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0]))
    l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1))
    l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0]))
    l_px_azy = ReshapeLayer(
        ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]])
    if batchnorm:
        l_px_azy = BatchNormLayer(l_px_azy)
    l_px_azy = NonlinearityLayer(l_px_azy, self.transf)
    if len(px_hid) > 1:
        for hid in px_hid[1:]:
            l_px_azy = dense_layer(l_px_azy, hid)

    # Output layer depends on the chosen distribution of x. Any other
    # x_dist value falls through without adding an output layer.
    if x_dist == 'bernoulli':
        l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), sigmoid)
    elif x_dist == 'multinomial':
        l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), softmax)
    elif x_dist == 'gaussian':
        l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(
            l_px_azy, n_x, 1, px_nonlinearity)

    # Reshape all the model layers to have the same size
    self.l_x_in = l_x_in
    self.l_y_in = l_y_in
    self.l_a_in = l_qa_x

    self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a))
    self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1))
    self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1))

    self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z))
    self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z))
    self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z))

    self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y))

    self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a))
    self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a))
    self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a))

    self.l_px = ReshapeLayer(l_px_azy, (-1, self.sym_samples, 1, n_x))
    # l_px_zy_mu / l_px_zy_logvar only exist in the 'gaussian' case; the
    # conditional expressions avoid touching them otherwise.
    self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None
    self.l_px_logvar = ReshapeLayer(
        l_px_zy_logvar, (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None

    # Predefined functions
    # f_qy / f_qa: deterministic outputs of q(y|a,x) and q(a|x) for the
    # labeled-input variable, averaged over axes 1 and 2.
    inputs = [self.sym_x_l, self.sym_samples]
    outputs = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    self.f_qy = theano.function(inputs, outputs)

    inputs = [self.sym_x_l, self.sym_samples]
    outputs = get_output(self.l_qa, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    self.f_qa = theano.function(inputs, outputs)

    # f_pa: p(a|z,y) with z and y supplied directly in place of the
    # corresponding layers' outputs.
    inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
    outputs = get_output(self.l_pa, inputs, deterministic=True)
    self.f_pa = theano.function(
        [self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    # f_px: p(x|a,z,y) with a, z and y supplied directly.
    inputs = {
        l_qa_x: self.sym_a,
        l_qz_axy: self.sym_z,
        l_y_in: self.sym_t_l
    }
    outputs = get_output(self.l_px, inputs, deterministic=True)
    self.f_px = theano.function(
        [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    # Define model parameters
    self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
    self.trainable_model_params = get_all_params(
        [self.l_qy, self.l_pa, self.l_px], trainable=True)
def __theano_build__(self):
    """
    Build the symbolic graph for a sentence-pair classifier and compile
    its Theano functions.

    Each of the two index vectors (x_a, x_b) is scanned in forward and
    reversed order, the two state sequences are concatenated, every
    position is weighted by an attention score, and the two summed
    sentence vectors are concatenated and fed to a softmax.

    Compiled functions stored on the instance: ``comsen``, ``monitor``,
    ``monitor_grad``, ``predict``, ``predict_class``, ``ce_error`` and
    ``sgd_step``.
    """
    E, V, U, W, b, c, W_att, b_att = self.E, self.V, self.U, self.W, self.b , self.c, self.W_att, self.b_att

    # Symbolic inputs: word indices of sentence a and b, and the label.
    x_a = T.ivector('x_a')
    x_b = T.ivector('x_b')
    y = T.lvector('y')

    def forward_direction_step(x_t,s_t_prev):
        # One recurrent step using parameter slices 0-2.
        # Word embedding layer
        x_e = E[:,x_t]
        # GRU layer 1
        # NOTE(review): b[0] and b[1] are added outside hard_sigmoid while
        # b[2] is inside tanh - confirm this is intended.
        z_t = T.nnet.hard_sigmoid(U[0].dot(x_e)+W[0].dot(s_t_prev)) + b[0]
        r_t = T.nnet.hard_sigmoid(U[1].dot(x_e)+W[1].dot(s_t_prev)) + b[1]
        c_t = T.tanh(U[2].dot(x_e)+W[2].dot(s_t_prev*r_t)+b[2])
        s_t = (T.ones_like(z_t) - z_t) * c_t + z_t*s_t_prev
        # directly return the hidden state as intermediate output
        return [s_t]

    def backward_direction_step(x_t,s_t_prev):
        # Same step as above, using parameter slices 3-5.
        # Word embedding layer
        x_e = E[:,x_t]
        # GRU layer 2
        # NOTE(review): same bias placement as in forward_direction_step.
        z_t = T.nnet.hard_sigmoid(U[3].dot(x_e)+W[3].dot(s_t_prev)) + b[3]
        r_t = T.nnet.hard_sigmoid(U[4].dot(x_e)+W[4].dot(s_t_prev)) + b[4]
        c_t = T.tanh(U[5].dot(x_e)+W[5].dot(s_t_prev*r_t)+b[5])
        s_t = (T.ones_like(z_t) - z_t) * c_t + z_t*s_t_prev
        # directly return the hidden state as intermediate output
        return [s_t]

    # sentence a vector (states) forward direction
    a_s_f , updates = theano.scan(
        forward_direction_step,
        sequences=x_a,
        truncate_gradient=self.bptt_truncate,
        outputs_info=T.zeros(self.hidden_dim))

    # sentence a vector (states) backward direction
    a_s_b , updates = theano.scan(
        backward_direction_step,
        sequences=x_a[::-1],
        truncate_gradient=self.bptt_truncate,
        outputs_info=T.zeros(self.hidden_dim))

    # sentence b vector (states) forward direction
    b_s_f , updates = theano.scan(
        forward_direction_step,
        sequences=x_b,
        truncate_gradient=self.bptt_truncate,
        outputs_info=T.zeros(self.hidden_dim))

    # sentence b vector (states) backward direction
    b_s_b , updates = theano.scan(
        backward_direction_step,
        sequences=x_b[::-1],
        truncate_gradient=self.bptt_truncate,
        outputs_info=T.zeros(self.hidden_dim))

    # Combine both directions per position; the backward states are
    # reversed again so they line up with the forward ones.
    a_s = T.concatenate([a_s_f,a_s_b[::-1]],axis=1)
    b_s = T.concatenate([b_s_f,b_s_b[::-1]],axis=1)

    def soft_attention(h_i):
        # Unnormalised attention score for one position. `b_att` is read
        # from the enclosing scope when this function is called.
        return T.tanh(W_att.dot(h_i)+b_att)

    def weight_attention(h_i,a_j):
        # Scale one state by its attention weight.
        return h_i*a_j

    a_att, updates = theano.scan(
        soft_attention,
        sequences=a_s
        )
    # NOTE(review): from here on the local name `b_att` holds the scores
    # for sentence b rather than the attention bias unpacked above;
    # soft_attention is not called again after this statement.
    b_att, updates = theano.scan(
        soft_attention,
        sequences=b_s
        )

    # softmax over the positions of each sentence
    # a_att = (59,1)
    # b_att = (58,1)
    a_att = T.exp(a_att)
    a_att = a_att.flatten()
    a_att = a_att / a_att.sum()

    b_att = T.exp(b_att)
    b_att = b_att.flatten()
    b_att = b_att / b_att.sum()

    a_s_att,updates = theano.scan(
        weight_attention,
        sequences=[a_s,a_att]
        )
    b_s_att,updates = theano.scan(
        weight_attention,
        sequences=[b_s,b_att]
        )

    # eps = np.asarray([1.0e-10]*self.label_dim,dtype=theano.config.floatX)
    # semantic similarity
    # s_sim = manhattan_distance(a_s[-1],b_s[-1])

    # for classification using simple strategy
    # for now we still use the last word vector as sentence vector
    # apply a simple single hidden layer on each word in sentence
    #
    # a (wi) = attention(wi) = tanh(w_att.dot(wi)+b)
    # theano scan
    # exp(a)
    #
    # Sentence vectors: attention-weighted states summed over positions.
    sena = a_s_att.sum(axis=0)
    senb = b_s_att.sum(axis=0)

    combined_s = T.concatenate([sena,senb],axis=0)

    # softmax class
    o = T.nnet.softmax(V.dot(combined_s)+c)[0]

    # in case the o contains 0 which cause inf and nan
    eps = np.asarray([1.0e-10]*self.label_dim,dtype=theano.config.floatX)
    o = o + eps
    # Reshape to a single row so it matches the one-element label vector.
    om = o.reshape((1,o.shape[0]))
    prediction = T.argmax(om,axis=1)
    o_error = T.nnet.categorical_crossentropy(om,y)

    # cost
    cost = T.sum(o_error)

    # updates (rebinds `updates`, discarding the values returned by the
    # scan calls above)
    updates = sgd_updates_adadelta(norm=0,params=self.params,cost=cost)

    # monitor parameter
    mV = V * T.ones_like(V)
    mc = c * T.ones_like(c)
    mU = U * T.ones_like(U)
    mW = W * T.ones_like(W)

    gV = T.grad(cost,V)
    gc = T.grad(cost,c)
    gU = T.grad(cost,U)
    gW = T.grad(cost,W)

    mgV = gV * T.ones_like(gV)
    mgc = gc * T.ones_like(gc)
    mgU = gU * T.ones_like(gU)
    mgW = gW * T.ones_like(gW)

    # Assign functions
    self.comsen = theano.function([x_a,x_b],[a_att,b_att])
    self.monitor = theano.function([x_a,x_b],[sena,senb,mV,mc,mU,mW])
    self.monitor_grad = theano.function([x_a,x_b,y],[mgV,mgc,mgU,mgW])
    self.predict = theano.function([x_a,x_b],om)
    self.predict_class = theano.function([x_a,x_b],prediction)
    self.ce_error = theano.function([x_a,x_b,y],cost)
    # self.bptt = theano.function([x,y],[dE,dU,dW,db,dV,dc])

    # SGD parameters
    # NOTE(review): learning_rate and decay are created but not used by
    # anything below - confirm whether they can be dropped.
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # rmsprop cache updates
    # find the nan
    # sgd_step has no outputs; calling it applies the updates computed
    # above.
    self.sgd_step = theano.function(
        [x_a,x_b,y],
        [],
        updates=updates
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
        )
def fit(self, X_train, Y_train, X_test=None, Y_test=None,
        validate_every=100, optimizer='sgd', compute_zero_one=False,
        show_norms=True, show_output=True):
    """Fit the model, optionally reporting test error while training.

    Pass in X_test, Y_test to compute test error and report during
    training.

    X_train : ndarray (T x n_in)
    Y_train : ndarray (T x n_out)
        NOTE(review): after `shared_dataset` the code treats axis 1 as
        the case (example) axis -- confirm the expected input layout.

    validate_every : int
        For 'sgd': number of epochs between validation reports.
        For 'cg' / 'bfgs': number of optimizer callbacks between
        validation reports.
        For 'l_bfgs_b': forwarded to scipy as `iprint`.

    optimizer : string
        Optimizer type.
        Possible values:
            'sgd'  : batch stochastic gradient descent
            'cg'   : nonlinear conjugate gradient algorithm
                     (scipy.optimize.fmin_cg)
            'bfgs' : quasi-Newton method of Broyden, Fletcher, Goldfarb,
                     and Shanno (scipy.optimize.fmin_bfgs)
            'l_bfgs_b' : Limited-memory BFGS
                     (scipy.optimize.fmin_l_bfgs_b)

    compute_zero_one : bool
        in the case of binary output, compute zero-one error in addition
        to cross-entropy error
    show_norms : bool
        Show L2 norms of individual parameter groups while training.
    show_output : bool
        Show the model output on first training case while training.

    Raises NotImplementedError for any other `optimizer` value.
    """
    if X_test is not None:
        assert (Y_test is not None)
        self.interactive = True
        test_set_x, test_set_y = self.shared_dataset((X_test, Y_test))
    else:
        self.interactive = False

    train_set_x, train_set_y = self.shared_dataset((X_train, Y_train))

    if compute_zero_one:
        assert (self.output_type == 'binary'
                or self.output_type == 'softmax')

    # compute number of minibatches for training
    # note that cases are the second dimension, not the first
    n_train = train_set_x.get_value(borrow=True).shape[1]
    n_train_batches = int(np.ceil(1.0 * n_train / self.batch_size))
    if self.interactive:
        n_test = test_set_x.get_value(borrow=True).shape[1]
        n_test_batches = int(np.ceil(1.0 * n_test / self.batch_size))

    # validate_every is specified in terms of epochs
    validation_frequency = validate_every * n_train_batches

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    logger.info('... building the model')

    index = T.lscalar('index')    # index to a [mini]batch
    n_ex = T.lscalar('n_ex')      # total number of examples
    # learning rate (may change)
    l_r = T.scalar('l_r', dtype=theano.config.floatX)
    mom = T.scalar('mom', dtype=theano.config.floatX)  # momentum

    cost = (self.rnn.loss(self.y)
            + self.L1_reg * self.rnn.L1
            + self.L2_reg * self.rnn.L2_sqr)

    # Proper implementation of variable-batch size evaluation
    # Note that classifier.errors() returns the mean error
    # But the last batch may be a smaller size
    # So we keep around the effective_batch_size (whose last element may
    # be smaller than the rest)
    # And weight the reported error by the batch_size when we average
    # Also, by keeping batch_start and batch_stop as symbolic variables,
    # we make the theano function easier to read
    batch_start = index * self.batch_size
    batch_stop = T.minimum(n_ex, (index + 1) * self.batch_size)
    effective_batch_size = batch_stop - batch_start

    get_batch_size = theano.function(inputs=[index, n_ex],
                                     outputs=effective_batch_size)

    def batch_givens(set_x, set_y):
        # Substitute the model inputs by the current minibatch slice
        # (cases live on axis 1).
        return {self.x: set_x[:, batch_start:batch_stop],
                self.y: set_y[:, batch_start:batch_stop]}

    compute_train_error = theano.function(
        inputs=[index, n_ex],
        outputs=self.rnn.loss(self.y),
        givens=batch_givens(train_set_x, train_set_y),
        mode=mode)

    if compute_zero_one:
        compute_train_zo = theano.function(
            inputs=[index, n_ex],
            outputs=self.rnn.errors(self.y),
            givens=batch_givens(train_set_x, train_set_y),
            mode=mode)

    if self.interactive:
        compute_test_error = theano.function(
            inputs=[index, n_ex],
            outputs=self.rnn.loss(self.y),
            givens=batch_givens(test_set_x, test_set_y),
            mode=mode)

        if compute_zero_one:
            compute_test_zo = theano.function(
                inputs=[index, n_ex],
                outputs=self.rnn.errors(self.y),
                givens=batch_givens(test_set_x, test_set_y),
                mode=mode)

    self.get_norms = {}
    for param in self.rnn.params:
        self.get_norms[param] = theano.function(
            inputs=[], outputs=self.rnn.l2_norms[param], mode=mode)

    # compute the gradient of cost with respect to theta using BPTT
    gtheta = T.grad(cost, self.rnn.theta)

    def weighted_average(batch_fn, n_examples, n_batches, sizes,
                         axis=None):
        # Average a per-batch quantity over all batches, weighting each
        # batch by its effective size (the last batch may be smaller).
        values = [batch_fn(i, n_examples) for i in xrange(n_batches)]
        return np.average(values, weights=sizes, axis=axis)

    def evaluate():
        # Returns (train_loss, train_zo, test_loss, test_zo); entries
        # that were not requested / are not available are None.
        train_sizes = [get_batch_size(i, n_train)
                       for i in xrange(n_train_batches)]
        train_loss = weighted_average(compute_train_error, n_train,
                                      n_train_batches, train_sizes)
        train_zo = test_loss = test_zo = None
        if compute_zero_one:
            train_zo = weighted_average(compute_train_zo, n_train,
                                        n_train_batches, train_sizes)
        if self.interactive:
            test_sizes = [get_batch_size(i, n_test)
                          for i in xrange(n_test_batches)]
            test_loss = weighted_average(compute_test_error, n_test,
                                         n_test_batches, test_sizes)
            if compute_zero_one:
                test_zo = weighted_average(compute_test_zo, n_test,
                                           n_test_batches, test_sizes)
        return train_loss, train_zo, test_loss, test_zo

    if optimizer == 'sgd':
        updates = {}
        theta = self.rnn.theta
        theta_update = self.rnn.theta_update
        # careful here, update to the shared variable
        # cannot depend on an updated other shared variable
        # since updates happen in parallel
        # so we need to be explicit
        upd = mom * theta_update - l_r * gtheta
        updates[theta_update] = upd
        updates[theta] = theta + upd

        # compiling a Theano function `train_model` that returns the
        # cost, but in the same time updates the parameter of the
        # model based on the rules defined in `updates`
        train_model = theano.function(
            inputs=[index, n_ex, l_r, mom],
            outputs=cost,
            updates=updates,
            givens=batch_givens(train_set_x, train_set_y),
            mode=mode)

        ###############
        # TRAIN MODEL #
        ###############
        logger.info('... training')
        epoch = 0

        while (epoch < self.n_epochs):
            epoch = epoch + 1
            effective_momentum = (self.final_momentum
                                  if epoch > self.momentum_switchover
                                  else self.initial_momentum)

            for minibatch_idx in xrange(n_train_batches):
                # The returned minibatch cost is not used; the call is
                # made for its parameter updates.
                train_model(minibatch_idx, n_train,
                            self.learning_rate, effective_momentum)

                # iteration number (how many weight updates have we made?)
                # epoch is 1-based, index is 0 based
                iteration = ((epoch - 1) * n_train_batches
                             + minibatch_idx + 1)

                if iteration % validation_frequency != 0:
                    continue

                train_loss, train_zo, test_loss, test_zo = evaluate()

                if self.interactive:
                    if compute_zero_one:
                        logger.info('epoch %i, mb %i/%i, tr loss %f, '
                                    'tr zo %f, te loss %f '
                                    'te zo %f lr: %f',
                                    epoch, minibatch_idx + 1,
                                    n_train_batches, train_loss,
                                    train_zo, test_loss, test_zo,
                                    self.learning_rate)
                    else:
                        logger.info('epoch %i, mb %i/%i, tr loss %f '
                                    'te loss %f lr: %f',
                                    epoch, minibatch_idx + 1,
                                    n_train_batches, train_loss,
                                    test_loss, self.learning_rate)
                else:
                    if compute_zero_one:
                        logger.info('epoch %i, mb %i/%i, train loss %f'
                                    ' train zo %f '
                                    'lr: %f',
                                    epoch, minibatch_idx + 1,
                                    n_train_batches, train_loss,
                                    train_zo, self.learning_rate)
                    else:
                        logger.info('epoch %i, mb %i/%i, train loss %f'
                                    ' lr: %f',
                                    epoch, minibatch_idx + 1,
                                    n_train_batches, train_loss,
                                    self.learning_rate)

                self.optional_output(train_set_x, show_norms,
                                     show_output)

            self.learning_rate *= self.learning_rate_decay

            if self.snapshot_every is not None:
                # NOTE(review): `epoch` is already 1-based here, yet the
                # test and the file name both use `epoch + 1` -- confirm
                # this offset is intended.
                if (epoch + 1) % self.snapshot_every == 0:
                    date_obj = datetime.datetime.now()
                    date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S')
                    class_name = self.__class__.__name__
                    fname = '%s.%s-snapshot-%d.pkl' % (class_name,
                                                       date_str,
                                                       epoch + 1)
                    fabspath = os.path.join(self.snapshot_path, fname)
                    self.save(fpath=fabspath)

    elif optimizer in ('cg', 'bfgs', 'l_bfgs_b'):
        # compile a theano function that returns the cost of a minibatch
        batch_cost = theano.function(
            inputs=[index, n_ex],
            outputs=cost,
            givens=batch_givens(train_set_x, train_set_y),
            mode=mode, name="batch_cost")

        # compile a theano function that returns the gradient of the
        # minibatch with respect to theta
        batch_grad = theano.function(
            inputs=[index, n_ex],
            outputs=gtheta,
            givens=batch_givens(train_set_x, train_set_y),
            mode=mode, name="batch_grad")

        def train_batch_sizes():
            return [get_batch_size(i, n_train)
                    for i in xrange(n_train_batches)]

        # creates a function that computes the average cost on the
        # training set
        def train_fn(theta_value):
            self.rnn.theta.set_value(theta_value, borrow=True)
            return weighted_average(batch_cost, n_train,
                                    n_train_batches, train_batch_sizes())

        # creates a function that computes the average gradient of cost
        # with respect to theta
        def train_fn_grad(theta_value):
            self.rnn.theta.set_value(theta_value, borrow=True)
            return weighted_average(batch_grad, n_train,
                                    n_train_batches, train_batch_sizes(),
                                    axis=0)

        # validation function, prints useful output after each iteration
        def callback(theta_value):
            self.epoch += 1
            if self.epoch % validate_every != 0:
                return

            self.rnn.theta.set_value(theta_value, borrow=True)
            train_loss, train_zo, test_loss, test_zo = evaluate()

            if self.interactive:
                if compute_zero_one:
                    logger.info('epoch %i, tr loss %f, '
                                'tr zo %f, te loss %f '
                                'te zo %f',
                                self.epoch, train_loss, train_zo,
                                test_loss, test_zo)
                else:
                    # Exactly three arguments for three placeholders:
                    # passing the learning rate as a fourth argument
                    # made the %-formatting raise TypeError.
                    logger.info('epoch %i, tr loss %f, te loss %f',
                                self.epoch, train_loss, test_loss)
            else:
                if compute_zero_one:
                    logger.info('epoch %i, train loss %f'
                                ', train zo %f ',
                                self.epoch, train_loss, train_zo)
                else:
                    logger.info('epoch %i, train loss %f ',
                                self.epoch, train_loss)

            self.optional_output(train_set_x, show_norms, show_output)

        ###############
        # TRAIN MODEL #
        ###############
        logger.info('... training')

        # using scipy conjugate gradient optimizer
        import scipy.optimize
        if optimizer == 'cg':
            of = scipy.optimize.fmin_cg
        elif optimizer == 'bfgs':
            of = scipy.optimize.fmin_bfgs
        elif optimizer == 'l_bfgs_b':
            of = scipy.optimize.fmin_l_bfgs_b
        logger.info("Optimizing using %s...", of.__name__)
        # NOTE: time.clock() exists on Python 2 (which this file
        # targets) but was removed in Python 3.8.
        start_time = time.clock()

        # keep track of epochs externally
        # these get updated through callback
        self.epoch = 0

        # interface to l_bfgs_b is different than that of cg, bfgs
        # however, this will be changed in scipy 0.11
        # unified under scipy.optimize.minimize
        #
        # The optimizer's return value is not used: the model keeps the
        # theta last written by train_fn / train_fn_grad / callback.
        if optimizer == 'cg' or optimizer == 'bfgs':
            of(f=train_fn,
               x0=self.rnn.theta.get_value(),
               fprime=train_fn_grad,
               callback=callback,
               disp=1,
               retall=1,
               maxiter=self.n_epochs)
        elif optimizer == 'l_bfgs_b':
            of(func=train_fn,
               x0=self.rnn.theta.get_value(),
               fprime=train_fn_grad,
               iprint=validate_every,
               maxfun=self.n_epochs)    # max number of feval

        end_time = time.clock()

        print("Optimization time: %f" % (end_time - start_time))

    else:
        raise NotImplementedError
def build_model(self, train_set_unlabeled, train_set_labeled, test_set,
                validation_set=None):
    """
    Build the auxiliary deep generative model from the initialized
    hyperparameters. Define the lower bound term and compile it into a
    training function.

    :param train_set_unlabeled: Unlabeled train set containing variables x, t.
    :param train_set_labeled: Labeled train set containing variables x, t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation function and dicts of arguments.
        The validation function is None when no validation set is given.
    :raises ValueError: if ``self.x_dist`` is not one of 'bernoulli',
        'multinomial' or 'gaussian'.
    """
    super(SDGM, self).build_model(train_set_unlabeled, test_set,
                                  validation_set)

    sh_train_x_l = theano.shared(
        np.asarray(train_set_labeled[0], dtype=theano.config.floatX),
        borrow=True)
    sh_train_t_l = theano.shared(
        np.asarray(train_set_labeled[1], dtype=theano.config.floatX),
        borrow=True)
    # no. of data points
    n = self.sh_train_x.shape[0].astype(theano.config.floatX)
    # no. of labeled data points
    n_l = sh_train_x_l.shape[0].astype(theano.config.floatX)

    # Define the layers for the density estimation used in the lower bound.
    l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu,
                                       self.l_qa_logvar)
    l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu,
                                       self.l_qz_logvar)
    l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)

    l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
    l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu,
                                       self.l_pa_logvar)
    if self.x_dist == 'bernoulli':
        l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
    elif self.x_dist == 'multinomial':
        l_log_px = MultinomialLogDensityLayer(self.l_px, self.l_x_in)
    elif self.x_dist == 'gaussian':
        l_log_px = GaussianLogDensityLayer(self.l_x_in, self.l_px_mu,
                                           self.l_px_logvar)
    else:
        # Fail here with a clear message instead of an unbound-local
        # error further down when l_log_px is first used.
        raise ValueError("Unsupported x_dist: %r" % (self.x_dist,))

    def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
        lb = log_px + log_py + log_pz + log_pa - log_qa - log_qz
        return lb

    # Lower bound for labeled data
    out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px,
                  l_log_qy]
    inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
    out = get_output(out_layers, inputs,
                     batch_norm_update_averages=False,
                     batch_norm_use_averages=False)
    (log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l,
     log_qy_ax_l) = out

    # Prior p(y) expecting that all classes are evenly distributed
    py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
    log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l,
                       log_py_l, log_px_zy_l)
    lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
    # Scale the supervised cross entropy with the beta weight times the
    # ratio of all data points to labeled data points.
    log_qy_ax_l *= (self.sym_beta * (n / n_l))
    # Collect the lower bound term and mean over sampling dimensions
    lb_l += log_qy_ax_l.mean(axis=(1, 2))

    # Lower bound for unlabeled data
    bs_u = self.sym_x_u.shape[0]

    # For the integrating out approach, we repeat the input matrix x, and
    # construct a target (bs * n_y) x n_y
    # Example of input and target matrix for a 3 class problem and
    # batch_size=2. 2D tensors of the form
    #               x_repeat                     t_repeat
    #  [[x[0,0], x[0,1], ..., x[0,n_x]]         [[1, 0, 0]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]          [1, 0, 0]
    #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 1, 0]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]          [0, 1, 0]
    #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 0, 1]
    #   [x[1,0], x[1,1], ..., x[1,n_x]]]         [0, 0, 1]]
    t_eye = T.eye(self.n_y, k=0)
    t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(
        bs_u, axis=1).reshape((-1, self.n_y))
    x_u = self.sym_x_u.reshape((1, bs_u, self.n_x)).repeat(
        self.n_y, axis=0).reshape((-1, self.n_x))

    # Since the expectation of var a is outside the integration we
    # calculate E_q(a|x) first
    a_x_u = get_output(self.l_qa, self.sym_x_u,
                       batch_norm_update_averages=True,
                       batch_norm_use_averages=False)
    a_x_u_rep = a_x_u.reshape(
        (1, bs_u * self.sym_samples, self.n_a)).repeat(
            self.n_y, axis=0).reshape((-1, self.n_a))
    out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
    inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
    out = get_output(out_layers, inputs,
                     batch_norm_update_averages=False,
                     batch_norm_use_averages=False)
    log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out

    # Prior p(y) expecting that all classes are evenly distributed
    py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
    log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
        (-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u,
                       log_py_u, log_px_zy_u)
    # Regroup the per-class rows so that the class axis comes last.
    lb_u = lb_u.reshape((self.n_y, 1, 1, bs_u)).transpose(
        3, 1, 2, 0).mean(axis=(1, 2))
    inputs = {self.l_x_in: self.sym_x_u,
              self.l_a_in: a_x_u.reshape((-1, self.n_a))}
    y_u = get_output(self.l_qy, inputs,
                     batch_norm_update_averages=True,
                     batch_norm_use_averages=False).mean(axis=(1, 2))
    y_u += 1e-8  # Ensure that we get no NANs when calculating the entropy
    y_u /= T.sum(y_u, axis=1, keepdims=True)
    lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)

    if self.batchnorm:
        # TODO: implement the BN layer correctly.
        # The result of this call is discarded.
        inputs = {self.l_x_in: self.sym_x_u, self.l_y_in: y_u,
                  self.l_a_in: a_x_u}
        get_output(out_layers, inputs, weighting=None,
                   batch_norm_update_averages=True,
                   batch_norm_use_averages=False)

    # Regularizing with weight priors p(theta|N(0,1)), collecting and
    # clipping gradients
    weight_priors = 0.0
    for p in self.trainable_model_params:
        # Only parameters whose string form contains 'W' get a prior.
        if 'W' not in str(p):
            continue
        weight_priors += log_normal(p, 0, 1).sum()

    # Collect the lower bound and scale it with the weight priors.
    elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
    lb_labeled = -lb_l.mean()
    lb_unlabeled = -lb_u.mean()

    grads_collect = T.grad(elbo, self.trainable_model_params)
    params_collect = self.trainable_model_params
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    clip_grad, max_norm = 1, 5
    # First constrain the total gradient norm, then clip element-wise.
    mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1,
                   sym_beta2)

    # Training function
    # Labeled minibatches are drawn at random without replacement;
    # unlabeled minibatches follow self.batch_slice.
    indices = self._srng.choice(size=[self.sym_bs_l],
                                a=sh_train_x_l.shape[0], replace=False)
    x_batch_l = sh_train_x_l[indices]
    t_batch_l = sh_train_t_l[indices]
    x_batch_u = self.sh_train_x[self.batch_slice]
    if self.x_dist == 'bernoulli':  # Sample bernoulli input.
        x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1,
                                        p=x_batch_u,
                                        dtype=theano.config.floatX)
        x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1,
                                        p=x_batch_l,
                                        dtype=theano.config.floatX)

    givens = {self.sym_x_l: x_batch_l,
              self.sym_x_u: x_batch_u,
              self.sym_t_l: t_batch_l}
    inputs = [self.sym_index, self.sym_batchsize, self.sym_bs_l,
              self.sym_beta, self.sym_lr, sym_beta1, sym_beta2,
              self.sym_samples]
    outputs = [elbo, lb_labeled, lb_unlabeled]
    f_train = theano.function(inputs=inputs, outputs=outputs,
                              givens=givens, updates=updates)

    # Default training args. Note that these can be changed during or
    # prior to training.
    self.train_args['inputs']['batchsize_unlabeled'] = 100
    self.train_args['inputs']['batchsize_labeled'] = 100
    self.train_args['inputs']['beta'] = 0.1
    self.train_args['inputs']['learningrate'] = 3e-4
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['inputs']['samples'] = 1
    self.train_args['outputs']['lb'] = '%0.4f'
    self.train_args['outputs']['lb-labeled'] = '%0.4f'
    self.train_args['outputs']['lb-unlabeled'] = '%0.4f'

    # Validation and test function
    y = get_output(self.l_qy, self.sym_x_l,
                   deterministic=True).mean(axis=(1, 2))
    # Classification error in percent.
    class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
    givens = {self.sym_x_l: self.sh_test_x,
              self.sym_t_l: self.sh_test_t}
    f_test = theano.function(inputs=[self.sym_samples],
                             outputs=[class_err], givens=givens)

    # Test args. Note that these can be changed during or prior to
    # training.
    self.test_args['inputs']['samples'] = 1
    self.test_args['outputs']['test'] = '%0.2f%%'

    f_validate = None
    if validation_set is not None:
        givens = {self.sym_x_l: self.sh_valid_x,
                  self.sym_t_l: self.sh_valid_t}
        f_validate = theano.function(inputs=[self.sym_samples],
                                     outputs=[class_err], givens=givens)
        # Default validation args. Note that these can be changed during
        # or prior to training.
        self.validate_args['inputs']['samples'] = 1
        self.validate_args['outputs']['validation'] = '%0.2f%%'

    return (f_train, f_test, f_validate, self.train_args, self.test_args,
            self.validate_args)