def GetClassifier(self):
    """Compile the Theano training function for the pairwise hinge loss.

    Returns a compiled ``theano.function`` that takes two int32 3-D tensors
    (the inputs and their mutations), returns the total loss, and applies
    one plain gradient-descent step to every entry of ``self.params``.
    """

    def PairwiseLoss(x, mutation):
        """Hinge loss for one slice taken along the leading axis.

        Evaluates ``max(0, 1 - F(x) + F(mutation))`` element-wise and sums
        the result into a single scalar.
        """
        margin = 1. - self.F(x) + self.F(mutation)
        return T.maximum(0., margin).sum()

    inputs = T.tensor3(name='input', dtype='int32')
    mutations = T.tensor3(name='mutations', dtype='int32')

    # scan walks the leading axis of both tensors in lockstep.  The update
    # dictionary it returns is discarded: only the gradient-descent updates
    # built below are handed to theano.function.
    per_slice_loss, _ = theano.scan(fn=PairwiseLoss,
                                    outputs_info=None,
                                    sequences=[inputs, mutations])
    loss = per_slice_loss.sum()

    gradients = [T.grad(loss, param) for param in self.params]
    sgd_updates = []
    for param, gradient in zip(self.params, gradients):
        sgd_updates.append((param, param - self.learning_rate * gradient))

    return theano.function(inputs=[inputs, mutations],
                           outputs=loss,
                           updates=sgd_updates)
def init_exprs(self):
    """Create the symbolic input/target variables and build ``self.exprs``.

    The target is a 3-D tensor when ``self.pooling`` is ``None`` and a
    matrix otherwise.  Per-layer parameters are looked up by name on
    ``self.parameters`` and forwarded, together with the transfer/loss
    settings stored on ``self``, to ``self.make_exprs``.
    """
    inpt = T.tensor3('inpt')
    # The matrix variant used to be labelled 'tensor3', which mislabelled
    # the target in graph printouts; both variants are now named 'target'.
    if self.pooling is None:
        target = T.tensor3('target')
    else:
        target = T.matrix('target')

    pars = self.parameters
    n_layers = len(self.n_hiddens)

    def per_layer(prefix, count):
        """Return ``[pars.<prefix>_0, ..., pars.<prefix>_<count-1>]``."""
        return [getattr(pars, '%s_%i' % (prefix, i)) for i in range(count)]

    # There is one fewer hidden-to-hidden connection than hidden layers.
    hidden_to_hiddens = per_layer('hidden_to_hidden', n_layers - 1)
    hidden_biases = per_layer('hidden_bias', n_layers)
    recurrents = per_layer('recurrent', n_layers)
    ingate_peepholes = per_layer('ingate_peephole', n_layers)
    outgate_peepholes = per_layer('outgate_peephole', n_layers)
    forgetgate_peepholes = per_layer('forgetgate_peephole', n_layers)

    self.exprs = self.make_exprs(
        inpt, target, pars.in_to_hidden, hidden_to_hiddens,
        pars.hidden_to_out, hidden_biases, recurrents, pars.out_bias,
        ingate_peepholes, outgate_peepholes, forgetgate_peepholes,
        self.hidden_transfers, self.out_transfer, self.loss, self.pooling,
        self.leaky_coeffs)
def _setup_vars(self, sparse_input):
    '''Create the symbolic variables used by this recurrent network.

    Parameters
    ----------
    sparse_input : bool
        Must be falsy; an assertion fails otherwise because sparse inputs
        are not supported for recurrent models.

    Returns
    -------
    vars : list of theano variables
        ``[x, targets]``, with ``weights`` appended when ``self.weighted``
        is truthy.
    '''
    _warn_dimshuffle()

    assert not sparse_input, 'Theanets does not support sparse recurrent models!'

    # All three are 3-D tensors.  According to the original author's notes
    # the axes are (time, minibatch element, variable within a frame), the
    # targets hold the correct outputs for a regressor, and the weights have
    # the same shape as the output and scale each entry of the error.
    self.x = TT.tensor3('x')
    self.targets = TT.tensor3('targets')
    self.weights = TT.tensor3('weights')

    variables = [self.x, self.targets]
    if self.weighted:
        variables.append(self.weights)
    return variables
def __init__(self, rng, dim_proj, W=None, U=None, b=None):
    """Declare the symbolic inputs and derive the pooled hidden states.

    Parameters are forwarded to ``self._init_params`` (with a trailing
    constant ``5``).  The hidden states and memory cells come from
    ``self.project``; max/sum/mean pooled and "top" hidden states are then
    exposed as attributes.
    """
    self._init_params(rng, dim_proj, W, U, b, 5)

    word_matrix = T.tensor3('Word matrix', dtype=config.floatX)
    c_mask = T.matrix('Child mask', dtype=config.floatX)
    node_mask = T.matrix('Node mask', dtype=config.floatX)
    children = T.tensor3('Children', dtype='int64')

    self.X = word_matrix
    self.mask = node_mask
    self.c_mask = c_mask
    self.input = [word_matrix, children, c_mask, node_mask]

    n_samples = word_matrix.shape[1]
    self.h, self.c_memory = self.project(word_matrix, children, c_mask)
    all_samples = T.arange(n_samples)

    # Zero out the hidden states of masked nodes before pooling over axis 0.
    masked_h = self.h * node_mask[:, :, None]
    self.max_pooled_h = masked_h.max(axis=0)
    self.sum_pooled_h = masked_h.sum(axis=0)

    # NOTE(review): the mean divides by the child-mask count (clamped to at
    # least 1), not by the node-mask count used for the sum -- confirm this
    # is intended.
    child_counts = c_mask.sum(axis=0)
    self.mean_pooled_h = self.sum_pooled_h / T.maximum(child_counts[:, None], 1)

    # Index of the last node per sample is derived as 2 * (#children) + 1 - 1.
    num_inner_nodes = child_counts.astype('int64')
    num_nodes = num_inner_nodes * 2 + 1
    self.top_h = self.h[num_nodes - 1, all_samples, :]
def __init__(self, inpShape, outputNum, clip):
    """Build an LSTM -> dense network and compile train/predict functions.

    ``inpShape`` is passed straight to ``InputLayer``, ``outputNum`` is the
    width of the dense output layer, and ``clip`` is the maximum total
    gradient norm handed to ``total_norm_constraint``.
    """
    num_units = 256

    self.l_inp = InputLayer(inpShape)
    # Symbolic sizes of the first two input axes; used to undo the
    # flattening after the dense layer.
    batchsize, seqlen, _ = self.l_inp.input_var.shape

    self.l_lstm = LSTMLayer(self.l_inp, num_units=num_units)

    # Collapse the two leading axes so the dense layer sees one row per
    # (sequence, time step) pair, then restore the original leading axes.
    flattened = ReshapeLayer(self.l_lstm, (-1, num_units))
    self.l_dense = DenseLayer(flattened, num_units=outputNum)
    self.l_out = ReshapeLayer(self.l_dense, (batchsize, seqlen, outputNum))

    net_output = lasagne.layers.get_output(self.l_out)

    truth = T.tensor3()
    mask = T.tensor3()
    # Masked mean squared error.
    loss = T.mean(mask * (net_output - truth) ** 2)

    params = lasagne.layers.get_all_params(self.l_out)
    raw_grads = T.grad(loss, params)
    grads = lasagne.updates.total_norm_constraint(raw_grads, clip)
    update = lasagne.updates.rmsprop(grads, params, 0.002)

    network_input = self.l_inp.input_var
    self.train = theano.function([network_input, truth, mask], loss, updates=update)
    self.get_output = theano.function([network_input], outputs=net_output)
def main():
    """Load a trained model and score every question's answer choices.

    Features are loaded through ``iodata.iodata_forPre``, the saved
    parameters are restored into the network built by ``build_model``, and
    the compiled ``predict`` function is run on one block of
    ``NUM_CHOICES`` rows per question.  Results are accumulated in a
    ``(QUESTION_SIZE, NUM_CHOICES)`` array.
    """
    pars = "model/4GRAM_BI/76.69"
    print("Loading data...")
    feats_in, feats_out = iodata.iodata_forPre()
    feats_in = np.array(feats_in).astype(theano.config.floatX)

    out_shape = (QUESTION_SIZE * (NGRAMS + 1) * NUM_CHOICES, 1, WORD_2_VEC_FEATURES)
    print("{}".format(out_shape))
    feats_out = np.array(feats_out).astype(theano.config.floatX).reshape(out_shape)

    output_layer = build_model(bi_directional=True)
    # Pickles are binary data: open in "rb" and close the handle promptly.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(pars, "rb") as param_file:
        network.layers.set_all_param_values(output_layer, pickle.load(param_file))

    x = T.tensor3('x', dtype=theano.config.floatX)
    y = T.tensor3('y', dtype=theano.config.floatX)
    cos_distance_ls = np.zeros((QUESTION_SIZE, NUM_CHOICES))
    predict = theano.function(
        [x, y],
        calculate_cos_dis(output_layer.get_output(x, deterministic=True), y),
        on_unused_input='ignore')

    for index in range(QUESTION_SIZE):
        rows = slice(index * NUM_CHOICES, (index + 1) * NUM_CHOICES)
        in_batch = feats_in[rows]
        out_batch = feats_out[rows]
        print(in_batch.shape)
        print(out_batch.shape)
        try:
            pred = predict(in_batch, out_batch)
        except RuntimeError as err:
            # Previously the error was swallowed and execution fell through
            # to use `pred`, which was either undefined (first question) or
            # left over from the previous question.  Skip the question
            # instead and leave its row at zero.
            print("Skipping question {}: {}".format(index, err))
            continue
        print("OHOHOH")
        cos_distance_ls[index, :] = cos_distance_ls[index, :] + pred
def init_model(self):
    """Build the network and compile the train/validation/run functions.

    Sets ``self.train_fn`` (returns loss and prediction, applies SGD
    updates), ``self.val_fn`` (returns deterministic loss, accuracy and
    prediction) and ``self.run_fn`` (returns the prediction only).
    """
    print('Initializing model...')
    ra_input_var = T.tensor3('raw_audio_input')
    mc_input_var = T.tensor3('melody_contour_input')
    target_var = T.imatrix('targets')

    network = self.build_network(ra_input_var, mc_input_var)

    # Training graph: the output is clipped away from 0 and 1 before the
    # cross-entropy is taken.
    prediction = T.clip(layers.get_output(network), 1e-7, 1.0 - 1e-7)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var).mean()

    params = layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.sgd(loss, params, learning_rate=0.02)

    # Evaluation graph (deterministic pass).
    # NOTE(review): unlike the training output, this one is not clipped.
    test_prediction = layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var).mean()
    predicted_class = T.argmax(test_prediction, axis=1)
    target_class = T.argmax(target_var, axis=1)
    test_acc = T.mean(T.eq(predicted_class, target_class), dtype=theano.config.floatX)

    print('Building functions...')
    supervised_inputs = [ra_input_var, mc_input_var, target_var]
    self.train_fn = theano.function(supervised_inputs,
                                    [loss, prediction],
                                    updates=updates,
                                    on_unused_input='ignore')
    self.val_fn = theano.function(supervised_inputs,
                                  [test_loss, test_acc, test_prediction],
                                  on_unused_input='ignore')
    # NOTE(review): run_fn returns the non-deterministic `prediction`
    # rather than `test_prediction` -- confirm this is intended.
    self.run_fn = theano.function([ra_input_var, mc_input_var],
                                  [prediction],
                                  on_unused_input='ignore')
def build_decoder(tparams, options):
    """
    Build the text encoder followed by the hypothesis decoder, given
    pre-computed word embeddings.

    Returns the four symbolic inputs (text embedding, text mask, hypothesis
    embedding, hypothesis mask) and the final decoder state.
    """
    # Symbolic inputs: 3-D float32 embeddings with matching 2-D masks.
    text_embedding = tensor.tensor3('text_embedding', dtype='float32')
    text_mask = tensor.matrix('text_mask', dtype='float32')
    hypothesis_embedding = tensor.tensor3('hypothesis_embedding', dtype='float32')
    hypothesis_mask = tensor.matrix('hypothesis_mask', dtype='float32')

    # Encoder over the text; the last entry of its first output becomes the
    # context handed to the decoder.
    encode = get_layer(options['encoder'])[1]
    proj = encode(tparams, text_embedding, None, options,
                  prefix='encoder', mask=text_mask)
    dec_ctx = proj[0][-1]

    # Decoder over the hypothesis, conditioned on that context.
    decode = get_layer(options['decoder'])[1]
    proj_hypo = decode(tparams, hypothesis_embedding, dec_ctx, options,
                       prefix='decoder_f', mask=hypothesis_mask)
    hypo_ctx = proj_hypo[0][-1]

    return text_embedding, text_mask, hypothesis_embedding, hypothesis_mask, hypo_ctx
def build_model_EvoMN(options, tparams):
    """Build the symbolic graph and compile the helper functions.

    Returns, in order: the ``use_noise`` and ``use_linear`` shared flags,
    the symbolic inputs ``x, xmask, q, qmask, y, nhops, wmat``, the final
    projection, the softmax prediction, its argmax, the summed negative
    log-likelihood cost, and the compiled ``f_debug``, ``f_pred`` and
    ``f_ans`` functions.
    """
    trng = RandomStreams(SEED)
    use_noise = theano.shared(numpy_floatX(0.))
    use_linear = theano.shared(numpy_floatX(0.))

    # Shape annotations below are the original author's.
    x = tensor.tensor3('x', dtype='int64')                 # n_sent * n_word * n_samples
    xmask = tensor.tensor3('xmask', dtype=config.floatX)   # same as x
    q = tensor.matrix('q', dtype='int64')                  # nword * n_samples
    qmask = tensor.matrix('qmask', dtype=config.floatX)
    y = tensor.vector('y', dtype='int64')                  # nsamples * 1
    nhops = tensor.scalar('nhops', dtype='int64')          # nhops, used to loop
    wmat = tensor.matrix('wmat', dtype=config.floatX)      # dim_word * (maxSentLen+1)

    aEmbSeq, bEmbSeq, qSeq = memLayers(tparams, options, x, xmask, q, qmask,
                                       nhops, wmat, use_linear)

    proj = qSeq[-1]                                        # nsamples * dim_hidden
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)

    output_embedding = tparams['Wemb_B_' + str(options['nhops'])]
    pred = tensor.nnet.softmax(tensor.dot(proj, output_embedding.T))  # nsamples * vocab_size
    pred_ans = pred.argmax(axis=1)                         # nsamples vector

    # Small offset keeps the log finite when a probability is zero.
    off = 1e-7
    target_probs = pred[tensor.arange(y.shape[0]), y]
    cost = -tensor.log(target_probs + off).sum()

    debug_inputs = [x, xmask, q, qmask, y, nhops, wmat]
    debug_outputs = [x, xmask, q, qmask, y, nhops, wmat,
                     proj, pred, pred_ans, aEmbSeq, bEmbSeq, qSeq]
    f_debug = theano.function(debug_inputs, debug_outputs, name='f_debug')
    print('f_debug complete~')

    predict_inputs = [x, xmask, q, qmask, nhops, wmat]
    f_pred = theano.function(predict_inputs, pred, name='f_pred')
    print('f_pred complete~')
    f_ans = theano.function(predict_inputs, pred_ans, name='f_ans')
    print('f_ans complete~')

    return (use_noise, use_linear, x, xmask, q, qmask, y, nhops, wmat,
            proj, pred, pred_ans, cost, f_debug, f_pred, f_ans)
def main():
    """Smoke-test ``con`` on three small 3-D tensors and print the result.

    Compiles ``con(rep0=a, rep1=b, rep2=c)`` into a Theano function, feeds
    it three integer arrays (``aval``, ``2 * aval`` and ``3 * aval``) and
    prints the output together with its shape and number of dimensions.
    An earlier variant, kept as dead code in a string literal, used
    ``tensor.concatenate`` along axis 0 in place of ``con``.
    """
    a = tensor.tensor3("a")
    b = tensor.tensor3("b")
    c = tensor.tensor3("c")
    d = con(rep0=a, rep1=b, rep2=c)
    f = theano.function([a, b, c], d)

    aval = np.array([[[2, 2, 2, 2]], [[2, 2, 2, 2]], [[2, 2, 2, 2]]])
    tval = np.zeros(aval.shape)
    print(tval)
    print(tval.shape)

    bval = 2 * aval
    cval = 3 * aval
    ans = f(a=aval, b=bval, c=cval)
    print(ans)
    print(ans.shape)
    # `ans` is the array returned by the compiled function, so its rank is
    # read from the array itself; the previous `tensor.dim(ans)` call does
    # not correspond to a documented theano.tensor function.
    print(ans.ndim)
    # The stray triple-quote that followed this line opened an unterminated
    # string literal and has been removed.
def build_model(args):
    """Build the Linear -> LSTM -> Linear -> Tanh graph and return its cost.

    ``args.units`` sets the LSTM dimension.  The cost is the squared error
    between the flattened targets and the flattened prediction with its last
    step along the first axis dropped.
    """
    x = tensor.tensor3('features', dtype=floatX)
    y = tensor.tensor3('targets', dtype=floatX)

    linear = Linear(input_dim=1, output_dim=4 * args.units)
    rnn = LSTM(dim=args.units, activation=Tanh())
    linear2 = Linear(input_dim=args.units, output_dim=1)

    lstm_input = linear.apply(x)
    recurrent_out = rnn.apply(lstm_input)
    readout = linear2.apply(recurrent_out)
    prediction = Tanh().apply(readout)
    # Drop the final step along the first axis.
    prediction = prediction[:-1, :, :]

    # SquaredError does not work on 3D tensors, so merge the two leading
    # axes of both the targets and the prediction.
    y = y.reshape((y.shape[0] * y.shape[1], y.shape[2]))
    flat_shape = (prediction.shape[0] * prediction.shape[1], prediction.shape[2])
    prediction = prediction.reshape(flat_shape)
    cost = SquaredError().apply(y, prediction)

    # Initialization schemes.
    # NOTE(review): no brick's initialize() is called in this function --
    # confirm the caller takes care of it.
    linear.weights_init = IsotropicGaussian(0.1)
    linear2.weights_init = IsotropicGaussian(0.1)
    linear.biases_init = Constant(0)
    linear2.biases_init = Constant(0)
    rnn.weights_init = Orthogonal()

    return cost
def step_fun(self):
    """Return the compiled single-step function, building it on first use.

    The compiled function is cached in ``self._step_fun``.  Its inputs are
    the current input matrix followed by one previous-state matrix per
    (layer, state) pair; ``self.step`` is additionally given an all-ones
    array shaped like the input without its last axis and an all-ones array
    shaped like the first state.

    Raises
    ------
    NotImplementedError
        If the last gate uses attention.  The attention variant would need
        three further inputs (``attended``, ``attended_dot_u`` and
        ``attention_mask``); the code that sketched this sat after the
        ``raise`` and could never run, so it has been removed.
    """
    if self._step_fun is not None:
        return self._step_fun

    if self.gates[-1].use_attention:
        raise NotImplementedError('Stacked RNN with attention')

    inputs = T.matrix('inputs')
    states_tm1 = [T.matrix('state_%d_%d_tm1' % (layer, state))
                  for layer in range(self.n_layers)
                  for state in range(self.gate0.n_states)]

    step_args = ([inputs, T.ones(inputs.shape[:-1])]
                 + states_tm1
                 + [T.ones_like(states_tm1[0])])
    self._step_fun = function(
        [inputs] + states_tm1,
        self.step(*step_args),
        name='%s_step_fun' % self.name)
    return self._step_fun
def mulclassfunc(self, layerid, wdecay):
    """Compile the training function for the multi-label loss.

    Parameters
    ----------
    layerid
        Forwarded unchanged to ``self.mulclassloss``.
    wdecay
        Weight-decay coefficient forwarded to ``self.l2reg``.

    Returns
    -------
    A compiled ``theano.function`` whose inputs are ``self.emb_num`` int32
    3-D tensors, one int8 3-D tensor and one int8 label matrix; it returns
    the regularized loss and applies the updates produced by ``self.upda``.
    """
    # multi-label loss
    x = [T.tensor3(dtype='int32') for _ in range(self.emb_num)]
    y = T.tensor3(dtype='int8')
    label = T.matrix(dtype='int8')
    iin = x + [y, label]

    wikiloss = self.mulclassloss(layerid, x, y, label)
    loss = wikiloss + self.l2reg(self.unsuperw, wdecay)

    # Materialize the values as a list: on Python 3 dict views do not
    # support `+=`, which the previous code relied on.
    witems = list(self.unsuperw.values())
    if not self.fix_emb:
        witems.extend(self.dicw.values())

    g = T.grad(loss, witems)
    up = self.upda(g, witems, self.lrate, self.mweight, self.opt, self.fix_emb)
    return theano.function(iin, loss, updates=up)
def initialize_data_nodes(loss_function, input_type, out_every_t):
    """Create the symbolic input and target variables.

    ``x`` is a 3-D tensor for ``input_type == 'real'`` and an ``INT_STR``
    matrix otherwise.  ``y`` has one dimension more when ``out_every_t`` is
    truthy: for the ``'CE'`` loss it is an ``INT_STR`` matrix or vector, for
    any other loss a 3-D tensor or matrix.

    Returns the pair ``(x, y)``.
    """
    if input_type == 'real':
        x = T.tensor3()
    else:
        x = T.matrix(dtype=INT_STR)

    if loss_function == 'CE':
        if out_every_t:
            y = T.matrix(dtype=INT_STR)
        else:
            y = T.vector(dtype=INT_STR)
    elif out_every_t:
        y = T.tensor3()
    else:
        y = T.matrix()

    return x, y
def test_reset_only_many_steps(self):
    """Compare the reset-only brick with a NumPy reference over 24 steps."""
    x = tensor.tensor3('x')
    ri = tensor.tensor3('ri')
    mask = tensor.matrix('mask')
    h = self.reset_only.apply(x, reset_inputs=ri, mask=mask)
    calc_h = theano.function(inputs=[x, ri, mask], outputs=[h])

    # Inputs: the 24 permutations of range(4), scaled by 0.1 and repeated
    # along a trailing axis of length 3.
    permutations = list(itertools.permutations(range(4)))
    x_val = 0.1 * numpy.asarray(permutations, dtype=floatX)
    x_val = numpy.ones((24, 4, 3), dtype=floatX) * x_val[..., None]
    ri_val = 0.3 - x_val

    # The last batch element is masked out for the second half of the steps.
    mask_val = numpy.ones((24, 4), dtype=floatX)
    mask_val[12:24, 3] = 0

    # Reference recurrence; row 0 holds the all-zero initial state.
    h_val = numpy.zeros((25, 4, 3), dtype=floatX)
    W = self.reset_only.state_to_state.get_value()
    U = self.reset_only.state_to_reset.get_value()
    for step in range(24):
        previous = h_val[step]
        keep = mask_val[step, :, None]
        r_val = numpy.tanh(previous.dot(U) + ri_val[step])
        h_val[step + 1] = numpy.tanh((r_val * previous).dot(W) + x_val[step])
        # Masked positions carry the previous state forward unchanged.
        h_val[step + 1] = keep * h_val[step + 1] + (1 - keep) * previous
    h_val = h_val[1:]

    # TODO Figure out why this tolerance needs to be so big
    computed = calc_h(x_val, ri_val, mask_val)[0]
    assert_allclose(h_val, computed, 1e-03)
def __init__(self, n_in, n_hidden, n_out):
    """Allocate the symbolic inputs, parameters and regularization terms.

    The three sizes are coerced to ``int``.  Weight matrices come from
    ``glorot_normal`` and biases from ``zero``.  ``self.params`` lists the
    trainable parameters, and ``self.L1`` / ``self.L2_sqr`` are the sums of
    absolute and squared values over the weight matrices.
    """
    self.n_in = int(n_in)
    self.n_hidden = int(n_hidden)
    self.n_out = int(n_out)

    self.input = T.tensor3()
    self.output = T.tensor3()
    self.x_mask = T.matrix()

    # Gate parameters (z, r and h, in that order).  The initializer calls
    # are kept in their original sequence.
    self.W_z = glorot_normal((n_out, n_hidden))
    self.U_z = glorot_normal((n_hidden, n_hidden))
    self.b_z = zero((n_hidden,))

    self.W_r = glorot_normal((n_out, n_hidden))
    self.U_r = glorot_normal((n_hidden, n_hidden))
    self.b_r = zero((n_hidden,))

    self.W_h = glorot_normal((n_out, n_hidden))
    self.U_h = glorot_normal((n_hidden, n_hidden))
    self.b_h = zero((n_hidden,))

    self.U_att = glorot_normal((self.n_in, 1))
    self.b_att = zero((1,))

    # W_yc is created but is not part of self.params or the regularizers.
    self.W_yc = glorot_normal((self.n_out,))

    self.W_cy = glorot_normal((self.n_in, self.n_hidden))
    self.W_cs = glorot_normal((self.n_in, self.n_hidden))

    self.W_ha = glorot_normal((self.n_in, self.n_in))
    self.W_sa = glorot_normal((self.n_hidden, self.n_in))

    self.W_cl = glorot_normal((self.n_in, self.n_out))
    self.W_yl = glorot_normal((self.n_out, self.n_out))
    self.W_hl = glorot_normal((self.n_hidden, self.n_out))

    self.params = [
        self.W_z, self.U_z, self.b_z,
        self.W_r, self.U_r, self.b_r,
        self.W_h, self.U_h, self.b_h,
        self.W_cy, self.W_cs, self.W_ha, self.W_sa,
        self.W_cl, self.W_yl, self.W_hl,
        self.U_att, self.b_att,
    ]

    # Weight matrices that enter the L1 / L2 penalties, in summation order.
    regularized = [
        self.W_z, self.U_z, self.W_r, self.U_r, self.W_h, self.U_h,
        self.W_cy, self.W_cs, self.W_ha, self.W_sa,
        self.W_cl, self.W_yl, self.W_hl, self.U_att,
    ]

    l1_total = T.sum(abs(regularized[0]))
    l2_total = T.sum(regularized[0] ** 2)
    for weight in regularized[1:]:
        l1_total = l1_total + T.sum(abs(weight))
        l2_total = l2_total + T.sum(weight ** 2)
    self.L1 = l1_total
    self.L2_sqr = l2_total
def test7():
    """Compile ``MorphStruct.apply`` and print summaries of a sample batch.

    The compiled function is built but not called (the call was disabled
    in the original); only the sample array's shape and two of its axis
    sums are printed.
    """
    morph = T.tensor3("morph")
    morph_mask = T.tensor3("mask")
    rel = T.matrix("rel")

    morphStruct = MorphStruct()
    morph_out = morphStruct.apply(morph, morph_mask, rel)
    fn = theano.function(inputs=[morph, morph_mask, rel],
                         outputs=[morph_out],
                         on_unused_input='ignore')

    # Original author's notes:
    #   rel               : batch * sentence
    #   state_below_morph : batch * sentence * n_emb_morph
    # Three identical batches of twelve rows; row k holds five copies of the
    # k-th value of the repeating pattern 1, 2, 3, 4.
    row_values = [1, 2, 3, 4] * 3
    sample = [[[value] * 5 for value in row_values] for _ in range(3)]
    sample_mask = sample
    rel_value = [[1, 1, 2, 1, 3, 1, 3, 0, 0, 0],
                 [1, 2, 1, 2, 1, 1, 1, 1, 1, 1],
                 [3, 1, 1, 5, 1, 1, 0, 0, 0, 0]]
    # Disabled in the original: res = fn(sample, sample_mask, rel_value)

    mat = np.array(sample)
    print(mat.shape)
    print(mat.sum(2))
    print(mat.sum(1).shape)
def set_evaluation_function(generator_rnn_model, generator_output_model):
    """Compile a function returning the per-step squared error.

    The first entry of ``generator_rnn_model`` is run forward on the input
    sequence, its hidden output is mapped through ``generator_output_model``
    by ``get_tensor_output``, and the squared difference to the target is
    summed over the last axis.
    """
    # Sequence data; per the original note the layout is
    # (time_length * num_samples * input_dims).
    input_sequence = tensor.tensor3(name='input_sequence', dtype=floatX)
    target_sequence = tensor.tensor3(name='target_sequence', dtype=floatX)

    # Forward pass through the recurrent generator.
    generator_output = generator_rnn_model[0].forward([input_sequence, ],
                                                      is_training=True)
    generator_hidden, generator_cell = generator_output[0], generator_output[1]

    # Map the hidden output through the output model.
    generator_sample = get_tensor_output(generator_hidden,
                                         generator_output_model,
                                         is_training=True)

    # Squared error, summed over the last axis.
    square_error = tensor.sqr(target_sequence - generator_sample).sum(axis=2)

    evaluation_function = theano.function(
        inputs=[input_sequence, target_sequence],
        outputs=[square_error, ],
        on_unused_input='ignore')
    return evaluation_function
def generate_subpop_input(r_E, r_I, n_pairs):
    """Compile a function computing thresholded E and I input currents.

    Parameters
    ----------
    r_E, r_I
        Values substituted (via ``givens``) for the symbolic rate matrices
        ``r_e`` and ``r_i``.
    n_pairs : int
        Side length used to reshape each summed weight term into an
        ``(n_pairs, n_pairs)`` matrix before transposing it.

    Returns
    -------
    A compiled ``theano.function`` with inputs
    ``(c, h, W_EE, W_EI, W_IE, W_II)`` that returns
    ``[I_thresh_E, I_thresh_I]``, i.e. both currents with negative entries
    replaced by 0.  Inputs may be downcast (``allow_input_downcast=True``).
    """
    c = T.scalar("c", dtype='float32')
    h = T.matrix("h", dtype='float32')
    W_EE = T.tensor3("W_EE", dtype='float32')
    W_EI = T.tensor3("W_EI", dtype='float32')
    W_IE = T.tensor3("W_IE", dtype='float32')
    W_II = T.tensor3("W_II", dtype='float32')
    r_e = T.matrix("r_e", dtype='float32')
    r_i = T.matrix("r_i", dtype='float32')
    # The placeholder matrices formerly declared for I_E, I_I, I_thresh_E
    # and I_thresh_I were overwritten right away and have been dropped.

    def pooled(weights, rates):
        """Sum ``weights * rates`` over axis 1 twice, reshape, transpose."""
        summed = T.sum(T.sum(weights * rates, 1), 1)
        return summed.reshape((n_pairs, n_pairs)).T

    # Input = external drive + excitatory term - inhibitory term.
    I_E = c * h + pooled(W_EE, r_e) - pooled(W_EI, r_i)
    I_I = c * h + pooled(W_IE, r_e) - pooled(W_II, r_i)

    # Rectify: negative currents are clamped to zero.
    I_thresh_E = T.switch(T.lt(I_E, 0), 0, I_E)
    I_thresh_I = T.switch(T.lt(I_I, 0), 0, I_I)

    inputs = theano.function(inputs=[c, h, W_EE, W_EI, W_IE, W_II],
                             outputs=[I_thresh_E, I_thresh_I],
                             givens={r_e: r_E, r_i: r_I},
                             allow_input_downcast=True)
    return inputs
def multaskfunc(self, layerid, wdecay, LMweight):
    """Compile the joint language-model + multi-label training function.

    Parameters
    ----------
    layerid
        Forwarded unchanged to ``self.LMmulcloss``.
    wdecay
        Weight-decay coefficient forwarded to ``self.l2reg``.
    LMweight
        Factor applied to the sum of the positive and negative
        language-model losses.

    Returns
    -------
    A compiled ``theano.function`` whose inputs are ``self.emb_num`` int32
    3-D tensors, one int8 3-D tensor, one int8 label matrix and one int32
    matrix of next words; it returns the combined loss and applies the
    updates produced by ``self.upda``.
    """
    # language model + multiple label classification
    x = [T.tensor3(dtype='int32') for _ in range(self.emb_num)]
    y = T.tensor3(dtype='int8')
    label = T.matrix(dtype='int8')
    nextwords = T.imatrix()
    iin = x + [y, label, nextwords]

    mulloss, posLMloss, negLMloss = self.LMmulcloss(layerid, x, y, label, nextwords)
    loss = (mulloss
            + LMweight * (posLMloss + negLMloss)
            + self.l2reg(self.unsuperw, wdecay))

    # Materialize the values as a list: on Python 3 dict views do not
    # support `+=`, which the previous code relied on.
    witems = list(self.unsuperw.values())
    if not self.fix_emb:
        witems.extend(self.dicw.values())

    g = T.grad(loss, witems)
    up = self.upda(g, witems, self.lrate, self.mweight, self.opt, self.fix_emb)
    return theano.function(iin, loss, updates=up)
def _get_net(self):
    """Assemble the encoder/decoder network as an ordered dict of layers.

    Keys, in insertion order: ``l_in_x``, ``l_in_y``, ``l_enc``, ``l_dec``,
    ``l_slice``, ``l_dec_long`` and ``l_dist``.
    """
    sequence_shape = (None, None, TOKEN_REPRESENTATION_SIZE)
    net = OrderedDict()

    # Inputs ------------------------------------------------------------
    net['l_in_x'] = InputLayer(shape=sequence_shape,
                               input_var=T.tensor3(name="enc_ix"),
                               name="encoder_seq_ix")
    net['l_in_y'] = InputLayer(shape=sequence_shape,
                               input_var=T.tensor3(name="dec_ix"),
                               name="decoder_seq_ix")

    # Encoder: only its final output is kept ----------------------------
    net['l_enc'] = LSTMLayer(incoming=net['l_in_x'],
                             num_units=HIDDEN_LAYER_DIMENSION,
                             grad_clipping=GRAD_CLIP,
                             only_return_final=True,
                             name='lstm_encoder')

    # Decoder: its hidden state is initialized from the encoder ---------
    net['l_dec'] = LSTMLayer(incoming=net['l_in_y'],
                             num_units=HIDDEN_LAYER_DIMENSION,
                             hid_init=net['l_enc'],
                             grad_clipping=GRAD_CLIP,
                             name='lstm_decoder')

    # Output ------------------------------------------------------------
    # Drop the last step of every decoded sequence (axis 1); per the
    # original note it corresponds to the token after EOS_TOKEN.
    net['l_slice'] = SliceLayer(incoming=net['l_dec'],
                                indices=slice(0, -1),
                                axis=1,
                                name='slice_layer')

    # Flatten to a 2-D matrix so the dense layer can turn each decoder
    # vector into a probability distribution over the vocabulary.
    net['l_dec_long'] = ReshapeLayer(incoming=net['l_slice'],
                                     shape=(-1, HIDDEN_LAYER_DIMENSION),
                                     name='reshape_layer')
    net['l_dist'] = DenseLayer(incoming=net['l_dec_long'],
                               num_units=self.vocab_size,
                               nonlinearity=lasagne.nonlinearities.softmax,
                               name="dense_output_probas")

    # No reshape back: the "long" output is compared directly with the
    # true one-hot vectors.
    return net
def __init__(self, input_size, hidden_size, output_size):
    """Build a Linear -> LSTM -> Linear -> Logistic model and its cost graph.

    Stores the three sizes, the binary cross-entropy cost (``self.cost``)
    and the computation graph built from it (``self.computation_graph``).
    """
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size

    x = tensor.tensor3('x', dtype=floatX)
    y = tensor.tensor3('y', dtype=floatX)

    # Bricks; the input projection is four times the hidden size wide.
    x_to_lstm = Linear(name="x_to_lstm",
                       input_dim=input_size,
                       output_dim=4 * hidden_size,
                       weights_init=IsotropicGaussian(),
                       biases_init=Constant(0))
    lstm = LSTM(dim=hidden_size,
                name="lstm",
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0))
    lstm_to_output = Linear(name="lstm_to_output",
                            input_dim=hidden_size,
                            output_dim=output_size,
                            weights_init=IsotropicGaussian(),
                            biases_init=Constant(0))

    # Forward pass: only the first LSTM output feeds the read-out.
    h, c = lstm.apply(x_to_lstm.apply(x))
    y_hat = Logistic(name="y_hat").apply(lstm_to_output.apply(h))

    self.cost = BinaryCrossEntropy(name="cost").apply(y, y_hat)

    for brick in (x_to_lstm, lstm, lstm_to_output):
        brick.initialize()

    self.computation_graph = ComputationGraph(self.cost)
def set_tf_update_function(input_emb_param, generator_rnn_model, generator_output_model, generator_optimizer, generator_grad_clipping):
    """Compile the update function for the squared-error training step.

    The compiled function takes the input and target sequences, returns the
    per-step squared error and a gradient-norm value, and applies the
    updates from ``get_model_and_params_updates``.
    ``generator_grad_clipping`` is accepted but not used here.
    """
    # Sequence data; per the original note the layout is
    # (time_length * num_samples * input_dims).
    input_sequence = tensor.tensor3(name='input_sequence', dtype=floatX)
    target_sequence = tensor.tensor3(name='target_sequence', dtype=floatX)

    # Project both sequences through the embedding matrix.
    input_emb_sequence = tensor.dot(input_sequence, input_emb_param)
    target_emb_sequence = tensor.dot(target_sequence, input_emb_param)

    # Forward pass through the recurrent generator.
    generator_output = generator_rnn_model[0].forward([input_emb_sequence, ],
                                                      is_training=True)
    generator_hidden, generator_cell = generator_output[0], generator_output[1]

    # Output model, then back through the transposed embedding matrix.
    generator_emb_sequence = get_tensor_output(generator_hidden,
                                               generator_output_model,
                                               is_training=True)
    generator_sequence = tensor.dot(generator_emb_sequence,
                                    tensor.transpose(input_emb_param))

    # Squared error summed over the last axis; its mean is the cost.
    square_error = tensor.sqr(target_sequence - generator_sequence).sum(axis=2)
    tf_updates_cost = square_error.mean()

    all_layers = generator_rnn_model + generator_output_model
    tf_updates_dict = get_model_and_params_updates(layers=all_layers,
                                                   params=[input_emb_param, ],
                                                   cost=tf_updates_cost,
                                                   optimizer=generator_optimizer)
    generator_gradient_dict = get_model_and_params_gradients(layers=all_layers,
                                                             params=[input_emb_param, ],
                                                             cost=tf_updates_cost)

    # Square root of the summed squares of everything the loop yields.
    # NOTE(review): if generator_gradient_dict really is a dict, iterating
    # it yields its keys rather than the gradient values -- confirm.
    squared_total = 0.
    for grad in generator_gradient_dict:
        squared_total += tensor.sum(grad ** 2)
    generator_gradient_norm = tensor.sqrt(squared_total)

    tf_updates_function = theano.function(
        inputs=[input_sequence, target_sequence],
        outputs=[square_error, generator_gradient_norm, ],
        updates=tf_updates_dict,
        on_unused_input='ignore')
    return tf_updates_function
def __init__(self):
    """Build a bidirectional LSTM regressor and compile its functions.

    Sets ``self.train`` (one Nesterov-momentum step, returns the cost),
    ``self.y_pred`` (network output) and ``self.compute_cost`` (cost only).
    """
    print("Initialising network...")
    import theano
    import theano.tensor as T
    import lasagne
    from lasagne.layers import (InputLayer, LSTMLayer, ReshapeLayer,
                                ConcatLayer, DenseLayer)

    # Make Theano evaluate test values eagerly and fail when one is missing.
    theano.config.compute_test_value = 'raise'

    # One forward and one backward LSTM over the same input.
    l_in = InputLayer(shape=input_shape)
    lstm_options = dict(learn_init=True, peepholes=True)
    l_fwd = LSTMLayer(l_in, N_HIDDEN, backwards=False, **lstm_options)
    l_bck = LSTMLayer(l_in, N_HIDDEN, backwards=True, **lstm_options)

    # Flatten both directions to 2-D and join them along the feature axis.
    concat_shape = (N_SEQ_PER_BATCH * SEQ_LENGTH, N_HIDDEN)
    l_fwd_reshape = ReshapeLayer(l_fwd, concat_shape)
    l_bck_reshape = ReshapeLayer(l_bck, concat_shape)
    l_concat = ConcatLayer([l_fwd_reshape, l_bck_reshape], axis=1)

    # Linear read-out, reshaped to the requested output shape.
    l_recurrent_out = DenseLayer(l_concat, num_units=N_OUTPUTS,
                                 nonlinearity=None)
    l_out = ReshapeLayer(l_recurrent_out, output_shape)

    input_var = T.tensor3('input')
    target_output = T.tensor3('target_output')

    # Random test values so the eager test-value evaluation can run.
    input_var.tag.test_value = rand(*input_shape).astype(theano.config.floatX)
    target_output.tag.test_value = rand(*output_shape).astype(theano.config.floatX)

    print("Compiling Theano functions...")

    # Cost = mean squared error.
    cost = T.mean((l_out.get_output(input_var) - target_output) ** 2)

    # Nesterov accelerated gradient for training.
    all_params = lasagne.layers.get_all_params(l_out)
    updates = lasagne.updates.nesterov_momentum(cost, all_params, LEARNING_RATE)

    compile_options = dict(on_unused_input='warn', allow_input_downcast=True)
    self.train = theano.function([input_var, target_output], cost,
                                 updates=updates, **compile_options)
    self.y_pred = theano.function([input_var], l_out.get_output(input_var),
                                  **compile_options)
    self.compute_cost = theano.function([input_var, target_output], cost,
                                        **compile_options)

    print("Done initialising network.")
def init_exprs(self):
    """Create the symbolic variables and build ``self.exprs``.

    Two 3-D inputs (mean and variance) and a 3-D target are declared, the
    per-layer parameters are looked up by name on ``self.parameters``, and
    everything is forwarded to ``self.make_exprs``.
    """
    inpt_mean = T.tensor3('inpt_mean')
    inpt_var = T.tensor3('inpt_var')
    target = T.tensor3('target')

    pars = self.parameters
    layer_ids = range(len(self.n_hiddens))

    hidden_to_hiddens = [getattr(pars, 'hidden_to_hidden_%i' % i)
                         for i in range(len(self.n_hiddens) - 1)]
    hidden_biases = [getattr(pars, 'hidden_bias_%i' % i) for i in layer_ids]
    # 1 where the use_varprop_at entry is truthy, else 0.
    hidden_var_biases_sqrt = [1 if flag else 0 for flag in self.use_varprop_at]
    recurrents = [getattr(pars, 'recurrent_%i' % i) for i in layer_ids]
    initial_hiddens = [getattr(pars, 'initial_hidden_%i' % i) for i in layer_ids]

    # One dropout rate for the input, then one per recurrent layer.
    p_dropouts = [self.p_dropout_inpt] + [self.p_dropout_hidden] * len(recurrents)

    self.exprs = self.make_exprs(
        inpt_mean, inpt_var, target,
        pars.in_to_hidden, hidden_to_hiddens, pars.hidden_to_out,
        hidden_biases, hidden_var_biases_sqrt,
        initial_hiddens, recurrents,
        pars.out_bias,
        self.hidden_transfers, self.out_transfer,
        self.loss, self.pooling, self.leaky_coeffs,
        p_dropouts,
        self.hotk_inpt)
def set_evaluation_function(generator_model):
    """Compile a function returning the squared error and the output.

    The first entry of ``generator_model`` is run forward on the input
    sequence.  Its first output is compared with the target sequence and
    its last output is passed to ``theano.function`` as ``updates``.
    """
    # Sequence data; per the original note the layout is
    # (time_length * num_samples * input_dims).
    input_sequence = tensor.tensor3(name='input_sequence', dtype=floatX)
    target_sequence = tensor.tensor3(name='target_sequence', dtype=floatX)

    generator_output = generator_model[0].forward([input_sequence, ],
                                                  is_training=True)
    output_sequence = generator_output[0]
    generator_random = generator_output[-1]

    # Squared error summed over the last axis.
    sample_cost = tensor.sqr(target_sequence - output_sequence).sum(axis=2)

    evaluation_function = theano.function(
        inputs=[input_sequence, target_sequence],
        outputs=[sample_cost, output_sequence],
        updates=generator_random,
        on_unused_input='ignore')
    return evaluation_function
def build_model(tparams, options):
    """Build the encoder/decoder classifier with and without dropout.

    Two parallel graphs share their parameters (same prefixes): a dropped
    one that defines the cost and a clean one that defines the accuracy.

    Returns the symbolic inputs (text embedding, text mask, hypothesis
    embedding, hypothesis mask, label) followed by the mean binary
    cross-entropy cost and the accuracy.
    """
    opt_ret = dict()
    trng = RandomStreams(1234)
    p = 0.5
    retain_prob = 1. - p
    print('dropout: {0}'.format(p))

    def apply_dropout(state):
        """Multiply by a Bernoulli(retain_prob) mask, divide by retain_prob."""
        noise = trng.binomial(state.shape, p=retain_prob, dtype=state.dtype)
        masked = state * noise
        return masked / retain_prob

    # Symbolic inputs: 3-D float32 embeddings, 2-D masks, int64 labels.
    text_embedding = tensor.tensor3('text_embedding', dtype='float32')
    text_mask = tensor.matrix('text_mask', dtype='float32')
    hypothesis_embedding = tensor.tensor3('hypothesis_embedding', dtype='float32')
    hypothesis_mask = tensor.matrix('hypothesis_mask', dtype='float32')
    label = tensor.vector('label', dtype='int64')

    # Encoder: the last entry of its first output is the decoder context.
    encode = get_layer(options['encoder'])[1]
    proj = encode(tparams, text_embedding, None, options,
                  prefix='encoder', mask=text_mask)
    dec_ctx = proj[0][-1]
    dec_ctx_dropped = apply_dropout(dec_ctx)

    # Decoder over the hypothesis: clean first, then dropped.
    decode = get_layer(options['decoder'])[1]
    proj_hypo = decode(tparams, hypothesis_embedding, dec_ctx, options,
                       prefix='h_decode_t', mask=hypothesis_mask)
    proj_hypo_dropped = decode(tparams, hypothesis_embedding, dec_ctx_dropped,
                               options, prefix='h_decode_t',
                               mask=hypothesis_mask)
    hypo_ctx = proj_hypo[0][-1]
    hypo_ctx_dropped = apply_dropout(proj_hypo_dropped[0][-1])

    # Sigmoid read-out on both paths, flattened to vectors.
    feed_forward = get_layer('ff')[1]
    logit = feed_forward(tparams, hypo_ctx, options,
                         prefix='ff_logit', activ='tensor.nnet.sigmoid')
    logit_dropped = feed_forward(tparams, hypo_ctx_dropped, options,
                                 prefix='ff_logit',
                                 activ='tensor.nnet.sigmoid')
    logit = logit.flatten()
    logit_dropped = logit_dropped.flatten()

    # Cost from the dropped path, accuracy from the clean path.
    cost = tensor.mean(binary_crossentropy(logit_dropped, label))
    acc = tensor.mean(tensor.eq(tensor.round(logit), label))

    return (text_embedding, text_mask, hypothesis_embedding,
            hypothesis_mask, label, cost, acc)
def __init_symb(self):
    """
    Declare the model's symbolic variables: a 3-D ``input``, a 3-D
    ``target_output`` and a 2-D ``mask``, each stored on ``self``.
    :return:
    """
    symbolic_vars = (
        ('input', TT.tensor3('input')),
        ('target_output', TT.tensor3('target_output')),
        ('mask', TT.matrix("mask")),
    )
    for attribute, variable in symbolic_vars:
        setattr(self, attribute, variable)
def build(self):
    """Build the Lasagne graph and compile the Theano functions of the model.

    Creates shared buffers (``self.x_range_*``) that feed the compiled
    functions through ``givens``, wires a ``ChoiceLayer`` over a memory input
    and a reshaped observation input, and compiles:

    * ``self.output_model_range`` -- returns ``[probas_range, cost, hidden]``;
    * ``self.output_model_range_updates`` -- same outputs, plus ``updates``;
    * ``self.output_hidden`` -- returns ``[hidden[:, 0]]`` from explicit inputs.

    The final layer is stored in ``self.network``.
    """
    # Symbolic inputs; they are substituted by the shared buffers below.
    x_range=T.tensor4()
    x_label=T.tensor3()
    x_action=T.tensor3()
    x_reward=T.vector()
    x_memory=T.tensor4()
    # Zero-initialised shared buffers; callers are expected to fill them
    # before invoking the compiled functions.
    self.x_range_shared=theano.shared(np.zeros((self.batch_size,self.path_length,self.x_dim[0],self.x_dim[1]),dtype=theano.config.floatX),borrow=True)
    self.x_range_label=theano.shared(np.zeros((self.batch_size,self.path_length,self.n_classes),dtype=theano.config.floatX),borrow=True)
    self.x_range_action=theano.shared(np.zeros((self.batch_size,self.path_length,self.n_classes),dtype=theano.config.floatX),borrow=True)
    self.x_range_reward=theano.shared(np.zeros(self.batch_size,dtype=theano.config.floatX),borrow=True)
    self.x_range_memory=theano.shared(np.zeros((self.batch_size,self.path_length,self.n_classes,self.h_dim),dtype=theano.config.floatX),borrow=True)

    # Front-end model: obtains the mapping from x to h and builds the memory.
    '''前期的框架模型,主要是得到x到h的映射,以及memory的构建'''
    # NOTE(review): D1 and D2 are not used by any active line below.
    D1, D2, D3 = lasagne.init.Normal(std=self.std,mean=0), lasagne.init.Normal(std=self.std,mean=0), lasagne.init.Normal(std=self.std,mean=0)
    # D1, D2, D3 = lasagne.init.Uniform(-1,1), lasagne.init.Uniform(-1,1), lasagne.init.Uniform(-1,1)
    l_range_in = lasagne.layers.InputLayer(shape=(self.batch_size,self.path_length,self.x_dim[0],self.x_dim[1]))
    # l_range_flatten = lasagne.layers.ReshapeLayer(l_range_in, [self.batch_size * self.path_length, 1, self.x_dim[0],self.x_dim[1]])
    # l_range_dense2 = lasagne.layers.DenseLayer(l_range_flatten,self.tmp_h_dim,W=D1,nonlinearity=lasagne.nonlinearities.rectify) #[bs*path_length,dimension]
    # l_range_dense2 = lasagne.layers.DenseLayer(l_range_dense2,self.tmp_h_dim,W=D1,nonlinearity=lasagne.nonlinearities.rectify) #[bs*path_length,dimension]
    # l_range_label = lasagne.layers.InputLayer(shape=(self.batch_size,self.path_length,self.n_classes))
    # The raw input is only reshaped (the dense layers above are disabled).
    # NOTE(review): presumably requires x_dim[0] * x_dim[1] == tmp_h_dim for
    # the reshape to be valid -- confirm.
    l_range_hidden=lasagne.layers.ReshapeLayer(l_range_in,[self.batch_size*self.path_length,1,self.tmp_h_dim])
    l_range_dense2_origin=lasagne.layers.ReshapeLayer(l_range_in,[self.batch_size,self.path_length,self.tmp_h_dim])

    # Policy-gradient model: derives action probabilities from the memory state.
    '''Policy Gradient Methods的模型,主要是从Memory状态得到action的概率'''
    l_range_memory_in = lasagne.layers.InputLayer(shape=(self.batch_size,self.path_length,self.n_classes,self.h_dim))
    l_range_memory = lasagne.layers.ReshapeLayer(l_range_memory_in,[self.batch_size*self.path_length,self.n_classes,self.h_dim])
    if 1:
        l_range_status=ChoiceLayer((l_range_memory,l_range_hidden),D3,D3,D3,nonlinearity=lasagne.nonlinearities.tanh) #[bs*pl,(n_class+1),dim]
    l_range_mu = lasagne.layers.ReshapeLayer(l_range_status,[self.batch_size,self.path_length,self.n_classes])

    # Overall model parameters, update strategy, etc.
    '''模型的总体参数和更新策略等'''
    # NOTE(review): the only visible definition of `l_range_label` is the
    # commented-out line above; unless it is defined elsewhere, the two
    # get_output calls below raise NameError -- confirm.
    hidden = lasagne.layers.helper.get_output(l_range_dense2_origin, {l_range_in: x_range,l_range_label:x_label})
    probas_range = lasagne.layers.helper.get_output(l_range_mu, {l_range_in: x_range,l_range_memory_in:x_memory,l_range_label:x_label})
    params=lasagne.layers.helper.get_all_params(l_range_mu,trainable=True)
    # NOTE(review): the line below discards the parameters collected above, so
    # T.grad receives an empty list and `updates` is built from no
    # parameters -- confirm this is intended.
    params=[]# equivalent to updating only the last parameter; the others take no part in the update
    givens = {
        x_range: self.x_range_shared,
        x_label:self.x_range_label,
        x_action: self.x_range_action,
        x_reward: self.x_range_reward,
        x_memory: self.x_range_memory
    }
    # Negative mean over the batch of (sum over axes 2 then 1 of
    # log-probability * action) weighted by the per-sample reward.
    cost=-T.mean(T.sum(T.sum(T.log(probas_range)*x_action,axis=2),axis=1)*x_reward)
    grads=T.grad(cost,params)
    scaled_grads = lasagne.updates.total_norm_constraint(grads, self.max_norm)
    updates = self.update_method(scaled_grads, params, learning_rate=self.lr)
    # Evaluation-only and training variants read their data from the shared
    # buffers; output_hidden takes explicit inputs instead.
    self.output_model_range = theano.function([],[probas_range,cost,hidden],givens=givens,on_unused_input='ignore',allow_input_downcast=True)
    self.output_model_range_updates = theano.function([],[probas_range,cost,hidden],updates=updates,givens=givens,on_unused_input='ignore',allow_input_downcast=True)
    self.output_hidden = theano.function([x_range,x_label],[hidden[:,0]],on_unused_input='ignore',allow_input_downcast=True)
    self.network=l_range_mu
def __init__(self, n_in, n_hid, n_out, lr=0.05, batch_size=64,
             single_output=True, output_activation=T.nnet.softmax,
             cost_function='nll'):
    """Build a simple recurrent network and compile its Theano functions.

    Args:
        n_in: size of each input vector.
        n_hid: number of hidden units.
        n_out: size of each output vector.
        lr: learning rate for the plain gradient-descent updates.
        batch_size: number of rows of the initial hidden state ``h0``.
        single_output: when true, the output is the last scan step and the
            target is a matrix; otherwise the whole sequence is returned and
            the target is a rank-3 tensor.
        output_activation: callable applied to the output pre-activation at
            every step.
        cost_function: ``'mse'`` or ``'cxe'``; any other value selects the
            ``nll`` expression.

    Sets ``self.loss``, ``self.train``, ``self.predictions`` and
    ``self.debug`` to compiled functions, and ``self.output`` to the symbolic
    network output.
    """
    self.n_in = n_in
    self.n_hid = n_hid
    self.n_out = n_out

    self.W_in = init_weight((self.n_in, self.n_hid), 'W_in')
    self.W_out = init_weight((self.n_hid, self.n_out), 'W_out')
    self.W_rec = init_weight((self.n_hid, self.n_hid), 'W_rec', 'svd')
    self.b_hid = shared(np.zeros(shape=n_hid, dtype=dtype))
    self.b_out = shared(np.zeros(shape=n_out, dtype=dtype))
    self.params = [self.W_in, self.W_out, self.W_rec, self.b_out, self.b_hid]

    self.activation = output_activation

    def step(x_t, h_tm1):
        h_t = T.tanh(T.dot(x_t, self.W_in) + T.dot(h_tm1, self.W_rec)
                     + self.b_hid)
        # Use the configured activation; it used to be hard-coded to softmax,
        # which silently ignored the `output_activation` argument.
        y_t = self.activation(T.dot(h_t, self.W_out) + self.b_out)
        return [h_t, y_t]

    X = T.tensor3()  # batch of sequence of vector
    # Target rank follows the output: one vector per sequence, or one per step.
    Y = T.matrix() if single_output else T.tensor3()

    # initial hidden state
    h0 = shared(np.zeros(shape=(batch_size, self.n_hid), dtype=dtype))
    lr = shared(np.cast[dtype](lr))

    # scan iterates over the leading axis, so move the sequence axis first.
    [h_vals, y_vals], _ = theano.scan(fn=step,
                                      sequences=X.dimshuffle(1, 0, 2),
                                      outputs_info=[h0, None])

    if single_output:
        self.output = y_vals[-1]
    else:
        self.output = y_vals.dimshuffle(1, 0, 2)

    cxe = T.mean(T.nnet.binary_crossentropy(self.output, Y))
    nll = -T.mean(Y * T.log(self.output)
                  + (1. - Y) * T.log(1. - self.output))
    mse = T.mean((self.output - Y) ** 2)

    # Unknown names fall back to nll, as before.
    if cost_function == 'mse':
        cost = mse
    elif cost_function == 'cxe':
        cost = cxe
    else:
        cost = nll

    gparams = T.grad(cost, self.params)
    updates = OrderedDict()
    for param, gparam in zip(self.params, gparams):
        updates[param] = param - gparam * lr

    self.loss = theano.function(inputs=[X, Y], outputs=cost)
    self.train = theano.function(inputs=[X, Y], outputs=cost,
                                 updates=updates)
    self.predictions = theano.function(inputs=[X], outputs=self.output)
    self.debug = theano.function(
        inputs=[X, Y],
        outputs=[X.shape, Y.shape, y_vals.shape, self.output.shape])
def print_layer(self):
    """Return a four-line text summary of this read layer.

    The summary holds a separator, the layer name, the input shape
    ``(width, height)`` and the output shape ``(N, N)``; every line ends with
    a newline.
    """
    summary_lines = [
        '--------------------\n',
        'Read Layer ' + self.name + '\n',
        'Input Shape: ' + str((self.width, self.height)) + '\n',
        'Output Shape: ' + str((self.N, self.N)) + '\n',
    ]
    return ''.join(summary_lines)


if __name__ == '__main__':
    # Manual smoke test: build the attention layer, compile it and prepare a
    # random feature batch with per-row time masks.
    theano.config.optimizer = 'fast_compile'
    attn = TemporalAttentionLayer(batch_size=10, N=5, channels=6,
                                  use_gpu=False)
    time_mask = T.imatrix('time_mask')
    features = T.tensor3('features')
    res, (g, s2, d) = attn.run(features, time_mask)
    f = theano.function([features, time_mask], [res, g, s2, d],
                        on_unused_input='warn')

    fts = np.random.random((10, 6, 12))
    tm = np.ones((10, 12))
    # Rows 0..6 are zeroed from the listed time step onwards.
    for row, first_masked in enumerate((6, 4, 2, 8, 9, 1, 3)):
        tm[row, first_masked:] = 0
def __init__(self, mask_value=0.):
    """Set up the masking layer.

    Args:
        mask_value: value stored as ``self.mask_value`` (default ``0.``).

    Also creates a rank-3 symbolic tensor as ``self.input``.
    """
    super(Masking, self).__init__()
    self.input = T.tensor3()
    self.mask_value = mask_value
def compile_theano_func_build_G_mtx():
    """Compile the Theano function that builds the real/imaginary G matrices.

    The compiled function takes, in order: ``tau_inter_x``, ``tau_inter_y``,
    ``M``, ``N`` (scalars), ``m_grid``, ``n_grid`` (vectors), ``baseline_x``,
    ``baseline_y``, ``cross_beamShape_r``, ``cross_beamShape_i`` (rank-3
    tensors) and returns ``[G_mtx_r, G_mtx_i]``.  The four rank-3 inputs are
    mapped over their leading axis with ``theano.map``.
    """
    tau_inter_x = TT.scalar('tau_inter_x')
    tau_inter_y = TT.scalar('tau_inter_y')
    M = TT.scalar('M')
    N = TT.scalar('N')
    m_grid = TT.vector('m_grid')
    n_grid = TT.vector('n_grid')
    cross_beamShape_r = TT.tensor3('cross_beamShape_r')
    cross_beamShape_i = TT.tensor3('cross_beamShape_i')
    baseline_x = TT.tensor3('baseline_x')
    baseline_y = TT.tensor3('baseline_y')
    pi = TT.constant(np.pi)

    def theano_periodic_sinc(in_sig, bandwidth):
        # sin(x) / (bandwidth * sin(x / bandwidth)); where the denominator is
        # below eps in magnitude, use cos(x) / cos(x / bandwidth) instead.
        eps = TT.constant(1e-10)
        scaled_sig = TT.true_div(in_sig, bandwidth)
        plain_denominator = TT.mul(TT.sin(scaled_sig), bandwidth)
        near_zero = TT.lt(TT.abs_(plain_denominator), eps)
        numerator = TT.switch(near_zero, TT.cos(in_sig), TT.sin(in_sig))
        denominator = TT.switch(near_zero, TT.cos(scaled_sig),
                                plain_denominator)
        return TT.true_div(numerator, denominator)

    def f_inner(beam_r, beam_i, base_x, base_y,
                tau_x, tau_y, grid_m, grid_n, count_m, count_n):
        # One step of the map: build the 2-D periodic sinc for this slice and
        # contract it with the real and imaginary beam shapes.
        def half_phase(tau, baseline, grid):
            return 0.5 * (TT.shape_padright(tau * baseline, n_ones=1) -
                          2 * pi * TT.shape_padleft(grid, n_ones=2))

        sinc_along_x = theano_periodic_sinc(
            half_phase(tau_x, base_x, grid_m), count_m * tau_x)
        sinc_along_y = theano_periodic_sinc(
            half_phase(tau_y, base_y, grid_n), count_n * tau_y)
        periodic_sinc_2d = TT.mul(sinc_along_x, sinc_along_y)

        step_G_r = TT.tensordot(beam_r, periodic_sinc_2d,
                                axes=[[0, 1], [0, 1]])
        step_G_i = TT.tensordot(beam_i, periodic_sinc_2d,
                                axes=[[0, 1], [0, 1]])
        return step_G_r, step_G_i

    mapped_outputs = theano.map(
        fn=f_inner,
        sequences=(cross_beamShape_r, cross_beamShape_i,
                   baseline_x, baseline_y),
        non_sequences=(tau_inter_x, tau_inter_y, m_grid, n_grid, M, N))[0]
    G_mtx_r, G_mtx_i = mapped_outputs

    # compile the function
    symbolic_inputs = [
        tau_inter_x, tau_inter_y, M, N, m_grid, n_grid,
        baseline_x, baseline_y, cross_beamShape_r, cross_beamShape_i
    ]
    func = theano.function(symbolic_inputs, [G_mtx_r, G_mtx_i],
                           allow_input_downcast=True)
    return func
def setup(self, model, dataset):
    """Validate `model`, register monitoring channels and compile `sgd_update`.

    Parameters
    ----------
    model : object
        Supplies the parameters, the input space, the default cost (when
        ``self.cost`` is None), learning-rate scalers and ``censor_updates``.
    dataset : object
        Only read here to provide test values when
        ``config.compute_test_value == 'raise'``.

    Raises
    ------
    ValueError
        If a parameter holds Inf or NaN, if ``self.batch_size`` conflicts
        with ``model.force_batch_size`` and ``self.set_batch_size`` is
        false, if a learning-rate scaler refers to a non-parameter, or if a
        debug value of an update holds Inf or NaN.

    On return ``self.model``, ``self.monitor``, ``self.topo``,
    ``self.supervised``, ``self.sgd_update`` and ``self.params`` are set.
    """
    if self.cost is None:
        self.cost = model.get_default_cost()

    # Refuse to start from parameters that already contain Inf or NaN.
    inf_params = [
        param for param in model.get_params()
        if np.any(np.isinf(param.get_value()))
    ]
    if len(inf_params) > 0:
        raise ValueError("These params are Inf: " + str(inf_params))
    if any([
            np.any(np.isnan(param.get_value()))
            for param in model.get_params()
    ]):
        nan_params = [
            param for param in model.get_params()
            if np.any(np.isnan(param.get_value()))
        ]
        raise ValueError("These params are NaN: " + str(nan_params))

    self.model = model

    # Reconcile our batch size with a positive model.force_batch_size: adopt
    # the model's value when we have none, otherwise either push ours onto
    # the model (set_batch_size) or fail on a mismatch.
    batch_size = self.batch_size
    if hasattr(model, "force_batch_size"):
        if model.force_batch_size > 0:
            if batch_size is not None:
                if batch_size != model.force_batch_size:
                    if self.set_batch_size:
                        model.set_batch_size(batch_size)
                    else:
                        raise ValueError(
                            "batch_size argument to SGD conflicts with model's force_batch_size attribute"
                        )
            else:
                self.batch_size = model.force_batch_size
    model._test_batch_size = self.batch_size
    self.monitor = Monitor.get_monitor(model)
    # TODO: come up with some standard scheme for associating training runs
    # with monitors / pushing the monitor automatically, instead of just
    # enforcing that people have called push_monitor
    assert self.monitor.get_examples_seen() == 0
    self.monitor._sanity_check()

    # Symbolic input batch; anything that is not 2-D is treated as
    # topological when fetching test values.
    X = model.get_input_space().make_theano_batch(name="%s[X]" % self.__class__.__name__)
    self.topo = not X.ndim == 2
    if config.compute_test_value == 'raise':
        if self.topo:
            X.tag.test_value = dataset.get_batch_topo(self.batch_size)
        else:
            X.tag.test_value = dataset.get_batch_design(self.batch_size)

    # Targets are always declared as a rank-3 tensor; Y is only used when
    # the cost is supervised.
    Y = T.tensor3(name="%s[Y]" % self.__class__.__name__)

    if self.cost.supervised:
        if config.compute_test_value == 'raise':
            _, Y.tag.test_value = dataset.get_batch_design(
                self.batch_size, True)
        self.supervised = True
        cost_value = self.cost(model, X, Y)
    else:
        self.supervised = False
        cost_value = self.cost(model, X)
    if cost_value is not None and cost_value.name is None:
        if self.supervised:
            cost_value.name = 'objective(' + X.name + ', ' + Y.name + ')'
        else:
            cost_value.name = 'objective(' + X.name + ')'

    # Set up monitor to model the objective value, learning rate,
    # momentum (if applicable), and extra channels defined by
    # the cost
    learning_rate = self.learning_rate
    if self.monitoring_dataset is not None:
        self.monitor.setup(dataset=self.monitoring_dataset,
                           cost=self.cost,
                           batch_size=self.batch_size,
                           num_batches=self.monitoring_batches,
                           extra_costs=self.monitoring_costs)
        if self.supervised:
            ipt = (X, Y)
        else:
            ipt = X
        # The extra channels are attached to the first monitoring dataset.
        # NOTE(review): `.keys()[0]` relies on keys() returning a list
        # (Python 2 behaviour) -- confirm the target interpreter.
        dataset_name = self.monitoring_dataset.keys()[0]
        monitoring_dataset = self.monitoring_dataset[dataset_name]
        #TODO: have Monitor support non-data-dependent channels
        self.monitor.add_channel(name='learning_rate',
                                 ipt=ipt,
                                 val=learning_rate,
                                 dataset=monitoring_dataset)
        if self.momentum:
            self.monitor.add_channel(name='momentum',
                                     ipt=ipt,
                                     val=self.momentum,
                                     dataset=monitoring_dataset)
        # The string below is disabled experimental code kept as a no-op
        # expression statement; it is never executed.
        ''' Ypred = model.fprop(X) Y_ = (T.arange(0,96).dimshuffle('x','x',0)*Ypred).sum(axis = 2) y = monitoring_dataset.y the_y = T.matrix('targetsss') mse = Print('MSE')(T.mean(T.square(Y_-the_y))) funct = function(inputs=[X], outputs=mse) real_funct = function(inputs=[X,the_y], outputs=funct(), givens=[y=monitoring_dataset.y]) self.monitor.add_channel(name='MSE', ipt=(y, X), val = 2, dataset=monitoring_dataset, prereqs=(funct)) '''

    params = list(model.get_params())
    assert len(params) > 0
    # Give anonymous parameters a name so later labels can refer to them.
    for i, param in enumerate(params):
        if param.name is None:
            param.name = 'sgd_params[%d]' % i

    if self.cost.supervised:
        grads, updates = self.cost.get_gradients(model, X, Y)
    else:
        grads, updates = self.cost.get_gradients(model, X)

    # The gradient mapping must cover exactly the model parameters.
    for param in grads:
        assert param in params
    for param in params:
        assert param in grads

    for param in grads:
        if grads[param].name is None and cost_value is not None:
            grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {
                'costname': cost_value.name,
                'paramname': param.name
            })

    lr_scalers = model.get_lr_scalers()

    for key in lr_scalers:
        if key not in params:
            raise ValueError("Tried to scale the learning rate on " +\
                str(key)+" which is not an optimization parameter.")

    log.info('Parameter and initial learning rate summary:')
    for param in params:
        param_name = param.name
        if param_name is None:
            param_name = 'anon_param'
        lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
        log.info('\t' + param_name + ': ' + str(lr))

    if self.momentum is None:
        # Plain SGD step, scaled per parameter.
        updates.update(
            dict(safe_zip(params, [param - learning_rate * \
                lr_scalers.get(param, 1.) * grads[param]
                for param in params])))
    else:
        # Momentum: keep one zero-initialised increment per parameter.
        for param in params:
            inc = sharedX(param.get_value() * 0.)
            if param.name is not None:
                inc.name = 'inc_' + param.name
            updated_inc = self.momentum * inc - learning_rate * lr_scalers.get(
                param, 1.) * grads[param]
            updates[inc] = updated_inc
            updates[param] = param + updated_inc

    for param in params:
        if updates[param].name is None:
            updates[param].name = 'sgd_update(' + param.name + ')'
    # Let the model adjust the updates in place before compilation.
    model.censor_updates(updates)
    for param in params:
        update = updates[param]
        if update.name is None:
            update.name = 'censor(sgd_update(' + param.name + '))'
        # Check whatever debug values are available for Inf / NaN.
        for update_val in get_debug_values(update):
            if np.any(np.isinf(update_val)):
                raise ValueError("debug value of %s contains infs" %
                                 update.name)
            if np.any(np.isnan(update_val)):
                raise ValueError("debug value of %s contains nans" %
                                 update.name)

    with log_timing(log, 'Compiling sgd_update'):
        if self.supervised:
            fn_inputs = [X, Y]
        else:
            fn_inputs = [X]
        self.sgd_update = function(fn_inputs,
                                   updates=updates,
                                   name='sgd_update',
                                   on_unused_input='ignore',
                                   mode=self.theano_function_mode)
    self.params = params
#Get in pro_data.mask_train_input() # Random shuffle. indices = np.arange(len(training_word_pos_vec3D)) np.random.shuffle(indices) training_word_pos_vec3D = training_word_pos_vec3D[indices] training_sen_length = training_sen_length[indices] training_label = training_label[indices] # training_label_1hot=training_label_1hot[indices] train_left_sdp_length = train_left_sdp_length[indices] """ new model """ model = Network() # Prepare Theano variables for inputs and targets input_var = T.tensor3('inputs') target_var = T.ivector('targets') mask_var = T.imatrix('mask_layer') #Pi model variables: if model.network_type == "pi": input_b_var = T.tensor3('inputs_b') mask_train = T.vector('mask_train') unsup_weight_var = T.scalar('unsup_weight') elif model.network_type == "tempens": #tempens model variables: z_target_var = T.matrix('z_targets') mask_train = T.vector('mask_train') unsup_weight_var = T.scalar('unsup_weight') learning_rate_var = T.scalar('learning_rate') adam_beta1_var = T.scalar('adam_beta1')
def _get_compiled_forward_backward_theano_func(self):
    """Compile the forward-backward step as a theano function.

    The compiled function takes 6 inputs:

        num_states (integer scalar),
        temperature (float scalar),
        log_prior_c (float vector),
        log_trans_tcc (float tensor3),
        log_emission_tc (float matrix),
        prev_log_posterior_tc (float matrix)

    When ``self.log_posterior_output`` is set, the admixed posterior is
    written to it through an update and the outputs are:

        update_norm_t, log_data_likelihood
        (+ alpha_tc, beta_tc if self.include_alpha_beta_output == True)

    Otherwise the admixed posterior is returned first:

        admixed_log_posterior_tc, update_norm_t, log_data_likelihood
        (+ alpha_tc, beta_tc if self.include_alpha_beta_output == True)

    Returns:
        A compiled theano function
    """
    num_states = tt.iscalar('num_states')
    temperature = tt.scalar('temperature')
    log_prior_c = tt.vector('log_prior_c')
    log_trans_tcc = tt.tensor3('log_trans_tcc')
    log_emission_tc = tt.matrix('log_emission_tc')
    prev_log_posterior_tc = tt.matrix('prev_log_posterior_tc')

    symbolic_results = self._get_symbolic_log_posterior(
        num_states, temperature, log_prior_c, log_trans_tcc,
        log_emission_tc, self.resolve_nans)
    new_log_posterior_tc, log_data_likelihood_t, alpha_tc, beta_tc = \
        symbolic_results

    # Mix the new and previous posteriors in log space.
    weighted_new = new_log_posterior_tc + np.log(self.admixing_rate)
    weighted_prev = prev_log_posterior_tc + np.log(1.0 - self.admixing_rate)
    admixed_log_posterior_tc = commons.safe_logaddexp(weighted_new,
                                                      weighted_prev)

    # in theory, they are all the same
    log_data_likelihood = log_data_likelihood_t[-1]
    update_norm_t = commons.get_jensen_shannon_divergence(
        admixed_log_posterior_tc, prev_log_posterior_tc)

    # Outputs common to both variants, with the optional alpha/beta tail.
    shared_outputs = [update_norm_t, log_data_likelihood]
    if self.include_alpha_beta_output:
        shared_outputs = shared_outputs + [alpha_tc, beta_tc]

    inputs = [
        num_states, temperature, log_prior_c, log_trans_tcc,
        log_emission_tc, prev_log_posterior_tc
    ]

    if self.log_posterior_output is None:
        return th.function(
            inputs=inputs,
            outputs=[admixed_log_posterior_tc] + shared_outputs)

    return th.function(
        inputs=inputs,
        outputs=shared_outputs,
        updates=[(self.log_posterior_output, admixed_log_posterior_tc)])
def retrain(trainX,
            trainY,
            testX,
            testY,
            theta_mat,
            lambda_mat,
            learning_rate=5e-4,
            rate_decay=1.0,
            init_scale=0.2,
            scale_decay=0.998,
            momentum=0.0,
            minibatch_size=64,
            num_epochs=70,
            rng_seed=2017,
            model_path=None,
            model_to_save=None):
    """Fine-tune a saved RNN jointly with the `theta` / `lamda` matrices.

    Loads network weights from ``model_path``, appends a batch-norm layer,
    builds a loss from ``theta``, ``lamda`` and the network output, and
    trains with Nesterov momentum, printing train/validation/test losses
    every ``eval_multiple`` iterations.

    Args:
        trainX, trainY, testX, testY: arrays passed to ``shuffle_data`` at
            the start of every epoch.
        theta_mat, lambda_mat: initial values for ``theta`` and ``lamda``
            (cast to float32).
        learning_rate, rate_decay: step size is
            ``learning_rate * rate_decay ** iteration``.
        init_scale, scale_decay: gradient scale for ``theta``/``lamda`` is
            ``init_scale * scale_decay ** iteration``.
        momentum: NOTE(review): never read -- the name is rebound to a
            symbolic scalar below and the value fed at run time is
            hard-coded (0.99, then 0.90).
        minibatch_size: rows per minibatch.
        num_epochs: number of passes over the training data.
        rng_seed: when not None, seeds a RandomState handed to ``set_rng``.
        model_path: ``.npz`` file holding the initial network parameters.
            NOTE(review): the default ``None`` is passed straight to
            ``os.path.isfile`` -- confirm callers always supply a path.
        model_to_save: only referenced by a commented-out ``np.savez`` line.

    Returns:
        None in every path (missing model file, NaN training loss, or normal
        completion); results are reported through ``print``.

    ``time_step``, ``feature_dim``, ``build_rnn_net``, ``shuffle_data``,
    ``set_rng``, ``l1`` and ``l2`` are taken from the enclosing module.
    """
    if rng_seed is not None:
        print("Setting RandomState with seed=%i" % (rng_seed))
        rng = np.random.RandomState(rng_seed)
        set_rng(rng)

    index = T.lscalar()  # Minibatch index
    x = T.tensor3('x')  # Inputs
    y = T.fmatrix('y')  # Target

    #define and initialize RNN network
    network_0 = build_rnn_net(input_var=x,
                              input_width=time_step,
                              input_dim=feature_dim,
                              nin_units=12,
                              h_num_units=[16, 16],
                              h_grad_clip=5.0,
                              output_width=time_step)
    if not os.path.isfile(model_path):
        print("Model file does not exist!")
        return None
    # The parameter list is stored as the first array of the archive.
    init_model = np.load(model_path)
    init_params = init_model[init_model.files[0]]
    LL.set_all_param_values([network_0], init_params)

    # Placeholder shared datasets; real values are set at each epoch.
    train_set_y = theano.shared(np.zeros((1, time_step),
                                         dtype=theano.config.floatX),
                                borrow=True)
    train_set_x = theano.shared(np.zeros((1, time_step, feature_dim),
                                         dtype=theano.config.floatX),
                                borrow=True)
    valid_set_y = theano.shared(np.zeros((1, time_step),
                                         dtype=theano.config.floatX),
                                borrow=True)
    valid_set_x = theano.shared(np.zeros((1, time_step, feature_dim),
                                         dtype=theano.config.floatX),
                                borrow=True)
    test_set_x = theano.shared(np.zeros((1, time_step, feature_dim),
                                        dtype=theano.config.floatX),
                               borrow=True)

    theta = theano.shared(
        np.zeros((time_step, time_step), dtype=theano.config.floatX))
    lamda = theano.shared(
        np.zeros((time_step, time_step), dtype=theano.config.floatX))

    out_x = LL.BatchNormLayer(network_0)

    #define updates
    params = LL.get_all_params(out_x, trainable=True)
    r = lasagne.regularization.regularize_network_params(out_x, l2)
    semi_x = LL.get_output(out_x, deterministic=True)

    #define SGCRF in theano expressions
    S_yy = T.dot(y.T, y) / minibatch_size
    S_yx = T.dot(y.T, semi_x) / minibatch_size
    S_xx = T.dot(semi_x.T, semi_x) / minibatch_size

    ilamda = T.nlinalg.matrix_inverse(lamda)
    t1 = T.dot(S_yy, lamda)
    t2 = 2 * T.dot(S_yx, theta)
    t3 = T.dot(T.dot(T.dot(ilamda, theta.T), S_xx), theta)

    # `loss` (via the determinant) drives the gradients; `train_loss` (via
    # the eigenvalues) is the value reported by train_model.
    det_lamda = T.nlinalg.det(lamda)
    loss = -T.log(det_lamda) + T.nlinalg.trace(t1 + t2 + t3)
    eigen_lamda, _ = T.nlinalg.eig(lamda)
    train_loss = -T.sum(T.log(eigen_lamda)) + T.nlinalg.trace(t1 + t2 + t3)
    lamda_diag = T.nlinalg.diag(lamda)
    regularized_loss = loss + 1e-4 * r + 1e-3 * l1(theta) + 1e-3 * l1(
        lamda - lamda_diag)

    learn_rate = T.scalar('learn_rate', dtype=theano.config.floatX)
    # Rebinds `momentum`: from here on it is the symbolic input, not the
    # function argument.
    momentum = T.scalar('momentum', dtype=theano.config.floatX)
    scale_rate = T.scalar('scale_rate', dtype=theano.config.floatX)

    # scale the grads of theta, lamda
    new_params = [theta, lamda]
    new_grads = T.grad(regularized_loss, new_params)
    for i in range(len(new_grads)):
        new_grads[i] *= scale_rate

    grads = T.grad(regularized_loss, params)
    params += new_params
    grads += new_grads
    clipped_grads = lasagne.updates.total_norm_constraint(grads, 5.0)
    updates = lasagne.updates.nesterov_momentum(clipped_grads,
                                                params,
                                                learning_rate=learn_rate,
                                                momentum=momentum)

    pred_x = LL.get_output(out_x, deterministic=True)
    valid_predictions = -T.dot(T.dot(ilamda, theta.T), pred_x.T).T
    # The validation loss compares the raw network output with y, while
    # test_model returns `valid_predictions`.
    valid_loss = T.mean(T.abs_(pred_x - y))

    train_model = theano.function(
        [index, learn_rate, momentum, scale_rate],
        train_loss,
        updates=updates,
        givens={
            x:
            train_set_x[(index * minibatch_size):((index + 1) *
                                                  minibatch_size)],
            y:
            train_set_y[(index * minibatch_size):((index + 1) *
                                                  minibatch_size)]
        })

    validate_model = theano.function(
        [index],
        valid_loss,
        givens={
            x:
            valid_set_x[index * minibatch_size:(index + 1) * minibatch_size],
            y:
            valid_set_y[index * minibatch_size:(index + 1) * minibatch_size]
        })

    test_model = theano.function(
        [index],
        valid_predictions,
        givens={
            x:
            test_set_x[(index * minibatch_size):((index + 1) *
                                                 minibatch_size)],
        })

    this_train_loss = 0.0
    this_valid_loss = 0.0
    best_valid_loss = np.inf
    # NOTE(review): best_train_loss is never reassigned below, so the final
    # "Best loss in training" line prints this initial value.
    best_train_loss = np.inf
    best_test_loss = np.inf

    eval_starts = 0
    near_convergence = 1500  # to be set
    eval_multiple = 10
    eval_num = 1000
    # Score buffers start at 1.0; np.min over them is reported per epoch.
    # NOTE(review): they hold eval_num entries -- more evaluations than that
    # would index past the end; confirm against the expected iteration count.
    train_eval_scores = np.ones(eval_num)
    valid_eval_scores = np.ones(eval_num)
    test_eval_scores = np.ones(eval_num)
    cum_iterations = 0
    eval_index = 0

    theta.set_value(theta_mat.astype(np.float32))
    lamda.set_value(lambda_mat.astype(np.float32))

    batch_num = trainX.shape[0] // minibatch_size
    # Momentum is lowered for roughly the last ten epochs.
    near_convergence = batch_num * (num_epochs - 10)

    for i in range(num_epochs):
        x_train, y_train, x_cv, y_cv = shuffle_data(trainX, trainY, testX,
                                                    testY)
        train_batch_num = x_train.shape[
            0] // minibatch_size  #discard last small batch
        valid_batch_num = x_cv.shape[0] // minibatch_size + 1

        start_time = time.time()

        train_set_y.set_value(y_train[:])
        train_set_x.set_value(x_train)
        valid_set_y.set_value(y_cv[:])
        valid_set_x.set_value(x_cv)
        # The "test" inputs are the same x_cv used for validation.
        test_set_x.set_value(x_cv)

        # if(num_epochs % 10 == 0):
        #     learning_rate *= 0.7

        # Iterate over minibatches in each batch
        for mini_index in xrange(train_batch_num):
            this_rate = np.float32(learning_rate *
                                   (rate_decay**cum_iterations))
            this_scale_rate = np.float32(init_scale *
                                         (scale_decay**cum_iterations))
            # adaptive momentum
            this_momentum = 0.99
            if cum_iterations > near_convergence:
                this_momentum = 0.90

            this_train_loss += train_model(mini_index, this_rate,
                                           this_momentum, this_scale_rate)
            cum_iterations += 1
            if np.isnan(this_train_loss):
                print "Training Error!!!!!!!!!"
                return
            # begin evaluation and report loss
            if (cum_iterations % eval_multiple == 0
                    and cum_iterations > eval_starts):
                this_train_loss = this_train_loss / eval_multiple
                this_valid_loss = np.mean(
                    [validate_model(k) for k in xrange(valid_batch_num)])
                predictions = np.concatenate(
                    [test_model(k) for k in xrange(valid_batch_num)])
                this_test_loss = np.mean(np.abs(predictions - y_cv))
                train_eval_scores[eval_index] = this_train_loss
                valid_eval_scores[eval_index] = this_valid_loss
                test_eval_scores[eval_index] = this_test_loss

                # Save model if best validation score
                if (this_valid_loss < best_valid_loss):
                    best_valid_loss = this_valid_loss
                if (this_test_loss < best_test_loss):
                    best_test_loss = this_test_loss
                    #np.savez(model_to_save, LL.get_all_param_values(network_0))

                print("Training Loss:", this_train_loss)
                print("Validation Loss:", this_valid_loss)
                print("Test Loss:", this_test_loss)
                print("Current scale rate:", this_scale_rate)
                eval_index += 1
                # Restart the running sums for the next evaluation window.
                this_train_loss = 0.0
                this_valid_loss = 0.0

        end_time = time.time()
        print("Computing time for epoch %d: %f" % (i, end_time - start_time))
        cur_train_loss = np.min(train_eval_scores)
        cur_valid_loss = np.min(valid_eval_scores)
        cur_test_loss = np.min(test_eval_scores)
        print(
            "The best training loss in epoch!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! %f"
            % cur_train_loss)
        print(
            "The best validation loss in epoch!!!!!!!!!!!!!!!!!!!!!!!!!!!!! : %f"
            % cur_valid_loss)
        print("The best test loss in epoch!!!!!!!!!!!!!!!!!!!!!!!!!!!!! : %f" %
              cur_test_loss)

    print("Best loss in training: %f" % best_train_loss)
    print("Best loss in cross-validation: %f" % best_valid_loss)
    print("Best loss in testing: %f" % best_test_loss)
    # Drop local references to the large arrays before returning.
    del train_set_x, train_set_y, valid_set_x, valid_set_y, trainX, trainY
    gc.collect()
numUnits, True, nonlinearity=None) transLayerMatricies = transLayer.W.get_value() transLayerSharedMatrix = transLayer.W_Shared.get_value() transLayerMatricies[0] = np.zeros((fShape[0] * fShape[1], numUnits)) transLayerMatricies[1] = -1 * np.eye(fShape[0] * fShape[1]) transLayerMatricies[2] = 2 * np.eye(fShape[0] * fShape[1]) transLayerSharedMatrix = 1 * np.eye(fShape[0] * fShape[1]) transLayer.W.set_value(transLayerMatricies) transLayer.W_Shared.set_value(transLayerSharedMatrix) x = T.tensor3() output = lasagne.layers.get_output(transLayer, x) f = theano.function([x], output) tasks = [0, 1, 2, 3] transLayer.setTaskIndices(tasks) testValues = np.random.randn(batch_size, fShape[0], fShape[1]) print "Task matrix indicies: " print tasks print "Task matricies:" print transLayer.W.get_value() print "ShareMatrix:" print transLayer.W_Shared.get_value()
scale * numpy.random.uniform(-1.0, 1.0, (nClasses, 1)).astype(theano.config.floatX), 'Sb') #eps = theano.shared(scale * numpy.ones(1).astype(theano.config.floatX), 'eps') * 0.0001 # bundle params = [h0, Wr, Ur, br, Wz, Uz, bz, W, U, b, S, Sb] # Adagrad shared variables hists = {} for param in params: hists[param.name + 'Hist'] = theano.shared( numpy.zeros_like(param.get_value())) x = T.tensor3('x') expected = T.matrix('expected') def recurrence(x_t, h_tm1): # reset gate r_t = T.nnet.sigmoid(T.dot(Wr, x_t) + T.dot(Ur, h_tm1) + br) # update gate z_t = T.nnet.sigmoid(T.dot(Wz, x_t) + T.dot(Uz, h_tm1) + bz) # proposed hidden state _h_t = T.tanh(T.dot(W, x_t) + T.dot(U, r_t * h_tm1) + b) # actual hidden state h_t = z_t * h_tm1 + (1 - z_t) * _h_t return h_t
test_m /= mmm test_f /= mmm test_x /= mmm ####### n_features = train_m.shape[1] # this time they are 512 if tpe is 0 or 2: nonlin = sigmoid if tpe is 1: nonlin = sigmoid max_len = 50 NUM_UNITS_ENC = 1000 NUM_UNITS_DEC = 1000 x_sym = T.tensor3() mask_x_sym = T.matrix() m_sym = T.tensor3() f_sym = T.tensor3() mask_m_sym = T.tensor3() mask_f_sym = T.tensor3() n_sym = T.tensor3() mask_n_sym = T.tensor3() l_in = lasagne.layers.InputLayer(shape=(None, max_len, n_features)) l_dec_fwd = lasagne.layers.GRULayer(l_in, num_units=NUM_UNITS_DEC, name='GRUDecoder', backwards=False) l_dec_bwd = lasagne.layers.GRULayer(l_in,
from scipy.spatial.distance import cdist
# Scratch script: prints a SciPy pairwise-distance reference and builds a
# symbolic pairwise-distance expression with Keras backend ops on Theano
# tensors.
# NOTE(review): rng, d, nX and nY are not referenced by any statement in this
# fragment; x and y are drawn from the global np.random state, so the seed
# below does not make them reproducible -- confirm intent.
rng = np.random.RandomState(42)
d = 20  # dimension
nX = 10
nY = 30
x = np.random.rand(1, 121, 52).astype('float32')
y = np.random.rand(1, 200, 52).astype('float32')
#x = np.asarray([[ [1,2,3], [1,2,1] ]]).astype('float32')
#y = np.asarray([[ [1,2,3], [1,2,3], [1,2,3] ]]).astype('float32')
# Reference result for the single batch element.
print cdist(x[0], y[0])
X = T.tensor3('X', dtype='float32')
Y = T.tensor3('Y', dtype='float32')
x_square = K.square(X)
y_square = K.square(Y)
# Per-row squared norms, repeated so they can be combined with the
# dot-product term below.
x_sq_sum = K.repeat(K.sum(x_square, axis=-1), n=y_square.shape[1])
y_sq_sum = K.repeat(K.sum(y_square, axis=-1), n=x_square.shape[1])
dot = K.batch_dot(X, K.permute_dimensions(Y, (0, 2, 1)), axes=(2, 1))
# NOTE(review): despite its name this holds the square ROOT of
# (|x|^2 + |y|^2 - 2 x.y); the argument of sqrt is not clipped at zero.
squared_euclidean_distances = K.sqrt(
    K.permute_dimensions(x_sq_sum, (0, 2, 1)) + y_sq_sum - 2 * dot)
# Only the two repeated-norm terms are compiled here, not the distance itself.
f_x = theano.function([X, Y], x_sq_sum)
f_y = theano.function([Y, X], y_sq_sum)
updates = [] for p, g in zip(params, grads): acc = theano.shared(p.get_value() * 0.) acc_new = rho * acc + (1 - rho) * g ** 2 gradient_scaling = T.sqrt(acc_new + epsilon) g = g / gradient_scaling updates.append((acc, acc_new)) updates.append((p, p - T.clip(lr * g, -0.01, 0.01))) return updates inp = T.matrix() # batchsize x imgsize ** 2 randn = T.tensor3() # timestep x batchsize x latent_vector_size enc = LSTM(784 + 784 + 784, 256) dec = LSTM(256, 784) enc_to_mean = init_weights([256, 256]) enc_to_variance = init_weights([256, 256]) dec_to_write = init_weights([784, 784]) def encoder(canvas, decoder_hidden_1, encoder_hidden_1, encoder_cell_1): error = inp - T.nnet.sigmoid(canvas) read_vec = T.concatenate([inp, error, decoder_hidden_1], axis = 1) enc_hidden, enc_cell = enc.recurrence(read_vec, encoder_hidden_1, encoder_cell_1)
def test_machine_translation(self):
    """
    Check that gradients through a machine-translation style scan are the
    same with and without the "scanOp_pushout_output" optimization.

    The computation comes from https://github.com/rizar/scan-grad-speed;
    'dim' has been reduced from 1000 to 5 to make the test run faster.
    """
    # Parameters from an actual machine translation run
    batch_size = 80
    seq_len = 50
    n_words = 80 * 50
    dim = 5

    # Weight matrices: V and W start from the same values as U.
    U = theano.shared(
        numpy.random.normal(size=(dim, dim),
                            scale=0.0001).astype(config.floatX))
    U.name = 'U'
    V = theano.shared(U.get_value())
    V.name = 'V'
    W = theano.shared(U.get_value())
    W.name = 'W'

    # Symbolic sequences; all three are fed the same numeric array.
    x = T.tensor3('x')
    x_value = numpy.random.normal(size=(seq_len, batch_size, dim),
                                  scale=0.0001).astype(config.floatX)
    ri = T.tensor3('ri')
    zi = T.tensor3('zi')
    ri_value = x_value
    zi_value = x_value

    init = T.alloc(numpy.cast[config.floatX](0), batch_size, dim)

    def rnn_step1(x_t, ri_t, zi_t, h_prev):
        # Arguments: the three sequence slices, then the previous state.
        r = T.nnet.sigmoid(ri_t + h_prev.dot(U))
        z = T.nnet.sigmoid(zi_t + h_prev.dot(V))
        candidate = T.tanh(x_t + (r * h_prev).dot(W))
        return z * candidate + (1 - z) * h_prev

    def compile_grad(run_mode):
        # Build the scan, take the gradient of the last state's sum with
        # respect to the weights, and compile it under `run_mode`.
        states, _ = theano.scan(rnn_step1,
                                sequences=[x, ri, zi],
                                n_steps=seq_len,
                                outputs_info=init,
                                name='fpass1',
                                mode=run_mode)
        final_cost = states[-1].sum()
        weight_grads = T.grad(final_cost, [U, V, W])
        return theano.function(inputs=[x, ri, zi],
                               outputs=weight_grads,
                               mode=run_mode)

    # Compile the function twice, once with the optimization and once
    # without
    f_opt = compile_grad(mode.including("scan"))
    f_no_opt = compile_grad(mode.excluding("scanOp_pushout_output"))

    # Validate that the optimization has been applied
    scan_nodes = [
        node for node in f_opt.maker.fgraph.toposort()
        if isinstance(node.op, Scan)
    ]
    scan_node_grad = scan_nodes[1]

    for output in scan_node_grad.op.outputs:
        if not isinstance(output.owner.op, T.elemwise.Elemwise):
            continue
        assert not any([isinstance(i, T.Dot) for i in output.owner.inputs])

    # Compare the outputs of the two functions on the same input data.
    f_opt_output = f_opt(x_value, ri_value, zi_value)
    f_no_opt_output = f_no_opt(x_value, ri_value, zi_value)
    utt.assert_allclose(f_opt_output, f_no_opt_output)
def main():
    """Manual debugging entry point for the triplet LSTM model.

    Builds the symbolic inputs, the ``SiameseTripletBatchLSTM`` model and a
    stand-alone ``BatchMultiLayerLSTM``, compiles functions that read
    batches from shared data by index, then drops into ``pdb``.
    """
    x1, x2, x3 = [
        tensor.tensor3(label, dtype=THEANOTYPE)
        for label in ("x1", "x2", "x3")
    ]
    x1_indices, x2_indices, x3_indices = [
        tensor.ivector(label)
        for label in ("x1_indices", "x2_indices", "x3_indices")
    ]
    m1, m2, m3 = [
        tensor.matrix(label, dtype=THEANOTYPE)
        for label in ("m1", "m2", "m3")
    ]

    rng = numpy.random.RandomState(0)
    n_data = 1000
    max_sequence_length = 50
    n_dim = 5
    n_hiddens = [10, 10]

    model = SiameseTripletBatchLSTM(rng, x1, x2, x3, m1, m2, m3,
                                    n_in=n_dim, n_hiddens=n_hiddens)

    # Random data and masks held in shared variables.
    xs = theano.shared(
        rng.randn(n_data, max_sequence_length, n_dim).astype(THEANOTYPE))
    masks = theano.shared(
        rng.randn(n_data, max_sequence_length).astype(THEANOTYPE))
    xs_numpy = xs.get_value()
    masks_numpy = masks.get_value()

    x1_lstms = lstm.BatchMultiLayerLSTM(rng, x1, m1, n_dim,
                                        n_hiddens=n_hiddens,
                                        output_type="last",
                                        prefix="lstms_x1")
    f1 = theano.function(inputs=[x1, m1], outputs=x1_lstms.output)

    small_n_data = 10
    sequence_lengths = [5, 10, 15, 20]
    # One (small_n_data, n_dim) array per entry of sequence_lengths.
    xs0 = [
        rng.randn(small_n_data, n_dim).astype(THEANOTYPE)
        for _ in sequence_lengths
    ]
    xs_arr0, mask = lstm.batchify(xs0)

    def indexed_givens(x_var, m_var, idx_var):
        # Replace an (input, mask) pair by rows selected from the shared data.
        return {
            x_var: xs[idx_var].swapaxes(0, 1),
            m_var: masks[idx_var],
        }

    f1_ind = theano.function(inputs=[x1_indices],
                             outputs=x1_lstms.output,
                             givens=indexed_givens(x1, m1, x1_indices))

    triplet_givens = {}
    for x_var, m_var, idx_var in ((x1, m1, x1_indices),
                                  (x2, m2, x2_indices),
                                  (x3, m3, x3_indices)):
        triplet_givens.update(indexed_givens(x_var, m_var, idx_var))

    fbatch = theano.function(inputs=[x1_indices, x2_indices, x3_indices],
                             outputs=[
                                 model.x1_lstms.output,
                                 model.x2_lstms.output,
                                 model.x3_lstms.output
                             ],
                             givens=triplet_givens)

    ind1 = numpy.asarray([1, 2], dtype=numpy.int32)
    ind2 = numpy.asarray([1, 2], dtype=numpy.int32)
    ind3 = numpy.asarray([1, 2], dtype=numpy.int32)

    import pdb
    pdb.set_trace()
def _run(self, num_features, num_timesteps, batch_size, mode):
    """Compile and execute one cost / Gauss-Newton-vector graph for a tiny RNN.

    Random inputs and targets are drawn from `self.rng`, a one-layer tanh RNN
    with a linear read-out is unrolled with `theano.scan`, and a Theano
    function returning the cloned cost together with the Gauss-Newton product
    is compiled under `mode` and called once.  Nothing is returned; the call
    succeeding is the whole point.

    Parameters
    ----------
    num_features : int
        Size of each input frame.
    num_timesteps : int
        Length of the sequence.
    batch_size : int
        When equal to 1 the data are 2-D (time, features); otherwise they are
        3-D (time, batch, features).
    mode :
        Passed unchanged to `theano.function(..., mode=mode)`.
    """
    # Width of the recurrent layer; every parameter shape below depends on it.
    n_hidden = 10

    # determine shapes of inputs and targets depending on the batch size
    if batch_size == 1:
        inputs_size = (num_timesteps, num_features)
        targets_size = (num_timesteps, 1)
    else:
        inputs_size = (num_timesteps, batch_size, num_features)
        targets_size = (num_timesteps, batch_size, 1)

    # make inputs and targets shared variables
    inputs = theano.shared(
        self.rng.uniform(size=inputs_size).astype(config.floatX),
        borrow=True)
    targets = theano.shared(
        self.rng.uniform(size=targets_size).astype(config.floatX),
        borrow=True)

    # create symbolic inputs and targets variables
    if batch_size == 1:
        x = T.matrix('inputs')
        t = T.matrix('targets')
    else:
        x = T.tensor3('inputs')
        # Fixed: this variable used to be named 'inputs' as well, which made
        # the two graph inputs indistinguishable in debug output and
        # disagreed with the unbatched branch above.
        t = T.tensor3('targets')
    x.tag.test_value = inputs.get_value(borrow=True)
    t.tag.test_value = targets.get_value(borrow=True)

    # create a set of parameters for a simple RNN
    W_xh = theano.shared(
        (0.01 * self.rng.uniform(size=(num_features, n_hidden))).astype(
            config.floatX),
        borrow=True)
    W_hh = theano.shared(
        (0.01 * self.rng.uniform(size=(n_hidden, n_hidden))).astype(
            config.floatX),
        borrow=True)
    W_hy = theano.shared(
        (0.01 * self.rng.uniform(size=(n_hidden, 1))).astype(config.floatX),
        borrow=True)
    b_h = theano.shared(numpy.zeros(n_hidden).astype(config.floatX),
                        borrow=True)
    b_y = theano.shared(numpy.zeros(1).astype(config.floatX), borrow=True)

    params = [W_xh, W_hh, W_hy, b_h, b_y]

    # recurrent function
    def step(x_t, h_tm1):
        h = T.tanh(T.dot(h_tm1, W_hh) + T.dot(x_t, W_xh) + b_h)
        return h

    # build recurrent graph
    if batch_size == 1:
        h_0 = T.alloc(0.0, n_hidden).astype(config.floatX)
    else:
        h_0 = T.alloc(0.0, batch_size, n_hidden).astype(config.floatX)
    h, updates = theano.scan(step, sequences=[x], outputs_info=[h_0])

    # network output
    y = T.dot(h, W_hy) + b_y

    # Create Gauss-Newton-Matrix object. Not really of any use here, but I
    # need it for Hessian-Free optimization.
    gn = GaussNewtonMatrix(y)

    # compute MSE
    cost = ((t - y)**2).sum(axis=1).mean()

    # Compute the cost at some other point in the parameter space. Not
    # really of any use here, but this is how I do it during certain
    # iterations of CG in the HF algorithm. There, it's in fact
    # `pi + current update proposal`. For simplicity, I just multiply by 2
    # here.
    cost_ = theano.clone(cost, replace={pi: 2 * pi for pi in params})

    # Compute Gauss-Newton-Matrix times some vector `v` which is `p` in CG,
    # but for simplicity, I just take the parameters vector because it's
    # already there.
    Gv = gn(v=params, cost=cost, parameters=params, damp=T.constant(1.0))

    # compile Theano function
    f = theano.function([], [cost_] + Gv,
                        givens={x: inputs, t: targets},
                        mode=mode)

    # execute
    f()
def __init__(self, rng, input_x1, input_x2, input_x3, input_m1, input_m2,
             input_m3, n_in, n_hiddens, output_type="last", srng=None,
             dropout=0.0):
    """
    Build one reference LSTM stack and three branches tied to its weights.

    A `lstm.BatchMultiLayerLSTM` is first created on freshly made symbolic
    variables ("x" and "m"); its `parameters` are then handed to three more
    stacks, one per input of the triplet.

    Parameters
    ----------
    rng
        Forwarded to every `lstm.BatchMultiLayerLSTM` constructor.
    input_x1, input_x2, input_x3
        Symbolic inputs of the three branches.  They are passed in the same
        position as the 3-D reference input -- presumably 3-D tensors too;
        confirm against the callers.
    input_m1, input_m2, input_m3
        Symbolic masks that accompany the three inputs.
    n_in
        Forwarded to every stack; stored as `self.n_in`.
    n_hiddens
        Sequence forwarded to every stack; its length becomes
        `self.n_layers`.
    output_type, srng, dropout
        Forwarded unchanged to every stack.

    Notes
    -----
    `self.parameters`, `self.l2` and `self.output` all refer to the
    reference stack, not to any of the three branches.
    """
    # Reference inputs for the stack that owns the shared parameters.
    ref_x = T.tensor3("x", dtype=THEANOTYPE)
    ref_m = T.matrix("m", dtype=THEANOTYPE)

    self.input = ref_x
    self.mask = ref_m
    self.n_in = n_in
    self.n_hiddens = n_hiddens
    self.n_layers = len(self.n_hiddens)

    self.lstms = lstm.BatchMultiLayerLSTM(
        rng, ref_x, ref_m, n_in, n_hiddens, output_type=output_type,
        prefix="lstms", srng=srng, dropout=dropout)
    self.dropout = dropout

    def tied_branch(branch_x, branch_m, prefix):
        # Same architecture as the reference stack, reusing its parameters.
        return lstm.BatchMultiLayerLSTM(
            rng, branch_x, branch_m, n_in, n_hiddens,
            parameters=self.lstms.parameters, output_type=output_type,
            prefix=prefix, srng=srng, dropout=dropout)

    self.x1_lstms = tied_branch(input_x1, input_m1, "lstms_x1")
    self.x2_lstms = tied_branch(input_x2, input_m2, "lstms_x2")
    self.x3_lstms = tied_branch(input_x3, input_m3, "lstms_x3")

    self.parameters = self.lstms.parameters
    self.l2 = self.lstms.l2
    self.output = self.lstms.output
def defineGCN(params, nodeNames, nodeList, edgeList, edgeListComplete,
              edgeFeatures, nodeFeatureLength, nodeToEdgeConnections, new_idx,
              featureRange, adjacency):
    """Assemble the per-node / per-edge layer stacks and return a `GCNN`.

    For every name in `nodeNames` a node stack, two edge stacks (suffixes
    '_temporal' and '_normal') and a final stack are created; which layers go
    into them depends on `params.test`.  A list of graph-convolution layers
    is then built and everything is passed to `GCNN`.

    `params.gcnType` (0, 1 or 2) selects which `GraphConvolution` class is
    imported.

    NOTE(review): `edgeList` and `nodeFeatureLength` are not used, and the
    incoming `edgeListComplete` is ignored -- a fresh list is built here and
    passed on instead.
    NOTE(review): for any `gcnType` other than 0/1/2 no class is imported, so
    `GraphConvolution` is unbound when the graph layers are built.
    NOTE(review): `rng` is not a parameter; it is looked up outside this
    function.
    """
    gradient_method = Momentum(momentum=params.momentum)

    # Pick the graph-convolution implementation.
    if params.gcnType == 0:
        print("-------")
        from neuralmodels.layers.GraphConvolution import GraphConvolution
    elif params.gcnType == 1:
        print("=======")
        from neuralmodels.layers.GraphConvolution_temporal import GraphConvolution_t as GraphConvolution
    elif params.gcnType == 2:
        print("########")
        from neuralmodels.layers.GraphConvolution_temporal_pairwise import GraphConvolution_tp as GraphConvolution

    test_mode = (params.test == 1)

    edgeRNNs = {}
    nodeRNNs = {}
    finalLayer = {}
    nodeLabels = {}
    edge_names = []

    def hidden_fc(activation):
        # Fully connected layer of width params.fc_size.
        return FCLayer(activation, params.fc_init, size=params.fc_size,
                       rng=rng)

    def add_edge(node, suffix, with_linear):
        # Register edge "<node><suffix>" and create its layer stack.
        edge = node + suffix
        edge_names.append(edge)
        stack = [TemporalInputFeatures(edgeFeatures[edge]),
                 hidden_fc('rectify')]
        if with_linear:
            stack.append(hidden_fc('linear'))
        edgeRNNs[edge] = stack

    for nm in nodeNames:
        num_classes = nodeList[nm]
        if test_mode:
            nodeRNNs[nm] = [
                FCLayer('linear', params.fc_init, size=1, rng=rng)
            ]
            add_edge(nm, '_temporal', False)
            add_edge(nm, '_normal', False)
            finalLayer[nm] = [
                FCLayer_out('rectify', params.fc_init, size=params.fc_size,
                            rng=rng, flag=1),
                FCLayer('linear', params.fc_init, size=num_classes, rng=rng),
            ]
        else:
            # NOTE(review): this LSTM is built but not placed in any stack
            # (the line that used it is commented out below).  It is kept
            # because its constructor receives `rng`.
            LSTMs = [
                LSTM('tanh', 'sigmoid', params.lstm_init,
                     truncate_gradient=params.truncate_gradient,
                     size=params.node_lstm_size, rng=rng,
                     g_low=-params.g_clip, g_high=params.g_clip)
            ]
            nodeRNNs[nm] = [
                #multilayerLSTM(LSTMs, skip_input=True,skip_output=True, input_output_fused=True),
                hidden_fc('rectify'),
                hidden_fc('linear'),
            ]
            add_edge(nm, '_temporal', True)
            add_edge(nm, '_normal', True)
            # Label placeholders are only created on this branch.
            nodeLabels[nm] = T.tensor3(dtype=theano.config.floatX)
            finalLayer[nm] = [
                FCLayer_out('rectify', params.fc_init, size=params.fc_size,
                            rng=rng, flag=1),
                FCLayer('rectify', params.fc_init, size=100, rng=rng),
                FCLayer('linear', params.fc_init, size=num_classes, rng=rng),
            ]

    if test_mode:
        graphLayers = [
            GraphConvolution(params.fc_size, adjacency),
            AddNoiseToInput(rng=rng, dropout_noise=True),
            AddNoiseToInput(rng=rng, dropout=True),
        ]
    else:
        # Six default graph convolutions followed by one with a linear
        # activation; the noise layers that used to sit between them are
        # disabled.
        graphLayers = [
            GraphConvolution(params.fc_size, adjacency) for _ in range(6)
        ]
        graphLayers.append(
            GraphConvolution(params.fc_size, adjacency,
                             activation_str='linear'))

    # ---------------------------------------------------------------------
    learning_rate = T.scalar(dtype=theano.config.floatX)
    learning_rate.tag.test_value = 1.0

    gcnn = GCNN(params, graphLayers, finalLayer, nodeNames, edgeRNNs,
                nodeRNNs, nodeToEdgeConnections, edge_names, euclidean_loss,
                nodeLabels, learning_rate, new_idx, featureRange,
                clipnorm=params.clipnorm, update_type=gradient_method,
                weight_decay=params.weight_decay)
    return gcnn
def __init__(self, rng, input_x1, input_x2, input_x3, input_m1, input_m2,
             input_m3, input_shape, filter_shape, n_lstm_hiddens, n_outputs,
             prefix="triplet_convlstm", output_type="max",
             truncate_gradient=-1, srng=None, dropout=0.0,
             use_dropout_regularization=False, stabilize_activations=None):
    """
    Build a reference ConvLSTM and three branches tied to its weights.

    A `lstm.BatchMultiLayerConvLSTM` is created on fresh symbolic variables
    ("x", "m"); three more are then created on the triplet inputs, each
    receiving the reference model's `V`, `parameters[1:]`, `out_W` and
    `out_b`.

    Parameters
    ----------
    rng
        Forwarded to every `lstm.BatchMultiLayerConvLSTM` constructor.
    input_x1, input_x2, input_x3
        Symbolic inputs of the three branches.
    input_m1, input_m2, input_m3
        Symbolic masks that accompany the three inputs.
    input_shape, filter_shape, n_lstm_hiddens
        Forwarded to every model and stored on `self`.
    n_outputs
        Given to the reference model; the branches use the value the
        reference model reports back as `n_outputs`.
    prefix
        Name prefix; the models are named "<prefix>_lstm",
        "<prefix>_lstm1", "<prefix>_lstm2" and "<prefix>_lstm3".
    output_type, truncate_gradient, srng, dropout,
    use_dropout_regularization, stabilize_activations
        Stored on `self` and forwarded unchanged to every model.

    Notes
    -----
    `self.parameters`, `self.l2` and `self.output` refer to the reference
    model.  Presumably `parameters[0]` is `V`, which is why the branches get
    `parameters[1:]` plus `V` separately -- confirm in `lstm`.
    """
    # Plain attribute bookkeeping.
    self.use_dropout_regularization = use_dropout_regularization
    self.output_type = output_type
    self.input_shape = input_shape
    self.filter_shape = filter_shape
    self.n_lstm_hiddens = n_lstm_hiddens
    self.prefix = prefix
    self.srng = srng
    self.dropout = dropout
    self.truncate_gradient = truncate_gradient
    self.stabilize_activations = stabilize_activations

    # Symbolic inputs of the model that owns the shared parameters.
    ref_x = T.tensor3("x", dtype=THEANOTYPE)
    ref_m = T.matrix("m", dtype=THEANOTYPE)
    self.input = ref_x
    self.mask = ref_m

    # Keyword arguments common to the reference model and all branches.
    common = dict(
        output_type=self.output_type,
        truncate_gradient=self.truncate_gradient,
        srng=self.srng,
        dropout=self.dropout,
        use_dropout_regularization=self.use_dropout_regularization,
        stabilize_activations=self.stabilize_activations)

    self.model = lstm.BatchMultiLayerConvLSTM(
        rng, ref_x, ref_m, input_shape, filter_shape, n_lstm_hiddens,
        n_outputs=n_outputs, prefix="%s_lstm" % self.prefix, **common)
    self.n_outputs = self.model.n_outputs

    def tied_branch(branch_x, branch_m, tag):
        # Same architecture as self.model, reusing all of its weights.
        return lstm.BatchMultiLayerConvLSTM(
            rng, branch_x, branch_m, input_shape, filter_shape,
            n_lstm_hiddens,
            n_outputs=self.n_outputs,
            V=self.model.V,
            parameters=self.model.parameters[1:],
            prefix="%s_lstm%d" % (self.prefix, tag),
            out_W=self.model.out_W,
            out_b=self.model.out_b,
            **common)

    self.x1_model = tied_branch(input_x1, input_m1, 1)
    self.x2_model = tied_branch(input_x2, input_m2, 2)
    self.x3_model = tied_branch(input_x3, input_m3, 3)

    self.parameters = self.model.parameters
    self.l2 = self.model.l2
    self.output = self.model.output
def build_model_core(self):
    """Create missing weights and build the symbolic LSTM graphs.

    Steps, in order:

    1. Default values are prepared for every weight that is not already an
       attribute of `self` and handed to `self.build_shared_layers`.
    2. The symbolic inputs are created (`self.x`, `self.v`, `self.xlen`,
       `self.v_single`, `self.nstep`, `self.x_drop`, `self.y_drop`,
       `self.forced_word`).
    3. Three scans over the same LSTM step are built: teacher-forced
       (`self.hh_out`, `self.cc_out` and the output scores), semi-forced
       (used for the scores when `conf['SEMI_FORCED'] < 1`) and un-forced
       (`self.wout_fb`).
    4. The loss input `self.new_s`, `self.softmax_out` and the perplexity
       expressions are derived.
    5. A single-step function is compiled: `self.one_step` when
       `conf['DECODER']` is set, `self.start_step` otherwise.

    Fixes relative to the previous version:

    * batch normalisation now scales the normalised state (the normalised
      value used to be computed and then discarded);
    * the output projection no longer fails with an unbound `hh_d` when
      `conf['DROP_OUTPUT']` is false;
    * the constant fed back as "use visual input" in the un-forced loop is
      taken from a length-1 tensor instead of indexing an empty one.
    """
    conf = self.conf
    floatX = theano.config.floatX

    # gradient clipping function
    self.clipg = lambda x: grad_clip(
        x, -self.conf['GRAD_CLIP_SIZE'], self.conf['GRAD_CLIP_SIZE'])

    # ------------------------------------------------------------------
    # Default values for every weight that has not been set yet.
    # ------------------------------------------------------------------
    shared_layers = {}

    if conf['BATCH_NORM']:
        # Scale / shift over the concatenated [input, hidden] state.
        # NOTE(review): the size 2 * lstm_hidden_size presumes
        # emb_size == lstm_hidden_size -- confirm.
        if not hasattr(self, 'gamma_h'):
            shared_layers['gamma_h'] = np.ones(
                (conf['lstm_hidden_size'] * 2,), dtype=floatX)
        if not hasattr(self, 'beta_h'):
            shared_layers['beta_h'] = np.zeros(
                (conf['lstm_hidden_size'] * 2,), dtype=floatX)

    # set the default network weights
    if not hasattr(self, 'wemb'):
        shared_layers['wemb'] = init_layer_k(
            conf['vocab_size'], conf['emb_size'])
    if not hasattr(self, 'h0_hidden'):
        shared_layers['h0_hidden'] = np.zeros(
            (conf['lstm_hidden_size'],), dtype=floatX)
    if not hasattr(self, 'h0_cell'):
        shared_layers['h0_cell'] = np.zeros(
            (conf['lstm_hidden_size'],), dtype=floatX)

    # mapping from visual space to word space
    if not hasattr(self, 'wvm'):
        shared_layers['wvm'] = init_layer_k(
            conf['visual_size'], conf['emb_size'])
    if not hasattr(self, 'bmv'):
        shared_layers['bmv'] = np.zeros((conf['emb_size'],), dtype=floatX)

    # LSTM layer parameters
    if not hasattr(self, 'w_lstm'):
        shared_layers['w_lstm'] = init_layer_k(
            conf['lstm_hidden_size'] * 2, conf['lstm_hidden_size'] * 4)

    # mapping from RNN hidden output to vocabulary
    if not hasattr(self, 'w'):
        shared_layers['w'] = init_layer_k(
            conf['lstm_hidden_size'], conf['output_size'])
    if not hasattr(self, 'b'):
        b_val = np.zeros((conf['output_size'],), dtype=floatX)
        if conf["INIT_OUTPUT_BIAS"]:
            # set the bias on the last layer to be the log prob of each of
            # the words in the vocab
            w2i = self.dp.w2i
            w2c = self.dp.get_word_counts(RNNDataProvider.TRAIN)
            n_sentences = self.X_train.shape[0]
            # One stop token per training sentence is added to the total.
            wcount = sum(w2c[w] for w in w2i if w in w2c) + n_sentences
            b_val[w2i[RNNDataProvider.STOP_TOKEN]] = np.log(
                n_sentences / float(wcount))
            for w in w2i:
                if w in w2c:
                    b_val[w2i[w]] = np.log(w2c[w] / float(wcount))
            # Shift so that the largest entry from index 1 on becomes zero.
            b_val -= np.max(b_val[1:])
        shared_layers['b'] = b_val

    self.build_shared_layers(shared_layers)

    # ------------------------------------------------------------------
    # Symbolic inputs.
    # ------------------------------------------------------------------
    # input variables for training
    self.x = T.imatrix("x")
    self.v = T.matrix("v")
    self.xlen = T.matrix("xlen")

    # input variables for generation
    self.v_single = T.vector("v")
    self.nstep = T.iscalar("nstep")

    # the dropout masks
    self.x_drop = T.tensor3("x_drop")  # drop the input
    self.y_drop = T.tensor3("y_drop")  # drop the output

    self.forced_word = T.imatrix("forced_word")

    v_i = T.vector("v")  # visual information

    # ------------------------------------------------------------------
    # Step functions.
    # ------------------------------------------------------------------
    # Generates the next word based on the: previous true word, hidden
    # state & visual features
    def recurrance(word_t, x_drop_slice, hh_drop_slice, use_v,
                   h_tm1_hidden, h_tm1_cell, v_i):
        # get the word embedding matrix or the context information
        if self.conf['DECODER']:
            x_t = ifelse(T.eq(use_v, 1),
                         T.dot(v_i, self.wvm) + self.bmv,
                         self.wemb[word_t])
        else:
            x_t = ifelse(T.eq(use_v, 1),
                         T.zeros_like(self.wemb[word_t]),
                         self.wemb[word_t])

        # if we are not doing minibatch training
        if word_t.ndim == 0:
            x_t = x_t.reshape((1, x_t.shape[0]))
            h_tm1_hidden = h_tm1_hidden.reshape((1, h_tm1_hidden.shape[0]))
            h_tm1_cell = h_tm1_cell.reshape((1, h_tm1_cell.shape[0]))

        # dropout on the input embeddings
        if self.conf['DROP_INPUT']:
            x_t *= x_drop_slice

        # clip the gradients so they dont get too large
        h_tm1_hidden_clip = self.clipg(h_tm1_hidden)

        in_state = T.concatenate([x_t, h_tm1_hidden_clip], axis=1)

        if self.conf['BATCH_NORM']:
            mu = T.mean(in_state, axis=0, keepdims=True)
            var = T.var(in_state, axis=0, keepdims=True)
            normed_is = (in_state - mu) / T.sqrt(
                var + T.constant(1e-10, dtype=theano.config.floatX))
            # Fixed: scale and shift the *normalised* state.
            in_state = self.gamma_h * normed_is + self.beta_h

        # calculate 8 dot products in one go
        dot_out = T.dot(in_state, self.w_lstm)

        lstm_hidden_size = self.conf['lstm_hidden_size']
        # input gate
        ig = T.nnet.sigmoid(dot_out[:, :lstm_hidden_size])
        # forget gate
        fg = T.nnet.sigmoid(
            dot_out[:, lstm_hidden_size:lstm_hidden_size * 2])
        # output gate
        og = T.nnet.sigmoid(
            dot_out[:, lstm_hidden_size * 2:lstm_hidden_size * 3])

        # cell memory
        cc = fg * h_tm1_cell + ig * T.tanh(
            dot_out[:, lstm_hidden_size * 3:])

        # hidden state
        hh = og * cc

        # drop the output state
        # Fixed: fall back to the undropped state so that `hh_d` is always
        # bound when the output projection is built.
        if self.conf['DROP_OUTPUT']:
            hh_d = hh * hh_drop_slice
        else:
            hh_d = hh

        # the distribution over output words
        if self.conf['SOFTMAX_OUT']:
            s_t = T.nnet.softmax(T.dot(hh_d, self.w) + self.b)
        else:
            s_t = T.nnet.sigmoid(T.dot(hh_d, self.w) + self.b)

        if not self.conf['DECODER']:
            # Where the word index is 0 and no visual input is used, carry
            # the previous hidden / cell state through unchanged.
            keep_idx = T.and_(T.eq(word_t, 0), T.eq(use_v, 0))
            if word_t.ndim != 0:
                keep_idx = keep_idx.dimshuffle((0, 'x'))
            hh_ret = keep_idx * h_tm1_hidden + (1 - keep_idx) * hh
            cc_ret = keep_idx * h_tm1_cell + (1 - keep_idx) * cc
        else:
            hh_ret = hh
            cc_ret = cc

        # if we are not doing minibatch training
        if word_t.ndim == 0:
            hh_ret = hh_ret[0]
            cc_ret = cc_ret[0]

        return [hh_ret, cc_ret, s_t]

    # Generates the next word by feeding the old word as input
    def recurrance_word_feedback(h_tm1_hidden, h_tm1_cell, word_t,
                                 use_visual, v_i):
        # No dropout at generation time: masks of ones.
        x_drop_val = T.ones(
            (self.conf['emb_size'],), dtype=theano.config.floatX)
        y_drop_val = T.ones(
            (self.conf['lstm_hidden_size'],), dtype=theano.config.floatX)
        [hh, cc, s_t] = recurrance(
            word_t, x_drop_val, y_drop_val, use_visual,
            h_tm1_hidden, h_tm1_cell, v_i)

        # the predicted word
        w_idx = T.cast(T.argmax(s_t, axis=1), dtype='int32')[0]

        # The last entry is the next step's `use_visual`: a scalar zero.
        # Fixed: taken from a length-1 tensor, mirroring the initial value
        # T.ones((1,))[0] below; the old code indexed an empty tensor.
        return [hh, cc, s_t[0], w_idx, T.zeros((1,), dtype='int32')[0]]

    def recurrance_partial_word_feedback(word_t_real, x_drop_val,
                                         y_drop_val, use_visual,
                                         forced_word, h_tm1_hidden,
                                         h_tm1_cell, word_t_pred, v_i):
        # Use the true word where forced_word is set, the model's own
        # previous prediction elsewhere.
        word_last = T.switch(forced_word, word_t_real, word_t_pred)
        [hh, cc, s_t] = recurrance(
            word_last, x_drop_val, y_drop_val, use_visual,
            h_tm1_hidden, h_tm1_cell, v_i)

        # the predicted word
        w_idx = T.cast(T.argmax(s_t, axis=1), dtype='int32')

        return [hh, cc, s_t, w_idx]

    # ------------------------------------------------------------------
    # The three loops.
    # ------------------------------------------------------------------
    n_steps = conf['MAX_SENTENCE_LEN'] + 1

    # Visual input is used on the first step only.
    use_visual_info = T.concatenate(
        [T.ones((1,), dtype=np.int32),
         T.zeros((conf['MAX_SENTENCE_LEN'],), dtype=np.int32)])

    # Initial states tiled to one row per sentence in the batch.
    h0_hidden_matrix = self.h0_hidden * \
        T.ones((self.x.shape[0], self.h0_hidden.shape[0]))
    h0_cell_matrix = self.h0_cell * \
        T.ones((self.x.shape[0], self.h0_cell.shape[0]))

    if conf['DECODER']:
        # The encoder's state at index MAX_SENTENCE_LEN is prepended to the
        # visual features.
        v_input = T.concatenate(
            [self.encoder.hh_out[self.encoder.conf['MAX_SENTENCE_LEN']],
             self.v], axis=1)
    else:
        v_input = self.v

    # Inputs get a leading row of zeros, targets a trailing row of zeros.
    zero_row = T.zeros((1, self.x.T[0].shape[0]), dtype=self.x.dtype)
    x_adj = T.concatenate([zero_row, self.x.T])
    y_adj = T.concatenate([self.x.T, zero_row])

    x_drop_seq = self.x_drop.dimshuffle((1, 0, 2))
    y_drop_seq = self.y_drop.dimshuffle((1, 0, 2))

    # build the teacher forcing loop
    [self.hh_out, self.cc_out, s], _ = theano.scan(
        fn=recurrance,
        sequences=[x_adj, x_drop_seq, y_drop_seq, use_visual_info],
        n_steps=n_steps,
        non_sequences=v_input,
        outputs_info=[h0_hidden_matrix, h0_cell_matrix, None])

    # build the semi-forced loop
    # NOTE(review): this loop is given self.v even when DECODER is set,
    # whereas the teacher-forced loop gets v_input -- confirm.
    [_, _, s_semi, _], _ = theano.scan(
        fn=recurrance_partial_word_feedback,
        sequences=[x_adj, x_drop_seq, y_drop_seq, use_visual_info,
                   self.forced_word[:, :self.x.shape[0]]],
        n_steps=n_steps,
        non_sequences=self.v,
        outputs_info=[h0_hidden_matrix, h0_cell_matrix, None,
                      T.zeros((self.x.shape[0],), dtype=np.int32)])

    # build the un-forced loop
    [_, _, _, self.wout_fb, _], _ = theano.scan(
        fn=recurrance_word_feedback,
        non_sequences=self.v_single,
        outputs_info=[self.h0_hidden, self.h0_cell, None,
                      np.array(0, dtype=np.int32),
                      T.ones((1,), dtype=np.int32)[0]],
        n_steps=self.nstep)

    if conf['SEMI_FORCED'] < 1:
        s = s_semi

    # ------------------------------------------------------------------
    # Loss input and perplexity.
    # ------------------------------------------------------------------
    # Merge the first two axes so each row is one prediction.
    self.new_s = s.reshape((s.shape[0] * s.shape[1], s.shape[2]))
    softmax_out = self.build_loss_function(self.new_s, y_adj)
    self.softmax_out = softmax_out

    # calculate the perplexity
    ff_small = T.constant(1e-20, dtype=theano.config.floatX)
    # Flat index of the target word's entry in every row of softmax_out.
    ppl_idx = softmax_out.shape[1] * \
        T.arange(softmax_out.shape[0]) + T.flatten(y_adj)
    hsum = -T.log2(T.flatten(softmax_out)[ppl_idx] + ff_small)
    hsum_new = hsum.reshape((s.shape[0], s.shape[1])).T
    self.perplexity_sentence = 2 ** (T.sum(hsum_new, axis=1) /
                                     T.sum(self.xlen, axis=1))
    self.perplexity_batch = 2 ** (T.sum(hsum * T.flatten(self.xlen.T)) /
                                  T.sum(self.xlen))
    self.perplexity_batch_v = T.sum(hsum * T.flatten(self.xlen.T))
    self.perplexity_batch_n = T.sum(self.xlen)

    # ------------------------------------------------------------------
    # build the single step code
    # ------------------------------------------------------------------
    h_hid = T.vector("h_hid")
    h_cell = T.vector("h_cell")
    x_drop_val = T.ones((conf['emb_size'],), dtype=theano.config.floatX)
    y_drop_val = T.ones(
        (conf['lstm_hidden_size'],), dtype=theano.config.floatX)
    use_v = T.iscalar("use_v")
    word_t_s = T.iscalar("word_t_s")

    one_step_theano = recurrance(
        word_t_s, x_drop_val, y_drop_val, use_v, h_hid, h_cell, v_i)

    if conf['DECODER']:
        self.one_step = theano.function(
            [word_t_s, use_v, h_hid, h_cell, v_i],
            outputs=one_step_theano)
    else:
        tmp_x = T.imatrix("tmp_x")
        tmp_v = T.matrix("tmp_v")
        # All-ones dropout masks for a single sentence.
        x_d_tmp = T.ones(
            (1, conf['MAX_SENTENCE_LEN'], conf['emb_size']),
            dtype=theano.config.floatX)
        y_d_tmp = T.ones(
            (1, conf['MAX_SENTENCE_LEN'], conf['lstm_hidden_size']),
            dtype=theano.config.floatX)
        # Clear the broadcast pattern so the types match x_drop / y_drop.
        x_d_tmp.type.broadcastable = (False, False, False)
        y_d_tmp.type.broadcastable = (False, False, False)

        self.start_step = theano.function(
            [tmp_x, tmp_v],
            outputs=self.hh_out[conf['MAX_SENTENCE_LEN']],
            givens={self.x_drop: x_d_tmp,
                    self.y_drop: y_d_tmp,
                    self.x: tmp_x,
                    self.v: tmp_v})
# dense output layer l_in = lasagne.layers.InputLayer(shape=(N_BATCH, N_TIME_STEPS, N_INPUT_FEATURES)) # Followed by a Dense Layer to Produce Action l_action = lasagne.layers.DenseLayer(incoming=l_in, W=lasagne.init.Uniform([-0.1, 0.1]), num_units=N_ACTIONS, nonlinearity=None, b=None) l_action_formed = lasagne.layers.ReshapeLayer(input_layer=l_action, shape=(N_BATCH, N_TIME_STEPS, N_ACTIONS)) # Cost function is mean squared error input = T.tensor3('input') target_output = T.tensor3('target_output') # create environment env = CartPoleEnvironment() # create task task = BalanceTask(env, 200, desiredValue=None) # action_prediction = theano.function([input], l_action_formed.get_output(input)) all_params = lasagne.layers.get_all_params(l_action_formed) records = [] for time in xrange(50):
def build_text_only_network(d_word, d_hidden, lr, eps=1e-6):
    """Build the text-only scoring network and compile its functions.

    Parameters
    ----------
    d_word : int
        Embedding size; also used as the width of both LSTM layers.
    d_hidden, eps
        Accepted but not used in this function.
    lr
        Learning rate given to `lasagne.updates.adam`.

    Returns
    -------
    (train_fn, pred_fn, l_scores)
        `train_fn` takes every input variable plus the labels, returns the
        mean categorical cross-entropy and applies the Adam updates.
        `pred_fn` takes the same inputs without the labels and returns the
        network output.  `l_scores` is the final layer.

    The image and bounding-box inputs are declared and accepted by both
    functions but do not feed the score layer (hence
    `on_unused_input='warn'`).  `max_panels`, `max_boxes`, `max_words` and
    `len_voc` are looked up outside this function.
    """
    # input theano vars
    in_context_fc7 = T.tensor3(name='context_images')
    in_context_bb = T.tensor4(name='context_bb')
    in_bbmask = T.tensor3(name='bounding_box_mask')
    in_context = T.itensor4(name='context')
    in_cmask = T.tensor4(name='context_mask')
    in_answer_fc7 = T.matrix(name='answer_images')
    in_answer_bb = T.matrix(name='answer_bb')
    in_answers = T.itensor3(name='answers')
    in_amask = T.tensor3(name='answer_mask')
    in_labels = T.imatrix(name='labels')

    # Argument order shared by both compiled functions.
    feature_vars = [
        in_context_fc7, in_context_bb, in_bbmask, in_context, in_cmask,
        in_answer_fc7, in_answer_bb, in_answers, in_amask,
    ]

    make_input = lasagne.layers.InputLayer

    # input layers
    l_context_fc7 = make_input(shape=(None, 3, 4096),
                               input_var=in_context_fc7)
    l_answer_fc7 = make_input(shape=(None, 4096), input_var=in_answer_fc7)
    l_context = make_input(
        shape=(None, max_panels, max_boxes, max_words),
        input_var=in_context)
    l_answers = make_input(shape=(None, 3, max_words),
                           input_var=in_answers)
    l_cmask = make_input(shape=l_context.shape, input_var=in_cmask)
    l_amask = make_input(shape=l_answers.shape, input_var=in_amask)
    l_bbmask = make_input(shape=(None, 3, max_boxes), input_var=in_bbmask)

    # contexts and answers should share embeddings
    l_context_emb = lasagne.layers.EmbeddingLayer(
        l_context, len_voc, d_word, name='word_emb')
    l_answer_emb = lasagne.layers.EmbeddingLayer(
        l_answers, len_voc, d_word, W=l_context_emb.W)

    # Words -> one vector per box, then an LSTM over the boxes.
    l_context_box_reps = SumAverageLayer(
        [l_context_emb, l_cmask], compute_sum=True, num_dims=4)
    l_box_reshape = lasagne.layers.ReshapeLayer(
        l_context_box_reps, (-1, max_boxes, d_word))
    l_bbmask_reshape = lasagne.layers.ReshapeLayer(l_bbmask, (-1, max_boxes))
    l_box_lstm = lasagne.layers.LSTMLayer(
        l_box_reshape, num_units=d_word, mask_input=l_bbmask_reshape,
        only_return_final=True)

    # Regroup into three vectors per example, then an LSTM over those.
    l_context_panel_reps = lasagne.layers.ReshapeLayer(
        l_box_lstm, (-1, 3, d_word))
    l_context_final_reps = lasagne.layers.LSTMLayer(
        l_context_panel_reps, num_units=d_word, only_return_final=True)

    # Answer candidates and their scores against the context.
    l_ans_reps = SumAverageLayer(
        [l_answer_emb, l_amask], compute_sum=True, num_dims=3)
    l_scores = InnerProductLayer([l_context_final_reps, l_ans_reps])

    preds = lasagne.layers.get_output(l_scores)
    loss = T.mean(
        lasagne.objectives.categorical_crossentropy(preds, in_labels))

    all_params = lasagne.layers.get_all_params(l_scores, trainable=True)
    updates = lasagne.updates.adam(loss, all_params, learning_rate=lr)

    train_fn = theano.function(
        feature_vars + [in_labels], loss,
        updates=updates, on_unused_input='warn')
    pred_fn = theano.function(
        list(feature_vars), preds, on_unused_input='warn')

    return train_fn, pred_fn, l_scores
def _compile_ll_F_Y():
    """Compile a function of (Y, Wf, sigma_inv).

    With d = Y - Wf, the compiled function returns
    0.5 * sum(dot(d, sigma_inv) * d, axis=2).  Y and sigma_inv are declared
    as matrices and Wf as a 3-D tensor.
    """
    Y = tensor.matrix('Y')
    Wf = tensor.tensor3('Wf')
    sigma_inv = tensor.matrix('sigma_inv')

    weighted = theano.dot((Y - Wf), sigma_inv)
    quad = (weighted * (Y - Wf)).sum(axis=2)
    c = 1.0 / 2 * quad

    return theano.function([Y, Wf, sigma_inv], c)
def create_iter_functions(dataset, output_layer, batch_size=BATCH_SIZE,
                          learning_rate=LEARNING_RATE, momentum=MOMENTUM):
    """
    Compile the per-minibatch train / validation / test functions.

    Each compiled function takes a single integer, the minibatch index, and
    reads rows `index * batch_size` up to `(index + 1) * batch_size` from
    `dataset['X_<split>']` and `dataset['y_<split>']`.

    Returns a dict with keys 'train', 'valid' and 'test':

    * 'train' returns the non-deterministic loss and applies Adam updates;
    * 'valid' and 'test' return [deterministic loss, accuracy].

    `momentum` is accepted but not used here.
    """
    batch_index = T.iscalar('batch_index')
    X_batch = T.tensor3('input')
    y_batch = T.matrix('target_output')
    batch_slice = slice(batch_index * batch_size,
                        (batch_index + 1) * batch_size)

    def network_output(deterministic):
        # Forward pass of the network on the symbolic batch.
        return lasagne.layers.get_output(
            output_layer, X_batch, deterministic=deterministic)

    # Accuracy: fraction of rows whose arg-max matches the target's.
    prediction = T.argmax(network_output(True), axis=-1)
    target_class = T.argmax(y_batch, axis=-1)
    accuracy = T.mean(T.eq(prediction, target_class),
                      dtype=theano.config.floatX)

    loss_train = cross_ent_cost(network_output(False), y_batch)
    loss_eval = cross_ent_cost(network_output(True), y_batch)

    all_params = lasagne.layers.get_all_params(output_layer)
    updates = lasagne.updates.adam(loss_train, all_params,
                                   learning_rate=learning_rate)

    def compile_split(split, outputs, **extra):
        # Bind the symbolic batch to the chosen slice of one data split.
        return theano.function(
            [batch_index], outputs,
            givens={
                X_batch: dataset['X_' + split][batch_slice],
                y_batch: dataset['y_' + split][batch_slice],
            },
            **extra)

    iter_train = compile_split('train', loss_train, updates=updates)
    iter_valid = compile_split('valid', [loss_eval, accuracy])
    iter_test = compile_split('test', [loss_eval, accuracy])

    return dict(train=iter_train, valid=iter_valid, test=iter_test)
def test_gpu_rowwise_switch():
    """Yield one check per fixture for a row-wise `T.switch`.

    A 1-D integer mask is broadcast over the trailing axes, so each entry
    picks a whole slice along the first axis from either A or B.  One 2-D
    and one 3-D function are compiled, their optimised graphs are printed,
    and for every fixture a tuple
    `(_test_gpu_rowwise_switch_inner, func, A, B, mask, expected)` is
    yielded.
    """
    assert theano.config.device.startswith("gpu"), "Need to test on GPU!"

    # 4 x 2
    case_2d = (
        np.array([[0.22323515, 0.36703175],
                  [0.82260513, 0.3461504],
                  [0.82362652, 0.81626087],
                  [0.95270008, 0.2226797]]),
        np.array([[0.36341551, 0.20102882],
                  [0.24144639, 0.45237923],
                  [0.39951822, 0.7348066],
                  [0.16649647, 0.60306537]]),
        np.array([1, 0, 1, 1]),
        np.array([[0.22323515, 0.36703175],
                  [0.24144639, 0.45237923],
                  [0.82362652, 0.81626087],
                  [0.95270008, 0.2226797]]),
    )

    # 2 x 3 x 4
    case_3d = (
        np.array([[[0.48769062, 0.82649632, 0.2047115, 0.41437615],
                   [0.25290664, 0.87164914, 0.80968588, 0.49295084],
                   [0.71438099, 0.97913502, 0.37598001, 0.76958707]],
                  [[0.37605973, 0.538358, 0.74304674, 0.84346291],
                   [0.95310617, 0.61540292, 0.49881143, 0.1028554],
                   [0.83481996, 0.90969569, 0.40410424, 0.34419989]]]),
        np.array([[[0.7289117, 0.97323253, 0.19070121, 0.64164653],
                   [0.26816493, 0.76093069, 0.95284825, 0.77350426],
                   [0.55415519, 0.39431256, 0.86588665, 0.50031027]],
                  [[0.1980869, 0.7753601, 0.26810868, 0.3628802],
                   [0.2488143, 0.21278388, 0.09724567, 0.58457886],
                   [0.12295105, 0.75321368, 0.37258797, 0.27756972]]]),
        np.array([1, 0]),
        np.array([[[0.48769062, 0.82649632, 0.2047115, 0.41437615],
                   [0.25290664, 0.87164914, 0.80968588, 0.49295084],
                   [0.71438099, 0.97913502, 0.37598001, 0.76958707]],
                  [[0.1980869, 0.7753601, 0.26810868, 0.3628802],
                   [0.2488143, 0.21278388, 0.09724567, 0.58457886],
                   [0.12295105, 0.75321368, 0.37258797, 0.27756972]]]),
    )

    data = [case_2d, case_3d]

    # Symbolic operands and the two switch graphs.
    A2, B2 = T.matrices("AB")
    A3, B3 = T.tensor3("A"), T.tensor3("B")
    mask = T.ivector("mask")

    switch2 = T.switch(mask.dimshuffle(0, "x"), A2, B2)
    switch3 = T.switch(mask.dimshuffle(0, "x", "x"), A3, B3)

    f2 = theano.function([A2, B2, mask], switch2)
    f3 = theano.function([A3, B3, mask], switch3)

    print("Graph of 2dim switch:")
    theano.printing.debugprint(f2.maker.fgraph.outputs[0])
    print("Graph of 3dim switch:")
    theano.printing.debugprint(f3.maker.fgraph.outputs[0])

    for instance in data:
        # Retrieve appropriate function
        if instance[0].ndim == 2:
            func = f2
        else:
            func = f3

        # Cast to float-friendly types
        casted = []
        for arr in instance:
            if arr.dtype.kind == 'f':
                casted.append(arr.astype(np.float32))
            else:
                casted.append(arr.astype(np.int32))

        yield tuple([_test_gpu_rowwise_switch_inner, func] + casted)
def theano_vars(self):
    """Return two new symbolic 3-D tensors, named 'x' and 'y', of dtype floatX."""
    x_var = T.tensor3('x', dtype=theano.config.floatX)
    y_var = T.tensor3('y', dtype=theano.config.floatX)
    return [x_var, y_var]
grus = [gru0] for i in xrange(1, N_GRUS): gru = lib.ops.LowMemGRU('Recurrence.GRU' + str(i), DIM, DIM, grus[-1], h0=h0[:, i]) grus.append(gru) last_hidden = T.stack([gru[:, -1] for gru in grus], axis=1) return (grus[-1], last_hidden) sequences = T.imatrix('sequences') h0 = T.tensor3('h0') reset = T.iscalar('reset') frames = sequences.reshape((sequences.shape[0], -1, FRAME_SIZE)) processed_frames = FrameProcessor(frames) contexts, new_h0 = Recurrence(processed_frames[:, :-1], h0, reset) mu_prior, log_sigma_prior = Prior(contexts) mu_post, log_sigma_post = Encoder(processed_frames[:, 1:], contexts) # log_sigma_prior = T.log(T.nnet.softplus(log_sigma_prior)) # log_sigma_post = T.log(T.nnet.softplus(log_sigma_post)) eps = theano_srng.normal(mu_post.shape).astype('float32') latents = mu_post
def test_lstm_hid_init_layer_eval():
    """Check that Layer-valued and array-valued LSTM initial states agree.

    One ``LSTMLayer`` takes ``hid_init``/``cell_init`` from ``InputLayer``
    instances, the other from ``np.array`` constants.  After copying the
    parameters they have in common (matched by name), both networks are
    evaluated on all-ones input and must give the same output.
    """
    num_units = 7
    batch_size = 2
    input_shape = (batch_size, 2, 3)
    hid_shape = (1, num_units)
    cell_shape = (1, num_units)

    # Dummy data: all-ones input and all-ones initial states.  The batched
    # variants repeat the single state row once per test case.
    x_val = np.ones(input_shape, dtype=theano.config.floatX)
    hid_val = np.ones(hid_shape, dtype=theano.config.floatX)
    cell_val = np.ones(cell_shape, dtype=theano.config.floatX)
    hid_val_batch = np.tile(hid_val, (batch_size, 1))
    cell_val_batch = np.tile(cell_val, (batch_size, 1))

    # Network whose initial states come from other layers.
    in_layer = InputLayer(input_shape)
    hid_in_layer = InputLayer(hid_shape)
    cell_in_layer = InputLayer(cell_shape)
    lstm_from_layers = LSTMLayer(in_layer, num_units,
                                 hid_init=hid_in_layer,
                                 cell_init=cell_in_layer,
                                 nonlinearity=None)

    # Network whose initial states are fixed arrays.
    lstm_from_arrays = LSTMLayer(in_layer, num_units,
                                 hid_init=hid_val,
                                 cell_init=cell_val,
                                 nonlinearity=None)

    # Give the array-initialised network the same values for every
    # parameter name that also exists in the layer-initialised one.
    donor = {p.name: p for p in lstm_from_layers.get_params()}
    receiver = {p.name: p for p in lstm_from_arrays.get_params()}
    for name in receiver:
        if name not in donor:
            continue
        receiver[name].set_value(donor[name].get_value())

    # Symbolic inputs and output expressions.
    x_sym = T.tensor3()
    hid_sym = T.matrix()
    cell_sym = T.matrix()
    layer_feed = {
        in_layer: x_sym,
        hid_in_layer: hid_sym,
        cell_in_layer: cell_sym
    }
    out_layers_expr = lasagne.layers.get_output(lstm_from_layers, layer_feed)
    out_arrays_expr = lasagne.layers.get_output(lstm_from_arrays,
                                                {in_layer: x_sym})

    # Evaluate both networks on the dummy data.
    out_layers = out_layers_expr.eval({
        x_sym: x_val,
        hid_sym: hid_val_batch,
        cell_sym: cell_val_batch
    })
    out_arrays = out_arrays_expr.eval({x_sym: x_val})

    # The way the initial state is supplied must not change the result.
    assert np.allclose(out_layers, out_arrays)
# Accumulate rating counts over one epoch of the validation monitoring stream:
# each batch's input ratings are summed over axis 0 and added to a 6040 x 5
# table.  NOTE(review): 6040 x 5 presumably means (entities x rating levels)
# -- confirm against the dataset.
rating_freq = np.zeros((6040, 5))
init_b = np.zeros((6040, 5))
for batch in valid_monitor_stream.get_epoch_iterator():
    inp_r, out_r, inp_m, out_m = batch
    rating_freq += inp_r.sum(axis=0)
# The small constant keeps the logarithm finite for entries that are zero.
log_rating_freq = np.log(rating_freq + 1e-8)
log_rating_freq_diff = np.diff(log_rating_freq, axis=1)
# init_b stores the first log-frequency followed by successive differences,
# so a cumulative sum of init_b along axis 1 gives back log_rating_freq.
init_b[:, 1:] = log_rating_freq_diff
init_b[:, 0] = log_rating_freq[:, 0]
# init_b = np.log(rating_freq / (rating_freq.sum(axis=1)[:, None] + 1e-8) +1e-8) * (rating_freq>0)
# Row indices whose accumulated counts are all zero.
new_items = np.where(rating_freq.sum(axis=1) == 0)[0]

# Symbolic inputs: 3-D rating tensors and 2-D masks, all floatX.
input_ratings = T.tensor3(name='input_ratings', dtype=theano.config.floatX)
output_ratings = T.tensor3(name='output_ratings', dtype=theano.config.floatX)
input_masks = T.matrix(name='input_masks', dtype=theano.config.floatX)
output_masks = T.matrix(name='output_masks', dtype=theano.config.floatX)
# Reverse cumulative sum along axis 2: entry k holds the sum of entries k and
# above of input_ratings (flip, cumsum, flip back).
input_ratings_cum = T.extra_ops.cumsum(input_ratings[:, :, ::-1], axis=2)[:, :, ::-1]
# hidden_size = [256]

# Choose the activation class from its configured name.  No `else` branch is
# visible here, so `act` is not assigned by this chain for any other value.
if activation_function == 'reclin':
    act = Rectifier
elif activation_function == 'tanh':
    act = Tanh
elif activation_function == 'sigmoid':
    act = Logistic
import pickle
import lasagne
from lasagne.layers import helper
import theano
import theano.tensor as T
from permutationlayer import PermutationalLayer
from simulate import doSimulation

# Sizes used by the layer definitions below.
SITES = 8     # last axis of the network input; both trailing axes of the subnetwork inputs
VARS = 4      # second axis of the network input
HIDDEN = 128  # number of units in every NINLayer

# Symbolic 3-D tensors.  NOTE(review): `targ` is not used in the visible part
# of the script -- presumably the training target; confirm further down.
invar = T.tensor3()
targ = T.tensor3()

# NOTE(review): the name `input` shadows the Python builtin; it is left as is
# because later code may refer to it.
input = lasagne.layers.InputLayer((None, VARS, SITES), input_var=invar)

# Define subnetwork for 1st layer
# Its input has 2 * VARS channels over a SITES x SITES grid -- presumably the
# features of a pair of sites concatenated; confirm against PermutationalLayer.
dinp_1 = lasagne.layers.InputLayer((None, 2 * VARS, SITES, SITES))
dense1_1 = lasagne.layers.NINLayer(dinp_1, num_units=HIDDEN)
dense2_1 = lasagne.layers.NINLayer(dense1_1, num_units=HIDDEN)
dense3_1 = lasagne.layers.NINLayer(dense2_1, num_units=HIDDEN)
dense4_1 = lasagne.layers.NINLayer(dense3_1, num_units=HIDDEN)

# Define subnetwork for 2nd layer
# Same pattern as above, but the input carries 2 * HIDDEN channels.
dinp2 = lasagne.layers.InputLayer((None, 2 * HIDDEN, SITES, SITES))
dense1_2 = lasagne.layers.NINLayer(dinp2, num_units=HIDDEN)
dense2_2 = lasagne.layers.NINLayer(dense1_2, num_units=HIDDEN)
def test_gru_grad_clipping():
    """Smoke-test that ``GRULayer`` can be built with ``grad_clipping`` set.

    The test only constructs the layer and requests its symbolic output for
    a 3-D input; nothing is asserted about the resulting expression.
    """
    input_layer = InputLayer((2, 2, 3))
    gru_layer = GRULayer(input_layer, 5, grad_clipping=1)
    symbolic_input = T.tensor3()
    lasagne.layers.get_output(gru_layer, symbolic_input)