def build_and_train_model(self, n_hu, n_hl):
    print('Building Model')
    input_phrase = T.imatrix('train_inputmatrix')
    labels = T.imatrix('trainphrase_matrix')
    network = self.define_layers(input_phrase, labels, n_hu, n_hl)

    print("Defining loss")
    # Prediction or loss
    prediction = []
    prediction.append(T.clip(lasagne.layers.get_output(network[0]), 1.0e-7, 1.0 - 1.0e-7))
    prediction.append(T.clip(lasagne.layers.get_output(network[1]), 1.0e-7, 1.0 - 1.0e-7))
    loss = l.define_loss(prediction[0], prediction[1])
    self.model = network

    # define params
    params = lasagne.layers.get_all_params(network)
    updates = lasagne.updates.adadelta(loss, params)

    # run test
    train_fn = theano.function([input_phrase, labels],
                               [loss, prediction[0], prediction[1]],
                               updates=updates, allow_input_downcast=True)

    print("Model and params defined now training")
    for epoch in range(self.end_epoch):
        train_loss = 0
        train_pred = []
        start_time = time.time()
        loss, predicted, phrase = train_fn(self.train_inputmatrix, self.trainphrase_matrix)
        print('Training Loss: ' + str(loss) + ' Train Epoch ' + str(epoch))
        self.save_best(loss, predicted, network)
def _classify(self, dataset_static, dataset_nonstatic):
    """
    Classify method for static or non-static models.
    :param classifier: model
    :param conv_layers: list of convPoolLayer objects
    :param Words: Dictionary of word index to word vectors
    :param dataset: Indices of words for the current sentence/dataset
    :param dim: dimension of word vector
    :param img_h: length of sentence vector after padding
    :return: [y_pred, prob_pred] The probability for each class
    """
    x_static = T.imatrix('x_static')
    x_nonstatic = T.imatrix('x_nonstatic')
    y = T.ivector('y')
    Words_static = theano.shared(value=self.Words_static, name="Words_static")
    Words_nonstatic = theano.shared(value=self.Words_nonstatic, name="Words_nonstatic")

    test_pred_layers = []
    test_size = np.shape(dataset_static)[0]
    test_layer0_input_static = Words_static[T.cast(x_static.flatten(), dtype="int32")].reshape(
        (test_size, 1, self.img_h, self.Words_static.shape[1]))
    test_layer0_input_nonstatic = Words_nonstatic[T.cast(x_nonstatic.flatten(), dtype="int32")].reshape(
        (test_size, 1, self.img_h, self.Words_nonstatic.shape[1]))

    for i in range(len(self.conv_layers) / 2):
        test_layer0_output = self.conv_layers[i].predict(test_layer0_input_nonstatic, test_size)
        test_pred_layers.append(test_layer0_output.flatten(2))
    for i in range(len(self.conv_layers) / 2, len(self.conv_layers)):
        test_layer0_output = self.conv_layers[i].predict(test_layer0_input_static, test_size)
        test_pred_layers.append(test_layer0_output.flatten(2))

    test_layer1_input = T.concatenate(test_pred_layers, 1)
    test_y_pred = self.classifier.predict(test_layer1_input)
    test_prob_pred = self.classifier.predict_p(test_layer1_input)
    test_model_all = theano.function([x_static, x_nonstatic], (test_y_pred, test_prob_pred))

    return test_model_all(dataset_static, dataset_nonstatic)
def run():
    batch_size = 16
    prems = np.random.randint(low=0, high=99, size=(batch_size, 5), dtype='int32')
    hypoes = np.random.randint(low=0, high=99, size=(batch_size, 3), dtype='int32')
    labels = np.random.randint(low=0, high=3, size=(batch_size,), dtype='int32')

    print prems
    print hypoes
    print labels

    ematrix = np.random.uniform(low=-1, high=1, size=(100, 100)).astype(theano.config.floatX)

    t_prems = T.imatrix('p')
    t_hypoes = T.imatrix('h')
    t_ematrix = theano.shared(ematrix, 't_ematrix')

    r_prems = T.repeat(t_prems, 3, axis=1)
    r_hypoes = T.concatenate([t_hypoes] * 5, axis=1)

    batch_prems = t_ematrix[r_prems]
    batch_hypoes = t_ematrix[r_hypoes]
    batch_prem_hypo = T.concatenate((batch_prems, batch_hypoes), axis=2)

    get_b_prems = theano.function(inputs=[t_prems], outputs=batch_prems)
    get_r_prems = theano.function(inputs=[t_prems], outputs=r_prems)
    get_b_hypoes = theano.function(inputs=[t_hypoes], outputs=batch_hypoes)
    get_r_hypoes = theano.function(inputs=[t_hypoes], outputs=r_hypoes)
    get_b_ph = theano.function(inputs=[t_prems, t_hypoes], outputs=batch_prem_hypo)

    # print get_b_prems(prems)
    print get_r_prems(prems)
    print get_r_hypoes(hypoes)
    print get_b_prems(prems).shape
    print get_b_hypoes(hypoes).shape
    print get_b_ph(prems, hypoes).shape

    W = theano.shared(
        value=np.random.uniform(
            low=-np.sqrt(1. / 6),
            high=np.sqrt(1. / 6),
            size=(200, 400)
        ).astype(theano.config.floatX),
        name='W'
    )
    U = theano.shared(
        value=np.random.uniform(
            low=-np.sqrt(1. / 6),
            high=np.sqrt(1. / 6),
            size=(400,)
        ).astype(theano.config.floatX),
        name='U'
    )

    result = T.dot(T.dot(batch_prem_hypo, W), U)
    get_result = theano.function(inputs=[t_prems, t_hypoes], outputs=result)
    print get_result(prems, hypoes).shape
def train_ready(self):
    print "adopt softmax model plus contractive regularization ........ "
    print "weight 1 : " + str(self.lowreg_weight)
    print "weight 2 : " + str(self.highreg_weight)
    print "variance : " + str(self.variance)
    print "nc : " + str(self.nc)

    var_x = T.imatrix()
    var_y = T.imatrix()
    loss = self.reg_logp(var_x, var_y, self.lowreg_weight, self.highreg_weight,
                         self.variance, self.nc)

    witems = self.w.values()
    #ave_w = sum(T.sum(item**2) for item in witems)/len(witems)
    wg = T.grad(loss, witems)
    #ave_g = sum(T.sum(item**2) for item in wg) /len(wg)
    weight_up = self.upda(wg, witems, self.lrate, self.mweight, self.opt, self.gradbound)

    if not self.fix_emb:
        dicitems = self.dic.values()
        dg = T.grad(loss, dicitems)
        dic_up = self.upda(dg, dicitems, self.lrate / 10., self.mweight, self.opt)
        weight_up.update(dic_up)

    up = weight_up
    self.updatefunc = theano.function([var_x, var_y], loss, updates=up)
def create_model(num_timesteps, num_blocks, hidden_size, learning_rate,
                 grad_clip=10, dropout_p=0.5, num_lstm_layers=1,
                 use_forward_and_backward_lstm=False):
    '''
    returns train function which reports both loss and accuracy
    and test function, which also reports both loss and accuracy
    '''
    l_in, l_mask, l_out, l_out_slice, l_lstm, l_lstm_slice = \
        _build_net_layers(num_timesteps, num_blocks, hidden_size, learning_rate,
                          grad_clip, dropout_p, num_lstm_layers, use_forward_and_backward_lstm)

    inp = T.tensor3('input')
    truth = T.imatrix("truth")
    mask = T.imatrix("mask")

    # pred should be of shape (batchsize, num_timesteps, num_asts)
    pred = lasagne.layers.get_output(l_out)
    # pred_slice should be of shape (batchsize, num_asts), only contains
    # predictions for the last timestep
    pred_slice = lasagne.layers.get_output(l_out_slice)
    # the hidden representations for the last timestep (batchsize, hidden_size)
    hidden_slice = lasagne.layers.get_output(l_lstm_slice)

    # truth should also be of shape (batchsize, num_timesteps, num_asts)
    pred_2d = pred.reshape((-1, num_blocks))
    truth_1d = truth.reshape((-1,))
    # pred_2d_shape = T.shape(pred_2d)
    # truth_1d_shape = T.shape(truth_1d)

    # categorical_crossentropy
    loss = T.nnet.categorical_crossentropy(pred_2d, truth_1d).mean()
    # categorical accuracy
    # acc = T.nnet.categorical_crossentropy(pred_2d, truth_1d).mean()
    acc = lasagne.objectives.categorical_accuracy(pred_2d, truth_1d).mean()

    # update function
    print("Computing updates ...")
    all_params = lasagne.layers.get_all_params(l_out)
    updates = lasagne.updates.adam(loss, all_params, learning_rate)

    # training function
    print("Compiling functions ...")
    train_loss = theano.function([l_in.input_var, l_mask.input_var, truth], loss,
                                 updates=updates, allow_input_downcast=True)
    compute_loss = theano.function([l_in.input_var, l_mask.input_var, truth], loss,
                                   allow_input_downcast=True)

    # training function, returns loss and acc
    compute_pred = theano.function([l_in.input_var, l_mask.input_var, truth],
                                   [pred_2d, truth_1d], updates=updates,
                                   allow_input_downcast=True)
    train_loss_acc = theano.function([l_in.input_var, l_mask.input_var, truth],
                                     [loss, acc, pred], updates=updates,
                                     allow_input_downcast=True)
    # computes loss and accuracy, without training
    compute_loss_acc = theano.function([l_in.input_var, l_mask.input_var, truth],
                                       [loss, acc, pred], allow_input_downcast=True)

    # In order to generate text from the network, we need the probability distribution of the
    # next character given the state of the network and the input (a seed).
    # In order to produce the probability distribution of the prediction, we compile a function called probs.
    probs = theano.function([l_in.input_var, l_mask.input_var], pred_slice,
                            allow_input_downcast=True)
    generate_hidden_representations = theano.function([l_in.input_var, l_mask.input_var],
                                                      hidden_slice, allow_input_downcast=True)
    print("Compiling done!")

    return train_loss_acc, compute_loss_acc, probs, generate_hidden_representations, compute_pred, l_out
def __init__(self, config, qvocab_len, max_qlen, num_ans, num_qtypes, l_saver):
    self.config = config
    self.qn = T.imatrix()
    self.lstm_mask = T.imatrix()
    self.iX = T.fmatrix()
    self.Y = T.ivector()
    self.qtype = T.ivector()
    self.sparse_indices = T.ivector()
    self.qembd = T.fmatrix()
    self.ql_out = T.fmatrix()
    self.timer = l.timer_type()
    self.saver, self.exp_saver = l_saver
    self.qlstm_hidden_dim = 300
    self.qn_classifier_emb_size = 75
    self.max_ql = max_qlen
    self.qvocab_len = qvocab_len
    self.bptt_trunk_steps = -1
    self.mlp_input_dim = 1024
    self.num_qtypes = num_qtypes
    self.num_ans = num_ans
    self.grad_clip = config['grad_clip']
    self.params = {}
    print "Models Initialization done ..."
def test_sparseblockgemvF(self):
    """
    Test the Fortran order for W (which can happen in the grad for some graphs).
    """
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.imatrix()
    oIdx = tensor.imatrix()

    o = self.gemv_op(b.take(oIdx, axis=0),
                     tensor.DimShuffle((False, False, False, False),
                                       (0, 1, 3, 2))(tensor.as_tensor_variable(W)),
                     h, iIdx, oIdx)

    f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

    W_val, h_val, iIdx_val, b_val, oIdx_val = \
        BlockSparse_Gemv_and_Outer.gemv_data()

    th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
    ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
        b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

    utt.assert_allclose(ref_out, th_out)
def __init__(self, size_vocab, size_embed, size, size_out, depth, network,
             alpha=0.5, gru_activation=clipped_rectify, visual_activation=linear,
             visual_encoder=StackedGRUH0, cost_visual=CosineDistance,
             max_norm=None, lr=0.0002, dropout_prob=0.0):
    autoassign(locals())
    self.network = network(self.size_vocab, self.size_embed, self.size, self.size_out, self.depth,
                           gru_activation=self.gru_activation,
                           visual_activation=self.visual_activation,
                           visual_encoder=self.visual_encoder,
                           dropout_prob=self.dropout_prob)
    self.input = T.imatrix()
    self.output_t_prev = T.imatrix()
    self.output_t = T.imatrix()
    self.output_v = T.fmatrix()
    self.OH = OneHot(size_in=self.size_vocab)
    self.output_t_oh = self.OH(self.output_t)
    self.updater = util.Adam(max_norm=self.max_norm, lr=self.lr)
    self.train = self._make_train()
    self.loss_test = self._make_loss_test()
def set_model(argv, vocab_word, init_emb):
    x_span = T.imatrix("x_span")
    x_word = T.imatrix("x_word")
    x_ctx = T.imatrix("x_ctx")
    x_dist = T.imatrix("x_dist")
    x_slen = T.imatrix("x_slen")
    y = T.ivector("y")

    """ Set params for the model """
    n_vocab = vocab_word.size()
    dim_x_word = argv.emb
    dim_x_dist = 10  # (0, ..., 10-)
    dim_h = argv.hidden
    L2_reg = argv.reg

    """ Instantiate the model """
    return Model(
        x_span=x_span,
        x_word=x_word,
        x_ctx=x_ctx,
        x_dist=x_dist,
        x_slen=x_slen,
        y=y,
        init_emb=init_emb,
        n_vocab=n_vocab,
        dim_w_p=dim_x_word,
        dim_d=dim_x_dist,
        dim_h=dim_h,
        L2_reg=L2_reg,
    )
def run():
    # params
    dims = 10
    negrate = 1
    batsize = 300
    epochs = 300

    # paths
    datafileprefix = "../../data/nycfilms/"
    dirfwdsuffix = "direct_forward.plustypes.ssd"

    # get the data and split
    dirfwdf = open(datafileprefix + dirfwdsuffix)
    datadf = readdata(dirfwdf)
    traind, validd, testd = datadf.split((70, 15, 15), random=True)

    numents = int(datadf.ix[:, 0].max()) + 1
    print numents
    numrels = int(datadf.ix[:, 1].max()) + 1
    print numrels

    # define model
    inp = Input(T.imatrix())
    eemb = VectorEmbed.indim(numents).outdim(dims).Wreg(l2reg(0.00001))()
    remb = VectorEmbed.indim(numrels).outdim(dims).Wreg(l2reg(0.00001))()
    # for debugging
    eembd = SymTensor(T.fmatrix())
    rembd = SymTensor(T.fmatrix())
    dotp = SymTensor(T.fmatrix())
    out = ((inp[:, 0] >> eemb >> eembd) & (inp[:, 1] >> remb >> rembd)) >> DotProduct() >> dotp >> Tanh()

    # for plotting purposes: relation to relation dot product (or relation-type)
    r2rinp = Input(T.imatrix())
    rel2rel = ((r2rinp[:, 0] >> remb) & (r2rinp[:, 1] >> remb)) >> DotProduct()

    outtest = Output(T.fvector())
    loss = (out & outtest) >> HingeLoss()

    trainer = Trainer\
        .batsize(batsize)\
        .epochs(epochs)\
        .onrun(getonrun())\
        .offrun(offrun)\
        .offepoch(getoffepoch(out, rel2rel))\
        .onbatch(getonbatch(negrate, numents, numrels))\
        .optimizer(sgd(lr=1.))\
        .batchtransformer(transbat)
    trainer.loss(loss)
    trainer.train(traind.values, validd.values)\
        .test(testd.values)

    explore(eemb, remb)

    # functions for interactive exploration
    embed()
def train_model_func(self, batch_size, num_batches, summary_sz, input_sz):
    summaries = T.imatrix('summaries')
    docs = T.imatrix('docs')

    s = np.zeros((batch_size * num_batches, summary_sz))
    d = np.zeros((batch_size * num_batches, input_sz))
    summary_superbatch = theano.shared(s.astype(theano.config.floatX),
                                       name='s_summs', borrow=True)
    doc_superbatch = theano.shared(d.astype(theano.config.floatX),
                                   name='s_docs', borrow=True)
    self.ssb = summary_superbatch
    self.dsb = doc_superbatch

    cost = self.negative_log_likelihood_batch(docs, summaries, batch_size)
    regularization_cost = self.l2_coefficient * sum([(p ** 2).sum() for p in self.params])
    self.get_batch_cost_unregularized = theano.function([docs, summaries], cost,
                                                        allow_input_downcast=True)
    #theano.printing.debugprint(cost)
    cost = cost + regularization_cost

    params = {p.name: p for p in self.params}
    grads = T.grad(cost, self.params)
    #grads = theano.printing.Print("grads")(grads)

    # learning rate
    lr = T.scalar(name='lr')
    gradient_update = optimisers.sgd_(lr, self.params, grads, docs, summaries,
                                      cost, self.dsb, self.ssb, batch_size)
    return gradient_update
def build_model_1(self):
    x = T.imatrix('x').astype(theano.config.floatX)
    drop_masks = T.imatrix('drop_masks').astype(theano.config.floatX)
    y = T.ivector('y')

    self.layers[0] = LSTMLayer(random_state=self.random_state, input=x, drop_masks=drop_masks,
                               input_dim=self.input_dim, output_dim=self.hidden_dims[0])
    params = self.layers[0].params
    self.layers[1] = OutputLayer(input=self.layers[0].output, input_dim=self.layers[0].output_dim,
                                 output_dim=self.output_dim, random_state=self.random_state)
    params += self.layers[1].params

    _EPSILON = 10e-8
    L1 = 0.001 * T.sum([T.sum(param) for param in params])
    L2 = 0.001 * T.sum([T.sum(param ** param) for param in params])
    cost = T.sum(T.nnet.categorical_crossentropy(
        T.clip(self.layers[self.number_of_layers].probabilities[-1], _EPSILON, 1.0 - _EPSILON),
        y)) + L1 + L2
    #grads = T.grad(cost, params)
    #updates = [(param_i, param_i - self.learning_rate * grad_i) for param_i, grad_i in zip(params, grads)]
    updates = LearningAlgorithms.adam(cost, params, learning_rate=0.001)

    self.sgd_step = theano.function([x, drop_masks, y], L1, updates=updates)
    self.predict = theano.function([x, drop_masks],
                                   self.layers[self.number_of_layers].probabilities[-1])
    self.test_model = theano.function([x, drop_masks, y], cost)
def _make_stack(self, seq_length=4):
    self.embedding_dim = embedding_dim = 3
    self.vocab_size = vocab_size = 10
    self.seq_length = seq_length

    def compose_network(inp, inp_dim, outp_dim, vs, name="compose"):
        # Just add the two embeddings!
        W = T.concatenate([T.eye(outp_dim), T.eye(outp_dim)], axis=0)
        return inp.dot(W)

    X = T.imatrix("X")
    transitions = T.imatrix("transitions")
    apply_dropout = T.scalar("apply_dropout")
    vs = VariableStore()
    self.stack = HardStack(
        embedding_dim, embedding_dim, vocab_size, seq_length,
        compose_network, IdentityLayer, apply_dropout, vs,
        X=X, transitions=transitions, make_test_fn=True,
    )

    # Swap in our own dummy embeddings and weights.
    embeddings = np.arange(vocab_size).reshape((vocab_size, 1)).repeat(embedding_dim, axis=1)
    self.stack.embeddings.set_value(embeddings)
def build_model(self):
    print '\n... building the model with unroll=%d, backroll=%d' \
        % (self.source.unroll, self.source.backroll)
    x = T.imatrix('x')
    y = T.imatrix('y')
    reset = T.scalar('reset')
    hiddens = [h['init'] for h in self.hiddens.values()]
    outputs_info = [None] * 3 + hiddens
    [losses, probs, errors, hids], updates = \
        theano.scan(self.step, sequences=[x, y], outputs_info=outputs_info)
    loss = losses.sum()
    error = errors.sum() / T.cast((T.neq(y, 255).sum()), floatX)

    hidden_updates_train = []
    hidden_updates_test = []
    for h in self.hiddens.values():
        h_train = ifelse(T.eq(reset, 0),
                         hids[-1 - self.source.backroll, :], T.ones_like(h['init']))
        h_test = ifelse(T.eq(reset, 0),
                        hids[-1, :], T.ones_like(h['init']))
        hidden_updates_train.append((h['init'], h_train))
        hidden_updates_test.append((h['init'], h_test))
    updates = self.source.get_updates(loss, self.sgd_params)
    updates += hidden_updates_train

    rets = [loss, probs[-1, :], error]
    mode = theano.Mode(linker='cvm')
    train_model = theano.function([x, y, reset, self.lr], rets,
                                  updates=updates, mode=mode)
    test_model = theano.function([x, y, reset], rets,
                                 updates=hidden_updates_test, mode=mode)
    return train_model, test_model
def setup_encode(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    # dimensions: (batch, time)
    relative_posns = [T.imatrix() for _ in self.encodings]
    # dimensions: (batch, time, output_data)
    encoded_melodies = [T.btensor3() for _ in self.encodings]
    n_batch, n_time = chord_roots.shape

    all_activations = []
    for encoding, enc_lstmstack, encoded_melody, relative_pos in zip(
            self.encodings, self.enc_lstmstacks, encoded_melodies, relative_posns):
        activations = enc_lstmstack.do_preprocess_scan(
            timestep=T.tile(T.arange(n_time), (n_batch, 1)),
            relative_position=relative_pos,
            cur_chord_type=chord_types,
            cur_chord_root=chord_roots,
            cur_input=encoded_melody,
            deterministic_dropout=True)
        all_activations.append(activations)
    reduced_activations = functools.reduce((lambda x, y: x + y), all_activations)
    strengths, vects = self.qman.get_strengths_and_vects(reduced_activations)

    self.encode_fun = theano.function(
        inputs=[chord_types, chord_roots] + relative_posns + encoded_melodies,
        outputs=[strengths, vects],
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
def _get_input_tensor_variables(self):
    # x_w: 1D: batch, 2D: n_words, 3D: 5 + window; word id
    # x_p: 1D: batch, 2D: n_words; posit id
    # y:   1D: batch, 2D: n_words; label id
    if self.argv.mark_phi:
        return [T.itensor3('x_w'), T.imatrix('x_p'), T.imatrix('y')]
    return [T.itensor3('x_w'), T.imatrix('y')]
def build(self): """build the model. This method should be called after self.add_data. """ x_sym = sparse.csr_matrix('x', dtype = 'float32') y_sym = T.imatrix('y') g_sym = T.imatrix('g') gy_sym = T.vector('gy') ind_sym = T.ivector('ind') l_x_in = lasagne.layers.InputLayer(shape = (None, self.x.shape[1]), input_var = x_sym) l_g_in = lasagne.layers.InputLayer(shape = (None, 2), input_var = g_sym) l_ind_in = lasagne.layers.InputLayer(shape = (None, ), input_var = ind_sym) l_gy_in = lasagne.layers.InputLayer(shape = (None, ), input_var = gy_sym) num_ver = max(self.graph.keys()) + 1 l_emb_in = lasagne.layers.SliceLayer(l_g_in, indices = 0, axis = 1) l_emb_in = lasagne.layers.EmbeddingLayer(l_emb_in, input_size = num_ver, output_size = self.embedding_size) l_emb_out = lasagne.layers.SliceLayer(l_g_in, indices = 1, axis = 1) if self.neg_samp > 0: l_emb_out = lasagne.layers.EmbeddingLayer(l_emb_out, input_size = num_ver, output_size = self.embedding_size) l_emd_f = lasagne.layers.EmbeddingLayer(l_ind_in, input_size = num_ver, output_size = self.embedding_size, W = l_emb_in.W) l_x_hid = layers.SparseLayer(l_x_in, self.y.shape[1], nonlinearity = lasagne.nonlinearities.softmax) if self.use_feature: l_emd_f = layers.DenseLayer(l_emd_f, self.y.shape[1], nonlinearity = lasagne.nonlinearities.softmax) l_y = lasagne.layers.ConcatLayer([l_x_hid, l_emd_f], axis = 1) l_y = layers.DenseLayer(l_y, self.y.shape[1], nonlinearity = lasagne.nonlinearities.softmax) else: l_y = layers.DenseLayer(l_emd_f, self.y.shape[1], nonlinearity = lasagne.nonlinearities.softmax) py_sym = lasagne.layers.get_output(l_y) loss = lasagne.objectives.categorical_crossentropy(py_sym, y_sym).mean() if self.layer_loss and self.use_feature: hid_sym = lasagne.layers.get_output(l_x_hid) loss += lasagne.objectives.categorical_crossentropy(hid_sym, y_sym).mean() emd_sym = lasagne.layers.get_output(l_emd_f) loss += lasagne.objectives.categorical_crossentropy(emd_sym, y_sym).mean() if self.neg_samp == 0: l_gy = layers.DenseLayer(l_emb_in, num_ver, nonlinearity = lasagne.nonlinearities.softmax) pgy_sym = lasagne.layers.get_output(l_gy) g_loss = lasagne.objectives.categorical_crossentropy(pgy_sym, lasagne.layers.get_output(l_emb_out)).sum() else: l_gy = lasagne.layers.ElemwiseMergeLayer([l_emb_in, l_emb_out], T.mul) pgy_sym = lasagne.layers.get_output(l_gy) g_loss = - T.log(T.nnet.sigmoid(T.sum(pgy_sym, axis = 1) * gy_sym)).sum() params = [l_emd_f.W, l_emd_f.b, l_x_hid.W, l_x_hid.b, l_y.W, l_y.b] if self.use_feature else [l_y.W, l_y.b] if self.update_emb: params = lasagne.layers.get_all_params(l_y) updates = lasagne.updates.sgd(loss, params, learning_rate = self.learning_rate) self.train_fn = theano.function([x_sym, y_sym, ind_sym], loss, updates = updates, on_unused_input = 'ignore') self.test_fn = theano.function([x_sym, ind_sym], py_sym, on_unused_input = 'ignore') self.l = [l_gy, l_y] g_params = lasagne.layers.get_all_params(l_gy, trainable = True) g_updates = lasagne.updates.sgd(g_loss, g_params, learning_rate = self.g_learning_rate) self.g_fn = theano.function([g_sym, gy_sym], g_loss, updates = g_updates, on_unused_input = 'ignore')
def __theano_build__(self): params = self.params param_names = self.param_names hidden_dim = self.hidden_dim x1 = T.imatrix('x1') # first sentence x2 = T.imatrix('x2') # second sentence x1_mask = T.fmatrix('x1_mask') #mask x2_mask = T.fmatrix('x2_mask') y = T.ivector('y') # label y_c = T.ivector('y_c') # class weights # Embdding words _E1 = params["E"].dot(params["W"][0]) + params["B"][0] _E2 = params["E"].dot(params["W"][1]) + params["B"][1] statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim]) statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim]) def rnn_cell(x, mx, ph, Wh): h = T.tanh(ph.dot(Wh) + x) h = mx[:, None] * h + (1-mx[:, None]) * ph return [h] [h1], updates = theano.scan( fn=rnn_cell, sequences=[statex1, x1_mask], truncate_gradient=self.truncate, outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))], non_sequences=params["W"][2]) [h2], updates = theano.scan( fn=rnn_cell, sequences=[statex2, x2_mask], truncate_gradient=self.truncate, outputs_info=[dict(initial=h1[-1])], non_sequences=params["W"][3]) #predict _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0]) + h2[-1].dot(params["lrW"][1]) + params["lrb"]) _p = T.argmax(_s, axis=1) _c = T.nnet.categorical_crossentropy(_s, y) _c = T.sum(_c * y_c) _l = T.sum(params["lrW"]**2) _cost = _c + 0.01 * _l # SGD parameters learning_rate = T.scalar('learning_rate') decay = T.scalar('decay') # Gradients and updates _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay) # Assign functions self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads) self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c) self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s) self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p) self.sgd_step = theano.function( [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay], updates=_updates)
def __init__(self,rng,model_params): self.input = T.itensor3('input') # the data is a minibatch self.label = T.imatrix('label') # label's shape (mini_batch size, max_term_per_sent) self.sent_length= T.ivector('sent_length') # sent_length is the number of terms in each sentence self.masks = T.imatrix('masks') # masks which used in error and likelihood calculation self.core = SentenceLevelNeuralModelCore(rng,self.input,self.label,self.sent_length,self.masks,model_params) self.params = self.core.wordvec.params() \ + self.core.POSvec.params() \ + self.core.wordpos_vec.params() \ + self.core.verbpos_vec.params() \ + self.core.conv_word.params() \ + self.core.conv_POS.params() \ + self.core.conv_wordpos.params() \ + self.core.conv_verbpos.params() \ + self.core.hidden_layer.params self.L2_sqr = (self.core.wordvec.embeddings ** 2).sum() \ + (self.core.POSvec.embeddings ** 2).sum() \ + (self.core.wordpos_vec.embeddings ** 2).sum() \ + (self.core.verbpos_vec.embeddings ** 2).sum() \ + (self.core.conv_word.W ** 2).sum() \ + (self.core.conv_POS.W ** 2).sum() \ + (self.core.conv_wordpos.W ** 2).sum() \ + (self.core.conv_verbpos.W ** 2).sum() \ + (self.core.hidden_layer.W ** 2).sum() self.negative_log_likelihood = self.core.likelihood() self.errors = self.core.errors() # we only use L2 regularization self.cost = self.negative_log_likelihood \ + self.core.L2_reg * self.L2_sqr self.gparams = [] for param in self.params: gparam = T.grad(self.cost, param) self.gparams.append(gparam) self.updates = [] learning_rate = model_params['learning_rate'] for param, gparam in zip(self.params, self.gparams): self.updates.append((param, param - learning_rate * gparam)) #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_word.output,on_unused_input='ignore') #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_POS.output,on_unused_input='ignore') #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_verbpos.output,on_unused_input='ignore') #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_wordpos.output,on_unused_input='ignore') #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_out,on_unused_input='ignore') #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.max_out,on_unused_input='ignore') #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.hidden_layer.output,on_unused_input='ignore') #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.negative_log_likelihood,on_unused_input='ignore') #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.cost,on_unused_input='ignore') self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.cost,updates=self.updates,on_unused_input='ignore') self.valid_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=[self.errors,self.core.sentce_loglikelihood.y_pred_pointwise],on_unused_input='ignore')
def __init__(self, config):
    autoassign(locals())
    self.updater = util.Adam(max_norm=config['max_norm'], lr=config['lr'])
    self.Decode = Decoder(config['size_vocab'], config['size_embed'],
                          config['size'], config['depth'])
    self.ToTxt = Dense(config['size'], config['size_vocab'])
    self.inputs = [T.imatrix()]
    self.target = T.imatrix()
def ndim_itensor(ndim, name=None):
    if ndim == 2:
        return T.imatrix(name)
    elif ndim == 3:
        return T.itensor3(name)
    elif ndim == 4:
        return T.itensor4(name)
    return T.imatrix(name=name)
def defmodel(self):
    pathidxs = T.imatrix("pathidxs")  # integers of (batsize, seqlen)
    zidxs = T.imatrix("zidxs")        # integers of (batsize, seqlen)
    occluder = T.imatrix("occluder")
    scores = self.definnermodel(pathidxs)  # predictions, floats of (batsize, seqlen, vocabsize)
    # probs = T.nnet.softmax(scores)  # row-wise softmax; probs: (batsize, seqlen, vocabsize)
    # softmax doesn't work on tensor3D
    probs, _ = theano.scan(fn=T.nnet.softmax, sequences=scores, outputs_info=[None])
    return probs, zidxs, occluder, [pathidxs, zidxs, occluder]
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    W = LookupTable(
        name="W1",
        #dim = hidden_dim*4,
        dim=hidden_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0)
    )
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(
            hidden_dim,
            name='H',
            weights_init=initialization.IsotropicGaussian(0.01),
            biases_init=initialization.Constant(0.0)
        )
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01)
        )
    S = Linear(
        name="W2",
        input_dim=hidden_dim,
        output_dim=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0)
    )
    A = NDimensionalSoftmax(
        name="softmax"
    )

    initLayers([W, H, S])
    activations = W.apply(x)
    hiddens = H.apply(activations)  #[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)
    #print VariableFilter(roles=[WEIGHT])(cg.variables)
    #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

    layers = (x, W, H, S, A, y)

    return cg, layers, y_hat, cost
def build_model(options): print('Build model...') sys.stdout.flush() weights = None if options['flag_random_lookup_table'] == False: weights = options['embedding'] embed_layer = Embedding(input_dim = options['embedding'].shape[0], output_dim = options['embedding'].shape[1], weights = weights) dense_layers = [] dense_layers.append(Dense(input_dim = options['embedding'].shape[1] * 2, output_dim = options['size_hidden_layer'], activation = 'tanh')) dense_layers.append(Dense(input_dim = options['size_hidden_layer'], output_dim = 1, activation = 'sigmoid')) # for training sentence1 = T.imatrix('s1') # sentence1, n_samples * len_sentence sentence1_mask = T.matrix('s1_mask') sentence2 = T.imatrix('s2') # sentence2, n_samples * len_sentence sentence2_mask = T.matrix('s2_mask') y = T.ivector('y1') # n_samples embed_s1 = embed_layer.get_output(sentence1) # n_samples * len_sentence * embed_dim embed_s2 = embed_layer.get_output(sentence2) # n_samples * len_sentence * embed_dim if options['sentence_modeling'] == 'CBoW': embed_s1 = ave_embed(embed_s1,sentence1_mask) # n_samples * embed_dim embed_s2 = ave_embed(embed_s2,sentence2_mask) # n_samples * embed_dim elif options['sentence_modeling'] == 'CNN': sentence_encode_layer = Convolution1D(input_dim = options['embedding'].shape[1], activation = 'tanh', nb_filter = options['embedding'].shape[1], filter_length = options['CNN_filter_length'], border_mode = 'same') embed_s1 = CNN_embed(embed_s1,sentence1_mask,sentence_encode_layer) # n_samples * embed_dim embed_s2 = CNN_embed(embed_s2,sentence2_mask,sentence_encode_layer) # n_samples * embed_dim elif options['sentence_modeling'] == 'LSTM': sentence_encode_layer = LSTM(input_dim = options['embedding'].shape[1], output_dim = options['embedding'].shape[1]) embed_s1 = LSTM_embed(embed_s1,sentence1_mask,sentence_encode_layer,options) # n_samples * embed_dim embed_s2 = LSTM_embed(embed_s2,sentence2_mask,sentence_encode_layer,options) # n_samples * embed_dim else: print 'Error: No model called %s available!' % options['sentence_modeling'] return output = T.concatenate([embed_s1,embed_s2],axis = -1) # n_samples * (embed_dim * 2) if options['flag_dropout'] == True: output = dropout(output, level=options['dropoutRates']) for dense_layer in dense_layers: output = dense_layer.get_output(output) f_pred = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask],output, allow_input_downcast=True) output = output.reshape((output.shape[0],)) #y = y.reshape((output.shape[0],1)) cost = T.nnet.binary_crossentropy(output, y).mean() f_debug = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y],[output,y,T.nnet.binary_crossentropy(output, y),cost], allow_input_downcast=True) tparams = [] tparams += embed_layer.params if options['sentence_modeling'] != 'CBoW': tparams += sentence_encode_layer.params for dense_layer in dense_layers: tparams += dense_layer.params return sentence1,sentence1_mask,sentence2,sentence2_mask,y,cost,f_pred,tparams,f_debug
def __init__(self, voca_size, hidden_size, ydim, num_layers=2, learning_rate=0.1):
    self.hidden_size = hidden_size
    self.n_out = ydim
    self.learning_rate = learning_rate
    self.num_layers = num_layers
    self.layers = []
    self.params = []

    self.emb = WordEmbeder(voca_size, hidden_size)
    self.params += self.emb.params

    x = tensor.imatrix()  # symbolic
    mask = tensor.imatrix()
    y = tensor.ivector()

    state_below = self.emb.embed_it(x)
    for _ in range(self.num_layers):
        binet = BiLSTM(self.hidden_size, self.learning_rate)
        self.layers += binet,
        self.params += binet.params
        state_below = binet.forward(state_below, mask)

    self.U = theano.shared(name="biU",
                           value=utils.init_norm(self.hidden_size, self.n_out),
                           borrow=True)
    self.by = theano.shared(name="by", value=np.zeros(self.n_out), borrow=True)
    self.params += [self.U, self.by]

    # mean pooling
    hs = state_below
    mp = (hs * mask[:, :, None]).sum(axis=0)
    mp = mp / mask.sum(axis=0)[:, None]

    # classifier
    pred_p = tensor.nnet.softmax(tensor.dot(mp, self.U) + self.by)
    pred_y = pred_p.argmax(axis=1)

    # nll
    off_set = 1e-8
    cost = -tensor.log(pred_p[tensor.arange(mask.shape[1]), y] + off_set).mean()

    gparams = [tensor.grad(cost, param) for param in self.params]
    updates = [(param, param - self.learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]

    vinputs = tensor.imatrix("vinputs")  # variable
    vmask = tensor.imatrix("vmask")
    vy = tensor.ivector("vy")

    self._train = theano.function(
        inputs=[vinputs, vmask, vy],
        outputs=cost,
        updates=updates,
        givens={x: vinputs, mask: vmask, y: vy}
    )
    self._predict = theano.function(
        inputs=[vinputs, vmask],
        outputs=pred_y,
        givens={x: vinputs, mask: vmask}
    )
def add_b():
    w = T.imatrix('w')
    a = T.imatrix('a')
    y = w + a.repeat(4, 0)
    f = theano.function(inputs=[w, a], outputs=[y])
    e = np.asarray([[2, 4], [2, 1], [3, 2], [4, 1]], dtype='int32')
    b = np.asarray([[2, 1]], dtype='int32')
    print f(e, b)
def build_batch(self): x = TT.imatrix('x') # 2D int32 x_mask = TT.fmatrix('x_mask') # float32 y = TT.imatrix('y') # 2D int32 y_given_x = self.fprop_batch(x) # 3D, shape (seq_len, bs, n_out) self.get_y_given_x = theano.function(inputs = [x], outputs = y_given_x) y_given_x_ = y_given_x.reshape((y_given_x.shape[0]*y_given_x.shape[1], y_given_x.shape[2])) y_ = y.reshape((y.shape[0]*y.shape[1], )) nll = -TT.sum( TT.log( y_given_x_[TT.arange(y_.shape[0]), y_] ) * x_mask.reshape( (x_mask.shape[0]*x_mask.shape[1], ) ) ) / x_mask.shape[1] # nll is the sum of nll divided by batch size cost = nll # l2 norm cost if self.l2_weight is not None: L2 = 0 for p in self.params_l2: L2 += TT.sum(p ** 2) cost += self.l2_weight * L2 print '[SimpleRNNLM] L2 norm used %g' % self.l2_weight else: print '[SimpleRNNLM] L2 norm not used' lr = TT.scalar('lr') print '[SimpleRNNLM] ... get grads ...' grads = TT.grad(cost, self.params) grad_norm = TT.sqrt(sum([TT.sum(g**2) for g in grads])) if self.grad_clip is not None: grads = clip_grad(grads, grad_norm, self.grad_clip) grad_norm = TT.sqrt(sum([TT.sum(g**2) for g in grads])) else: print '[SimpleRNNLM] no grad_clip is used' print '[SimpleRNNLM] ... got grads ...' print '[SimpleRNNLM] algo = ', self.algo if self.algo == 'SGD': updates = SGD(self.params, grads, lr) else: sys.stderr.write('Not recognized training algorithm') sys.exit(1) print '[SimpleRNNLM] ...build training function...' self.train_batch_fn = theano.function(inputs = [x, x_mask, y, lr], outputs = nll, updates = updates) print '[SimpleRNNLM] ...build training function done...' # valid_fn return nll self.valid_batch_fn = theano.function(inputs = [x, x_mask, y], outputs = nll) # detailed valid function return both nll and y_given_x self.detailed_valid_batch_fn = theano.function(inputs = [x, x_mask, y], outputs = [nll, y_given_x]) print '[SimpleRNNLM] build train_fn and valid_fn done!' return self.train_batch_fn, self.valid_batch_fn
def test_dot_infershape(self):
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.imatrix()
    oIdx = tensor.imatrix()

    self._compile_and_check([W, h, iIdx, b, oIdx],
                            [sparse_block_dot(W, h, iIdx, b, oIdx)],
                            self.gemv_data(),
                            self.gemv_class)
def test_outer_infershape(self):
    o = tensor.ftensor4()
    x = tensor.ftensor3()
    y = tensor.ftensor3()
    xIdx = tensor.imatrix()
    yIdx = tensor.imatrix()

    self._compile_and_check([o, x, y, xIdx, yIdx],
                            [self.outer_op(o, x, y, xIdx, yIdx)],
                            self.outer_data(),
                            self.outer_class)
def __init__(self, config):
    autoassign(locals())
    self.updater = util.Adam(max_norm=config['max_norm'], lr=config['lr'])
    self.Decode = Decoder(config['size_vocab'], config['size_embed'],
                          config['size'], config['depth'],
                          activation=eval(config.get('activation', 'clipped_rectify')),
                          residual=config.get('residual', False))
    self.ToTxt = Dense(config['size'], config['size_vocab'])
    self.inputs = [T.imatrix()]
    self.target = T.imatrix()
def __init__(self, dv, dh, dx, nc, alpha=1.0, init_scale=0.2, initial_embeddings=None, params_init=None, update='adagrad', seed=None, drop_p=0.5, momentum=0.9): self.dv = dv # vocabulary size self.dh = dh # hidden node size self.dx = dx # word embedding size self.nc = nc # number of classes self.alpha = alpha # regularization strength self.drop_p = drop_p # probability of dropping an input with dropout # adagrad parameters self.epsilon = 0.00001 if initial_embeddings is None: self.emb = theano.shared(name='embeddings', value=init_scale * np.random.uniform(-1.0, 1.0, (dv, dx)).astype(theano.config.floatX)) else: self.emb = theano.shared(name='embeddings', value=initial_embeddings.astype(theano.config.floatX)) self.W_x_i = theano.shared(name='W_x_i', value=init_scale * np.random.uniform(-1.0, 1.0, (dx, dh)) .astype(theano.config.floatX)) self.W_hl_i = theano.shared(name='W_hl_i', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.W_hr_i = theano.shared(name='W_hr_i', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.b_h_i = theano.shared(name='b_h_i', value=np.array(np.zeros(dh), dtype=theano.config.floatX)) self.W_x_f = theano.shared(name='W_x_f', value=init_scale * np.random.uniform(-1.0, 1.0, (dx, dh)) .astype(theano.config.floatX)) self.b_h_f = theano.shared(name='b_h_f', value=np.array(np.random.uniform(0.0, 1.0, dh), dtype=theano.config.floatX)) self.W_hl_fl = theano.shared(name='W_hl_fl', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.W_hr_fl = theano.shared(name='W_hr_fl', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.W_hl_fr = theano.shared(name='W_hl_fr', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.W_hr_fr = theano.shared(name='W_hr_fr', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.W_x_o = theano.shared(name='W_x_o', value=init_scale * np.random.uniform(-1.0, 1.0, (dx, dh)) .astype(theano.config.floatX)) self.W_hl_o = theano.shared(name='W_hl_o', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.W_hr_o = theano.shared(name='W_hr_o', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.b_h_o = theano.shared(name='b_h_o', value=np.array(np.zeros(dh), dtype=theano.config.floatX)) self.W_x_u = theano.shared(name='W_x_u', value=init_scale * np.random.uniform(-1.0, 1.0, (dx, dh)) .astype(theano.config.floatX)) self.W_hl_u = theano.shared(name='W_hl_u', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.W_hr_u = theano.shared(name='W_hr_u', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh)) .astype(theano.config.floatX)) self.b_h_u = theano.shared(name='b_h_u', value=np.array(np.zeros(dh), dtype=theano.config.floatX)) self.W_z = theano.shared(name='W_z', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, nc)) .astype(theano.config.floatX)) self.b_z = theano.shared(name='b_z', value=np.array(np.zeros(nc), dtype=theano.config.floatX)) self.params = [self.W_x_i, self.W_hl_i, self.W_hr_i, self.b_h_i] self.params += [self.W_x_f, self.W_hl_fl, self.W_hr_fl, self.b_h_f] self.params += [self.W_hl_fr, self.W_hr_fr] self.params += [self.W_x_o, self.W_hl_o, self.W_hr_o, self.b_h_o] self.params += [self.W_x_u, self.W_hl_u, self.W_hr_u, self.b_h_u] self.params += [self.W_z, self.b_z] 
self.param_shapes = [(dx, dh), (dh, dh), (dh, dh), (dh,), (dx, dh), (dh, dh), (dh, dh), (dh,), (dh, dh), (dh, dh), (dx, dh), (dh, dh), (dh, dh), (dh,), (dx, dh), (dh, dh), (dh, dh), (dh,), (dh, nc), (nc,)] if update == 'adagrad': self.grad_histories = [ theano.shared( value=np.zeros(param_shape, dtype=theano.config.floatX), borrow=True, name="grad_hist:" + param.name ) for param_shape, param in zip(self.param_shapes, self.params) ] elif update == 'sgdm': self.velocity = [ theano.shared( value=np.zeros(param_shape, dtype=theano.config.floatX), borrow=True, name="momentum:" + param.name ) for param_shape, param in zip(self.param_shapes, self.params) ] self.momentum = momentum self.theano_rng = RandomStreams(seed) idxs = T.ivector() sequence_length = T.shape(idxs)[0] temp = self.emb[idxs] x = temp.reshape([sequence_length, dx]) #counter = T.ivector('counter') left_mask = T.imatrix('left_mask') right_mask = T.imatrix('right_mask') y = T.iscalar('y') lr = T.scalar('lr', dtype=theano.config.floatX) is_train = T.iscalar('is_train') drop_x = T.iscalar('drop_x') # This is a bit annoying; the 0th dimension of x needs to be sequence, so we can iterate over it # but the 0th dimension of the hidden nodes needs to be hidden-node dimension, so that we can broadcast # the mask out to it def treefwd(x_t, left_mask_t, right_mask_t, counter_t, h_tm1, c_tm1): h_t = h_tm1 c_t = c_tm1 # zero out the input unless this is a leaf node input = T.switch(T.eq(T.sum(left_mask_t) + T.sum(right_mask_t), 0), x_t, x_t*0) i_t = T.nnet.sigmoid(T.dot(input, self.W_x_i) + T.sum(T.dot(self.W_hl_i.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_i.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_i) fl_t = T.nnet.sigmoid(T.dot(input, self.W_x_f) + T.sum(T.dot(self.W_hl_fl.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_fl.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_f) fr_t = T.nnet.sigmoid(T.dot(input, self.W_x_f) + T.sum(T.dot(self.W_hl_fr.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_fr.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_f) o_t = T.nnet.sigmoid(T.dot(input, self.W_x_o) + T.sum(T.dot(self.W_hl_o.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_o.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_o) u_t = T.tanh(T.dot(input, self.W_x_u) + T.sum(T.dot(self.W_hl_u.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_u.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_u) c_temp = i_t * u_t + fl_t * T.sum((left_mask_t * c_tm1).T, axis=0) + fr_t * T.sum((right_mask_t * c_tm1).T, axis=0) h_temp = o_t * T.tanh(c_temp) h_t = T.set_subtensor(h_t[:, counter_t], h_temp) c_t = T.set_subtensor(c_t[:, counter_t], c_temp) return h_t, c_t def drop(drop_input, drop_p, is_train): mask = self.theano_rng.binomial(p=1.0-drop_p, size=drop_input.shape, dtype=theano.config.floatX) return T.cast(T.switch(T.neq(is_train, 0), drop_input * mask, drop_input * (1.0-self.drop_p)), dtype=theano.config.floatX) ds, dx = T.shape(x) # do dropout on x, if specified x = T.switch(T.neq(drop_x, 0), drop(x, self.drop_p, is_train), x) output, _ = theano.scan(fn=treefwd, sequences=[x, left_mask, right_mask, T.arange(0, ds)], outputs_info=[T.zeros((dh, ds), dtype=theano.config.floatX), T.zeros((dh, ds), dtype=theano.config.floatX)]) full_h, full_c = output h = full_h[-1, :, -1] h = drop(h, self.drop_p, is_train) temp = T.dot(h, self.W_z) + self.b_z p_y_given_x = T.nnet.softmax(temp)[0] pred_y = T.argmax(p_y_given_x) log_loss = T.sum(-T.log(p_y_given_x[y])) penalty = T.sum([T.sum(p ** 2) for p in 
self.params]) cost = log_loss + alpha * penalty / 2.0 gradients = [T.grad(cost, param) for param in self.params] if update == 'adagrad': new_grad_histories = [ T.cast(g_hist + g ** 2, dtype=theano.config.floatX) for g_hist, g in zip(self.grad_histories, gradients) ] grad_hist_update = zip(self.grad_histories, new_grad_histories) param_updates = [(param, T.cast(param - lr / (T.sqrt(g_hist) + self.epsilon) * param_grad, dtype=theano.config.floatX)) for param, param_grad, g_hist in zip(self.params, gradients, new_grad_histories)] updates = grad_hist_update + param_updates # sgd with momentum elif update == 'sgdm': velocity_t = [momentum * v + lr * g for v, g in zip(self.velocity, gradients)] velocity_updates = [(v, T.cast(v_t, theano.config.floatX)) for v, v_t in zip(self.velocity, velocity_t)] param_updates = [(param, T.cast(param - v_t, theano.config.floatX)) for param, v_t in zip(self.params, velocity_t)] updates = velocity_updates + param_updates # else, basic sgd else: updates = OrderedDict((p, T.cast(p - lr * g, dtype=theano.config.floatX)) for p, g in zip(self.params, gradients)) self.train = theano.function(inputs=[idxs, left_mask, right_mask, y, lr, is_train, drop_x], outputs=[pred_y, p_y_given_x, log_loss, cost], updates=updates, on_unused_input='ignore') self.predict = theano.function(inputs=[idxs, left_mask, right_mask, is_train, drop_x], outputs=[pred_y, p_y_given_x]) # good example of how to see a value in a tensor; way easier than theano.printing.Print() idx = T.iscalar('idx') emb = self.emb[idx] self.get_embedding = theano.function(inputs=[idx], outputs=emb)
RECURR_SGDM_LR.set_value(RECURR_SGDM_LR.get_value() * EPOCH_LR_COEFF) ADAM_EPOCHS = 0 else: for _ in xrange(max_epoch): RESNET_ADAM_LR.set_value(RESNET_ADAM_LR.get_value() * EPOCH_LR_COEFF) RECURR_ADAM_LR.set_value(RECURR_ADAM_LR.get_value() * EPOCH_LR_COEFF) param_values_file = 'param_values_{}.pkl'.format(max_epoch) logger.info('Building the network.') im_features = lasagne.layers.get_output(resnet['pool5']) im_features = T.flatten(im_features, outdim=2) # batch size, number of features cap_out_var = T.imatrix('cap_out') # batch size, seq len cap_in_var = T.imatrix('cap_in') # batch size, seq len mask_var = T.bmatrix('mask_var') # batch size, seq len gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=lasagne.init.Normal(), b=lasagne.init.Constant(0.0)) cell_gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=None, b=lasagne.init.Constant(0.0), nonlinearity=lasagne.nonlinearities.tanh) forget_gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=lasagne.init.Normal(), b=lasagne.init.Constant(5.0))
def __init__(self, We_initial, params): self.eta = params.eta We = theano.shared(We_initial) embsize = We_initial.shape[1] hidden = params.hidden g = T.imatrix() gmask = T.fmatrix() y = T.ivector() idxs = T.ivector() l_in_word = lasagne.layers.InputLayer((None, None)) l_mask_word = lasagne.layers.InputLayer(shape=(None, None)) if params.emb == 1: l_emb_word = lasagne.layers.EmbeddingLayer( l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We) else: l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We) #l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size= We_initial.shape[0] , output_size = embsize , W =We) #l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize , We) if params.dropout: l_emb_word = lasagne.layers.DropoutLayer(l_emb_word, p=0.5) if (params.inf == 0): l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word) l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True) l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden)) l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden)) concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb]) elif (params.inf == 1): l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1)) l_cnn_1 = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 1, 1, pad='same') l_cnn_3 = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 3, 1, pad='same') l_cnn = lasagne.layers.ConcatLayer([l_cnn_1, l_cnn_3], axis=1) #l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 1, 1, pad = 'same') concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1)) #concat2 = lasagne.layers.ConcatLayer([l_emb_word, concat2], axis =2) concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, 2 * hidden)) else: l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1)) l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 3, 1, pad='same') concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1)) concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, hidden)) concat2 = lasagne.layers.DenseLayer(concat2, num_units=hidden) if params.dropout: concat2 = lasagne.layers.DropoutLayer(concat2, p=0.5) #l_emb = lasagne.layers.DenseLayer(concat2, num_units=hidden, nonlinearity=lasagne.nonlinearities.tanh) l_out = lasagne.layers.DenseLayer( concat2, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.softmax) output = lasagne.layers.get_output(l_out, { l_in_word: g, l_mask_word: gmask }) output_1 = output[idxs] test_output = lasagne.layers.get_output(l_out, { l_in_word: g, l_mask_word: gmask }, deterministic=True) test_output_1 = test_output[idxs] model_params = lasagne.layers.get_all_params(l_out, trainable=True) self.model_p = lasagne.layers.get_all_params(l_out, trainable=True) reg = sum(lasagne.regularization.l2(x) for x in model_params) cost = lasagne.objectives.categorical_crossentropy(output_1, y) cost = T.mean(cost) + params.L2 * reg #pred = T.argmax(output_1, axis=1) final_pred = T.argmax(test_output_1, axis=1) y1 = T.ones_like(y) SUM = T.sum(y1) acc = 1.0 * T.sum(T.eq(final_pred, y)) / SUM self.acc_function = theano.function([g, gmask, y, idxs], [acc, final_pred], on_unused_input='warn') #updates = lasagne.updates.adam(cost, model_params, self.eta) #from adam import adam #updates = adam(cost, model_params, self.eta) updates = lasagne.updates.sgd(cost, model_params, self.eta) updates = lasagne.updates.apply_momentum(updates, model_params, momentum=0.9) self.train_function = theano.function([g, gmask, y, idxs], [cost, acc], 
updates=updates, on_unused_input='warn')
data_dir = "/nikel/dhpark/fundus/kaggle/original/training/train_medium"
label_file = "/nikel/dhpark/fundus/kaggle/original/training/trainLabels.csv"
#mean_file = ""
#model = "models/softmax_regression"
#model = "models/double_softmax"
model = "models/512x512_model"
#model = "models/vgg_bn_pairwise"
dst_path = "/nikel/dhpark/fundus_saved_weights/vgg_pairwise"
#dst_path = "/nikel/dhpark/fundus_saved_weights/multi_task_loss_oversampled"
#dst_path = "/nikel/dhpark/fundus_saved_weights/hybrid_loss"
#dst_path = "/nikel/dhpark/fundus_saved_weights/vgg_bn_pairwise"

# Load the model
x = T.tensor4('x')
y = T.imatrix('y')
input_layer, output_layer = load_model(model).build_model(x)

# Get batch iterator
# First load the files and split to train and validation set
# Then create an iterator using these
files = data_util.get_image_files(data_dir)
names = data_util.get_names(files)
labels = data_util.get_labels(names, label_file=label_file).astype(np.int32)
print('{} files loaded'.format(len(files)))

paired_files, paired_labels, merged_labels = data_util.pair_up(files, labels)
sss = StratifiedShuffleSplit(merged_labels, n_iter=1, test_size=0.1, random_state=123)
train_idx, valid_idx = next(iter(sss))
def __init__(self, name='gnic', nimg=2048, nh=512, nw=512, nout=8843, model_file=None):
    self.name = name
    if model_file is not None:
        with h5py.File(model_file, 'r') as f:
            nimg = f.attrs['nimg']
            nh = f.attrs['nh']
            nw = f.attrs['nw']
            nout = f.attrs['nout']
    self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout}

    # word embedding layer
    self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name + '@embedding')
    # initialization mlp layer
    self.proj_mlp = MLP(layer_sizes=[nimg, 2 * nh], output_type='tanh',
                        name=self.name + '@proj_mlp')
    # lstm
    self.lstm = BasicLSTM(dim_x=nw, dim_h=nh, name=self.name + '@lstm')
    # prediction mlp
    self.pred_mlp = MLP(layer_sizes=[nh + nw, nout], output_type='softmax',
                        name=self.name + '@pred_mlp')

    # inputs
    cap = T.imatrix('cap')
    img = T.matrix('img')
    self.inputs = [cap, img]

    # go through sequence
    init_state = self.proj_mlp.compute(img)
    (state, self.p, loss), _ = theano.scan(fn=self.scan_func,
                                           sequences=[cap[0:-1, :], cap[1:, :]],
                                           outputs_info=[init_state, None, None])

    # loss function
    loss = T.mean(loss)
    self.costs = [loss]

    # layers and parameters
    self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp]
    self.params = sum([l.params for l in self.layers], [])

    # load weights from file, if model_file is not None
    if model_file is not None:
        self.load_weights(model_file)

    # these functions are used in test stage
    self._init_func = None
    self._step_func = None
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, lambda_reg, batch_accumulator, pretrained_net, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.lambda_reg = lambda_reg lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_in, self.l_act_in, self.l_out, self.pred_z, self.true_z = \ self.build_network(network_type, \ input_width, input_height, num_actions,\ num_frames, batch_size) if self.freeze_interval > 0: self.next_l_in, self.next_l_act_in, self.next_l_out, _d, _d = \ self.build_network(network_type, input_width, \ input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.imatrix('actions') terminals = T.icol('terminals') # Shared variables for training from a minibatch of replayed # state transitions, each consisting of num_frames + 1 (due to # overlap) images, along with the chosen action and resulting # reward and terminal status. self.imgs_shared = theano.shared( np.zeros((batch_size, num_frames*2+1, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, num_frames), dtype='int32') ) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # Shared variable for a single state, to calculate q_vals. self.state_shared = theano.shared( np.zeros((num_frames*2, input_height, input_width), dtype=theano.config.floatX)) q_vals, z_pred, z_true = lasagne.layers.get_output( [self.l_out, self.pred_z, self.true_z], inputs = {self.l_in: states / input_scale, self.l_act_in: actions} ) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output( self.next_l_out, {self.next_l_in: next_states / input_scale, self.next_l_act_in: actions} ) else: next_q_vals = lasagne.layers.get_output( self.l_out, {self.l_in: next_states / input_scale, self.l_act_in: actions} ) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) terminalsX = terminals.astype(theano.config.floatX) actionmask = T.eq(T.arange(num_actions).reshape((1, -1)), actions[:, 0].reshape((-1, 1))).astype(theano.config.floatX) target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) diff = target - output diff_reg = z_true - z_pred if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 loss = loss + 0.5 * self.lambda_reg * (diff_reg ** 2).sum(axis=1) if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params([self.l_out, self.pred_z, self.true_z]) train_givens = { states: self.imgs_shared[:, :-1], next_states: self.imgs_shared[:, 1:], rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss], updates=updates, givens=train_givens) q_givens = { states: self.state_shared.reshape((1, self.num_frames*2, self.input_height, self.input_width)) } self._q_vals = theano.function([], q_vals[0], givens=q_givens)
config = importlib.import_module('configurations.%s' % metadata['configuration']) # samples dir if not os.path.isdir('samples'): os.makedirs('samples') target_path = "samples/%s-s%d-%.2f-%s.txt" % ( metadata['experiment_id'], rng_seed, temperature, time.strftime("%Y%m%d-%H%M%S", time.localtime())) token2idx = metadata['token2idx'] idx2token = dict((v, k) for k, v in token2idx.iteritems()) vocab_size = len(token2idx) print('Building the model') x = T.imatrix('x') l_inp = InputLayer((1, None), input_var=x) W_emb = np.eye( vocab_size, dtype='float32') if config.one_hot else lasagne.init.Orthogonal() emb_output_size = vocab_size if config.one_hot else config.embedding_size l_emb = EmbeddingLayer(l_inp, input_size=vocab_size, output_size=emb_output_size, W=W_emb) main_layers = [] for _ in xrange(config.num_layers): if not main_layers:
def ready(self): args = self.args w_emb_layer = self.w_emb_layer c_emb_layer = self.c_emb_layer r_emb_layers = self.r_emb_layers r_matrix_layers = self.r_matrix_layers char_dim = self.char_dim = args.char_dim char_lstm_dim = self.char_lstm_dim = args.char_lstm_dim word_dim = self.word_dim = args.word_dim word_lstm_dim = self.word_lstm_dim = args.word_lstm_dim dropout = self.dropout = theano.shared( np.float64(args.dropout).astype(theano.config.floatX)) word_ids = self.word_ids = T.ivector('word_ids') char_ids = self.char_ids = T.imatrix('char_ids') char_lens = self.char_lens = T.fvector('char_lens') char_masks = self.char_masks = T.imatrix('char_masks') up_ids = self.up_ids = T.imatrix('up_ids') up_rels = self.up_rels = T.imatrix('up_rels') up_id_masks = self.up_id_masks = T.imatrix('up_id_masks') down_ids = self.down_ids = T.imatrix('down_ids') down_rels = self.down_rels = T.imatrix('down_rels') down_id_masks = self.down_id_masks = T.imatrix('down_id_masks') tag_ids = self.tag_ids = T.ivector('tag_ids') layers = self.layers = [w_emb_layer, c_emb_layer] layers.extend(r_emb_layers) layers.extend(r_matrix_layers) inputs = self.inputs = [] inputs.append(self.word_ids) inputs.append(self.char_ids) inputs.append(self.char_lens) inputs.append(self.char_masks) inputs.append(self.up_ids) inputs.append(self.up_rels) inputs.append(self.up_id_masks) inputs.append(self.down_ids) inputs.append(self.down_rels) inputs.append(self.down_id_masks) inputs.append(self.tag_ids) wslices = w_emb_layer.forward(word_ids) cslices = c_emb_layer.forward(char_ids.ravel()) cslices = cslices.reshape( (char_ids.shape[0], char_ids.shape[1], char_dim)) cslices = cslices.dimshuffle(1, 0, 2) bv_ur_slicess = [] bv_dr_slicess = [] b_ur_slicess = [] b_dr_slicess = [] bv_ur_matrixss = [] bv_dr_matrixss = [] b_ur_matrixss = [] b_dr_matrixss = [] for r_matrix_layer in r_matrix_layers: bv_ur_matrixs = r_matrix_layer.forward1(up_rels.ravel()) bv_dr_matrixs = r_matrix_layer.forward1(down_rels.ravel()) b_ur_matrixs = r_matrix_layer.forward2(up_rels.ravel()) b_dr_matrixs = r_matrix_layer.forward2(down_rels.ravel()) bv_ur_matrixss.append( bv_ur_matrixs.reshape( (up_rels.shape[0], up_rels.shape[1], word_dim, word_dim))) bv_dr_matrixss.append( bv_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1], word_dim, word_dim))) b_ur_matrixss.append( b_ur_matrixs.reshape( (up_rels.shape[0], up_rels.shape[1], word_dim, word_dim))) b_dr_matrixss.append( b_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1], word_dim, word_dim))) for r_emb_layer in r_emb_layers: bv_ur_slices = r_emb_layer.forward(up_rels.ravel()) bv_dr_slices = r_emb_layer.forward(down_rels.ravel()) b_ur_slices = r_emb_layer.forward2(up_rels.ravel()) b_dr_slices = r_emb_layer.forward2(down_rels.ravel()) bv_ur_slicess.append( bv_ur_slices.reshape( (up_rels.shape[0], up_rels.shape[1], word_dim))) bv_dr_slicess.append( bv_dr_slices.reshape( (down_rels.shape[0], down_rels.shape[1], word_dim))) b_ur_slicess.append( b_ur_slices.reshape( (up_rels.shape[0], up_rels.shape[1], word_dim))) b_dr_slicess.append( b_dr_slices.reshape( (down_rels.shape[0], down_rels.shape[1], word_dim))) char_masks = char_masks.dimshuffle(1, 0) prev_output = wslices prev_size = word_dim if char_dim: layers.append( LSTM(n_in=char_dim, n_out=char_lstm_dim, direction='bi' if args.char_bidirect else 'si')) prev_output_2 = cslices prev_output_2 = apply_dropout(prev_output_2, dropout, v2=True) prev_output_2 = layers[-1].forward_all(cslices, char_masks) prev_output_2 = T.sum(prev_output_2, axis=0) 
prev_output_2 = prev_output_2 / (1e-6 * T.ones_like(char_lens) + char_lens).dimshuffle(0, 'x') prev_size += char_lstm_dim prev_output = T.concatenate([prev_output, prev_output_2], axis=1) prev_output = apply_dropout(prev_output, dropout) if args.conv != 0: for ind in range(args.clayer): layers.append(GraphCNNTensor( n_in=prev_size, n_out=prev_size, )) residual = True if ind == 0: residual = False prev_output = layers[-1].forward_all(prev_output, up_ids, up_id_masks, bv_ur_slicess[ind], bv_ur_matrixss[ind], b_ur_slicess[ind], b_ur_matrixss[ind], down_ids, down_id_masks, bv_dr_slicess[ind], bv_dr_matrixss[ind], b_dr_slicess[ind], b_dr_matrixss[ind], residual=residual) prev_output = apply_dropout(prev_output, dropout) prev_size *= 3 layers.append( LSTM(n_in=prev_size, n_out=word_lstm_dim, direction='bi' if args.word_bidirect else 'si')) prev_output = prev_output.dimshuffle(0, 'x', 1) prev_output = layers[-1].forward_all(prev_output) prev_output = prev_output.reshape( (prev_output.shape[0], prev_output.shape[-1])) prev_size = word_lstm_dim layers.append( Layer( n_in=prev_size, n_out=args.classes, activation=linear, #ReLU, has_bias=False)) n_tags = args.classes s_len = char_ids.shape[0] tags_scores = layers[-1].forward(prev_output) transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate([tags_scores, small * T.ones((s_len, 2))], axis=1) observations = T.concatenate([b_s, observations, e_s], axis=0) real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) pre_ids = T.arange(s_len + 1) s_ids = T.arange(s_len + 1) + 1 real_path_score += transitions[padded_tags_ids[pre_ids], padded_tags_ids[s_ids]].sum() all_paths_scores = CRFForward(observations, transitions) self.nll_loss = nll_loss = -(real_path_score - all_paths_scores) preds = CRFForward(observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True) self.pred = preds[1:-1] self.l2_sqr = None params = self.params = [transitions] for layer in layers: self.params += layer.params for p in self.params: if self.l2_sqr is None: self.l2_sqr = args.l2_reg * T.sum(p**2) else: self.l2_sqr += args.l2_reg * T.sum(p**2) #for l, i in zip(layers[3:], range(len(layers[3:]))): for l, i in zip( layers[2 + len(r_emb_layers) + len(r_matrix_layers):], range( len(layers[2 + len(r_emb_layers) + len(r_matrix_layers):]))): say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out)) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in self.params) say("total # parameters: {}\n".format(nparams)) cost = self.nll_loss + self.l2_sqr lr_method_name = args.learning lr_method_parameters = {} lr_method_parameters['lr'] = args.learning_rate updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=self.inputs, outputs=[cost, nll_loss], updates=updates, allow_input_downcast=True) f_eval = theano.function(inputs=self.inputs[:-1], outputs=self.pred, allow_input_downcast=True) return f_train, f_eval
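# --- Added illustration (toy numbers, not from the original model) ---
# The real_path_score above is the emission score of the observed tags plus the
# transition scores over the tag sequence padded with the extra begin (n_tags)
# and end (n_tags + 1) states. Equivalent NumPy sketch:
import numpy as np

n_tags, s_len = 3, 4
tags_scores = np.random.rand(s_len, n_tags)           # per-position emission scores
transitions = np.random.rand(n_tags + 2, n_tags + 2)  # includes begin/end states
tag_ids = np.array([0, 2, 1, 1])

emission_score = tags_scores[np.arange(s_len), tag_ids].sum()
padded = np.concatenate([[n_tags], tag_ids, [n_tags + 1]])
transition_score = transitions[padded[:-1], padded[1:]].sum()
print(emission_score + transition_score)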
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, training=True, hard_training_labels=True, crf_probs=False, **kwargs): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') if hard_training_labels: tag_ids = T.ivector(name='tag_ids') else: tag_dist = T.imatrix(name='tag_dist') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print('Loading pretrained embeddings from %s...' % pre_emb) pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): if i > 100000: break # we don't need all the embeddings line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print('WARNING: %i invalid lines' % emb_invalid) c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in range(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[re.sub( '\d', '0', word.lower())] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print('Loaded %i pretrained embeddings.' % len(pretrained)) print(('%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % (c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words)) print(('%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % (c_found, c_lower, c_zeros)) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_output = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input inputs = T.concatenate(inputs, axis=1) if len(inputs) != 1 else inputs[0] # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate([word_for_output, word_rev_output], axis=1) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: # Here we pass in hard labels as 'tag_ids'. We can also pass in a matrix in this place where each row is # a categorical distribution to provide a soft label. if hard_training_labels: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() else: cost = T.nnet.categorical_crossentropy(tags_scores, tag_dist).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) observations = T.concatenate([b_s, observations, e_s], axis=0) # Score from tags -- uses tag_ids as hard labels here. 
if hard_training_labels: real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() else: # soft training labels (probabilities) real_path_score = (tags_scores * tag_dist).sum() b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) if hard_training_labels: padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) else: padded_tags_ids = T.concatenate([b_id, tag_dist, e_id], axis=0) real_path_score += transitions[padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1]].sum() all_paths_scores = forward(observations, transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) if hard_training_labels: train_inputs = eval_inputs + [tag_ids] else: train_inputs = eval_inputs + [tag_dist] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print('Compiling...') if training: updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {})) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {})) else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( observations, transitions, viterbi=True, return_alpha=crf_probs, return_best_sequence=not crf_probs), givens=({ is_train: np.cast['int32'](0) } if dropout else {})) return f_train, f_eval
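# --- Added sketch (assumption: `forward(observations, transitions)` computes the
# standard linear-chain CRF log-partition over all tag paths; this NumPy version
# is an illustration, not the project's implementation) ---
import numpy as np

def log_sum_exp(x, axis):
    m = x.max(axis=axis, keepdims=True)
    return (m + np.log(np.exp(x - m).sum(axis=axis, keepdims=True))).squeeze(axis)

def crf_forward(observations, transitions):
    # observations: (s_len + 2) x (n_tags + 2), already padded with b_s / e_s rows
    alpha = observations[0]
    for obs in observations[1:]:
        alpha = log_sum_exp(alpha[:, None] + transitions + obs[None, :], axis=0)
    return log_sum_exp(alpha, axis=0)   # log of the summed scores over all paths

print(crf_forward(np.random.rand(6, 5), np.random.rand(5, 5)))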
def build_model(options): print('Build model...') sys.stdout.flush() weights = None if options['flag_random_lookup_table'] == False: weights = options['embedding'] embed_layer = Embedding(input_dim=options['embedding'].shape[0], output_dim=options['embedding'].shape[1], weights=weights) dense_layers = [] dense_layers.append( Dense(input_dim=options['embedding'].shape[1] * 2, output_dim=options['size_hidden_layer'], activation='tanh')) dense_layers.append( Dense(input_dim=options['size_hidden_layer'], output_dim=1, activation='sigmoid')) # for training sentence1 = T.imatrix('s1') # sentence1, n_samples * len_sentence sentence1_mask = T.matrix('s1_mask') sentence2 = T.imatrix('s2') # sentence2, n_samples * len_sentence sentence2_mask = T.matrix('s2_mask') y = T.ivector('y1') # n_samples embed_s1 = embed_layer.get_output( sentence1) # n_samples * len_sentence * embed_dim embed_s2 = embed_layer.get_output( sentence2) # n_samples * len_sentence * embed_dim if options['sentence_modeling'] == 'CBoW': embed_s1 = ave_embed(embed_s1, sentence1_mask) # n_samples * embed_dim embed_s2 = ave_embed(embed_s2, sentence2_mask) # n_samples * embed_dim elif options['sentence_modeling'] == 'CNN': sentence_encode_layer = Convolution1D( input_dim=options['embedding'].shape[1], activation='tanh', nb_filter=options['embedding'].shape[1], filter_length=options['CNN_filter_length'], border_mode='same') embed_s1 = CNN_embed(embed_s1, sentence1_mask, sentence_encode_layer) # n_samples * embed_dim embed_s2 = CNN_embed(embed_s2, sentence2_mask, sentence_encode_layer) # n_samples * embed_dim elif options['sentence_modeling'] == 'LSTM': sentence_encode_layer = LSTM(input_dim=options['embedding'].shape[1], output_dim=options['embedding'].shape[1]) embed_s1 = LSTM_embed(embed_s1, sentence1_mask, sentence_encode_layer, options) # n_samples * embed_dim embed_s2 = LSTM_embed(embed_s2, sentence2_mask, sentence_encode_layer, options) # n_samples * embed_dim else: print 'Error: No model called %s available!' % options[ 'sentence_modeling'] return output = T.concatenate([embed_s1, embed_s2], axis=-1) # n_samples * (embed_dim * 2) if options['flag_dropout'] == True: output = dropout(output, level=options['dropoutRates']) for dense_layer in dense_layers: output = dense_layer.get_output(output) f_pred = theano.function( [sentence1, sentence1_mask, sentence2, sentence2_mask], output, allow_input_downcast=True) output = output.reshape((output.shape[0], )) #y = y.reshape((output.shape[0],1)) cost = T.nnet.binary_crossentropy(output, y).mean() f_debug = theano.function( [sentence1, sentence1_mask, sentence2, sentence2_mask, y], [output, y, T.nnet.binary_crossentropy(output, y), cost], allow_input_downcast=True) tparams = [] tparams += embed_layer.params if options['sentence_modeling'] != 'CBoW': tparams += sentence_encode_layer.params for dense_layer in dense_layers: tparams += dense_layer.params return sentence1, sentence1_mask, sentence2, sentence2_mask, y, cost, f_pred, tparams, f_debug
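# --- Added sketch: `ave_embed` is called above but not defined in this snippet.
# A plausible masked-average (CBoW) version, assuming embed_s has shape
# (n_samples, len_sentence, embed_dim) and mask marks real tokens with 1.0: ---
import theano.tensor as T

def ave_embed(embed_s, mask):
    summed = (embed_s * mask.dimshuffle(0, 1, 'x')).sum(axis=1)
    counts = mask.sum(axis=1).dimshuffle(0, 'x') + 1e-6   # avoid division by zero
    return summed / counts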
initialization='he', weightnorm=WEIGHT_NORM) out = T.nnet.relu(out) # Output # We apply the softmax later out = lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out, weightnorm=WEIGHT_NORM) return out print('----got to T var---') sequences = T.imatrix('sequences') h0 = T.tensor3('h0') reset = T.iscalar('reset') mask = T.matrix('mask') #sequences_lab = T.tensor3('sequences_lab') sequences_lab = T.itensor3('sequences_lab') if args.debug: # Solely for debugging purposes. # Maybe I should set the compute_test_value=warn from here. sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN + OVERLAP), dtype='int32') h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT * DIM), dtype='float32') reset.tag.test_value = numpy.array(1, dtype='int32') mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN + OVERLAP),
def run_epoch(): # define symbolic variables q = T.imatrix('q') q_mask = T.matrix('q_mask', dtype=theano.config.floatX) l = T.imatrix('l') l_mask = T.matrix('l_mask', dtype=theano.config.floatX) a = T.imatrix('a') a_mask = T.matrix('a_mask', dtype=theano.config.floatX) y = T.ivector('y') lr = T.scalar(name='lr') np_emb = get_embedding_matrix_from_param_file(config.embedding_param_file) # build model print '...building model' model = DMN(q, q_mask, l, l_mask, a, a_mask, y, np_emb, options['word_size'], options['hidden_size'], options['use_dropout'], options['drop_p']) cost = model.loss grads = T.grad(cost, wrt=list(model.params.values())) optimizer = options['optimizer'] f_grad_shared, f_update = optimizer(lr, model.params, grads, [q, q_mask, l, l_mask, a, a_mask, y], cost) detector = theano.function(inputs=[q, q_mask, l, l_mask, a, a_mask, y], outputs=model.error, on_unused_input='ignore') p_predictor = theano.function(inputs=[q, q_mask, l, l_mask, a, a_mask], outputs=model.p_d, on_unused_input='ignore') # load parameters from specified file if not options['loaded_params'] is None: print '...loading parameters from ' + options['loaded_params'] file_name = options['loaded_params'] with open(file_name, 'rb') as f: param_dict = cPickle.load(f) for k, v in model.params.items(): v.set_value(param_dict[k]) # test the performance of initialized parameters print '...testing the performance of initialized parameters' p_ds = [] ys = [] for q_, q_mask_, l_, l_mask_, a_, a_mask_, y_ in gkhmc_qla_iterator( path=config.dataset, batch_size=options['valid_batch_size'], is_train=False): p_d = p_predictor(q_, q_mask_, l_, l_mask_, a_, a_mask_) p_ds.extend(p_d) ys.extend(y_) right_num, total_num, _ = pred_check(p_ds, ys) print right_num, '/', total_num best_perform = -np.inf print '...training model' for i in xrange(options['max_epochs']): total_loss = 0. idx = 0 for q_, q_mask_, l_, l_mask_, a_, a_mask_, y_ in gkhmc_qla_iterator( path=config.dataset, batch_size=options['batch_size'], is_train=True): model.emb_set_value_zero() this_cost = f_grad_shared(q_, q_mask_, l_, l_mask_, a_, a_mask_, y_) f_update(options['lrate']) total_loss += this_cost print '\r', 'epoch:', i, ', idx:', idx, ', this_loss:', this_cost, idx += 1 print ', total loss:', total_loss # validate model performance when necessary if (i + 1) % options['valid_freq'] == 0: # test performance on train set errors = [] for q_, q_mask_, l_, l_mask_, a_, a_mask_, y_ in gkhmc_qla_iterator( path=config.dataset, batch_size=options['valid_batch_size'], is_train=True): error = detector(q_, q_mask_, l_, l_mask_, a_, a_mask_, y_) errors.append(error) print '\ttrain error of epoch ' + str(i) + ': ' + str( np.mean(errors) * 100) + '%' # test performance on test set p_ds = [] ys = [] for q_, q_mask_, l_, l_mask_, a_, a_mask_, y_ in gkhmc_qla_iterator( path=config.dataset, batch_size=options['valid_batch_size'], is_train=False): p_d = p_predictor(q_, q_mask_, l_, l_mask_, a_, a_mask_) p_ds.extend(p_d) ys.extend(y_) right_num, total_num, _ = pred_check(p_ds, ys) # judge whether it's necessary to save the parameters save = False if float(right_num) / float(total_num) > best_perform: best_perform = float(right_num) / float(total_num) save = True print '\ttest performance of epoch', i, ':', right_num, '/', total_num, '\t', \ float(right_num * 10000 / total_num) / 100., '%', '\tbest through:', float(int(best_perform * 10000)) / 100. 
# save parameters if needed if save: print '\t...saving parameters' file_name = options['param_path'] + model.name + '_hidden' + str(options['hidden_size']) + '_lrate' + \ str(options['lrate']) + '_batch' + str(options['batch_size']) + '_epoch' + str(i+1) + \ '_perform' + str(float(int(best_perform * 10000)) / 100.) + '.pickle' with open(file_name, 'wb') as f: new_dict = {} for k, v in model.params.items(): new_dict[k] = v.get_value() cPickle.dump(new_dict, f)
def build_network_from_ae(classn): input_var = T.tensor4('input_var') target_var = T.imatrix('targets') layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var) layer = batch_norm( layers.Conv2DLayer(layer, 100, filter_size=(5, 5), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 100, filter_size=(5, 5), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 120, filter_size=(4, 4), stride=1, nonlinearity=leaky_rectify)) layer = layers.MaxPool2DLayer(layer, pool_size=(3, 3), stride=2) layer = batch_norm( layers.Conv2DLayer(layer, 240, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 480, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 480, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 480, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 480, filter_size=(3, 3), stride=1, nonlinearity=leaky_rectify)) layer = layers.Pool2DLayer(layer, pool_size=(20, 20), stride=20, mode='average_inc_pad') network = layers.DenseLayer(layer, classn, nonlinearity=sigmoid) return network, input_var, target_var
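# --- Added sketch (not in the original file): one way the returned network,
# input_var and target_var could be wired into a multi-label training function.
# The binary cross-entropy loss and Adam optimizer are assumptions for
# illustration only. ---
import theano
import theano.tensor as T
import lasagne

def compile_train_fn(network, input_var, target_var, lr=1e-3):
    prediction = lasagne.layers.get_output(network)
    targets = T.cast(target_var, theano.config.floatX)
    loss = lasagne.objectives.binary_crossentropy(prediction, targets).mean()
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=lr)
    return theano.function([input_var, target_var], loss, updates=updates)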
def __init__(self, args, sent_emb_dim, flic_dim, load_model=None, epochs_done=0): """Initializes the model and constructs the Theano Computation Graph.""" self.args = args self.sig_handler = GracefullExit() self.best_val_error = sys.float_info.max if self.args.sample_all_sentences: print("We sample all items from the generator in each iteration.") else: print("We sample {} sets from the generator in each iteration.". format(args.num_samples)) self.flic_dim = flic_dim self.sent_emb_dim = sent_emb_dim # TODO: Implement or remove self.dropout_encoder = theano.shared( np.float64(args.dropout_encoder).astype(theano.config.floatX)) self.dropout_generator = theano.shared( np.float64(args.dropout_generator).astype(theano.config.floatX)) # Generator and Encoder Layers self.generator = Generator(args, None, self.sent_emb_dim, flic_dim) if self.args.context == 'train_context': self.encoder = Encoder(args, None, self.sent_emb_dim) else: self.encoder = Encoder(args, None, flic_dim) #--------------------------------------------------------------------------------------------------------------- # Construct computation graph #--------------------------------------------------------------------------------------------------------------- print("Constructing computation graph.") # (Input) Tensors sent_embs_t = T.matrix('sent_embs', dtype=theano.config.floatX) context_t = T.vector('context', dtype=theano.config.floatX) sample_sentences_padded_t = T.tensor3('sample_sent_embeddings', dtype=theano.config.floatX) item_counts_t = T.ivector('item_counts') y_t = T.scalar('y', dtype=theano.config.floatX) samples_t = T.imatrix('samples') # Sentence embedding max_num_sents_t = T.iscalar('max_num_sents') transformed_context_t = self.generator.transform_context( context_t, normalize_embeddings=True) transformed_sent_embs_t = self.generator.transform_sent_embs( sent_embs_t, normalize_embeddings=True) if not self.args.sample_all_sentences: # Construct L to sample from the DPP. L_t = self.generator.get_L(transformed_sent_embs_t, transformed_context_t) self.get_L_t = theano.function( inputs=[sent_embs_t, context_t], outputs=L_t, #mode='DebugMode', #profile=True, allow_input_downcast=True, #mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True), on_unused_input='warn') # The samples will be passed into the Deep Set Layer to calculate the final cost. 
# Encoder cost & updates padded_sents_t, sents_count_t = self.generator.get_padded_embeddings_from_samples_t( samples_t, transformed_sent_embs_t, max_num_sents_t) probs_t, costs_encoder_t, \ probs_mean_t, costs_encoder_mean_t, rand_updates = self.encoder.cost(padded_sents_t, sents_count_t, transformed_context_t, y_t) # Generator cost & updates lprob_t, cost_enc_t, cost_generator_t = self.generator.cost( L_t, samples_t, costs_encoder_t, sents_count_t) # Sample all sentences # TODO: Implement or remove else: probs_mean_t, costs_encoder_mean_t = self.encoder.cost_all_sentences( transformed_sent_embs_t, transformed_context_t, y_t) cost_generator_t = costs_encoder_mean_t # Updates of the Generator and Encoder Parameters updates_e, self.lr_e, gnorm_e, self.params_opt_e = create_optimization_updates( cost=costs_encoder_mean_t, params=self.encoder.get_params(), method=self.args.learning, lr=self.args.learning_rate_encoder)[:4] updates_g, self.lr_g, gnorm_g, self.params_opt_g = create_optimization_updates( cost=cost_generator_t, params=self.generator.get_params(), method=self.args.learning, lr=self.args.learning_rate_generator)[:4] if self.args.adaptive_lrs: self.adaptive_learning_rate = adaptive_learning_rate( lr_1=self.lr_e, lr_2=self.lr_g) else: self.adaptive_learning_rate = adaptive_learning_rate() if not self.args.sample_all_sentences: # Compile training graph self.train_model_t = theano.function( inputs=[ sent_embs_t, samples_t, max_num_sents_t, context_t, y_t ], outputs=[probs_mean_t, costs_encoder_mean_t], updates=updates_e.items() + updates_g.items() + rand_updates, allow_input_downcast=True, #mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False), #mode='DebugMode', on_unused_input='warn') # Compile graph for validation data self.validate_t = theano.function( inputs=[ sent_embs_t, samples_t, max_num_sents_t, context_t, y_t ], outputs=[probs_mean_t, costs_encoder_mean_t], updates=rand_updates, allow_input_downcast=True, on_unused_input='warn') else: # Compile train graph self.train_model_t = theano.function( inputs=[sent_embs_t, context_t, y_t], outputs=[probs_mean_t, costs_encoder_mean_t], updates=updates_g.items() + updates_e.items(), allow_input_downcast=True, # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True), on_unused_input='warn') # Compile graph for validation data self.validate_t = theano.function( inputs=[sent_embs_t, context_t, y_t], outputs=[probs_mean_t, costs_encoder_mean_t], updates=[], allow_input_downcast=True, on_unused_input='warn') # Load pretrained model if load_model: self.load(load_model) elif self.args.load_model: self.load(args.load_model) self.epochs_done = epochs_done
# ##################################### # Create the train and predict_labels function n_in = 2*windowSize+1 n_hidden = numHiddenUnits n_out = len(label2Idx) number_of_epochs = 10 minibatch_size = 35 embedding_size = embeddings.shape[1] dim_case = 6 x = T.imatrix('x') # the data, one word+context per row y = T.ivector('y') # the labels are presented as 1D vector of [int] labels print "Embeddings shape",embeddings.shape words = Sequential() words.add(Embedding(output_dim=embeddings.shape[1], input_dim=embeddings.shape[0], input_length=n_in, weights=[embeddings])) words.layers[0].trainable_weights = [] #Fixed Embedding layer words.add(Flatten()) casing = Sequential() casing.add(Embedding(output_dim=caseMatrix.shape[1], input_dim=caseMatrix.shape[0], input_length=n_in, weights=[caseMatrix])) casing.layers[0].trainable_weights = [] #Fixed Embedding layer casing.add(Flatten())
def setupSetMacroBatchSubset(self): if isinstance(self.tvsData_x, list): data_block = [ T.tensor4('data_block_{}'.format(i)) for i in range(len(self.tvsData_x)) ] data_updates = [(dx, T.set_subtensor(dx[:db.shape[0]], db)) for (dx, db) in zip(self.tvsData_x, data_block)] else: data_block = T.tensor4('data_block') data_updates = [ (self.tvsData_x, T.set_subtensor(self.tvsData_x[:data_block.shape[0]], data_block)) ] self.tfSetMacroBatchSubsetData = theano.function(inputs=[data_block], updates=data_updates) if self.cfgParams.use_labels: y_block = T.ivector('y_block') y_updates = [(self.tvsData_y, T.set_subtensor(self.tvsData_y[:y_block.shape[0]], y_block))] self.tfSetMacroBatchSubsetY = theano.function(inputs=[y_block], updates=y_updates) if self.cfgParams.use_regtargets: yr_block = T.vector('yr_block') yr_updates = [(self.tvsData_yr, T.set_subtensor(self.tvsData_yr[:yr_block.shape[0]], yr_block))] self.tfSetMacroBatchSubsetYR = theano.function(inputs=[yr_block], updates=yr_updates) if self.cfgParams.use_pairs: pairIdx_block = T.imatrix('pairIdx_block') pairLabels_block = T.ivector('pairLabels_block') pair_updates = [ (self.tvsData_pairIdx, T.set_subtensor(self.tvsData_pairIdx[:pairIdx_block.shape[0]], pairIdx_block)), (self.tvsData_pairLabels, T.set_subtensor( self.tvsData_pairLabels[:pairLabels_block.shape[0]], pairLabels_block)) ] self.tfSetMacroBatchSubsetPairs = theano.function( inputs=[pairIdx_block, pairLabels_block], updates=pair_updates) if self.cfgParams.use_triplets: tripletIdx_block = T.imatrix('tripletIdx_block') triplets_updates = [ (self.tvsData_tripletIdx, T.set_subtensor( self.tvsData_tripletIdx[:tripletIdx_block.shape[0]], tripletIdx_block)) ] self.tfSetMacroBatchSubsetTriplets = theano.function( inputs=[tripletIdx_block], updates=triplets_updates) if self.cfgParams.use_tripletPools: tripletPoolIdx_block = T.imatrix('tripletPoolIdx_block') tripletPools_updates = [ (self.tvsData_tripletPoolIdx, T.set_subtensor( self.tvsData_tripletPoolIdx[:tripletPoolIdx_block. shape[0]], tripletPoolIdx_block)) ] self.tfSetMacroBatchSubsetTripletPools = theano.function( inputs=[tripletPoolIdx_block], updates=tripletPools_updates)
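# --- Added standalone example (toy sizes) of the T.set_subtensor update pattern
# used above: write a variable-sized block into the front of a preallocated
# shared buffer. ---
import numpy as np
import theano
import theano.tensor as T

buf = theano.shared(np.zeros((8, 3), dtype='float32'))
block = T.fmatrix('block')
write_block = theano.function(
    [block], updates=[(buf, T.set_subtensor(buf[:block.shape[0]], block))])
write_block(np.ones((2, 3), dtype='float32'))
print(buf.get_value()[:3])   # first two rows are ones, the rest still zeros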
import theano
import theano.tensor as T
import numpy as np

a = T.imatrix()
b = T.imatrix()
ok = T.horizontal_stack(a, b)
myfunc = theano.function([a, b], ok)

a_init = np.reshape(np.arange(10, dtype='int32'), (2, 5))
b_init = np.reshape(np.arange(10, 20, dtype='int32'), (2, 5))
ok = myfunc(a_init, b_init)
print ok
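# --- Added note: T.horizontal_stack(a, b) is equivalent to concatenating along
# axis=1 (T.vertical_stack stacks along axis=0). Quick check: ---
stacked = theano.function([a, b], T.concatenate([a, b], axis=1))
print(np.array_equal(stacked(a_init, b_init), np.hstack([a_init, b_init])))  # True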
which_set=args.train_dataset, batch_size=args.batch_size) valid_datastream = get_datastream(path=args.data_path, which_set=args.valid_dataset, batch_size=args.batch_size) test_datastream = get_datastream(path=args.data_path, which_set=args.test_dataset, batch_size=args.batch_size) ################# # build network # ################# print('Building and compiling network') input_data = T.ftensor3('input_data') input_mask = T.fmatrix('input_mask') target_data = T.imatrix('target_data') target_mask = T.fmatrix('target_mask') network_output = deep_prj_lstm_model_v1(input_var=input_data, mask_var=input_mask, num_inputs=input_dim, num_outputs=output_dim, num_layers=args.num_layers, num_units=args.num_units, num_prjs=args.num_prjs, grad_clipping=args.grad_clipping, dropout=args.dropout) network = network_output network_params = get_all_params(network, trainable=True) network_reg_params = get_all_params(network, trainable=True,
def __init__(self): super(SimpleVLblNceTrainer, self).__init__() self.h_indices = debug_print(T.imatrix('h'), 'h_indices') self.w_indices = debug_print(T.ivector(name='w'), 'w_indices') self.inputs = [self.h_indices, self.w_indices]
def build_network_from_ae(classn): input_var = T.tensor4('input_var') layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var) layer = batch_norm( layers.Conv2DLayer(layer, 100, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 120, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify)) layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad') layer = batch_norm( layers.Conv2DLayer(layer, 240, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad') layer = batch_norm( layers.Conv2DLayer(layer, 640, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) prely = batch_norm( layers.Conv2DLayer(layer, 1024, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) featm = batch_norm( layers.Conv2DLayer(prely, 640, filter_size=(1, 1), nonlinearity=leaky_rectify)) feat_map = batch_norm( layers.Conv2DLayer(featm, 100, filter_size=(1, 1), nonlinearity=rectify, name="feat_map")) maskm = batch_norm( layers.Conv2DLayer(prely, 100, filter_size=(1, 1), nonlinearity=leaky_rectify)) mask_rep = batch_norm(layers.Conv2DLayer(maskm, 1, filter_size=(1, 1), nonlinearity=None), beta=None, gamma=None) mask_map = SoftThresPerc(mask_rep, perc=97.0, alpha=0.1, beta=init.Constant(0.5), tight=100.0, name="mask_map") enlyr = ChInnerProdMerge(feat_map, mask_map, name="encoder") layer = batch_norm( layers.Deconv2DLayer(enlyr, 1024, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 640, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 640, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 240, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 120, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = batch_norm( layers.Deconv2DLayer(layer, 100, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify)) layer = layers.Deconv2DLayer(layer, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity) glblf = batch_norm( layers.Conv2DLayer(prely, 128, filter_size=(1, 1), nonlinearity=leaky_rectify)) glblf = layers.Pool2DLayer(glblf, pool_size=(5, 5), stride=5, mode='average_inc_pad') glblf = batch_norm( layers.Conv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify)) gllyr = batch_norm(layers.Conv2DLayer(glblf, 5, filter_size=(1, 1), nonlinearity=rectify), name="global_feature") glblf = batch_norm( layers.Deconv2DLayer(gllyr, 256, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 128, filter_size=(9, 9), stride=5, crop=(2, 2), nonlinearity=leaky_rectify)) 
glblf = batch_norm( layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 64, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 32, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = batch_norm( layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify)) glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity) layer = layers.ElemwiseSumLayer([layer, glblf]) network = ReshapeLayer(layer, ([0], -1)) layers.set_all_param_values(network, pickle.load(open(filename_model_ae, 'rb'))) mask_map.beta.set_value(np.float32(0.8 * mask_map.beta.get_value())) old_params = layers.get_all_params(network, trainable=True) # Adding more layers aug_var = T.matrix('aug_var') target_var = T.imatrix('targets') add_a = batch_norm( layers.Conv2DLayer(enlyr, 320, filter_size=(1, 1), nonlinearity=leaky_rectify)) add_b = batch_norm( layers.Conv2DLayer(add_a, 320, filter_size=(1, 1), nonlinearity=leaky_rectify)) add_c = batch_norm( layers.Conv2DLayer(add_b, 320, filter_size=(1, 1), nonlinearity=leaky_rectify)) add_d = batch_norm( layers.Conv2DLayer(add_c, 320, filter_size=(1, 1), nonlinearity=leaky_rectify)) add_0 = layers.Pool2DLayer(add_d, pool_size=(25, 25), stride=25, mode='average_inc_pad') add_1 = batch_norm( layers.DenseLayer(add_0, 100, nonlinearity=leaky_rectify)) add_2 = batch_norm( layers.DenseLayer(gllyr, 320, nonlinearity=leaky_rectify)) add_3 = batch_norm( layers.DenseLayer(add_2, 320, nonlinearity=leaky_rectify)) add_4 = batch_norm( layers.DenseLayer(add_3, 100, nonlinearity=leaky_rectify)) aug_layer = layers.InputLayer(shape=(None, aug_fea_n), input_var=aug_var) cat_layer = lasagne.layers.ConcatLayer([add_1, add_4, aug_layer], axis=1) hidden_layer = layers.DenseLayer(cat_layer, 80, nonlinearity=leaky_rectify) network = layers.DenseLayer(hidden_layer, classn, nonlinearity=sigmoid) all_params = layers.get_all_params(network, trainable=True) new_params = [x for x in all_params if x not in old_params] return network, new_params, input_var, aug_var, target_var
def __init__(self, num_hidden, num_classes, context_win_size, embeddings, featdim=0, fine_tuning=False, truncate_gradient=-1): """ num_hidden :: dimension of the hidden layer num_classes :: number of classes context_win_size :: word window context size embeddings :: matrix """ # hyper parameters of the model self.hyperparams = {} # nh :: dimension of the hidden layer nh = num_hidden self.hyperparams['nh'] = nh # nc :: number of classes nc = num_classes self.hyperparams['nc'] = nc # de :: dimension of the word embeddings de = embeddings.shape[1] self.hyperparams['de'] = de # cs :: word window context size cs = context_win_size self.hyperparams['cs'] = cs self.hyperparams['featdim'] = featdim self.hyperparams['fine_tuning'] = fine_tuning self.hyperparams['truncate_gradient'] = truncate_gradient # parameters of the model self.emb = theano.shared(embeddings.astype(theano.config.floatX)) # inputs idxs = T.imatrix() w = T.fscalar('w') x = self.emb[idxs].reshape((idxs.shape[0], de * cs))*w y = T.iscalar('y') y_sentence = T.ivector('y_sentence') f = T.matrix('f') f.reshape((idxs.shape[0], featdim)) # forward parameters of the model self.fWx = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (de * cs, nh)).astype(theano.config.floatX)) self.fWh = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (nh, nh)).astype(theano.config.floatX)) self.fbh = theano.shared(np.zeros(nh, dtype=theano.config.floatX)) self.fh0 = theano.shared(np.zeros(nh, dtype=theano.config.floatX)) fparams = [self.fWx, self.fWh, self.fbh, self.fh0] fnames = ['fWx', 'fWh', 'fbh', 'fh0'] def frecurrence(x_t, h_tm1): h_t = T.nnet.sigmoid(T.dot(x_t, self.fWx) + T.dot(h_tm1, self.fWh) + self.fbh) return h_t fh, _ = theano.scan(fn=frecurrence, sequences=x, outputs_info=[self.fh0], n_steps=x.shape[0], truncate_gradient=truncate_gradient) # backwards parameters of the model self.bWx = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (de * cs, nh)).astype(theano.config.floatX)) self.bWh = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (nh, nh)).astype(theano.config.floatX)) self.bbh = theano.shared(np.zeros(nh, dtype=theano.config.floatX)) self.bh0 = theano.shared(np.zeros(nh, dtype=theano.config.floatX)) bparams = [self.bWx, self.bWh, self.bbh, self.bh0] bnames = ['bWx', 'bWh', 'bbh', 'bh0'] def brecurrence(x_t, h_tm1): h_t = T.nnet.sigmoid(T.dot(x_t, self.bWx) + T.dot(h_tm1, self.bWh) + self.bbh) return h_t bh, _ = theano.scan(fn=brecurrence, sequences=x, outputs_info=[self.bh0], n_steps=x.shape[0], go_backwards=True, truncate_gradient=truncate_gradient) # inverting backwards hidden bh = bh[::-1] # concatenation parameters self.bW = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (nh+featdim, nc)).astype(theano.config.floatX)) self.fW = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (nh+featdim, nc)).astype(theano.config.floatX)) self.b = theano.shared(np.zeros(nc, dtype=theano.config.floatX)) # adding features if featdim > 0: fh_final = T.concatenate([fh, f], axis=1) bh_final = T.concatenate([bh, f], axis=1) else: fh_final = fh bh_final = bh # "concatenating" forward and backward hidden states h = T.dot(bh_final, self.bW) + T.dot(fh_final, self.fW) s = T.nnet.softmax(h + self.b) p_y_given_x_lastword = s[-1, :] p_y_given_x_sentence = s self.params = fparams + bparams + [self.bW, self.fW, self.b] self.names = fnames + bnames + ['bW', 'fW', 'b'] if fine_tuning: self.params.append(self.emb) self.names.append("emb") # prediction y_pred = T.argmax(p_y_given_x_sentence, axis=1) # cost functions sentence_nll = 
-T.mean(T.log(p_y_given_x_sentence) [T.arange(x.shape[0]), y_sentence]) nll = -T.mean(T.log(p_y_given_x_lastword)[y]) # gradients sentence_gradients = T.grad(sentence_nll, self.params) gradients = T.grad(nll, self.params) # learning rate lr = T.scalar('lr') # updates sentence_updates = OrderedDict((p, p - lr * g) for p, g in zip(self.params, sentence_gradients)) updates = OrderedDict((p, p - lr * g) for p, g in zip(self.params, gradients)) # theano functions self.classify = theano.function(inputs=[idxs, f, In(w, value=1.0)], outputs=y_pred, on_unused_input='ignore') self.sentence_train = theano.function(inputs=[idxs, f, y_sentence, lr, In(w, value=1.0)], outputs=sentence_nll, updates=sentence_updates, on_unused_input='ignore') self.train = theano.function(inputs=[idxs, f, y, lr, In(w, value=1.0)], outputs=nll, updates=updates, on_unused_input='ignore') self.predict = theano.function(inputs=[idxs, f, In(w, value=1.0)], outputs=p_y_given_x_sentence, on_unused_input='ignore') self.normalize = theano.function(inputs=[], updates={self.emb:\ self.emb/T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0, 'x')})
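# --- Added illustration (toy sizes, not from the original model): how the
# emb[idxs].reshape((idxs.shape[0], de * cs)) lookup above turns one word-window
# of indices per token into a single flat feature vector. ---
import numpy as np

de, cs, vocab = 4, 3, 10
emb = np.random.rand(vocab, de)
idxs = np.array([[1, 2, 3], [2, 3, 4]])       # 2 tokens, context window of 3 ids
x = emb[idxs].reshape((idxs.shape[0], de * cs))
print(x.shape)                                 # (2, 12)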
def main(): parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF') parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings') parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words', required=True) parser.add_argument('--embedding_dict', default=None, help='path for embedding dict') parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch') parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM') parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN') parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate') parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping') parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization') parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM') parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True) parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd') parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True) parser.add_argument('--dropout', action='store_true', help='Apply dropout layers') parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping') parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files') parser.add_argument('--train') # "data/POS-penn/wsj/split1/wsj1.train.original" parser.add_argument('--dev') # "data/POS-penn/wsj/split1/wsj1.dev.original" parser.add_argument('--test') # "data/POS-penn/wsj/split1/wsj1.test.original" args = parser.parse_args() def construct_input_layer(): if fine_tune: layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input') layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size, output_size=embedd_dim, W=embedd_table, name='embedding') return layer_embedding else: layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var, name='input') return layer_input def construct_char_input_layer(): layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length), input_var=char_input_var, name='char-input') layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) return layer_char_input logger = utils.get_logger("BiLSTM-CNN-CRF") fine_tune = args.fine_tune oov = args.oov regular = args.regular embedding = args.embedding embedding_path = args.embedding_dict train_path = args.train dev_path = args.dev test_path = args.test update_algo = args.update grad_clipping = args.grad_clipping peepholes = args.peepholes num_filters = args.num_filters gamma = args.gamma output_predict = args.output_prediction dropout = args.dropout X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \ embedd_table, 
label_alphabet, \ C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path, test_path,word_column=0, label_column=1, oov=oov, fine_tune=fine_tune,embedding=embedding, embedding_path=embedding_path, use_character=True) num_labels = label_alphabet.size() - 1 logger.info("constructing network...") # create variables target_var = T.imatrix(name='targets') mask_var = T.matrix(name='masks', dtype=theano.config.floatX) if fine_tune: input_var = T.imatrix(name='inputs') num_data, max_length = X_train.shape alphabet_size, embedd_dim = embedd_table.shape else: input_var = T.tensor3(name='inputs', dtype=theano.config.floatX) num_data, max_length, embedd_dim = X_train.shape char_input_var = T.itensor3(name='char-inputs') num_data_char, max_sent_length, max_char_length = C_train.shape char_alphabet_size, char_embedd_dim = char_embedd_table.shape assert (max_length == max_sent_length) assert (num_data == num_data_char) # construct input and mask layers layer_incoming1 = construct_char_input_layer() layer_incoming2 = construct_input_layer() layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask') # construct bi-rnn-cnn num_units = args.num_units bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels, mask=layer_mask, grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters, dropout=dropout) logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters)) # compute loss num_tokens = mask_var.sum(dtype=theano.config.floatX) # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels] energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf) energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True) loss_train = crf_loss(energies_train, target_var, mask_var).mean() loss_eval = crf_loss(energies_eval, target_var, mask_var).mean() # l2 regularization? if regular == 'l2': l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf, lasagne.regularization.l2) loss_train = loss_train + gamma * l2_penalty _, corr_train = crf_accuracy(energies_train, target_var) corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX) prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var) corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX) debug_out = crf_nbest_debug(energies_eval,target_var) # crf_para = crf_parameter(energies_eval) # Create update expressions for training. # hyper parameters to tune: learning rate, momentum, regularization. batch_size = args.batch_size learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate decay_rate = args.decay_rate momentum = 0.9 params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True) updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum) # Compile a function performing a training step on a mini-batch train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens], updates=updates) # Compile a second function evaluating the loss and accuracy of network eval_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_eval, corr_eval, num_tokens, prediction_eval, energies_eval]) debug_fn = theano.function([input_var, target_var, mask_var, char_input_var], debug_out,on_unused_input='ignore') # Finally, launch the training loop. 
logger.info( "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \ % ( update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size, grad_clipping, peepholes)) num_batches = num_data / batch_size num_epochs = 3 best_loss = 1e+12 best_acc = 0.0 best_epoch_loss = 0 best_epoch_acc = 0 best_loss_test_err = 0. best_loss_test_corr = 0. best_acc_test_err = 0. best_acc_test_corr = 0. stop_count = 0 lr = learning_rate patience = args.patience for epoch in range(1, num_epochs + 1): print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate) train_err = 0.0 train_corr = 0.0 train_total = 0 train_inst = 0 start_time = time.time() num_back = 0 train_batches = 0 for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train, batch_size=batch_size, shuffle=True): inputs, targets, masks, char_inputs = batch err, corr, num = train_fn(inputs, targets, masks, char_inputs) train_err += err * inputs.shape[0] train_corr += corr train_total += num train_inst += inputs.shape[0] train_batches += 1 time_ave = (time.time() - start_time) / train_batches time_left = (num_batches - train_batches) * time_ave # update log sys.stdout.write("\b" * num_back) log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % ( min(train_batches * batch_size, num_data), num_data, train_err / train_inst, train_corr * 100 / train_total, time_left) sys.stdout.write(log_info) num_back = len(log_info) # update training log after each epoch assert train_inst == num_data sys.stdout.write("\b" * num_back) print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % ( min(train_batches * batch_size, num_data), num_data, train_err / num_data, train_corr * 100 / train_total, time.time() - start_time) # evaluate performance on dev data dev_err = 0.0 dev_corr = 0.0 dev_total = 0 dev_inst = 0 for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev, batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions, crf_para = eval_fn(inputs, targets, masks, char_inputs) dev_err += err * inputs.shape[0] dev_corr += corr dev_total += num dev_inst += inputs.shape[0] if output_predict: utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet, is_flattened=False) # debug_out = debug_fn(inputs, targets, masks, char_inputs) # print "debug out:", debug_out[1] crf_nbest.write_nbest(inputs, targets, masks, crf_para,label_alphabet,'tmp/dev_nbest%d' % epoch, 10, is_flattened=False) print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total) if best_loss < dev_err and best_acc > dev_corr / dev_total: stop_count += 1 else: update_loss = False update_acc = False stop_count = 0 if best_loss > dev_err: update_loss = True best_loss = dev_err best_epoch_loss = epoch if best_acc < dev_corr / dev_total: update_acc = True best_acc = dev_corr / dev_total best_epoch_acc = epoch # evaluate on test data when better performance detected test_err = 0.0 test_corr = 0.0 test_total = 0 test_inst = 0 for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test, batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions,crf_para = eval_fn(inputs, targets, masks, char_inputs) test_err += err * inputs.shape[0] test_corr += corr test_total += num 
test_inst += inputs.shape[0] if output_predict: utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet, is_flattened=False) print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total) if update_loss: best_loss_test_err = test_err best_loss_test_corr = test_corr if update_acc: best_acc_test_err = test_err best_acc_test_corr = test_corr # stop early if dev accuracy has not improved for `patience` evaluations in a row if stop_count == patience: break # re-compile a function with new learning rate for training if update_algo != 'adadelta': lr = learning_rate / (1.0 + epoch * decay_rate) updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum) train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens], updates=updates) # print best performance on test data. logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss) print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( best_loss_test_err / test_inst, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total) logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc) print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( best_acc_test_err / test_inst, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total) print "BiLSTM-CNN-CRF model finished!"
def __init__(self, ne, de, na, n_lstm, n_out, cs, npos, lr=0.05, single_output=True, output_activation=T.nnet.softmax, cost_function='nll'): ''' ne :: number of word embeddings in the vocabulary de :: dimension of the word embeddings na :: number of acoustic or language model features at each word step (acoustic context size in frames * number of features) n_lstm :: dimension of the lstm layer n_out :: number of classes cs :: word window context size npos :: number of pos tags ''' # add one to ne for PADDING self.emb = init_weight((ne + 1, de), 'emb') self.n_in = (de * cs) + (npos * cs) self.n_lstm = n_lstm self.n_out = n_out self.W_xi = init_weight((self.n_in, self.n_lstm), 'W_xi') self.W_hi = init_weight((self.n_lstm, self.n_lstm), 'W_hi', 'svd') self.W_ci = init_weight((self.n_lstm, self.n_lstm), 'W_ci', 'svd') # bias to the input: self.b_i = shared(np.cast[dtype](np.random.uniform(-0.5, .5, size=n_lstm))) # forget gate weights: self.W_xf = init_weight((self.n_in, self.n_lstm), 'W_xf') self.W_hf = init_weight((self.n_lstm, self.n_lstm), 'W_hf', 'svd') self.W_cf = init_weight((self.n_lstm, self.n_lstm), 'W_cf', 'svd') # bias self.b_f = shared(np.cast[dtype](np.random.uniform(0, 1., size=n_lstm))) # memory cell gate weights: self.W_xc = init_weight((self.n_in, self.n_lstm), 'W_xc') self.W_hc = init_weight((self.n_lstm, self.n_lstm), 'W_hc', 'svd') # bias to the memory cell: self.b_c = shared(np.zeros(n_lstm, dtype=dtype)) # output gate weights: self.W_xo = init_weight((self.n_in, self.n_lstm), 'W_xo') self.W_ho = init_weight((self.n_lstm, self.n_lstm), 'W_ho', 'svd') self.W_co = init_weight((self.n_lstm, self.n_lstm), 'W_co', 'svd') # bias on output gate: self.b_o = shared(np.cast[dtype](np.random.uniform(-0.5, .5, size=n_lstm))) # hidden to y matrix weights: self.W_hy = init_weight((self.n_lstm, self.n_out), 'W_hy') self.b_y = shared(np.zeros(n_out, dtype=dtype)) # output bias # Weights for L1 and L2 self.L1_reg = 0.0 self.L2_reg = 0.00001 self.params = [ self.W_xi, self.W_hi, self.W_ci, self.b_i, self.W_xf, self.W_hf, self.W_cf, self.b_f, self.W_xc, self.W_hc, self.b_c, self.W_ho, self.W_co, self.W_co, self.b_o, self.W_hy, self.b_y, self.emb ] self.names = [ "W_xi", "W_hi", "W_ci", "b_i", "W_xf", "W_hf", "W_cf", "b_f", "W_xc", "W_hc", "b_c", "W_ho", "W_co", "W_co", "b_o", "W_hy", "b_y", "embeddings" ] def step_lstm(x_t, h_tm1, c_tm1): i_t = T.nnet.sigmoid( T.dot(x_t, self.W_xi) + T.dot(h_tm1, self.W_hi) + T.dot(c_tm1, self.W_ci) + self.b_i) f_t = T.nnet.sigmoid( T.dot(x_t, self.W_xf) + T.dot(h_tm1, self.W_hf) + T.dot(c_tm1, self.W_cf) + self.b_f) c_t = f_t * c_tm1 + i_t * T.tanh( T.dot(x_t, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c) o_t = T.nnet.sigmoid( T.dot(x_t, self.W_xo) + T.dot(h_tm1, self.W_ho) + T.dot(c_t, self.W_co) + self.b_o) h_t = o_t * T.tanh(c_t) y_t = T.nnet.softmax(T.dot(h_t, self.W_hy) + self.b_y) return [h_t, c_t, y_t] # batch of sequence of vectors self.idxs = T.imatrix() self.pos_idxs = T.imatrix() # The eye function (diagonal 1s) for the POS, small in memory self.pos = T.eye(npos, npos, 0) # TODO No pos # x = self.emb[self.idxs].reshape((self.idxs.shape[0], de*cs)) # POS version x = T.concatenate((self.emb[self.idxs].reshape( (self.idxs.shape[0], de * cs)), self.pos[self.pos_idxs].reshape( (self.pos_idxs.shape[0], npos * cs))), 1) self.y = T.iscalar('y') # initial hidden state self.h0 = shared(np.zeros(shape=self.n_lstm, dtype=dtype)) self.c0 = shared(np.zeros(shape=self.n_lstm, dtype=dtype)) self.lr = T.scalar('lr') [h_vals, c_vals, y_vals], _ = 
theano.scan(fn=step_lstm, sequences=x, outputs_info=[self.h0, self.c0, None], n_steps=x.shape[0]) self.output = y_vals p_y_given_x_lastword = self.output[-1, 0, :] p_y_given_x_sentence = self.output[:, 0, :] p_y_given_x_sentence_hidden = (h_vals, c_vals, self.output[:, 0, :]) y_pred = T.argmax(p_y_given_x_sentence, axis=1) # y_pred_word = T.argmax(p_y_given_x_lastword) self.cxe = T.mean(T.nnet.binary_crossentropy(self.output, self.y)) self.nll = -T.mean(T.log(p_y_given_x_lastword)[self.y]) self.mse = T.mean((self.output - self.y)**2) self.sentence_nll = -T.mean( T.log(p_y_given_x_sentence)[T.arange(x.shape[0]), self.y]) self.L2_sqr = sum([(p**2).sum() for p in self.params]) self.cost = self.nll + self.L2_reg * self.L2_sqr if cost_function == 'mse': self.cost = self.mse + self.L2_reg * self.L2_sqr elif cost_function == 'cxe': self.cost = self.cxe + self.L2_reg * self.L2_sqr self.debug = theano.function( inputs=[x, self.y], outputs=[x.shape, self.y.shape, y_vals.shape, self.cost.shape]) gradients = T.grad(self.cost, self.params) self.updates = OrderedDict( (p, p - self.lr * g) for p, g in zip(self.params, gradients)) self.loss = theano.function(inputs=[x, self.y], outputs=self.cost) # if na == 0: #assume no acoustic features for now # simply outputs the soft_max distribution for each word in utterance self.soft_max = theano.function(inputs=[self.idxs, self.pos_idxs], outputs=p_y_given_x_sentence) self.soft_max_return_hidden_layer = theano.function( inputs=[self.idxs, self.pos_idxs], outputs=p_y_given_x_sentence_hidden) if na == 0: self.train = theano.function( inputs=[self.idxs, self.pos_idxs, self.y, self.lr], outputs=self.cost, updates=self.updates) self.classify = theano.function(inputs=[self.idxs, self.pos_idxs], outputs=y_pred) else: self.train = theano.function(inputs=[ self.idxs, self.pos_idxs, self.acoustic, self.y, self.lr ], outputs=self.cost, updates=self.updates) self.classify = theano.function( inputs=[self.idxs, self.pos_idxs, self.acoustic], outputs=y_pred) self.normalize = theano.function( inputs=[], updates={ self.emb: self.emb / T.sqrt( (self.emb**2).sum(axis=1)).dimshuffle(0, 'x') })
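# A rough NumPy rendering of the single step computed by step_lstm above
# (a peephole LSTM with a softmax readout), intended only to make the gate
# equations concrete. The weights here are random placeholders, not the
# init_weight initialization used by the class.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def lstm_step_np(x_t, h_tm1, c_tm1, p):
    i_t = sigmoid(x_t.dot(p['W_xi']) + h_tm1.dot(p['W_hi']) + c_tm1.dot(p['W_ci']) + p['b_i'])
    f_t = sigmoid(x_t.dot(p['W_xf']) + h_tm1.dot(p['W_hf']) + c_tm1.dot(p['W_cf']) + p['b_f'])
    c_t = f_t * c_tm1 + i_t * np.tanh(x_t.dot(p['W_xc']) + h_tm1.dot(p['W_hc']) + p['b_c'])
    o_t = sigmoid(x_t.dot(p['W_xo']) + h_tm1.dot(p['W_ho']) + c_t.dot(p['W_co']) + p['b_o'])
    h_t = o_t * np.tanh(c_t)
    y_t = softmax(h_t.dot(p['W_hy']) + p['b_y'])
    return h_t, c_t, y_t

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    n_in, n_lstm, n_out = 8, 4, 3
    shapes = [('W_xi', (n_in, n_lstm)), ('W_hi', (n_lstm, n_lstm)), ('W_ci', (n_lstm, n_lstm)), ('b_i', (n_lstm,)),
              ('W_xf', (n_in, n_lstm)), ('W_hf', (n_lstm, n_lstm)), ('W_cf', (n_lstm, n_lstm)), ('b_f', (n_lstm,)),
              ('W_xc', (n_in, n_lstm)), ('W_hc', (n_lstm, n_lstm)), ('b_c', (n_lstm,)),
              ('W_xo', (n_in, n_lstm)), ('W_ho', (n_lstm, n_lstm)), ('W_co', (n_lstm, n_lstm)), ('b_o', (n_lstm,)),
              ('W_hy', (n_lstm, n_out)), ('b_y', (n_out,))]
    p = {name: rng.uniform(-0.1, 0.1, shape) for name, shape in shapes}
    h, c = np.zeros(n_lstm), np.zeros(n_lstm)
    for t in range(5):
        h, c, y = lstm_step_np(rng.uniform(-1, 1, n_in), h, c, p)
    print(y)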
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=20, filter_size=[3, 3], maxSentLen=40, hidden_size=[300, 300], max_term_len=4): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1, all_word2, all_word1_mask, all_word2_mask, all_labels, word2id = load_wordnet_hyper_vs_all_with_words( maxlen=maxSentLen, wordlen=max_term_len ) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1, test_word2, test_word1_mask, test_word2_mask, test_labels, word2id = load_EVAlution_hyper_vs_all_with_words( maxSentLen, word2id, wordlen=max_term_len) total_size = len(all_sentences_l) hold_test_size = 10000 train_size = total_size - hold_test_size train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32') # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32') test_sents_l = np.asarray(test_sents_l, dtype='int32') train_masks_l = np.asarray(all_masks_l[:train_size], dtype=theano.config.floatX) # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX) test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32') # dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32') test_sents_r = np.asarray(test_sents_r, dtype='int32') train_masks_r = np.asarray(all_masks_r[:train_size], dtype=theano.config.floatX) # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX) test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX) train_word1 = np.asarray(all_word1[:train_size], dtype='int32') train_word2 = np.asarray(all_word2[:train_size], dtype='int32') test_word1 = np.asarray(test_word1, dtype='int32') test_word2 = np.asarray(test_word2, dtype='int32') train_word1_mask = np.asarray(all_word1_mask[:train_size], dtype=theano.config.floatX) train_word2_mask = np.asarray(all_word2_mask[:train_size], dtype=theano.config.floatX) test_word1_mask = np.asarray(test_word1_mask, dtype=theano.config.floatX) test_word2_mask = np.asarray(test_word2_mask, dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[:train_size], dtype='int32') # dev_labels_store=np.asarray(all_labels[1], dtype='int32') # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32') test_labels_store = np.asarray(test_labels, dtype='int32') # train_size=len(train_labels_store) # dev_size=len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' test size: ', test_size vocab_size = len(word2id) + 1 rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = 
load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() word1_ids = T.imatrix() word2_ids = T.imatrix() word1_mask = T.fmatrix() word2_mask = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) embed_input_l = embed_input( init_embeddings, sents_ids_l ) #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = embed_input( init_embeddings, sents_ids_r ) #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_word1 = init_embeddings[word1_ids.flatten()].reshape( (batch_size, max_term_len, emb_size)) embed_word2 = init_embeddings[word2_ids.flatten()].reshape( (batch_size, max_term_len, emb_size)) word1_embedding = T.sum(embed_word1 * word1_mask.dimshuffle(0, 1, 'x'), axis=1) word2_embedding = T.sum(embed_word2 * word2_mask.dimshuffle(0, 1, 'x'), axis=1) '''create_AttentiveConv_params ''' conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[1], 1, emb_size, 1)) NN_para = [conv_W, conv_b, conv_W_context] ''' attentive convolution function ''' attentive_conv_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r "form input to LR classifier" LR_input = T.concatenate([ attentive_sent_embeddings_l, attentive_sent_embeddings_r, attentive_sent_embeddings_l * attentive_sent_embeddings_r, attentive_sent_embeddings_l - attentive_sent_embeddings_r, word1_embedding, word2_embedding, word1_embedding * word2_embedding ], axis=1) LR_input_size = 4 * hidden_size[1] + 3 * emb_size U_a = create_ensemble_para( rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
# L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() params = NN_para + LR_para #[init_embeddings] cost = loss #+L2_weight*L2_reg updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function([ sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids, word2_ids, word1_mask, word2_mask, labels ], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([ sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids, word2_ids, word1_mask, word2_mask, labels ], [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size n_test_remain = test_size % batch_size if n_test_remain != 0: test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] else: test_batch_start = list(np.arange(n_test_batches) * batch_size) # max_acc_dev=0.0 max_ap_test = 0.0 max_ap_topk_test = 0.0 max_f1 = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model( train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_word1[train_id_batch], train_word2[train_id_batch], train_word1_mask[train_id_batch], train_word2_mask[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() pred_labels = [] probs = [] gold_labels = [] error_sum = 0.0 for idd, test_batch_id in enumerate( test_batch_start): # for each test batch error_i, pred_i, prob_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], test_word1[test_batch_id:test_batch_id + batch_size], test_word2[test_batch_id:test_batch_id + batch_size], test_word1_mask[test_batch_id:test_batch_id + batch_size], test_word2_mask[test_batch_id:test_batch_id + batch_size], test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i pred_labels += list(pred_i) probs += list(prob_i) if n_test_remain != 0: probs = probs[:(len(test_batch_start) - 1) * batch_size] + probs[-n_test_remain:] assert len(test_labels) == len(probs) # test_acc=1.0-error_sum/(len(test_batch_start)) test_ap = 
apk(test_labels, probs, k=len(test_labels)) test_ap_top100 = apk(test_labels, probs, k=100) if test_ap > max_ap_test: max_ap_test = test_ap if test_ap_top100 > max_ap_topk_test: max_ap_topk_test = test_ap_top100 print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
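# The ranking metric apk used above is not defined in this snippet; a common
# reading is average precision over the test pairs ranked by the predicted
# probability of the positive class (prop_for_posi). The helper below is one
# such sketch under that assumption, not necessarily the exact implementation
# referenced above.
import numpy as np

def average_precision(labels, scores, k=None):
    """Average precision of binary `labels` when sorted by decreasing `scores`."""
    order = np.argsort(scores)[::-1]
    if k is not None:
        order = order[:k]
    hits, precisions = 0, []
    for rank, idx in enumerate(order, start=1):
        if labels[idx] == 1:
            hits += 1
            precisions.append(float(hits) / rank)
    return np.mean(precisions) if precisions else 0.0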
def _InitializeModelThatPredictsCharsMultiSoftmax(self,learning_rate, num_softmaxes=5): image_input = T.tensor4('image_input') print ("num_of_softmax: " + str(num_softmaxes)) #prediction_layer = self._BuildModelToPredictFirstChar(image_input) prediction_layer = self._BuildModelToPredictCharsMultiSoftmax( image_input, num_softmaxes=num_softmaxes) target_chars_input = T.imatrix('target_chars_input') target_chars = target_chars_input[:, :num_softmaxes].reshape(shape=(-1,)) # Create a loss expression for training, Using cross-entropy loss. prediction = lasagne.layers.get_output(prediction_layer) l_loss = lasagne.objectives.categorical_crossentropy(prediction, target_chars) loss = l_loss.mean() # Create update expressions for training, i.e., how to modify the # parameters at each training step. Here, we'll use Stochastic Gradient # Descent (SGD) with Nesterov momentum. params = lasagne.layers.get_all_params(prediction_layer, trainable=True) updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate, momentum=0.9) #updates = lasagne.updates.adagrad(loss, params, learning_rate=0.0001) # Create a loss expression for validation/testing. The crucial difference # here is that we do a deterministic forward pass through the network, # disabling dropout layers. test_prediction = lasagne.layers.get_output(prediction_layer, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_chars) test_loss = test_loss.mean() predicted_chars = T.argmax(test_prediction, axis=1) correctly_predicted_chars = T.eq(predicted_chars, target_chars) # An expression for the classification accuracy: test_acc = T.mean(correctly_predicted_chars, dtype=theano.config.floatX) predicted_chars = predicted_chars.reshape(shape=(-1, num_softmaxes)) correctly_predicted_chars = correctly_predicted_chars.reshape(shape=(-1, num_softmaxes)) num_chars_matched = T.sum(correctly_predicted_chars, axis=1, dtype=theano.config.floatX) seq_test_acc = T.mean(T.eq(num_chars_matched, T.fill(num_chars_matched, num_softmaxes)), dtype=theano.config.floatX) test_prediction = test_prediction.reshape(shape=(-1, num_softmaxes, len(self.CHARS))) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function( [image_input, target_chars_input], loss, updates=updates, allow_input_downcast=True) # Compile a second function computing the prediction, validation loss and accuracy: test_fn = theano.function([image_input, target_chars_input], [test_loss, test_acc, seq_test_acc], allow_input_downcast=True) # Compile a third function computing the prediction. inference_fn = theano.function([image_input], [predicted_chars, test_prediction], allow_input_downcast=True) return prediction_layer, train_fn, test_fn, inference_fn
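# To make the per-character vs. whole-sequence accuracy bookkeeping above
# concrete, here is the same reshape-and-compare logic in plain NumPy with
# made-up predictions (num_softmaxes=5, batch of 3 sequences); the arrays are
# purely illustrative.
import numpy as np

num_softmaxes = 5
predicted = np.array([[1, 2, 3, 4, 5], [1, 0, 3, 4, 5], [9, 9, 9, 9, 9]])
targets   = np.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [9, 9, 9, 9, 0]])

correct = (predicted.reshape(-1) == targets.reshape(-1)).astype(float)
char_acc = correct.mean()                               # per-character accuracy
per_seq = correct.reshape(-1, num_softmaxes).sum(axis=1)
seq_acc = (per_seq == num_softmaxes).mean()             # whole-sequence accuracy
print('char acc %.2f, seq acc %.2f' % (char_acc, seq_acc))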
def _get_input_tensor_variables(): # x_w: 1D: batch, 2D: n_words, 3D: 5 + window; word id # x_p: 1D: batch, 2D: n_words; position id # y: 1D: batch, 2D: n_words; label id return T.itensor3('x_w'), T.imatrix('x_p'), T.imatrix('y')
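# Illustrative arrays matching the symbolic inputs above: x_w is int32 of
# shape (batch, n_words, 5 + window), x_p and y are int32 of shape
# (batch, n_words). The batch size, sentence length, window and id ranges
# below are placeholders, not values taken from the surrounding code.
import numpy as np

batch, n_words, window = 2, 7, 5
x_w = np.random.randint(0, 1000, size=(batch, n_words, 5 + window)).astype('int32')
x_p = np.random.randint(0, n_words, size=(batch, n_words)).astype('int32')
y = np.random.randint(0, 40, size=(batch, n_words)).astype('int32')
print(x_w.shape, x_p.shape, y.shape)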
def _InitializeModelThatPredictsAllChars( self, learning_rate, bidirectional_rnn=False, use_mask_input=False, lstm_layer_units=256, cnn_dense_layer_sizes = 256, lstm_grad_clipping=False): image_input = T.tensor4('image_input') num_rnn_steps = self.num_rnn_steps target_chars_input = T.imatrix('target_chars') target_chars = target_chars_input[:, :num_rnn_steps] target_chars = target_chars.reshape(shape=(-1,)) mask_input_input = None mask_input = None if use_mask_input: mask_input_input = T.imatrix('mask_input') mask_input = mask_input_input[:, :num_rnn_steps] #mask_input = mask_input.reshape(shape=(-1,)) prediction_layer, l_cnn, l_lstm = self._BuildModelToPredictAllChars( image_input, num_rnn_steps=num_rnn_steps, mask_input=mask_input, bidirectional_rnn=bidirectional_rnn, lstm_layer_units=lstm_layer_units, cnn_dense_layer_sizes= cnn_dense_layer_sizes, lstm_grad_clipping=lstm_grad_clipping) #lstm_grad_clipping=False) # Create a loss expression for training, Using cross-entropy loss. #prediction = lasagne.layers.get_output(prediction_layer) prediction, l_cnn, l_lstm = tuple( lasagne.layers.get_output([prediction_layer, l_cnn, l_lstm])) l_loss = lasagne.objectives.categorical_crossentropy(prediction, target_chars) print ("prediction",prediction.shape,"target_char",target_chars.shape,"$$$$$$$$") if use_mask_input: l_loss = l_loss.reshape(shape=(-1, num_rnn_steps)) l_loss *= mask_input loss = l_loss.sum() / mask_input.sum() else: loss = l_loss.mean() # Create update expressions for training, i.e., how to modify the # parameters at each training step. Here, we'll use Stochastic Gradient # Descent (SGD) with Nesterov momentum. params = lasagne.layers.get_all_params(prediction_layer, trainable=True) #grads = theano.grad(loss, params) if lstm_grad_clipping: print('doing grad clipping') max_grad_norm = 15.0 grads = theano.grad(loss, params) grads = [grad.clip(-5., 5.) for grad in grads] #grads, norm = lasagne.updates.total_norm_constraint( # grads, max_grad_norm, return_norm=True) grads = [lasagne.updates.norm_constraint( grad, max_grad_norm, range(grad.ndim)) for grad in grads] updates = lasagne.updates.adam(grads, params, learning_rate=learning_rate) else: updates = lasagne.updates.nesterov_momentum( loss, params, learning_rate, momentum=0.9) #updates = lasagne.updates.adagrad(loss, params, learning_rate=0.001) # Create a loss expression for validation/testing. The crucial difference # here is that we do a deterministic forward pass through the network, # disabling dropout layers. 
test_prediction = lasagne.layers.get_output(prediction_layer, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_chars) test_loss = test_loss.mean() predicted_chars = T.argmax(test_prediction, axis=1) correctly_predicted_chars = T.eq(predicted_chars, target_chars) # An expression for the classification accuracy: test_acc = T.mean(correctly_predicted_chars, dtype=theano.config.floatX) predicted_chars = predicted_chars.reshape(shape=(-1, num_rnn_steps)) correctly_predicted_chars = correctly_predicted_chars.reshape(shape=(-1, num_rnn_steps)) num_chars_matched = T.sum(correctly_predicted_chars, axis=1, dtype=theano.config.floatX) seq_test_acc = T.mean(T.eq(num_chars_matched, T.fill(num_chars_matched, num_rnn_steps)), dtype=theano.config.floatX) test_prediction = test_prediction.reshape(shape=(-1, num_rnn_steps, len(self.CHARS))) mask_input_vec = [mask_input_input] if use_mask_input else [] # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: print ("target chars",image_input) train_fn = theano.function( [image_input, target_chars_input] + mask_input_vec, loss, updates=updates, allow_input_downcast=True) # Compile a second function computing the prediction, validation loss and accuracy: test_fn = theano.function([image_input, target_chars_input] + mask_input_vec, [test_loss, test_acc, seq_test_acc,predicted_chars,target_chars,correctly_predicted_chars], allow_input_downcast=True) # Compile a third function computing the prediction. inference_fn = theano.function([image_input] + mask_input_vec, [predicted_chars, test_prediction], allow_input_downcast=True) return prediction_layer, train_fn, test_fn, inference_fn
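# When use_mask_input is set, the loss above averages per-step cross-entropy
# only over real (unpadded) steps: loss = (l * mask).sum() / mask.sum().
# A tiny NumPy illustration of that masked average with fabricated numbers:
import numpy as np

per_step_loss = np.array([[0.2, 0.4, 0.1, 0.0],
                          [0.3, 0.5, 0.0, 0.0]])   # (batch, num_rnn_steps)
mask = np.array([[1, 1, 1, 0],
                 [1, 1, 0, 0]], dtype=float)       # 1 = real step, 0 = padding
masked_mean = (per_step_loss * mask).sum() / mask.sum()
plain_mean = per_step_loss.mean()
print('masked %.3f vs unmasked %.3f' % (masked_mean, plain_mean))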
def main(): configure_theano() options = parse_options() config_file = options['config'] config = ConfigParser.ConfigParser() config.read(config_file) print('CLI options: {}'.format(options.items())) print('Reading Config File: {}...'.format(config_file)) print(config.items('stream1')) print(config.items('lstm_classifier')) print(config.items('training')) print('preprocessing dataset...') data = load_mat_file(config.get('stream1', 'data')) has_encoder = config.getboolean('stream1', 'has_encoder') stream1_dim = config.getint('stream1', 'input_dimensions') imagesize = tuple([int(d) for d in config.get('stream1', 'imagesize').split(',')]) if has_encoder: stream1 = config.get('stream1', 'model') stream1_shape = config.get('stream1', 'shape') stream1_nonlinearities = config.get('stream1', 'nonlinearities') # lstm classifier output_classes = config.getint('lstm_classifier', 'output_classes') output_classnames = config.get('lstm_classifier', 'output_classnames').split(',') lstm_size = config.getint('lstm_classifier', 'lstm_size') matlab_target_offset = config.getboolean('lstm_classifier', 'matlab_target_offset') # lstm classifier configurations weight_init = options['weight_init'] if 'weight_init' in options else config.get('lstm_classifier', 'weight_init') use_peepholes = options['use_peepholes'] if 'use_peepholes' in options else config.getboolean('lstm_classifier', 'use_peepholes') use_blstm = True if config.has_option('lstm_classifier', 'use_blstm') else False windowsize = config.getint('lstm_classifier', 'windowsize') # capture training parameters validation_window = int(options['validation_window']) \ if 'validation_window' in options else config.getint('training', 'validation_window') num_epoch = int(options['num_epoch']) if 'num_epoch' in options else config.getint('training', 'num_epoch') learning_rate = options['learning_rate'] if 'learning_rate' in options \ else config.getfloat('training', 'learning_rate') epochsize = config.getint('training', 'epochsize') batchsize = config.getint('training', 'batchsize') weight_init_fn = las.init.GlorotUniform() if weight_init == 'glorot': weight_init_fn = las.init.GlorotUniform() if weight_init == 'norm': weight_init_fn = las.init.Normal(0.1) if weight_init == 'uniform': weight_init_fn = las.init.Uniform() if weight_init == 'ortho': weight_init_fn = las.init.Orthogonal() data_matrix = data['dataMatrix'].astype('float32') targets_vec = data['targetsVec'].reshape((-1,)) subjects_vec = data['subjectsVec'].reshape((-1,)) vidlen_vec = data['videoLengthVec'].reshape((-1,)) iter_vec = data['iterVec'].reshape((-1,)) data_matrix = presplit_dataprocessing(data_matrix, vidlen_vec, config, 'stream1', imagesize=imagesize) indexes = create_split_index(len(data_matrix), vidlen_vec, iter_vec) train_vidlen_vec, test_vidlen_vec = split_videolen(vidlen_vec, iter_vec) if matlab_target_offset: targets_vec -= 1 # split the data train_data = data_matrix[indexes == True] train_targets = targets_vec[indexes == True] train_targets = train_targets.reshape((len(train_targets),)) test_data = data_matrix[indexes == False] test_targets = targets_vec[indexes == False] test_targets = test_targets.reshape((len(test_targets),)) train_data, test_data = postsplit_datapreprocessing(train_data, test_data, config, 'stream1') inputs = T.tensor3('inputs', dtype='float32') window = T.iscalar('theta') mask = T.matrix('mask', dtype='uint8') targets = T.imatrix('targets') print('constructing end to end model...') if not has_encoder: network = deltanet_v1.create_model((None, None, stream1_dim), 
inputs, (None, None), mask, window, lstm_size, output_classes, weight_init_fn, use_peepholes, use_blstm) else: ae1 = load_decoder(stream1, stream1_shape, stream1_nonlinearities) network = deltanet_majority_vote.create_model(ae1, (None, None, stream1_dim), inputs, (None, None), mask, lstm_size, window, output_classes, weight_init_fn, use_peepholes) print_network(network) draw_to_file(las.layers.get_all_layers(network), 'network.png', verbose=True) # exit() print('compiling model...') predictions = las.layers.get_output(network, deterministic=False) all_params = las.layers.get_all_params(network, trainable=True) cost = temporal_softmax_loss(predictions, targets, mask) updates = las.updates.adam(cost, all_params, learning_rate) train = theano.function( [inputs, targets, mask, window], cost, updates=updates, allow_input_downcast=True) compute_train_cost = theano.function([inputs, targets, mask, window], cost, allow_input_downcast=True) test_predictions = las.layers.get_output(network, deterministic=True) test_cost = temporal_softmax_loss(test_predictions, targets, mask) compute_test_cost = theano.function( [inputs, targets, mask, window], test_cost, allow_input_downcast=True) val_fn = theano.function([inputs, mask, window], test_predictions, allow_input_downcast=True) # We'll train the network with 10 epochs of 30 minibatches each print('begin training...') cost_train = [] cost_val = [] class_rate = [] STRIP_SIZE = 3 val_window = circular_list(validation_window) train_strip = np.zeros((STRIP_SIZE,)) best_val = float('inf') best_conf = None best_cr = 0.0 datagen = gen_lstm_batch_random(train_data, train_targets, train_vidlen_vec, batchsize=batchsize) val_datagen = gen_lstm_batch_random(test_data, test_targets, test_vidlen_vec, batchsize=len(test_vidlen_vec)) # We'll use this "validation set" to periodically check progress X_val, y_val, mask_val, idxs_val = next(val_datagen) # reshape the targets for validation y_val_evaluate = y_val y_val = y_val.reshape((-1, 1)).repeat(mask_val.shape[-1], axis=-1) for epoch in range(num_epoch): time_start = time.time() for i in range(epochsize): X, y, m, batch_idxs = next(datagen) # repeat targets based on max sequence len y = y.reshape((-1, 1)) y = y.repeat(m.shape[-1], axis=-1) print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format( epoch + 1, i + 1, epochsize, len(X), learning_rate) print(print_str, end='') sys.stdout.flush() train(X, y, m, windowsize) print('\r', end='') cost = compute_train_cost(X, y, m, windowsize) val_cost = compute_test_cost(X_val, y_val, mask_val, windowsize) cost_train.append(cost) cost_val.append(val_cost) train_strip[epoch % STRIP_SIZE] = cost val_window.push(val_cost) gl = 100 * (cost_val[-1] / np.min(cost_val) - 1) pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1) pq = gl / pk cr, val_conf = evaluate_model2(X_val, y_val_evaluate, mask_val, windowsize, val_fn) class_rate.append(cr) print("Epoch {} train cost = {}, validation cost = {}, " "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)" .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, time.time() - time_start)) if val_cost < best_val: best_val = val_cost best_conf = val_conf best_cr = cr if epoch >= validation_window and early_stop2(val_window, best_val, validation_window): break print('Best Model') print('classification rate: {}, validation loss: {}'.format(best_cr, best_val)) print('confusion matrix: ') plot_confusion_matrix(best_conf, output_classnames, fmt='latex') 
plot_validation_cost(cost_train, cost_val, class_rate)
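# The training loop above tracks early-stopping statistics that resemble
# Prechelt's criteria: generalization loss GL = 100 * (val_cost / min(val_cost) - 1),
# training progress Pk over a strip of k epochs, and their quotient PQ = GL / Pk.
# A small standalone sketch with fabricated cost histories:
import numpy as np

def generalization_loss(cost_val):
    return 100.0 * (cost_val[-1] / np.min(cost_val) - 1.0)

def training_progress(train_strip):
    k = len(train_strip)
    return 1000.0 * (np.sum(train_strip) / (k * np.min(train_strip)) - 1.0)

cost_val = [1.00, 0.80, 0.75, 0.78]          # validation cost per epoch
train_strip = np.array([0.60, 0.55, 0.52])   # last k training costs
gl = generalization_loss(cost_val)
pk = training_progress(train_strip)
print('GL %.3f, Pk %.3f, PQ %.3f' % (gl, pk, gl / pk))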
def train_lbl(train_data, dev_data, test_data=[], K=20, context_sz=2, learning_rate=1.0, rate_update='simple', epochs=10, batch_size=1, rng=None, patience=None, patience_incr=2, improvement_thrs=0.995, validation_freq=1000): """ Train log-bilinear model """ # create vocabulary from train data, plus <s>, </s> logger.info("Creating vocabulary dictionary...") vocab = Dictionary.from_corpus(train_data, unk='<unk>') logger.info("Creating tag dictionary...") vocab_tags = Dictionary.from_corpus_tags(train_data, unk='<unk>') vocab.add_word('<s>') vocab.add_word('</s>') V = vocab.size() vocab_tags.add_word('<s>') vocab_tags.add_word('</s>') V_tag = vocab_tags.size() #print train_data # initialize random generator if not provided rng = np.random.RandomState() if not rng else rng logger.info("Making instances...") # generate (context, target) pairs of word ids train_set_x, train_set_y, train_set_tags = make_instances(train_data, vocab, vocab_tags, context_sz) dev_set_x, dev_set_y, dev_set_tags = make_instances(dev_data, vocab, vocab_tags, context_sz) test_set_x, test_set_y, test_set_tags = make_instances(test_data, vocab, vocab_tags, context_sz) # make feature_matrix # very sparse matrix...better way to do it? feature_matrix = np.zeros((vocab_tags.size(),vocab_tags.num_sub_tags)) feature_matrix[(0,0)] = 1 # unk encoding for tag,tag_id in vocab_tags: if tag == "<s>": feature_matrix[(tag_id,1)] = 1 elif tag == "</s>": feature_matrix[(tag_id,2)] = 1 else: for sub_tag in vocab_tags.map_tag_to_sub_tags[tag]: val = vocab_tags.map_sub_to_ids[sub_tag] feature_matrix[(tag_id,val)] = 1 feature_matrix[1,:] = np.zeros((vocab_tags.num_sub_tags)) # number of minibatches for training n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_dev_batches = dev_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size # build the model logger.info("Build the model ...") index = T.lscalar() x = T.imatrix('x') y = T.ivector('y') t = T.ivector('t') # the tag vector # create log-bilinear model lbl = LogBilinearLanguageModel(x, V, K, vocab_tags.num_sub_tags, feature_matrix, context_sz, rng) # cost function is negative log likelihood of the training data cost = lbl.negative_log_likelihood(y,t) # compute the gradient gparams = [] for param in lbl.params: gparam = T.grad(cost, param) gparams.append(gparam) # specify how to update the parameter of the model updates = [] for param_i,(param, gparam) in enumerate(zip(lbl.params, gparams)): updates.append((param, param-learning_rate*gparam)) # function that computes log-probability of the dev set logprob_dev = theano.function(inputs=[index], outputs=cost, givens={x: dev_set_x[index*batch_size: (index+1)*batch_size], y: dev_set_y[index*batch_size: (index+1)*batch_size], t: dev_set_tags[index*batch_size:(index+1)*batch_size] }) # function that computes log-probability of the test set logprob_test = theano.function(inputs=[index], outputs=cost, givens={x: test_set_x[index*batch_size: (index+1)*batch_size], y: test_set_y[index*batch_size: (index+1)*batch_size], t: test_set_tags[index*batch_size:(index+1)*batch_size] }) # function that returns the cost and updates the parameter train_model = theano.function(inputs=[index], outputs=cost, updates=updates, givens={x: train_set_x[index*batch_size: (index+1)*batch_size], y: train_set_y[index*batch_size: (index+1)*batch_size], t: train_set_tags[index*batch_size:(index+1)*batch_size] }) # perplexity functions def compute_dev_logp(): return 
np.mean([logprob_dev(i) for i in xrange(n_dev_batches)]) def compute_test_logp(): return np.mean([logprob_test(i) for i in xrange(n_test_batches)]) def ppl(neg_logp): return np.power(2.0, neg_logp) # train model logger.info("training model...") best_params = None last_epoch_dev_ppl = np.inf best_dev_ppl = np.inf test_ppl = np.inf test_core = 0 start_time = time.clock() done_looping = False for epoch in xrange(epochs): if done_looping: break logger.info('epoch %i' % epoch) for minibatch_index in xrange(n_train_batches): itr = epoch * n_train_batches + minibatch_index train_logp = train_model(minibatch_index) logger.info('epoch %i, minibatch %i/%i, train minibatch log prob %.4f ppl %.4f' % (epoch, minibatch_index+1, n_train_batches, train_logp, ppl(train_logp))) if (itr+1) % validation_freq == 0: # compute perplexity on dev set, lower is better dev_logp = compute_dev_logp() dev_ppl = ppl(dev_logp) logger.debug('epoch %i, minibatch %i/%i, dev log prob %.4f ppl %.4f' % (epoch, minibatch_index+1, n_train_batches, dev_logp, ppl(dev_logp))) # if we got the lowest perplexity until now if dev_ppl < best_dev_ppl: # improve patience if loss improvement is good enough if patience and dev_ppl < best_dev_ppl * improvement_thrs: patience = max(patience, itr * patience_incr) best_dev_ppl = dev_ppl test_logp = compute_test_logp() test_ppl = ppl(test_logp) logger.debug('epoch %i, minibatch %i/%i, test log prob %.4f ppl %.4f' % (epoch, minibatch_index+1, n_train_batches, test_logp, ppl(test_logp))) # stop learning if no improvement was seen for a long time if patience and patience <= itr: done_looping = True break # adapt learning rate if rate_update == 'simple': # set learning rate to 1 / (epoch+1) learning_rate = 1.0 / (epoch+1) elif rate_update == 'adaptive': # half learning rate if perplexity increased at end of epoch (Mnih and Teh 2012) this_epoch_dev_ppl = ppl(compute_dev_logp()) if this_epoch_dev_ppl > last_epoch_dev_ppl: learning_rate /= 2.0 last_epoch_dev_ppl = this_epoch_dev_ppl elif rate_update == 'constant': # keep learning rate constant pass else: raise ValueError("Unknown learning rate update strategy: %s" %rate_update) end_time = time.clock() total_time = end_time - start_time logger.info('Optimization complete with best dev ppl of %.4f and test ppl %.4f' % (best_dev_ppl, test_ppl)) logger.info('Training took %d epochs, with %.1f epochs/sec' % (epoch+1, float(epoch+1) / total_time)) logger.info("Total training time %d days %d hours %d min %d sec." % (total_time/60/60/24, total_time/60/60%24, total_time/60%60, total_time%60)) # return model return lbl
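# Perplexity above is computed as ppl = 2 ** mean_negative_log_prob, i.e. it
# treats the averaged negative log-probability as a base-2 quantity. A tiny
# illustration of that conversion with fabricated per-batch values:
import numpy as np

def perplexity(neg_logps):
    """Perplexity from per-batch mean negative log2-probabilities."""
    return np.power(2.0, np.mean(neg_logps))

print('ppl = %.2f' % perplexity([7.1, 6.8, 7.4]))   # roughly 2 ** 7.1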
sys.exit("'Hidden layer size' argument missing!") if len(sys.argv) > 3: learning_rate = float(sys.argv[3]) else: sys.exit("'Learning rate' argument missing!") model_file_name = "Model_%s_h%d_lr%s.pcl" % (model_name, num_hidden, learning_rate) print num_hidden, learning_rate, model_file_name word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE) punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY) x = T.imatrix('x') y = T.imatrix('y') lr = T.scalar('lr') if os.path.isfile(model_file_name): print "Loading previous model state" net, state = models.load(model_file_name, MINIBATCH_SIZE, x) gsums, learning_rate, validation_ppl_history, starting_epoch, rng = state best_ppl = min(validation_ppl_history) else: rng = np.random rng.seed(1) print "Building model..."