def build_model(self, train_set, test_set, validation_set):
    """
    Building the model should be done prior to training. It will implement the
    training, testing and validation functions. This method should be called
    from any subsequent inheriting model.

    :param loss: The loss function applied to training (cf. updates.py), e.g. mse.
    :param update: The update function (optimization framework) used for training (cf. updates.py), e.g. sgd.
    :param update_args: The args for the update function applied to training, e.g. (0.001,).
    """
    print "### BUILDING MODEL ###"

    self.train_args = {}
    self.train_args['inputs'] = OrderedDict({})
    self.train_args['outputs'] = OrderedDict({})

    self.test_args = {}
    self.test_args['inputs'] = OrderedDict({})
    self.test_args['outputs'] = OrderedDict({})

    self.validate_args = {}
    self.validate_args['inputs'] = OrderedDict({})
    self.validate_args['outputs'] = OrderedDict({})

    self.sym_index = T.iscalar('index')
    self.sym_batchsize = T.iscalar('batchsize')
    self.sym_lr = T.scalar('learningrate')
    self.batch_slice = slice(self.sym_index * self.sym_batchsize,
                             (self.sym_index + 1) * self.sym_batchsize)

    self.sh_train_x = theano.shared(np.asarray(train_set[0], dtype=theano.config.floatX), borrow=True)
    self.sh_train_t = theano.shared(np.asarray(train_set[1], dtype=theano.config.floatX), borrow=True)
    self.sh_test_x = theano.shared(np.asarray(test_set[0], dtype=theano.config.floatX), borrow=True)
    self.sh_test_t = theano.shared(np.asarray(test_set[1], dtype=theano.config.floatX), borrow=True)
    if validation_set is not None:
        self.sh_valid_x = theano.shared(np.asarray(validation_set[0], dtype=theano.config.floatX), borrow=True)
        self.sh_valid_t = theano.shared(np.asarray(validation_set[1], dtype=theano.config.floatX), borrow=True)
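# A minimal sketch (not part of the original class) of how an inheriting model
# typically consumes the symbols prepared above: sym_index/sym_batchsize select
# a minibatch out of the shared datasets through `givens`. The names
# `model_input`, `model_target`, `cost` and `updates` are hypothetical
# placeholders for whatever the subclass defines.
train_fn = theano.function(
    inputs=[self.sym_index, self.sym_batchsize, self.sym_lr],
    outputs=cost,      # hypothetical training cost
    updates=updates,   # hypothetical parameter updates
    givens={model_input: self.sh_train_x[self.batch_slice],
            model_target: self.sh_train_t[self.batch_slice]})
# One epoch then amounts to sweeping the index over the batches:
# for batch_i in range(n_train // batch_size):
#     train_fn(batch_i, batch_size, 0.001)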
def getMinibatchTrainer(self, costFunction, variableToData, rms=True):
    # define params
    lr = T.fscalar('lr')
    start = T.iscalar('start')
    end = T.iscalar('end')

    # Get the cost and its parameters.
    params = costFunction[0]
    cost = costFunction[1]

    # Get the updates.
    updates = self.getUpdates(cost, params, lr, rms)

    # Store all state variables.
    stateManager = StateManager([u[0] for u in updates])

    # Slice the data
    givens = dict()
    for item in variableToData:
        givens[item.variable] = item.slice(start, end)

    # Define the training function.
    train_model = theano.function(
        inputs=[theano.Param(start, borrow=True),
                theano.Param(end, borrow=True),
                theano.Param(lr, borrow=True)],
        outputs=theano.Out(cost, borrow=True),
        updates=updates,
        givens=givens)

    return train_model, stateManager
def f_train(self, t_x, t_corrupt=0.2, t_rate=0.1):
    """
    Return a training function with the following signature:
    input:  lower and upper indices on the training data
            (alternative training data)
    return: likelihood-based cost
            squared distance between training data and prediction
    """
    x = T.matrix('x')   # pipe data through this symbol
    q = self.t_corrupt(x, t_corrupt)
    h = self.t_encode(q)
    z = self.t_decode(h)

    L = - T.sum(x * T.log(z) + (1 - x) * T.log(1 - z), axis=1)
    cost = T.mean(L)                                      # to be returned
    dist = T.mean(T.sqrt(T.sum((x - z) ** 2, axis=1)))    # to be returned

    grad = T.grad(cost, self.parm)
    diff = [(p, p - t_rate * g) for p, g in zip(self.parm, grad)]

    t_fr = T.iscalar()
    t_to = T.iscalar()
    return theano.function(
        [t_fr, t_to],
        [cost, dist],
        updates=diff,
        givens={x: t_x[t_fr:t_to]},
        name="DA_trainer")
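# A hedged usage sketch of the trainer returned above: iterate over contiguous
# [fr, to) index ranges of the shared training data. `da`, `t_x` and the sizes
# are assumptions for illustration, not part of the original module.
trainer = da.f_train(t_x, t_corrupt=0.2, t_rate=0.1)
batch_size, n_train = 20, 1000
for epoch in xrange(15):
    for fr in xrange(0, n_train, batch_size):
        cost, dist = trainer(fr, min(fr + batch_size, n_train))
    print "epoch", epoch, "cost", cost, "dist", dist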
def SimFnIdx(fnsim, embeddings, leftop, rightop):
    """
    This function returns a Theano function to measure the similarity score
    for a given triplet of entity indexes.

    :param fnsim: similarity function (on Theano variables).
    :param embeddings: an Embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    idxo = T.iscalar('idxo')
    idxr = T.iscalar('idxr')
    idxl = T.iscalar('idxl')

    # Graph
    lhs = (embedding.E[:, idxl]).reshape((1, embedding.D))
    rhs = (embedding.E[:, idxr]).reshape((1, embedding.D))
    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))
    simi = fnsim(leftop(lhs, rell), rightop(rhs, relr))

    """
    Theano function inputs.
    :input idxl: index value of the 'left' member.
    :input idxr: index value of the 'right' member.
    :input idxo: index value of the relation member.

    Theano function output.
    :output simi: score value.
    """
    return theano.function([idxl, idxr, idxo], [simi], on_unused_input='ignore')
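# Hypothetical call site (not from the original code): the compiled function is
# fed three plain integer indexes and returns a one-element list of scores.
simfn = SimFnIdx(fnsim, embeddings, leftop, rightop)
score = simfn(12, 47, 3)[0]   # similarity of (left=12, right=47, relation=3)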
def RankRightFnIdx_filtered(fnsim, embeddings, leftop, rightop, subtensorspec=None):
    """
    This function returns a Theano function to measure the similarity score of
    all 'right' entities given couples of relation and 'left' entities (as
    index values).
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    idxl, idxo = T.iscalar('idxl'), T.iscalar('idxo')
    rightparts = T.ivector('rightparts')

    # Graph
    lhs = (embedding.E[:, idxl]).reshape((1, embedding.D))   # lhs: 1xD vector containing the embedding of idxl

    if subtensorspec is not None:
        # We compute the score only for a subset of entities
        rhs = (embedding.E[:, :subtensorspec]).T
    else:
        rhs = embedding.E.T   # rhs: NxD embedding matrix

    rhs = rhs[rightparts, :]   # select the right parts not appearing in the train/valid/test sets

    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))  # rell: 1xD vector containing the embedding of idxo (relationl)
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))  # relr: 1xD vector containing the embedding of idxo (relationr)

    tmp = leftop(lhs, rell)

    # simi = fnsim(a, b), where a = rell(lhs) and b = relr(rhs)
    simi = fnsim(tmp.reshape((1, tmp.shape[1])), rightop(rhs, relr))

    return theano.function([idxl, idxo, rightparts], [simi], on_unused_input='ignore')
def RankRightFnIdx_Schema(fnsim, embeddings, prior, leftop, rightop, subtensorspec=None):
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    idxl, idxo = T.iscalar('idxl'), T.iscalar('idxo')
    g = T.matrix('g')

    # Graph
    lhs = (embedding.E[:, idxl]).reshape((1, embedding.D))   # lhs: 1xD vector containing the embedding of idxl

    if subtensorspec is not None:
        # We compute the score only for a subset of entities
        rhs = (embedding.E[:, :subtensorspec]).T
    else:
        rhs = embedding.E.T   # rhs: NxD embedding matrix

    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))  # rell: 1xD vector containing the embedding of idxo (relationl)
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))  # relr: 1xD vector containing the embedding of idxo (relationr)

    tmp = leftop(lhs, rell)

    # Negative Energy: simi = fnsim(a, b), where a = rell(lhs) and b = relr(rhs)
    simi = fnsim(tmp.reshape((1, tmp.shape[1])), rightop(rhs, relr))

    # Schema-based penalty term
    pen_simi = g[0, :].T * prior.P[idxo, 0].T + g[1, :].T * prior.P[idxo, 1].T
    simi = simi - pen_simi

    return theano.function([idxl, idxo, g], [simi], on_unused_input='ignore')
def build_model(shared_params, options, other_params):
    """
    Build the complete neural network model and return the symbolic variables.
    """
    # symbolic variables
    x = tensor.matrix(name="x", dtype=floatX)
    y1 = tensor.iscalar(name="y1")
    y2 = tensor.iscalar(name="y2")

    # lstm cell
    (ht, ct) = lstm_cell(x, shared_params, options, other_params)   # gets the ht, ct

    # softmax 1 i.e. frame type prediction
    activation = tensor.dot(shared_params['softmax1_W'], ht).transpose() + shared_params['softmax1_b']
    frame_pred = tensor.nnet.softmax(activation)   # .transpose()

    # softmax 2 i.e. gesture class prediction
    #

    # predicted probability for frame type
    f_pred_prob = theano.function([x], frame_pred, name="f_pred_prob")
    # predicted frame type
    f_pred = theano.function([x], frame_pred.argmax(), name="f_pred")

    # cost
    cost = ifelse(tensor.eq(y1, 1),
                  -tensor.log(frame_pred[0, 0] + options['log_offset']) * other_params['begin_cost_factor'],
                  ifelse(tensor.eq(y1, 2),
                         -tensor.log(frame_pred[0, 1] + options['log_offset']) * other_params['end_cost_factor'],
                         ifelse(tensor.eq(y1, 3),
                                -tensor.log(frame_pred[0, 2] + options['log_offset']),
                                tensor.abs_(tensor.log(y1)))),
                  name='ifelse_cost')

    # function for output of the current lstm cell and softmax prediction
    f_model_cell_output = theano.function([x], (ht, ct, frame_pred), name="f_model_cell_output")

    # return the model symbolic variables and theano functions
    return x, y1, y2, f_pred_prob, f_pred, cost, f_model_cell_output
def RankLeftFnIdx_Schema(fnsim, embeddings, prior, leftop, rightop, subtensorspec=None):
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    idxr, idxo = T.iscalar('idxr'), T.iscalar('idxo')
    g = T.matrix('g')

    # Graph
    if subtensorspec is not None:
        # We compute the score only for a subset of entities
        lhs = (embedding.E[:, :subtensorspec]).T
    else:
        lhs = embedding.E.T

    rhs = (embedding.E[:, idxr]).reshape((1, embedding.D))
    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))

    tmp = rightop(rhs, relr)
    simi = fnsim(leftop(lhs, rell), tmp.reshape((1, tmp.shape[1])))

    pen_simi = g[0, :].T * prior.P[idxo, 0].T + g[1, :].T * prior.P[idxo, 1].T
    simi = simi - pen_simi

    return theano.function([idxr, idxo, g], [simi], on_unused_input='ignore')
def getTrainer(self, lossType="NLL"):
    '''
    return a function to do MBSGD on (trainX, trainY)
    '''
    trainY = T.ivector('y')
    alpha = T.dscalar('a')
    lowIdx = T.iscalar()
    highIdx = T.iscalar()
    trainX = T.matrix()

    if lossType == "aNLL":
        loss = self.aNLL(trainY)
    elif lossType == 'MSE':
        loss = self.MSE(trainY)
    else:
        loss = self.NLL(trainY)

    dW = T.grad(cost=loss, wrt=self.W)
    db = T.grad(cost=loss, wrt=self.b)

    updates = [(self.W, self.W - alpha * dW),
               (self.b, self.b - alpha * db)]

    trainer = theano.function(
        inputs=[trainX, trainY, alpha],
        outputs=loss,
        updates=updates,
        givens={
            self.input: trainX,
        },
        allow_input_downcast=True
    )
    return trainer
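# A sketch of driving the returned trainer for minibatch SGD. Note that lowIdx
# and highIdx above are created but never used: the caller slices the numpy
# arrays itself. `clf`, `trainX_np`, `trainY_np` and the hyper-parameters are
# assumptions for illustration, not part of the original class.
trainer = clf.getTrainer(lossType="NLL")
alpha, batch_size = 0.01, 128
for epoch in xrange(10):
    for lo in xrange(0, trainX_np.shape[0], batch_size):
        hi = min(lo + batch_size, trainX_np.shape[0])
        loss = trainer(trainX_np[lo:hi], trainY_np[lo:hi], alpha)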
def __init__(self, in_size, out_size, dim_y, dim_pos, hidden_size_encoder, hidden_size_decoder,
             cell="gru", optimizer="rmsprop", p=0.5, num_sents=1):
    self.X = T.matrix("X")
    self.Y_y = T.matrix("Y_y")
    self.Y_pos = T.matrix("Y_pos")
    self.in_size = in_size
    self.out_size = out_size
    self.dim_y = dim_y
    self.dim_pos = dim_pos
    self.hidden_size_encoder = hidden_size_encoder
    self.hidden_size_decoder = hidden_size_decoder
    self.cell = cell
    self.drop_rate = p
    self.num_sents = num_sents
    self.is_train = T.iscalar('is_train')        # for dropout
    self.batch_size = T.iscalar('batch_size')    # for mini-batch training
    self.mask = T.matrix("mask")
    self.mask_y = T.matrix("mask_y")
    self.optimizer = optimizer

    print "seq2seq out size ", self.out_size
    if self.out_size == self.dim_y + self.dim_pos:
        print "size right !"

    self.define_layers()
    self.define_train_test_funcs()
def multMatVect(v, A, m1, B, m2):
    # TODO : need description for parameter and return
    """
    Multiply the first half of v by A with a modulo of m1 and the second half
    by B with a modulo of m2.

    Notes
    -----
    The parameters of dot_modulo are passed implicitly because passing them
    explicitly takes more time than running the function's C-code.

    """
    if multMatVect.dot_modulo is None:
        A_sym = tensor.lmatrix('A')
        s_sym = tensor.ivector('s')
        m_sym = tensor.iscalar('m')
        A2_sym = tensor.lmatrix('A2')
        s2_sym = tensor.ivector('s2')
        m2_sym = tensor.iscalar('m2')
        o = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym)
        multMatVect.dot_modulo = function(
            [A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o, profile=False)

    # This way of calling the Theano fct is done to bypass Theano overhead.
    f = multMatVect.dot_modulo
    f.input_storage[0].storage[0] = A
    f.input_storage[1].storage[0] = v[:3]
    f.input_storage[2].storage[0] = m1
    f.input_storage[3].storage[0] = B
    f.input_storage[4].storage[0] = v[3:]
    f.input_storage[5].storage[0] = m2
    f.fn()
    r = f.output_storage[0].storage[0]
    return r
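# Illustrative call (not from the original module): v holds the 6-element MRG
# state, A and B are 3x3 recurrence matrices, m1 and m2 the two moduli. The
# identity matrices and modulus values here are placeholders only.
import numpy
if not hasattr(multMatVect, 'dot_modulo'):
    multMatVect.dot_modulo = None   # the real module initializes this attribute next to the def
v = numpy.asarray([1, 2, 3, 4, 5, 6], dtype='int32')
A = numpy.eye(3, dtype='int64')
B = numpy.eye(3, dtype='int64')
r = multMatVect(v, A, 2147483647, B, 2147462579)
# r[:3] == A.dot(v[:3]) % m1 and r[3:] == B.dot(v[3:]) % m2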
def test_compute_lnZ(self):
    v = T.matrix('v')
    z = T.iscalar('z')
    V = cartesian([(0, 1)] * self.input_size, dtype=config.floatX)
    #H = cartesian([(0, 1)] * self.hidden_size, dtype=config.floatX)

    # We simulate having an infinite number of hidden units by adding a lot of
    # hidden units with parameters set to 0.
    nb_hidden_units_to_add = 10000
    model = iRBM(input_size=self.model.input_size,
                 hidden_size=self.model.hidden_size + nb_hidden_units_to_add,
                 beta=self.model.beta.get_value())

    model.W.set_value(np.r_[self.model.W.get_value(),
                            np.zeros((nb_hidden_units_to_add, model.input_size), dtype=theano.config.floatX)])
    model.b.set_value(np.r_[self.model.b.get_value(),
                            np.zeros((nb_hidden_units_to_add,), dtype=theano.config.floatX)])
    model.c.set_value(self.model.c.get_value())

    v = T.matrix('v')
    z = T.iscalar('z')
    F_vz = theano.function([v, z], model.F(v, z))

    energies = []
    for z in range(1, model.hidden_size + 1):
        energies.append(F_vz(V, z))

    lnZ = logsumexp(-np.array(energies)).eval()

    lnZ_using_free_energy = theano.function([v], logsumexp(-self.model.free_energy(v)))
    assert_almost_equal(lnZ_using_free_energy(V), lnZ, decimal=5)   # decimal=5 needed for float32
def compile_functions(self, opt, **args):
    print '... compiling training functions'
    gen_cost, gen_show_cost, dis_cost, cost_pfake, cost_ptrue = self.get_cost()
    self.opt = opt
    gen_updates = self.opt.get_updates(gen_cost, self.gen_params)
    dis_updates = self.opt.get_updates(dis_cost, self.dis_params)

    self.get_noise = theano.function(
        [],
        self.theano_rng.uniform(size=(self.batch_size, self.num_z), low=-1, high=1)
    )

    start_index = T.iscalar('start_index')
    end_index = T.iscalar('end_index')

    if self.uint8_data:
        given_train_x = T.cast(self.shared_train[start_index:end_index], dtype='float32')
    else:
        given_train_x = self.shared_train[start_index:end_index]

    self.train_gen_model = theano.function(
        [self.z],
        gen_show_cost,
        updates=gen_updates,
    )

    self.train_dis_model = theano.function(
        [start_index, end_index, self.z],
        [cost_pfake, cost_ptrue],
        updates=dis_updates,
        givens={self.x: given_train_x}
    )
def compile(self, model):
    assert isinstance(model, Model)
    self.model = model

    dataset = self.dataset
    X, Y = dataset.preproc(dataset.X, dataset.Y)
    self.X = theano.shared(X, "X")
    self.Y = theano.shared(Y, "Y")

    self.logger.info("compiling do_loglikelihood")
    n_samples = T.iscalar("n_samples")
    batch_idx = T.iscalar("batch_idx")
    batch_size = T.iscalar("batch_size")

    first = batch_idx * batch_size
    last = first + batch_size
    X_batch, Y_batch = dataset.late_preproc(self.X[first:last], self.Y[first:last])

    log_PX, _, _, _, KL, Hp, Hq = model.log_likelihood(X_batch, n_samples=n_samples)
    batch_L = T.sum(log_PX)
    batch_L2 = T.sum(log_PX ** 2)
    batch_KL = [T.sum(kl) for kl in KL]
    batch_Hp = [T.sum(hp) for hp in Hp]
    batch_Hq = [T.sum(hq) for hq in Hq]

    self.do_loglikelihood = theano.function(
        inputs=[batch_idx, batch_size, n_samples],
        outputs=[batch_L, batch_L2] + batch_KL + batch_Hp + batch_Hq,
        name="do_likelihood",
    )
def compile_bn(data_set, model, make_updates):
    """
    Put the data into shared variables, build the computational graph from the
    model and optimizer, and compile it.

    Parameters
    -----------
    data_set : list of numpy.ndarray
        feature_vec : ndarray (n_pixels, D, n_tensors)
        gt_vec : ndarray (n_pixels, D)
        test_feature_vec, test_gt_vec
    model : e.g. models.Rcn1layer
    optimizer : e.g. optimizers.SGD
    """
    s_input, s_target, s_test_input, s_test_target = share_data_sets(*data_set)
    nn, obj, train_mse, model_updates, model_param_l = model.make_graph_train()
    test_mse, test_out = model.make_graph_test()
    updates, opt_param_list = make_updates(loss=obj, param_list=nn.param_l)

    i_batch = T.iscalar("i_batch")
    index_list = T.ivector("index_list")
    batch_size = T.iscalar("batch_size")

    od = OrderedDict()
    for k, e in updates.items() + model_updates.items():
        od[k] = e

    f_train = theano.function(
        inputs=[i_batch, index_list, batch_size] + opt_param_list + model_param_l,
        updates=od,
        givens=[(nn.x_t3, s_input[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]]),
                (nn.t_mat, s_target[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]])],
        on_unused_input='warn')

    f_training_error = theano.function(
        inputs=[i_batch, index_list, batch_size] + model_param_l,
        outputs=[train_mse],
        givens=[(nn.x_t3, s_input[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]]),
                (nn.t_mat, s_target[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]])],
        on_unused_input='warn')

    f_test_error = theano.function(
        inputs=[i_batch, index_list, batch_size],
        outputs=[test_mse],
        givens=[(nn.x_t3, s_test_input[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]]),
                (nn.t_mat, s_test_target[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]])])

    f_output = theano.function(
        inputs=[nn.x_t3],
        outputs=[test_out])

    result = [f_train, f_training_error, f_test_error, f_output,
              s_input, s_target, s_test_input, s_test_target, nn.param_l]
    return result
def multMatVect(v, A, m1, B, m2):
    """
    multiply the first half of v by A with a modulo of m1 and the second half
    by B with a modulo of m2

    Note: The parameters of dot_modulo are passed implicitly because passing
    them explicitly takes more time than running the function's C-code.
    """
    if multMatVect.dot_modulo is None:
        A_sym = tensor.lmatrix("A")
        s_sym = tensor.ivector("s")
        m_sym = tensor.iscalar("m")
        A2_sym = tensor.lmatrix("A2")
        s2_sym = tensor.ivector("s2")
        m2_sym = tensor.iscalar("m2")
        o = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym)
        multMatVect.dot_modulo = function(
            [A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o)

    # This way of calling the Theano fct is done to bypass Theano overhead.
    f = multMatVect.dot_modulo
    f.input_storage[0].storage[0] = A
    f.input_storage[1].storage[0] = v[:3]
    f.input_storage[2].storage[0] = m1
    f.input_storage[3].storage[0] = B
    f.input_storage[4].storage[0] = v[3:]
    f.input_storage[5].storage[0] = m2
    f.fn()
    r = f.output_storage[0].storage[0]
    return r
def compile(self, log_pxz, log_qpz, cost, a_pxz):
    batch_idx = T.iscalar()
    learning_rate = T.fscalar()

    updates, norm_grad = self.hp.optimizer(cost, self.params.values(), lr=learning_rate)

    self.outidx = {'cost': 0, 'cost_p': 1, 'cost_q': 2, 'norm_grad': 3}
    outputs = [cost, log_pxz, log_qpz]

    self.train = theano.function(
        inputs=[batch_idx, learning_rate],
        givens={self.X: self.data['tr_X'][batch_idx * self.hp.batch_size:(batch_idx + 1) * self.hp.batch_size]},
        outputs=outputs + [norm_grad],
        updates=updates)

    self.validate = theano.function(
        inputs=[batch_idx],
        givens={self.X: self.data['tr_X'][batch_idx * self.hp.test_batch_size:(batch_idx + 1) * self.hp.test_batch_size]},
        outputs=outputs)

    self.test = theano.function(
        inputs=[batch_idx],
        givens={self.X: self.data['te_X'][batch_idx * self.hp.test_batch_size:(batch_idx + 1) * self.hp.test_batch_size]},
        outputs=outputs)

    n_samples = T.iscalar()
    if self.resample_z:
        self.data['ge_Z'] = srnd.normal((self.max_gen_samples, self.n_z), dtype=theano.config.floatX)
    else:
        self.data['ge_Z'] = shared(np.random.randn(self.max_gen_samples, self.n_z))

    self.decode = theano.function(
        inputs=[n_samples],
        givens={self.Z: self.data['ge_Z'][:n_samples]},
        outputs=a_pxz)
def init_nnet(W, n_classes, vec_dim):
    """Initialize neural network.

    Args:
      W (theano.shared): embedding matrix
      n_classes: number of classes to be predicted
      vec_dim: dimensionality of the embeddings

    """
    w_idx = TT.iscalar(name="w_idx")
    y_gold = TT.iscalar(name="y_gold")
    embs = W[w_idx]

    Theta = theano.shared(value=ORTHOGONAL.sample((n_classes, vec_dim)), name="Theta")
    beta = theano.shared(value=HE_UNIFORM.sample((1, n_classes)), name="beta")
    y_probs = TT.nnet.softmax(TT.dot(Theta, embs.T).flatten() + beta).flatten()

    params = [Theta]
    cost = -TT.mean(TT.log(y_probs[y_gold]))
    updates = sgd_updates_adadelta(params, cost)
    train = theano.function([w_idx, y_gold], cost, updates=updates)

    y_pred = TT.argmax(y_probs)
    y_score = y_probs[y_pred]
    predict = theano.function([w_idx], (y_pred, y_score))

    acc = TT.eq(y_gold, y_pred)
    validate = theano.function([w_idx, y_gold], acc)
    return (train, validate, predict, params)
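# A hedged sketch of a training loop over the functions returned by init_nnet().
# `train_set` and `dev_set` are assumed to be lists of (word_index, label) pairs;
# the epoch count and dimensions are placeholders, not from the original code.
train_f, validate_f, predict_f, params = init_nnet(W, n_classes=3, vec_dim=100)
for epoch in range(10):
    total_cost = sum(train_f(w_idx, y) for w_idx, y in train_set)
    dev_acc = sum(validate_f(w_idx, y) for w_idx, y in dev_set) / float(len(dev_set))
    print("epoch %d: cost %.4f, dev acc %.4f" % (epoch, total_cost, dev_acc))
label, score = predict_f(42)   # predicted class and its probability for word index 42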
def test_multMatVect():
    A1 = tensor.lmatrix('A1')
    s1 = tensor.ivector('s1')
    m1 = tensor.iscalar('m1')
    A2 = tensor.lmatrix('A2')
    s2 = tensor.ivector('s2')
    m2 = tensor.iscalar('m2')

    g0 = rng_mrg.DotModulo()(A1, s1, m1, A2, s2, m2)
    f0 = theano.function([A1, s1, m1, A2, s2, m2], g0)

    i32max = numpy.iinfo(numpy.int32).max

    A1 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s1 = numpy.random.randint(0, i32max, 3).astype('int32')
    m1 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")
    A2 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s2 = numpy.random.randint(0, i32max, 3).astype('int32')
    m2 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")

    f0.input_storage[0].storage[0] = A1
    f0.input_storage[1].storage[0] = s1
    f0.input_storage[2].storage[0] = m1
    f0.input_storage[3].storage[0] = A2
    f0.input_storage[4].storage[0] = s2
    f0.input_storage[5].storage[0] = m2

    r_a1 = rng_mrg.matVecModM(A1, s1, m1)
    r_a2 = rng_mrg.matVecModM(A2, s2, m2)
    f0.fn()
    r_b = f0.output_storage[0].value

    assert numpy.allclose(r_a1, r_b[:3])
    assert numpy.allclose(r_a2, r_b[3:])
def test_clip_grad_int():
    # test that integers don't crash clip gradient
    x = tensor.iscalar()
    y = tensor.iscalar()
    z = tensor.iscalar()
    c = tensor.clip(x, y, z)
    tensor.grad(c, [x, y, z])
def __init__(self, rng, state): Model.__init__(self) self.state = state # Compatibility towards older models self.__dict__.update(state) self.rng = rng # Load dictionary raw_dict = cPickle.load(open(self.dictionary, 'r')) # Probabilities for each term in the corpus self.str_to_idx = dict([(tok, tok_id) for tok, tok_id, _ in raw_dict]) self.idx_to_str = dict([(tok_id, tok) for tok, tok_id, freq in raw_dict]) # if '<s>' not in self.str_to_idx \ # or '</s>' not in self.str_to_idx: # raise Exception("Error, malformed dictionary!") # Number of words in the dictionary self.idim = len(self.str_to_idx) self.state['idim'] = self.idim logger.debug("Initializing language model") self.language_model = LanguageModel(self.state, self.rng, self) # Init params self.params = self.language_model.params self.x_data = T.imatrix('x_data') self.x_cost_mask = T.matrix('cost_mask') self.x_max_length = T.iscalar('x_max_length') # The training is done with a trick. We append a special </q> at the beginning of the session # so that we can predict also the first query in the session starting from the session beginning token (</q>). self.aug_x_data = T.concatenate([T.alloc(np.int32(self.eos_sym), 1, self.x_data.shape[1]), self.x_data]) self.training_x = self.aug_x_data[:self.x_max_length] self.training_y = self.aug_x_data[1:self.x_max_length+1] self.training_x_cost_mask = self.x_cost_mask[:self.x_max_length].flatten() target_probs, self.eval_h = self.language_model.build_lm(self.training_x, y=self.training_y, mode=LanguageModel.EVALUATION) # Prediction cost self.prediction_cost = T.sum(-T.log(target_probs) * self.training_x_cost_mask) # Sampling variables self.n_samples = T.iscalar("n_samples") self.n_steps = T.iscalar("n_steps") (self.sample, self.sample_log_prob), self.sampling_updates \ = self.language_model.build_sampler(self.n_samples, self.n_steps) # Beam-search variables self.beam_source = T.lvector("beam_source") self.beam_h = T.matrix("beam_h") self.beam_step_num = T.lscalar("beam_step_num")
def test():
    a = T.iscalar()
    b = T.iscalar()
    c = T.iscalar()

    f1 = theano.function([a, b], lt(a, b))
    f2 = theano.function([a], lt2(a), on_unused_input='ignore')

    print f1(1, 2)
    print f2(-2)
    print f2(3)
def SimilarityFunction(fnsim, embeddings, leftop, rightop):
    idxrel = T.iscalar("idxrel")
    idxright = T.iscalar("idxright")
    idxleft = T.iscalar("idxleft")

    lhs = (embeddings.E[:, idxleft]).reshape((1, embeddings.D))
    rhs = (embeddings.E[:, idxright]).reshape((1, embeddings.D))
    rel = (embeddings.E[:, idxrel]).reshape((1, embeddings.D))

    simi = fnsim(leftop(lhs, rel), rightop(rhs, rel))
    return theano.function([idxleft, idxright, idxrel], [simi])
def EnergyFn(fnsim, embeddings, leftop, rightop):
    embedding, relationl, relationr = parse_embeddings(embeddings)

    idxl, idxo, idxr = T.iscalar('idxl'), T.iscalar('idxo'), T.iscalar('idxr')

    lhs = (embedding.E[:, idxl]).reshape((1, embedding.D))
    rhs = (embedding.E[:, idxr]).reshape((1, embedding.D))
    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))

    energy = - fnsim(leftop(lhs, rell), rightop(rhs, relr))
    return theano.function([idxl, idxr, idxo], [energy], on_unused_input='ignore')
def compile_functions(self, opt, **args): print '... compiling training functions' # propagte for training with batch normalization with upated std and mean for each batch self.layer_outputs = self.network_fprop() cost, show_cost = self.get_cost() self.opt = opt updates = self.opt.get_updates(cost, self.params) # propagate again for validation with fixed mean and std for batch normalization self.layer_outputs = self.network_fprop(isTest=True, noiseless=True) self.final_output = self.layer_outputs[self.network_structure[-1]['name']] errors = self.get_errors() start_index = T.iscalar('start_index') end_index = T.iscalar('end_index') train_given = {} print 'number of training inputs = ', self.ninputs for i in xrange(self.ninputs): if self.uint8_data: train_given[self.xs[i]] = T.cast(self.shared_train[i][start_index:end_index], dtype='float32') else: train_given[self.xs[i]] = self.shared_train[i][start_index:end_index] if self.batch_mean_subtraction: assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction' assert len(self.train_mean) == self.ninputs, 'train_mean need to have the same number as number of inputs' train_given[self.xs[i]] -= self.train_mean[i] train_given[self.y] = self.shared_train_labels[start_index:end_index] self.train_model = theano.function( inputs=[start_index, end_index], outputs=[show_cost, errors], updates = updates, givens = train_given ) if hasattr(self, 'shared_valid'): valid_given = {} for i in xrange(self.ninputs): if self.uint8_data: valid_given[self.xs[i]] = T.cast(self.shared_valid[i][start_index:end_index], dtype='float32') else: valid_given[self.xs[i]] = self.shared_valid[i][start_index:end_index] if self.batch_mean_subtraction: assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction' assert len(self.train_mean) == self.ninputs, 'train_mean need to have the same number as number of inputs' valid_given[self.xs[i]] -= self.train_mean[i] valid_given[self.y] = self.shared_valid_labels[start_index:end_index] self.validate_model = theano.function( inputs=[start_index,end_index], outputs=errors, givens = valid_given )
def get_train(U_Ot, U_R, lenW, n_facts): def phi_x1(x_t, L): return T.concatenate([L[x_t].reshape((-1,)), zeros((2*lenW,)), zeros((3,))], axis=0) def phi_x2(x_t, L): return T.concatenate([zeros((lenW,)), L[x_t].reshape((-1,)), zeros((lenW,)), zeros((3,))], axis=0) def phi_y(x_t, L): return T.concatenate([zeros((2*lenW,)), L[x_t].reshape((-1,)), zeros((3,))], axis=0) def phi_t(x_t, y_t, yp_t, L): return T.concatenate([zeros(3*lenW,), T.stack(T.switch(T.lt(x_t,y_t), 1, 0), T.switch(T.lt(x_t,yp_t), 1, 0), T.switch(T.lt(y_t,yp_t), 1, 0))], axis=0) def s_Ot(xs, y_t, yp_t, L): result, updates = theano.scan( lambda x_t, t: T.dot(T.dot(T.switch(T.eq(t, 0), phi_x1(x_t, L).reshape((1,-1)), phi_x2(x_t, L).reshape((1,-1))), U_Ot.T), T.dot(U_Ot, (phi_y(y_t, L) - phi_y(yp_t, L) + phi_t(x_t, y_t, yp_t, L)))), sequences=[xs, T.arange(T.shape(xs)[0])]) return result.sum() def sR(xs, y_t, L, V): result, updates = theano.scan( lambda x_t, t: T.dot(T.dot(T.switch(T.eq(t, 0), phi_x1(x_t, L).reshape((1,-1)), phi_x2(x_t, L).reshape((1,-1))), U_R.T), T.dot(U_R, phi_y(y_t, V))), sequences=[xs, T.arange(T.shape(xs)[0])]) return result.sum() x_t = T.iscalar('x_t') m = [x_t] + [T.iscalar('m_o%d' % i) for i in xrange(n_facts)] f = [T.iscalar('f%d_t' % i) for i in xrange(n_facts)] r_t = T.iscalar('r_t') gamma = T.scalar('gamma') L = T.fmatrix('L') # list of messages V = T.fmatrix('V') # vocab r_args = T.stack(*m) cost_arr = [0] * 2 * (len(m)-1) updates_arr = [0] * 2 * (len(m)-1) for i in xrange(len(m)-1): cost_arr[2*i], updates_arr[2*i] = theano.scan( lambda f_bar, t: T.switch(T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L)-1)), 0, T.largest(gamma - s_Ot(T.stack(*m[:i+1]), f[i], t, L), 0)), sequences=[L, T.arange(T.shape(L)[0])]) cost_arr[2*i+1], updates_arr[2*i+1] = theano.scan( lambda f_bar, t: T.switch(T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L)-1)), 0, T.largest(gamma + s_Ot(T.stack(*m[:i+1]), t, f[i], L), 0)), sequences=[L, T.arange(T.shape(L)[0])]) cost1, u1 = theano.scan( lambda r_bar, t: T.switch(T.eq(r_t, t), 0, T.largest(gamma - sR(r_args, r_t, L, V) + sR(r_args, t, L, V), 0)), sequences=[V, T.arange(T.shape(V)[0])]) cost = cost1.sum() for c in cost_arr: cost += c.sum() g_uo, g_ur = T.grad(cost, [U_Ot, U_R]) train = theano.function( inputs=[r_t, gamma, L, V] + m + f, outputs=[cost], updates=[(U_Ot, U_Ot-alpha*g_uo), (U_R, U_R-alpha*g_ur)]) return train
def test14():
    x = T.iscalar('x')
    y = T.iscalar('y')
    z = T.arange(x)
    z = T.shape_padaxis(z, axis=1)
    z2 = T.zeros((x, y))
    z2 = z + z2
    fn = theano.function(inputs=[x, y], outputs=[z2], allow_input_downcast=True)
    res = fn(3, 4)
    print res, res[0].shape
def create_gradientfunctions(self, x_train):
    """This function takes as input the whole dataset and creates the entire model"""
    x = T.matrix("x")
    epoch = T.iscalar("epoch")
    batch_size = x.shape[0]

    alpha, beta = self.encoder(x)
    z = self.sampler(alpha, beta)
    reconstructed_x, logpxz = self.decoder(x, z)

    # Expectation of (logpz - logqz_x) over logqz_x is equal to KLD (see appendix B):
    # KLD = 0.5 * T.sum(1 + beta - alpha**2 - T.exp(beta), axis=1, keepdims=True)
    #KLD = 0.5 * T.sum(1 + beta - (alpha**2 + T.exp(beta)) / (2*(self.prior_noise_level**2)), axis=1, keepdims=True)

    # KLD = cross-entropy of the sample distribution of sigmoid(z) from the beta distribution
    alpha_prior = 1.0 / self.prior_noise_level
    beta_prior = 1.0 / self.prior_noise_level
    # sigmoidZ = T.nnet.sigmoid(z)
    # KLD = 25*T.sum((alpha_prior-1)*sigmoidZ + (beta-1)*(1-sigmoidZ) - betaln(alpha_prior,beta), axis=1, keepdims=True)
    # KLD = 0
    KLD = -(betaln(alpha, beta) - betaln(alpha_prior, beta_prior)
            + (alpha_prior - alpha)*T.psi(alpha_prior) + (beta_prior - beta)*T.psi(beta_prior)
            + (alpha - alpha_prior + beta - beta_prior)*T.psi(alpha_prior + beta_prior))

    # Average over batch dimension
    logpx = T.mean(logpxz + KLD)
    rmse_val = rmse_score(x, reconstructed_x)

    # Compute all the gradients
    gradients = T.grad(logpx, self.params.values())

    # Adam implemented as updates
    updates = self.get_adam_updates(gradients, epoch)

    batch = T.iscalar('batch')
    givens = {
        x: x_train[batch*self.batch_size:(batch+1)*self.batch_size, :]
    }

    # Define a bunch of functions for convenience
    update = theano.function([batch, epoch], logpx, updates=updates, givens=givens)
    likelihood = theano.function([x], logpx)
    eval_rmse = theano.function([x], rmse_val)
    encode = theano.function([x], z)
    decode = theano.function([z], reconstructed_x)
    encode_alpha = theano.function([x], alpha)
    encode_beta = theano.function([x], beta)

    return update, likelihood, encode, decode, encode_alpha, encode_beta, eval_rmse
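# For reference, the closed-form KL divergence between two Beta distributions,
# which the KLD expression above appears to instantiate (with the digammas
# taken at the prior parameters), is
#
#   KL( Beta(a, b) || Beta(c, d) )
#       = ln B(c, d) - ln B(a, b)
#         + (a - c) psi(a) + (b - d) psi(b) + (c - a + d - b) psi(a + b),
#
# where B is the Beta function and psi the digamma. The code's KLD looks like
# the negative of this with (a, b) = (alpha_prior, beta_prior) and
# (c, d) = (alpha, beta); this is an observation, not part of the original.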
def make_theano_evaluator(use_log): """This returns a function(!) that calculates the gradient and cost. Heh.""" X = T.dmatrix('X') triplets = T.imatrix('triplets') alpha = T.dscalar('alpha') lamb = T.dscalar('lambda') no_dims = T.iscalar('no_dims') N = T.iscalar('N') triplets_A = triplets[:,0] triplets_B = triplets[:,1] triplets_C = triplets[:,2] # Compute Student-t kernel. Look familiar? sum_X = T.sum(X**2, axis=1) a = -2 * (X.dot(X.T)) b = a + sum_X[np.newaxis,:] + sum_X[:,np.newaxis] K = (1 + b / alpha) ** ((alpha+1)/-2) # Compute value of cost function P = K[triplets_A,triplets_B] / ( K[triplets_A,triplets_B] + K[triplets_A,triplets_C]) if use_log: C = -T.sum(T.log(P)) + lamb * T.sum(X**2) else: C = -T.sum(P) + lamb * T.sum(X**2) # Compute gradient, for each dimension const = (alpha+1) / alpha dim = T.iscalar('dim') def each_dim(dim): if use_log: A_to_B = (1 - P) * K[triplets_A,triplets_B] * (X[triplets_A][:,dim] - X[triplets_B][:,dim]) B_to_C = (1 - P) * K[triplets_A,triplets_C] * (X[triplets_A][:,dim] - X[triplets_C][:,dim]) else: A_to_B = P*(1 - P) * K[triplets_A,triplets_B] * (X[triplets_A][:,dim] - X[triplets_B][:,dim]) B_to_C = P*(1 - P) * K[triplets_A,triplets_C] * (X[triplets_A][:,dim] - X[triplets_C][:,dim]) this_dim = (-const * T.stack(A_to_B - B_to_C, -A_to_B, B_to_C)).T dC = T.extra_ops.bincount(triplets.ravel(), weights=this_dim.ravel(), # minlength=N ) return -dC + 2*lamb*X[:,dim] # loop across all dimensions... theano loops are weird, yes... all_dims = (t.scan(each_dim, # non_sequences=N, sequences=T.arange(no_dims)) )[0].T return t.function([X,N,no_dims,triplets,lamb,alpha], [C, all_dims], on_unused_input='ignore')
def compile_functions(self, opt, **args): print '... compiling training functions' # propagte for training with batch normalization with upated std and mean for each batch self.layer_outputs = self.network_fprop(self.layers, self.x, self.y) cost, show_cost = self.get_cost() self.opt = opt updates = self.opt.get_updates(cost, self.params) # propagate again for validation with fixed mean and std for batch normalization self.layer_outputs = self.network_fprop(self.layers, self.x, self.y, isTest=True, noiseless=True) self.final_output = self.layer_outputs[self.network_structure[-1]['name']] errors = self.get_errors() start_index = T.iscalar('start_index') end_index = T.iscalar('end_index') train_given = {} if self.uint8_data: given_train_x = T.cast(self.shared_train[start_index:end_index], dtype='float32') else: given_train_x = self.shared_train[start_index:end_index] if self.batch_mean_subtraction: assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction' given_train_x -= self.train_mean train_given[self.x] = given_train_x train_given[self.y] = self.shared_train_labels[start_index:end_index] self.train_model = theano.function( inputs=[start_index,end_index], outputs=show_cost, updates = updates, givens = train_given ) if hasattr(self, 'shared_valid'): valid_given = {} if self.uint8_data: given_valid_x = T.cast(self.shared_valid[start_index:end_index], dtype='float32') else: given_valid_x = self.shared_valid[start_index:end_index] if self.batch_mean_subtraction: assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction' given_valid_x -= self.train_mean valid_given[self.x] = given_valid_x valid_given[self.y] = self.shared_valid_labels[start_index:end_index] self.validate_model = theano.function( inputs=[start_index, end_index], outputs=errors, givens = valid_given )
def create_iter_functions(dataset, output_layer, X_tensor_type=T.matrix, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, momentum=MOMENTUM): """Create functions for training, validation and testing to iterate one epoch. """ batch_index = T.iscalar('batch_index') X_batch = X_tensor_type('x') y_batch = T.ivector('y') batch_slice = slice(batch_index * batch_size, (batch_index + 1) * batch_size) objective = lasagne.objectives.Objective( output_layer, loss_function=lasagne.objectives.categorical_crossentropy) loss_train = objective.get_loss(X_batch, target=y_batch) loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True) pred = T.argmax(output_layer.get_output(X_batch, deterministic=True), axis=1) accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX) all_params = lasagne.layers.get_all_params(output_layer) updates = lasagne.updates.nesterov_momentum(loss_train, all_params, learning_rate, momentum) iter_train = theano.function( [batch_index], loss_train, updates=updates, givens={ X_batch: dataset['X_train'][batch_slice], y_batch: dataset['y_train'][batch_slice], }, ) iter_valid = theano.function( [batch_index], [loss_eval, accuracy], givens={ X_batch: dataset['X_valid'][batch_slice], y_batch: dataset['y_valid'][batch_slice], }, ) iter_test = theano.function( [batch_index], [loss_eval, accuracy], givens={ X_batch: dataset['X_test'][batch_slice], y_batch: dataset['y_test'][batch_slice], }, ) return dict( train=iter_train, valid=iter_valid, test=iter_test, )
def test_tex_print(): tt_normalrv_noname_expr = tt.scalar("b") * NormalRV( tt.scalar("\\mu"), tt.scalar("\\sigma")) expected = textwrap.dedent(r""" \begin{equation} \begin{gathered} b \in \mathbb{R}, \,\mu \in \mathbb{R}, \,\sigma \in \mathbb{R} \\ a \sim \operatorname{N}\left(\mu, {\sigma}^{2}\right)\, \in \mathbb{R} \end{gathered} \\ (b \odot a) \end{equation} """) assert tt_tprint(tt_normalrv_noname_expr) == expected.strip() tt_normalrv_name_expr = tt.scalar("b") * NormalRV( tt.scalar("\\mu"), tt.scalar("\\sigma"), size=[2, 1], name="X") expected = textwrap.dedent(r""" \begin{equation} \begin{gathered} b \in \mathbb{R}, \,\mu \in \mathbb{R}, \,\sigma \in \mathbb{R} \\ X \sim \operatorname{N}\left(\mu, {\sigma}^{2}\right)\, \in \mathbb{R}^{2 \times 1} \end{gathered} \\ (b \odot X) \end{equation} """) assert tt_tprint(tt_normalrv_name_expr) == expected.strip() tt_2_normalrv_noname_expr = tt.matrix("M") * NormalRV( tt.scalar("\\mu_2"), tt.scalar("\\sigma_2")) tt_2_normalrv_noname_expr *= tt.scalar("b") * NormalRV( tt_2_normalrv_noname_expr, tt.scalar("\\sigma")) + tt.scalar("c") expected = textwrap.dedent(r""" \begin{equation} \begin{gathered} M \in \mathbb{R}^{N^{M}_{0} \times N^{M}_{1}} \\ \mu_2 \in \mathbb{R}, \,\sigma_2 \in \mathbb{R} \\ b \in \mathbb{R}, \,\sigma \in \mathbb{R}, \,c \in \mathbb{R} \\ a \sim \operatorname{N}\left(\mu_2, {\sigma_2}^{2}\right)\, \in \mathbb{R} \\ d \sim \operatorname{N}\left((M \odot a), {\sigma}^{2}\right)\, \in \mathbb{R}^{N^{d}_{0} \times N^{d}_{1}} \end{gathered} \\ ((M \odot a) \odot ((b \odot d) + c)) \end{equation} """) assert tt_tprint(tt_2_normalrv_noname_expr) == expected.strip() expected = textwrap.dedent(r""" \begin{equation} \begin{gathered} b \in \mathbb{Z}, \,c \in \mathbb{Z}, \,M \in \mathbb{R}^{N^{M}_{0} \times N^{M}_{1}} \end{gathered} \\ M\left[b, \,c\right] \end{equation} """) # TODO: "c" should be "1". assert (tt_tprint( tt.matrix("M")[tt.iscalar("a"), tt.constant(1, dtype="int")]) == expected.strip()) expected = textwrap.dedent(r""" \begin{equation} \begin{gathered} M \in \mathbb{R}^{N^{M}_{0} \times N^{M}_{1}} \end{gathered} \\ M\left[1\right] \end{equation} """) assert tt_tprint(tt.matrix("M")[1]) == expected.strip() expected = textwrap.dedent(r""" \begin{equation} \begin{gathered} M \in \mathbb{N}^{N^{M}_{0}} \end{gathered} \\ M\left[2:4:0\right] \end{equation} """) assert tt_tprint(tt.vector("M", dtype="uint32")[0:4:2]) == expected.strip() norm_rv = NormalRV(tt.scalar("\\mu"), tt.scalar("\\sigma")) rv_obs = observed(tt.constant(1.0, dtype=norm_rv.dtype), norm_rv) expected = textwrap.dedent(r""" \begin{equation} \begin{gathered} \mu \in \mathbb{R}, \,\sigma \in \mathbb{R} \\ a \sim \operatorname{N}\left(\mu, {\sigma}^{2}\right)\, \in \mathbb{R} \end{gathered} \\ a = 1.0 \end{equation} """) assert tt_tprint(rv_obs) == expected.strip()
from __future__ import absolute_import, print_function, division

import theano
import theano.tensor as tt
from six.moves import xrange

k = tt.iscalar("k")
A = tt.vector("A")


def inner_fct(prior_result, A):
    return prior_result * A

# Symbolic description of the result
result, updates = theano.scan(fn=inner_fct,
                              outputs_info=tt.ones_like(A),
                              non_sequences=A, n_steps=k)

# Scan has provided us with A**1 through A**k.  Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]

power = theano.function(inputs=[A, k], outputs=final_result,
                        updates=updates)

print(power(list(range(10)), 2))
# [ 0.  1.  4.  9. 16. 25. 36. 49. 64. 81.]
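# A small variant under the same imports: keep the whole sequence of powers
# A**1 .. A**k instead of only the last value (so scan does store the
# intermediate results). This reuses `result` and `updates` from above.
all_powers = theano.function(inputs=[A, k], outputs=result, updates=updates)
print(all_powers(list(range(4)), 3))
# rows are A**1, A**2, A**3: [0 1 2 3], [0 1 4 9], [0 1 8 27]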
def test_value_h(self): "tests that the value of the kl divergence decreases with each update to h_i " model = self.model e_step = self.e_step X = self.X assert X.shape[0] == self.m init_H = e_step.init_H_hat(V=X) init_Mu1 = e_step.init_S_hat(V=X) prev_setting = config.compute_test_value config.compute_test_value = 'off' H, Mu1 = function([], outputs=[init_H, init_Mu1])() config.compute_test_value = prev_setting H = broadcast(H, self.m) Mu1 = broadcast(Mu1, self.m) H = np.cast[config.floatX](self.model.rng.uniform(0., 1., H.shape)) Mu1 = np.cast[config.floatX](self.model.rng.uniform( -5., 5., Mu1.shape)) H_var = T.matrix(name='H_var') H_var.tag.test_value = H Mu1_var = T.matrix(name='Mu1_var') Mu1_var.tag.test_value = Mu1 idx = T.iscalar() idx.tag.test_value = 0 newH = e_step.infer_H_hat(V=X, H_hat=H_var, S_hat=Mu1_var) h_idx = newH[:, idx] h_i_func = function([H_var, Mu1_var, idx], h_idx) sigma0 = 1. / model.alpha Sigma1 = e_step.infer_var_s1_hat() mu0 = T.zeros_like(model.mu) #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1 # (they don't affect the outcome of this test and some of them are intractable ) trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \ model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) trunc_kl_func = function([H_var, Mu1_var], trunc_kl) for i in xrange(self.N): prev_kl = trunc_kl_func(H, Mu1) H[:, i] = h_i_func(H, Mu1, i) #we don't update mu, the whole point of the split e step is we don't have to new_kl = trunc_kl_func(H, Mu1) increase = new_kl - prev_kl print 'failures after iteration ', i, ': ', (increase > self.tol).sum() mx = increase.max() if mx > 1e-4: print 'increase amounts of failing examples:' print increase[increase > self.tol] print 'failing H:' print H[increase > self.tol, :] print 'failing Mu1:' print Mu1[increase > self.tol, :] print 'failing V:' print X[increase > self.tol, :] raise Exception( 'after mean field step in h, kl divergence should decrease, but some elements increased by as much as ' + str(mx) + ' after updating h_' + str(i))
def build_finetune_functions(self, datasets, batch_size, learning_rate): '''Generates a function `train` that implements one step of finetuning, a function `validate` that computes the error on a batch from the validation set, and a function `test` that computes the error on a batch from the testing set :type datasets: list of pairs of theano.tensor.TensorType :param datasets: It is a list that contain all the datasets; the has to contain three pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano variables, one for the datapoints, the other for the labels :type batch_size: int :param batch_size: size of a minibatch :type learning_rate: float :param learning_rate: learning rate used during finetune stage ''' (train_set_x, train_set_y) = datasets[0] (valid_set_x, valid_set_y) = datasets[1] (test_set_x, test_set_y) = datasets[2] # compute number of minibatches for training, validation and testing # n_train_size = train_set_y.get_value(borrow=True).shape[0] n_valid_batches = valid_set_y.get_value(borrow=True).shape[0] n_valid_batches /= batch_size n_test_batches = test_set_y.get_value(borrow=True).shape[0] n_test_batches /= batch_size index = T.iscalar('index') # index to a [mini]batch # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) # compute list of fine-tuning updates updates = [] for param, gparam in zip(self.params, gparams): updates.append((param, param - gparam * learning_rate)) #shift the indeces in Y for each mini batch offset = theano.shared(value=numpy.asarray( [[1, 1, 0] for i in range(batch_size)], dtype='int32'), name='offset') train_fn = theano.function( inputs=[index], on_unused_input='ignore', outputs=self.finetune_cost, updates=updates, givens={ self.x: train_set_x[train_set_y[index * batch_size][0]:train_set_y[ (index + 1) * batch_size][0]], #index * batch_size: #(index + 1) * batch_size], self.y: train_set_y[index * batch_size:(index + 1) * batch_size] - offset * train_set_y[index * batch_size][0] }) test_score_i = theano.function( [index], self.errors, on_unused_input='ignore', givens={ self.x: test_set_x[test_set_y[index * batch_size][0]:test_set_y[ (index + 1) * batch_size][0]], #index * batch_size: #(index + 1) * batch_size], self.y: test_set_y[index * batch_size:(index + 1) * batch_size] - offset * test_set_y[index * batch_size][0] }) valid_score_i = theano.function( [index], self.errors, on_unused_input='ignore', givens={ self.x: valid_set_x[valid_set_y[index * batch_size][0]:valid_set_y[ (index + 1) * batch_size][0]], #index * batch_size: #(index + 1) * batch_size], self.y: valid_set_y[index * batch_size:(index + 1) * batch_size] - offset * valid_set_y[index * batch_size][0] }) # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches - 1)] # Create a function that scans the entire test set def test_score(): return [test_score_i(i) for i in xrange(n_test_batches - 1)] return train_fn, valid_score, test_score
                             initialization='he',
                             weightnorm=WEIGHT_NORM)
    out = T.nnet.relu(out)

    # Output
    # We apply the softmax later
    out = lib.ops.Linear('SampleLevel.Output',
                         DIM,
                         Q_LEVELS,
                         out,
                         weightnorm=WEIGHT_NORM)
    return out

sequences = T.imatrix('sequences')
h0 = T.tensor3('h0')
reset = T.iscalar('reset')
mask = T.matrix('mask')

if args.debug:
    # Solely for debugging purposes.
    # Maybe I should set the compute_test_value=warn from here.
    sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='int32')
    h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32')
    reset.tag.test_value = numpy.array(1, dtype='int32')
    mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='float32')

input_sequences = sequences[:, :-FRAME_SIZE]
target_sequences = sequences[:, FRAME_SIZE:]
target_mask = mask[:, FRAME_SIZE:]
def __init__(self, Nlayers = 1, # number of layers Ndirs = 1, # unidirectional or bidirectional Nx = 100, # input size Nh = 100, # hidden layer size Ny = 100, # output size Ah = "relu", # hidden unit activation (e.g. relu, tanh, lstm) Ay = "linear", # output unit activation (e.g. linear, sigmoid, softmax) predictPer = "frame", # frame or sequence loss = None, # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge) L1reg = 0.0, # L1 regularization L2reg = 0.0, # L2 regularization multiReg = 0.0, # regularization of agreement of predictions on data of different conditions momentum = 0.0, # SGD momentum seed = 15213, # random seed for initializing the weights frontEnd = None, # a lambda function for transforming the input filename = None, # initialize from file initParams = None, # initialize from given dict ): if filename is not None: # load parameters from file with smart_open(filename, "rb") as f: initParams = dill.load(f) if initParams is not None: # load parameters from given dict self.paramNames = [] self.params = [] for k, v in initParams.iteritems(): if type(v) is numpy.ndarray: self.addParam(k, v) else: setattr(self, k, v) self.paramNames.append(k) # F*ck, locals()[k] = v doesn't work; I have to do this statically Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, momentum, frontEnd \ = self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.momentum, self.frontEnd else: # Initialize parameters randomly # Names of parameters to save to file self.paramNames = ["Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay", "predictPer", "loss", "L1reg", "L2reg", "momentum", "frontEnd"] for name in self.paramNames: value = locals()[name] setattr(self, name, value) # Values of parameters for building the computational graph self.params = [] # Initialize random number generators global rng rng = numpy.random.RandomState(seed) # Construct parameter matrices Nlstm = 4 if Ah == 'lstm' else 1 self.addParam("Win", rand_init((Nx, Nh * Ndirs * Nlstm), Ah)) self.addParam("Wrec", rand_init((Nlayers, Ndirs, Nh, Nh * Nlstm), Ah)) self.addParam("Wup", rand_init((Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah)) self.addParam("Wout", rand_init((Nh * Ndirs, Ny), Ay)) if Ah != "lstm": self.addParam("Bhid", zeros((Nlayers, Nh * Ndirs))) else: self.addParam("Bhid", numpy.tile(numpy.hstack([full((Nlayers, Nh), 1.0), zeros((Nlayers, Nh * 3))]), (1, Ndirs))) self.addParam("Bout", zeros(Ny)) self.addParam("h0", zeros((Nlayers, Ndirs, Nh))) if Ah == "lstm": self.addParam("c0", zeros((Nlayers, Ndirs, Nh))) # Compute total number of parameters self.nParams = sum(x.get_value().size for x in self.params) # Initialize gradient tensors when using momentum if momentum > 0: self.dparams = [theano.shared(zeros(x.get_value().shape)) for x in self.params] # Build computation graph input = T.ftensor4() # stream * time * feature mask = T.imatrix() mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()] mask_float = [T.cast((mask % 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX), T.cast((mask >= 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)] # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()] # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX), # T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)] def step_rnn(x_t, mask, h_tm1, W, 
h0): h_tm1 = T.switch(mask, h0, h_tm1) return [ACTIVATION[Ah](x_t + h_tm1.dot(W))] def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0): c_tm1 = T.switch(mask, c0, c_tm1) h_tm1 = T.switch(mask, h0, h_tm1) a = x_t + h_tm1.dot(W) f_t = T.nnet.sigmoid(a[:, :, :Nh]) i_t = T.nnet.sigmoid(a[:, :, Nh : Nh * 2]) o_t = T.nnet.sigmoid(a[:, :, Nh * 2 : Nh * 3]) c_t = T.tanh(a[:, :, Nh * 3:]) * i_t + c_tm1 * f_t h_t = T.tanh(c_t) * o_t return [c_t, h_t] x = input if frontEnd is None else frontEnd(input) for i in range(Nlayers): h = (x.dimshuffle((2, 1, 0, 3)).dot(self.Win) if i == 0 else h.dot(self.Wup[i-1])) + self.Bhid[i] # (2, 1, 0, 3): condition * stream * time * feature => time * stream * condition * feature rep = lambda x: T.extra_ops.repeat(T.extra_ops.repeat(x.reshape((1, 1, -1)), h.shape[2], axis = 1), h.shape[1], axis = 0) if Ah != "lstm": h = T.concatenate([theano.scan( fn = step_rnn, sequences = [h[:, :, :, Nh * d : Nh * (d+1)], mask_float[d]], outputs_info = [rep(self.h0[i, d])], non_sequences = [self.Wrec[i, d], rep(self.h0[i, d])], go_backwards = (d == 1), )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 3) else: h = T.concatenate([theano.scan( fn = step_lstm, sequences = [h[:, :, :, Nh * 4 * d : Nh * 4 * (d+1)], mask_float[d]], outputs_info = [rep(self.c0[i, d]), rep(self.h0[i, d])], non_sequences = [self.Wrec[i, d], rep(self.c0[i, d]), rep(self.h0[i, d])], go_backwards = (d == 1), )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 3) if predictPer == "sequence": h = h.dimshuffle((1, 0, 2, 3)) # time * stream * condition * feature => stream * time * condition * feature h = T.concatenate([h[mask_int[1 - d]][:, :, Nh * d : Nh * (d+1)] for d in range(Ndirs)], axis = 2) # sequence * condition * feature h = h.dimshuffle((1, 0, 2)) # sequence * condition * feature => condition * sequence * feature else: h = h.dimshuffle((2, 1, 0, 3)) # time * stream * condition * feature => condition * stream * time * feature output = ACTIVATION[Ay](h.dot(self.Wout) + self.Bout) output_mean = output.mean(axis = 0) output_var = output.var(axis = 0) # Compute loss function if loss is None: loss = {"linear": "mse", "sigmoid": "ce", "softmax": "ce_group"}[self.Ay] if loss == "ctc": label = T.imatrix() label_time = T.imatrix() tol = T.iscalar() cost = theano.scan(fn = lambda prob: ctc_cost(prob, mask, label, label_time, tol), \ sequences = [output])[0].mean() else: if predictPer == "sequence": label = T.fmatrix() y = output_mean t = label elif predictPer == "frame": label = T.ftensor3() indices = (mask >= 0).nonzero() y = output_mean[indices] t = label[indices] cost = T.mean({ "ce": -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis = 1), "ce_group": -T.log((y * t).sum(axis = 1)), "mse": T.mean((y - t) ** 2, axis = 1), "hinge": T.mean(relu(1 - y * (t * 2 - 1)), axis = 1), "squared_hinge": T.mean(relu(1 - y * (t * 2 - 1)) ** 2, axis = 1), }[loss]) # Add regularization cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg if predictPer == "sequence": cost += output_var.mean() * multiReg else: indices = (mask >= 0).nonzero() cost += output_var[indices].mean() * multiReg # Compute updates for network parameters updates = [] lrate = T.fscalar() clip = T.fscalar() grad = T.grad(cost, self.params) grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad] if momentum > 0: for w, d, g in zip(self.params, self.dparams, grad_clipped): updates.append((w, w + momentum * momentum * d - (1 + momentum) * lrate 
* g)) updates.append((d, momentum * d - lrate * g)) else: for w, g in zip(self.params, grad_clipped): updates.append((w, w - lrate * g)) # Create functions to be called from outside if loss == "ctc": inputs = [input, mask, label, label_time, tol, lrate, clip] else: inputs = [input, mask, label, lrate, clip] self.train = theano.function( inputs = inputs, outputs = cost, updates = updates, ) self.predict = theano.function(inputs = [input, mask], outputs = output)
train_freq_print = 20
valid_freq_print = 500

sample_strings = ['i am alien lamp and i love the neural nets'] * 50   # ['Sous le pont Mirabeau coule la Seine.']*50

algo = 'adam'   # adam, sgd

#model_file_load = "/u/lambalex/models/handwriting/handwriting/71535347/saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting/handwriting/81356894/saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting/handwriting/10406114/saved_model.pkl"
#model_file_load = "saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting/handwriting/33757048/saved_model.pkl"
model_file_load = None
#model_file_load = "/u/lambalex/models/handwriting/handwriting/90207341/saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting/handwriting/11151138/saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting_pf/handwriting/52780486/saved_model.pkl"

num_steps_sample = T.iscalar('num_steps_sample')

exp_id = np.random.randint(0, 100000000, 1)[0]

dump_path = os.path.join(os.environ.get('TMP_PATH'), 'handwriting', str(exp_id))

os.umask(055)
os.makedirs(dump_path, 0777)
os.makedirs(dump_path + "/src", 0777)
os.chmod(dump_path, 0o777)

fh = open(dump_path + "/derpy_file.txt", "w")
fh.write("DERP DERP DERP DERP")
fh.close()
def __init__(self, We_initial, params): #self.textfile = open(params.outfile, 'w') We = theano.shared(We_initial) embsize = We_initial.shape[1] hidden = params.hidden input_init = np.random.uniform(-0.1, 0.1, (10, MAX_lENGTH, params.num_labels)).astype('float32') self.input_init = theano.shared(input_init) input_var = T.imatrix(name='inputs') target_var = T.imatrix(name='targets') mask_var = T.fmatrix(name='masks') mask_var1 = T.fmatrix(name='masks1') length = T.iscalar() Wyy0 = np.random.uniform(-0.02, 0.02, (params.num_labels + 1, params.num_labels)).astype('float32') Wyy = theano.shared(Wyy0) l_in_word = lasagne.layers.InputLayer((None, None)) l_mask_word = lasagne.layers.InputLayer(shape=(None, None)) if params.emb ==1: l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size= We_initial.shape[0] , output_size = embsize, W =We) else: l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We) l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word) l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards = True) concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2) l_reshape_concat = lasagne.layers.ReshapeLayer(concat,(-1,2*hidden)) l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units= params.num_labels, nonlinearity=lasagne.nonlinearities.linear) network_params = lasagne.layers.get_all_params(l_local, trainable=True) network_params.append(Wyy) #print len(network_params) f = open('ccctag_CRF_Bilstm_Viterbi_.Batchsize_10_dropout_0_LearningRate_0.01_0.0512_tagversoin_2.pickle','r') data = pickle.load(f) f.close() for idx, p in enumerate(network_params): p.set_value(data[idx]) def inner_function( targets_one_step, mask_one_step, prev_label, tg_energy): """ :param targets_one_step: [batch_size, t] :param prev_label: [batch_size, t] :param tg_energy: [batch_size] :return: """ new_ta_energy = T.dot(prev_label, Wyy[:-1,:-1]) new_ta_energy_t = tg_energy + T.sum(new_ta_energy*targets_one_step, axis =1) tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy) return [targets_one_step, tg_energy_t] local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var}) local_energy = local_energy.reshape((-1, length, params.num_labels)) local_energy = local_energy*mask_var[:,:,None] ##################### # for the end symbole of a sequence #################### end_term = Wyy[:-1,-1] local_energy = local_energy + end_term.dimshuffle('x', 'x', 0)*mask_var1[:,:, None] predy_init = self.input_init[:,:length,:] a_params = [self.input_init] predy = T.nnet.softmax(predy_init.reshape((-1, params.num_labels))) predy = predy.reshape((-1, length, params.num_labels)) prediction = T.argmax(predy_init, axis=2) predy = predy*mask_var[:,:,None] targets_shuffled = predy.dimshuffle(1, 0, 2) target_time0 = targets_shuffled[0] masks_shuffled = mask_var.dimshuffle(1, 0) initial_energy0 = T.dot(target_time0, Wyy[-1,:-1]) initials = [target_time0, initial_energy0] [ _, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]]) cost11 = target_energies[-1] + T.sum(T.sum(local_energy*predy, axis=2)*mask_var, axis=1) predy_f = predy.reshape((-1, params.num_labels)) y_f = target_var.flatten() if (params.annealing ==0): lamb = params.L3 elif (params.annealing ==1): lamb = params.L3* (1 - 0.01*t_t) cost = T.mean(-cost11) #from adam import adam #updates_a = adam(cost, a_params, params.eta) updates_a = 
lasagne.updates.sgd(cost, a_params, params.eta) updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9) self.inf_fn = theano.function([input_var, mask_var, mask_var1, length], cost, updates = updates_a) self.eval_fn = theano.function([input_var, mask_var, mask_var1, length], [prediction, -cost11], on_unused_input='ignore')
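The class above accumulates the chain-CRF sequence energy with theano.scan via inner_function. Below is a minimal, self-contained sketch of that pattern with hypothetical shapes and a hypothetical label count; it is not the author's exact graph, which also adds the end-of-sequence term and the inference-network softmax.

import numpy as np
import theano
import theano.tensor as T

n_labels = 4  # hypothetical label count

y_soft = T.tensor3('y_soft')   # (seq_len, batch, n_labels): one-hot or soft labels, time-major
unary = T.tensor3('unary')     # (seq_len, batch, n_labels): local (unary) energies
mask = T.matrix('mask')        # (seq_len, batch)
Wyy = theano.shared(0.01 * np.random.randn(n_labels, n_labels).astype(theano.config.floatX))

def step(y_t, u_t, m_t, y_tm1, energy_tm1):
    pairwise = T.sum(T.dot(y_tm1, Wyy) * y_t, axis=1)   # transition energy per example
    unary_t = T.sum(u_t * y_t, axis=1)                  # unary energy per example
    new_energy = energy_tm1 + m_t * (pairwise + unary_t)
    return y_t, new_energy

init_energy = T.sum(unary[0] * y_soft[0], axis=1)
(_, energies), _ = theano.scan(step,
                               sequences=[y_soft[1:], unary[1:], mask[1:]],
                               outputs_info=[y_soft[0], init_energy])
sequence_energy = energies[-1]   # (batch,)

energy_fn = theano.function([y_soft, unary, mask], sequence_energy)

rng = np.random.RandomState(0)
labels = rng.randint(0, n_labels, size=(5, 2))                  # seq_len=5, batch=2
y_np = np.eye(n_labels, dtype=theano.config.floatX)[labels]
u_np = rng.randn(5, 2, n_labels).astype(theano.config.floatX)
m_np = np.ones((5, 2), dtype=theano.config.floatX)
print(energy_fn(y_np, u_np, m_np))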
config.lr_decay = 1.02
config.weight_decay = 1e-7
config.max_grad_norm = 10
config.num_steps = 35
config.max_epoch = 20     # number of epochs after which learning rate decay starts
config.drop_x = 0.25      # variational dropout rate over input word embeddings
config.drop_i = 0.75      # variational dropout rate over inputs of RHN layer(s), applied separately in each RHN layer
config.drop_s = 0.25      # variational dropout rate over recurrent state
config.drop_o = 0.75      # variational dropout rate over outputs of RHN layer(s), applied before the classification layer
config.vocab_size = 10000

print("Data loading")
train_data, valid_data, test_data, _ = ptb_raw_data(config.data_path)

print('Compiling model')
_is_training = T.iscalar('is_training')
_lr = theano.shared(cast_floatX(config.learning_rate), 'lr')
_input_data = T.imatrix('input_data')  # (batch_size, num_steps)
_noise_x = T.matrix('noise_x')         # (batch_size, num_steps)

# model
_theano_rng = RandomStreams(config.seed // 2 + 321)  # generates random numbers directly on GPU
flat_probs, params, rhn_updates, hidden_states = stacked.model(
    _input_data, _noise_x, _lr, _is_training, config, _theano_rng)

# loss
_targets = T.imatrix('targets')  # (batch_size, num_steps)
flat_targets = _targets.T.flatten()
xentropies = T.nnet.categorical_crossentropy(
    flat_probs, flat_targets)  # (batch_size * num_steps,)
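The loss above is the per-token categorical cross-entropy over flattened probabilities. A small self-contained sketch of the same computation, assuming a simplified batch-major (batch, steps, vocab) layout and hypothetical sizes (the RHN code itself transposes the targets to match the order of flat_probs):

import numpy as np
import theano
import theano.tensor as T

probs = T.tensor3('probs')        # (batch, steps, vocab), rows sum to 1
targets = T.imatrix('targets')    # (batch, steps)

flat_probs = probs.reshape((-1, probs.shape[-1]))      # (batch * steps, vocab)
flat_targets = targets.flatten()                       # (batch * steps,)
xent = T.nnet.categorical_crossentropy(flat_probs, flat_targets)
mean_nll = xent.mean()
perplexity = T.exp(mean_nll)

f = theano.function([probs, targets], [mean_nll, perplexity])

b, s, v = 2, 3, 5
p = np.random.dirichlet(np.ones(v), size=(b, s)).astype(theano.config.floatX)
t = np.random.randint(0, v, size=(b, s)).astype('int32')
print(f(p, t))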
# We apply the softmax later out = lib.ops.Linear('SampleLevel.Output', DIM, Q_LEVELS, out, weightnorm=WEIGHT_NORM) return out sequences_8k = T.imatrix('sequences_8k') #batch size*samplenum sequences_up = T.imatrix('sequences_up') condition = T.matrix('con') con_h0 = T.tensor3('con_h0') h0 = T.tensor3('h0') #(batch size, N_RNN, DIM) big_h0 = T.tensor3('big_h0') #(batch size, N_BIG_RNN, BIG_DIM) reset = T.iscalar('reset') mask = T.matrix('mask') #batch size*samplenum batch_size = T.iscalar('batch_size') lr = T.scalar('lr') con_input_sequences = condition big_input_sequences = sequences_8k #The last BIG_FRAME_SIZE frames do not need (tier3) big_input_sequences = big_input_sequences.reshape((1, batch_size, 1, -1)) big_input_sequences = T.nnet.neighbours.images2neibs(big_input_sequences, (1, 2 * OVERLAP), neib_step=(1, OVERLAP), mode='valid') big_input_sequences = big_input_sequences.reshape((batch_size, -1)) input_sequences = sequences_8k[:, 0:-(OVERLAP - FRAME_SIZE)] #(tier2)
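The tier-2/tier-3 inputs above are built by cutting the sample stream into overlapping frames with images2neibs. A standalone sketch of just that windowing step, with a hypothetical OVERLAP value:

import numpy as np
import theano
import theano.tensor as T

OVERLAP = 4   # hypothetical hop size

seqs = T.imatrix('seqs')                          # (batch, n_samples)
batch = seqs.shape[0]

# images2neibs expects a 4-D tensor; here each "image" is a 1 x n_samples row.
as_images = seqs.reshape((1, batch, 1, -1))
frames = T.nnet.neighbours.images2neibs(as_images, (1, 2 * OVERLAP),
                                        neib_step=(1, OVERLAP), mode='valid')
frames = frames.reshape((batch, -1, 2 * OVERLAP))  # (batch, n_frames, frame_len)

f = theano.function([seqs], frames)
x = np.arange(32, dtype='int32').reshape(2, 16)
print(f(x).shape)   # (2, 3, 8): frames of length 8, hopping by 4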
def create_model(s1_ae, s2_ae, s3_ae, s1_shape, s1_var, s2_shape, s2_var, s3_shape, s3_var, mask_shape, mask_var, lstm_size=250, win=T.iscalar('theta)'), output_classes=26, fusiontype='concat', w_init_fn=las.init.Orthogonal(), use_peepholes=True): s1_bn_weights, s1_bn_biases, s1_bn_shapes, s1_bn_nonlinearities = s1_ae s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae s3_weights, s3_biases, s3_shapes, s3_nonlinearities = s3_ae gate_parameters = Gate(W_in=w_init_fn, W_hid=w_init_fn, b=las.init.Constant(0.)) cell_parameters = Gate( W_in=w_init_fn, W_hid=w_init_fn, # Setting W_cell to None denotes that no cell connection will be used. W_cell=None, b=las.init.Constant(0.), # By convention, the cell nonlinearity is tanh in an LSTM. nonlinearity=tanh) l_s1 = InputLayer(s1_shape, s1_var, 's1_im') l_mask = InputLayer(mask_shape, mask_var, 'mask') l_s2 = InputLayer(s2_shape, s2_var, 's2_im') l_s3 = InputLayer(s3_shape, s3_var, 's3_im') symbolic_batchsize_s1 = l_s1.input_var.shape[0] symbolic_seqlen_s1 = l_s1.input_var.shape[1] symbolic_batchsize_s2 = l_s2.input_var.shape[0] symbolic_seqlen_s2 = l_s2.input_var.shape[1] symbolic_batchsize_s3 = l_s3.input_var.shape[0] symbolic_seqlen_s3 = l_s3.input_var.shape[1] l_reshape1_s1 = ReshapeLayer(l_s1, (-1, s1_shape[-1]), name='reshape1_s1') l_encoder_s1 = create_pretrained_encoder( l_reshape1_s1, s1_bn_weights, s1_bn_biases, s1_bn_shapes, s1_bn_nonlinearities, ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1']) s1_len = las.layers.get_output_shape(l_encoder_s1)[-1] l_reshape2_s1 = ReshapeLayer( l_encoder_s1, (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len), name='reshape2_s1') l_delta_s1 = DeltaLayer(l_reshape2_s1, win, name='delta_s1') l_delta_s1_dropout = DropoutLayer(l_delta_s1, name='dropout_s1') # s2 images l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2') l_encoder_s2 = create_pretrained_encoder( l_reshape1_s2, s2_weights, s2_biases, s2_shapes, s2_nonlinearities, ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2']) s2_len = las.layers.get_output_shape(l_encoder_s2)[-1] l_reshape2_s2 = ReshapeLayer( l_encoder_s2, (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len), name='reshape2_s2') l_delta_s2 = DeltaLayer(l_reshape2_s2, win, name='delta_s2') l_delta_s2_dropout = DropoutLayer(l_delta_s2, name='dropout_s2') # s3 images l_reshape1_s3 = ReshapeLayer(l_s3, (-1, s3_shape[-1]), name='reshape1_s3') l_encoder_s3 = create_pretrained_encoder( l_reshape1_s3, s3_weights, s3_biases, s3_shapes, s3_nonlinearities, ['fc1_s3', 'fc2_s3', 'fc3_s3', 'bottleneck_s3']) s3_len = las.layers.get_output_shape(l_encoder_s3)[-1] l_reshape2_s3 = ReshapeLayer( l_encoder_s3, (symbolic_batchsize_s3, symbolic_seqlen_s3, s3_len), name='reshape2_s3') l_delta_s3 = DeltaLayer(l_reshape2_s3, win, name='delta_s3') l_delta_s3_dropout = DropoutLayer(l_delta_s3, name='dropout_s3') l_lstm_s1 = LSTMLayer( l_delta_s1_dropout, lstm_size * 2, peepholes=use_peepholes, # We need to specify a separate input for masks mask_input=l_mask, # Here, we supply the gate parameters for each gate ingate=gate_parameters, forgetgate=gate_parameters, cell=cell_parameters, outgate=gate_parameters, # We'll learn the initialization and use gradient clipping learn_init=True, grad_clipping=5., name='lstm_s1') l_lstm_s2 = LSTMLayer( l_delta_s2_dropout, lstm_size * 2, peepholes=use_peepholes, # We need to specify a separate input for masks mask_input=l_mask, # Here, we supply the gate parameters for each gate ingate=gate_parameters, forgetgate=gate_parameters, cell=cell_parameters, 
outgate=gate_parameters, # We'll learn the initialization and use gradient clipping learn_init=True, grad_clipping=5., name='lstm_s2') l_lstm_s3 = LSTMLayer( l_delta_s3_dropout, lstm_size * 2, peepholes=use_peepholes, # We need to specify a separate input for masks mask_input=l_mask, # Here, we supply the gate parameters for each gate ingate=gate_parameters, forgetgate=gate_parameters, cell=cell_parameters, outgate=gate_parameters, # We'll learn the initialization and use gradient clipping learn_init=True, grad_clipping=5., name='lstm_s3') # We'll combine the forward and backward layer output by summing. # Merge layers take in lists of layers to merge as input. if fusiontype == 'adasum': l_fuse = AdaptiveElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3], name='adasum1') elif fusiontype == 'sum': l_fuse = ElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3], name='sum1') elif fusiontype == 'concat': l_fuse = ConcatLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3], axis=-1, name='concat') l_fuse_dropout = DropoutLayer(l_fuse, name='concat_dropout') f_lstm_agg, b_lstm_agg = create_blstm(l_fuse_dropout, l_mask, lstm_size * 2, cell_parameters, gate_parameters, 'lstm_agg') l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') # reshape to (num_examples * seq_len, lstm_size) l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size * 2), name='reshape3') # Now, we can apply feed-forward layers as usual. # We want the network to predict a classification for the sequence, # so we'll use a the number of classes. l_softmax = DenseLayer(l_reshape3, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax') l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes), name='output') return l_out, l_fuse
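The fusiontype switch above combines the three per-stream LSTM outputs either by concatenation, elementwise sum, or an adaptive sum. Since the full model depends on pretrained encoders, here is a sketch of the fusion step alone with a hypothetical per-stream feature size (AdaptiveElemwiseSumLayer is project-specific and omitted):

import numpy as np
import theano
import theano.tensor as T
from lasagne.layers import InputLayer, ConcatLayer, ElemwiseSumLayer, get_output

feat_dim = 32   # hypothetical per-stream feature size

s1 = InputLayer((None, None, feat_dim), T.tensor3('s1'))
s2 = InputLayer((None, None, feat_dim), T.tensor3('s2'))
s3 = InputLayer((None, None, feat_dim), T.tensor3('s3'))

fused_sum = ElemwiseSumLayer([s1, s2, s3], name='sum_fusion')         # (batch, steps, feat_dim)
fused_cat = ConcatLayer([s1, s2, s3], axis=-1, name='concat_fusion')  # (batch, steps, 3 * feat_dim)

out_sum, out_cat = get_output([fused_sum, fused_cat])
f = theano.function([s1.input_var, s2.input_var, s3.input_var], [out_sum, out_cat])

x = np.random.randn(2, 5, feat_dim).astype(theano.config.floatX)
print([o.shape for o in f(x, x, x)])   # [(2, 5, 32), (2, 5, 96)]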
def test_grad_s(self): "tests that the gradients with respect to s_i are 0 after doing a mean field update of s_i " model = self.model e_step = self.e_step X = self.X assert X.shape[0] == self.m model.test_batch_size = X.shape[0] init_H = e_step.init_H_hat(V=X) init_Mu1 = e_step.init_S_hat(V=X) prev_setting = config.compute_test_value config.compute_test_value = 'off' H, Mu1 = function([], outputs=[init_H, init_Mu1])() config.compute_test_value = prev_setting H = broadcast(H, self.m) Mu1 = broadcast(Mu1, self.m) H = np.cast[config.floatX](self.model.rng.uniform(0., 1., H.shape)) Mu1 = np.cast[config.floatX](self.model.rng.uniform( -5., 5., Mu1.shape)) H_var = T.matrix(name='H_var') H_var.tag.test_value = H Mu1_var = T.matrix(name='Mu1_var') Mu1_var.tag.test_value = Mu1 idx = T.iscalar() idx.tag.test_value = 0 S = e_step.infer_S_hat(V=X, H_hat=H_var, S_hat=Mu1_var) s_idx = S[:, idx] s_i_func = function([H_var, Mu1_var, idx], s_idx) sigma0 = 1. / model.alpha Sigma1 = e_step.infer_var_s1_hat() mu0 = T.zeros_like(model.mu) #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1 # (they don't affect the outcome of this test and some of them are intractable ) trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \ model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) grad_Mu1 = T.grad(trunc_kl.sum(), Mu1_var) grad_Mu1_idx = grad_Mu1[:, idx] grad_func = function([H_var, Mu1_var, idx], grad_Mu1_idx) for i in xrange(self.N): Mu1[:, i] = s_i_func(H, Mu1, i) g = grad_func(H, Mu1, i) assert not contains_nan(g) g_abs_max = np.abs(g).max() if g_abs_max > self.tol: raise Exception( 'after mean field step, gradient of kl divergence wrt mean field parameter should be 0, but here the max magnitude of a gradient element is ' + str(g_abs_max) + ' after updating s_' + str(i))
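The test above compiles a per-coordinate update and the gradient of the truncated KL, then asserts the gradient vanishes at the mean-field fixed point. The same pattern on a toy quadratic, with nothing taken from the pylearn2 model and all values hypothetical:

import numpy as np
import theano
import theano.tensor as T

# f(s) = 0.5 * sum(a * (s - mu)^2); the coordinate-wise minimiser is s_i = mu_i,
# so the gradient w.r.t. s_i must be zero after the update.
a = theano.shared(np.array([1.0, 2.0, 3.0], dtype=theano.config.floatX))
mu = theano.shared(np.array([0.5, -1.0, 2.0], dtype=theano.config.floatX))

s = T.vector('s')
idx = T.iscalar('idx')

objective = 0.5 * T.sum(a * (s - mu) ** 2)
grad_i = T.grad(objective, s)[idx]
update_i = mu[idx]                     # closed-form coordinate update

f_update = theano.function([idx], update_i)
f_grad = theano.function([s, idx], grad_i)

s_val = np.zeros(3, dtype=theano.config.floatX)
for i in range(3):
    s_val[i] = f_update(i)
    g = f_grad(s_val, i)
    assert abs(g) < 1e-6, 'gradient not zero after coordinate update'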
def jobman(state, channel): # load dataset state['null_sym_source'] = 15000 state['null_sym_target'] = 15000 state['n_sym_source'] = state['null_sym_source'] + 1 state['n_sym_target'] = state['null_sym_target'] + 1 state['nouts'] = state['n_sym_target'] state['nins'] = state['n_sym_source'] rng = numpy.random.RandomState(state['seed']) if state['loopIters'] > 0: train_data, valid_data, test_data = get_data(state) else: train_data = None valid_data = None test_data = None ########### Training graph ##################### ## 1. Inputs if state['bs'] == 1: x = TT.lvector('x') x_mask = TT.vector('x_mask') y = TT.lvector('y') y0 = y y_mask = TT.vector('y_mask') else: x = TT.lmatrix('x') x_mask = TT.matrix('x_mask') y = TT.lmatrix('y') y0 = y y_mask = TT.matrix('y_mask') # 2. Layers and Operators bs = state['bs'] embdim = state['dim_mlp'] # Source Sentence emb = MultiLayer(rng, n_in=state['nins'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb') emb_words = [] if state['rec_gating']: gater_words = [] if state['rec_reseting']: reseter_words = [] for si in xrange(state['encoder_stack']): emb_words.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_%d' % si)) if state['rec_gating']: gater_words.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='gater_words_%d' % si)) if state['rec_reseting']: reseter_words.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='reseter_words_%d' % si)) add_rec_step = [] rec_proj = [] if state['rec_gating']: rec_proj_gater = [] if state['rec_reseting']: rec_proj_reseter = [] for si in xrange(state['encoder_stack']): if si > 0: rec_proj.append( MultiLayer(rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d' % si)) if state['rec_gating']: rec_proj_gater.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_gater_%d' % si)) if state['rec_reseting']: rec_proj_reseter.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_reseter_%d' % si)) add_rec_step.append( eval(state['rec_layer'])(rng, n_hids=state['dim'], activation=state['activ'], bias_scale=state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_%d' % si)) def _add_op(words_embeddings, words_mask=None, prev_val=None, si=0, state_below=None, gater_below=None, reseter_below=None, 
one_step=False, bs=1, init_state=None, use_noise=True): seqlen = words_embeddings.out.shape[0] // bs rval = words_embeddings gater = None reseter = None if state['rec_gating']: gater = gater_below if state['rec_reseting']: reseter = reseter_below if si > 0: rval += rec_proj[si - 1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_gater[si - 1](state_below, one_step=one_step, use_noise=use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_reseter[si - 1](state_below, one_step=one_step, use_noise=use_noise) if reseter: reseter += projg else: reseter = projg if not one_step: rval = add_rec_step[si](rval, nsteps=seqlen, batch_size=bs, mask=words_mask, gater_below=gater, reseter_below=reseter, one_step=one_step, init_state=init_state, use_noise=use_noise) else: rval = add_rec_step[si](rval, mask=words_mask, state_before=prev_val, gater_below=gater, reseter_below=reseter, one_step=one_step, init_state=init_state, use_noise=use_noise) return rval add_op = Operator(_add_op) # Target Sentence emb_t = MultiLayer(rng, n_in=state['nouts'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_t') emb_words_t = [] if state['rec_gating']: gater_words_t = [] if state['rec_reseting']: reseter_words_t = [] for si in xrange(state['decoder_stack']): emb_words_t.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_t_%d' % si)) if state['rec_gating']: gater_words_t.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='gater_words_t_%d' % si)) if state['rec_reseting']: reseter_words_t.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='reseter_words_t_%d' % si)) proj_everything_t = [] if state['rec_gating']: gater_everything_t = [] if state['rec_reseting']: reseter_everything_t = [] for si in xrange(state['decoder_stack']): proj_everything_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='proj_everything_t_%d' % si, learn_bias=False)) if state['rec_gating']: gater_everything_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='gater_everything_t_%d' % si, learn_bias=False)) if state['rec_reseting']: reseter_everything_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='reseter_everything_t_%d' % si, learn_bias=False)) add_rec_step_t = [] rec_proj_t = [] if state['rec_gating']: rec_proj_t_gater = [] if state['rec_reseting']: rec_proj_t_reseter = [] for si in xrange(state['decoder_stack']): if si > 0: rec_proj_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[embdim], 
activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d' % si)) if state['rec_gating']: rec_proj_t_gater.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_gater_%d' % si)) if state['rec_reseting']: rec_proj_t_reseter.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_reseter_%d' % si)) add_rec_step_t.append( eval(state['rec_layer'])(rng, n_hids=state['dim'], activation=state['activ'], bias_scale=state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_t_%d' % si)) if state['encoder_stack'] > 1: encoder_proj = [] for si in xrange(state['encoder_stack']): encoder_proj.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim'] * state['maxout_part']], activation=['lambda x: x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='encoder_proj_%d' % si, learn_bias=(si == 0))) encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']), indim=indim, pieces=pieces, rng=rng) def _add_t_op(words_embeddings, everything=None, words_mask=None, prev_val=None, one_step=False, bs=1, init_state=None, use_noise=True, gater_below=None, reseter_below=None, si=0, state_below=None): seqlen = words_embeddings.out.shape[0] // bs rval = words_embeddings gater = None if state['rec_gating']: gater = gater_below reseter = None if state['rec_reseting']: reseter = reseter_below if si > 0: if isinstance(state_below, list): state_below = state_below[-1] rval += rec_proj_t[si - 1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_t_gater[si - 1](state_below, one_step=one_step, use_noise=use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_t_reseter[si - 1](state_below, one_step=one_step, use_noise=use_noise) if reseter: reseter += projg else: reseter = projg if everything: rval = rval + proj_everything_t[si](everything) if state['rec_gating']: everyg = gater_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if gater: gater += everyg else: gater = everyg if state['rec_reseting']: everyg = reseter_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if reseter: reseter += everyg else: reseter = everyg if not one_step: rval = add_rec_step_t[si](rval, nsteps=seqlen, batch_size=bs, mask=words_mask, one_step=one_step, init_state=init_state, gater_below=gater, reseter_below=reseter, use_noise=use_noise) else: rval = add_rec_step_t[si](rval, mask=words_mask, state_before=prev_val, one_step=one_step, gater_below=gater, reseter_below=reseter, use_noise=use_noise) return rval add_t_op = Operator(_add_t_op) outdim = state['dim_mlp'] if not state['deep_out']: outdim = state['rank_n_approx'] if state['bias_code']: bias_code = [] for si in xrange(state['decoder_stack']): bias_code.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], 
activation=[state['activ']], bias_scale=[state['bias']], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='bias_code_%d' % si)) if state['avg_word']: word_code_nin = state['rank_n_approx'] word_code = MultiLayer(rng, n_in=word_code_nin, n_hids=[outdim], activation='lambda x:x', bias_scale=[state['bias_mlp'] / 3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias=False, name='word_code') proj_code = MultiLayer(rng, n_in=state['dim'], n_hids=[outdim], activation='lambda x: x', bias_scale=[state['bias_mlp'] / 3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias=False, name='proj_code') proj_h = [] for si in xrange(state['decoder_stack']): proj_h.append( MultiLayer(rng, n_in=state['dim'], n_hids=[outdim], activation='lambda x: x', bias_scale=[state['bias_mlp'] / 3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='proj_h_%d' % si)) if state['bigram']: proj_word = MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[outdim], activation=['lambda x:x'], bias_scale=[state['bias_mlp'] / 3], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='emb_words_lm') if state['deep_out']: indim = 0 pieces = 0 act_layer = UnaryOp(activation=eval(state['unary_activ'])) drop_layer = DropOp(rng=rng, dropout=state['dropout']) if state['deep_out']: indim = state['dim_mlp'] / state['maxout_part'] rank_n_approx = state['rank_n_approx'] rank_n_activ = state['rank_n_activ'] else: indim = state['rank_n_approx'] rank_n_approx = 0 rank_n_activ = None output_layer = SoftmaxLayer(rng, indim, state['nouts'], state['weight_scale'], -1, rank_n_approx=rank_n_approx, rank_n_activ=rank_n_activ, weight_noise=state['weight_noise'], init_fn=state['weight_init_fn'], name='out') def _pop_op(everything, accum, everything_max=None, everything_min=None, word=None, aword=None, one_step=False, use_noise=True): rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise) for si in xrange(1, state['decoder_stack']): rval += proj_h[si](accum[si], one_step=one_step, use_noise=use_noise) if state['mult_out']: rval = rval * everything else: rval = rval + everything if aword and state['avg_word']: wcode = aword if one_step: if state['mult_out']: rval = rval * wcode else: rval = rval + wcode else: if not isinstance(wcode, TT.TensorVariable): wcode = wcode.out shape = wcode.shape rshape = rval.shape rval = rval.reshape( [rshape[0] / shape[0], shape[0], rshape[1]]) wcode = wcode.dimshuffle('x', 0, 1) if state['mult_out']: rval = rval * wcode else: rval = rval + wcode rval = rval.reshape(rshape) if word and state['bigram']: if one_step: if state['mult_out']: rval *= proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: rval += proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: if isinstance(word, TT.TensorVariable): shape = word.shape ndim = word.ndim else: shape = word.shape ndim = word.out.ndim pword = proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) shape_pword = pword.shape if ndim == 1: pword = Shift()(pword.reshape([shape[0], 1, outdim])) else: pword = Shift()(pword.reshape([shape[0], shape[1], outdim])) if state['mult_out']: rval *= pword.reshape(shape_pword) else: rval += pword.reshape(shape_pword) if state['deep_out']: rval = 
drop_layer(act_layer(rval), use_noise=use_noise) return rval pop_op = Operator(_pop_op) # 3. Constructing the model gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[0](emb(x)) encoder_acts = [ add_op(emb_words[0](emb(x)), x_mask, bs=x_mask.shape[1], si=0, gater_below=gater_below, reseter_below=reseter_below) ] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1])) for si in xrange(1, state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append( add_op(emb_words[si](emb(x)), x_mask, bs=x_mask.shape[1], si=si, state_below=encoder_acts[-1], gater_below=gater_below, reseter_below=reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1])) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = LastState(ntimes=True, n=y.shape[0])(encoder) else: everything = encoder_act_layer(everything) everything = everything.reshape( [1, everything.shape[0], everything.shape[1]]) everything = LastState(ntimes=True, n=y.shape[0])(everything) if state['bias_code']: init_state = [bc(everything[-1]) for bc in bias_code] else: init_state = [None for bc in bias_code] if state['avg_word']: shape = x.shape pword = emb(x).out.reshape( [shape[0], shape[1], state['rank_n_approx']]) pword = pword * x_mask.dimshuffle(0, 1, 'x') aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x')) aword = word_code(aword, use_noise=False) else: aword = None gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(y0)) has_said = [ add_t_op(emb_words_t[0](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], gater_below=gater_below, reseter_below=reseter_below, init_state=init_state[0], si=0) ] for si in xrange(1, state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(y0)) has_said.append( add_t_op(emb_words_t[si](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], state_below=has_said[-1], gater_below=gater_below, reseter_below=reseter_below, init_state=init_state[si], si=si)) if has_said[0].out.ndim < 3: for si in xrange(state['decoder_stack']): shape_hs = has_said[si].shape if y0.ndim == 1: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape( [shape[0], 1, state['dim_mlp']])) else: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape( [shape[0], shape[1], state['dim_mlp']])) has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si]) has_said[si] = has_said[si].reshape(shape_hs) else: for si in xrange(state['decoder_stack']): has_said[si] = Shift()(has_said[si]) has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si]) model = pop_op(proj_code(everything), has_said, word=y0, aword=aword) nll = output_layer.train( state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast( y.shape[0] * y.shape[1], 'float32') valid_fn = None noise_fn = None x = TT.lvector(name='x') n_steps = TT.iscalar('nsteps') temp = TT.scalar('temp') gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = 
reseter_words[0](emb(x)) encoder_acts = [ add_op(emb_words[0](emb(x), use_noise=False), si=0, use_noise=False, gater_below=gater_below, reseter_below=reseter_below) ] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False) for si in xrange(1, state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append( add_op(emb_words[si](emb(x), use_noise=False), si=si, state_below=encoder_acts[-1], use_noise=False, gater_below=gater_below, reseter_below=reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1]), use_noise=False) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = last(encoder) else: everything = encoder_act_layer(everything) init_state = [] for si in xrange(state['decoder_stack']): if state['bias_code']: init_state.append( TT.reshape(bias_code[si](everything, use_noise=False), [1, state['dim']])) else: init_state.append(TT.alloc(numpy.float32(0), 1, state['dim'])) if state['avg_word']: aword = emb(x, use_noise=False).out.mean(0) aword = word_code(aword, use_noise=False) else: aword = None def sample_fn(*args): aidx = 0 word_tm1 = args[aidx] aidx += 1 prob_tm1 = args[aidx] has_said_tm1 = [] for si in xrange(state['decoder_stack']): aidx += 1 has_said_tm1.append(args[aidx]) aidx += 1 ctx = args[aidx] if state['avg_word']: aidx += 1 awrd = args[aidx] val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1, aword=awrd, one_step=True, use_noise=False) sample = output_layer.get_sample(state_below=val, temp=temp) logp = output_layer.get_cost(state_below=val.out.reshape( [1, TT.cast(output_layer.n_in, 'int64')]), temp=temp, target=sample.reshape([1, 1]), use_noise=False) gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(sample)) has_said_t = [ add_t_op(emb_words_t[0](emb_t(sample)), ctx, prev_val=has_said_tm1[0], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=0) ] for si in xrange(1, state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(sample)) has_said_t.append( add_t_op(emb_words_t[si](emb_t(sample)), ctx, prev_val=has_said_tm1[si], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=si, state_below=has_said_t[-1])) for si in xrange(state['decoder_stack']): if isinstance(has_said_t[si], list): has_said_t[si] = has_said_t[si][-1] rval = [sample, TT.cast(logp, 'float32')] + has_said_t return rval sampler_params = [everything] if state['avg_word']: sampler_params.append(aword) states = [TT.alloc(numpy.int64(0), n_steps)] states.append(TT.alloc(numpy.float32(0), n_steps)) states += init_state outputs, updates = scan(sample_fn, states=states, params=sampler_params, n_steps=n_steps, name='sampler_scan') samples = outputs[0] probs = outputs[1] sample_fn = theano.function([n_steps, temp, x], [samples, probs.sum()], updates=updates, profile=False, name='sample_fn') model = LM_Model(cost_layer=nll, weight_noise_amount=state['weight_noise_amount'], valid_fn=valid_fn, sample_fn=sample_fn, clean_before_noise_fn=False, noise_fn=noise_fn, indx_word=state['indx_word_target'], 
indx_word_src=state['indx_word'], character_level=False, rng=rng) if state['loopIters'] > 0: algo = SGD(model, state, train_data) else: algo = None def hook_fn(): if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs old_offset = train_data.offset if state['sample_reset']: train_data.reset() ns = 0 for sidx in xrange(state['sample_n']): while True: batch = train_data.next() if batch: break x = batch['x'] y = batch['y'] #xbow = batch['x_bow'] masks = batch['x_mask'] if x.ndim > 1: for idx in xrange(x.shape[1]): ns += 1 if ns > state['sample_max']: break print 'Input: ', for k in xrange(x[:, idx].shape[0]): print model.word_indxs_src[x[:, idx][k]], if model.word_indxs_src[x[:, idx][k]] == '<eol>': break print '' print 'Target: ', for k in xrange(y[:, idx].shape[0]): print model.word_indxs[y[:, idx][k]], if model.word_indxs[y[:, idx][k]] == '<eol>': break print '' senlen = len(x[:, idx]) if len(numpy.where(masks[:, idx] == 0)[0]) > 0: senlen = numpy.where(masks[:, idx] == 0)[0][0] if senlen < 1: continue xx = x[:senlen, idx] #xx = xx.reshape([xx.shape[0], 1]) model.get_samples(state['seqlen'] + 1, 1, xx) else: ns += 1 model.get_samples(state['seqlen'] + 1, 1, x) if ns > state['sample_max']: break train_data.offset = old_offset return main = MainLoop(train_data, valid_data, None, model, algo, state, channel, reset=state['reset'], hooks=hook_fn) if state['reload']: main.load() if state['loopIters'] > 0: main.main() if state['sampler_test']: # This is a test script: we only sample if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs indx_word = pkl.load(open(state['word_indx'], 'rb')) try: while True: try: seqin = raw_input('Input Sequence: ') n_samples = int(raw_input('How many samples? ')) alpha = float(raw_input('Inverse Temperature? ')) seqin = seqin.lower() seqin = seqin.split() seqlen = len(seqin) seq = numpy.zeros(seqlen + 1, dtype='int64') for idx, sx in enumerate(seqin): try: seq[idx] = indx_word[sx] except: seq[idx] = indx_word[state['oov']] seq[-1] = state['null_sym_source'] except Exception: print 'Something wrong with your input! Try again!' continue sentences = [] all_probs = [] for sidx in xrange(n_samples): #import ipdb; ipdb.set_trace() [values, probs] = model.sample_fn(seqlen * 3, alpha, seq) sen = [] for k in xrange(values.shape[0]): if model.word_indxs[values[k]] == '<eol>': break sen.append(model.word_indxs[values[k]]) sentences.append(" ".join(sen)) all_probs.append(-probs) sprobs = numpy.argsort(all_probs) for pidx in sprobs: print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx] print except KeyboardInterrupt: print 'Interrupted' pass
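The sampler built above draws one token per scan step from a temperature-scaled softmax, using GroundHog's scan/Operator wrappers. As a rough, standalone analogue of just that sampling step, here is a plain theano.scan version with a fixed, hypothetical logit vector; the real sampler also feeds the sampled word back through the decoder, which is omitted here.

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

vocab = 10   # hypothetical vocabulary size
logits = theano.shared(np.random.randn(vocab).astype(theano.config.floatX), 'logits')

n_steps = T.iscalar('n_steps')
temp = T.scalar('temp')
trng = MRG_RandomStreams(1234)

def sample_step(temp_):
    # Temperature-scaled softmax, then one multinomial draw.
    probs = T.nnet.softmax((logits / temp_).dimshuffle('x', 0))   # (1, vocab)
    onehot = trng.multinomial(pvals=probs)                        # (1, vocab) one-hot
    return T.cast(onehot.argmax(axis=1)[0], 'int64')

samples, updates = theano.scan(sample_step, non_sequences=[temp], n_steps=n_steps)
sample_fn = theano.function([n_steps, temp], samples, updates=updates)
print(sample_fn(5, 0.8))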
def __init__(self, n_in, hidden_layer_size, n_out, L1_reg, L2_reg, hidden_layer_type, output_type='LINEAR', network_type='DNN', dropout_rate=0.0): """ This function initialises a neural network :param n_in: Dimensionality of input features :type in: Integer :param hidden_layer_size: The layer size for each hidden layer :type hidden_layer_size: A list of integers :param n_out: Dimensionality of output features :type n_out: Integrer :param hidden_layer_type: the activation types of each hidden layers, e.g., TANH, LSTM, GRU, BLSTM :param L1_reg: the L1 regulasation weight :param L2_reg: the L2 regulasation weight :param output_type: the activation type of the output layer, by default is 'LINEAR', linear regression. :param dropout_rate: probability of dropout, a float number between 0 and 1. """ logger = logging.getLogger("DNN initialization") self.n_in = int(n_in) self.n_out = int(n_out) self.n_layers = len(hidden_layer_size) self.dropout_rate = dropout_rate self.is_train = T.iscalar('is_train') assert len(hidden_layer_size) == len(hidden_layer_type) self.x = T.matrix('x') self.y = T.matrix('y') if network_type == "S2S": self.d = T.ivector('d') self.f = T.matrix('f') self.L1_reg = L1_reg self.L2_reg = L2_reg self.rnn_layers = [] self.params = [] self.delta_params = [] rng = np.random.RandomState(123) BLSTM_variants = ['BLSTM', 'BSLSTM', 'BLSTME'] Encoder_variants = ['RNNE', 'LSTME', 'BLSTME', 'SLSTME'] for i in range(self.n_layers): if i == 0: input_size = n_in else: input_size = hidden_layer_size[i - 1] if hidden_layer_type[i - 1] in BLSTM_variants: input_size = hidden_layer_size[i - 1] * 2 if i == 0: layer_input = self.x else: layer_input = self.rnn_layers[i - 1].output ### sequence-to-sequence mapping ### if hidden_layer_type[i - 1] in Encoder_variants: dur_input = self.d frame_feat_input = self.f if network_type == "S2S": seq2seq_model = DistributedSequenceEncoder( rng, layer_input, dur_input) layer_input = T.concatenate( (seq2seq_model.encoded_output, frame_feat_input), axis=1) input_size = input_size + 4 else: logger.critical( "This network type: %s is not supported right now! 
\n Please use one of the following: DNN, RNN, S2S\n" % (network_type)) sys.exit(1) if hidden_layer_type[i] == 'SLSTM': hidden_layer = SimplifiedLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'SGRU': hidden_layer = SimplifiedGRU(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'GRU': hidden_layer = GatedRecurrentUnit(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'LSTM_NFG': hidden_layer = LstmNFG(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'LSTM_NOG': hidden_layer = LstmNOG(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'LSTM_NIG': hidden_layer = LstmNIG(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'LSTM_NPH': hidden_layer = LstmNoPeepholes(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'LSTM': hidden_layer = VanillaLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'LSTME': hidden_layer = VanillaLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'CLSTM': hidden_layer = ContextLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'LSTMD': hidden_layer = VanillaLstmDecoder(rng, layer_input, input_size, hidden_layer_size[i], self.n_out, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'BSLSTM': hidden_layer = BidirectionSLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'BLSTM': hidden_layer = BidirectionLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'BLSTME': hidden_layer = BidirectionLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'RNN': hidden_layer = VanillaRNN(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'RNNE': hidden_layer = VanillaRNN(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'RNND': hidden_layer = VanillaRNNDecoder(rng, layer_input, input_size, hidden_layer_size[i], self.n_out, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'TANH': hidden_layer = SigmoidLayer(rng, layer_input, input_size, hidden_layer_size[i], activation=T.tanh, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'SIGMOID': hidden_layer = SigmoidLayer(rng, layer_input, input_size, hidden_layer_size[i], activation=T.nnet.sigmoid, p=self.dropout_rate, training=self.is_train) else: logger.critical( "This hidden layer type: %s is not supported right now! 
\n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n" % (hidden_layer_type[i])) sys.exit(1) self.rnn_layers.append(hidden_layer) self.params.extend(hidden_layer.params) input_size = hidden_layer_size[-1] if hidden_layer_type[-1] == 'BSLSTM' or hidden_layer_type[ -1] == 'BLSTM': input_size = hidden_layer_size[-1] * 2 if hidden_layer_type[-1] == "RNND" or hidden_layer_type[-1] == "LSTMD": self.final_layer = self.rnn_layers[-1] else: if output_type == 'LINEAR': self.final_layer = LinearLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out) elif output_type == 'SIGMOID': self.final_layer = SigmoidLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, activation=T.nnet.sigmoid) else: logger.critical( "This output layer type: %s is not supported right now! \n Please use one of the following: LINEAR, SIGMOID\n" % (output_type)) sys.exit(1) self.params.extend(self.final_layer.params) self.updates = {} for param in self.params: self.updates[param] = theano.shared( value=np.zeros(param.get_value(borrow=True).shape, dtype=theano.config.floatX), name='updates') self.finetune_cost = T.mean( T.sum((self.final_layer.output - self.y)**2, axis=1)) self.errors = T.mean( T.sum((self.final_layer.output - self.y)**2, axis=1))
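The network above threads an is_train iscalar into every layer so dropout behaves differently at training and synthesis time. The layers' internals are not shown here; the sketch below is the common Theano idiom such a flag usually selects between, with a hypothetical dropout probability p:

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(1234)
p = 0.5                                   # hypothetical dropout probability

x = T.matrix('x')
is_train = T.iscalar('is_train')          # 1 = training, 0 = inference

mask = srng.binomial(size=x.shape, p=1.0 - p, dtype=theano.config.floatX)
dropped = x * mask                        # training path: random binary mask
scaled = x * (1.0 - p)                    # inference path: expected value
y = T.switch(T.neq(is_train, 0), dropped, scaled)

f = theano.function([x, is_train], y)
x_np = np.ones((2, 4), dtype=theano.config.floatX)
print(f(x_np, 0))   # deterministic, scaled by 1 - p
print(f(x_np, 1))   # stochastic mask applied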
import theano
import theano.tensor as T
import numpy as np

theano.config.warn.subtensor_merge_bug = False

i = T.iscalar("i")
x = T.iscalar("x")
y = T.iscalar("y")
A = T.imatrix("A")


def inner_sum(prior_x, B):
    return prior_x + B


def inner_sum2D(x_t, y_t, u):
    return x_t + y_t + u


row_count = 3
column_count = 4

# Symbolic description of the result: at each step, scan passes the sequence
# element `column_count` positions ahead plus the output taps at -1 and
# -column_count. The initial output history must hold exactly `column_count`
# values, so slice the flattened matrix accordingly.
result, updates = theano.scan(
    fn=inner_sum2D,
    sequences=dict(input=T.flatten(A), taps=[column_count]),
    outputs_info=dict(initial=T.flatten(A)[:column_count],
                      taps=[-1, -column_count]),
    n_steps=x * y)
# `result` holds one value per scan step (x * y steps in total).
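The fragment stops before compiling anything. Under the assumption that the goal is simply to exercise tapped sequences and tapped outputs, a hedged completion continues directly from the code above, compiles result, and runs it on a small integer matrix:

f = theano.function([A, x, y], result)
A_np = np.arange(row_count * column_count, dtype='int32').reshape(row_count, column_count)
# x * y = 6 steps of seq[t + column_count] + out[t - 1] + out[t - column_count]
print(f(A_np, 2, 3))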
def __init__(self, We_initial, char_embedd_table_initial, params): self.textfile = open(params.outfile, 'w') We = theano.shared(We_initial) We_inf = theano.shared(We_initial) embsize = We_initial.shape[1] hidden = params.hidden self.en_hidden_size = params.hidden_inf self.num_labels = params.num_labels self.de_hidden_size = params.de_hidden_size self.lstm_layers_num = 1 char_embedd_dim = params.char_embedd_dim char_dic_size = len(params.char_dic) char_embedd_table = theano.shared(char_embedd_table_initial) char_embedd_table_inf = theano.shared(char_embedd_table_initial) input_var = T.imatrix(name='inputs') target_var = T.imatrix(name='targets') target_var_in = T.imatrix(name='in_targets') mask_var = T.fmatrix(name='masks') mask_var1 = T.fmatrix(name='masks1') length = T.iscalar() length0 = T.iscalar() t_t = T.fscalar() t_t0 = T.fscalar() char_input_var = T.itensor3(name='char-inputs') use_dropout = T.fscalar() use_dropout0 = T.fscalar() Wyy0 = np.random.uniform( -0.02, 0.02, (self.num_labels + 1, self.num_labels + 1)).astype('float32') Wyy = theano.shared(Wyy0) l_in_word = lasagne.layers.InputLayer((None, None)) l_mask_word = lasagne.layers.InputLayer(shape=(None, None)) if params.emb == 1: l_emb_word = lasagne.layers.EmbeddingLayer( l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We) else: l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We) layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input') layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer( layer_char, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) # first get some necessary dimensions or parameters conv_window = 3 num_filters = params.num_filters # construct convolution layer cnn_layer = lasagne.layers.Conv1DLayer( layer_char, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn') # infer the pool size for pooling (pool size should go through all time step of cnn) _, _, pool_size = cnn_layer.output_shape # construct max pool layer pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size) # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters] output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1])) # finally, concatenate the two incoming layers together. 
l_emb_word = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2) l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word) l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True) concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2) l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden)) l_local = lasagne.layers.DenseLayer( l_reshape_concat, num_units=self.num_labels, nonlinearity=lasagne.nonlinearities.linear) network_params = lasagne.layers.get_all_params(l_local, trainable=True) network_params.append(Wyy) print len(network_params) f = open( 'ccctag_BiLSTM_CNN_CRF_num_filters_30_dropout_1_LearningRate_0.01_0.0_400_emb_1_tagversoin_2.pickle', 'r') data = pickle.load(f) f.close() for idx, p in enumerate(network_params): p.set_value(data[idx]) self.params = [] self.hos = [] self.Cos = [] self.encoder_lstm_layers = [] self.decoder_lstm_layers = [] ei, di, dt = T.imatrices(3) #place holders decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6) ci = T.itensor3() #### the last one is for the stary symbole self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform( self.num_labels + 1, self.de_hidden_size), borrow=True) self.linear = theano.shared( name="Linear", value=init_xavier_uniform( self.de_hidden_size + 2 * self.en_hidden_size, self.num_labels), borrow=True) self.linear_bias = theano.shared( name="Hidden to Bias", value=np.asarray(np.random.randn(self.num_labels, ) * 0., dtype=theano.config.floatX), borrow=True) #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*hidden, self.de_hidden_size), borrow = True) #self.hidden_bias = theano.shared( # name="Hidden to Bias", # value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) , # borrow=True # ) input_var_shuffle = input_var.dimshuffle(1, 0) mask_var_shuffle = mask_var.dimshuffle(1, 0) target_var_in_shuffle = target_var_in.dimshuffle(1, 0) target_var_shuffle = target_var.dimshuffle(1, 0) self.params += [ We_inf, self.linear, self.linear_bias, self.de_lookuptable ] #concatenate state_below = We_inf[input_var_shuffle.flatten()].reshape( (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize)) ###### character word embedding layer_char_input_inf = lasagne.layers.InputLayer( shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input') layer_char_inf = lasagne.layers.reshape(layer_char_input_inf, (-1, [2])) layer_char_embedding_inf = lasagne.layers.EmbeddingLayer( layer_char_inf, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table_inf, name='char_embedding_inf') layer_char_inf = lasagne.layers.DimshuffleLayer( layer_char_embedding_inf, pattern=(0, 2, 1)) #layer_char_inf = lasagne.layers.DropoutLayer(layer_char_inf, p=0.5) cnn_layer_inf = lasagne.layers.Conv1DLayer( layer_char_inf, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn_inf') pool_layer_inf = lasagne.layers.MaxPool1DLayer(cnn_layer_inf, pool_size=pool_size) output_cnn_layer_inf = lasagne.layers.reshape(pool_layer_inf, (-1, length, [1])) char_params = lasagne.layers.get_all_params(output_cnn_layer_inf, trainable=True) self.params += char_params ###### [batch, sent_length, num_filters] #char_state_below = lasagne.layers.get_output(output_cnn_layer_inf, {layer_char_input_inf:char_input_var}) char_state_below = lasagne.layers.get_output(output_cnn_layer_inf) 
char_state_below = dropout_layer(char_state_below, use_dropout, trng) char_state_shuff = char_state_below.dimshuffle(1, 0, 2) state_below = T.concatenate([state_below, char_state_shuff], axis=2) state_below = dropout_layer(state_below, use_dropout, trng) enclstm_f = LSTM(embsize + num_filters, self.en_hidden_size) enclstm_b = LSTM(embsize + num_filters, self.en_hidden_size, True) self.encoder_lstm_layers.append(enclstm_f) #append self.encoder_lstm_layers.append(enclstm_b) #append self.params += enclstm_f.params + enclstm_b.params #concatenate hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle) hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle) hs = T.concatenate([hs_f, hs_b], axis=2) Cs = T.concatenate([Cs_f, Cs_b], axis=2) #hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1) #Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1) #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias), #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias), self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size), self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size), Encoder = hs state_below = self.de_lookuptable[ target_var_in_shuffle.flatten()].reshape( (target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1], self.de_hidden_size)) for i in range(self.lstm_layers_num): declstm = LSTM(self.de_hidden_size, self.de_hidden_size) self.decoder_lstm_layers += declstm, #append self.params += declstm.params #concatenate ho, Co = self.hos[i], self.Cos[i] state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co) decoder_lstm_outputs = T.concatenate([state_below, Encoder], axis=2) linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :] softmax_outputs, updates = theano.scan( fn=lambda x: T.nnet.softmax(x), sequences=[linear_outputs], ) def _NLL(pred, y, m): return -m * T.log(pred[T.arange(input_var.shape[0]), y]) def _step2(ctx_, state_, hs_, Cs_): hs, Cs = [], [] token_idxs = T.cast(state_.argmax(axis=-1), "int32") msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1.) 
msk_ = msk_.dimshuffle('x', 0) state_below0 = self.de_lookuptable[token_idxs].reshape( (1, input_var_shuffle.shape[1], self.de_hidden_size)) for i, lstm in enumerate(self.decoder_lstm_layers): h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i]) #mind msk hs += h[-1], Cs += C[-1], state_below0 = h hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs) state_below0 = state_below0.reshape( (input_var.shape[0], self.de_hidden_size)) state_below0 = T.concatenate([ctx_, state_below0], axis=1) newpred = T.dot(state_below0, self.linear).reshape( (input_var_shuffle.shape[1], self.num_labels)) + self.linear_bias[None, :] state_below = T.nnet.softmax(newpred) extra_p = T.zeros_like(hs[:, :, 0]) state_below = T.concatenate([state_below, extra_p.T], axis=1) return state_below, hs, Cs hs0, Cs0 = T.as_tensor_variable( self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0") train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder], outputs_info=[decoderInputs0, hs0, Cs0], n_steps=input_var_shuffle.shape[0]) predy = train_outputs[0].dimshuffle(1, 0, 2) predy = predy[:, :, :-1] * mask_var[:, :, None] predy0 = predy.reshape((-1, self.num_labels)) def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy): """ :param targets_one_step: [batch_size, t] :param prev_label: [batch_size, t] :param tg_energy: [batch_size] :return: """ new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1]) new_ta_energy_t = tg_energy + T.sum( new_ta_energy * targets_one_step, axis=1) tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy) return [targets_one_step, tg_energy_t] local_energy = lasagne.layers.get_output( l_local, { l_in_word: input_var, l_mask_word: mask_var, layer_char_input: char_input_var }) local_energy = local_energy.reshape((-1, length, self.num_labels)) local_energy = local_energy * mask_var[:, :, None] ##################### # for the end symbole of a sequence #################### end_term = Wyy[:-1, -1] local_energy = local_energy + end_term.dimshuffle( 'x', 'x', 0) * mask_var1[:, :, None] #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a:input_var, l_mask_word_a:mask_var}) predy_in = T.argmax(predy0, axis=1) A = T.extra_ops.to_one_hot(predy_in, self.num_labels) A = A.reshape((-1, length, self.num_labels)) #predy = predy0.reshape((-1, length, 25)) #predy = predy*mask_var[:,:,None] targets_shuffled = predy.dimshuffle(1, 0, 2) target_time0 = targets_shuffled[0] masks_shuffled = mask_var.dimshuffle(1, 0) initial_energy0 = T.dot(target_time0, Wyy[-1, :-1]) initials = [target_time0, initial_energy0] [_, target_energies], _ = theano.scan( fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]]) cost11 = target_energies[-1] + T.sum( T.sum(local_energy * predy, axis=2) * mask_var, axis=1) # compute the ground-truth energy targets_shuffled0 = A.dimshuffle(1, 0, 2) target_time00 = targets_shuffled0[0] initial_energy00 = T.dot(target_time00, Wyy[-1, :-1]) initials0 = [target_time00, initial_energy00] [_, target_energies0], _ = theano.scan( fn=inner_function, outputs_info=initials0, sequences=[targets_shuffled0[1:], masks_shuffled[1:]]) cost110 = target_energies0[-1] + T.sum( T.sum(local_energy * A, axis=2) * mask_var, axis=1) #predy_f = predy.reshape((-1, 25)) y_f = target_var.flatten() if (params.annealing == 0): lamb = params.L3 elif (params.annealing == 1): lamb = params.L3 * (1 - 0.01 * t_t) if (params.regutype == 0): ce_hinge = lasagne.objectives.categorical_crossentropy( predy0 + eps, y_f) ce_hinge = ce_hinge.reshape((-1, 
length)) ce_hinge = T.sum(ce_hinge * mask_var, axis=1) cost = T.mean(-cost11) + lamb * T.mean(ce_hinge) else: entropy_term = -T.sum(predy0 * T.log(predy0 + eps), axis=1) entropy_term = entropy_term.reshape((-1, length)) entropy_term = T.sum(entropy_term * mask_var, axis=1) cost = T.mean(-cost11) - lamb * T.mean(entropy_term) """ f = open('F0_simple.pickle') PARA = pickle.load(f) f.close() l2_term = sum(lasagne.regularization.l2(x-PARA[index]) for index, x in enumerate(a_params)) cost = T.mean(-cost11) + params.L2*l2_term """ ##from adam import adam ##updates_a = adam(cost, self.params, params.eta) #updates_a = lasagne.updates.sgd(cost, self.params, params.eta) #updates_a = lasagne.updates.apply_momentum(updates_a, self.params, momentum=0.9) from momentum import momentum updates_a = momentum(cost, self.params, params.eta, momentum=0.9) if (params.regutype == 0): self.train_fn = theano.function( inputs=[ei, ci, dt, em, em1, length0, t_t0, di0, use_dropout0], outputs=[cost, ce_hinge], updates=updates_a, on_unused_input='ignore', givens={ input_var: ei, char_input_var: ci, target_var: dt, mask_var: em, mask_var1: em1, length: length0, t_t: t_t0, decoderInputs0: di0, use_dropout: use_dropout0 }) #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, ce_hinge], updates = updates_a, on_unused_input='ignore') else: self.train_fn = theano.function( inputs=[ei, dt, em, em1, length0, t_t0, di0, use_dropout0], outputs=[cost, entropy_term], updates=updates_a, on_unused_input='ignore', givens={ input_var: ei, char_input_var: ci, target_var: dt, mask_var: em, mask_var1: em1, length: length0, t_t: t_t0, decoderInputs0: di0, use_dropout: use_dropout0 }) #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, entropy_term], updates = updates_a, on_unused_input='ignore') prediction = T.argmax(predy, axis=2) corr = T.eq(prediction, target_var) corr_train = (corr * mask_var).sum(dtype=theano.config.floatX) num_tokens = mask_var.sum(dtype=theano.config.floatX) self.eval_fn = theano.function( inputs=[ei, ci, dt, em, em1, length0, di0, use_dropout0], outputs=[cost11, cost110, corr_train, num_tokens, prediction], on_unused_input='ignore', givens={ input_var: ei, char_input_var: ci, target_var: dt, mask_var: em, mask_var1: em1, length: length0, decoderInputs0: di0, use_dropout: use_dropout0 })
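The annealed regulariser above is either a masked token-level cross-entropy against the gold tags (regutype == 0) or the entropy of the predicted distributions. A small self-contained sketch of those two terms, with a hypothetical label count and batch:

import numpy as np
import theano
import theano.tensor as T

eps = 1e-6
n_labels = 5                              # hypothetical label count

pred = T.matrix('pred')                   # (batch * length, n_labels), rows sum to 1
gold = T.ivector('gold')                  # (batch * length,)
mask = T.matrix('mask')                   # (batch, length)
length = T.iscalar('length')

# Token-level cross-entropy against the gold tags, masked and summed per sentence.
ce = T.nnet.categorical_crossentropy(pred + eps, gold)
ce = T.sum(ce.reshape((-1, length)) * mask, axis=1)

# Alternative: entropy of the predicted distributions (the regutype != 0 branch).
ent = -T.sum(pred * T.log(pred + eps), axis=1)
ent = T.sum(ent.reshape((-1, length)) * mask, axis=1)

f = theano.function([pred, gold, mask, length], [ce.mean(), ent.mean()])

b, L = 2, 3
p = np.full((b * L, n_labels), 1.0 / n_labels, dtype=theano.config.floatX)
g = np.zeros(b * L, dtype='int32')
m = np.ones((b, L), dtype=theano.config.floatX)
print(f(p, g, m, L))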
# at work in In(y, value=1). In the case of In(w, value=2, name='w_by_name'),
# we override the symbolic variable's name attribute with a name to be used
# for this function.

# 4. Using Shared Variables
# It is also possible to make a function with an internal state. For
# example, let's say we want to make an accumulator: at the beginning, the
# state is initialized to zero. Then, on each function call, the state is
# incremented by the function's argument.

# First let's define the accumulator function. It adds its argument to the
# internal state, and returns the old state value.
from theano import shared
state = shared(0)
inc = T.iscalar('inc')
accumulator = function([inc], state, updates=[(state, state + inc)])

# This code introduces a few concepts. The shared function constructs so-
# called shared variables. These are hybrid symbolic and non-symbolic
# variables whose value may be shared between multiple functions. Shared
# variables can be used in symbolic expressions just like the objects
# returned by dmatrices(...), but they also have an internal value that
# defines the value taken by this symbolic variable in all the functions
# that use it. It is called a shared variable because its value is shared
# between many functions; the value can be read and changed with the
# .get_value() and .set_value() methods. We will come back to this soon.

# The other new thing in this code is the updates parameter of function.
# updates must be supplied with a list of pairs of the form (shared-
# variable, new expression). It can also be a dictionary whose keys are
# shared variables and values are the new expressions. Either way, it means
# "whenever this function runs, it will replace the .value of each shared
# variable with the result of the corresponding expression."
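Restating the accumulator so it runs standalone, the behaviour described above looks like this in practice: the function returns the old state, the update is applied afterwards, and .get_value() / .set_value() read or reset the state directly.

import theano.tensor as T
from theano import function, shared

state = shared(0)
inc = T.iscalar('inc')
accumulator = function([inc], state, updates=[(state, state + inc)])

print(state.get_value())   # 0
print(accumulator(1))      # returns the old state: 0
print(state.get_value())   # 1
print(accumulator(300))    # 1
print(state.get_value())   # 301

state.set_value(-1)        # reset the internal state directly
print(accumulator(3))      # -1
print(state.get_value())   # 2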
def __init__(self, nh, nc, ne, de, cs, em, init, featdim): ''' nh :: dimension of the hidden layer nc :: number of classes ne :: number of word embeddings in the vocabulary de :: dimension of the word embeddings cs :: word window context size ''' # parameters of the model self.featdim = featdim tmp_emb = 0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de)) if init: for row in xrange(ne + 1): if em[row] is not None: tmp_emb[row] = em[row] self.emb = theano.shared(tmp_emb.astype( theano.config.floatX)) # add one for PADDING at the end # weights for LSTM n_in = de * cs print "de,cs", de, cs # print "n_i",n_i n_hidden = n_i = n_c = n_o = n_f = nh n_y = nc print "n_y", n_y print "n_hidden, n_i, n_c, n_o,nh", n_hidden, n_i, n_c, n_o, nh self.W_xi = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_i)).astype(dtype)) self.W_hi = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_i)).astype(dtype)) self.W_ci = theano.shared(0.2 * uniform(-1.0, 1.0, (n_c, n_i)).astype(dtype)) self.b_i = theano.shared(numpy.cast[dtype](uniform(-0.5, .5, size=n_i))) self.W_xf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_f)).astype(dtype)) self.W_hf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_f)).astype(dtype)) self.W_cf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_c, n_f)).astype(dtype)) self.b_f = theano.shared(numpy.cast[dtype](uniform(0, 1., size=n_f))) self.W_xc = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_c)).astype(dtype)) self.W_hc = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_c)).astype(dtype)) self.b_c = theano.shared(numpy.zeros(n_c, dtype=dtype)) self.W_xo = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_o)).astype(dtype)) self.W_ho = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_o)).astype(dtype)) self.W_co = theano.shared(0.2 * uniform(-1.0, 1.0, (n_c, n_o)).astype(dtype)) self.b_o = theano.shared(numpy.cast[dtype](uniform(-0.5, .5, size=n_o))) self.W_hy = theano.shared( 0.2 * uniform(-1.0, 1.0, (n_hidden + featdim, n_y)).astype(dtype)) self.b_y = theano.shared(numpy.zeros(n_y, dtype=dtype)) self.c0 = theano.shared(numpy.zeros(n_hidden, dtype=dtype)) self.h0 = T.tanh(self.c0) # bundle weights self.params = [self.emb, self.W_xi, self.W_hi, self.W_ci, self.b_i, self.W_xf, self.W_hf, \ self.W_cf, self.b_f, self.W_xc, self.W_hc, self.b_c, self.W_xo, self.W_ho, \ self.W_co, self.b_o, self.W_hy, self.b_y, self.c0] self.names = ['embeddings', 'W_xi', 'W_hi', 'W_ci', 'b_i', 'W_xf', 'W_hf', 'W_cf', 'b_f', \ 'W_xc', 'W_hc', 'b_c', 'W_xo', 'W_ho', 'W_co', 'b_o', 'W_hy', 'b_y', 'c0'] idxs = T.imatrix( ) # as many columns as context window size/lines as words in the sentence # print idxs.shape() x = self.emb[idxs].reshape((idxs.shape[0], de * cs)) # print type(x), x.shape(), "details of x" f = T.matrix('f') f.reshape((idxs.shape[0], featdim)) # print type(f), f.shape(), "details of f" y = T.iscalar('y') # label # print type(y), y.shape(), "details of y" def recurrence(x_t, feat_t, h_tm1, c_tm1): i_t = sigma( theano.dot(x_t, self.W_xi) + theano.dot(h_tm1, self.W_hi) + theano.dot(c_tm1, self.W_ci) + self.b_i) f_t = sigma( theano.dot(x_t, self.W_xf) + theano.dot(h_tm1, self.W_hf) + theano.dot(c_tm1, self.W_cf) + self.b_f) c_t = f_t * c_tm1 + i_t * T.tanh( theano.dot(x_t, self.W_xc) + theano.dot(h_tm1, self.W_hc) + self.b_c) o_t = sigma( theano.dot(x_t, self.W_xo) + theano.dot(h_tm1, self.W_ho) + theano.dot(c_t, self.W_co) + self.b_o) h_t = o_t * T.tanh(c_t) if self.featdim > 0: all_t = T.concatenate([h_t, feat_t]) else: all_t = h_t # print "all_t", type(all_t), T.shape(all_t) s_t = 
softmax(theano.dot(all_t, self.W_hy) + self.b_y) # print T.shape(h_t), T.shape(c_t), T.shape(s_t) return [h_t, c_t, s_t] # Initialization occurs in outputs_info # scan gives -- result, updates [h, _, s], _ = theano.scan(fn=recurrence, sequences=[x, f], outputs_info=[self.h0, self.c0, None], n_steps=x.shape[0]) p_y_given_x_lastword = s[-1, 0, :] p_y_given_x_sentence = s[:, 0, :] y_pred = T.argmax(p_y_given_x_sentence, axis=1) # cost and gradients and learning rate lr = T.scalar('lr') nll = -T.mean(T.log(p_y_given_x_lastword)[y]) gradients = T.grad(nll, self.params) updates = OrderedDict( (p, p - lr * g) for p, g in zip(self.params, gradients)) # theano functions self.classify = theano.function(inputs=[idxs, f], outputs=y_pred) self.train = theano.function(inputs=[idxs, f, y, lr], outputs=nll, updates=updates) self.normalize = theano.function( inputs=[], updates={ self.emb: self.emb / T.sqrt( (self.emb**2).sum(axis=1)).dimshuffle(0, 'x') })
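The `self.emb[idxs].reshape(...)` step above turns a matrix of context-window word indices into one flattened embedding vector per word. A small standalone sketch with illustrative sizes (ne=10, de=4, cs=3, not taken from the model above):

# Context-window embedding lookup: (n_words, cs) indices -> (n_words, de*cs).
import numpy as np
import theano
import theano.tensor as T

ne, de, cs = 10, 4, 3
emb = theano.shared(0.2 * np.random.uniform(-1.0, 1.0,
                    (ne + 1, de)).astype(theano.config.floatX))
idxs = T.imatrix('idxs')                          # (n_words, cs)
x = emb[idxs].reshape((idxs.shape[0], de * cs))   # (n_words, de * cs)

lookup = theano.function([idxs], x)
window = np.array([[0, 1, 2], [1, 2, 3]], dtype='int32')
print(lookup(window).shape)                       # (2, 12)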
def test_grad_h(self): "tests that the gradients with respect to h_i are 0 after doing a mean field update of h_i " model = self.model e_step = self.e_step X = self.X assert X.shape[0] == self.m init_H = e_step.init_H_hat(V=X) init_Mu1 = e_step.init_S_hat(V=X) prev_setting = config.compute_test_value config.compute_test_value = 'off' H, Mu1 = function([], outputs=[init_H, init_Mu1])() config.compute_test_value = prev_setting H = broadcast(H, self.m) Mu1 = broadcast(Mu1, self.m) H = np.cast[config.floatX](self.model.rng.uniform(0., 1., H.shape)) Mu1 = np.cast[config.floatX](self.model.rng.uniform( -5., 5., Mu1.shape)) H_var = T.matrix(name='H_var') H_var.tag.test_value = H Mu1_var = T.matrix(name='Mu1_var') Mu1_var.tag.test_value = Mu1 idx = T.iscalar() idx.tag.test_value = 0 new_H = e_step.infer_H_hat(V=X, H_hat=H_var, S_hat=Mu1_var) h_idx = new_H[:, idx] updates_func = function([H_var, Mu1_var, idx], h_idx) sigma0 = 1. / model.alpha Sigma1 = e_step.infer_var_s1_hat() mu0 = T.zeros_like(model.mu) #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1 # (they don't affect the outcome of this test and some of them are intractable ) trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \ model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) grad_H = T.grad(trunc_kl.sum(), H_var) assert len(grad_H.type.broadcastable) == 2 #from theano.printing import min_informative_str #print min_informative_str(grad_H) #grad_H = Print('grad_H')(grad_H) #grad_H_idx = grad_H[:,idx] grad_func = function([H_var, Mu1_var], grad_H) failed = False for i in xrange(self.N): rval = updates_func(H, Mu1, i) H[:, i] = rval g = grad_func(H, Mu1)[:, i] assert not contains_nan(g) g_abs_max = np.abs(g).max() if g_abs_max > self.tol: #print "new values of H" #print H[:,i] #print "gradient on new values of H" #print g failed = True print 'iteration ', i #print 'max value of new H: ',H[:,i].max() #print 'H for failing g: ' failing_h = H[np.abs(g) > self.tol, i] #print failing_h #from matplotlib import pyplot as plt #plt.scatter(H[:,i],g) #plt.show() #ignore failures extremely close to h=1 high_mask = failing_h > .001 low_mask = failing_h < .999 mask = high_mask * low_mask print 'masked failures: ', mask.shape[0], ' err ', g_abs_max if mask.sum() > 0: print 'failing h passing the range mask' print failing_h[mask.astype(bool)] raise Exception( 'after mean field step, gradient of kl divergence' ' wrt freshly updated variational parameter should be 0, ' 'but here the max magnitude of a gradient element is ' + str(g_abs_max) + ' after updating h_' + str(i))
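The same zero-gradient-after-coordinate-update property can be seen on a toy quadratic objective, independent of the model used in the test above; this sketch only assumes `T.grad` and `theano.function`:

# Toy version of the check above: minimize f(w) = 0.5 * sum((w - c)**2)
# coordinate-wise and confirm the gradient of each freshly updated
# coordinate vanishes.
import numpy as np
import theano
import theano.tensor as T

w_var = T.vector('w_var')
c = np.asarray([1.0, -2.0, 0.5], dtype=theano.config.floatX)
f = 0.5 * T.sum((w_var - c) ** 2)

i = T.iscalar('i')
grad_i = T.grad(f, w_var)[i]
grad_fn = theano.function([w_var, i], grad_i)

w = np.zeros(3, dtype=theano.config.floatX)
for k in range(3):
    w[k] = c[k]                       # exact coordinate update
    assert abs(grad_fn(w, k)) < 1e-6  # gradient wrt the fresh coordinate is 0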
def __init__(self, word_dim, hidden_dim=5, Nclass=4, degree=2, momentum=0.9, trainable_embeddings=True, labels_on_nonroot_nodes=False, irregular_tree=True): assert word_dim > 1 and hidden_dim > 1 self.word_dim = word_dim self.hidden_dim = hidden_dim self.Nclass = Nclass self.degree = degree #self.learning_rate = learning_rate self.momentum = momentum self.irregular_tree = irregular_tree self.params = [] #self.x = T.ivector(name='x') # word indices #self.x_word = T.matrix(dtype=theano.config.floatX) # word frequendtype=theano.config.floatX self.x_word = T.matrix(name='x_word') # word frequent self.x_index = T.imatrix(name='x_index') # word indices self.tree = T.imatrix(name='tree') # shape [None, self.degree] self.y = T.ivector(name='y') # output shape [self.output_dim] self.num_parent = T.iscalar(name='num_parent') self.num_nodes = self.x_word.shape[ 0] # total number of nodes (leaves + internal) in tree self.num_child = self.num_nodes - self.num_parent - 1 #emb_x = self.embeddings[self.x] #emb_x = emb_x * T.neq(self.x, -1).dimshuffle(0, 'x') # zero-out non-existent embeddings self.tree_states = self.compute_tree(self.x_word, self.x_index, self.num_parent, self.tree) #self.final_state = self.tree_states.mean(axis=0)#self.tree_states[-1] #self.final_state = pool_2d(input=self.tree_states, ds=(self.num_child,1), ignore_border=True,mode='max') self.final_state = self.tree_states.max(axis=0) self.output_fn = self.create_output_fn() self.pred_y = self.output_fn(self.final_state) self.loss = self.loss_fn(self.y, self.pred_y) self.learning_rate = T.scalar('learning_rate') #updates = self.gradient_descent(self.loss, self.learning_rate) train_inputs = [ self.x_word, self.x_index, self.num_parent, self.tree, self.y, self.learning_rate ] updates = self.gradient_descent(self.loss) #train_inputs = [self.x_word, self.x_index, self.tree, self.y] self._train = theano.function(train_inputs, [self.loss, self.pred_y], updates=updates) self._evaluate = theano.function( [self.x_word, self.x_index, self.num_parent, self.tree], self.final_state) self._evaluate2 = theano.function( [self.x_word, self.x_index, self.num_parent, self.tree], self.tree_states) #self._state = theano.function([self.x_word, self.x_index, self.num_child, self.tree], self.tree_states) self._predict = theano.function( [self.x_word, self.x_index, self.num_parent, self.tree], self.pred_y) self.tree_states_test = self.compute_tree_test(self.x_word, self.x_index, self.tree) self._evaluate3 = theano.function( [self.x_word, self.x_index, self.tree], self.tree_states_test)
def main(n_iter, n_batch, n_hidden, time_steps, learning_rate, savefile, scale_penalty, use_scale, reload_progress, model, n_hidden_lstm, n_gru_lr_proj, initial_b_u): np.random.seed(1234) #import pdb; pdb.set_trace() # --- Set optimization params -------- # --- Set data params ---------------- n_input = 1 n_output = 10 ##### MNIST processing ################################################ # load and preprocess the data (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = cPickle.load( gzip.open("mnist.pkl.gz", 'rb')) n_data = train_x.shape[0] num_batches = n_data / n_batch # shuffle data order inds = range(n_data) np.random.shuffle(inds) train_x = np.ascontiguousarray(train_x[inds, :time_steps]) train_y = np.ascontiguousarray(train_y[inds]) n_data_valid = valid_x.shape[0] inds_valid = range(n_data_valid) np.random.shuffle(inds_valid) valid_x = np.ascontiguousarray(valid_x[inds_valid, :time_steps]) valid_y = np.ascontiguousarray(valid_y[inds_valid]) # reshape x train_x = np.reshape(train_x.T, (time_steps, n_data, 1)) valid_x = np.reshape(valid_x.T, (time_steps, valid_x.shape[0], 1)) # change y to one-hot encoding temp = np.zeros((n_data, n_output)) # import pdb; pdb.set_trace() temp[np.arange(n_data), train_y] = 1 train_y = temp.astype('float32') temp = np.zeros((n_data_valid, n_output)) temp[np.arange(n_data_valid), valid_y] = 1 valid_y = temp.astype('float32') # Random permutation of pixels P = np.random.permutation(time_steps) train_x = train_x[P, :, :] valid_x = valid_x[P, :, :] ####################################################################### # --- Compile theano graph and gradients gradient_clipping = np.float32(1) if (model == 'LSTM'): #inputs, parameters, costs = LSTM(n_input, n_hidden_LSTM, n_output) inputs, parameters, costs = LSTM(n_input, n_hidden_lstm, n_output, initial_b_f=initial_b_u) #by AnvaMiba elif (model == 'GRU'): inputs, parameters, costs = GRU(n_input, n_hidden_lstm, n_output, initial_b_u=initial_b_u) #by AnvaMiba elif (model == 'GRU_LR'): inputs, parameters, costs = GRU_LR(n_input, n_hidden_lstm, n_output, n_gru_lr_proj, initial_b_u=initial_b_u) elif (model == 'complex_RNN'): gradient_clipping = np.float32(100000) inputs, parameters, costs = complex_RNN(n_input, n_hidden, n_output, scale_penalty) elif (model == 'complex_RNN_LSTM'): inputs, parameters, costs = complex_RNN_LSTM(n_input, n_hidden, n_hidden_lstm, n_output, scale_penalty) elif (model == 'IRNN'): inputs, parameters, costs = IRNN(n_input, n_hidden, n_output) elif (model == 'RNN'): inputs, parameters, costs = RNN(n_input, n_hidden, n_output) else: print >> sys.stderr, "Unsuported model:", model return gradients = T.grad(costs[0], parameters) # GRADIENT CLIPPING gradients = gradients[:7] + [ T.clip(g, -gradient_clipping, gradient_clipping) for g in gradients[7:] ] s_train_x = theano.shared(train_x) s_train_y = theano.shared(train_y) s_valid_x = theano.shared(valid_x) s_valid_y = theano.shared(valid_y) # --- Compile theano functions -------------------------------------------------- index = T.iscalar('i') updates, rmsprop = rms_prop(learning_rate, parameters, gradients) givens = { inputs[0]: s_train_x[:, n_batch * index:n_batch * (index + 1), :], inputs[1]: s_train_y[n_batch * index:n_batch * (index + 1), :] } givens_valid = {inputs[0]: s_valid_x, inputs[1]: s_valid_y} train = theano.function([index], [costs[0], costs[2]], givens=givens, updates=updates) valid = theano.function([], [costs[1], costs[2]], givens=givens_valid) #import pdb; pdb.set_trace() # --- Training Loop 
--------------------------------------------------------------- train_loss = [] test_loss = [] test_acc = [] best_params = [p.get_value() for p in parameters] best_test_loss = 1e6 for i in xrange(n_iter): # pdb.set_trace() [cross_entropy, acc] = train(i % num_batches) train_loss.append(cross_entropy) print >> sys.stderr, "Iteration:", i print >> sys.stderr, "cross_entropy:", cross_entropy print >> sys.stderr, "accurracy", acc * 100 print >> sys.stderr, '' #if (i % 100==0): if (i % 300 == 0): [valid_cross_entropy, valid_acc] = valid() print >> sys.stderr, '' print >> sys.stderr, "VALIDATION" print >> sys.stderr, "cross_entropy:", valid_cross_entropy print >> sys.stderr, "accurracy", valid_acc * 100 print >> sys.stderr, '' test_loss.append(valid_cross_entropy) test_acc.append(valid_acc) if valid_cross_entropy < best_test_loss: print >> sys.stderr, "NEW BEST!" best_params = [p.get_value() for p in parameters] best_test_loss = valid_cross_entropy save_vals = { 'parameters': [p.get_value() for p in parameters], 'rmsprop': [r.get_value() for r in rmsprop], 'train_loss': train_loss, 'test_loss': test_loss, 'best_params': best_params, 'test_acc': test_acc, 'best_test_loss': best_test_loss } cPickle.dump(save_vals, file(savefile, 'wb'), cPickle.HIGHEST_PROTOCOL)
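The loop above relies on an imported `rms_prop(learning_rate, parameters, gradients)` helper returning `(updates, rmsprop)`. Its source is not shown here; the following is a hedged sketch of a typical RMSProp implementation with that signature, which may differ from the project's version:

# Hedged sketch of an rms_prop-style helper: keep one running mean-square
# accumulator per parameter and scale each gradient by its root.
import theano
import theano.tensor as T

def rms_prop_sketch(learning_rate, parameters, gradients,
                    decay=0.9, eps=1e-6):
    updates, accumulators = [], []
    for p, g in zip(parameters, gradients):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = decay * acc + (1 - decay) * g ** 2
        updates.append((acc, acc_new))
        updates.append((p, p - learning_rate * g / T.sqrt(acc_new + eps)))
        accumulators.append(acc)
    return updates, accumulators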
def build(self, dropout, char_dim, char_hidden_dim, char_bidirect, word_dim, word_hidden_dim, word_bidirect, tagger_hidden_dim, hamming_cost, L2_reg, lr_method, pre_word_emb, pre_char_emb, tagger, use_gaze, POS, # plot_cost, #cap_dim, training=True, **kwargs ): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # n_pos = len(self.id_to_pos) + 1 # Number of capitalization features #if cap_dim: # n_cap = 4 # Network variables is_train = T.iscalar('is_train') # declare variable,声明整型变量is_train word_ids = T.ivector(name='word_ids') #声明整型一维向量 char_for_ids = T.imatrix(name='char_for_ids') # 声明整型二维矩阵 char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') if use_gaze: gaze = T.imatrix(name='gaze') if POS: # pos_ids = T.ivector(name='pos_ids') pos_one_hot = T.imatrix(name= 'pos_one_hot') #hamming_cost = T.matrix('hamming_cost', theano.config.floatX) # 声明整型二维矩阵 tag_ids = T.ivector(name='tag_ids') #if cap_dim: # cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] #句子中的单词数 # Final input (all word features) input_dim = 0 inputs = [] L2_norm = 0.0 theano.config.compute_test_value = 'off' # # Word inputs # if word_dim: print("word_dim:", word_dim) input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_word_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained word embeddings from %s...' % pre_word_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_word_emb, 'r', 'utf-8', 'ignore')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]] ).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid word embedding lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word) in pretrained: new_weights[i] = pretrained[ re.sub('\d', '0', word) ] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained word embeddings.' % len(pretrained) print ('%i / %i (%.4f%%) words have been initialized with ' 'pretrained word embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. * (c_found + c_lower + c_zeros) / n_words ) print ('%i found directly, %i after lowercasing + zero.') % (c_found, c_lower + c_zeros) L2_norm += (word_layer.embeddings ** 2).sum() # # Chars inputs # if char_dim: print("char_dim:", char_dim) input_dim += char_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_for_input = char_layer.link(char_for_ids) # Initialize with pretrained char embeddings if pre_char_emb and training: new_weights = char_layer.embeddings.get_value() print 'Loading pretrained char embeddings from %s...' 
% pre_char_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_char_emb, 'r', 'utf-8', 'ignore')): line = line.rstrip().split() if len(line) == char_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]] ).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid char embedding lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_chars): char = self.id_to_char[i] if char in pretrained: new_weights[i] = pretrained[char] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', char) in pretrained: new_weights[i] = pretrained[ re.sub('\d', '0', char) ] c_zeros += 1 char_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained char embeddings.' % len(pretrained) print ('%i / %i (%.4f%%) words have been initialized with ' 'pretrained char embeddings.') % ( c_found + c_lower + c_zeros, n_chars, 100. * (c_found + +c_lower + c_zeros) / n_chars ) print ('%i found directly, %i after lowercasing + zero.') % (c_found, c_lower + c_zeros) L2_norm += (char_layer.embeddings ** 2).sum() wc_layer = CW_EmbeddingLayer(char_dim, word_dim + char_dim, bias= True, name= 'wc_layer') wc_comp_input = wc_layer.link(char_for_input, word_input) for param in wc_layer.params: L2_norm += (param ** 2).sum() print(word_input.ndim) print(wc_comp_input.ndim) # new_word_input, _ = theano.scan(lambda x_t, y_t: T.max([x_t, y_t], axis= 0), sequences= [word_input, wc_comp_input], n_steps= word_input.shape[0]) # print(new_word_input.ndim) inputs.append(wc_comp_input) # if POS: # pos_dim = 20 # input_dim += pos_dim # pos_layer = EmbeddingLayer(n_pos, pos_dim, name='pos_layer') # pos_input = pos_layer.link(pos_ids) # inputs.append(pos_input) # L2_norm += (pos_layer.embeddings ** 2).sum() #if len(inputs) != 1: inputs = T.concatenate(inputs, axis= 1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # 条件句 # if POS: # inputs = T.concatenate([inputs, pos_one_hot], axis= 1) # input_dim += 6 # LSTM for words print("input_dim:", input_dim) print("word_hidden_dim:", word_hidden_dim) word_lstm_for = LSTM(input_dim, word_hidden_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_hidden_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) # 单词的顺序: I like dog word_lstm_rev.link(inputs[::-1, :]) # 单词的顺序: dog like I word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] for param in word_lstm_for.params[:8]: L2_norm += (param ** 2).sum() if word_bidirect: final_output = T.concatenate( [word_for_output, word_rev_output], axis=1 ) tanh_layer = HiddenLayer(2 * word_hidden_dim, word_hidden_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) for param in word_lstm_rev.params[:8]: L2_norm += (param ** 2).sum() else: final_output = word_for_output dims = word_hidden_dim if use_gaze: final_output = T.concatenate([final_output, gaze], axis= 1) dims = word_hidden_dim + n_tags if POS: final_output = T.concatenate([final_output, pos_one_hot], axis=1) dims += 6 # if word_bidirect: # final_output = T.concatenate( # [word_for_output, word_rev_output], # axis=1 # ) # tanh_layer = HiddenLayer(2 * word_hidden_dim, word_hidden_dim, # name='tanh_layer', activation='tanh') # final_output = 
tanh_layer.link(final_output) # else: # final_output = word_for_output # Sentence to Named Entity tags ## final_layer = HiddenLayer(dims, n_tags, name='final_layer', ## activation=(None if crf else 'softmax')) # final_layer = HiddenLayer(word_hidden_dim, n_tags, name='final_layer', # activation=(None if crf else 'softmax')) ## tags_scores = final_layer.link(final_output) ## L2_norm += (final_layer.params[0] ** 2).sum() # No CRF if tagger == 'lstm': tagger_layer = LSTM_d(dims, tagger_hidden_dim, with_batch= False, name='LSTM_d') tagger_layer.link(final_output) final_output = tagger_layer.t dims = tagger_hidden_dim for param in tagger_layer.params[:8]: L2_norm += (param ** 2).sum() final_layer = HiddenLayer(dims, n_tags, name='final_layer', activation=(None if tagger == 'crf' else 'softmax')) tags_scores = final_layer.link(final_output) L2_norm += (final_layer.params[0] ** 2).sum() if tagger != 'crf': cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1 ) observations = T.concatenate( [b_s, observations, e_s], axis=0 ) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # P中对应元素的求和好 # Score from add_componentnsitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1] ].sum() # A中对应元素的求和 all_paths_scores = forward(observations, transitions, hamming_cost=hamming_cost, n_tags=n_tags, padded_tags_ids=padded_tags_ids) L2_norm += (transitions ** 2).sum() cost = - (real_path_score - all_paths_scores) + L2_reg * L2_norm # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(wc_layer) params.extend(wc_layer.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) # if POS: # self.add_component(pos_layer) # params.extend(pos_layer.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) self.add_component(final_layer) params.extend(final_layer.params) if tagger == 'lstm': self.add_component(tagger_layer) params.extend(tagger_layer.params) elif tagger == 'crf': self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if use_gaze: eval_inputs.append(gaze) if POS: # eval_inputs.append(pos_ids) eval_inputs.append(pos_one_hot) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) #if cap_dim: # eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function 
print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function( inputs=train_inputs, outputs=cost, updates=updates, givens=({is_train: np.cast['int32'](1)} if dropout else {}), on_unused_input='warn' ) else: f_train = None # if plot_cost: # f_plot_cost = theano.function( # inputs=train_inputs, # outputs=cost, # givens=({is_train: np.cast['int32'](1)} if dropout else {}), # on_unused_input='warn' # ) # else: # f_plot_cost = None # Compile evaluation function if tagger != 'crf': f_eval = theano.function( inputs=eval_inputs, outputs=tags_scores, givens=({is_train: np.cast['int32'](0)} if dropout else {}), on_unused_input='warn' ) else: f_eval = theano.function( inputs=eval_inputs, outputs=forward(observations, transitions, hamming_cost= 0, n_tags= None, padded_tags_ids= None, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({is_train: np.cast['int32'](0)} if dropout else {}), on_unused_input='warn' ) return f_train, f_eval#, f_plot_cost
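The CRF branch above combines the gold-path score with a log-sum-exp `forward` pass over all paths (plus an optional hamming-cost augmentation handled inside the project's `forward`, which is not reproduced here). A plain numpy sketch of the two quantities under the usual linear-chain assumptions:

# Numpy sketch of the gold-path score and the log-sum-exp over all paths.
import numpy as np

def log_sum_exp(x, axis):
    m = x.max(axis=axis, keepdims=True)
    return (m + np.log(np.exp(x - m).sum(axis=axis, keepdims=True))).squeeze(axis)

def crf_scores(observations, transitions, tag_ids):
    # observations: (s_len + 2, n_tags + 2) emission scores incl. begin/end rows
    # transitions:  (n_tags + 2, n_tags + 2)
    # tag_ids:      gold tags padded with the begin/end ids
    real_path_score = observations[np.arange(len(tag_ids)), tag_ids].sum()
    real_path_score += transitions[tag_ids[:-1], tag_ids[1:]].sum()

    alpha = observations[0]                       # forward recursion
    for obs in observations[1:]:
        alpha = log_sum_exp(alpha[:, None] + transitions, axis=0) + obs
    all_paths_scores = log_sum_exp(alpha, axis=0)
    return real_path_score, all_paths_scores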
def __init__(self, nh, nc, ne, de, cs): ''' nh :: dimension of the hidden layer nc :: number of classes ne :: number of word embeddings in the vocabulary 572 de :: dimension of the word embeddings 100 cs :: word window context size ''' # parameters of the model self.emb = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\ (ne+1, de)).astype(theano.config.floatX)) # add one for PADDING at the end self.Wx = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\ (de * cs, nh)).astype(theano.config.floatX)) self.Wh = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\ (nh, nh)).astype(theano.config.floatX)) self.W = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\ (nh, nc)).astype(theano.config.floatX)) self.bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX)) self.b = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX)) self.h0 = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX)) # bundle self.params = [ self.emb, self.Wx, self.Wh, self.W, self.bh, self.b, self.h0 ] self.names = ['embeddings', 'Wx', 'Wh', 'W', 'bh', 'b', 'h0'] idxs = T.imatrix( ) # as many columns as context window size/lines as words in the sentence x = self.emb[idxs].reshape((idxs.shape[0], de * cs)) y = T.iscalar('y') # label def Relu(x): out_dtype = scalar.upgrade_to_float( scalar.Scalar(dtype=x.dtype))[0].dtype a = T.constant(0.5, dtype=out_dtype) # ab = T.constant(abs(x), dtype=out_dtype) # x = (x * slope) + shift y = (x + abs(x)) * a r = T.clip(y, 0, 1) return r def PRelu(x): out_dtype = scalar.upgrade_to_float( scalar.Scalar(dtype=x.dtype))[0].dtype a = T.constant(0.625, dtype=out_dtype) b = T.constant(0.375, dtype=out_dtype) # x = (x * slope) + shift y = x * a + abs(x) * b r = T.clip(y, 0, 1) return r def my_tanh(x): #return 2*T.nnet.sigmoid(2*x)-1 return T.nnet.sigmoid(x) def sigmoid_sigmoid(x): return 0.8 * T.nnet.sigmoid(x) + 0.2 * T.nnet.hard_sigmoid(x) def recurrence(x_t, h_tm1): temp = T.dot(x_t, self.Wx) + T.dot(h_tm1, self.Wh) + self.bh #h_t = T.nnet.hard_sigmoid(temp) # the t moment output of the hidden layer #h_t = T.tanh(temp) h_t = T.nnet.sigmoid(temp) #h_t=T.nnet.relu(temp,0.2)#relu=T.maximum(0, temp) s_t = T.nnet.softmax( T.dot(h_t, self.W) + self.b) # the t moment output of the output layer return [h_t, s_t] [h, s], _ = theano.scan(fn=recurrence, \ sequences=x, outputs_info=[self.h0, None], \ n_steps=x.shape[0]) p_y_given_x_lastword = s[-1, 0, :] p_y_given_x_sentence = s[:, 0, :] y_pred = T.argmax(p_y_given_x_sentence, axis=1) #print 'y_pred', y_pred #print ' p_y_given_x_sentence', p_y_given_x_sentence # cost and gradients and learning rate lr = T.scalar('lr') nll = -T.log(p_y_given_x_lastword)[y] #negative log-likelihood(NLL) gradients = T.grad(nll, self.params) updates = OrderedDict( (p, p - lr * g) for p, g in zip(self.params, gradients)) # theano functions self.myclassify = theano.function(inputs=[idxs], outputs=p_y_given_x_sentence) self.classify = theano.function(inputs=[idxs], outputs=y_pred) self.train = theano.function(inputs=[idxs, y, lr], outputs=nll, updates=updates) self.normalize = theano.function( inputs = [], updates = {self.emb:\ self.emb/T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0,'x')})
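The `normalize` function above rescales every embedding row to unit L2 norm after each update. A standalone sketch of just that step:

# Row-normalize an embedding matrix held in a shared variable.
import numpy as np
import theano
import theano.tensor as T

emb = theano.shared(np.random.randn(6, 4).astype(theano.config.floatX))
normalize = theano.function(
    inputs=[],
    updates={emb: emb / T.sqrt((emb ** 2).sum(axis=1)).dimshuffle(0, 'x')})
normalize()
print(np.abs((emb.get_value() ** 2).sum(axis=1) - 1.0).max())  # ~0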
def main(n_iter, n_batch, n_hidden, time_steps, learning_rate, savefile, scale_penalty, use_scale, model, n_hidden_lstm, loss_function): #import pdb; pdb.set_trace() # --- Set optimization params -------- gradient_clipping = np.float32(50000) # --- Set data params ---------------- n_input = 2 n_output = 1 # --- Manage data -------------------- n_train = 1e5 n_test = 1e4 num_batches = n_train / n_batch train_x = np.asarray(np.zeros((time_steps, n_train, 2)), dtype=theano.config.floatX) train_x[:,:,0] = np.asarray(np.random.uniform(low=0., high=1., size=(time_steps, n_train)), dtype=theano.config.floatX) # inds = np.asarray([np.random.choice(time_steps, 2, replace=False) for i in xrange(train_x.shape[1])]) inds = np.asarray(np.random.randint(time_steps/2, size=(train_x.shape[1],2))) inds[:, 1] += time_steps/2 for i in range(train_x.shape[1]): train_x[inds[i, 0], i, 1] = 1.0 train_x[inds[i, 1], i, 1] = 1.0 train_y = (train_x[:,:,0] * train_x[:,:,1]).sum(axis=0) train_y = np.reshape(train_y, (n_train, 1)) test_x = np.asarray(np.zeros((time_steps, n_test, 2)), dtype=theano.config.floatX) test_x[:,:,0] = np.asarray(np.random.uniform(low=0., high=1., size=(time_steps, n_test)), dtype=theano.config.floatX) inds = np.asarray([np.random.choice(time_steps, 2, replace=False) for i in xrange(test_x.shape[1])]) for i in range(test_x.shape[1]): test_x[inds[i, 0], i, 1] = 1.0 test_x[inds[i, 1], i, 1] = 1.0 test_y = (test_x[:,:,0] * test_x[:,:,1]).sum(axis=0) test_y = np.reshape(test_y, (n_test, 1)) ####################################################################### gradient_clipping = np.float32(1) if (model == 'LSTM'): inputs, parameters, costs = LSTM(n_input, n_hidden_lstm, n_output, loss_function=loss_function) gradients = T.grad(costs[0], parameters) gradients = [T.clip(g, -gradient_clipping, gradient_clipping) for g in gradients] elif (model == 'complex_RNN'): inputs, parameters, costs = complex_RNN(n_input, n_hidden, n_output, scale_penalty, loss_function=loss_function) if use_scale is False: parameters.pop() gradients = T.grad(costs[0], parameters) elif (model == 'complex_RNN_LSTM'): inputs, parameters, costs = complex_RNN_LSTM(n_input, n_hidden, n_hidden_lstm, n_output, scale_penalty, loss_function=loss_function) elif (model == 'IRNN'): inputs, parameters, costs = IRNN(n_input, n_hidden, n_output, loss_function=loss_function) gradients = T.grad(costs[0], parameters) gradients = [T.clip(g, -gradient_clipping, gradient_clipping) for g in gradients] elif (model == 'RNN'): inputs, parameters, costs = tanhRNN(n_input, n_hidden, n_output, loss_function=loss_function) gradients = T.grad(costs[0], parameters) gradients = [T.clip(g, -gradient_clipping, gradient_clipping) for g in gradients] else: print "Unsuported model:", model return s_train_x = theano.shared(train_x) s_train_y = theano.shared(train_y) s_test_x = theano.shared(test_x) s_test_y = theano.shared(test_y) # --- Compile theano functions -------------------------------------------------- index = T.iscalar('i') updates, rmsprop = rms_prop(learning_rate, parameters, gradients) givens = {inputs[0] : s_train_x[:, n_batch * index : n_batch * (index + 1), :], inputs[1] : s_train_y[n_batch * index : n_batch * (index + 1), :]} givens_test = {inputs[0] : s_test_x, inputs[1] : s_test_y} train = theano.function([index], costs[0], givens=givens, updates=updates) test = theano.function([], costs[1], givens=givens_test) # --- Training Loop --------------------------------------------------------------- # f1 = 
file('/data/lisatmp3/shahamar/adding/complexRNN_400.pkl', 'rb') # data1 = cPickle.load(f1) # f1.close() # train_loss = data1['train_loss'] # test_loss = data1['test_loss'] # best_params = data1['best_params'] # best_test_loss = data1['best_test_loss'] # for i in xrange(len(parameters)): # parameters[i].set_value(data1['parameters'][i]) # for i in xrange(len(parameters)): # rmsprop[i].set_value(data1['rmsprop'][i]) # import pdb; pdb.set_trace() train_loss = [] test_loss = [] best_params = [p.get_value() for p in parameters] best_test_loss = 1e6 for i in xrange(n_iter): # start_time = timeit.default_timer() if (n_iter % int(num_batches) == 0): #import pdb; pdb.set_trace() inds = np.random.permutation(int(n_train)) data_x = s_train_x.get_value() s_train_x.set_value(data_x[:,inds,:]) data_y = s_train_y.get_value() s_train_y.set_value(data_y[inds,:]) mse = train(i % int(num_batches)) train_loss.append(mse) print "Iteration:", i print "mse:", mse print if (i % 50==0): mse = test() print print "TEST" print "mse:", mse print test_loss.append(mse) if mse < best_test_loss: best_params = [p.get_value() for p in parameters] best_test_loss = mse save_vals = {'parameters': [p.get_value() for p in parameters], 'rmsprop': [r.get_value() for r in rmsprop], 'train_loss': train_loss, 'test_loss': test_loss, 'best_params': best_params, 'best_test_loss': best_test_loss, 'model': model, 'time_steps': time_steps} cPickle.dump(save_vals, file(savefile, 'wb'), cPickle.HIGHEST_PROTOCOL)
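Between passes, the loop above reshuffles the training data held in shared variables with `get_value`/`set_value`. A simplified sketch of that epoch shuffle (the real code permutes `s_train_x` along its second axis because the data is stored time-major):

# Permute data held in shared variables in place between epochs.
import numpy as np
import theano

xs = theano.shared(np.arange(12, dtype=theano.config.floatX).reshape(6, 2))
ys = theano.shared(np.arange(6, dtype=theano.config.floatX).reshape(6, 1))

inds = np.random.permutation(6)
xs.set_value(xs.get_value()[inds])
ys.set_value(ys.get_value()[inds])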
def test_notex_print(): tt_normalrv_noname_expr = tt.scalar("b") * NormalRV( tt.scalar("\\mu"), tt.scalar("\\sigma")) expected = textwrap.dedent(r""" b in R, \mu in R, \sigma in R a ~ N(\mu, \sigma**2) in R (b * a) """) assert tt_pprint(tt_normalrv_noname_expr) == expected.strip() # Make sure the constant shape is show in values and not symbols. tt_normalrv_name_expr = tt.scalar("b") * NormalRV( tt.scalar("\\mu"), tt.scalar("\\sigma"), size=[2, 1], name="X") expected = textwrap.dedent(r""" b in R, \mu in R, \sigma in R X ~ N(\mu, \sigma**2) in R**(2 x 1) (b * X) """) assert tt_pprint(tt_normalrv_name_expr) == expected.strip() tt_2_normalrv_noname_expr = tt.matrix("M") * NormalRV( tt.scalar("\\mu_2"), tt.scalar("\\sigma_2")) tt_2_normalrv_noname_expr *= tt.scalar("b") * NormalRV( tt_2_normalrv_noname_expr, tt.scalar("\\sigma")) + tt.scalar("c") expected = textwrap.dedent(r""" M in R**(N^M_0 x N^M_1), \mu_2 in R, \sigma_2 in R b in R, \sigma in R, c in R a ~ N(\mu_2, \sigma_2**2) in R, d ~ N((M * a), \sigma**2) in R**(N^d_0 x N^d_1) ((M * a) * ((b * d) + c)) """) assert tt_pprint(tt_2_normalrv_noname_expr) == expected.strip() expected = textwrap.dedent(r""" b in Z, c in Z, M in R**(N^M_0 x N^M_1) M[b, c] """) # TODO: "c" should be "1". assert (tt_pprint( tt.matrix("M")[tt.iscalar("a"), tt.constant(1, dtype="int")]) == expected.strip()) expected = textwrap.dedent(r""" M in R**(N^M_0 x N^M_1) M[1] """) assert tt_pprint(tt.matrix("M")[1]) == expected.strip() expected = textwrap.dedent(r""" M in N**(N^M_0) M[2:4:0] """) assert tt_pprint(tt.vector("M", dtype="uint32")[0:4:2]) == expected.strip() norm_rv = NormalRV(tt.scalar("\\mu"), tt.scalar("\\sigma")) rv_obs = observed(tt.constant(1.0, dtype=norm_rv.dtype), norm_rv) expected = textwrap.dedent(r""" \mu in R, \sigma in R a ~ N(\mu, \sigma**2) in R a = 1.0 """) assert tt_pprint(rv_obs) == expected.strip()
def __init__(self, n_in, hidden_layer_size, n_out, L1_reg, L2_reg, hidden_layer_type, output_type='LINEAR', dropout_rate=0.0, optimizer='sgd', loss_function='MMSE', rnn_batch_training=False): """ This function initialises a neural network :param n_in: Dimensionality of input features :type in: Integer :param hidden_layer_size: The layer size for each hidden layer :type hidden_layer_size: A list of integers :param n_out: Dimensionality of output features :type n_out: Integrer :param hidden_layer_type: the activation types of each hidden layers, e.g., TANH, LSTM, GRU, BLSTM :param L1_reg: the L1 regulasation weight :param L2_reg: the L2 regulasation weight :param output_type: the activation type of the output layer, by default is 'LINEAR', linear regression. :param dropout_rate: probability of dropout, a float number between 0 and 1. """ logger = logging.getLogger("DNN initialization") self.n_in = int(n_in) self.n_out = int(n_out) self.n_layers = len(hidden_layer_size) self.dropout_rate = dropout_rate self.optimizer = optimizer self.loss_function = loss_function self.is_train = T.iscalar('is_train') self.rnn_batch_training = rnn_batch_training assert len(hidden_layer_size) == len(hidden_layer_type) self.list_of_activations = [ 'TANH', 'SIGMOID', 'SOFTMAX', 'RELU', 'RESU' ] if self.rnn_batch_training: self.x = T.tensor3('x') self.y = T.tensor3('y') else: self.x = T.matrix('x') self.y = T.matrix('y') self.L1_reg = L1_reg self.L2_reg = L2_reg self.rnn_layers = [] self.params = [] self.delta_params = [] rng = np.random.RandomState(123) for i in range(self.n_layers): if i == 0: input_size = n_in else: input_size = hidden_layer_size[i - 1] if i == 0: layer_input = self.x else: layer_input = self.rnn_layers[i - 1].output if hidden_layer_type[i - 1] == 'BSLSTM' or hidden_layer_type[ i - 1] == 'BLSTM': input_size = hidden_layer_size[i - 1] * 2 if hidden_layer_type[i] in self.list_of_activations: hidden_activation = hidden_layer_type[i].lower() hidden_layer = GeneralLayer(rng, layer_input, input_size, hidden_layer_size[i], activation=hidden_activation, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'TANH_LHUC': hidden_layer = SigmoidLayer_LHUC(rng, layer_input, input_size, hidden_layer_size[i], activation=T.tanh, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'SLSTM': hidden_layer = SimplifiedLstm( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'SGRU': hidden_layer = SimplifiedGRU( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'GRU': hidden_layer = GatedRecurrentUnit( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_NFG': hidden_layer = LstmNFG( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_NOG': hidden_layer = LstmNOG( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_NIG': hidden_layer = LstmNIG( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, 
rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_NPH': hidden_layer = LstmNoPeepholes( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM': hidden_layer = VanillaLstm( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'BSLSTM': hidden_layer = BidirectionSLstm( rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'BLSTM': hidden_layer = BidirectionLstm( rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'RNN': hidden_layer = VanillaRNN( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_LHUC': hidden_layer = VanillaLstm_LHUC( rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) else: logger.critical( "This hidden layer type: %s is not supported right now! \n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n" % (hidden_layer_type[i])) sys.exit(1) self.rnn_layers.append(hidden_layer) self.params.extend(hidden_layer.params) input_size = hidden_layer_size[-1] if hidden_layer_type[-1] == 'BSLSTM' or hidden_layer_type[ -1] == 'BLSTM': input_size = hidden_layer_size[-1] * 2 output_activation = output_type.lower() if output_activation == 'linear': self.final_layer = LinearLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out) elif output_activation == 'recurrent': self.final_layer = RecurrentOutputLayer( rng, self.rnn_layers[-1].output, input_size, self.n_out, rnn_batch_training=self.rnn_batch_training) elif output_type.upper() in self.list_of_activations: self.final_layer = GeneralLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, activation=output_activation) else: logger.critical( "This output layer type: %s is not supported right now! 
\n Please use one of the following: LINEAR, BSLSTM\n" % (output_type)) sys.exit(1) self.params.extend(self.final_layer.params) self.updates = {} for param in self.params: self.updates[param] = theano.shared( value=np.zeros(param.get_value(borrow=True).shape, dtype=theano.config.floatX), name='updates') if self.loss_function == 'CCE': self.finetune_cost = self.categorical_crossentropy_loss( self.final_layer.output, self.y) self.errors = self.categorical_crossentropy_loss( self.final_layer.output, self.y) elif self.loss_function == 'Hinge': self.finetune_cost = self.multiclass_hinge_loss( self.final_layer.output, self.y) self.errors = self.multiclass_hinge_loss(self.final_layer.output, self.y) elif self.loss_function == 'MMSE': if self.rnn_batch_training: self.y_mod = T.reshape(self.y, (-1, n_out)) self.final_layer_output = T.reshape(self.final_layer.output, (-1, n_out)) nonzero_rows = T.any(self.y_mod, 1).nonzero() self.y_mod = self.y_mod[nonzero_rows] self.final_layer_output = self.final_layer_output[nonzero_rows] self.finetune_cost = T.mean( T.sum((self.final_layer_output - self.y_mod)**2, axis=1)) self.errors = T.mean( T.sum((self.final_layer_output - self.y_mod)**2, axis=1)) else: self.finetune_cost = T.mean( T.sum((self.final_layer.output - self.y)**2, axis=1)) self.errors = T.mean( T.sum((self.final_layer.output - self.y)**2, axis=1))
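The batched MMSE branch above flattens the (batch, time, dim) targets, drops all-zero padding frames via `T.any(...).nonzero()`, and averages the per-frame squared error over the remaining rows. A small self-contained sketch of that masking, with an illustrative n_out=3:

# Masked MMSE over non-padding frames, as in the rnn_batch_training branch.
import numpy as np
import theano
import theano.tensor as T

y = T.tensor3('y')
out = T.tensor3('out')
n_out = 3

y_mod = T.reshape(y, (-1, n_out))
out_mod = T.reshape(out, (-1, n_out))
keep = T.any(y_mod, 1).nonzero()          # rows that are not all-zero padding
cost = T.mean(T.sum((out_mod[keep] - y_mod[keep]) ** 2, axis=1))
mmse = theano.function([out, y], cost)

yv = np.zeros((2, 4, 3), dtype=theano.config.floatX)
yv[0, :2] = 1.
ov = np.ones((2, 4, 3), dtype=theano.config.floatX)
print(mmse(ov, yv))   # only the two non-padding frames count -> 0.0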
def test_value_s(self): "tests that the value of the kl divergence decreases with each update to s_i " model = self.model e_step = self.e_step X = self.X assert X.shape[0] == self.m init_H = e_step.init_H_hat(V=X) init_Mu1 = e_step.init_S_hat(V=X) prev_setting = config.compute_test_value config.compute_test_value = 'off' H, Mu1 = function([], outputs=[init_H, init_Mu1])() config.compute_test_value = prev_setting H = broadcast(H, self.m) Mu1 = broadcast(Mu1, self.m) H = np.cast[config.floatX](self.model.rng.uniform(0., 1., H.shape)) Mu1 = np.cast[config.floatX](self.model.rng.uniform( -5., 5., Mu1.shape)) H_var = T.matrix(name='H_var') H_var.tag.test_value = H Mu1_var = T.matrix(name='Mu1_var') Mu1_var.tag.test_value = Mu1 idx = T.iscalar() idx.tag.test_value = 0 S = e_step.infer_S_hat(V=X, H_hat=H_var, S_hat=Mu1_var) s_idx = S[:, idx] s_i_func = function([H_var, Mu1_var, idx], s_idx) sigma0 = 1. / model.alpha Sigma1 = e_step.infer_var_s1_hat() mu0 = T.zeros_like(model.mu) #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1 # (they don't affect the outcome of this test and some of them are intractable ) trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \ model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) trunc_kl_func = function([H_var, Mu1_var], trunc_kl) for i in xrange(self.N): prev_kl = trunc_kl_func(H, Mu1) Mu1[:, i] = s_i_func(H, Mu1, i) new_kl = trunc_kl_func(H, Mu1) increase = new_kl - prev_kl mx = increase.max() if mx > 1e-3: raise Exception( 'after mean field step in s, kl divergence should decrease, but some elements increased by as much as ' + str(mx) + ' after updating s_' + str(i))
from __future__ import print_function
import numpy as np
import theano
import theano.tensor as T

N = T.iscalar('N')

def calc(n, fn1, fn2):
    return fn1 + fn2, fn1

outputs, _ = theano.scan(fn=calc,
                         sequences=T.arange(N),
                         n_steps=N,
                         outputs_info=[1., 1.])
fibonacci = theano.function(inputs=[N], outputs=outputs)
print(fibonacci(8))
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, slb_dim, slb_lstm_dim, slb_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, pos_dim, lexicon_dim, training=True, **kwargs): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_tags = len(self.id_to_tag) # Number of features if slb_dim: n_slbs = len(self.id_to_slb) if char_dim: n_chars = len(self.id_to_char) if pos_dim: n_pos = len(self.id_to_pos) + 2 if lexicon_dim: n_lex = lexicon_dim # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') if slb_dim: slb_for_ids = T.imatrix(name='slb_for_ids') if slb_lstm_dim: slb_rev_ids = T.imatrix(name='slb_rev_ids') if slb_bidirect: slb_pos_ids = T.ivector(name='slb_pos_ids') if char_dim: char_for_ids = T.imatrix(name='char_for_ids') if char_lstm_dim: char_rev_ids = T.imatrix(name='char_rev_ids') if char_bidirect: char_pos_ids = T.ivector(name='char_pos_ids') if pos_dim: pos_ids = T.ivector(name='pos_ids') if lexicon_dim: lex_ids = T.fmatrix(name='lex_ids') tag_ids = T.ivector(name='tag_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings...' pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 # if emb_invalid > 0: # print 'WARNING: %i invalid lines' % emb_invalid # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: # print word new_weights[i] = pretrained[word] word_layer.embeddings.set_value(new_weights) # # Syllable inputs # if slb_dim: slb_layer = EmbeddingLayer(n_slbs, slb_dim, name='slb_layer') if slb_lstm_dim: input_dim += slb_lstm_dim slb_lstm_for = LSTM(slb_dim, slb_lstm_dim, with_batch=True, name='slb_lstm_for') slb_lstm_rev = LSTM(slb_dim, slb_lstm_dim, with_batch=True, name='slb_lstm_rev') slb_lstm_for.link(slb_layer.link(slb_for_ids)) slb_lstm_rev.link(slb_layer.link(slb_rev_ids)) slb_for_input = slb_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), slb_pos_ids] slb_rev_input = slb_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), slb_pos_ids] inputs.append(slb_for_input) if slb_bidirect: inputs.append(slb_rev_input) input_dim += slb_lstm_dim # # Chars inputs # if char_dim: char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') if char_lstm_dim: input_dim += char_lstm_dim char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_input = char_lstm_for.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] char_rev_input = char_lstm_rev.h.dimshuffle( (1, 0, 2))[T.arange(s_len), char_pos_ids] inputs.append(char_for_input) if char_bidirect: inputs.append(char_rev_input) input_dim += char_lstm_dim # # PoS & Lexicon feature # if pos_dim: input_dim += pos_dim pos_layer = EmbeddingLayer(n_pos, 
pos_dim, name='pos_layer') inputs.append(pos_layer.link(pos_ids)) if lexicon_dim: input_dim += lexicon_dim lex_layer = HiddenLayer(n_lex, lexicon_dim, name='lex_layer', activation=None) inputs.append(lex_layer.link(lex_ids)) # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) else: inputs = inputs[0] # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate([word_for_output, word_rev_output], axis=1) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1) observations = T.concatenate([b_s, observations, e_s], axis=0) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1]].sum() all_paths_scores = forward(observations, transitions) cost = -(real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if slb_dim: self.add_component(slb_layer) params.extend(slb_layer.params) if slb_lstm_dim: self.add_component(slb_lstm_for) params.extend(slb_lstm_for.params) if slb_bidirect: self.add_component(slb_lstm_rev) params.extend(slb_lstm_rev.params) if char_dim: self.add_component(char_layer) params.extend(char_layer.params) if char_lstm_dim: self.add_component(char_lstm_for) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if pos_dim: self.add_component(pos_layer) params.extend(pos_layer.params) if lexicon_dim: self.add_component(lex_layer) params.extend(lex_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if slb_dim: 
eval_inputs.append(slb_for_ids) if slb_lstm_dim: if slb_bidirect: eval_inputs.append(slb_rev_ids) eval_inputs.append(slb_pos_ids) if char_dim: eval_inputs.append(char_for_ids) if char_lstm_dim: if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if pos_dim: eval_inputs.append(pos_ids) if lexicon_dim: eval_inputs.append(lex_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates( lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function(inputs=train_inputs, outputs=cost, updates=updates, givens=({ is_train: np.cast['int32'](1) } if dropout else {})) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function(inputs=eval_inputs, outputs=tags_scores, givens=({ is_train: np.cast['int32'](0) } if dropout else {})) else: f_eval = theano.function(inputs=eval_inputs, outputs=forward( observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({ is_train: np.cast['int32'](0) } if dropout else {})) return f_train, f_eval
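When `crf` is set, `f_eval` above asks the project's `forward` for the Viterbi-decoded best sequence (`viterbi=True, return_best_sequence=True`). That routine is not shown here; the following is a hedged numpy sketch of a standard Viterbi decode over the same `observations`/`transitions` layout, which may differ from the project's implementation:

# Viterbi decode over padded observations (begin/end rows included).
import numpy as np

def viterbi_decode(observations, transitions):
    # observations: (s_len + 2, n_tags + 2), transitions: (n_tags + 2, n_tags + 2)
    s_len_p2, n = observations.shape
    score = observations[0]
    backptr = np.zeros((s_len_p2, n), dtype=np.int32)
    for t in range(1, s_len_p2):
        cand = score[:, None] + transitions + observations[t][None, :]
        backptr[t] = cand.argmax(axis=0)   # best previous state per current state
        score = cand.max(axis=0)
    best = [int(score.argmax())]
    for t in range(s_len_p2 - 1, 0, -1):
        best.append(int(backptr[t, best[-1]]))
    best.reverse()
    return best[1:-1]   # strip the begin/end states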