def build_and_train_model(self,n_hu,n_hl):
		print('Building Model')

		input_phrase = T.imatrix('train_inputmatrix')
		labels = T.imatrix('trainphrase_matrix')

		network = self.define_layers(input_phrase,labels,n_hu,n_hl)

		print("Defining loss")
		#Prediction or loss
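		# clip predicted probabilities away from exactly 0 and 1 so the loss below stays numerically stable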
		prediction = []
		prediction.append(T.clip(lasagne.layers.get_output(network[0]),1.0e-7,1.0-1.0e-7))
		prediction.append(T.clip(lasagne.layers.get_output(network[1]),1.0e-7,1.0-1.0e-7))

		loss = l.define_loss(prediction[0],prediction[1])
		self.model = network
		#define params
		params = lasagne.layers.get_all_params(network)
		updates = lasagne.updates.adadelta(loss,params)

		#run test

		train_fn = theano.function([input_phrase,labels],[loss, prediction[0], prediction[1]],updates=updates,allow_input_downcast=True)

		print("Model and params defined, now training")
		epoch = 0
		for epoch in range(self.end_epoch):
			train_loss = 0
			train_pred = []
			start_time = time.time()
			loss, predicted, phrase = train_fn(self.train_inputmatrix,self.trainphrase_matrix)
			print('Training Loss: ' + str(loss) + ' Train Epoch ' + str(epoch))
			self.save_best(loss,predicted,network)
Example #2
    def _classify(self,dataset_static,dataset_nonstatic):
        """
        Classify method for static or non-static models.
        :param dataset_static: word indices fed to the static-embedding branch
        :param dataset_nonstatic: word indices fed to the non-static-embedding branch
        Uses instance attributes: classifier (model), conv_layers (list of
        convPoolLayer objects), Words_static/Words_nonstatic (word-index to
        word-vector lookup tables) and img_h (sentence length after padding).
        :return: [y_pred, prob_pred] -- the predicted class and the probability for each class
        """
        x_static = T.imatrix('x_static')
        x_nonstatic = T.imatrix('x_nonstatic')
        y = T.ivector('y')
        Words_static = theano.shared(value = self.Words_static, name = "Words_static")
        Words_nonstatic = theano.shared(value = self.Words_nonstatic, name = "Words_nonstatic")

        test_pred_layers = []
        test_size = np.shape(dataset_static)[0]
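        # look up the word vectors for each sentence and reshape to (test_size, 1, img_h, embedding dim) as input to the conv layers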
        test_layer0_input_static = Words_static[T.cast(x_static.flatten(),dtype="int32")].reshape((test_size,1,self.img_h,self.Words_static.shape[1]))
        test_layer0_input_nonstatic = Words_nonstatic[T.cast(x_nonstatic.flatten(),dtype="int32")].reshape((test_size,1,self.img_h,self.Words_nonstatic.shape[1]))
        for i in range(len(self.conv_layers)//2):
            test_layer0_output = self.conv_layers[i].predict(test_layer0_input_nonstatic, test_size)
            test_pred_layers.append(test_layer0_output.flatten(2))
        for i in range(len(self.conv_layers)//2,len(self.conv_layers)):
            test_layer0_output = self.conv_layers[i].predict(test_layer0_input_static, test_size)
            test_pred_layers.append(test_layer0_output.flatten(2))

        test_layer1_input = T.concatenate(test_pred_layers, 1)
        test_y_pred = self.classifier.predict(test_layer1_input)
        test_prob_pred = self.classifier.predict_p(test_layer1_input)
        test_model_all = theano.function([x_static,x_nonstatic], (test_y_pred,test_prob_pred))

        return test_model_all(dataset_static,dataset_nonstatic)
Example #3
def run():
    batch_size = 16
    prems = np.random.randint(low=0, high=99, size=(batch_size, 5), dtype='int32')
    hypoes = np.random.randint(low=0, high=99, size=(batch_size, 3), dtype='int32')
    labels = np.random.randint(low=0, high=3, size=(batch_size,), dtype='int32')
    print prems
    print hypoes
    print labels

    ematrix = np.random.uniform(low=-1, high=1, size=(100, 100)).astype(theano.config.floatX)

    t_prems = T.imatrix('p')
    t_hypoes = T.imatrix('h')
    t_ematrix = theano.shared(ematrix, 't_ematrix')

    r_prems = T.repeat(t_prems, 3, axis= 1)
    r_hypoes = T.concatenate([t_hypoes]* 5, axis=1)
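    # r_prems/r_hypoes pair each of the 5 premise ids with each of the 3 hypothesis ids, so both become (batch, 15) index matrices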

    batch_prems = t_ematrix[r_prems]
    batch_hypoes = t_ematrix[r_hypoes]

    batch_prem_hypo = T.concatenate((batch_prems, batch_hypoes), axis=2)

    get_b_prems = theano.function(inputs=[t_prems], outputs=batch_prems)
    get_r_prems = theano.function(inputs=[t_prems], outputs=r_prems)
    get_b_hypoes = theano.function(inputs=[t_hypoes], outputs=batch_hypoes)
    get_r_hypoes = theano.function(inputs=[t_hypoes], outputs=r_hypoes)
    get_b_ph = theano.function(inputs=[t_prems, t_hypoes], outputs=batch_prem_hypo)

    # print get_b_prems(prems)
    print get_r_prems(prems)
    print get_r_hypoes(hypoes)

    print get_b_prems(prems).shape
    print get_b_hypoes(hypoes).shape

    print get_b_ph(prems, hypoes).shape

    W = theano.shared(
        value=np.random.uniform(
            low=-np.sqrt(1. / 6),
            high=np.sqrt(1. / 6),
            size=(200, 400)
        ).astype(theano.config.floatX),
        name='W'
    )

    U = theano.shared(
        value=np.random.uniform(
            low=-np.sqrt(1. / 6),
            high=np.sqrt(1. / 6),
            size=(400,)
        ).astype(theano.config.floatX),
        name='U'
    )

    result = T.dot(T.dot(batch_prem_hypo, W), U)

    get_result = theano.function(inputs=[t_prems, t_hypoes], outputs=result)
    print get_result(prems, hypoes).shape
Example #4
	def train_ready(self):
		print "adopt softmax model plus contractive regularization ........ "
		print "weight 1   : "+str(self.lowreg_weight)
		print "weight 2   : "+str(self.highreg_weight)
		print "variance   : "+str(self.variance)
		print "nc  : "+str(self.nc)

		var_x = T.imatrix()
		var_y = T.imatrix()

		loss = self.reg_logp(var_x,var_y, self.lowreg_weight, self.highreg_weight, self.variance, self.nc)
	

		witems = self.w.values()
		#ave_w = sum(T.sum(item**2) for item in witems)/len(witems)
		wg = T.grad(loss, witems)
		#ave_g = sum(T.sum(item**2) for item in wg) /len(wg)

		weight_up = self.upda(wg, witems, self.lrate, self.mweight, self.opt, self.gradbound)

		if not self.fix_emb:
			dicitems = self.dic.values()
			dg = T.grad(loss, dicitems)

			dic_up = self.upda(dg, dicitems, self.lrate/10., self.mweight, self.opt)
			weight_up.update(dic_up)

		up  = weight_up

		self.updatefunc = theano.function([var_x, var_y], loss,updates = up)
def create_model(num_timesteps, num_blocks, hidden_size, learning_rate, \
    grad_clip=10, dropout_p=0.5, num_lstm_layers=1, use_forward_and_backward_lstm=False):
    '''
     returns train function which reports both loss and accuracy
     and test function, which also reports both loss and accuracy
    '''
    
    l_in, l_mask, l_out, l_out_slice, l_lstm, l_lstm_slice = \
    _build_net_layers(num_timesteps, num_blocks, hidden_size, learning_rate, \
        grad_clip, dropout_p, num_lstm_layers, use_forward_and_backward_lstm)

    inp = T.tensor3('input')
    truth = T.imatrix("truth")
    mask = T.imatrix("mask")

    # pred should be of shape (batchsize, num_timesteps, num_asts)
    pred = lasagne.layers.get_output(l_out)
    # pred_slice should be of shape (batchsize, num_asts), only contains
    # predictions for the last timestep
    pred_slice = lasagne.layers.get_output(l_out_slice)
    # the hidden representations for the last timestep (batchsize, hidden_size)
    hidden_slice = lasagne.layers.get_output(l_lstm_slice)
    # truth should also be of shape (batchsize, num_timesteps, num_asts)

    pred_2d = pred.reshape((-1, num_blocks))
    truth_1d = truth.reshape((-1,))
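    # flattening to 2-D predictions and 1-D targets lets categorical_crossentropy below score every timestep of every sequence at once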

    # pred_2d_shape = T.shape(pred_2d)
    # truth_1d_shape = T.shape(truth_1d)

    # categorical_crossentropy
    loss = T.nnet.categorical_crossentropy(pred_2d, truth_1d).mean()
    # categorical accuracy
    # acc = T.nnet.categorical_crossentropy(pred_2d, truth_1d).mean()
    acc = lasagne.objectives.categorical_accuracy(pred_2d, truth_1d).mean()
    # update function
    print("Computing updates ...")
    all_params = lasagne.layers.get_all_params(l_out)
    updates = lasagne.updates.adam(loss, all_params, learning_rate)

    # training function
    print("Compiling functions ...")
    train_loss = theano.function([l_in.input_var, l_mask.input_var, truth], loss, updates=updates, allow_input_downcast=True)
    compute_loss = theano.function([l_in.input_var, l_mask.input_var, truth], loss, allow_input_downcast=True)
    # compute_pred returns the flattened predictions and targets (and applies updates); train_loss_acc returns loss and accuracy
    compute_pred = theano.function([l_in.input_var, l_mask.input_var, truth],  [pred_2d, truth_1d], updates=updates, allow_input_downcast=True)
    train_loss_acc = theano.function([l_in.input_var, l_mask.input_var, truth], [loss, acc, pred], updates=updates, allow_input_downcast=True)
    # computes loss and accuracy, without training
    compute_loss_acc = theano.function([l_in.input_var, l_mask.input_var, truth], [loss, acc, pred], allow_input_downcast=True)

    # In order to generate text from the network, we need the probability distribution of the next character given
    # the state of the network and the input (a seed).
    # In order to produce the probability distribution of the prediction, we compile a function called probs. 
    probs = theano.function([l_in.input_var, l_mask.input_var], pred_slice, allow_input_downcast=True)

    generate_hidden_representations = theano.function([l_in.input_var, l_mask.input_var], hidden_slice, allow_input_downcast=True)

    print("Compiling done!")
    
    return train_loss_acc, compute_loss_acc, probs, generate_hidden_representations, compute_pred, l_out
 def __init__( self,
               config,
               qvocab_len,
               max_qlen,
               num_ans,
               num_qtypes,  
               l_saver):
     self.config                     = config
     self.qn                         = T.imatrix()
     self.lstm_mask                  = T.imatrix()
     self.iX                         = T.fmatrix()
     self.Y                          = T.ivector()
     self.qtype                      = T.ivector()
     self.sparse_indices             = T.ivector()
     self.qembd                      = T.fmatrix()
     self.ql_out                     = T.fmatrix()
     self.timer                      = l.timer_type()
     self.saver, self.exp_saver      = l_saver
     self.qlstm_hidden_dim           = 300 
     self.qn_classifier_emb_size     = 75
     self.max_ql                     = max_qlen
     self.qvocab_len                 = qvocab_len 
     self.bptt_trunk_steps           = -1 
     self.mlp_input_dim              = 1024
     self.num_qtypes                 = num_qtypes
     self.num_ans                    = num_ans
     self.grad_clip                  = config['grad_clip']
     self.params                     = {}
     print "Models Initialization done ..."
Example #7
    def test_sparseblockgemvF(self):
        """
            Test the fortran order for W (which can happen in the grad for some
            graphs).
        """
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0),
                         tensor.DimShuffle((False, False, False, False),
                                           (0, 1, 3, 2))
                         (tensor.as_tensor_variable(W)),
                         h, iIdx, oIdx)

        f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val,
                   oIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)
Example #8
 def __init__(self, size_vocab, size_embed, size, size_out, depth, network, 
              alpha=0.5,
              gru_activation=clipped_rectify, 
              visual_activation=linear, 
              visual_encoder=StackedGRUH0, 
              cost_visual=CosineDistance,
              max_norm=None, 
              lr=0.0002, 
              dropout_prob=0.0):
     autoassign(locals())
     self.network = network(self.size_vocab, 
                            self.size_embed, 
                            self.size, 
                            self.size_out, 
                            self.depth,
                            gru_activation=self.gru_activation, 
                            visual_activation=self.visual_activation,
                            visual_encoder=self.visual_encoder,
                            dropout_prob=self.dropout_prob)
                            
     self.input         = T.imatrix()
     self.output_t_prev = T.imatrix()
     self.output_t      = T.imatrix()
     self.output_v      = T.fmatrix()
     self.OH       = OneHot(size_in=self.size_vocab)
     self.output_t_oh   = self.OH(self.output_t)
     self.updater = util.Adam(max_norm=self.max_norm, lr=self.lr)
     self.train = self._make_train()
     self.loss_test = self._make_loss_test()
def set_model(argv, vocab_word, init_emb):
    x_span = T.imatrix("x_span")
    x_word = T.imatrix("x_word")
    x_ctx = T.imatrix("x_ctx")
    x_dist = T.imatrix("x_dist")
    x_slen = T.imatrix("x_slen")
    y = T.ivector("y")

    """ Set params for the model """
    n_vocab = vocab_word.size()
    dim_x_word = argv.emb
    dim_x_dist = 10  # (0, ..., 10-)
    dim_h = argv.hidden
    L2_reg = argv.reg

    """ Instantiate the model """
    return Model(
        x_span=x_span,
        x_word=x_word,
        x_ctx=x_ctx,
        x_dist=x_dist,
        x_slen=x_slen,
        y=y,
        init_emb=init_emb,
        n_vocab=n_vocab,
        dim_w_p=dim_x_word,
        dim_d=dim_x_dist,
        dim_h=dim_h,
        L2_reg=L2_reg,
    )
Example #10
def run():
    # params
    dims = 10
    negrate = 1
    batsize = 300
    epochs = 300

    #paths
    datafileprefix = "../../data/nycfilms/"
    dirfwdsuffix = "direct_forward.plustypes.ssd"

    # get the data and split
    dirfwdf = open(datafileprefix+dirfwdsuffix)
    datadf = readdata(dirfwdf)
    traind, validd, testd = datadf.split((70, 15, 15), random=True)

    numents = int(datadf.ix[:, 0].max())+1
    print numents
    numrels = int(datadf.ix[:, 1].max())+1
    print numrels

    # define model
    inp = Input(T.imatrix())

    eemb = VectorEmbed.indim(numents).outdim(dims).Wreg(l2reg(0.00001))()
    remb = VectorEmbed.indim(numrels).outdim(dims).Wreg(l2reg(0.00001))()

    # for debugging
    eembd = SymTensor(T.fmatrix())
    rembd = SymTensor(T.fmatrix())
    dotp = SymTensor(T.fmatrix())

    out = ((inp[:, 0] >> eemb >> eembd) & (inp[:, 1] >> remb >> rembd)) >> DotProduct() >> dotp >> Tanh()

    # for plotting purposes: relation to relation dot product (or relation-type)
    r2rinp = Input(T.imatrix())
    rel2rel = ((r2rinp[:, 0] >> remb) & (r2rinp[:, 1] >> remb)) >> DotProduct()

    outtest = Output(T.fvector())

    loss = (out & outtest) >> HingeLoss()
    trainer = Trainer\
        .batsize(batsize)\
        .epochs(epochs)\
        .onrun(getonrun())\
        .offrun(offrun)\
        .offepoch(getoffepoch(out, rel2rel))\
        .onbatch(getonbatch(negrate, numents, numrels))\
        .optimizer(sgd(lr=1.))\
        .batchtransformer(transbat)
    trainer\
        .loss(loss)

    trainer.train(traind.values, validd.values)\
           .test(testd.values)

    explore(eemb, remb)
    # functions for interactive exploration

    embed()
    def train_model_func(self, batch_size, num_batches, summary_sz, input_sz):
        summaries = T.imatrix('summaries')
        docs = T.imatrix('docs')

        s = np.zeros((batch_size * num_batches, summary_sz))
        d = np.zeros((batch_size * num_batches, input_sz))

        summary_superbatch = theano.shared( s.astype(theano.config.floatX),
                                           name = 's_summs', borrow = True )
        doc_superbatch = theano.shared( d.astype(theano.config.floatX),
                                        name = 's_docs', borrow = True )

        self.ssb = summary_superbatch
        self.dsb = doc_superbatch

        cost = self.negative_log_likelihood_batch(docs, summaries, batch_size)
        regularization_cost = self.l2_coefficient * sum([(p ** 2).sum() for p in self.params])

        self.get_batch_cost_unregularized = theano.function([docs, summaries], cost, allow_input_downcast=True)
        #theano.printing.debugprint(cost)
        cost = cost + regularization_cost

        params = {p.name: p for p in self.params} 
        grads = T.grad(cost, self.params)
        #grads = theano.printing.Print("grads")(grads)

        # learning rate
        lr = T.scalar(name='lr')
        gradient_update = optimisers.sgd_(lr, self.params, grads, docs, summaries,
                                                                cost, self.dsb, self.ssb, batch_size)
        return gradient_update
Example #12
    def build_model_1(self):

        x = T.imatrix('x').astype(theano.config.floatX)
        drop_masks = T.imatrix('drop_masks').astype(theano.config.floatX)
        y = T.ivector('y')

        self.layers[0] = LSTMLayer(random_state=self.random_state,input=x,drop_masks=drop_masks,input_dim=self.input_dim,output_dim=self.hidden_dims[0])
        params = self.layers[0].params

        self.layers[1] = OutputLayer(input=self.layers[0].output,
                                     input_dim=self.layers[0].output_dim, output_dim=self.output_dim, random_state=self.random_state)

        params += self.layers[1].params
        _EPSILON = 10e-8

        L1 = 0.001 * T.sum([T.sum(T.abs_(param)) for param in params])
        L2 = 0.001 * T.sum([T.sum(param ** 2) for param in params])
        cost = T.sum(T.nnet.categorical_crossentropy(T.clip(self.layers[self.number_of_layers].probabilities[-1], _EPSILON, 1.0 - _EPSILON),y)) + L1 + L2

        #grads = T.grad(cost, params)

        #updates = [(param_i, param_i - self.learning_rate * grad_i) for param_i,grad_i in zip(params,grads)]
        updates =  LearningAlgorithms.adam(cost,params,learning_rate=0.001)

        self.sgd_step = theano.function([x,drop_masks, y], L1, updates=updates)
        self.predict = theano.function([x,drop_masks],self.layers[self.number_of_layers].probabilities[-1])

        self.test_model = theano.function([x,drop_masks, y], cost)
Example #13
    def _make_stack(self, seq_length=4):
        self.embedding_dim = embedding_dim = 3
        self.vocab_size = vocab_size = 10
        self.seq_length = seq_length

        def compose_network(inp, inp_dim, outp_dim, vs, name="compose"):
            # Just add the two embeddings!
            W = T.concatenate([T.eye(outp_dim), T.eye(outp_dim)], axis=0)
            return inp.dot(W)

        X = T.imatrix("X")
        transitions = T.imatrix("transitions")
        apply_dropout = T.scalar("apply_dropout")
        vs = VariableStore()
        self.stack = HardStack(
            embedding_dim,
            embedding_dim,
            vocab_size,
            seq_length,
            compose_network,
            IdentityLayer,
            apply_dropout,
            vs,
            X=X,
            transitions=transitions,
            make_test_fn=True,
        )

        # Swap in our own dummy embeddings and weights.
        embeddings = np.arange(vocab_size).reshape((vocab_size, 1)).repeat(embedding_dim, axis=1)
        self.stack.embeddings.set_value(embeddings)
Example #14
 def build_model(self):
   print '\n... building the model with unroll=%d, backroll=%d' \
     % (self.source.unroll, self.source.backroll)
   x = T.imatrix('x')
   y = T.imatrix('y')
   reset = T.scalar('reset')
   hiddens = [h['init'] for h in self.hiddens.values()]
   outputs_info = [None] * 3 + hiddens
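   # scan one timestep at a time over (x, y); losses, probs and errors are non-recurrent outputs (None), while the hidden states are fed back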
   [losses, probs, errors, hids], updates = \
     theano.scan(self.step, sequences=[x, y], outputs_info=outputs_info)
   loss = losses.sum()
   error = errors.sum() / T.cast((T.neq(y, 255).sum()), floatX)
   hidden_updates_train = []
   hidden_updates_test = []
   for h in self.hiddens.values():
     h_train = ifelse(T.eq(reset, 0), \
       hids[-1-self.source.backroll, :], T.ones_like(h['init']))
     h_test = ifelse(T.eq(reset, 0), \
       hids[-1, :], T.ones_like(h['init']))
     hidden_updates_train.append((h['init'], h_train))
     hidden_updates_test.append((h['init'], h_test))
   updates = self.source.get_updates(loss, self.sgd_params)
   updates += hidden_updates_train
   rets = [loss, probs[-1, :], error]
   mode = theano.Mode(linker='cvm')
   train_model = theano.function([x, y, reset, self.lr], rets, \
     updates=updates, mode=mode)
   test_model = theano.function([x, y, reset], rets, \
     updates=hidden_updates_test, mode=mode)
   return train_model, test_model
    def setup_encode(self):

        # dimensions: (batch, time, 12)
        chord_types = T.btensor3()
        # dimensions: (batch, time)
        chord_roots = T.imatrix()
        # dimensions: (batch, time)
        relative_posns = [T.imatrix() for _ in self.encodings]
        # dimensions: (batch, time, output_data)
        encoded_melodies = [T.btensor3() for _ in self.encodings]
        n_batch, n_time = chord_roots.shape

        all_activations = []
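        # run each encoder LSTM stack over its own melody encoding and relative-position stream, conditioned on the shared chord inputs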
        for encoding, enc_lstmstack, encoded_melody, relative_pos in zip(self.encodings, self.enc_lstmstacks, encoded_melodies, relative_posns):
            activations = enc_lstmstack.do_preprocess_scan( timestep=T.tile(T.arange(n_time), (n_batch,1)) ,
                                                        relative_position=relative_pos,
                                                        cur_chord_type=chord_types,
                                                        cur_chord_root=chord_roots,
                                                        cur_input=encoded_melody,
                                                        deterministic_dropout=True )
            all_activations.append(activations)
        reduced_activations = functools.reduce((lambda x,y: x+y), all_activations)
        strengths, vects = self.qman.get_strengths_and_vects(reduced_activations)

        self.encode_fun = theano.function(
            inputs=[chord_types, chord_roots] + relative_posns + encoded_melodies,
            outputs=[strengths, vects],
            allow_input_downcast=True,
            mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))
 def _get_input_tensor_variables(self):
     # x_w: 1D: batch, 2D: n_words, 3D: 5 + window; word id
     # x_p: 1D: batch, 2D: n_words; posit id
     # y: 1D: batch, 2D: n_words; label id
     if self.argv.mark_phi:
         return [T.itensor3('x_w'), T.imatrix('x_p'), T.imatrix('y')]
     return [T.itensor3('x_w'), T.imatrix('y')]
Example #17
    def build(self):
        """build the model. This method should be called after self.add_data.
        """
        x_sym = sparse.csr_matrix('x', dtype = 'float32')
        y_sym = T.imatrix('y')
        g_sym = T.imatrix('g')
        gy_sym = T.vector('gy')
        ind_sym = T.ivector('ind')

        l_x_in = lasagne.layers.InputLayer(shape = (None, self.x.shape[1]), input_var = x_sym)
        l_g_in = lasagne.layers.InputLayer(shape = (None, 2), input_var = g_sym)
        l_ind_in = lasagne.layers.InputLayer(shape = (None, ), input_var = ind_sym)
        l_gy_in = lasagne.layers.InputLayer(shape = (None, ), input_var = gy_sym)

        num_ver = max(self.graph.keys()) + 1
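        # g holds vertex pairs: column 0 is embedded as the input vertex, column 1 as the output (context) vertex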
        l_emb_in = lasagne.layers.SliceLayer(l_g_in, indices = 0, axis = 1)
        l_emb_in = lasagne.layers.EmbeddingLayer(l_emb_in, input_size = num_ver, output_size = self.embedding_size)
        l_emb_out = lasagne.layers.SliceLayer(l_g_in, indices = 1, axis = 1)
        if self.neg_samp > 0:
            l_emb_out = lasagne.layers.EmbeddingLayer(l_emb_out, input_size = num_ver, output_size = self.embedding_size)

        l_emd_f = lasagne.layers.EmbeddingLayer(l_ind_in, input_size = num_ver, output_size = self.embedding_size, W = l_emb_in.W)
        l_x_hid = layers.SparseLayer(l_x_in, self.y.shape[1], nonlinearity = lasagne.nonlinearities.softmax)
        
        if self.use_feature:
            l_emd_f = layers.DenseLayer(l_emd_f, self.y.shape[1], nonlinearity = lasagne.nonlinearities.softmax)
            l_y = lasagne.layers.ConcatLayer([l_x_hid, l_emd_f], axis = 1)
            l_y = layers.DenseLayer(l_y, self.y.shape[1], nonlinearity = lasagne.nonlinearities.softmax)
        else:
            l_y = layers.DenseLayer(l_emd_f, self.y.shape[1], nonlinearity = lasagne.nonlinearities.softmax)

        py_sym = lasagne.layers.get_output(l_y)
        loss = lasagne.objectives.categorical_crossentropy(py_sym, y_sym).mean()
        if self.layer_loss and self.use_feature:
            hid_sym = lasagne.layers.get_output(l_x_hid)
            loss += lasagne.objectives.categorical_crossentropy(hid_sym, y_sym).mean()
            emd_sym = lasagne.layers.get_output(l_emd_f)
            loss += lasagne.objectives.categorical_crossentropy(emd_sym, y_sym).mean()

        if self.neg_samp == 0:
            l_gy = layers.DenseLayer(l_emb_in, num_ver, nonlinearity = lasagne.nonlinearities.softmax)
            pgy_sym = lasagne.layers.get_output(l_gy)
            g_loss = lasagne.objectives.categorical_crossentropy(pgy_sym, lasagne.layers.get_output(l_emb_out)).sum()
        else:
            l_gy = lasagne.layers.ElemwiseMergeLayer([l_emb_in, l_emb_out], T.mul)
            pgy_sym = lasagne.layers.get_output(l_gy)
            g_loss = - T.log(T.nnet.sigmoid(T.sum(pgy_sym, axis = 1) * gy_sym)).sum()

        params = [l_emd_f.W, l_emd_f.b, l_x_hid.W, l_x_hid.b, l_y.W, l_y.b] if self.use_feature else [l_y.W, l_y.b]
        if self.update_emb:
            params = lasagne.layers.get_all_params(l_y)
        updates = lasagne.updates.sgd(loss, params, learning_rate = self.learning_rate)

        self.train_fn = theano.function([x_sym, y_sym, ind_sym], loss, updates = updates, on_unused_input = 'ignore')
        self.test_fn = theano.function([x_sym, ind_sym], py_sym, on_unused_input = 'ignore')
        self.l = [l_gy, l_y]

        g_params = lasagne.layers.get_all_params(l_gy, trainable = True)
        g_updates = lasagne.updates.sgd(g_loss, g_params, learning_rate = self.g_learning_rate)
        self.g_fn = theano.function([g_sym, gy_sym], g_loss, updates = g_updates, on_unused_input = 'ignore')
Example #18
    def __theano_build__(self):
        params = self.params
        param_names = self.param_names
        hidden_dim = self.hidden_dim

        x1  = T.imatrix('x1')    # first sentence
        x2  = T.imatrix('x2')    # second sentence
        x1_mask = T.fmatrix('x1_mask')    #mask
        x2_mask = T.fmatrix('x2_mask')
        y   = T.ivector('y')     # label
        y_c = T.ivector('y_c')   # class weights 
        
        # Embedding words
        _E1 = params["E"].dot(params["W"][0]) + params["B"][0]
        _E2 = params["E"].dot(params["W"][1]) + params["B"][1]
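        # _E1/_E2 are two linear projections of the shared embedding matrix E, one per sentence; rows are looked up by word id below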
        statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim])
        statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim])
        
        def rnn_cell(x, mx, ph, Wh):
            h = T.tanh(ph.dot(Wh) + x)
            h = mx[:, None] * h + (1-mx[:, None]) * ph
            return [h] 
            
        [h1], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex1, x1_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))],
            non_sequences=params["W"][2])
        
        [h2], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex2, x2_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=h1[-1])],
            non_sequences=params["W"][3])
       
        #predict
        _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0]) + h2[-1].dot(params["lrW"][1]) + params["lrb"])
        _p = T.argmax(_s, axis=1)
        _c = T.nnet.categorical_crossentropy(_s, y)
        _c = T.sum(_c * y_c)
        _l = T.sum(params["lrW"]**2)
        _cost = _c + 0.01 * _l
        
        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')
        
        # Gradients and updates
        _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay)
        
        # Assign functions
        self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads)
        self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c)
        self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s)
        self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p)
        self.sgd_step = theano.function(
            [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay],
            updates=_updates)
    def __init__(self,rng,model_params):
        self.input = T.itensor3('input') # the data is a minibatch
        self.label = T.imatrix('label') # label's shape (mini_batch size, max_term_per_sent)
        self.sent_length= T.ivector('sent_length') # sent_length is the number of terms in each sentence
        self.masks = T.imatrix('masks') # masks which used in error and likelihood calculation

        self.core = SentenceLevelNeuralModelCore(rng,self.input,self.label,self.sent_length,self.masks,model_params)

        self.params = self.core.wordvec.params() \
                + self.core.POSvec.params() \
                + self.core.wordpos_vec.params() \
                + self.core.verbpos_vec.params() \
                + self.core.conv_word.params() \
                + self.core.conv_POS.params() \
                + self.core.conv_wordpos.params() \
                + self.core.conv_verbpos.params() \
                + self.core.hidden_layer.params

        self.L2_sqr = (self.core.wordvec.embeddings ** 2).sum() \
                + (self.core.POSvec.embeddings ** 2).sum() \
                + (self.core.wordpos_vec.embeddings ** 2).sum() \
                + (self.core.verbpos_vec.embeddings ** 2).sum() \
                + (self.core.conv_word.W ** 2).sum() \
                + (self.core.conv_POS.W ** 2).sum() \
                + (self.core.conv_wordpos.W ** 2).sum() \
                + (self.core.conv_verbpos.W ** 2).sum() \
                + (self.core.hidden_layer.W ** 2).sum()

        self.negative_log_likelihood = self.core.likelihood()
        self.errors = self.core.errors()

        # we only use L2 regularization
        self.cost = self.negative_log_likelihood \
                + self.core.L2_reg * self.L2_sqr


        self.gparams = []
        for param in self.params:
            gparam = T.grad(self.cost, param)
            self.gparams.append(gparam)

        self.updates = []

        learning_rate = model_params['learning_rate']
        for param, gparam in zip(self.params, self.gparams):
            self.updates.append((param, param - learning_rate * gparam))


        #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_word.output,on_unused_input='ignore')
        #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_POS.output,on_unused_input='ignore')
        #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_verbpos.output,on_unused_input='ignore')
        #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_wordpos.output,on_unused_input='ignore')
        #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.conv_out,on_unused_input='ignore')
        #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.max_out,on_unused_input='ignore')
        #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.hidden_layer.output,on_unused_input='ignore')
        #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.core.negative_log_likelihood,on_unused_input='ignore')
        #self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.cost,on_unused_input='ignore')
        self.train_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=self.cost,updates=self.updates,on_unused_input='ignore')
        self.valid_model = theano.function(inputs=[self.input,self.label,self.masks], outputs=[self.errors,self.core.sentce_loglikelihood.y_pred_pointwise],on_unused_input='ignore')
Example #20
 def __init__(self, config):
     autoassign(locals())
     self.updater = util.Adam(max_norm=config['max_norm'], lr=config['lr'])
     self.Decode = Decoder(config['size_vocab'],
                           config['size_embed'], config['size'], config['depth'])
     self.ToTxt  = Dense(config['size'], config['size_vocab'])
     self.inputs = [T.imatrix()]
     self.target = T.imatrix()
Example #21
def ndim_itensor(ndim, name=None):
    if ndim == 2:
        return T.imatrix(name)
    elif ndim == 3:
        return T.itensor3(name)
    elif ndim == 4:
        return T.itensor4(name)
    return T.imatrix(name=name)
Example #22
 def defmodel(self):
     pathidxs = T.imatrix("pathidxs")  # integers of (batsize, seqlen)
     zidxs = T.imatrix("zidxs")  # integers of (batsize, seqlen)
     occluder = T.imatrix("occluder")
     scores = self.definnermodel(pathidxs)  # predictions, floats of (batsize, seqlen, vocabsize)
     # probs = T.nnet.softmax(scores) # row-wise softmax; probs: (batsize, seqlen, vocabsize) #softmax doesn't work on tensor3D
     probs, _ = theano.scan(fn=T.nnet.softmax, sequences=scores, outputs_info=[None])
     return probs, zidxs, occluder, [pathidxs, zidxs, occluder]
Example #23
def create_rnn(hidden_dim, vocab_dim,mode="rnn"):
    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    # embedding lookup table
    W = LookupTable(
        name = "W1",
        #dim = hidden_dim*4,
        dim = hidden_dim,
        length = vocab_dim,
        weights_init = initialization.IsotropicGaussian(0.01),
        biases_init = initialization.Constant(0)
    )
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(
            hidden_dim, 
            name = 'H',
            weights_init = initialization.IsotropicGaussian(0.01),
            biases_init = initialization.Constant(0.0)
        )
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name = "H",
            dim = hidden_dim,
            activation = Tanh(),
            weights_init = initialization.IsotropicGaussian(0.01)
        )
    # hidden-to-vocabulary output projection
    S = Linear(
        name = "W2",
        input_dim = hidden_dim,
        output_dim = vocab_dim,
        weights_init = initialization.IsotropicGaussian(0.01),
        biases_init = initialization.Constant(0)
    )

    A = NDimensionalSoftmax(
        name = "softmax"
    )

    initLayers([W,H,S])
    activations = W.apply(x)
    hiddens = H.apply(activations)#[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)
    #print VariableFilter(roles=[WEIGHT])(cg.variables)
    #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

    layers = (x, W, H, S, A, y)

    return  cg, layers, y_hat, cost
def build_model(options):
    print('Build model...')
    sys.stdout.flush()
    weights = None
    if options['flag_random_lookup_table'] == False: weights = options['embedding']
    embed_layer = Embedding(input_dim = options['embedding'].shape[0], 
                            output_dim = options['embedding'].shape[1], 
                            weights = weights)
    dense_layers = []
    dense_layers.append(Dense(input_dim = options['embedding'].shape[1] * 2, output_dim = options['size_hidden_layer'], activation = 'tanh'))
    dense_layers.append(Dense(input_dim = options['size_hidden_layer'], output_dim = 1, activation = 'sigmoid'))
    
    # for training
    sentence1 = T.imatrix('s1')  # sentence1, n_samples * len_sentence
    sentence1_mask = T.matrix('s1_mask')
    sentence2 = T.imatrix('s2')  # sentence2, n_samples * len_sentence
    sentence2_mask = T.matrix('s2_mask')
    y = T.ivector('y1')  # n_samples
    
    embed_s1 = embed_layer.get_output(sentence1) # n_samples * len_sentence * embed_dim
    embed_s2 = embed_layer.get_output(sentence2) # n_samples * len_sentence * embed_dim
    if options['sentence_modeling'] == 'CBoW':
        embed_s1 = ave_embed(embed_s1,sentence1_mask) # n_samples * embed_dim
        embed_s2 = ave_embed(embed_s2,sentence2_mask) # n_samples * embed_dim
    elif options['sentence_modeling'] == 'CNN':
        sentence_encode_layer = Convolution1D(input_dim = options['embedding'].shape[1], activation = 'tanh',
                                nb_filter = options['embedding'].shape[1], filter_length = options['CNN_filter_length'],
                                border_mode = 'same')
        embed_s1 = CNN_embed(embed_s1,sentence1_mask,sentence_encode_layer) # n_samples * embed_dim
        embed_s2 = CNN_embed(embed_s2,sentence2_mask,sentence_encode_layer) # n_samples * embed_dim
    elif options['sentence_modeling'] == 'LSTM':
        sentence_encode_layer = LSTM(input_dim = options['embedding'].shape[1], output_dim = options['embedding'].shape[1])
        embed_s1 = LSTM_embed(embed_s1,sentence1_mask,sentence_encode_layer,options) # n_samples * embed_dim
        embed_s2 = LSTM_embed(embed_s2,sentence2_mask,sentence_encode_layer,options) # n_samples * embed_dim
    else:
        print 'Error: No model called %s available!' % options['sentence_modeling']
        return
    
    output = T.concatenate([embed_s1,embed_s2],axis = -1) # n_samples * (embed_dim * 2)
    
    if options['flag_dropout'] == True:
        output = dropout(output, level=options['dropoutRates'])
    for dense_layer in dense_layers:
        output = dense_layer.get_output(output)
    f_pred = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask],output, allow_input_downcast=True)
    
    output = output.reshape((output.shape[0],))
    #y = y.reshape((output.shape[0],1))
    cost = T.nnet.binary_crossentropy(output, y).mean()
    f_debug = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y],[output,y,T.nnet.binary_crossentropy(output, y),cost], allow_input_downcast=True)
    tparams = []
    tparams += embed_layer.params
    if options['sentence_modeling'] != 'CBoW':
        tparams += sentence_encode_layer.params
    for dense_layer in dense_layers: tparams += dense_layer.params
    return sentence1,sentence1_mask,sentence2,sentence2_mask,y,cost,f_pred,tparams,f_debug
    
Example #25
	def __init__(self, voca_size, hidden_size, ydim, num_layers=2, learning_rate=0.1):
		self.hidden_size = hidden_size
		self.n_out = ydim
		self.learning_rate = learning_rate
		self.num_layers = num_layers
		self.layers = []
		self.params = []

		self.emb = WordEmbeder(voca_size, hidden_size)
		self.params += self.emb.params

		x = tensor.imatrix() #symbolic
		mask = tensor.imatrix()
		y = tensor.ivector()

		state_below = self.emb.embed_it(x)
		for _ in range(self.num_layers):
			binet = BiLSTM(self.hidden_size, self.learning_rate)
			self.layers += binet,
			self.params += binet.params
			state_below = binet.forward(state_below, mask)

		self.U = theano.shared(name="biU", value=utils.init_norm(self.hidden_size, self.n_out), borrow=True)
		self.by = theano.shared(name="by", value=np.zeros(self.n_out), borrow=True)
		self.params += [self.U, self.by]

		#mean pooling
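		# zero out padded timesteps with the mask, sum over time (axis 0), then divide by each sequence's true length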
		hs = state_below
		mp = (hs*mask[:,:,None]).sum(axis=0)
		mp = mp / mask.sum(axis=0)[:,None]

		#classifier
		pred_p = tensor.nnet.softmax(tensor.dot(mp, self.U) + self.by)
		pred_y = pred_p.argmax(axis=1)

		#nll
		off_set = 1e-8
		cost = -tensor.log( pred_p[tensor.arange(mask.shape[1]), y] + off_set ).mean()
		gparams = [tensor.grad(cost, param) for param in self.params]
		updates = [(param, param - self.learning_rate*gparam) for param, gparam in zip(self.params, gparams)]

		vinputs = tensor.imatrix("vinputs")#variable
		vmask = tensor.imatrix("vmask")
		vy = tensor.ivector("vy")
		
		self._train = theano.function(
			inputs=[vinputs, vmask, vy],
			outputs=cost,
			updates=updates,
			givens={x:vinputs, mask:vmask, y:vy}
			)

		self._predict = theano.function(
			inputs=[vinputs, vmask],
			outputs=pred_y,
			givens={x:vinputs, mask:vmask}
			)
Example #26
def add_b():
    w = T.imatrix('w')
    a = T.imatrix('a')
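    # a holds a single row; repeating it 4 times along axis 0 matches w's four rows so the addition is element-wise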
    y = w + a.repeat(4, 0)
    f = theano.function(inputs=[w, a], outputs=[y])

    e = np.asarray([[2, 4], [2, 1], [3, 2], [4, 1]], dtype='int32')
    b = np.asarray([[2, 1]], dtype='int32')
    print f(e, b)
 def build_batch(self):
     x = TT.imatrix('x')                # 2D int32
     x_mask = TT.fmatrix('x_mask')       # float32
     y = TT.imatrix('y')                 # 2D int32
     y_given_x = self.fprop_batch(x)     # 3D, shape (seq_len, bs, n_out)
     self.get_y_given_x = theano.function(inputs = [x], outputs = y_given_x)
     
     y_given_x_ = y_given_x.reshape((y_given_x.shape[0]*y_given_x.shape[1], y_given_x.shape[2]))
     y_ = y.reshape((y.shape[0]*y.shape[1], ))
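     # masked negative log-likelihood: take the log-probability of each gold token, zero out padded positions via x_mask, and divide by the batch size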
     nll = -TT.sum( 
                    TT.log( y_given_x_[TT.arange(y_.shape[0]), y_] ) * 
                    x_mask.reshape( (x_mask.shape[0]*x_mask.shape[1], ) ) 
                    ) / x_mask.shape[1]  # nll is the sum of nll divided by batch size
     cost = nll
     
     # l2 norm cost
     if self.l2_weight is not None:
         L2 = 0
         for p in self.params_l2:
             L2 += TT.sum(p ** 2)
         cost += self.l2_weight * L2
         print '[SimpleRNNLM] L2 norm used %g' % self.l2_weight
     else:
         print '[SimpleRNNLM] L2 norm not used'
     
     lr = TT.scalar('lr')
     
     print '[SimpleRNNLM] ... get grads ...'
     grads = TT.grad(cost, self.params)
     grad_norm = TT.sqrt(sum([TT.sum(g**2) for g in grads]))
     if self.grad_clip is not None:
         grads = clip_grad(grads, grad_norm, self.grad_clip)
         grad_norm = TT.sqrt(sum([TT.sum(g**2) for g in grads]))
     else:
         print '[SimpleRNNLM] no grad_clip is used'
     print '[SimpleRNNLM] ... got grads ...'
     
     print '[SimpleRNNLM] algo = ', self.algo
     if self.algo == 'SGD':
         updates = SGD(self.params, grads, lr)
     else:
          sys.stderr.write('Unrecognized training algorithm')
         sys.exit(1)
     
     print '[SimpleRNNLM] ...build training function...'
     self.train_batch_fn = theano.function(inputs = [x, x_mask, y, lr], outputs = nll, updates = updates)
     print '[SimpleRNNLM] ...build training function done...'
     
     # valid_fn return nll
     self.valid_batch_fn = theano.function(inputs = [x, x_mask, y], outputs = nll)
     
     # detailed valid function return both nll and y_given_x
     self.detailed_valid_batch_fn = theano.function(inputs = [x, x_mask, y], outputs = [nll, y_given_x])
     
     print '[SimpleRNNLM] build train_fn and valid_fn done!'
     
     return self.train_batch_fn, self.valid_batch_fn
Example #28
    def test_dot_infershape(self):
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        self._compile_and_check(
            [W, h, iIdx, b, oIdx], [sparse_block_dot(W, h, iIdx, b, oIdx)], self.gemv_data(), self.gemv_class
        )
Example #29
    def test_outer_infershape(self):
        o = tensor.ftensor4()
        x = tensor.ftensor3()
        y = tensor.ftensor3()
        xIdx = tensor.imatrix()
        yIdx = tensor.imatrix()

        self._compile_and_check(
            [o, x, y, xIdx, yIdx], [self.outer_op(o, x, y, xIdx, yIdx)], self.outer_data(), self.outer_class
        )
Example #30
 def __init__(self, config):
     autoassign(locals())
     self.updater = util.Adam(max_norm=config['max_norm'], lr=config['lr'])
     self.Decode = Decoder(config['size_vocab'],
                           config['size_embed'], config['size'], config['depth'],
                           activation=eval(config.get('activation','clipped_rectify')),
                           residual=config.get('residual', False))
     self.ToTxt  = Dense(config['size'], config['size_vocab']) 
     self.inputs = [T.imatrix()]
     self.target = T.imatrix()
    def __init__(self, dv, dh, dx, nc, alpha=1.0, init_scale=0.2, initial_embeddings=None, params_init=None,
                 update='adagrad', seed=None, drop_p=0.5, momentum=0.9):

        self.dv = dv  # vocabulary size
        self.dh = dh  # hidden node size
        self.dx = dx  # word embedding size
        self.nc = nc  # number of classes
        self.alpha = alpha  # regularization strength
        self.drop_p = drop_p  # probability of dropping an input with dropout

        # adagrad parameters
        self.epsilon = 0.00001

        if initial_embeddings is None:
            self.emb = theano.shared(name='embeddings',
                                     value=init_scale * np.random.uniform(-1.0, 1.0,
                                                                          (dv, dx)).astype(theano.config.floatX))
        else:
            self.emb = theano.shared(name='embeddings', value=initial_embeddings.astype(theano.config.floatX))

        self.W_x_i = theano.shared(name='W_x_i', value=init_scale * np.random.uniform(-1.0, 1.0, (dx, dh))
                                   .astype(theano.config.floatX))
        self.W_hl_i = theano.shared(name='W_hl_i', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                    .astype(theano.config.floatX))
        self.W_hr_i = theano.shared(name='W_hr_i', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                    .astype(theano.config.floatX))
        self.b_h_i = theano.shared(name='b_h_i', value=np.array(np.zeros(dh),
                                                                dtype=theano.config.floatX))

        self.W_x_f = theano.shared(name='W_x_f', value=init_scale * np.random.uniform(-1.0, 1.0, (dx, dh))
                                   .astype(theano.config.floatX))
        self.b_h_f = theano.shared(name='b_h_f', value=np.array(np.random.uniform(0.0, 1.0, dh),
                                                                dtype=theano.config.floatX))

        self.W_hl_fl = theano.shared(name='W_hl_fl', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                     .astype(theano.config.floatX))
        self.W_hr_fl = theano.shared(name='W_hr_fl', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                     .astype(theano.config.floatX))

        self.W_hl_fr = theano.shared(name='W_hl_fr', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                     .astype(theano.config.floatX))
        self.W_hr_fr = theano.shared(name='W_hr_fr', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                     .astype(theano.config.floatX))

        self.W_x_o = theano.shared(name='W_x_o', value=init_scale * np.random.uniform(-1.0, 1.0, (dx, dh))
                                   .astype(theano.config.floatX))
        self.W_hl_o = theano.shared(name='W_hl_o', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                    .astype(theano.config.floatX))
        self.W_hr_o = theano.shared(name='W_hr_o', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                    .astype(theano.config.floatX))
        self.b_h_o = theano.shared(name='b_h_o', value=np.array(np.zeros(dh),
                                                                dtype=theano.config.floatX))

        self.W_x_u = theano.shared(name='W_x_u', value=init_scale * np.random.uniform(-1.0, 1.0, (dx, dh))
                                   .astype(theano.config.floatX))
        self.W_hl_u = theano.shared(name='W_hl_u', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                    .astype(theano.config.floatX))
        self.W_hr_u = theano.shared(name='W_hr_u', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, dh))
                                    .astype(theano.config.floatX))
        self.b_h_u = theano.shared(name='b_h_u', value=np.array(np.zeros(dh),
                                                                dtype=theano.config.floatX))

        self.W_z = theano.shared(name='W_z', value=init_scale * np.random.uniform(-1.0, 1.0, (dh, nc))
                                 .astype(theano.config.floatX))
        self.b_z = theano.shared(name='b_z', value=np.array(np.zeros(nc),
                                                            dtype=theano.config.floatX))

        self.params = [self.W_x_i, self.W_hl_i, self.W_hr_i, self.b_h_i]
        self.params += [self.W_x_f, self.W_hl_fl, self.W_hr_fl, self.b_h_f]
        self.params += [self.W_hl_fr, self.W_hr_fr]
        self.params += [self.W_x_o, self.W_hl_o, self.W_hr_o, self.b_h_o]
        self.params += [self.W_x_u, self.W_hl_u, self.W_hr_u, self.b_h_u]
        self.params += [self.W_z, self.b_z]

        self.param_shapes = [(dx, dh), (dh, dh), (dh, dh), (dh,),
                             (dx, dh), (dh, dh), (dh, dh), (dh,),
                             (dh, dh), (dh, dh),
                             (dx, dh), (dh, dh), (dh, dh), (dh,),
                             (dx, dh), (dh, dh), (dh, dh), (dh,),
                             (dh, nc), (nc,)]

        if update == 'adagrad':
            self.grad_histories = [
                theano.shared(
                    value=np.zeros(param_shape, dtype=theano.config.floatX),
                    borrow=True,
                    name="grad_hist:" + param.name
                )
                for param_shape, param in zip(self.param_shapes, self.params)
                ]

        elif update == 'sgdm':
            self.velocity = [
                theano.shared(
                    value=np.zeros(param_shape, dtype=theano.config.floatX),
                    borrow=True,
                    name="momentum:" + param.name
                )
                for param_shape, param in zip(self.param_shapes, self.params)
                ]
            self.momentum = momentum

        self.theano_rng = RandomStreams(seed)

        idxs = T.ivector()
        sequence_length = T.shape(idxs)[0]
        temp = self.emb[idxs]
        x = temp.reshape([sequence_length, dx])

        #counter = T.ivector('counter')
        left_mask = T.imatrix('left_mask')
        right_mask = T.imatrix('right_mask')
        y = T.iscalar('y')
        lr = T.scalar('lr', dtype=theano.config.floatX)
        is_train = T.iscalar('is_train')
        drop_x = T.iscalar('drop_x')

        # This is a bit annoying; the 0th dimension of x needs to be sequence, so we can iterate over it
        # but the 0th dimension of the hidden nodes needs to be hidden-node dimension, so that we can broadcast
        # the mask out to it
        def treefwd(x_t, left_mask_t, right_mask_t, counter_t, h_tm1, c_tm1):
            h_t = h_tm1
            c_t = c_tm1
            # zero out the input unless this is a leaf node
            input = T.switch(T.eq(T.sum(left_mask_t) + T.sum(right_mask_t), 0), x_t, x_t*0)
            i_t = T.nnet.sigmoid(T.dot(input, self.W_x_i) + T.sum(T.dot(self.W_hl_i.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_i.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_i)
            fl_t = T.nnet.sigmoid(T.dot(input, self.W_x_f) + T.sum(T.dot(self.W_hl_fl.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_fl.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_f)
            fr_t = T.nnet.sigmoid(T.dot(input, self.W_x_f) + T.sum(T.dot(self.W_hl_fr.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_fr.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_f)
            o_t = T.nnet.sigmoid(T.dot(input, self.W_x_o) + T.sum(T.dot(self.W_hl_o.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_o.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_o)
            u_t = T.tanh(T.dot(input, self.W_x_u) + T.sum(T.dot(self.W_hl_u.T, (left_mask_t * h_tm1)).T, axis=0) + T.sum(T.dot(self.W_hr_u.T, (right_mask_t * h_tm1)).T, axis=0) + self.b_h_u)
            c_temp = i_t * u_t + fl_t * T.sum((left_mask_t * c_tm1).T, axis=0) + fr_t * T.sum((right_mask_t * c_tm1).T, axis=0)
            h_temp = o_t * T.tanh(c_temp)
            h_t = T.set_subtensor(h_t[:, counter_t], h_temp)
            c_t = T.set_subtensor(c_t[:, counter_t], c_temp)
            return h_t, c_t

        def drop(drop_input, drop_p, is_train):
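            # training: multiply by a Bernoulli keep-mask; at test time, scale activations by the keep probability instead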
            mask = self.theano_rng.binomial(p=1.0-drop_p, size=drop_input.shape, dtype=theano.config.floatX)
            return T.cast(T.switch(T.neq(is_train, 0), drop_input * mask, drop_input * (1.0-self.drop_p)), dtype=theano.config.floatX)

        ds, dx = T.shape(x)
        # do dropout on x, if specified
        x = T.switch(T.neq(drop_x, 0), drop(x, self.drop_p, is_train), x)
        output, _ = theano.scan(fn=treefwd, sequences=[x, left_mask, right_mask, T.arange(0, ds)], outputs_info=[T.zeros((dh, ds), dtype=theano.config.floatX), T.zeros((dh, ds), dtype=theano.config.floatX)])
        full_h, full_c = output
        h = full_h[-1, :, -1]

        h = drop(h, self.drop_p, is_train)
        temp = T.dot(h, self.W_z) + self.b_z
        p_y_given_x = T.nnet.softmax(temp)[0]
        pred_y = T.argmax(p_y_given_x)

        log_loss = T.sum(-T.log(p_y_given_x[y]))
        penalty = T.sum([T.sum(p ** 2) for p in self.params])
        cost = log_loss + alpha * penalty / 2.0

        gradients = [T.grad(cost, param) for param in self.params]

        if update == 'adagrad':
            new_grad_histories = [
                T.cast(g_hist + g ** 2, dtype=theano.config.floatX)
                for g_hist, g in zip(self.grad_histories, gradients)
                ]
            grad_hist_update = zip(self.grad_histories, new_grad_histories)

            param_updates = [(param, T.cast(param - lr / (T.sqrt(g_hist) + self.epsilon) * param_grad, dtype=theano.config.floatX))
                             for param, param_grad, g_hist in zip(self.params, gradients, new_grad_histories)]

            updates = grad_hist_update + param_updates

        # sgd with momentum
        elif update == 'sgdm':
            velocity_t = [momentum * v + lr * g for v, g in zip(self.velocity, gradients)]
            velocity_updates = [(v, T.cast(v_t, theano.config.floatX)) for v, v_t in zip(self.velocity, velocity_t)]
            param_updates = [(param, T.cast(param - v_t, theano.config.floatX)) for param, v_t in zip(self.params, velocity_t)]
            updates = velocity_updates + param_updates

        # else, basic sgd
        else:
            updates = OrderedDict((p, T.cast(p - lr * g, dtype=theano.config.floatX)) for p, g in zip(self.params, gradients))

        self.train = theano.function(inputs=[idxs, left_mask, right_mask, y, lr, is_train, drop_x],
                                     outputs=[pred_y, p_y_given_x, log_loss, cost], updates=updates,
                                     on_unused_input='ignore')
        self.predict = theano.function(inputs=[idxs, left_mask, right_mask, is_train, drop_x],
                                       outputs=[pred_y, p_y_given_x])

        # good example of how to see a value in a tensor; way easier than theano.printing.Print()
        idx = T.iscalar('idx')
        emb = self.emb[idx]
        self.get_embedding = theano.function(inputs=[idx], outputs=emb)
Example #32
                RECURR_SGDM_LR.set_value(RECURR_SGDM_LR.get_value() *
                                         EPOCH_LR_COEFF)
            ADAM_EPOCHS = 0
        else:
            for _ in xrange(max_epoch):
                RESNET_ADAM_LR.set_value(RESNET_ADAM_LR.get_value() *
                                         EPOCH_LR_COEFF)
                RECURR_ADAM_LR.set_value(RECURR_ADAM_LR.get_value() *
                                         EPOCH_LR_COEFF)
        param_values_file = 'param_values_{}.pkl'.format(max_epoch)

    logger.info('Building the network.')
    im_features = lasagne.layers.get_output(resnet['pool5'])
    im_features = T.flatten(im_features,
                            outdim=2)  # batch size, number of features
    cap_out_var = T.imatrix('cap_out')  # batch size, seq len
    cap_in_var = T.imatrix('cap_in')  # batch size, seq len
    mask_var = T.bmatrix('mask_var')  # batch size, seq len
    gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(),
                               W_hid=lasagne.init.Orthogonal(),
                               W_cell=lasagne.init.Normal(),
                               b=lasagne.init.Constant(0.0))
    cell_gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(),
                                    W_hid=lasagne.init.Orthogonal(),
                                    W_cell=None,
                                    b=lasagne.init.Constant(0.0),
                                    nonlinearity=lasagne.nonlinearities.tanh)
    forget_gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(),
                                      W_hid=lasagne.init.Orthogonal(),
                                      W_cell=lasagne.init.Normal(),
                                      b=lasagne.init.Constant(5.0))
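    # NOTE (illustrative sketch, not part of the original script): Gate objects
    # like the three above are typically passed into an LSTMLayer that consumes
    # an embedded caption sequence. The vocabulary size, embedding size, hidden
    # size and the underscore-prefixed layer names below are assumptions made
    # only for this sketch.
    _l_cap_in = lasagne.layers.InputLayer((None, None), input_var=cap_in_var)
    _l_cap_emb = lasagne.layers.EmbeddingLayer(_l_cap_in, input_size=10000,
                                               output_size=256)
    _l_cap_lstm = lasagne.layers.LSTMLayer(_l_cap_emb, num_units=512,
                                           ingate=gate,
                                           forgetgate=forget_gate,
                                           cell=cell_gate,
                                           outgate=lasagne.layers.Gate())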
Exemple #33
0
    def __init__(self, We_initial, params):

        self.eta = params.eta
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden

        g = T.imatrix()
        gmask = T.fmatrix()
        y = T.ivector()
        idxs = T.ivector()

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)
    #l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word,  input_size= We_initial.shape[0]   , output_size = embsize , W =We)
        #l_emb_word = lasagne_embedding_layer_2(l_in_word,  embsize , We)

        if params.dropout:
            l_emb_word = lasagne.layers.DropoutLayer(l_emb_word, p=0.5)

        if (params.inf == 0):
            l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                    hidden,
                                                    mask_input=l_mask_word)
            l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                    hidden,
                                                    mask_input=l_mask_word,
                                                    backwards=True)

            l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf,
                                                     (-1, hidden))
            l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb,
                                                     (-1, hidden))
            concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
        elif (params.inf == 1):
            l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
            l_cnn_1 = lasagne.layers.Conv1DLayer(l_cnn_input,
                                                 hidden,
                                                 1,
                                                 1,
                                                 pad='same')
            l_cnn_3 = lasagne.layers.Conv1DLayer(l_cnn_input,
                                                 hidden,
                                                 3,
                                                 1,
                                                 pad='same')
            l_cnn = lasagne.layers.ConcatLayer([l_cnn_1, l_cnn_3], axis=1)
            #l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 1, 1, pad = 'same')
            concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
            #concat2 = lasagne.layers.ConcatLayer([l_emb_word, concat2], axis =2)
            concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, 2 * hidden))
        else:
            l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
            l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input,
                                               hidden,
                                               3,
                                               1,
                                               pad='same')
            concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
            concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, hidden))
            concat2 = lasagne.layers.DenseLayer(concat2, num_units=hidden)

        if params.dropout:
            concat2 = lasagne.layers.DropoutLayer(concat2, p=0.5)

    #l_emb = lasagne.layers.DenseLayer(concat2, num_units=hidden, nonlinearity=lasagne.nonlinearities.tanh)
        l_out = lasagne.layers.DenseLayer(
            concat2,
            num_units=params.num_labels,
            nonlinearity=lasagne.nonlinearities.softmax)

        output = lasagne.layers.get_output(l_out, {
            l_in_word: g,
            l_mask_word: gmask
        })

        output_1 = output[idxs]

        test_output = lasagne.layers.get_output(l_out, {
            l_in_word: g,
            l_mask_word: gmask
        },
                                                deterministic=True)

        test_output_1 = test_output[idxs]

        model_params = lasagne.layers.get_all_params(l_out, trainable=True)
        self.model_p = lasagne.layers.get_all_params(l_out, trainable=True)

        reg = sum(lasagne.regularization.l2(x) for x in model_params)

        cost = lasagne.objectives.categorical_crossentropy(output_1, y)
        cost = T.mean(cost) + params.L2 * reg

        #pred = T.argmax(output_1, axis=1)
        final_pred = T.argmax(test_output_1, axis=1)

        y1 = T.ones_like(y)
        SUM = T.sum(y1)
        acc = 1.0 * T.sum(T.eq(final_pred, y)) / SUM

        self.acc_function = theano.function([g, gmask, y, idxs],
                                            [acc, final_pred],
                                            on_unused_input='warn')

        #updates = lasagne.updates.adam(cost, model_params, self.eta)
        #from adam import adam
        #updates = adam(cost, model_params, self.eta)
        updates = lasagne.updates.sgd(cost, model_params, self.eta)
        updates = lasagne.updates.apply_momentum(updates,
                                                 model_params,
                                                 momentum=0.9)
        self.train_function = theano.function([g, gmask, y, idxs], [cost, acc],
                                              updates=updates,
                                              on_unused_input='warn')
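        # NOTE (illustrative usage sketch, not part of the original class): the
        # compiled functions expect int32 word ids, a float32 mask, int32 labels
        # and flat indices into the reshaped (batch * seq_len) output; the
        # shapes below are assumptions for demonstration only.
        #
        #   g = np.random.randint(0, We_initial.shape[0], (8, 20)).astype('int32')
        #   gmask = np.ones((8, 20), dtype='float32')
        #   idxs = np.arange(8 * 20, dtype='int32')
        #   y = np.random.randint(0, params.num_labels, 8 * 20).astype('int32')
        #   cost, acc = self.train_function(g, gmask, y, idxs)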
Exemple #34
0
data_dir = "/nikel/dhpark/fundus/kaggle/original/training/train_medium"
label_file = "/nikel/dhpark/fundus/kaggle/original/training/trainLabels.csv"
#mean_file = ""
#model = "models/softmax_regression"
#model = "models/double_softmax"
model = "models/512x512_model"
#model = "models/vgg_bn_pairwise"
dst_path = "/nikel/dhpark/fundus_saved_weights/vgg_pairwise"
#dst_path = "/nikel/dhpark/fundus_saved_weights/multi_task_loss_oversampled"
#dst_path = "/nikel/dhpark/fundus_saved_weights/hybrid_loss"
#dst_path = "/nikel/dhpark/fundus_saved_weights/vgg_bn_pairwise"


# Load the model
x = T.tensor4('x')
y = T.imatrix('y')
input_layer, output_layer = load_model(model).build_model(x)

# Get batch iterator
# First load the files and split them into train and validation sets
# Then create an iterator from these
files = data_util.get_image_files(data_dir)
names = data_util.get_names(files)
labels = data_util.get_labels(names, label_file=label_file).astype(np.int32)
print('{} files loaded'.format(len(files)))

paired_files, paired_labels, merged_labels = data_util.pair_up(files, labels)

sss = StratifiedShuffleSplit(merged_labels, n_iter=1, test_size=0.1, random_state=123)
train_idx, valid_idx = next(iter(sss))
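# NOTE (illustrative, not from the original script): the split indices would
# typically be used to slice the paired arrays when building the train and
# validation batch iterators, e.g. (assuming numpy arrays):
#   train_files, train_labels = paired_files[train_idx], paired_labels[train_idx]
#   valid_files, valid_labels = paired_files[valid_idx], paired_labels[valid_idx]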
    def __init__(self,
                 name='gnic',
                 nimg=2048,
                 nh=512,
                 nw=512,
                 nout=8843,
                 model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                nout = f.attrs['nout']
        self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout,
                                   dim_emb=nw,
                                   name=self.name + '@embedding')

        # initialization mlp layer
        self.proj_mlp = MLP(layer_sizes=[nimg, 2 * nh],
                            output_type='tanh',
                            name=self.name + '@proj_mlp')

        # lstm
        self.lstm = BasicLSTM(dim_x=nw, dim_h=nh, name=self.name + '@lstm')

        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[nh + nw, nout],
                            output_type='softmax',
                            name=self.name + '@pred_mlp')

        # inputs
        cap = T.imatrix('cap')
        img = T.matrix('img')
        self.inputs = [cap, img]

        # go through sequence
        init_state = self.proj_mlp.compute(img)
        (state, self.p,
         loss), _ = theano.scan(fn=self.scan_func,
                                sequences=[cap[0:-1, :], cap[1:, :]],
                                outputs_info=[init_state, None, None])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # these functions are used at test time
        self._init_func = None
        self._step_func = None
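    # NOTE (illustrative sketch, not part of the original model): self.inputs
    # and self.costs are typically compiled into a training step by an external
    # driver; the plain-SGD update and learning rate here are assumptions.
    #
    #   grads = T.grad(self.costs[0], self.params)
    #   updates = [(p, p - 0.01 * g) for p, g in zip(self.params, grads)]
    #   f_train = theano.function(self.inputs, self.costs, updates=updates)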
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule, lambda_reg,
                 batch_accumulator, pretrained_net, rng, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng
        self.lambda_reg = lambda_reg

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_in, self.l_act_in, self.l_out, self.pred_z, self.true_z = \
                                        self.build_network(network_type, \
                                        input_width, input_height, num_actions,\
                                        num_frames, batch_size)

        if self.freeze_interval > 0:
            self.next_l_in, self.next_l_act_in, self.next_l_out, _d, _d = \
                                self.build_network(network_type, input_width, \
                                input_height, num_actions, num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.imatrix('actions')
        terminals = T.icol('terminals')

        # Shared variables for training from a minibatch of replayed
        # state transitions, each consisting of num_frames*2 + 1 (due to
        # overlap) images, along with the chosen action and resulting
        # reward and terminal status.
        self.imgs_shared = theano.shared(
            np.zeros((batch_size, num_frames*2+1, input_height, input_width),
                     dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))
        self.actions_shared = theano.shared(
            np.zeros((batch_size, num_frames), dtype='int32')
            )
        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # Shared variable for a single state, to calculate q_vals.
        self.state_shared = theano.shared(
            np.zeros((num_frames*2, input_height, input_width),
                     dtype=theano.config.floatX))

        q_vals, z_pred, z_true = lasagne.layers.get_output(
                                    [self.l_out, self.pred_z, self.true_z],
                                    inputs = {self.l_in: states / input_scale,
                                        self.l_act_in: actions}
                                )
        
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(
                                    self.next_l_out,
                                    {self.next_l_in: next_states / input_scale, 
                                     self.next_l_act_in: actions}
                                    )
        else:
            next_q_vals = lasagne.layers.get_output(
                                    self.l_out,
                                    {self.l_in: next_states / input_scale, 
                                     self.l_act_in: actions}
                                    )
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        terminalsX = terminals.astype(theano.config.floatX)
        actionmask = T.eq(T.arange(num_actions).reshape((1, -1)),
                actions[:, 0].reshape((-1, 1))).astype(theano.config.floatX)

        target = (rewards +
                  (T.ones_like(terminalsX) - terminalsX) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output
        diff_reg = z_true - z_pred

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        loss = loss + 0.5 * self.lambda_reg * (diff_reg ** 2).sum(axis=1)

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params([self.l_out, self.pred_z, self.true_z])  
        train_givens = {
            states: self.imgs_shared[:, :-1],
            next_states: self.imgs_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }

        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss], updates=updates,
                                      givens=train_givens)
        q_givens = {
            states: self.state_shared.reshape((1,
                                               self.num_frames*2,
                                               self.input_height,
                                               self.input_width))
        }
        self._q_vals = theano.function([], q_vals[0], givens=q_givens)
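        # NOTE (illustrative check, not part of the original class): the clipped
        # quadratic/linear objective built above is the Huber loss. For a scalar
        # difference d and clip value c:
        #   q = min(|d|, c);   loss = 0.5 * q**2 + c * (|d| - q)
        # e.g. with c = 1.0: d = 0.5 gives 0.125 (quadratic regime) and
        # d = 3.0 gives 2.5 (linear regime), so the gradient never vanishes
        # for large errors.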
Exemple #37
0
config = importlib.import_module('configurations.%s' %
                                 metadata['configuration'])

# samples dir
if not os.path.isdir('samples'):
    os.makedirs('samples')
target_path = "samples/%s-s%d-%.2f-%s.txt" % (
    metadata['experiment_id'], rng_seed, temperature,
    time.strftime("%Y%m%d-%H%M%S", time.localtime()))

token2idx = metadata['token2idx']
idx2token = dict((v, k) for k, v in token2idx.iteritems())
vocab_size = len(token2idx)

print('Building the model')
x = T.imatrix('x')

l_inp = InputLayer((1, None), input_var=x)

W_emb = (np.eye(vocab_size, dtype='float32')
         if config.one_hot else lasagne.init.Orthogonal())
emb_output_size = vocab_size if config.one_hot else config.embedding_size
l_emb = EmbeddingLayer(l_inp,
                       input_size=vocab_size,
                       output_size=emb_output_size,
                       W=W_emb)

main_layers = []
for _ in xrange(config.num_layers):
    if not main_layers:
Exemple #38
0
    def ready(self):
        args = self.args
        w_emb_layer = self.w_emb_layer
        c_emb_layer = self.c_emb_layer
        r_emb_layers = self.r_emb_layers
        r_matrix_layers = self.r_matrix_layers

        char_dim = self.char_dim = args.char_dim
        char_lstm_dim = self.char_lstm_dim = args.char_lstm_dim
        word_dim = self.word_dim = args.word_dim
        word_lstm_dim = self.word_lstm_dim = args.word_lstm_dim

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        word_ids = self.word_ids = T.ivector('word_ids')
        char_ids = self.char_ids = T.imatrix('char_ids')
        char_lens = self.char_lens = T.fvector('char_lens')
        char_masks = self.char_masks = T.imatrix('char_masks')
        up_ids = self.up_ids = T.imatrix('up_ids')
        up_rels = self.up_rels = T.imatrix('up_rels')
        up_id_masks = self.up_id_masks = T.imatrix('up_id_masks')
        down_ids = self.down_ids = T.imatrix('down_ids')
        down_rels = self.down_rels = T.imatrix('down_rels')
        down_id_masks = self.down_id_masks = T.imatrix('down_id_masks')
        tag_ids = self.tag_ids = T.ivector('tag_ids')

        layers = self.layers = [w_emb_layer, c_emb_layer]
        layers.extend(r_emb_layers)
        layers.extend(r_matrix_layers)

        inputs = self.inputs = []

        inputs.append(self.word_ids)
        inputs.append(self.char_ids)
        inputs.append(self.char_lens)
        inputs.append(self.char_masks)
        inputs.append(self.up_ids)
        inputs.append(self.up_rels)
        inputs.append(self.up_id_masks)
        inputs.append(self.down_ids)
        inputs.append(self.down_rels)
        inputs.append(self.down_id_masks)
        inputs.append(self.tag_ids)
        wslices = w_emb_layer.forward(word_ids)
        cslices = c_emb_layer.forward(char_ids.ravel())
        cslices = cslices.reshape(
            (char_ids.shape[0], char_ids.shape[1], char_dim))
        cslices = cslices.dimshuffle(1, 0, 2)

        bv_ur_slicess = []
        bv_dr_slicess = []
        b_ur_slicess = []
        b_dr_slicess = []

        bv_ur_matrixss = []
        bv_dr_matrixss = []
        b_ur_matrixss = []
        b_dr_matrixss = []

        for r_matrix_layer in r_matrix_layers:
            bv_ur_matrixs = r_matrix_layer.forward1(up_rels.ravel())
            bv_dr_matrixs = r_matrix_layer.forward1(down_rels.ravel())
            b_ur_matrixs = r_matrix_layer.forward2(up_rels.ravel())
            b_dr_matrixs = r_matrix_layer.forward2(down_rels.ravel())
            bv_ur_matrixss.append(
                bv_ur_matrixs.reshape(
                    (up_rels.shape[0], up_rels.shape[1], word_dim, word_dim)))
            bv_dr_matrixss.append(
                bv_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1],
                                       word_dim, word_dim)))
            b_ur_matrixss.append(
                b_ur_matrixs.reshape(
                    (up_rels.shape[0], up_rels.shape[1], word_dim, word_dim)))
            b_dr_matrixss.append(
                b_dr_matrixs.reshape((down_rels.shape[0], down_rels.shape[1],
                                      word_dim, word_dim)))

        for r_emb_layer in r_emb_layers:
            bv_ur_slices = r_emb_layer.forward(up_rels.ravel())
            bv_dr_slices = r_emb_layer.forward(down_rels.ravel())
            b_ur_slices = r_emb_layer.forward2(up_rels.ravel())
            b_dr_slices = r_emb_layer.forward2(down_rels.ravel())
            bv_ur_slicess.append(
                bv_ur_slices.reshape(
                    (up_rels.shape[0], up_rels.shape[1], word_dim)))
            bv_dr_slicess.append(
                bv_dr_slices.reshape(
                    (down_rels.shape[0], down_rels.shape[1], word_dim)))
            b_ur_slicess.append(
                b_ur_slices.reshape(
                    (up_rels.shape[0], up_rels.shape[1], word_dim)))
            b_dr_slicess.append(
                b_dr_slices.reshape(
                    (down_rels.shape[0], down_rels.shape[1], word_dim)))

        char_masks = char_masks.dimshuffle(1, 0)

        prev_output = wslices
        prev_size = word_dim

        if char_dim:
            layers.append(
                LSTM(n_in=char_dim,
                     n_out=char_lstm_dim,
                     direction='bi' if args.char_bidirect else 'si'))
            prev_output_2 = cslices
            prev_output_2 = apply_dropout(prev_output_2, dropout, v2=True)
            # note: forward_all is fed cslices directly, so the dropped-out
            # prev_output_2 computed on the previous line is not actually used
            prev_output_2 = layers[-1].forward_all(cslices, char_masks)
            prev_output_2 = T.sum(prev_output_2, axis=0)
            prev_output_2 = prev_output_2 / (1e-6 * T.ones_like(char_lens) +
                                             char_lens).dimshuffle(0, 'x')

            prev_size += char_lstm_dim
            prev_output = T.concatenate([prev_output, prev_output_2], axis=1)

        prev_output = apply_dropout(prev_output, dropout)
        if args.conv != 0:
            for ind in range(args.clayer):
                layers.append(GraphCNNTensor(
                    n_in=prev_size,
                    n_out=prev_size,
                ))
                residual = True
                if ind == 0:
                    residual = False
                prev_output = layers[-1].forward_all(prev_output,
                                                     up_ids,
                                                     up_id_masks,
                                                     bv_ur_slicess[ind],
                                                     bv_ur_matrixss[ind],
                                                     b_ur_slicess[ind],
                                                     b_ur_matrixss[ind],
                                                     down_ids,
                                                     down_id_masks,
                                                     bv_dr_slicess[ind],
                                                     bv_dr_matrixss[ind],
                                                     b_dr_slicess[ind],
                                                     b_dr_matrixss[ind],
                                                     residual=residual)
                prev_output = apply_dropout(prev_output, dropout)

        prev_size *= 3
        layers.append(
            LSTM(n_in=prev_size,
                 n_out=word_lstm_dim,
                 direction='bi' if args.word_bidirect else 'si'))

        prev_output = prev_output.dimshuffle(0, 'x', 1)
        prev_output = layers[-1].forward_all(prev_output)
        prev_output = prev_output.reshape(
            (prev_output.shape[0], prev_output.shape[-1]))

        prev_size = word_lstm_dim

        layers.append(
            Layer(
                n_in=prev_size,
                n_out=args.classes,
                activation=linear,  #ReLU,
                has_bias=False))

        n_tags = args.classes
        s_len = char_ids.shape[0]
        tags_scores = layers[-1].forward(prev_output)
        transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
        small = -1000
        b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
        e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
        observations = T.concatenate([tags_scores, small * T.ones((s_len, 2))],
                                     axis=1)

        observations = T.concatenate([b_s, observations, e_s], axis=0)

        real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
        b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
        e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
        padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)

        pre_ids = T.arange(s_len + 1)

        s_ids = T.arange(s_len + 1) + 1

        real_path_score += transitions[padded_tags_ids[pre_ids],
                                       padded_tags_ids[s_ids]].sum()

        all_paths_scores = CRFForward(observations, transitions)
        self.nll_loss = nll_loss = -(real_path_score - all_paths_scores)
        preds = CRFForward(observations,
                           transitions,
                           viterbi=True,
                           return_alpha=False,
                           return_best_sequence=True)

        self.pred = preds[1:-1]

        self.l2_sqr = None
        params = self.params = [transitions]
        for layer in layers:
            self.params += layer.params
        for p in self.params:
            if self.l2_sqr is None:
                self.l2_sqr = args.l2_reg * T.sum(p**2)
            else:
                self.l2_sqr += args.l2_reg * T.sum(p**2)

        #for l, i in zip(layers[3:], range(len(layers[3:]))):
        for l, i in zip(
                layers[2 + len(r_emb_layers) + len(r_matrix_layers):],
                range(
                    len(layers[2 + len(r_emb_layers) +
                               len(r_matrix_layers):]))):
            say("layer {}: n_in={}\tn_out={}\n".format(i, l.n_in, l.n_out))

        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                        for x in self.params)
        say("total # parameters: {}\n".format(nparams))

        cost = self.nll_loss + self.l2_sqr

        lr_method_name = args.learning
        lr_method_parameters = {}
        lr_method_parameters['lr'] = args.learning_rate
        updates = Optimization(clip=5.0).get_updates(lr_method_name, cost,
                                                     params,
                                                     **lr_method_parameters)

        f_train = theano.function(inputs=self.inputs,
                                  outputs=[cost, nll_loss],
                                  updates=updates,
                                  allow_input_downcast=True)

        f_eval = theano.function(inputs=self.inputs[:-1],
                                 outputs=self.pred,
                                 allow_input_downcast=True)

        return f_train, f_eval
Exemple #39
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              hard_training_labels=True,
              crf_probs=False,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')

        if hard_training_labels:
            tag_ids = T.ivector(name='tag_ids')
        else:
            tag_dist = T.imatrix(name='tag_dist')

        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print('Loading pretrained embeddings from %s...' % pre_emb)
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):

                    if i > 100000:
                        break  # we don't need all the embeddings

                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print('WARNING: %i invalid lines' % emb_invalid)
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in range(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub(r'\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            r'\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print('Loaded %i pretrained embeddings.' % len(pretrained))
                print(('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') %
                      (c_found + c_lower + c_zeros, n_words, 100. *
                       (c_found + c_lower + c_zeros) / n_words))
                print(('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') %
                      (c_found, c_lower, c_zeros))

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        inputs = T.concatenate(inputs,
                               axis=1) if len(inputs) != 1 else inputs[0]

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            # Here we pass hard labels in as 'tag_ids'. Alternatively, a matrix
            # whose rows are categorical distributions can be passed here to
            # provide soft labels.
            if hard_training_labels:
                cost = T.nnet.categorical_crossentropy(tags_scores,
                                                       tag_ids).mean()
            else:
                cost = T.nnet.categorical_crossentropy(tags_scores,
                                                       tag_dist).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags -- uses tag_ids as hard labels here.
            if hard_training_labels:
                real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
            else:  # soft training labels (probabilities)
                real_path_score = (tags_scores * tag_dist).sum()

            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            if hard_training_labels:
                padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            else:
                padded_tags_ids = T.concatenate([b_id, tag_dist, e_id], axis=0)
            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)

        if hard_training_labels:
            train_inputs = eval_inputs + [tag_ids]
        else:
            train_inputs = eval_inputs + [tag_dist]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print('Compiling...')
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=crf_probs,
                                         return_best_sequence=not crf_probs),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
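    # NOTE (illustrative example, not from the original code): the lr_method
    # string parsed in build() encodes the optimizer name followed by
    # '-'-separated key_value pairs, e.g.
    #   "sgd-lr_0.005"      -> name "sgd",      parameters {"lr": 0.005}
    #   "adadelta-lr_1.0"   -> name "adadelta", parameters {"lr": 1.0}
    # A string without '-' selects the method with its default parameters.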
def build_model(options):
    print('Build model...')
    sys.stdout.flush()
    weights = None
    if options['flag_random_lookup_table'] == False:
        weights = options['embedding']
    embed_layer = Embedding(input_dim=options['embedding'].shape[0],
                            output_dim=options['embedding'].shape[1],
                            weights=weights)
    dense_layers = []
    dense_layers.append(
        Dense(input_dim=options['embedding'].shape[1] * 2,
              output_dim=options['size_hidden_layer'],
              activation='tanh'))
    dense_layers.append(
        Dense(input_dim=options['size_hidden_layer'],
              output_dim=1,
              activation='sigmoid'))

    # for training
    sentence1 = T.imatrix('s1')  # sentence1, n_samples * len_sentence
    sentence1_mask = T.matrix('s1_mask')
    sentence2 = T.imatrix('s2')  # sentence2, n_samples * len_sentence
    sentence2_mask = T.matrix('s2_mask')
    y = T.ivector('y1')  # n_samples

    embed_s1 = embed_layer.get_output(
        sentence1)  # n_samples * len_sentence * embed_dim
    embed_s2 = embed_layer.get_output(
        sentence2)  # n_samples * len_sentence * embed_dim
    if options['sentence_modeling'] == 'CBoW':
        embed_s1 = ave_embed(embed_s1, sentence1_mask)  # n_samples * embed_dim
        embed_s2 = ave_embed(embed_s2, sentence2_mask)  # n_samples * embed_dim
    elif options['sentence_modeling'] == 'CNN':
        sentence_encode_layer = Convolution1D(
            input_dim=options['embedding'].shape[1],
            activation='tanh',
            nb_filter=options['embedding'].shape[1],
            filter_length=options['CNN_filter_length'],
            border_mode='same')
        embed_s1 = CNN_embed(embed_s1, sentence1_mask,
                             sentence_encode_layer)  # n_samples * embed_dim
        embed_s2 = CNN_embed(embed_s2, sentence2_mask,
                             sentence_encode_layer)  # n_samples * embed_dim
    elif options['sentence_modeling'] == 'LSTM':
        sentence_encode_layer = LSTM(input_dim=options['embedding'].shape[1],
                                     output_dim=options['embedding'].shape[1])
        embed_s1 = LSTM_embed(embed_s1, sentence1_mask, sentence_encode_layer,
                              options)  # n_samples * embed_dim
        embed_s2 = LSTM_embed(embed_s2, sentence2_mask, sentence_encode_layer,
                              options)  # n_samples * embed_dim
    else:
        print 'Error: No model called %s available!' % options[
            'sentence_modeling']
        return

    output = T.concatenate([embed_s1, embed_s2],
                           axis=-1)  # n_samples * (embed_dim * 2)

    if options['flag_dropout'] == True:
        output = dropout(output, level=options['dropoutRates'])
    for dense_layer in dense_layers:
        output = dense_layer.get_output(output)
    f_pred = theano.function(
        [sentence1, sentence1_mask, sentence2, sentence2_mask],
        output,
        allow_input_downcast=True)

    output = output.reshape((output.shape[0], ))
    #y = y.reshape((output.shape[0],1))
    cost = T.nnet.binary_crossentropy(output, y).mean()
    f_debug = theano.function(
        [sentence1, sentence1_mask, sentence2, sentence2_mask, y],
        [output, y, T.nnet.binary_crossentropy(output, y), cost],
        allow_input_downcast=True)
    tparams = []
    tparams += embed_layer.params
    if options['sentence_modeling'] != 'CBoW':
        tparams += sentence_encode_layer.params
    for dense_layer in dense_layers:
        tparams += dense_layer.params
    return sentence1, sentence1_mask, sentence2, sentence2_mask, y, cost, f_pred, tparams, f_debug
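# NOTE (illustrative usage sketch, not part of the original build_model): the
# returned symbols, cost and parameter list are typically compiled into a
# training step with an external optimizer; the plain-SGD update and learning
# rate below are assumptions.
#
#   s1, s1_mask, s2, s2_mask, y, cost, f_pred, tparams, f_debug = build_model(options)
#   grads = T.grad(cost, tparams)
#   updates = [(p, p - 0.05 * g) for p, g in zip(tparams, grads)]
#   f_train = theano.function([s1, s1_mask, s2, s2_mask, y], cost,
#                             updates=updates, allow_input_downcast=True)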
Exemple #41
0
                         initialization='he',
                         weightnorm=WEIGHT_NORM)
    out = T.nnet.relu(out)

    # Output
    # We apply the softmax later
    out = lib.ops.Linear('SampleLevel.Output',
                         DIM,
                         Q_LEVELS,
                         out,
                         weightnorm=WEIGHT_NORM)
    return out


print('----got to T var---')
sequences = T.imatrix('sequences')
h0 = T.tensor3('h0')
reset = T.iscalar('reset')
mask = T.matrix('mask')
#sequences_lab      = T.tensor3('sequences_lab')
sequences_lab = T.itensor3('sequences_lab')

if args.debug:
    # Solely for debugging purposes.
    # Maybe I should set the compute_test_value=warn from here.
    sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN + OVERLAP),
                                           dtype='int32')
    h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT * DIM),
                                    dtype='float32')
    reset.tag.test_value = numpy.array(1, dtype='int32')
    mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN + OVERLAP),
Exemple #42
0
def run_epoch():
    # define symbolic variables
    q = T.imatrix('q')
    q_mask = T.matrix('q_mask', dtype=theano.config.floatX)
    l = T.imatrix('l')
    l_mask = T.matrix('l_mask', dtype=theano.config.floatX)
    a = T.imatrix('a')
    a_mask = T.matrix('a_mask', dtype=theano.config.floatX)
    y = T.ivector('y')
    lr = T.scalar(name='lr')

    np_emb = get_embedding_matrix_from_param_file(config.embedding_param_file)

    # build model
    print '...building model'
    model = DMN(q, q_mask, l, l_mask, a, a_mask, y, np_emb,
                options['word_size'], options['hidden_size'],
                options['use_dropout'], options['drop_p'])

    cost = model.loss
    grads = T.grad(cost, wrt=list(model.params.values()))
    optimizer = options['optimizer']
    f_grad_shared, f_update = optimizer(lr, model.params, grads,
                                        [q, q_mask, l, l_mask, a, a_mask, y],
                                        cost)

    detector = theano.function(inputs=[q, q_mask, l, l_mask, a, a_mask, y],
                               outputs=model.error,
                               on_unused_input='ignore')
    p_predictor = theano.function(inputs=[q, q_mask, l, l_mask, a, a_mask],
                                  outputs=model.p_d,
                                  on_unused_input='ignore')

    # load parameters from specified file
    if not options['loaded_params'] is None:
        print '...loading parameters from ' + options['loaded_params']
        file_name = options['loaded_params']
        with open(file_name, 'rb') as f:
            param_dict = cPickle.load(f)
            for k, v in model.params.items():
                v.set_value(param_dict[k])

    # test the performance of initialized parameters
    print '...testing the performance of initialized parameters'
    p_ds = []
    ys = []
    for q_, q_mask_, l_, l_mask_, a_, a_mask_, y_ in gkhmc_qla_iterator(
            path=config.dataset,
            batch_size=options['valid_batch_size'],
            is_train=False):
        p_d = p_predictor(q_, q_mask_, l_, l_mask_, a_, a_mask_)
        p_ds.extend(p_d)
        ys.extend(y_)
    right_num, total_num, _ = pred_check(p_ds, ys)
    print right_num, '/', total_num

    best_perform = -np.inf

    print '...training model'
    for i in xrange(options['max_epochs']):
        total_loss = 0.
        idx = 0
        for q_, q_mask_, l_, l_mask_, a_, a_mask_, y_ in gkhmc_qla_iterator(
                path=config.dataset,
                batch_size=options['batch_size'],
                is_train=True):
            model.emb_set_value_zero()
            this_cost = f_grad_shared(q_, q_mask_, l_, l_mask_, a_, a_mask_,
                                      y_)
            f_update(options['lrate'])
            total_loss += this_cost
            print '\r', 'epoch:', i, ', idx:', idx, ', this_loss:', this_cost,
            idx += 1
        print ', total loss:', total_loss

        # validate model performance when necessary
        if (i + 1) % options['valid_freq'] == 0:
            # test performance on train set
            errors = []
            for q_, q_mask_, l_, l_mask_, a_, a_mask_, y_ in gkhmc_qla_iterator(
                    path=config.dataset,
                    batch_size=options['valid_batch_size'],
                    is_train=True):
                error = detector(q_, q_mask_, l_, l_mask_, a_, a_mask_, y_)
                errors.append(error)
            print '\ttrain error of epoch ' + str(i) + ': ' + str(
                np.mean(errors) * 100) + '%'

            # test performance on test set
            p_ds = []
            ys = []
            for q_, q_mask_, l_, l_mask_, a_, a_mask_, y_ in gkhmc_qla_iterator(
                    path=config.dataset,
                    batch_size=options['valid_batch_size'],
                    is_train=False):
                p_d = p_predictor(q_, q_mask_, l_, l_mask_, a_, a_mask_)
                p_ds.extend(p_d)
                ys.extend(y_)
            right_num, total_num, _ = pred_check(p_ds, ys)

            # decide whether to save the parameters
            save = False
            if float(right_num) / float(total_num) > best_perform:
                best_perform = float(right_num) / float(total_num)
                save = True

            print '\ttest performance of epoch', i, ':', right_num, '/', total_num, '\t', \
                float(right_num * 10000 / total_num) / 100., '%', '\tbest through:', float(int(best_perform * 10000)) / 100.

            # save parameters if needed
            if save:
                print '\t...saving parameters'
                file_name = options['param_path'] + model.name + '_hidden' + str(options['hidden_size']) + '_lrate' + \
                            str(options['lrate']) + '_batch' + str(options['batch_size']) + '_epoch' + str(i+1) + \
                            '_perform' + str(float(int(best_perform * 10000)) / 100.) + '.pickle'
                with open(file_name, 'wb') as f:
                    new_dict = {}
                    for k, v in model.params.items():
                        new_dict[k] = v.get_value()
                    cPickle.dump(new_dict, f)
def build_network_from_ae(classn):
    input_var = T.tensor4('input_var')
    target_var = T.imatrix('targets')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           100,
                           filter_size=(5, 5),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           100,
                           filter_size=(5, 5),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           120,
                           filter_size=(4, 4),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = layers.MaxPool2DLayer(layer, pool_size=(3, 3), stride=2)
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           240,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           320,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           320,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           320,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           320,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           320,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           320,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           320,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           480,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           480,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           480,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           480,
                           filter_size=(3, 3),
                           stride=1,
                           nonlinearity=leaky_rectify))

    layer = layers.Pool2DLayer(layer,
                               pool_size=(20, 20),
                               stride=20,
                               mode='average_inc_pad')
    network = layers.DenseLayer(layer, classn, nonlinearity=sigmoid)

    return network, input_var, target_var
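# NOTE (illustrative usage sketch, not part of the original function): with a
# sigmoid output layer and an int matrix of 0/1 targets, training would
# typically use a binary cross-entropy objective; the class count and learning
# rate below are assumptions.
#
#   network, input_var, target_var = build_network_from_ae(classn=5)
#   prediction = lasagne.layers.get_output(network)
#   loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean()
#   params = lasagne.layers.get_all_params(network, trainable=True)
#   updates = lasagne.updates.adam(loss, params, learning_rate=1e-4)
#   train_fn = theano.function([input_var, target_var], loss, updates=updates)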
Exemple #44
0
    def __init__(self,
                 args,
                 sent_emb_dim,
                 flic_dim,
                 load_model=None,
                 epochs_done=0):
        """Initializes the model and constructs the Theano Computation Graph."""
        self.args = args
        self.sig_handler = GracefullExit()
        self.best_val_error = sys.float_info.max

        if self.args.sample_all_sentences:
            print("We sample all items from the generator in each iteration.")
        else:
            print("We sample {} sets from the generator in each iteration.".
                  format(args.num_samples))

        self.flic_dim = flic_dim
        self.sent_emb_dim = sent_emb_dim

        # TODO: Implement or remove
        self.dropout_encoder = theano.shared(
            np.float64(args.dropout_encoder).astype(theano.config.floatX))
        self.dropout_generator = theano.shared(
            np.float64(args.dropout_generator).astype(theano.config.floatX))

        # Generator and Encoder Layers
        self.generator = Generator(args, None, self.sent_emb_dim, flic_dim)
        if self.args.context == 'train_context':
            self.encoder = Encoder(args, None, self.sent_emb_dim)
        else:
            self.encoder = Encoder(args, None, flic_dim)

        #---------------------------------------------------------------------------------------------------------------
        # Construct computation graph
        #---------------------------------------------------------------------------------------------------------------
        print("Constructing computation graph.")

        # (Input) Tensors
        sent_embs_t = T.matrix('sent_embs', dtype=theano.config.floatX)
        context_t = T.vector('context', dtype=theano.config.floatX)
        sample_sentences_padded_t = T.tensor3('sample_sent_embeddings',
                                              dtype=theano.config.floatX)
        item_counts_t = T.ivector('item_counts')
        y_t = T.scalar('y', dtype=theano.config.floatX)
        samples_t = T.imatrix('samples')  # Sentence embedding
        max_num_sents_t = T.iscalar('max_num_sents')

        transformed_context_t = self.generator.transform_context(
            context_t, normalize_embeddings=True)
        transformed_sent_embs_t = self.generator.transform_sent_embs(
            sent_embs_t, normalize_embeddings=True)

        if not self.args.sample_all_sentences:
            # Construct L to sample from the DPP.
            L_t = self.generator.get_L(transformed_sent_embs_t,
                                       transformed_context_t)

            self.get_L_t = theano.function(
                inputs=[sent_embs_t, context_t],
                outputs=L_t,
                #mode='DebugMode',
                #profile=True,
                allow_input_downcast=True,
                #mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
                on_unused_input='warn')

            # The samples will be passed into the Deep Set Layer to calculate the final cost.

            # Encoder cost & updates
            padded_sents_t, sents_count_t = self.generator.get_padded_embeddings_from_samples_t(
                samples_t, transformed_sent_embs_t, max_num_sents_t)

            probs_t, costs_encoder_t, \
            probs_mean_t, costs_encoder_mean_t, rand_updates = self.encoder.cost(padded_sents_t, sents_count_t,
                                                                                 transformed_context_t, y_t)

            # Generator cost & updates
            lprob_t, cost_enc_t, cost_generator_t = self.generator.cost(
                L_t, samples_t, costs_encoder_t, sents_count_t)

        # Sample all sentences
        # TODO: Implement or remove
        else:
            probs_mean_t, costs_encoder_mean_t = self.encoder.cost_all_sentences(
                transformed_sent_embs_t, transformed_context_t, y_t)
            cost_generator_t = costs_encoder_mean_t

        # Updates of the Generator and Encoder Parameters
        updates_e, self.lr_e, gnorm_e, self.params_opt_e = create_optimization_updates(
            cost=costs_encoder_mean_t,
            params=self.encoder.get_params(),
            method=self.args.learning,
            lr=self.args.learning_rate_encoder)[:4]

        updates_g, self.lr_g, gnorm_g, self.params_opt_g = create_optimization_updates(
            cost=cost_generator_t,
            params=self.generator.get_params(),
            method=self.args.learning,
            lr=self.args.learning_rate_generator)[:4]

        if self.args.adaptive_lrs:
            self.adaptive_learning_rate = adaptive_learning_rate(
                lr_1=self.lr_e, lr_2=self.lr_g)
        else:
            self.adaptive_learning_rate = adaptive_learning_rate()

        if not self.args.sample_all_sentences:
            # Compile training graph
            self.train_model_t = theano.function(
                inputs=[
                    sent_embs_t, samples_t, max_num_sents_t, context_t, y_t
                ],
                outputs=[probs_mean_t, costs_encoder_mean_t],
                updates=updates_e.items() + updates_g.items() + rand_updates,
                allow_input_downcast=True,
                #mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False),
                #mode='DebugMode',
                on_unused_input='warn')

            # Compile graph for validation data
            self.validate_t = theano.function(
                inputs=[
                    sent_embs_t, samples_t, max_num_sents_t, context_t, y_t
                ],
                outputs=[probs_mean_t, costs_encoder_mean_t],
                updates=rand_updates,
                allow_input_downcast=True,
                on_unused_input='warn')

        else:
            # Compile train graph
            self.train_model_t = theano.function(
                inputs=[sent_embs_t, context_t, y_t],
                outputs=[probs_mean_t, costs_encoder_mean_t],
                updates=updates_g.items() + updates_e.items(),
                allow_input_downcast=True,
                # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
                on_unused_input='warn')
            # Compile graph for validation data
            self.validate_t = theano.function(
                inputs=[sent_embs_t, context_t, y_t],
                outputs=[probs_mean_t, costs_encoder_mean_t],
                updates=[],
                allow_input_downcast=True,
                on_unused_input='warn')

        # Load pretrained model
        if load_model:
            self.load(load_model)
        elif self.args.load_model:
            self.load(args.load_model)
        self.epochs_done = epochs_done
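
        # Illustrative call sketch, not from the source (shapes are assumptions):
        # with `sent_embs` of shape (num_sents, sent_emb_dim), `samples` an int32
        # matrix of sampled sentence indices, `max_num_sents` the padding length,
        # `context` a vector of length flic_dim and `y` a scalar target, one
        # training step would be
        #     probs_mean, cost = self.train_model_t(sent_embs, samples,
        #                                           max_num_sents, context, y)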
#
#####################################


# Create the train and predict_labels function
n_in = 2*windowSize+1
n_hidden = numHiddenUnits
n_out = len(label2Idx)

number_of_epochs = 10
minibatch_size = 35
embedding_size = embeddings.shape[1]

dim_case = 6

x = T.imatrix('x')  # the data, one word+context per row
y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

print "Embeddings shape", embeddings.shape

words = Sequential()
words.add(Embedding(output_dim=embeddings.shape[1], input_dim=embeddings.shape[0], input_length=n_in,  weights=[embeddings]))       
words.layers[0].trainable_weights = [] #Fixed Embedding layer
words.add(Flatten())

casing = Sequential()  
casing.add(Embedding(output_dim=caseMatrix.shape[1], input_dim=caseMatrix.shape[0], input_length=n_in, weights=[caseMatrix]))    
casing.layers[0].trainable_weights = [] #Fixed Embedding layer    
casing.add(Flatten())
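
# Sketch of how the two fixed-embedding branches are typically combined in this
# Keras 1.x style of code (the Merge layer and the hidden/output sizes below are
# assumptions, not taken from the original snippet):
# model = Sequential()
# model.add(Merge([words, casing], mode='concat'))
# model.add(Dense(output_dim=n_hidden, activation='tanh'))
# model.add(Dense(output_dim=n_out, activation='softmax'))
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')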
Exemple #46
    def setupSetMacroBatchSubset(self):

        if isinstance(self.tvsData_x, list):
            data_block = [
                T.tensor4('data_block_{}'.format(i))
                for i in range(len(self.tvsData_x))
            ]
            data_updates = [(dx, T.set_subtensor(dx[:db.shape[0]], db))
                            for (dx, db) in zip(self.tvsData_x, data_block)]
        else:
            data_block = T.tensor4('data_block')
            data_updates = [
                (self.tvsData_x,
                 T.set_subtensor(self.tvsData_x[:data_block.shape[0]],
                                 data_block))
            ]
        self.tfSetMacroBatchSubsetData = theano.function(inputs=[data_block],
                                                         updates=data_updates)

        if self.cfgParams.use_labels:
            y_block = T.ivector('y_block')
            y_updates = [(self.tvsData_y,
                          T.set_subtensor(self.tvsData_y[:y_block.shape[0]],
                                          y_block))]
            self.tfSetMacroBatchSubsetY = theano.function(inputs=[y_block],
                                                          updates=y_updates)

        if self.cfgParams.use_regtargets:
            yr_block = T.vector('yr_block')
            yr_updates = [(self.tvsData_yr,
                           T.set_subtensor(self.tvsData_yr[:yr_block.shape[0]],
                                           yr_block))]
            self.tfSetMacroBatchSubsetYR = theano.function(inputs=[yr_block],
                                                           updates=yr_updates)

        if self.cfgParams.use_pairs:
            pairIdx_block = T.imatrix('pairIdx_block')
            pairLabels_block = T.ivector('pairLabels_block')
            pair_updates = [
                (self.tvsData_pairIdx,
                 T.set_subtensor(self.tvsData_pairIdx[:pairIdx_block.shape[0]],
                                 pairIdx_block)),
                (self.tvsData_pairLabels,
                 T.set_subtensor(
                     self.tvsData_pairLabels[:pairLabels_block.shape[0]],
                     pairLabels_block))
            ]
            self.tfSetMacroBatchSubsetPairs = theano.function(
                inputs=[pairIdx_block, pairLabels_block], updates=pair_updates)

        if self.cfgParams.use_triplets:
            tripletIdx_block = T.imatrix('tripletIdx_block')
            triplets_updates = [
                (self.tvsData_tripletIdx,
                 T.set_subtensor(
                     self.tvsData_tripletIdx[:tripletIdx_block.shape[0]],
                     tripletIdx_block))
            ]
            self.tfSetMacroBatchSubsetTriplets = theano.function(
                inputs=[tripletIdx_block], updates=triplets_updates)

        if self.cfgParams.use_tripletPools:
            tripletPoolIdx_block = T.imatrix('tripletPoolIdx_block')
            tripletPools_updates = [
                (self.tvsData_tripletPoolIdx,
                 T.set_subtensor(
                     self.tvsData_tripletPoolIdx[:tripletPoolIdx_block.shape[0]],
                     tripletPoolIdx_block))
            ]
            self.tfSetMacroBatchSubsetTripletPools = theano.function(
                inputs=[tripletPoolIdx_block], updates=tripletPools_updates)
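
        # Usage sketch (illustrative): each tfSetMacroBatchSubset* function copies
        # a numpy block into the head of its shared variable, overwriting only the
        # first block.shape[0] rows, e.g.
        #     self.tfSetMacroBatchSubsetData(data_block_np)
        #     self.tfSetMacroBatchSubsetY(y_block_np)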
Exemple #47
import theano
import theano.tensor as T
import numpy as np

a = T.imatrix()
b = T.imatrix()

ok = T.horizontal_stack(a, b)

myfunc = theano.function([a, b], ok)

a_init = np.reshape(np.arange(10, dtype='int32'), (2, 5))
b_init = np.reshape(np.arange(10, 20, dtype='int32'), (2, 5))

ok = myfunc(a_init, b_init)

print ok
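
# For reference, with the two (2, 5) int32 inputs above, horizontal_stack
# concatenates along axis 1, so the printed result is a (2, 10) matrix:
# [[ 0  1  2  3  4 10 11 12 13 14]
#  [ 5  6  7  8  9 15 16 17 18 19]]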
Exemple #48
    train_datastream = get_datastream(path=args.data_path,
                                      which_set=args.train_dataset,
                                      batch_size=args.batch_size)
    valid_datastream = get_datastream(path=args.data_path,
                                      which_set=args.valid_dataset,
                                      batch_size=args.batch_size)
    test_datastream = get_datastream(path=args.data_path,
                                     which_set=args.test_dataset,
                                     batch_size=args.batch_size)

    #################
    # build network #
    #################
    print('Building and compiling network')
    input_data = T.ftensor3('input_data')
    input_mask = T.fmatrix('input_mask')
    target_data = T.imatrix('target_data')
    target_mask = T.fmatrix('target_mask')
    network_output = deep_prj_lstm_model_v1(input_var=input_data,
                                            mask_var=input_mask,
                                            num_inputs=input_dim,
                                            num_outputs=output_dim,
                                            num_layers=args.num_layers,
                                            num_units=args.num_units,
                                            num_prjs=args.num_prjs,
                                            grad_clipping=args.grad_clipping,
                                            dropout=args.dropout)

    network = network_output
    network_params = get_all_params(network, trainable=True)
    network_reg_params = get_all_params(network,
                                        trainable=True,
Exemple #49
    def __init__(self):
        super(SimpleVLblNceTrainer, self).__init__()
        self.h_indices = debug_print(T.imatrix('h'), 'h_indices')
        self.w_indices = debug_print(T.ivector(name='w'), 'w_indices')
        self.inputs = [self.h_indices, self.w_indices]
def build_network_from_ae(classn):
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           100,
                           filter_size=(5, 5),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           120,
                           filter_size=(5, 5),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer,
                               pool_size=(2, 2),
                               stride=2,
                               mode='average_inc_pad')
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           240,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           320,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer,
                               pool_size=(2, 2),
                               stride=2,
                               mode='average_inc_pad')
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           640,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    prely = batch_norm(
        layers.Conv2DLayer(layer,
                           1024,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))

    featm = batch_norm(
        layers.Conv2DLayer(prely,
                           640,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    feat_map = batch_norm(
        layers.Conv2DLayer(featm,
                           100,
                           filter_size=(1, 1),
                           nonlinearity=rectify,
                           name="feat_map"))
    maskm = batch_norm(
        layers.Conv2DLayer(prely,
                           100,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    mask_rep = batch_norm(layers.Conv2DLayer(maskm,
                                             1,
                                             filter_size=(1, 1),
                                             nonlinearity=None),
                          beta=None,
                          gamma=None)
    mask_map = SoftThresPerc(mask_rep,
                             perc=97.0,
                             alpha=0.1,
                             beta=init.Constant(0.5),
                             tight=100.0,
                             name="mask_map")
    enlyr = ChInnerProdMerge(feat_map, mask_map, name="encoder")

    layer = batch_norm(
        layers.Deconv2DLayer(enlyr,
                             1024,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             640,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             640,
                             filter_size=(4, 4),
                             stride=2,
                             crop=(1, 1),
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             320,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             320,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             240,
                             filter_size=(4, 4),
                             stride=2,
                             crop=(1, 1),
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             120,
                             filter_size=(5, 5),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             100,
                             filter_size=(5, 5),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer,
                                 3,
                                 filter_size=(1, 1),
                                 stride=1,
                                 crop='same',
                                 nonlinearity=identity)

    glblf = batch_norm(
        layers.Conv2DLayer(prely,
                           128,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf,
                               pool_size=(5, 5),
                               stride=5,
                               mode='average_inc_pad')
    glblf = batch_norm(
        layers.Conv2DLayer(glblf,
                           64,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    gllyr = batch_norm(layers.Conv2DLayer(glblf,
                                          5,
                                          filter_size=(1, 1),
                                          nonlinearity=rectify),
                       name="global_feature")

    glblf = batch_norm(
        layers.Deconv2DLayer(gllyr,
                             256,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             128,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             128,
                             filter_size=(9, 9),
                             stride=5,
                             crop=(2, 2),
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             128,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             128,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             64,
                             filter_size=(4, 4),
                             stride=2,
                             crop=(1, 1),
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             64,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             64,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             32,
                             filter_size=(4, 4),
                             stride=2,
                             crop=(1, 1),
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             32,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             32,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf,
                                 3,
                                 filter_size=(1, 1),
                                 stride=1,
                                 crop='same',
                                 nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])

    network = ReshapeLayer(layer, ([0], -1))
    layers.set_all_param_values(network,
                                pickle.load(open(filename_model_ae, 'rb')))
    mask_map.beta.set_value(np.float32(0.8 * mask_map.beta.get_value()))
    old_params = layers.get_all_params(network, trainable=True)

    # Adding more layers
    aug_var = T.matrix('aug_var')
    target_var = T.imatrix('targets')
    add_a = batch_norm(
        layers.Conv2DLayer(enlyr,
                           320,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    add_b = batch_norm(
        layers.Conv2DLayer(add_a,
                           320,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    add_c = batch_norm(
        layers.Conv2DLayer(add_b,
                           320,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    add_d = batch_norm(
        layers.Conv2DLayer(add_c,
                           320,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    add_0 = layers.Pool2DLayer(add_d,
                               pool_size=(25, 25),
                               stride=25,
                               mode='average_inc_pad')
    add_1 = batch_norm(
        layers.DenseLayer(add_0, 100, nonlinearity=leaky_rectify))

    add_2 = batch_norm(
        layers.DenseLayer(gllyr, 320, nonlinearity=leaky_rectify))
    add_3 = batch_norm(
        layers.DenseLayer(add_2, 320, nonlinearity=leaky_rectify))
    add_4 = batch_norm(
        layers.DenseLayer(add_3, 100, nonlinearity=leaky_rectify))

    aug_layer = layers.InputLayer(shape=(None, aug_fea_n), input_var=aug_var)

    cat_layer = lasagne.layers.ConcatLayer([add_1, add_4, aug_layer], axis=1)

    hidden_layer = layers.DenseLayer(cat_layer, 80, nonlinearity=leaky_rectify)
    network = layers.DenseLayer(hidden_layer, classn, nonlinearity=sigmoid)

    all_params = layers.get_all_params(network, trainable=True)
    new_params = [x for x in all_params if x not in old_params]

    return network, new_params, input_var, aug_var, target_var
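
# Illustrative fine-tuning sketch, not part of the original source: `new_params`
# is returned separately so the freshly added classification head can be updated
# on its own while the pretrained autoencoder weights stay fixed, e.g.
#     output = lasagne.layers.get_output(network)
#     loss = lasagne.objectives.binary_crossentropy(output, target_var).mean()
#     updates = lasagne.updates.nesterov_momentum(loss, new_params,
#                                                 learning_rate=1e-3, momentum=0.9)
#     train_fn = theano.function([input_var, aug_var, target_var], loss,
#                                updates=updates, allow_input_downcast=True)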
Exemple #51
    def __init__(self, num_hidden, num_classes, context_win_size, embeddings,
                 featdim=0, fine_tuning=False, truncate_gradient=-1):
        """
        num_hidden :: dimension of the hidden layer
        num_classes :: number of classes
        context_win_size :: word window context size
        embeddings :: matrix
        """
        # hyper parameters of the model

        self.hyperparams = {}

        # nh :: dimension of the hidden layer
        nh = num_hidden
        self.hyperparams['nh'] = nh

        # nc :: number of classes
        nc = num_classes
        self.hyperparams['nc'] = nc

        # de :: dimension of the word embeddings
        de = embeddings.shape[1]
        self.hyperparams['de'] = de

        # cs :: word window context size
        cs = context_win_size
        self.hyperparams['cs'] = cs

        self.hyperparams['featdim'] = featdim
        self.hyperparams['fine_tuning'] = fine_tuning
        self.hyperparams['truncate_gradient'] = truncate_gradient

        # parameters of the model
        self.emb = theano.shared(embeddings.astype(theano.config.floatX))

        # inputs
        idxs = T.imatrix()
        w = T.fscalar('w')
        x = self.emb[idxs].reshape((idxs.shape[0], de * cs))*w
        y = T.iscalar('y')
        y_sentence = T.ivector('y_sentence')
        f = T.matrix('f')
        f = f.reshape((idxs.shape[0], featdim))  # reshape returns a new variable; keep the result

        # forward parameters of the model
        self.fWx = theano.shared(0.2 * np.random.uniform(-1.0, 1.0,
                                                         (de * cs, nh)).astype(theano.config.floatX))

        self.fWh = theano.shared(0.2 * np.random.uniform(-1.0, 1.0,
                                                         (nh, nh)).astype(theano.config.floatX))

        self.fbh = theano.shared(np.zeros(nh, dtype=theano.config.floatX))

        self.fh0 = theano.shared(np.zeros(nh, dtype=theano.config.floatX))

        fparams = [self.fWx, self.fWh, self.fbh, self.fh0]
        fnames = ['fWx', 'fWh', 'fbh', 'fh0']

        def frecurrence(x_t, h_tm1):
            h_t = T.nnet.sigmoid(T.dot(x_t, self.fWx) + T.dot(h_tm1, self.fWh) + self.fbh)
            return h_t

        fh, _ = theano.scan(fn=frecurrence,
                            sequences=x,
                            outputs_info=[self.fh0],
                            n_steps=x.shape[0],
                            truncate_gradient=truncate_gradient)

        # backwards parameters of the model
        self.bWx = theano.shared(0.2 * np.random.uniform(-1.0, 1.0,
                                                         (de * cs, nh)).astype(theano.config.floatX))

        self.bWh = theano.shared(0.2 * np.random.uniform(-1.0, 1.0,
                                                         (nh, nh)).astype(theano.config.floatX))

        self.bbh = theano.shared(np.zeros(nh, dtype=theano.config.floatX))

        self.bh0 = theano.shared(np.zeros(nh, dtype=theano.config.floatX))

        bparams = [self.bWx, self.bWh, self.bbh, self.bh0]
        bnames = ['bWx', 'bWh', 'bbh', 'bh0']

        def brecurrence(x_t, h_tm1):
            h_t = T.nnet.sigmoid(T.dot(x_t, self.bWx) + T.dot(h_tm1, self.bWh) + self.bbh)
            return h_t

        bh, _ = theano.scan(fn=brecurrence,
                            sequences=x,
                            outputs_info=[self.bh0],
                            n_steps=x.shape[0],
                            go_backwards=True,
                            truncate_gradient=truncate_gradient)

        # reverse the backward hidden states so they align with the forward direction
        bh = bh[::-1]

        # concatenation parameters
        self.bW = theano.shared(0.2 * np.random.uniform(-1.0, 1.0,
                                                        (nh+featdim, nc)).astype(theano.config.floatX))

        self.fW = theano.shared(0.2 * np.random.uniform(-1.0, 1.0,
                                                        (nh+featdim, nc)).astype(theano.config.floatX))

        self.b = theano.shared(np.zeros(nc, dtype=theano.config.floatX))

        # adding features
        if featdim > 0:
            fh_final = T.concatenate([fh, f], axis=1)
            bh_final = T.concatenate([bh, f], axis=1)
        else:
            fh_final = fh
            bh_final = bh

        # "concatenating" forward and backward hidden states
        h = T.dot(bh_final, self.bW) + T.dot(fh_final, self.fW)

        s = T.nnet.softmax(h + self.b)

        p_y_given_x_lastword = s[-1, :]
        p_y_given_x_sentence = s

        self.params = fparams + bparams + [self.bW, self.fW, self.b]
        self.names = fnames + bnames + ['bW', 'fW', 'b']

        if fine_tuning:
            self.params.append(self.emb)
            self.names.append("emb")

        # prediction
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost functions
        sentence_nll = -T.mean(T.log(p_y_given_x_sentence)
                               [T.arange(x.shape[0]), y_sentence])

        nll = -T.mean(T.log(p_y_given_x_lastword)[y])

        # gradients
        sentence_gradients = T.grad(sentence_nll, self.params)
        gradients = T.grad(nll, self.params)

        # learning rate
        lr = T.scalar('lr')

        # updates
        sentence_updates = OrderedDict((p, p - lr * g)
                                       for p, g in
                                       zip(self.params, sentence_gradients))

        updates = OrderedDict((p, p - lr * g)
                              for p, g in
                              zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(inputs=[idxs, f, In(w, value=1.0)],
                                        outputs=y_pred,
                                        on_unused_input='ignore')

        self.sentence_train = theano.function(inputs=[idxs, f, y_sentence, lr, In(w, value=1.0)],
                                              outputs=sentence_nll,
                                              updates=sentence_updates,
                                              on_unused_input='ignore')

        self.train = theano.function(inputs=[idxs, f, y, lr, In(w, value=1.0)],
                                     outputs=nll,
                                     updates=updates,
                                     on_unused_input='ignore')

        self.predict = theano.function(inputs=[idxs, f, In(w, value=1.0)],
                                       outputs=p_y_given_x_sentence,
                                       on_unused_input='ignore')

        self.normalize = theano.function(
            inputs=[],
            updates={self.emb:
                     self.emb / T.sqrt((self.emb ** 2).sum(axis=1)).dimshuffle(0, 'x')})
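
        # Illustrative training-loop sketch (variable names are hypothetical, not
        # from the source): per sentence, pass the context-window index matrix,
        # the extra feature matrix and the label vector, then renormalize the
        # embedding rows to unit L2 norm.
        #     nll = model.sentence_train(idxs, feats, y_sentence, lr)
        #     model.normalize()
        #     y_pred = model.classify(idxs, feats)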
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', default=None, help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm',
                        default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim,
                                                            W=embedd_table, name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            return layer_input

    def construct_char_input_layer():
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-CNN-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
    embedd_table, label_alphabet, \
    C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(
        train_path, dev_path, test_path, word_column=0, label_column=1, oov=oov,
        fine_tune=fine_tune, embedding=embedding, embedding_path=embedding_path,
        use_character=True)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # construct input and mask layers
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-rnn-cnn
    num_units = args.num_units

    bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels, mask=layer_mask,
                                           grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters,
                                           dropout=dropout)

    logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    # get output of bi-lstm-cnn-crf, shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf)
    energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True)

    loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    loss_eval = crf_loss(energies_eval, target_var, mask_var).mean()
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    _, corr_train = crf_accuracy(energies_train, target_var)

    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)
    debug_out = crf_nbest_debug(energies_eval,target_var)
    # crf_para = crf_parameter(energies_eval)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_tokens, prediction_eval, energies_eval])
    debug_fn = theano.function([input_var, target_var, mask_var, char_input_var], debug_out,on_unused_input='ignore')


    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \
        % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size,
            grad_clipping,
            peepholes))
    num_batches = num_data / batch_size
    num_epochs = 3
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train,
                                               batch_size=batch_size, shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            train_err += err * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_inst, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        assert train_inst == num_data
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / num_data, train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0

        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev, batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions, crf_para = eval_fn(inputs, targets, masks, char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet,
                                         is_flattened=False)
            # debug_out = debug_fn(inputs, targets, masks, char_inputs)
            # print "debug out:", debug_out[1]
            crf_nbest.write_nbest(inputs, targets, masks, crf_para,label_alphabet,'tmp/dev_nbest%d' % epoch, 10, is_flattened=False)


        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total)

        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            test_inst = 0
            for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, char_inputs = batch
                err, corr, num, predictions,crf_para = eval_fn(inputs, targets, masks, char_inputs)
                test_err += err * inputs.shape[0]
                test_corr += corr
                test_total += num
                test_inst += inputs.shape[0]
                if output_predict:
                    utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet,
                                             is_flattened=False)

            print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
                test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total)

            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # early stopping: quit if dev performance has not improved for `patience` consecutive epochs
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                        [loss_train, corr_train, num_tokens],
                                        updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_inst, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total)
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_inst, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total)
    print "BiLSTM-CNN-CRF model finished!"
Exemple #53
    def __init__(self,
                 ne,
                 de,
                 na,
                 n_lstm,
                 n_out,
                 cs,
                 npos,
                 lr=0.05,
                 single_output=True,
                 output_activation=T.nnet.softmax,
                 cost_function='nll'):
        '''
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        na :: number of acoustic or language model features at each word step
                (acoustic context size in frames * number of features)
        n_lstm :: dimension of the lstm layer
        n_out :: number of classes
        cs :: word window context size
        npos :: number of pos tags
        '''

        # add one to ne for PADDING
        self.emb = init_weight((ne + 1, de), 'emb')
        self.n_in = (de * cs) + (npos * cs)
        self.n_lstm = n_lstm
        self.n_out = n_out
        self.W_xi = init_weight((self.n_in, self.n_lstm), 'W_xi')
        self.W_hi = init_weight((self.n_lstm, self.n_lstm), 'W_hi', 'svd')
        self.W_ci = init_weight((self.n_lstm, self.n_lstm), 'W_ci', 'svd')
        # bias to the input:
        self.b_i = shared(np.cast[dtype](np.random.uniform(-0.5,
                                                           .5,
                                                           size=n_lstm)))
        # forget gate weights:
        self.W_xf = init_weight((self.n_in, self.n_lstm), 'W_xf')
        self.W_hf = init_weight((self.n_lstm, self.n_lstm), 'W_hf', 'svd')
        self.W_cf = init_weight((self.n_lstm, self.n_lstm), 'W_cf', 'svd')
        # bias
        self.b_f = shared(np.cast[dtype](np.random.uniform(0, 1.,
                                                           size=n_lstm)))
        # memory cell gate weights:
        self.W_xc = init_weight((self.n_in, self.n_lstm), 'W_xc')
        self.W_hc = init_weight((self.n_lstm, self.n_lstm), 'W_hc', 'svd')
        # bias to the memory cell:
        self.b_c = shared(np.zeros(n_lstm, dtype=dtype))
        # output gate weights:
        self.W_xo = init_weight((self.n_in, self.n_lstm), 'W_xo')
        self.W_ho = init_weight((self.n_lstm, self.n_lstm), 'W_ho', 'svd')
        self.W_co = init_weight((self.n_lstm, self.n_lstm), 'W_co', 'svd')
        # bias on output gate:
        self.b_o = shared(np.cast[dtype](np.random.uniform(-0.5,
                                                           .5,
                                                           size=n_lstm)))
        # hidden to y matrix weights:
        self.W_hy = init_weight((self.n_lstm, self.n_out), 'W_hy')
        self.b_y = shared(np.zeros(n_out, dtype=dtype))  # output bias

        # Weights for L1 and L2
        self.L1_reg = 0.0
        self.L2_reg = 0.00001

        # all trainable parameters (note the output-gate input weights W_xo)
        self.params = [
            self.W_xi, self.W_hi, self.W_ci, self.b_i, self.W_xf, self.W_hf,
            self.W_cf, self.b_f, self.W_xc, self.W_hc, self.b_c, self.W_xo,
            self.W_ho, self.W_co, self.b_o, self.W_hy, self.b_y, self.emb
        ]
        self.names = [
            "W_xi", "W_hi", "W_ci", "b_i", "W_xf", "W_hf", "W_cf", "b_f",
            "W_xc", "W_hc", "b_c", "W_xo", "W_ho", "W_co", "b_o", "W_hy",
            "b_y", "embeddings"
        ]

        def step_lstm(x_t, h_tm1, c_tm1):
            i_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_xi) + T.dot(h_tm1, self.W_hi) +
                T.dot(c_tm1, self.W_ci) + self.b_i)
            f_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_xf) + T.dot(h_tm1, self.W_hf) +
                T.dot(c_tm1, self.W_cf) + self.b_f)
            c_t = f_t * c_tm1 + i_t * T.tanh(
                T.dot(x_t, self.W_xc) + T.dot(h_tm1, self.W_hc) + self.b_c)
            o_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_xo) + T.dot(h_tm1, self.W_ho) +
                T.dot(c_t, self.W_co) + self.b_o)
            h_t = o_t * T.tanh(c_t)
            y_t = T.nnet.softmax(T.dot(h_t, self.W_hy) + self.b_y)
            return [h_t, c_t, y_t]

        # batch of sequence of vectors
        self.idxs = T.imatrix()
        self.pos_idxs = T.imatrix()

        # identity matrix used as a one-hot encoding of the POS tags (cheap in memory)
        self.pos = T.eye(npos, npos, 0)
        # TODO No pos
        # x = self.emb[self.idxs].reshape((self.idxs.shape[0], de*cs))
        # POS version
        x = T.concatenate((self.emb[self.idxs].reshape(
            (self.idxs.shape[0], de * cs)), self.pos[self.pos_idxs].reshape(
                (self.pos_idxs.shape[0], npos * cs))), 1)

        self.y = T.iscalar('y')
        # initial hidden state
        self.h0 = shared(np.zeros(shape=self.n_lstm, dtype=dtype))
        self.c0 = shared(np.zeros(shape=self.n_lstm, dtype=dtype))
        self.lr = T.scalar('lr')
        [h_vals, c_vals,
         y_vals], _ = theano.scan(fn=step_lstm,
                                  sequences=x,
                                  outputs_info=[self.h0, self.c0, None],
                                  n_steps=x.shape[0])
        self.output = y_vals
        p_y_given_x_lastword = self.output[-1, 0, :]
        p_y_given_x_sentence = self.output[:, 0, :]
        p_y_given_x_sentence_hidden = (h_vals, c_vals, self.output[:, 0, :])
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)
        # y_pred_word = T.argmax(p_y_given_x_lastword)

        self.cxe = T.mean(T.nnet.binary_crossentropy(self.output, self.y))
        self.nll = -T.mean(T.log(p_y_given_x_lastword)[self.y])
        self.mse = T.mean((self.output - self.y)**2)

        self.sentence_nll = -T.mean(
            T.log(p_y_given_x_sentence)[T.arange(x.shape[0]), self.y])

        self.L2_sqr = sum([(p**2).sum() for p in self.params])
        self.cost = self.nll + self.L2_reg * self.L2_sqr
        if cost_function == 'mse':
            self.cost = self.mse + self.L2_reg * self.L2_sqr
        elif cost_function == 'cxe':
            self.cost = self.cxe + self.L2_reg * self.L2_sqr
        self.debug = theano.function(
            inputs=[x, self.y],
            outputs=[x.shape, self.y.shape, y_vals.shape, self.cost.shape])
        gradients = T.grad(self.cost, self.params)
        self.updates = OrderedDict(
            (p, p - self.lr * g) for p, g in zip(self.params, gradients))
        self.loss = theano.function(inputs=[x, self.y], outputs=self.cost)
        # if na == 0: #assume no acoustic features for now
        # simply outputs the soft_max distribution for each word in utterance
        self.soft_max = theano.function(inputs=[self.idxs, self.pos_idxs],
                                        outputs=p_y_given_x_sentence)
        self.soft_max_return_hidden_layer = theano.function(
            inputs=[self.idxs, self.pos_idxs],
            outputs=p_y_given_x_sentence_hidden)
        if na == 0:
            self.train = theano.function(
                inputs=[self.idxs, self.pos_idxs, self.y, self.lr],
                outputs=self.cost,
                updates=self.updates)
            self.classify = theano.function(inputs=[self.idxs, self.pos_idxs],
                                            outputs=y_pred)
        else:
            self.train = theano.function(inputs=[
                self.idxs, self.pos_idxs, self.acoustic, self.y, self.lr
            ],
                                         outputs=self.cost,
                                         updates=self.updates)
            self.classify = theano.function(
                inputs=[self.idxs, self.pos_idxs, self.acoustic],
                outputs=y_pred)
        self.normalize = theano.function(
            inputs=[],
            updates={
                self.emb:
                self.emb / T.sqrt(
                    (self.emb**2).sum(axis=1)).dimshuffle(0, 'x')
            })
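
        # Illustrative call sketch for the na == 0 case (argument names are
        # hypothetical): each call takes a window matrix of word indices plus a
        # matching matrix of POS-tag indices for one utterance; the training cost
        # is taken on the last word, while soft_max/classify return per-word
        # distributions/labels, e.g.
        #     cost = model.train(word_idxs, pos_idxs, y, lr)
        #     probs = model.soft_max(word_idxs, pos_idxs)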
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=1e-5,
                    extra_size=4,
                    emb_size=300,
                    batch_size=20,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    max_term_len=4):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fix the random seed so the model generates reproducible results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1, all_word2, all_word1_mask, all_word2_mask, all_labels, word2id = load_wordnet_hyper_vs_all_with_words(
        maxlen=maxSentLen, wordlen=max_term_len
    )  # minlen; each example includes one label and at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1, test_word2, test_word1_mask, test_word2_mask, test_labels, word2id = load_EVAlution_hyper_vs_all_with_words(
        maxSentLen, word2id, wordlen=max_term_len)
    total_size = len(all_sentences_l)
    hold_test_size = 10000
    train_size = total_size - hold_test_size

    train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_word1 = np.asarray(all_word1[:train_size], dtype='int32')
    train_word2 = np.asarray(all_word2[:train_size], dtype='int32')
    test_word1 = np.asarray(test_word1, dtype='int32')
    test_word2 = np.asarray(test_word2, dtype='int32')

    train_word1_mask = np.asarray(all_word1_mask[:train_size],
                                  dtype=theano.config.floatX)
    train_word2_mask = np.asarray(all_word2_mask[:train_size],
                                  dtype=theano.config.floatX)
    test_word1_mask = np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask = np.asarray(test_word2_mask, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[:train_size], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  # initialize the embedding matrix from a Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap the numpy array "rand_values" in a theano shared variable

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_word1 = init_embeddings[word1_ids.flatten()].reshape(
        (batch_size, max_term_len, emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape(
        (batch_size, max_term_len, emb_size))
    word1_embedding = T.sum(embed_word1 * word1_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    word2_embedding = T.sum(embed_word2 * word2_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    # create AttentiveConv params
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_context]
    # attentive convolution function

    attentive_conv_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    "form input to LR classifier"
    LR_input = T.concatenate([
        attentive_sent_embeddings_l, attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r,
        attentive_sent_embeddings_l - attentive_sent_embeddings_r,
        word1_embedding, word2_embedding, word1_embedding * word2_embedding
    ],
                             axis=1)
    LR_input_size = 4 * hidden_size[1] + 3 * emb_size

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # LR weight matrix mapping LR_input_size features to 2 classes
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b
    )  #essentially a softmax over the product of the weight matrix and the input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification tasks we use negative log likelihood as the loss; lower is better

    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()

    params = NN_para + LR_para  #[init_embeddings]

    cost = loss  #+L2_weight*L2_reg

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    test_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, labels
    ], [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    if n_test_remain != 0:
        test_batch_start = list(
            np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    else:
        test_batch_start = list(np.arange(n_test_batches) * batch_size)

    # max_acc_dev=0.0
    max_ap_test = 0.0
    max_ap_topk_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle the training set each epoch; usually helps performance, but is not guaranteed to

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(
                train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                train_word1[train_id_batch], train_word2[train_id_batch],
                train_word1_mask[train_id_batch],
                train_word2_mask[train_id_batch],
                train_labels_store[train_id_batch])

            #after every 100 batches, test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                probs = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_i, prob_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_word1[test_batch_id:test_batch_id + batch_size],
                        test_word2[test_batch_id:test_batch_id + batch_size],
                        test_word1_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_word2_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                    pred_labels += list(pred_i)
                    probs += list(prob_i)
                if n_test_remain != 0:
                    probs = probs[:(len(test_batch_start) - 1) *
                                  batch_size] + probs[-n_test_remain:]
                assert len(test_labels) == len(probs)
                # test_acc=1.0-error_sum/(len(test_batch_start))
                test_ap = apk(test_labels, probs, k=len(test_labels))
                test_ap_top100 = apk(test_labels, probs, k=100)

                if test_ap > max_ap_test:
                    max_ap_test = test_ap
                if test_ap_top100 > max_ap_topk_test:
                    max_ap_topk_test = test_ap_top100
                print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
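# Hedged sketch (an assumption, not the original implementation) of the `apk`
# average-precision helper called in the evaluation loop above: it is assumed to rank
# the test pairs by predicted probability and score the binary gold labels.
def apk(gold_labels, probs, k):
    import numpy as np  # assumed available; imported locally to keep the sketch self-contained
    order = np.argsort(probs)[::-1][:k]                  # indices of the k highest-scoring pairs
    gold = np.asarray(gold_labels, dtype='float32')[order]
    hits = np.cumsum(gold)                               # running count of true positives
    precisions = hits / np.arange(1.0, len(gold) + 1.0)  # precision at each rank
    n_relevant = max(float(gold.sum()), 1.0)             # avoid division by zero
    return float(np.sum(precisions * gold) / n_relevant)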
  def _InitializeModelThatPredictsCharsMultiSoftmax(self, learning_rate, num_softmaxes=5):
    image_input = T.tensor4('image_input')
    print ("num_of_softmax: " + str(num_softmaxes))
    #prediction_layer = self._BuildModelToPredictFirstChar(image_input)
    prediction_layer = self._BuildModelToPredictCharsMultiSoftmax(
        image_input, num_softmaxes=num_softmaxes)

    target_chars_input = T.imatrix('target_chars_input')
    target_chars = target_chars_input[:, :num_softmaxes].reshape(shape=(-1,))

    # Create a loss expression for training, using cross-entropy loss.
    prediction = lasagne.layers.get_output(prediction_layer)
    l_loss = lasagne.objectives.categorical_crossentropy(prediction, target_chars)
    loss = l_loss.mean()

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum.
    params = lasagne.layers.get_all_params(prediction_layer, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
    	loss, params, learning_rate, momentum=0.9)
    #updates = lasagne.updates.adagrad(loss, params, learning_rate=0.0001)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(prediction_layer, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
							    target_chars)
    test_loss = test_loss.mean()

    predicted_chars = T.argmax(test_prediction, axis=1)
    correctly_predicted_chars = T.eq(predicted_chars, target_chars)
    # An expression for the classification accuracy:
    test_acc = T.mean(correctly_predicted_chars,
		      dtype=theano.config.floatX)
    predicted_chars = predicted_chars.reshape(shape=(-1, num_softmaxes))
    correctly_predicted_chars = correctly_predicted_chars.reshape(shape=(-1, num_softmaxes))
    num_chars_matched = T.sum(correctly_predicted_chars, axis=1, dtype=theano.config.floatX)
    seq_test_acc = T.mean(T.eq(num_chars_matched, T.fill(num_chars_matched, num_softmaxes)),
                          dtype=theano.config.floatX)
    test_prediction = test_prediction.reshape(shape=(-1, num_softmaxes, len(self.CHARS)))

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function(
        [image_input, target_chars_input],
        loss,
        updates=updates,
        allow_input_downcast=True)

    # Compile a second function computing the prediction, validation loss and accuracy:
    test_fn = theano.function([image_input, target_chars_input],
			      [test_loss, test_acc, seq_test_acc],
                              allow_input_downcast=True)

    # Compile a third function computing the prediction.
    inference_fn = theano.function([image_input],
			           [predicted_chars, test_prediction],
                                   allow_input_downcast=True)

    return prediction_layer, train_fn, test_fn, inference_fn
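  # Hedged usage sketch (not part of the original class): one way the three compiled
  # functions returned above could be driven. `images` (float32, shape
  # (N, channels, H, W)) and `labels` (int32, shape (N, num_softmaxes)) are
  # hypothetical placeholders; np is assumed to be numpy, imported at module level.
  def _TrainMultiSoftmaxSketch(self, images, labels, batch_size=32, num_epochs=5):
    prediction_layer, train_fn, test_fn, inference_fn = (
        self._InitializeModelThatPredictsCharsMultiSoftmax(learning_rate=0.01))
    for epoch in range(num_epochs):
      losses = []
      for start in range(0, len(images) - batch_size + 1, batch_size):
        losses.append(train_fn(images[start:start + batch_size],
                               labels[start:start + batch_size]))
      loss, char_acc, seq_acc = test_fn(images, labels)
      print("epoch %d: train loss %.4f, char acc %.4f, seq acc %.4f"
            % (epoch + 1, float(np.mean(losses)), float(char_acc), float(seq_acc)))
    return prediction_layer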
Exemple #56
0
 def _get_input_tensor_variables():
     # x_w: 1D: batch, 2D: n_words, 3D: 5 + window; word id
     # x_p: 1D: batch, 2D: n_words; position id
     # y: 1D: batch, 2D: n_words; label id
     return T.itensor3('x_w'), T.imatrix('x_p'), T.imatrix('y')
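# Hedged sketch (assumption, not part of the original example): numpy arrays shaped to
# match the three symbolic inputs returned above, for a hypothetical batch of 2
# sentences with 6 words each and a 5-token context window per word.
import numpy as np
x_w_batch = np.zeros((2, 6, 5 + 5), dtype='int32')  # word ids: batch x n_words x (5 + window)
x_p_batch = np.zeros((2, 6), dtype='int32')         # position ids: batch x n_words
y_batch = np.zeros((2, 6), dtype='int32')           # label ids: batch x n_words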
  def _InitializeModelThatPredictsAllChars(
      self, learning_rate, bidirectional_rnn=False, use_mask_input=False,
      lstm_layer_units=256, cnn_dense_layer_sizes=256, lstm_grad_clipping=False):
    image_input = T.tensor4('image_input')
    num_rnn_steps = self.num_rnn_steps
    target_chars_input = T.imatrix('target_chars')
    target_chars = target_chars_input[:, :num_rnn_steps]
    target_chars = target_chars.reshape(shape=(-1,))

    mask_input_input = None
    mask_input = None
    if use_mask_input:
      mask_input_input = T.imatrix('mask_input')
      mask_input = mask_input_input[:, :num_rnn_steps]
      #mask_input = mask_input.reshape(shape=(-1,))
    prediction_layer, l_cnn, l_lstm = self._BuildModelToPredictAllChars(
        image_input, num_rnn_steps=num_rnn_steps, mask_input=mask_input,
        bidirectional_rnn=bidirectional_rnn, lstm_layer_units=lstm_layer_units, 
        cnn_dense_layer_sizes= cnn_dense_layer_sizes,
        lstm_grad_clipping=lstm_grad_clipping)
        #lstm_grad_clipping=False)

    # Create a loss expression for training, using cross-entropy loss.
    #prediction = lasagne.layers.get_output(prediction_layer)
    prediction, l_cnn, l_lstm = tuple(
        lasagne.layers.get_output([prediction_layer, l_cnn, l_lstm]))
    l_loss = lasagne.objectives.categorical_crossentropy(prediction, target_chars)
    print ("prediction",prediction.shape,"target_char",target_chars.shape,"$$$$$$$$")
    if use_mask_input:
      l_loss = l_loss.reshape(shape=(-1, num_rnn_steps))
      l_loss *= mask_input
      loss = l_loss.sum() / mask_input.sum()
    else:
      loss = l_loss.mean()

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum.
    params = lasagne.layers.get_all_params(prediction_layer, trainable=True)

    #grads = theano.grad(loss, params)
    if lstm_grad_clipping:
      print('doing grad clipping')
      max_grad_norm = 15.0
      grads = theano.grad(loss, params)
      grads = [grad.clip(-5., 5.) for grad in grads]
      #grads, norm = lasagne.updates.total_norm_constraint(
      #	 grads, max_grad_norm, return_norm=True)
      grads = [lasagne.updates.norm_constraint(
                   grad, max_grad_norm, range(grad.ndim))
      	       for grad in grads]
      updates = lasagne.updates.adam(grads, params, learning_rate=learning_rate)

    else:
      updates = lasagne.updates.nesterov_momentum(
	  loss, params, learning_rate, momentum=0.9)
      #updates = lasagne.updates.adagrad(loss, params, learning_rate=0.001)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(prediction_layer, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
							    target_chars)
    test_loss = test_loss.mean()

    predicted_chars = T.argmax(test_prediction, axis=1)
    correctly_predicted_chars = T.eq(predicted_chars, target_chars)
    # An expression for the classification accuracy:
    test_acc = T.mean(correctly_predicted_chars,
		      dtype=theano.config.floatX)
    predicted_chars = predicted_chars.reshape(shape=(-1, num_rnn_steps))
    correctly_predicted_chars = correctly_predicted_chars.reshape(shape=(-1, num_rnn_steps))
    num_chars_matched = T.sum(correctly_predicted_chars, axis=1, dtype=theano.config.floatX)
    seq_test_acc = T.mean(T.eq(num_chars_matched, T.fill(num_chars_matched, num_rnn_steps)),
                          dtype=theano.config.floatX)
    test_prediction = test_prediction.reshape(shape=(-1, num_rnn_steps, len(self.CHARS)))

    mask_input_vec = [mask_input_input] if use_mask_input else []
    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    print ("target chars",image_input)
    train_fn = theano.function(
        [image_input, target_chars_input] + mask_input_vec,
        loss,
        updates=updates,
        allow_input_downcast=True)

    # Compile a second function computing the prediction, validation loss and accuracy:
    test_fn = theano.function([image_input, target_chars_input] + mask_input_vec,
			      [test_loss, test_acc, seq_test_acc,predicted_chars,target_chars,correctly_predicted_chars],
                              allow_input_downcast=True)

    # Compile a third function computing the prediction.
    inference_fn = theano.function([image_input] + mask_input_vec,
			           [predicted_chars, test_prediction],
                                   allow_input_downcast=True)

    return prediction_layer, train_fn, test_fn, inference_fn
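# Hedged numpy sketch (illustration only) of the masked loss used in
# _InitializeModelThatPredictsAllChars above: per-step losses are zeroed where the mask
# is 0 and the sum is normalised by the number of real (unmasked) steps rather than by
# batch_size * num_rnn_steps.
import numpy as np
per_step_loss = np.array([[0.7, 0.2, 0.9],
                          [0.5, 0.1, 0.3]])              # (batch, num_rnn_steps)
step_mask = np.array([[1., 1., 0.],
                      [1., 0., 0.]], dtype='float32')    # 1 = real character, 0 = padding
masked_loss = (per_step_loss * step_mask).sum() / step_mask.sum()  # (0.7 + 0.2 + 0.5) / 3 = 0.4667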
Exemple #58
0
def main():
    configure_theano()
    options = parse_options()
    config_file = options['config']
    config = ConfigParser.ConfigParser()
    config.read(config_file)

    print('CLI options: {}'.format(options.items()))

    print('Reading Config File: {}...'.format(config_file))
    print(config.items('stream1'))
    print(config.items('lstm_classifier'))
    print(config.items('training'))

    print('preprocessing dataset...')
    data = load_mat_file(config.get('stream1', 'data'))
    has_encoder = config.getboolean('stream1', 'has_encoder')
    stream1_dim = config.getint('stream1', 'input_dimensions')
    imagesize = tuple([int(d) for d in config.get('stream1', 'imagesize').split(',')])
    if has_encoder:
        stream1 = config.get('stream1', 'model')
        stream1_shape = config.get('stream1', 'shape')
        stream1_nonlinearities = config.get('stream1', 'nonlinearities')

    # lstm classifier
    output_classes = config.getint('lstm_classifier', 'output_classes')
    output_classnames = config.get('lstm_classifier', 'output_classnames').split(',')
    lstm_size = config.getint('lstm_classifier', 'lstm_size')
    matlab_target_offset = config.getboolean('lstm_classifier', 'matlab_target_offset')

    # lstm classifier configurations
    weight_init = options['weight_init'] if 'weight_init' in options else config.get('lstm_classifier', 'weight_init')
    use_peepholes = options['use_peepholes'] if 'use_peepholes' in options else config.getboolean('lstm_classifier',
                                                                                                  'use_peepholes')
    use_blstm = config.has_option('lstm_classifier', 'use_blstm')  # enabled if the option is present
    windowsize = config.getint('lstm_classifier', 'windowsize')

    # capture training parameters
    validation_window = int(options['validation_window']) \
        if 'validation_window' in options else config.getint('training', 'validation_window')
    num_epoch = int(options['num_epoch']) if 'num_epoch' in options else config.getint('training', 'num_epoch')
    learning_rate = options['learning_rate'] if 'learning_rate' in options \
        else config.getfloat('training', 'learning_rate')

    epochsize = config.getint('training', 'epochsize')
    batchsize = config.getint('training', 'batchsize')

    weight_init_fn = las.init.GlorotUniform()
    if weight_init == 'glorot':
        weight_init_fn = las.init.GlorotUniform()
    elif weight_init == 'norm':
        weight_init_fn = las.init.Normal(0.1)
    elif weight_init == 'uniform':
        weight_init_fn = las.init.Uniform()
    elif weight_init == 'ortho':
        weight_init_fn = las.init.Orthogonal()

    data_matrix = data['dataMatrix'].astype('float32')
    targets_vec = data['targetsVec'].reshape((-1,))
    subjects_vec = data['subjectsVec'].reshape((-1,))
    vidlen_vec = data['videoLengthVec'].reshape((-1,))
    iter_vec = data['iterVec'].reshape((-1,))

    data_matrix = presplit_dataprocessing(data_matrix, vidlen_vec, config, 'stream1', imagesize=imagesize)

    indexes = create_split_index(len(data_matrix), vidlen_vec, iter_vec)
    train_vidlen_vec, test_vidlen_vec = split_videolen(vidlen_vec, iter_vec)

    if matlab_target_offset:
        targets_vec -= 1

    # split the data
    train_data = data_matrix[indexes == True]
    train_targets = targets_vec[indexes == True]
    train_targets = train_targets.reshape((len(train_targets),))
    test_data = data_matrix[indexes == False]
    test_targets = targets_vec[indexes == False]
    test_targets = test_targets.reshape((len(test_targets),))

    train_data, test_data = postsplit_datapreprocessing(train_data, test_data, config, 'stream1')

    inputs = T.tensor3('inputs', dtype='float32')
    window = T.iscalar('theta')
    mask = T.matrix('mask', dtype='uint8')
    targets = T.imatrix('targets')

    print('constructing end to end model...')
    if not has_encoder:
        network = deltanet_v1.create_model((None, None, stream1_dim), inputs,
                                           (None, None), mask, window,
                                           lstm_size, output_classes, weight_init_fn, use_peepholes, use_blstm)
    else:
        ae1 = load_decoder(stream1, stream1_shape, stream1_nonlinearities)
        network = deltanet_majority_vote.create_model(ae1, (None, None, stream1_dim), inputs,
                                                      (None, None), mask,
                                                      lstm_size, window, output_classes, weight_init_fn, use_peepholes)

    print_network(network)
    draw_to_file(las.layers.get_all_layers(network), 'network.png', verbose=True)
    # exit()
    print('compiling model...')
    predictions = las.layers.get_output(network, deterministic=False)
    all_params = las.layers.get_all_params(network, trainable=True)
    cost = temporal_softmax_loss(predictions, targets, mask)
    updates = las.updates.adam(cost, all_params, learning_rate)

    train = theano.function(
        [inputs, targets, mask, window],
        cost, updates=updates, allow_input_downcast=True)
    compute_train_cost = theano.function([inputs, targets, mask, window], cost, allow_input_downcast=True)

    test_predictions = las.layers.get_output(network, deterministic=True)
    test_cost = temporal_softmax_loss(test_predictions, targets, mask)
    compute_test_cost = theano.function(
        [inputs, targets, mask, window], test_cost, allow_input_downcast=True)

    val_fn = theano.function([inputs, mask, window], test_predictions, allow_input_downcast=True)

    # We'll train the network for num_epoch epochs of epochsize minibatches each
    print('begin training...')
    cost_train = []
    cost_val = []
    class_rate = []
    STRIP_SIZE = 3
    val_window = circular_list(validation_window)
    train_strip = np.zeros((STRIP_SIZE,))
    best_val = float('inf')
    best_conf = None
    best_cr = 0.0

    datagen = gen_lstm_batch_random(train_data, train_targets, train_vidlen_vec, batchsize=batchsize)
    val_datagen = gen_lstm_batch_random(test_data, test_targets, test_vidlen_vec,
                                        batchsize=len(test_vidlen_vec))

    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val, idxs_val = next(val_datagen)
    # reshape the targets for validation
    y_val_evaluate = y_val
    y_val = y_val.reshape((-1, 1)).repeat(mask_val.shape[-1], axis=-1)

    for epoch in range(num_epoch):
        time_start = time.time()
        for i in range(epochsize):
            X, y, m, batch_idxs = next(datagen)
            # repeat targets based on max sequence len
            y = y.reshape((-1, 1))
            y = y.repeat(m.shape[-1], axis=-1)
            print_str = 'Epoch {} batch {}/{}: {} examples at learning rate = {:.4f}'.format(
                epoch + 1, i + 1, epochsize, len(X), learning_rate)
            print(print_str, end='')
            sys.stdout.flush()
            train(X, y, m, windowsize)
            print('\r', end='')
        cost = compute_train_cost(X, y, m, windowsize)
        val_cost = compute_test_cost(X_val, y_val, mask_val, windowsize)
        cost_train.append(cost)
        cost_val.append(val_cost)
        train_strip[epoch % STRIP_SIZE] = cost
        val_window.push(val_cost)

        gl = 100 * (cost_val[-1] / np.min(cost_val) - 1)
        pk = 1000 * (np.sum(train_strip) / (STRIP_SIZE * np.min(train_strip)) - 1)
        pq = gl / pk

        cr, val_conf = evaluate_model2(X_val, y_val_evaluate, mask_val, windowsize, val_fn)
        class_rate.append(cr)

        print("Epoch {} train cost = {}, validation cost = {}, "
              "generalization loss = {:.3f}, GQ = {:.3f}, classification rate = {:.3f} ({:.1f}sec)"
              .format(epoch + 1, cost_train[-1], cost_val[-1], gl, pq, cr, time.time() - time_start))

        if val_cost < best_val:
            best_val = val_cost
            best_conf = val_conf
            best_cr = cr

        if epoch >= validation_window and early_stop2(val_window, best_val, validation_window):
            break

    print('Best Model')
    print('classification rate: {}, validation loss: {}'.format(best_cr, best_val))
    print('confusion matrix: ')
    plot_confusion_matrix(best_conf, output_classnames, fmt='latex')
    plot_validation_cost(cost_train, cost_val, class_rate)
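# Hedged numpy sketch (illustration only) of the early-stopping statistics computed in the
# training loop above (Prechelt-style): GL is the percentage by which the current validation
# cost exceeds the best one seen so far, Pk measures training progress over the last strip
# of epochs, and PQ = GL / Pk stays small while training is still making progress.
import numpy as np
val_cost_history = [1.20, 1.05, 1.10]                 # validation costs so far
strip = np.array([0.90, 0.85, 0.80])                  # last STRIP_SIZE training costs
gl = 100.0 * (val_cost_history[-1] / np.min(val_cost_history) - 1)   # ~4.76
pk = 1000.0 * (np.sum(strip) / (len(strip) * np.min(strip)) - 1)     # ~62.5
pq = gl / pk                                                         # ~0.076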
def train_lbl(train_data, dev_data, test_data=[], 
              K=20, context_sz=2, learning_rate=1.0, 
              rate_update='simple', epochs=10, 
              batch_size=1, rng=None, patience=None, 
              patience_incr=2, improvement_thrs=0.995, 
              validation_freq=1000):

    """ Train log-bilinear model """
    # create vocabulary from train data, plus <s>, </s>
    
    logger.info("Creating vocabulary dictionary...")
    vocab = Dictionary.from_corpus(train_data, unk='<unk>')
    logger.info("Creating tag dictionary...")
    vocab_tags = Dictionary.from_corpus_tags(train_data, unk='<unk>')
    vocab.add_word('<s>')
    vocab.add_word('</s>')
    V = vocab.size()

    vocab_tags.add_word('<s>')
    vocab_tags.add_word('</s>')
    V_tag = vocab_tags.size()
    #print train_data
    
    # initialize random generator if not provided
    rng = np.random.RandomState() if not rng else rng
    
    logger.info("Making instances...")
    # generate (context, target) pairs of word ids
    train_set_x, train_set_y, train_set_tags = make_instances(train_data, vocab, vocab_tags, context_sz)
    dev_set_x, dev_set_y, dev_set_tags  = make_instances(dev_data, vocab, vocab_tags, context_sz)
    test_set_x, test_set_y, test_set_tags  = make_instances(test_data, vocab, vocab_tags, context_sz)
    
    # make feature_matrix 
    # very sparse matrix...better way to do it?
    feature_matrix = np.zeros((vocab_tags.size(),vocab_tags.num_sub_tags))
    feature_matrix[(0,0)] = 1 # unk encoding
    
    for tag,tag_id in vocab_tags:
        if tag == "<s>":
            feature_matrix[(tag_id,1)] = 1
        elif tag == "</s>":
            feature_matrix[(tag_id,2)] = 1
        else:
            for sub_tag in vocab_tags.map_tag_to_sub_tags[tag]:
                val = vocab_tags.map_sub_to_ids[sub_tag]
                feature_matrix[(tag_id,val)] = 1
             
    feature_matrix[1,:] = np.zeros((vocab_tags.num_sub_tags))
    # number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_dev_batches = dev_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # build the model
    logger.info("Build the model ...")
    index = T.lscalar()
    
    x = T.imatrix('x')
    y = T.ivector('y')
    t = T.ivector('t') # the tag vector
    
    # create log-bilinear model
    lbl = LogBilinearLanguageModel(x, V, K, vocab_tags.num_sub_tags, feature_matrix, context_sz, rng)
 

    # cost function is negative log likelihood of the training data
    cost = lbl.negative_log_likelihood(y,t)
  
    # compute the gradient
    gparams = []
    for param in lbl.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameter of the model
    updates = []
    for param, gparam in zip(lbl.params, gparams):
        updates.append((param, param - learning_rate * gparam))
                        
    # function that computes log-probability of the dev set
    logprob_dev = theano.function(inputs=[index], outputs=cost,
                                  givens={x: dev_set_x[index*batch_size:
                                                           (index+1)*batch_size],
                                          y: dev_set_y[index*batch_size:
                                                           (index+1)*batch_size],
                                          t: dev_set_tags[index*batch_size:(index+1)*batch_size]
                                          })


    # function that computes log-probability of the test set
    logprob_test = theano.function(inputs=[index], outputs=cost,
                                   givens={x: test_set_x[index*batch_size:
                                                             (index+1)*batch_size],
                                           y: test_set_y[index*batch_size:
                                                             (index+1)*batch_size],
                                           t: test_set_tags[index*batch_size:(index+1)*batch_size]
                                       })
    
    # function that returns the cost and updates the parameter 
    train_model = theano.function(inputs=[index], outputs=cost,
                                  updates=updates,
                                  givens={x: train_set_x[index*batch_size:
                                                             (index+1)*batch_size],
                                          y: train_set_y[index*batch_size:
                                                             (index+1)*batch_size],
                                          t: train_set_tags[index*batch_size:(index+1)*batch_size]
                                          })


    # perplexity functions
    def compute_dev_logp():
        return np.mean([logprob_dev(i) for i in xrange(n_dev_batches)])

    def compute_test_logp():
        return np.mean([logprob_test(i) for i in xrange(n_test_batches)])

    def ppl(neg_logp):
        return np.power(2.0, neg_logp)
    
    # train model
    logger.info("training model...")
    best_params = None
    last_epoch_dev_ppl = np.inf
    best_dev_ppl = np.inf
    test_ppl = np.inf
    test_core = 0
    start_time = time.clock()
    done_looping = False

    for epoch in xrange(epochs):
        if done_looping:
            break
        logger.info('epoch %i' % epoch) 
        for minibatch_index in xrange(n_train_batches):
            itr = epoch * n_train_batches + minibatch_index
            train_logp = train_model(minibatch_index)
            logger.info('epoch %i, minibatch %i/%i, train minibatch log prob %.4f ppl %.4f' % 
                         (epoch, minibatch_index+1, n_train_batches, 
                          train_logp, ppl(train_logp)))
            if (itr+1) % validation_freq == 0:
                # compute perplexity on dev set, lower is better
                dev_logp = compute_dev_logp()
                dev_ppl = ppl(dev_logp)
                logger.debug('epoch %i, minibatch %i/%i, dev log prob %.4f ppl %.4f' % 
                             (epoch, minibatch_index+1, n_train_batches, 
                              dev_logp, ppl(dev_logp)))
                # if we got the lowest perplexity until now
                if dev_ppl < best_dev_ppl:
                    # improve patience if loss improvement is good enough
                    if patience and dev_ppl < best_dev_ppl * improvement_thrs:
                        patience = max(patience, itr * patience_incr)
                    best_dev_ppl = dev_ppl
                    test_logp = compute_test_logp()
                    test_ppl = ppl(test_logp)
                    logger.debug('epoch %i, minibatch %i/%i, test log prob %.4f ppl %.4f' % 
                                 (epoch, minibatch_index+1, n_train_batches, 
                                  test_logp, ppl(test_logp)))
            # stop learning if no improvement was seen for a long time
            if patience and patience <= itr:
                done_looping = True
                break
        # adapt learning rate
        if rate_update == 'simple':
            # set learning rate to 1 / (epoch+1)
            learning_rate = 1.0 / (epoch+1)
        elif rate_update == 'adaptive':
            # half learning rate if perplexity increased at end of epoch (Mnih and Teh 2012)
            this_epoch_dev_ppl = ppl(compute_dev_logp())
            if this_epoch_dev_ppl > last_epoch_dev_ppl:
                learning_rate /= 2.0
            last_epoch_dev_ppl = this_epoch_dev_ppl
        elif rate_update == 'constant':
            # keep learning rate constant
            pass
        else:
            raise ValueError("Unknown learning rate update strategy: %s" %rate_update)
        
    end_time = time.clock()
    total_time = end_time - start_time
    logger.info('Optimization complete with best dev ppl of %.4f and test ppl %.4f' % 
                (best_dev_ppl, test_ppl))
    logger.info('Training took %d epochs, with %.1f epochs/sec' % (epoch+1, 
                float(epoch+1) / total_time))
    logger.info("Total training time %d days %d hours %d min %d sec." % 
                (total_time/60/60/24, total_time/60/60%24, total_time/60%60, total_time%60))
    # return model
    return lbl
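# Hedged sketch (illustration only) of the per-epoch learning-rate schedules handled at the
# end of each epoch above: 'simple' decays the rate as 1/(epoch+1), 'adaptive' halves it
# whenever dev perplexity rises (Mnih and Teh 2012), and 'constant' leaves it unchanged.
def update_learning_rate(rate_update, learning_rate, epoch,
                         this_epoch_dev_ppl, last_epoch_dev_ppl):
    if rate_update == 'simple':
        return 1.0 / (epoch + 1)
    elif rate_update == 'adaptive':
        if this_epoch_dev_ppl > last_epoch_dev_ppl:
            return learning_rate / 2.0
        return learning_rate
    elif rate_update == 'constant':
        return learning_rate
    raise ValueError("Unknown learning rate update strategy: %s" % rate_update)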
Exemple #60
0
        sys.exit("'Hidden layer size' argument missing!")

    if len(sys.argv) > 3:
        learning_rate = float(sys.argv[3])
    else:
        sys.exit("'Learning rate' argument missing!")

    model_file_name = "Model_%s_h%d_lr%s.pcl" % (model_name, num_hidden,
                                                 learning_rate)

    print num_hidden, learning_rate, model_file_name

    word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE)
    punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY)

    x = T.imatrix('x')
    y = T.imatrix('y')
    lr = T.scalar('lr')

    if os.path.isfile(model_file_name):
        print "Loading previous model state"

        net, state = models.load(model_file_name, MINIBATCH_SIZE, x)
        gsums, learning_rate, validation_ppl_history, starting_epoch, rng = state
        best_ppl = min(validation_ppl_history)

    else:
        rng = np.random
        rng.seed(1)

        print "Building model..."