Example #1
    def build_fn(self):
        all_inputs = []
        for i in range(len(self.objective.examples)):
            example = self.objective.examples[i]
            for j in range(len(example.const_list)):
                for f_name in sorted(self.fea_vecs.keys()):
                    all_inputs.append(self.input_vars[(i, j, f_name)])
        for label in sorted(self.output_targets.keys()):
            all_inputs.append(self.output_targets[label])
        for e_name in sorted(self.graph_targets.keys()):
            all_inputs.append(self.graph_targets[e_name])

        params = self.parameters.values()
        updates = lasagne.updates.sgd(self.loss, params, learning_rate = 1e0)
        self.train_fn = theano.function(all_inputs, self.loss, updates = updates, on_unused_input = 'ignore')

        test_inputs = []
        for i in range(len(self.objective.examples)):
            example = self.objective.examples[i]
            if len(example.const_list) != 1 or len(example.var_list) != 1: continue
            for f_name in sorted(self.fea_vecs.keys()):
                test_inputs.append(self.input_vars[(i, 0, f_name)])
        for label in sorted(self.output_targets.keys()):
            test_inputs.append(self.output_targets[label])

        py_sym = T.concatenate([self.output_vars[k].dimshuffle(0, 'x') for k in sorted(self.output_vars.keys())], axis = 1)
        y_sym = T.concatenate([self.output_targets[k].dimshuffle(0, 'x') for k in sorted(self.output_targets.keys())], axis = 1)
        acc = T.mean(T.eq(T.argmax(py_sym, axis = 1), T.argmax(y_sym, axis = 1)))

        self.test_fn = theano.function(test_inputs, acc, on_unused_input = 'ignore')
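The test function above uses the standard Theano accuracy idiom: compare the argmax of the predicted probabilities with the argmax of the one-hot targets and take the mean. A minimal, self-contained sketch of that idiom (toy shapes and values, not part of the original example):

import numpy as np
import theano
import theano.tensor as T

py = T.matrix('py')   # predicted probabilities, shape (batch, n_classes)
y = T.matrix('y')     # one-hot targets, same shape
acc = T.mean(T.eq(T.argmax(py, axis=1), T.argmax(y, axis=1)))
acc_fn = theano.function([py, y], acc)

probs = np.array([[0.1, 0.9], [0.8, 0.2]], dtype=theano.config.floatX)
onehot = np.array([[0., 1.], [1., 0.]], dtype=theano.config.floatX)
print(acc_fn(probs, onehot))   # -> 1.0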
Example #2
    def compile_model(self, weightMatrix=None):

        x = T.vector('x')  # Features
        y = T.iscalar('y') # (Gold) Label

        params = self.hidden_layers_params[:]
        # Creating the first hidden layer from the symbolic input vector x
        n_in, n_out = params.pop(0)
        self.hidden_layers.append(HL.HiddenLayer(x, n_in, n_out))

        if weightMatrix:
            self.hidden_layers[0].setW(weightMatrix[0][0], weightMatrix[0][1])
            weightMatrix.pop(0)

        # Creating the remaining hidden layers
        # Each layer's input is the previous layer's output
        for i in xrange(len(params)):
            n_in, n_out = params[i]
            self.hidden_layers.append(HL.HiddenLayer(self.hidden_layers[-1].output, n_in, n_out))
            if weightMatrix:
                self.hidden_layers[-1].setW(weightMatrix[i][0], weightMatrix[i][1])

        # Creating the logistic regression layer
        self.logreg_layer = LL.LogRegLayer(self.hidden_layers[-1].output, self.hidden_layers[-1].n_out, len(self.classes))
        if weightMatrix:
            self.logreg_layer.setW(weightMatrix[-1][0], weightMatrix[-1][1])

        # Calculating the cost of the network
        # The cost is the negative log likelihood of the gold label plus L1 and L2 regularization terms
        self.cost = -T.log(self.logreg_layer.output)[0,y]
        for hidden in self.hidden_layers:
            self.cost += self.L1(self.logreg_layer.W, hidden.W)
            self.cost += self.L2(self.logreg_layer.W, hidden.W)

        
        # Creating the update list
        # Each layer's weights are adjusted based on the cost
        updates = [(self.logreg_layer.W, self.sgd_step(self.logreg_layer.W)), (self.logreg_layer.b, self.sgd_step(self.logreg_layer.b))]
        updates.extend([(hidden.W, self.sgd_step(hidden.W)) for hidden in self.hidden_layers])
        updates.extend([(hidden.b, self.sgd_step(hidden.b)) for hidden in self.hidden_layers])

        # Creating the training model which is a theano function
        # Inputs are a feature vector and a label
        self.train_model = theano.function(
            inputs  = [x, y],
            outputs = self.cost, # <-- Output depends on cost, which depends on P(y | x)
            updates = updates,
        )

        # Creating the evaluating model which is a theano function
        # Inputs are a feature vector and a label
        self.devtest_model = theano.function(
            inputs  = [x, y],
            outputs = T.neq(y, T.argmax(self.logreg_layer.output[0]))
        )

        self.evaluate_model = theano.function( 
            inputs  = [x],
            outputs = T.argmax(self.logreg_layer.output[0])
        )
Example #3
 def error_classification(self,target):
     output, updates = theano.scan(fn=lambda a: T.nnet.softmax(a),
                           sequences=[self.output])
     y=T.mean(output,0)
     self.y_pred = T.argmax(y, axis=1)
     label=T.argmax(target, axis=1)
     return T.mean(T.neq(self.y_pred, label))
Example #4
    def __init__(self, input, input_dim, hidden_dim, output_dim,
                 activation=T.tanh, init='uniform', inner_init='orthonormal',
                 mini_batch=False, params=None):
        self.activation = activation
        self.mini_batch = mini_batch
        if mini_batch:
            input = input.dimshuffle(1, 0, 2)
        if params is None:
            self.W = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                   name='W',
                                   borrow=True
                                   )
            self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                   name='U',
                                   borrow=True
                                   )
            self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                                   name='V',
                                   borrow=True
                                   )
            self.bh = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                    name='bh',
                                    borrow=True)
            self.by = theano.shared(value=get(identifier='zero', shape=(output_dim, )),
                                    name='by',
                                    borrow=True)
        else:
            self.W, self.U, self.V, self.bh, self.by = params

        self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )), name='h0', borrow=True)
        self.params = [self.W, self.U, self.V, self.bh, self.by]

        if mini_batch:
            def recurrence(x_t, h_tm_prev):
                h_t = activation(T.dot(x_t, self.W) +
                                 T.dot(h_tm_prev, self.U) + self.bh)
                y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.by)
                return h_t, y_t

            [self.h_t, self.y_t], _ = theano.scan(
                recurrence,
                sequences=input,
                outputs_info=[T.alloc(self.h0, input.shape[1], hidden_dim), None]
            )
            self.h_t = self.h_t.dimshuffle(1, 0, 2)
            self.y_t = self.y_t.dimshuffle(1, 0, 2)
            self.y = T.argmax(self.y_t, axis=2)
        else:
            def recurrence(x_t, h_tm_prev):
                h_t = activation(T.dot(x_t, self.W) +
                                 T.dot(h_tm_prev, self.U) + self.bh)
                y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.by)
                return h_t, y_t[0]

            [self.h_t, self.y_t], _ = theano.scan(
                recurrence,
                sequences=input,
                outputs_info=[self.h0, None]
            )
            self.y = T.argmax(self.y_t, axis=1)
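The two scan branches above differ mainly in which axis the final argmax runs over: a single sequence yields scores of shape (time, n_classes), while a mini-batch (after the dimshuffle) yields (batch, time, n_classes). A small sketch with illustrative values:

import numpy as np
import theano
import theano.tensor as T

seq_scores = T.matrix('seq_scores')        # (time, n_classes)
batch_scores = T.tensor3('batch_scores')   # (batch, time, n_classes)
labels_seq = theano.function([seq_scores], T.argmax(seq_scores, axis=1))
labels_batch = theano.function([batch_scores], T.argmax(batch_scores, axis=2))

s = np.array([[0.2, 0.8], [0.7, 0.3]], dtype=theano.config.floatX)
print(labels_seq(s))                # -> [1 0], one label per time step
print(labels_batch(s[None, :, :]))  # -> [[1 0]], one row of labels per sequence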
Example #5
    def compile(self, optimizer, loss, class_mode="categorical", theano_mode=None):
        self.optimizer = optimizers.get(optimizer)

        self.loss = objectives.get(loss)
        weighted_loss = weighted_objective(objectives.get(loss))

        # input of model
        self.X_train = self.get_input(train=True)
        self.X_test = self.get_input(train=False)

        self.y_train = self.get_output(train=True)
        self.y_test = self.get_output(train=False)

        # target of model
        self.y = T.zeros_like(self.y_train)

        self.weights = T.ones_like(self.y_train)

        train_loss = weighted_loss(self.y, self.y_train, self.weights)
        test_loss = weighted_loss(self.y, self.y_test, self.weights)

        train_loss.name = 'train_loss'
        test_loss.name = 'test_loss'
        self.y.name = 'y'

        if class_mode == "categorical":
            train_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_train, axis=-1)))
            test_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_test, axis=-1)))

        elif class_mode == "binary":
            train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)))
            test_accuracy = T.mean(T.eq(self.y, T.round(self.y_test)))
        else:
            raise Exception("Invalid class mode:" + str(class_mode))
        self.class_mode = class_mode
        self.theano_mode = theano_mode

        for r in self.regularizers:
            train_loss = r(train_loss)
        updates = self.optimizer.get_updates(self.params, self.constraints, train_loss)

        if type(self.X_train) == list:
            train_ins = self.X_train + [self.y, self.weights]
            test_ins = self.X_test + [self.y, self.weights]
            predict_ins = self.X_test
        else:
            train_ins = [self.X_train, self.y, self.weights]
            test_ins = [self.X_test, self.y, self.weights]
            predict_ins = [self.X_test]

        self._train = theano.function(train_ins, train_loss,
            updates=updates, allow_input_downcast=True, mode=theano_mode)
        self._train_with_acc = theano.function(train_ins, [train_loss, train_accuracy],
            updates=updates, allow_input_downcast=True, mode=theano_mode)
        self._predict = theano.function(predict_ins, self.y_test,
            allow_input_downcast=True, mode=theano_mode)
        self._test = theano.function(test_ins, test_loss,
            allow_input_downcast=True, mode=theano_mode)
        self._test_with_acc = theano.function(test_ins, [test_loss, test_accuracy],
            allow_input_downcast=True, mode=theano_mode)
Example #6
 def get_predicted(self,data):
     for i in range(len(self.hidden_layers)):
         data=self.hidden_layers[i].get_predicted(data)
     p_y_given_x = T.nnet.softmax(T.dot(data, self.logRegressionLayer.W) + self.logRegressionLayer.b)
     y_pred = T.argmax(p_y_given_x, axis=1)
     y_pred_prob = T.max(p_y_given_x, axis=1)  # probability of the predicted class
     return y_pred,y_pred_prob
Example #7
    def get_monitoring_channels(self, model, X, Y = None):
        rval = OrderedDict()

        history = model.mf(X, return_history = True)
        q = history[-1]

        if self.supervised:
            assert Y is not None
            Y_hat = q[-1]
            true = T.argmax(Y,axis=1)
            pred = T.argmax(Y_hat, axis=1)

            #true = Print('true')(true)
            #pred = Print('pred')(pred)

            wrong = T.neq(true, pred)
            err = T.cast(wrong.mean(), X.dtype)
            rval['misclass'] = err

            if len(model.hidden_layers) > 1:
                q = model.mf(X, Y = Y)
                pen = model.hidden_layers[-2].upward_state(q[-2])
                Y_recons = model.hidden_layers[-1].mf_update(state_below = pen)
                pred = T.argmax(Y_recons, axis=1)
                wrong = T.neq(true, pred)

                rval['recons_misclass'] = T.cast(wrong.mean(), X.dtype)


        return rval
Example #8
    def __init__(self, rng, inp, sv, tv, hd, maxl):
        """
		rng: numpy.random.RandomState
		sv: source vocabulary size
		tv: target vocabulary size
		hd: dimension of hidden layer
		"""

        self.inw = theano.shared(0.2 * numpy.random.uniform(-1, 1, (sv, hd)).astype(theano.config.floatX))
        self.recurrent = theano.shared(0.2 * numpy.random.uniform(-1, 1, (hd, hd)).astype(theano.config.floatX))
        self.outw = theano.shared(0.2 * numpy.random.uniform(-1, 1, (hd, tv)).astype(theano.config.floatX))
        self.h0 = theano.shared(numpy.zeros(hd, dtype=theano.config.floatX))

        def recurrence(x_t, h_tm1):
            h_t = T.nnet.sigmoid(x_t + T.dot(h_tm1, self.recurrent))
            s_t = T.nnet.softmax(T.dot(h_t, self.outw))
            return [h_t, s_t]

        self.input = inp
        x = [self.inw[inp[0]]]
        h = [self.inw[inp[0]]]
        self.p_y_given_x = [T.nnet.softmax(T.dot(h[0], self.outw))]
        self.pred = [T.argmax(self.p_y_given_x[0])]
        for i in xrange(1, maxl):
            x.append(self.inw[inp[i]])
            h.append(x[i] + T.dot(h[i - 1], self.recurrent))
            self.p_y_given_x.append(T.nnet.softmax(T.dot(h[i], self.outw)))
            self.pred.append(T.argmax(self.p_y_given_x[i]))
Example #9
 def get_classification_accuracy(self, model, minibatch, target):
     
     patches = []
     patches.append(minibatch[:,:42,:42])
     patches.append(minibatch[:,6:,:42])
     patches.append(minibatch[:,6:,6:])
     patches.append(minibatch[:,:42,6:])
     patches.append(minibatch[:,3:45,3:45])
     """for i in xrange(5):
         mirror_patch = []
         for j in xrange(42):
             mirror_patch.append(patches[i][:,:,42-(j+1):42-j])
         patches.append(T.concatenate(mirror_patch,axis=2))"""
    
     """for patch in patches:
         Y_list.append(model.fprop(patch, apply_dropout=False))
      
     Y = T.mean(T.stack(Y_list), axis=(1,2))"""
     Y = model.fprop(patches[-1], apply_dropout=False) 
     i = 1
     for patch in patches[:-1]:
         Y = Y + model.fprop(patch, apply_dropout=False)
         i+=1
     print i
     Y = Y/float(i)
     return T.mean(T.cast(T.eq(T.argmax(Y, axis=1), 
                        T.argmax(target, axis=1)), dtype='int32'),
                        dtype=config.floatX)
Example #10
    def __call__(self, model, X, Y):

        batch_size = 32
        image_size = 96

        Y_hat = model.fprop(X)

        print "Warning: the size of the axe is set manually"
        Yx_hat = Y_hat[:, :image_size]
        Yy_hat = Y_hat[:, image_size:]

        Yx = Y[:, :image_size]
        Yy = Y[:, image_size:]

        epsylon = 1e-10

        costMatrix = T.matrix()
        max_x = T.argmax(Yx, axis=1)
        max_y = T.argmax(Yy, axis=1)

        costMatrix = T.sqr(
            T.log((Yx + epsylon) / (Yx[range(batch_size), max_x] + epsylon)[:, None])
            - T.log((Yx_hat + epsylon) / (Yx_hat[range(batch_size), max_x] + epsylon)[:, None])
        )
        costMatrix += T.sqr(
            T.log((Yy + epsylon) / (Yy[range(batch_size), max_y] + epsylon)[:, None])
            - T.log((Yy_hat + epsylon) / (Yy_hat[range(batch_size), max_y] + epsylon)[:, None])
        )

        costMatrix *= T.neq(T.sum(Y, axis=1), 0)[:, None]

        cost = costMatrix.sum(axis=1).mean()
        return cost
Example #11
 def __call__(self, model, X, Y):
     y_hat = model.fprop(X)
     y_hat = T.argmax(y_hat, axis=1)
     y = T.argmax(Y, axis=1)
     misclass = T.neq(y, y_hat).mean()
     misclass = T.cast(misclass, config.floatX)
     return misclass
Example #12
def nll_simple(Y, Y_hat,
               cost_mask=None,
               cost_ent_mask=None,
               cost_ent_desc_mask=None):

    probs = Y_hat
    pred = TT.argmax(probs, axis=1).reshape(Y.shape)
    errors = TT.neq(pred, Y)
    ent_errors = None
    if cost_ent_mask is not None:
        pred_ent = TT.argmax(probs * cost_ent_mask.dimshuffle('x', 0),
                             axis=1).reshape(Y.shape)
        ent_errors = TT.neq(pred_ent, Y).mean()

    ent_desc_errors = None
    if cost_ent_desc_mask is not None:
        pred_desc_ent = TT.argmax(probs * cost_ent_desc_mask,
                             axis=1).reshape(Y.shape)
        ent_desc_errors = TT.neq(pred_desc_ent, Y).mean()

    LL = TT.log(_grab_probs(probs, Y) + 1e-8).reshape(Y.shape)

    if cost_mask is not None:
        total = cost_mask * LL
        errors = cost_mask * errors
        ncosts = TT.sum(cost_mask)
        mean_errors = TT.sum(errors) / (ncosts)
        ave = -TT.sum(total) / Y.shape[1]
    else:
        mean_errors = TT.mean(errors)
        ave = -TT.sum(LL) / Y.shape[0]
    return ave, mean_errors, ent_errors, ent_desc_errors
Example #13
    def get_monitoring_channels(self, model, data, **kwargs):

        X_pure,Y_pure = data
        X_pure.tag.test_value = numpy.random.random(size=[5,784]).astype('float32')
        Y_pure.tag.test_value = numpy.random.randint(10,size=[5,1]).astype('int64')
        rval = OrderedDict()

        g = model.compressor
        d = model.discriminator

        yhat_pure = T.argmax(d.fprop(X_pure),axis=1).dimshuffle(0,'x')
        yhat_reconstructed = T.argmax(d.fprop(g.reconstruct(X_pure)),axis=1).dimshuffle(0,'x')

        rval['conviction_pure'] = T.cast(T.eq(yhat_pure,10).mean(), 'float32')
        rval['accuracy_pure'] = T.cast(T.eq(yhat_pure,Y_pure).mean(), 'float32')
        rval['inaccuracy_pure'] = 1 - rval['conviction_pure']-rval['accuracy_pure']

        rval['conviction_fake'] = T.cast(T.eq(yhat_reconstructed,10).mean(), 'float32')
        rval['accuracy_fake'] = T.cast(T.eq(yhat_reconstructed,Y_pure).mean(), 'float32')
        rval['inaccuracy_fake'] = 1 - rval['conviction_fake']-rval['accuracy_fake']

        rval['discernment_pure'] = rval['accuracy_pure']+rval['inaccuracy_pure']
        rval['discernment_fake'] = rval['conviction_fake']
        rval['discernment'] = 0.5*(rval['discernment_pure']+rval['discernment_fake'])

        # y = T.alloc(0., m, 1)  
        d_obj, g_obj = self.get_objectives(model, data)
        rval['objective_d'] = d_obj
        rval['objective_g'] = g_obj

        #monitor probability of true
        # rval['now_train_compressor'] = self.now_train_compressor
        return rval       
Example #14
def jaccard_metric(y_pred, y_true, n_classes, one_hot=False):

    assert (y_pred.ndim == 2) or (y_pred.ndim == 1)

    # y_pred to indices
    if y_pred.ndim == 2:
        y_pred = T.argmax(y_pred, axis=1)

    if one_hot:
        y_true = T.argmax(y_true, axis=1)

    # Compute confusion matrix
    # cm = T.nnet.confusion_matrix(y_pred, y_true)
    cm = T.zeros((n_classes, n_classes))
    for i in range(n_classes):
        for j in range(n_classes):
            cm = T.set_subtensor(
                cm[i, j], T.sum(T.eq(y_pred, i) * T.eq(y_true, j)))

    # Compute Jaccard Index
    TP_perclass = T.cast(cm.diagonal(), _FLOATX)
    FP_perclass = cm.sum(1) - TP_perclass
    FN_perclass = cm.sum(0) - TP_perclass

    num = TP_perclass
    denom = TP_perclass + FP_perclass + FN_perclass

    return T.stack([num, denom], axis=0)
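A hypothetical usage sketch for jaccard_metric, assuming it lives in a module that already imports theano.tensor as T and defines _FLOATX; the per-class Jaccard index is the returned numerator divided by the denominator (names and values below are illustrative):

import numpy as np
import theano
import theano.tensor as T

_FLOATX = 'float32'              # assumed to match the module-level constant used above

y_pred = T.matrix('y_pred')      # per-pixel class scores, shape (n_pixels, n_classes)
y_true = T.ivector('y_true')     # per-pixel integer labels
jacc_fn = theano.function([y_pred, y_true],
                          jaccard_metric(y_pred, y_true, n_classes=2))

scores = np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]], dtype='float32')
labels = np.array([0, 1, 1], dtype='int32')
num, denom = jacc_fn(scores, labels)
print(num / denom)               # per-class Jaccard, here [0.5 0.5]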
Example #15
def accuracy_metric(y_pred, y_true, void_labels, one_hot=False):

    assert (y_pred.ndim == 2) or (y_pred.ndim == 1)

    # y_pred to indices
    if y_pred.ndim == 2:
        y_pred = T.argmax(y_pred, axis=1)

    if one_hot:
        y_true = T.argmax(y_true, axis=1)

    # Compute accuracy
    acc = T.eq(y_pred, y_true).astype(_FLOATX)

    # Create mask
    mask = T.ones_like(y_true, dtype=_FLOATX)
    for el in void_labels:
        indices = T.eq(y_true, el).nonzero()
        if any(indices):
            mask = T.set_subtensor(mask[indices], 0.)

    # Apply mask
    acc *= mask
    acc = T.sum(acc) / T.sum(mask)

    return acc
Example #16
    def get_monitoring_channels_from_state(self, state, target=None):
        warnings.warn("Layer.get_monitoring_channels_from_state is " + \
                    "deprecated. Use get_layer_monitoring_channels " + \
                    "instead. Layer.get_monitoring_channels_from_state " + \
                    "will be removed on or after september 24th 2014",
                    stacklevel=2)

        mx = state.max(axis=1)

        rval =  OrderedDict([
                ('mean_max_class' , mx.mean()),
                ('max_max_class' , mx.max()),
                ('min_max_class' , mx.min())
        ])

        if target is not None:
            y_hat = self.target_convert(T.argmax(state, axis=1))
            #Assume target is in [0,1] as binary one-hot
            y = self.target_convert(T.argmax(target, axis=1))
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass
            rval['nll'] = self.cost(Y_hat=state, Y=target)

        return rval
Example #17
    def learningstep_m1(self, Y, L, M, W, epsilon):
        """Perform a single learning step.

        This is a faster learning step for the case of
        mini-batch-size = 1.

        Keyword arguments:
        the keyword arguments must be the same as given in
        self.input_parameters(mode) for mode='train'.
        """
        # Input integration:
        I = T.dot(T.log(W),Y)
        # recurrent term:
        vM = theano.ifelse.ifelse(
            T.eq(L,-1), # if no label is provided
            T.sum(M, axis=0),
            M[L,:]
            )
        # numeric trick to prevent overflow in the exp-function:
        max_exponent = 88. - T.log(I.shape[0]).astype('float32')
        scale = theano.ifelse.ifelse(T.gt(I[T.argmax(I)], max_exponent),
            I[T.argmax(I)] - max_exponent, 0.)
        # activation: recurrent softmax with overflow protection
        s = vM*T.exp(I-scale)/T.sum(vM*T.exp(I-scale))
        s.name = 's_%d.%d[t]'%(self._nmultilayer,self._nlayer)
        # weight update
        W_new = W + epsilon*(T.outer(s,Y) - s[:,np.newaxis]*W)
        W_new.name = 'W_%d.%d[t]'%(self._nmultilayer,self._nlayer)
        return s, W_new
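The subtraction of scale above is the usual max-shift trick: it keeps exp() finite without changing the normalized activation. A quick NumPy check with made-up values:

import numpy as np

I = np.array([10.0, 700.0, 5.0], dtype='float32')   # exp(700.) would overflow in float32
vM = np.ones_like(I)
max_exponent = 88. - np.log(I.shape[0]).astype('float32')
scale = max(I[np.argmax(I)] - max_exponent, 0.)
s = vM * np.exp(I - scale) / np.sum(vM * np.exp(I - scale))
print(s, s.sum())   # finite values that still sum to 1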
Example #18
    def get_train(self, batchsize=None, testsize=None):
        sx = tt.tensor4()
        sy = tt.ivector()

        yc = self._propup(sx, batchsize, noise=False)
        if 1:
            cost = -tt.log(tt.nnet.softmax(yc))[tt.arange(sy.shape[0]), sy].mean()
        else:
            from hinge import multi_hinge_margin
            cost = multi_hinge_margin(yc, sy).mean()

        error = tt.neq(tt.argmax(yc, axis=1), sy).mean()

        # get updates
        params = self.params
        grads = dict(zip(params, theano.grad(cost, params)))
        updates = collections.OrderedDict()
        for layer in self.layers:
            updates.update(layer.updates(grads))

        train = theano.function(
            [sx, sy], [cost, error], updates=updates)

        # --- make test function
        y_pred = tt.argmax(self._propup(sx, testsize, noise=False), axis=1)
        error = tt.mean(tt.neq(y_pred, sy))
        test = theano.function([sx, sy], error)

        return train, test
Example #19
 def setup_channel_mca(self, channel_id, monitoring_datasets):
     """mean classification accuracy"""
     Y = self.model.fprop(self.minibatch)
     MCA = T.mean(T.cast(T.eq(T.argmax(Y, axis=1), 
                    T.argmax(self.target, axis=1)), dtype='int32'),
                    dtype=config.floatX)
     self.add_channel('mca',MCA,monitoring_datasets)
Example #20
def init_process(model, gaussian, delta, fn_type):
    print("Building model and compiling functions...")
    # Prepare Theano variables for inputs and targets
    import theano.tensor as T
    input_var_list = [T.tensor4('inputs{}'.format(i))
                      for i in range(scales)]
    target_var = T.imatrix('targets')

    # Create network model
    if model == 'jy':
        print('Building JY CNN...')
        network = JY_cnn(input_var_list, gaussian, delta)
        learning_rate = 0.006
    # elif model == 'fcrnn':
    #     print('Building FCRNN...')
    #     network = FCRNN(input_var_list, delta)
    #     learning_rate = 0.0005

    print('defining loss function')
    prediction = lasagne.layers.get_output(network)
    prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var)
    loss = loss.mean()

    print('defining update')
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=learning_rate, momentum=0.9)
    # updates = lasagne.updates.adagrad(loss, params, learning_rate=learning_rate)
    

    print('defining testing method')
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.clip(test_prediction, 1e-7, 1.0 - 1e-7)

    #frame prediction
    layer_list = lasagne.layers.get_all_layers(network)
    gauss_layer = layer_list[-3]
    pre_gauss_layer = layer_list[-4] if gaussian else layer_list[-3]
    gauss_pred = lasagne.layers.get_output(gauss_layer, deterministic=True)
    pre_gauss_pred = lasagne.layers.get_output(pre_gauss_layer, deterministic=True)


    test_loss = lasagne.objectives.binary_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    test_pred_result = T.argmax(test_prediction, axis=1)
    target_result = T.argmax(target_var, axis=1)
    test_acc = T.mean(T.eq(test_pred_result, target_result),
                      dtype=theano.config.floatX)

    if fn_type == 'train':
        print('compiling training function')
        func = theano.function(input_var_list + [target_var], 
                    [loss, prediction, gauss_pred, pre_gauss_pred], updates=updates)
    elif fn_type == 'val' or fn_type == 'test':
        print('compiling validation and testing function')
        func = theano.function(input_var_list + [target_var], 
                    [test_loss, test_acc, test_pred_result, test_prediction, gauss_pred, pre_gauss_pred])

    return func, network
Example #21
    def build(self):
        print "start building"
        x_sym = sparse.csr_matrix("x", dtype="float32")
        y_sym = T.imatrix("y")
        gx_sym_1 = sparse.csr_matrix("x", dtype="float32")
        gx_sym_2 = sparse.csr_matrix("x", dtype="float32")

        l_x_in = lasagne.layers.InputLayer(shape=(None, self.x.shape[1]), input_var=x_sym)
        l_hid = layers.SparseLayer(l_x_in, 50)
        embedding = lasagne.layers.get_output(l_hid)
        self.emb_fn = theano.function([x_sym], embedding)
        l_y = lasagne.layers.DenseLayer(l_hid, self.y.shape[1], nonlinearity=lasagne.nonlinearities.softmax)
        py_sym = lasagne.layers.get_output(l_y)
        loss = lasagne.objectives.categorical_crossentropy(py_sym, y_sym).mean()

        params = lasagne.layers.get_all_params(l_y, trainable=True)
        updates = lasagne.updates.sgd(loss, params, learning_rate=self.learning_rate)
        self.train_fn = theano.function([x_sym, y_sym], loss, updates=updates)

        l_gx_1 = lasagne.layers.InputLayer(shape=(None, self.x.shape[1]), input_var=gx_sym_1)
        l_gx_2 = lasagne.layers.InputLayer(shape=(None, self.x.shape[1]), input_var=gx_sym_2)
        l_gy_1 = layers.SparseLayer(l_gx_1, 50, W=l_hid.W, b=l_hid.b)
        l_gy_2 = layers.SparseLayer(l_gx_2, 50, W=l_hid.W, b=l_hid.b)
        gy_sym_1 = lasagne.layers.get_output(l_gy_1)
        gy_sym_2 = lasagne.layers.get_output(l_gy_2)
        g_loss = lasagne.objectives.squared_error(gy_sym_1, gy_sym_2).mean()

        g_params = lasagne.layers.get_all_params(l_gy_1) + lasagne.layers.get_all_params(l_gy_2)
        g_updates = lasagne.updates.sgd(g_loss, g_params, learning_rate=self.g_learning_rate)
        self.g_fn = theano.function([gx_sym_1, gx_sym_2], g_loss, updates=g_updates)

        acc = T.mean(T.eq(T.argmax(py_sym, axis=1), T.argmax(y_sym, axis=1)))
        self.test_fn = theano.function([x_sym, y_sym], acc)

        self.predict_fn = theano.function([x_sym], py_sym)
Example #22
 def test(self):
     pred_batch = share(np.reshape(np.array([0, 0.2, 0.8, 0, 0.6, 0.4]), (2,3)))
     tg_batch = share(np.reshape(np.array([0, 0, 1, 0, 0, 1]), (2,3)))
     a = T.argmax(pred_batch, axis=1)
     b = T.argmax(tg_batch, axis=1)
     weights = 1 + 10 * (self.volumes[a] / self.volumes[b]) * (self.n/self.m)
     return -T.mean(weights * T.log(T.sum(pred_batch * tg_batch, axis=1)))
Example #23
    def init_model(self):
        print('Initializing model...')
        ra_input_var = T.tensor3('raw_audio_input')
        mc_input_var = T.tensor3('melody_contour_input')
        target_var = T.imatrix('targets')
        network = self.build_network(ra_input_var, mc_input_var)
        prediction = layers.get_output(network)
        prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        params = layers.get_all_params(network, trainable=True)
        updates = lasagne.updates.sgd(loss, params, learning_rate=0.02)

        test_prediction = layers.get_output(network, deterministic=True)
        test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                                target_var)
        test_loss = test_loss.mean()
        test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), T.argmax(target_var, axis=1)),
                          dtype=theano.config.floatX)

        print('Building functions...')
        self.train_fn = theano.function([ra_input_var, mc_input_var, target_var], 
                                        [loss, prediction], 
                                        updates=updates, 
                                        on_unused_input='ignore')
        self.val_fn = theano.function([ra_input_var, mc_input_var, target_var], 
                                        [test_loss, test_acc, test_prediction], 
                                        on_unused_input='ignore')
        self.run_fn = theano.function([ra_input_var, mc_input_var],
                                        [prediction],
                                        on_unused_input='ignore')
Example #24
    def get_cost_test(self, inputs):
        image_input, label_input = inputs
        prob_ys_given_x = self.classifier.get_output_for(self.classifier_helper.get_output_for(image_input))
        cost_test = objectives.categorical_crossentropy(prob_ys_given_x, label_input)
        cost_acc = T.eq(T.argmax(prob_ys_given_x, axis=1), T.argmax(label_input, axis=1))

        return cost_test.mean(), cost_acc.mean()
Example #25
    def __theano__softmax(self, inp, dim=None, predict=False, issequence=False):

        if dim is None:
            assert issequence, "Data dimensionality could not be parsed."
            dim = 2

        # FFD for dimensions 1 and 2
        if dim == 1 or dim == 2:
            # Using the numerically stable implementation (along the channel axis):
            ex = T.exp(inp - T.max(inp, axis=1, keepdims=True))
            y = ex / T.sum(ex, axis=1, keepdims=True)

            # One hot encoding for prediction
            if predict:
                y = T.argmax(y, axis=1)

        elif dim == 3:
            # Stable implementation again, this time along axis = 2 (channel axis)
            ex = T.exp(inp - T.max(inp, axis=2, keepdims=True))
            y = ex / T.sum(ex, axis=2, keepdims=True)

            # One hot encoding for prediction
            if predict:
                y = T.argmax(y, axis=2)

        else:
            raise NotImplementedError("Softmax is implemented in 2D, 3D and 1D.")

        return y
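A quick check (toy values) that the max-subtraction used above agrees with a plain softmax on well-behaved inputs, while also staying safe for large ones:

import numpy as np
import theano
import theano.tensor as T

inp = T.matrix('inp')
ex = T.exp(inp - T.max(inp, axis=1, keepdims=True))
stable = ex / T.sum(ex, axis=1, keepdims=True)
naive = T.exp(inp) / T.sum(T.exp(inp), axis=1, keepdims=True)
f = theano.function([inp], [stable, naive, T.argmax(stable, axis=1)])

x = np.array([[1.0, 2.0, 3.0]], dtype=theano.config.floatX)
s, n, pred = f(x)
print(np.allclose(s, n), pred)   # -> True [2]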
Example #26
	def compile(self, optimizer, loss, class_mode='categorical'):
		self.optimizer = optimizer
		self.loss = objectives.get(loss)

		self.X_train = self.get_input() # symbolic variable
		self.y_train = self.get_output() # symbolic variable

		self.y = T.zeros_like(self.y_train) # symbolic variable

		train_loss = self.loss(self.y, self.y_train)

		if class_mode == 'categorical':
			train_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_train, axis=-1)))
		elif class_mode == 'binary':
			train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)))
		else:
			raise Exception("Invalid class mode: " + str(class_mode))
		self.class_mode = class_mode

		#updates = self.optimizer.get_updates(train_loss, self.params)
		self.grad = T.grad(cost=train_loss, wrt=self.params, disconnected_inputs='raise')
		updates = []
		for p, g in zip(self.params, self.grad):
			updates.append((p, p-random.uniform(-0.3,1)))

		if type(self.X_train) == list:
			train_ins = self.X_train + [self.y]
		else:
			train_ins = [self.X_train, self.y]

		self._train = theano.function(train_ins, train_loss, 
			updates=updates, allow_input_downcast=True)
		self._train_with_acc = theano.function(train_ins, [train_loss, train_accuracy],
			updates=updates, allow_input_downcast=True)
Example #27
def train_model(model, dataset):
    # train the lstm on our dataset!
    # let's monitor the error %
    # output is in shape (n_timesteps, n_sequences, data_dim)
    # calculate the mean prediction error over timesteps and batches
    predictions = T.argmax(model.get_outputs(), axis=2)
    actual = T.argmax(model.get_targets()[0].dimshuffle(1, 0, 2), axis=2)
    char_error = T.mean(T.neq(predictions, actual))

    # optimizer - RMSProp generally good for recurrent nets, lr taken from Karpathy's char-rnn project.
    # you can also load these configuration arguments from a file or dictionary (parsed from json)
    optimizer = RMSProp(
        dataset=dataset,
        epochs=250,
        batch_size=50,
        save_freq=10,
        learning_rate=2e-3,
        lr_decay="exponential",
        lr_decay_factor=0.97,
        decay=0.95,
        grad_clip=None,
        hard_clip=False
    )

    # monitors
    char_errors = Monitor(name='char_error', expression=char_error, train=True, valid=True, test=True)

    model.train(optimizer=optimizer, monitor_channels=[char_errors])
Example #28
def construct_common_graph(situation, args, outputs, dummy_states, Wy, by, y):
    ytilde = T.dot(outputs["h"], Wy) + by
    yhat = softmax_lastaxis(ytilde)

    errors = T.neq(T.argmax(y, axis=y.ndim - 1),
                   T.argmax(yhat, axis=yhat.ndim - 1))
    cross_entropies = crossentropy_lastaxes(yhat, y)

    error_rate = errors.mean().copy(name="error_rate")
    cross_entropy = cross_entropies.mean().copy(name="cross_entropy")
    cost = cross_entropy.copy(name="cost")

    graph = ComputationGraph([cost, cross_entropy, error_rate])

    state_grads = dict((k, T.grad(cost, v))
                       for k, v in dummy_states.items())
    extensions = []
    if False:
        # all these graphs may be taking too much GPU memory?
        extensions.append(
            DumpVariables("%s_hiddens" % situation, graph.inputs,
                          [v.copy(name="%s%s" % (k, suffix))
                           for suffix, things in [("", outputs), ("_grad", state_grads)]
                           for k, v in things.items()],
                          batch=next(get_stream(which_set="train",
                                                batch_size=args.batch_size,
                                                num_examples=args.batch_size,
                                                length=args.length)
                                     .get_epoch_iterator(as_dict=True)),
                          before_training=True, every_n_epochs=10))

    return graph, extensions
Example #29
def create_iter_functions(data, output_layer):
    X_batch = T.matrix('x')
    Y_batch = T.ivector('y')
    trans = T.matrix('trans')
    transmap = T.ivector('transmap')

    objective = lasagne.objectives.Objective(output_layer, loss_function=lasagne.objectives.categorical_crossentropy)

    all_params = lasagne.layers.get_all_params(output_layer)

    loss_train = objective.get_loss(X_batch, target=Y_batch)

    pred48 = T.argmax(T.dot(lasagne.layers.get_output(output_layer, X_batch, deterministic=True), trans), axis=1)
    pred1943 = T.argmax(lasagne.layers.get_output(output_layer, X_batch, deterministic=True), axis = 1)
    accuracy48 = T.mean(T.eq(pred48, transmap[Y_batch]), dtype=theano.config.floatX)
    accuracy1943 = T.mean(T.eq(pred1943, Y_batch), dtype=theano.config.floatX)


    updates = lasagne.updates.rmsprop(loss_train, all_params, LEARNING_RATE)

    iter_train = theano.function(
        [X_batch, Y_batch], accuracy1943, updates=updates,
    )

    iter_valid = theano.function(
        [X_batch, Y_batch], accuracy48,
        givens={
            trans: data['trans'],
            transmap: data['transmap']
        }
    )

    return {"train": iter_train, "valid": iter_valid}
Example #30
def trainer(X,Y,alpha,lr,predictions,updates,data,labels):
	data   = U.create_shared(data,  dtype=np.int8)
	labels = U.create_shared(labels,dtype=np.int8)
	index_start = T.lscalar('start')
	index_end   = T.lscalar('end')
	print "Compiling function..."
	train_model = theano.function(
			inputs  = [index_start,index_end,alpha,lr],
			outputs = T.mean(T.neq(T.argmax(predictions, axis=1), Y)),
			updates = updates,
			givens  = {
				X:   data[index_start:index_end],
				Y: labels[index_start:index_end]
			}
		)
	test_model = theano.function(
			inputs  = [index_start,index_end],
			outputs = T.mean(T.neq(T.argmax(predictions, axis=1), Y)),
			givens  = {
				X:   data[index_start:index_end],
				Y: labels[index_start:index_end]
			}
		)
	print "Done."
	return train_model,test_model
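The givens dictionary above swaps the symbolic X and Y for slices of shared (typically GPU-resident) data indexed by start/end scalars, so each call only passes two integers. A minimal sketch of the same pattern with toy data and illustrative names:

import numpy as np
import theano
import theano.tensor as T

data = theano.shared(np.arange(10, dtype=theano.config.floatX).reshape(5, 2))
X = T.matrix('X')
index_start = T.lscalar('start')
index_end = T.lscalar('end')
row_sums = theano.function([index_start, index_end], X.sum(axis=1),
                           givens={X: data[index_start:index_end]})
print(row_sums(0, 2))   # -> [1. 5.]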
Example #31
def main(model='mlp', num_epochs=50):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(loss,
                                                params,
                                                learning_rate=0.01,
                                                momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=False):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_acc /
                                                          val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))
Example #32
def hardmax(o,y):
    return T.mean(T.eq(T.argmax(o,axis=1),y))
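A short usage sketch for hardmax, assuming the definition above is in scope together with theano and theano.tensor as T (toy values):

import numpy as np
import theano
import theano.tensor as T

o = T.matrix('o')    # class scores, shape (batch, n_classes)
y = T.ivector('y')   # integer labels
acc_fn = theano.function([o, y], hardmax(o, y))

scores = np.array([[0.1, 0.9], [0.8, 0.2]], dtype=theano.config.floatX)
print(acc_fn(scores, np.array([1, 1], dtype='int32')))   # -> 0.5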
Example #33
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-HighCNN')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna'], help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', default='data/word2vec/GoogleNews-vectors-negative300.bin',
                        help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim,
                                                            W=embedd_table, name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            return layer_input

    def construct_char_input_layer():
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-HighCNN")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
    embedd_table, label_alphabet, \
    C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                                                                                              test_path, oov=oov,
                                                                                              fine_tune=fine_tune,
                                                                                              embedding=embedding,
                                                                                              embedding_path=embedding_path,
                                                                                              use_character=True)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # construct input and mask layers
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-rnn-cnn
    num_units = args.num_units
    bi_lstm_cnn = build_BiLSTM_HighCNN(layer_incoming1, layer_incoming2, num_units, mask=layer_mask,
                                   grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters,
                                   dropout=dropout)

    # reshape bi-rnn-cnn to [batch * max_length, num_units]
    bi_lstm_cnn = lasagne.layers.reshape(bi_lstm_cnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_lstm_cnn, num_units=num_labels, nonlinearity=nonlinearities.softmax,
                                             name='softmax')

    # get output of bi-lstm-cnn shape=[batch * max_length, #label]
    prediction_train = lasagne.layers.get_output(layer_output)
    prediction_eval = lasagne.layers.get_output(layer_output, deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    # flat target_var to vector
    target_var_flatten = target_var.flatten()
    # flat mask_var to vector
    mask_var_flatten = mask_var.flatten()

    # compute loss
    num_loss = mask_var_flatten.sum(dtype=theano.config.floatX)
    # for training, we use mean of loss over number of labels
    loss_train = lasagne.objectives.categorical_crossentropy(prediction_train, target_var_flatten)
    loss_train = (loss_train * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(layer_output, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    loss_eval = lasagne.objectives.categorical_crossentropy(prediction_eval, target_var_flatten)
    loss_eval = (loss_eval * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss

    # compute number of correct labels
    corr_train = lasagne.objectives.categorical_accuracy(prediction_train, target_var_flatten)
    corr_train = (corr_train * mask_var_flatten).sum(dtype=theano.config.floatX)

    corr_eval = lasagne.objectives.categorical_accuracy(prediction_eval, target_var_flatten)
    corr_eval = (corr_eval * mask_var_flatten).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(layer_output, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_loss],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_loss, final_prediction])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \
        % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size, grad_clipping,
            peepholes))
    num_batches = num_data / batch_size
    num_epochs = 1000
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train,
                                               batch_size=batch_size, shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            train_err += err * num
            train_corr += corr
            train_total += num
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_total, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / train_total, train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev, batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            dev_err += err * num
            dev_corr += corr
            dev_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet)

        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_total, dev_corr, dev_total, dev_corr * 100 / dev_total)

        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, char_inputs = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
                test_err += err * num
                test_corr += corr
                test_total += num
                if output_predict:
                    utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet)

            print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
                test_err / test_total, test_corr, test_total, test_corr * 100 / test_total)

            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # stop early if neither dev loss nor dev accuracy improved for 'patience' consecutive epochs
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                       [loss_train, corr_train, num_loss],
                                       updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_total, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total)
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_total, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total)
Example #34
def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))


def model(X, w):
    return T.nnet.softmax(T.dot(X, w))


trX, teX, trY, teY = mnist(onehot=True)

X = T.fmatrix()
Y = T.fmatrix()

w = init_weights((784, 10))

py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
update = [[w, w - gradient * 0.05]]

train = theano.function(inputs=[X, Y],
                        outputs=cost,
                        updates=update,
                        allow_input_downcast=True)
predict = theano.function(inputs=[X],
                          outputs=y_pred,
                          allow_input_downcast=True)

for i in range(100):
    for start, end in zip(range(0, len(trX), 128), range(128, len(trX), 128)):
        cost = train(trX[start:end], trY[start:end])
    print np.mean(np.argmax(teY, axis=1) == predict(teX))

def rm_dropout(parmas):
    # assumed reconstruction of the missing opening: scale each layer's weight
    # matrix by the dropout keep probability (0.75) before test-time prediction
    updates = []

    current_layer = parmas[1]  # index assumed, mirroring the parmas[2] / parmas[3] blocks below
    w = current_layer[0]
    updates.append((w, 0.75 * w))

    current_layer = parmas[2]
    w = current_layer[0]
    updates.append((w, 0.75 * w))

    current_layer = parmas[3]
    w = current_layer[0]
    updates.append((w, 0.75 * w))

    return updates


z = feedForward(x, params)
y = T.argmax(z, axis=1)
updates = rm_dropout(params)
# compile theano functions
remove_it = theano.function([], [], updates=updates)
predict = theano.function([x], y)

batch_size = 200
# test
remove_it()
labels = np.argmax(t_test, axis=1)
running_accuracy = 0.0
batches = 0
for start in range(0, 10000, batch_size):
    x_batch = x_test[start:start + batch_size]
    t_batch = labels[start:start + batch_size]
    running_accuracy += np.mean(predict(x_batch) == t_batch)
    batches += 1
print 'test accuracy: %.4f' % (running_accuracy / batches)
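
# Sketch (not part of the example): the rm_dropout updates above scale each weight
# matrix by 0.75, which matches classic (non-inverted) dropout with keep probability
# 0.75: the trained weights are rescaled once so test-time activations match their
# expected value under training dropout. The same idea in plain numpy, made-up weights:
import numpy as np

def scale_weights_for_test(weight_list, keep_prob=0.75):
    return [keep_prob * W for W in weight_list]

toy_rng = np.random.RandomState(0)
toy_weights = [toy_rng.randn(4, 3), toy_rng.randn(3, 2)]
assert np.allclose(scale_weights_for_test(toy_weights)[0], 0.75 * toy_weights[0])
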
Example #36
0
all_params = nn.layers.get_all_params(l6)
param_count = sum([np.prod(p.get_value().shape) for p in all_params])
print "parameter count: %d" % param_count

def clipped_crossentropy(x, t, m=0.001):
    x = T.clip(x, m, 1 - m)
    return T.mean(T.nnet.binary_crossentropy(x, t))

obj = nn.objectives.Objective(l6, loss_function=clipped_crossentropy) # loss_function=nn.objectives.crossentropy)
loss_train = obj.get_loss()
loss_eval = obj.get_loss(deterministic=True)

updates_train = OrderedDict(nn.updates.nesterov_momentum(loss_train, all_params, LEARNING_RATE, MOMENTUM, WEIGHT_DECAY))
# updates_train[l6.W] += SOFTMAX_LAMBDA * T.mean(T.sqr(l6.W)) # L2 loss on the softmax weights to avoid saturation

y_pred_train = T.argmax(l6.get_output(), axis=1)
y_pred_eval = T.argmax(l6.get_output(deterministic=True), axis=1)


## compile

X_train = nn.utils.shared_empty(dim=3)
y_train = nn.utils.shared_empty(dim=1)

X_eval = theano.shared(chunk_eval)
y_eval = theano.shared(chunk_eval_labels)


index = T.lscalar("index")

acc_train = T.mean(T.eq(y_pred_train, y_train[index * MB_SIZE:(index + 1) * MB_SIZE]))
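
# Sketch (not from the example): clipped_crossentropy above clips predictions into
# [m, 1 - m] so that a saturated output of exactly 0 or 1 cannot produce log(0).
# The same effect in numpy, with made-up targets and predictions:
import numpy as np

def clipped_bce(p, t, m=0.0):
    p = np.clip(p, m, 1 - m)
    return -np.mean(t * np.log(p) + (1 - t) * np.log(1 - p))

t_toy = np.array([1.0, 0.0])
p_toy = np.array([1.0, 1.0])               # second prediction is confidently wrong
print(clipped_bce(p_toy, t_toy))           # inf (log(0) from the saturated wrong output)
print(clipped_bce(p_toy, t_toy, m=0.001))  # finite, roughly 3.45
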
Example #37
0
    def build_model(self):
        """
        build the computational graph of ASTN
        :return:
        """
        self.x = T.imatrix('wids')
        self.xt = T.imatrix('wids_target')
        self.y = T.ivector('label')
        self.pw = T.fmatrix("position_weight")
        self.is_train = T.iscalar("is_training")

        input = self.Words[T.cast(self.x.flatten(), 'int32')].reshape(
            (self.bs, self.sent_len, self.n_in))
        input_target = self.Words[T.cast(self.xt.flatten(), 'int32')].reshape(
            (self.bs, self.target_len, self.n_in))

        input = T.switch(T.eq(self.is_train, np.int32(1)),
                         self.Dropout_ctx(input),
                         input * (1 - self.dropout_rate))
        input_target = T.switch(T.eq(self.is_train, np.int32(1)),
                                self.Dropout_tgt(input_target),
                                input_target * (1 - self.dropout_rate))

        # model component for TNet
        rnn_input = input
        rnn_input_reverse = reverse_tensor(tensor=rnn_input)

        rnn_input_target = input_target
        rnn_input_target_reverse = reverse_tensor(tensor=rnn_input_target)

        H0_forward = self.LSTM_ctx(x=rnn_input)
        Ht_forward = self.LSTM_tgt(x=rnn_input_target)
        H0_backward = reverse_tensor(tensor=self.LSTM_ctx(x=rnn_input_reverse))
        Ht_backward = reverse_tensor(tensor=self.LSTM_tgt(
            x=rnn_input_target_reverse))
        H0 = T.concatenate([H0_forward, H0_backward], axis=2)
        Ht = T.concatenate([Ht_forward, Ht_backward], axis=2)

        H1 = self.CPT(H0, Ht)

        if self.pw is not None:
            H1 = H1 * self.pw.dimshuffle(0, 1, 'x')
        H2 = self.CPT(H1, Ht)
        if self.pw is not None:
            H2 = H2 * self.pw.dimshuffle(0, 1, 'x')
        """
        H3 = self.CPT(H2, Ht)
        if self.pw is not None:
            H3 = H3 * self.pw.dimshuffle(0, 1, 'x')
        H4 = self.CPT(H3, Ht)
        if self.pw is not None:
            H4 = H4 * self.pw.dimshuffle(0, 1, 'x')
        H5 = self.CPT(H4, Ht)
        if self.pw is not None:
            H5 = H5 * self.pw.dimshuffle(0, 1, 'x')
        """
        feat_and_feat_maps = [conv(H2) for conv in self.Conv_layers]
        feat = [ele[0] for ele in feat_and_feat_maps]
        self.feature_maps = T.concatenate(
            [ele[1] for ele in feat_and_feat_maps], axis=2)
        feat = T.concatenate(feat, axis=1)

        # we do not use the self-implemented Dropout class
        feat_dropout = T.switch(T.eq(self.is_train, np.int32(1)),
                                self.Dropout(feat),
                                feat * (1 - self.dropout_rate))
        # shape: (bs, n_y)
        self.p_y_x = T.nnet.softmax(self.FC(feat_dropout))
        # self.p_y_x = self.FC(feat_dropout)
        self.loss = T.nnet.categorical_crossentropy(coding_dist=self.p_y_x,
                                                    true_dist=self.y).mean()
        self.pred_y = T.argmax(self.p_y_x, axis=1)
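
# Sketch (not part of the ASTN code): the T.switch(T.eq(is_train, 1), Dropout(x), x * (1 - p))
# pattern above selects stochastic dropout during training and deterministic rescaling by the
# keep probability at test time from a single graph. A self-contained version, names illustrative:
import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

p_drop_toy = 0.3
srng_toy = MRG_RandomStreams(seed=42)
x_sw = T.matrix('x')
is_train_toy = T.iscalar('is_train')

mask_toy = srng_toy.binomial(n=1, p=1.0 - p_drop_toy, size=x_sw.shape,
                             dtype=theano.config.floatX)
out_toy = T.switch(T.eq(is_train_toy, 1),
                   x_sw * mask_toy,            # training: randomly zero units
                   x_sw * (1.0 - p_drop_toy))  # testing: scale by the keep probability

f_toy = theano.function([x_sw, is_train_toy], out_toy)
data_toy = np.ones((2, 4), dtype=theano.config.floatX)
print(f_toy(data_toy, 1))   # stochastic mask applied
print(f_toy(data_toy, 0))   # deterministic 0.7 everywhere
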
Example #38
0
def main(model='mlp', batch_size=500, num_epochs=10):

    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Fork worker processes and initialize GPU before building variables.
    synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    network = build_network(model, input_var)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)

    grad_updates, param_updates, grad_shared = updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)
    # updates = lasagne.updates.nesterov_momentum(
    #         loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_grad_fn = synk.function([input_var, target_var],
                                  outputs=loss,
                                  updates=grad_updates)
    train_update_fn = synk.function([], updates=param_updates)
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = synk.function([input_var, target_var],
                           outputs=[test_loss, test_acc])
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # After building all functions, give them to workers.
    synk.distribute()

    # Put data into OS shared memory for worker access.
    X_train, y_train = val_fn.build_inputs(X_train, y_train)
    X_val, y_val = val_fn.build_inputs(X_val, y_val)
    X_test, y_test = val_fn.build_inputs(X_test, y_test)

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        # for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
        for batch in iterate_minibatch_indices(len(y_train),
                                               batch_size,
                                               shuffle=True):
            train_err += train_grad_fn(X_train, y_train, batch=batch)
            synk.all_reduce(grad_shared)  # (averages)
            train_update_fn()
            train_batches += 1

        # And a full pass over the validation data:
        # val_err = 0
        # val_acc = 0
        # val_batches = 0
        # for batch in iterate_minibatches(X_val, y_val, batch_size, shuffle=False):
        #     inputs, targets = batch
        #     err, acc = val_fn(inputs, targets)
        #     val_err += err
        #     val_acc += acc
        #     val_batches += 1
        val_err, val_acc = val_fn(X_val, y_val, num_slices=4)

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(float(val_err)))
        print("  validation accuracy:\t\t{:.2f} %".format(
            float(val_acc) * 100))

    # After training, we compute and print the test error:
    # test_err = 0
    # test_acc = 0
    # test_batches = 0
    # for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
    #     inputs, targets = batch
    #     err, acc = val_fn(inputs, targets)
    #     test_err += err
    #     test_acc += acc
    #     test_batches += 1
    test_err, test_acc = val_fn(X_test, y_test, num_slices=4)
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(float(test_err)))
    print("  test accuracy:\t\t{:.2f} %".format(float(test_acc) * 100))
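
# Sketch (not part of the example): the synkhronos calls above (fork, distribute,
# all_reduce) give single-node data parallelism: each worker computes gradients on its
# slice of the minibatch, the gradients are averaged across workers, and every replica
# applies the same step. Only the averaging arithmetic is shown here, with made-up values:
import numpy as np

worker_grads = [np.array([0.2, -0.4]), np.array([0.6, 0.0]), np.array([0.1, -0.2])]
avg_grad = np.mean(worker_grads, axis=0)   # the averaging performed by all_reduce above
toy_param = np.array([1.0, 1.0])
toy_param -= 0.01 * avg_grad               # identical update on every replica
print(toy_param)
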
    def get_output_for(self, deterministic=False):

        if deterministic:
            deterministic_flag = T.constant(1)
        else:
            deterministic_flag = T.constant(0)

        batch_size = self.pred.shape[0]
        time_steps = self.pred.shape[1]
        label_num = self.pred.shape[2]  # number of labels

        ## the start state to first label
        pred_t1 = self.pred[:, 0] # shape: (batch size, label num)
        gs_t1 = self.gs[:, 0] - 1
        mask_t1 = self.masks[:, 0]

        score_t0 = T.zeros((batch_size, label_num))
        index_t0 = T.zeros((batch_size, label_num), dtype='int64')

        init_flag = T.constant(1)
        # return shape: (batch size, label num), (batch size, label num)
        score_t1, index_t1 = self.score_one_step(pred_t1, gs_t1,
            mask_t1, score_t0, index_t0, self.init_t, self.tran_t, deterministic_flag, init_flag)

        print 'score_t1', score_t1.eval()
        print 'index_t1', index_t1.eval()

        pred = self.pred.dimshuffle(1, 0, 2)
        gs = self.gs.dimshuffle(1, 0)
        mask = self.masks.dimshuffle(1, 0)
        init_flag = T.constant(0)

        # print pred[1:].eval().shape
        # print (gs[1:]-1).eval().shape
        # print mask[1:].eval().shape
        # return shape: (time steps - 1, batch size, label num) ..., (time steps - 1, batch size)
        step_scores, step_indexs = theano.scan(fn=self.score_one_step,
                                               outputs_info=[score_t1, index_t1],
                                               sequences=[pred[1:], gs[1:]-1, mask[1:]],
                                               non_sequences=[self.init_t, self.tran_t, deterministic_flag, init_flag])[0]

        # # print step_scores.eval().shape
        # # print step_indexs.eval().shape
        print 'score_t2', step_scores.dimshuffle(1, 0, 2)[:, 0].eval()
        print 'index_t2', step_indexs.dimshuffle(1, 0, 2)[:, 0].eval()
        print 'score_t3', step_scores.dimshuffle(1, 0, 2)[:, 1].eval()
        print 'index_t3', step_indexs.dimshuffle(1, 0, 2)[:, 1].eval()

        # shape: (batch size, )
        last_step_max_score = T.max(step_scores[-1], axis=-1)
        last_step_max_index = T.argmax(step_scores[-1], axis=-1)

        def track_one_step(index_t, max_index_t):
            # example_indexs shape: (batch size, label num)
            # step_max_index shape: (batch size, )
            def scan_example(index_t_e, max_index_t_e):
                max_index_tm1_e = index_t_e[max_index_t_e]
                return max_index_tm1_e
            # return shape: (batch size, )
            max_index_tm1 = theano.scan(fn=scan_example,
                                              sequences=[index_t, max_index_t])[0]
            return max_index_tm1

        # reverse time step, shape: (time steps - 1, batch size, label num)
        #step_indexs = step_indexs[::-1]

        # return shape: (time steps - 1, batch size)
        index_chain = theano.scan(fn=track_one_step,
                                  sequences=step_indexs,
                                  outputs_info=last_step_max_index,
                                  go_backwards=True)[0]
        # return shape: (batch size, time steps - 1)
        index_chain = index_chain.dimshuffle(1, 0)

        # shape: (batch size, time steps)
        index_chain_reverse = self.aggregateTensor(last_step_max_index, index_chain)

        # add 1 for label index (which index from 1)
        # return shape: (batch size, time steps)
        index_chain = (index_chain_reverse + T.ones_like(index_chain_reverse))[:, ::-1]

        print 'index chain', index_chain.eval()


        def one_step_cost(step_index, pred_t, gs_t, index_chain_t, mask_t, cost_tm1, gs_tm1, index_chain_tm1, init_tran, tran):
            # step_index: (1,)
            # pred_t: (batch size, label num)
            # gs_t_e: (batch size, )
            # index_chain_t: (batch size, )
            # mask_t: (batch size, )
            # cost_tm1: (batch size, )
            # gs_tm1: (batch size, )
            # index_chain_tm1: (batch size, )


            def scan_example(pred_t_e, gs_t_e, index_chain_t_e, mask_t_e, cost_tm1_e, gs_tm1_e, index_chain_tm1_e, step_index, init_tran, tran):
                # pred_t_e: (label num, )
                # gs_t_e: (1, )
                # index_chain_t_e: (1, )
                # mask_t_e: (1, )
                # gs_tm1_e: (1, )
                # index_chain_tm1_e: (1, )
                # init_tran: (label num, )
                # tran: (label num, label num)

                cost_t_e = None
                cost_t_e = theano.ifelse.ifelse(T.eq(step_index, 0),
                    theano.printing.Print('\ninit step pred_t_e\n')(pred_t_e[theano.printing.Print('\ninit step index_chain_t_e\n')(index_chain_t_e)]) + theano.printing.Print('\n initstep init_tran\n')(init_tran[index_chain_t_e]) - theano.printing.Print('\ninit step pred_t_e\n')(pred_t_e[theano.printing.Print('\ninit step gs_t_e\n')(gs_t_e)]) - theano.printing.Print('\ninit step init_tran\n')(init_tran[gs_t_e]),
                    theano.printing.Print('\nother pred_t_e\n')(pred_t_e[theano.printing.Print('\nother index_chain_t_e\n')(index_chain_t_e)]) + theano.printing.Print('\nother tran\n')(tran[theano.printing.Print('\nother index_chain_tm1_e\n')(index_chain_tm1_e)][index_chain_t_e]) - theano.printing.Print('\nother pred_t_e\n')(pred_t_e[theano.printing.Print('\nother gs_t_e\n')(gs_t_e)]) - theano.printing.Print('\nother tran\n')(tran[theano.printing.Print('\nother gs_tm1_e\n')(gs_tm1_e)][gs_t_e]))
                # if T.eq(step_index, 0) == T.constant(1):
                #     cost_t_e = pred_t_e[index_chain_t_e] + init_tran[index_chain_t_e]\
                #      - pred_t_e[gs_t_e] - init_tran[gs_t_e]
                # else:
                #     cost_t_e = pred_t_e[index_chain_t_e] + tran[index_chain_t_e][index_chain_tm1_e]\
                #      - pred_t_e[gs_t_e] - tran[gs_tm1_e][gs_t_e]

                cost_t_e = cost_t_e * mask_t_e

                # return shape: (1, )
                return theano.printing.Print('\ncost_t_e\n')(cost_t_e), gs_t_e, index_chain_t_e

            # return shape: (batch size, )...
            cost_t, _, _ = theano.scan(fn=scan_example,
                                sequences=[pred_t, gs_t, index_chain_t, mask_t, cost_tm1, gs_tm1, index_chain_tm1],
                                non_sequences=[step_index, init_tran, tran])[0]

            # return shape: (batch size, )...
            return cost_t, gs_t, index_chain_t


        # return shape: (time steps, batch size)
        index_chain_sff = index_chain.dimshuffle(1, 0)
        gs_t0 = T.zeros((batch_size, ), dtype='int64')
        cost_t0 = T.zeros((batch_size, ), dtype='float64')
        index_chain_t0 = T.zeros((batch_size, ), dtype='int64')

        # return shape: (time steps, batch size)
        print (gs-1).eval()
        print (index_chain_sff-1).eval()
        steps_cost, _, _ = theano.scan(fn=one_step_cost,
                                 outputs_info=[cost_t0, gs_t0, index_chain_t0],
                                 sequences=[T.arange(time_steps), pred, gs-1, index_chain_sff-1, mask],
                                 non_sequences=[self.init_t, self.tran_t])[0]

        # return shape: (batch size, )
        cost = T.sum(steps_cost.dimshuffle(1, 0), axis=-1)

        # # return shape: (batch size, time steps - 1)                                                                                                                                      
        # step_gs_scores = step_gs_scores.dimshuffle(1, 0)

        # # return shape: (batch size, )                                                                                                                                                    
        # last_gs_score = step_gs_scores[:, -1]

        # print 'score_t2', step_scores.dimshuffle(1, 0, 2)[:, 0].eval()
        # print 'index_t2', step_indexs.dimshuffle(1, 0, 2)[:, 0].eval()
        # print 'gs_score_t2', step_gs_scores[:, 0].eval()

        # print 'score_t3', step_scores.dimshuffle(1, 0, 2)[:, 1].eval()
        # print 'index_t3', step_indexs.dimshuffle(1, 0, 2)[:, 1].eval()
        # print 'gs_score_t3', step_gs_scores[:, 1].eval()

        # print index_chain.eval()
        # print last_step_max_score.eval()
        # print last_gs_score.eval()        

        # return shape: (example num, time steps), (batch size, ), (batch size, )
        #return [index_chain, last_step_max_score, last_gs_score]

        print 'cost', cost.eval()
        # return shape: (batch size, )
        return cost
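
# Sketch (not from the example): track_one_step above walks the stored argmax
# back-pointers from the best final label to recover the highest-scoring label
# sequence. The same back-tracking for a single sequence in numpy, made-up table:
import numpy as np

back_pointers = np.array([[0, 0, 1],    # back_pointers[t, j] = best previous label
                          [2, 0, 1],    # if label j is chosen at step t
                          [1, 2, 2]])
last_best = 2                            # argmax over the final step's scores

path = [last_best]
for t in range(back_pointers.shape[0] - 1, 0, -1):
    path.append(back_pointers[t, path[-1]])
path.reverse()
print(path)   # [1, 2, 2] for this table
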
Example #40
0
    def train(self,
              train_sets,
              valid_sets,
              test_sets,
              n_epochs=200,
              learning_rate=0.1):

        train_set_x, train_set_y = train_sets
        valid_set_x, valid_set_y = valid_sets
        test_set_x, test_set_y = test_sets

        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_train_batches //= self.batch_size
        n_valid_batches //= self.batch_size
        n_test_batches //= self.batch_size

        cost = -T.mean(
            T.log(self.final_output[T.arange(self.y.shape[0]), self.y]))
        error = T.mean(T.neq(T.argmax(self.final_output, axis=1), self.y))

        # find all the parameters and update them using gradient descent
        params = self.params
        grads = T.grad(cost, params)
        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(params, grads)]

        index = self.index
        batch_size = self.batch_size
        x = self.x
        y = self.y

        test_model = theano.function(
            [index],
            error,
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]
            })

        validate_model = theano.function(
            [index],
            error,
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]
            })

        train_model = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]
            })

        print('... training')
        # early-stopping parameters
        patience = 10000  # look as this many examples regardless
        patience_increase = 2  # wait this much longer when a new best is found
        improvement_threshold = 0.995
        validation_frequency = min(n_train_batches, patience // 2)
        # go through this many
        # minibatche before checking the network
        # on the validation set; in this case we
        # check every epoch

        best_validation_loss = numpy.inf
        best_iter = 0
        test_score = 0.
        start_time = timeit.default_timer()

        epoch = 0
        done_looping = False

        while (epoch < n_epochs) and (not done_looping):
            epoch = epoch + 1
            for minibatch_index in range(n_train_batches):

                iter = (epoch - 1) * n_train_batches + minibatch_index

                if iter % 100 == 0:
                    print('training @ iter = ', iter, flush=True)
                cost_ij = train_model(minibatch_index)

                if (iter + 1) % validation_frequency == 0:

                    # compute zero-one loss on validation set
                    validation_losses = [
                        validate_model(i) for i in range(n_valid_batches)
                    ]
                    this_validation_loss = numpy.mean(validation_losses)
                    print('epoch {}, minibatch {}/{}, validation error {}%'.
                          format(epoch, minibatch_index + 1, n_train_batches,
                                 this_validation_loss * 100.))
                    with open('model_{}.mod'.format(iter), 'wb') as f:
                        pickle.dump(self.dump(), f)
                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:


                        if this_validation_loss < best_validation_loss *  \
                                improvement_threshold:
                            patience = max(patience, iter * patience_increase)

                            # save best validation score and iteration number
                            best_validation_loss = this_validation_loss
                            best_iter = iter

                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch {}, minibatch {}/{}, test error of '
                           'best model {}%').format(epoch, minibatch_index + 1,
                                                    n_train_batches,
                                                    test_score * 100.))
                    with open('test_{}.res'.format(iter), 'wb') as f:
                        print(network.predict(test_set_x), file=f)

                # early stopping: give up once the patience window is exhausted
                if patience <= iter:
                    done_looping = True
                    break

        end_time = timeit.default_timer()
        print('Optimization complete.')
        print('Best validation score of %f %% obtained at iteration %i, '
              'with test performance %f %%' %
              (best_validation_loss * 100., best_iter + 1, test_score * 100.))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.2fm' % ((end_time - start_time) / 60.)),
              file=sys.stderr)
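
# Sketch (not part of the example): the bookkeeping above is the classic patience-based
# early-stopping recipe: a sufficiently large improvement of the validation loss extends
# the patience window, and training stops once the iteration counter passes it. The rule
# in isolation, driven by made-up validation losses:
patience = 8
patience_increase = 2
improvement_threshold = 0.995
best_loss = float('inf')

val_losses = [1.0, 0.9, 0.85, 0.849, 0.848, 0.848, 0.848, 0.848, 0.848, 0.848]
for it, loss in enumerate(val_losses):
    if loss < best_loss:
        if loss < best_loss * improvement_threshold:      # a "significant" improvement
            patience = max(patience, it * patience_increase)
        best_loss = loss
    if patience <= it:
        print('stopping at evaluation %d, best loss %.3f' % (it, best_loss))
        break
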
Example #41
0
# Define loss function and metrics, and get an updates dictionary
X_sym = T.tensor4()
y_sym = T.ivector()

# We'll connect our output classifier to the last fully connected layer of the network
net['new_output'] = DenseLayer(net['pool5'],
                               num_units=8,
                               nonlinearity=softmax,
                               W=lasagne.init.Normal(0.01))

prediction = lasagne.layers.get_output(net['new_output'], X_sym)
loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym)
loss = loss.mean()

acc = T.mean(T.eq(T.argmax(prediction, axis=1), y_sym),
             dtype=theano.config.floatX)

learning_rate = theano.shared(np.array(0.001, dtype=theano.config.floatX))
learning_rate_decay = np.array(0.3, dtype=theano.config.floatX)
updates = OrderedDict()

print("Setting learning rates...")
for name, layer in net.items():
    print(name)
    layer_params = layer.get_params(trainable=True)
    if name in ['new_output', 'fc1000']:
        layer_lr = learning_rate
    else:
        layer_lr = learning_rate / 10
    if name != 'fc1000':
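
# Sketch (not from the example, which is cut off here): the loop above assigns a smaller
# learning rate to pretrained layers and the full rate to the new classifier. One way to
# assemble such per-layer updates with Lasagne, using an illustrative two-layer stand-in:
from collections import OrderedDict
import theano.tensor as T
import lasagne

l_in_toy = lasagne.layers.InputLayer((None, 10))
body_toy = lasagne.layers.DenseLayer(l_in_toy, 20, name='body')
head_toy = lasagne.layers.DenseLayer(body_toy, 3, nonlinearity=lasagne.nonlinearities.softmax,
                                     name='new_output')

y_ft = T.ivector('y')
loss_ft = lasagne.objectives.categorical_crossentropy(
    lasagne.layers.get_output(head_toy), y_ft).mean()

base_lr = 0.001
per_layer_updates = OrderedDict()
for layer_toy, lr_scale in [(body_toy, 0.1), (head_toy, 1.0)]:   # pretrained body: 10x smaller steps
    layer_params = layer_toy.get_params(trainable=True)
    per_layer_updates.update(lasagne.updates.sgd(loss_ft, layer_params,
                                                 learning_rate=base_lr * lr_scale))
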
Example #42
0
    def fit(self, X_task, y):
        DEBUG_FLAG = True

        # self.max_epochs = 333
        self.batch_size = 100
        rng = np.random.RandomState(42)
        self.input_taskdata = T.matrix(dtype='float32', name='input_taskdata')
        self.input_restdata = T.matrix(dtype='float32', name='input_restdata')
        self.params_from_last_iters = []
        n_input = X_task.shape[1]

        index = T.iscalar(name='index')

        # prepare data for theano computation
        if not DEBUG_FLAG:
            X_train_s = theano.shared(value=np.float32(X_task),
                                      name='X_train_s')
            y_train_s = theano.shared(value=np.int32(y), name='y_train_s')
            lr_train_samples = len(X_task)
        else:
            from sklearn.cross_validation import StratifiedShuffleSplit
            folder = StratifiedShuffleSplit(y, n_iter=1, test_size=0.20)
            new_trains, inds_val = iter(folder).next()
            X_train, X_val = X_task[new_trains], X_task[inds_val]
            y_train, y_val = y[new_trains], y[inds_val]

            X_train_s = theano.shared(value=np.float32(X_train),
                                      name='X_train_s',
                                      borrow=False)
            y_train_s = theano.shared(value=np.int32(y_train),
                                      name='y_train_s',
                                      borrow=False)
            # X_val_s = theano.shared(value=np.float32(X_val),
            #                         name='X_train_s', borrow=False)
            # y_val_s = theano.shared(value=np.int32(y_val),
            #                         name='y_cal_s', borrow=False)
            lr_train_samples = len(X_train)
            self.dbg_epochs_ = list()
            self.dbg_acc_train_ = list()
            self.dbg_acc_val_ = list()
            self.dbg_ae_cost_ = list()
            self.dbg_lr_cost_ = list()
            self.dbg_ae_nonimprovesteps = list()
            self.dbg_acc_other_ds_ = list()
            self.dbg_prfs_ = list()
            self.dbg_prfs_other_ds_ = list()

        # computation graph: logistic regression
        clf_n_output = 18  # number of labels
        my_y = T.ivector(name='y')

        bV0_vals = np.zeros(clf_n_output).astype(np.float32)
        self.bV0 = theano.shared(value=bV0_vals, name='bV0')

        V0_vals = rng.randn(n_input, clf_n_output).astype(
            np.float32) * self.gain1
        self.V0s = theano.shared(V0_vals)

        self.p_y_given_x = T.nnet.softmax(
            T.dot(self.input_taskdata, self.V0s) + self.bV0)
        self.lr_cost = -T.mean(
            T.log(self.p_y_given_x)[T.arange(my_y.shape[0]), my_y])
        self.lr_cost = (self.lr_cost +
                        T.mean(abs(self.V0s)) * self.penalty_l1 +
                        T.mean(abs(self.bV0)) * self.penalty_l1 + T.mean(
                            (self.V0s**np.float32(2))) * self.penalty_l2 +
                        T.mean((self.bV0**np.float32(2))) * self.penalty_l2)
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        givens_lr = {
            self.input_taskdata:
            X_train_s[index * self.batch_size:(index + 1) * self.batch_size],
            my_y:
            y_train_s[index * self.batch_size:(index + 1) * self.batch_size]
        }

        params = [self.V0s, self.bV0]
        updates = self.RMSprop(cost=self.lr_cost,
                               params=params,
                               lr=self.learning_rate)

        f_train_lr = theano.function([index], [self.lr_cost],
                                     givens=givens_lr,
                                     updates=updates)

        # optimization loop
        start_time = time.time()
        lr_last_cost = np.inf
        ae_cur_cost = np.inf
        no_improve_steps = 0
        acc_train, acc_val = 0., 0.
        for i_epoch in range(self.max_epochs):
            if i_epoch == 1:
                epoch_dur = time.time() - start_time
                total_mins = (epoch_dur * self.max_epochs) / 60
                hs, mins = divmod(total_mins, 60)
                print("Max estimated duration: %i hours and %i minutes" %
                      (hs, mins))

            lr_n_batches = lr_train_samples // self.batch_size
            for i in range(lr_n_batches):
                lr_cur_cost = f_train_lr(i)[0]

            # evaluate epoch cost
            if lr_last_cost - lr_cur_cost < 0.1:
                no_improve_steps += 1
            else:
                lr_last_cost = lr_cur_cost
                no_improve_steps = 0

            # logistic
            lr_last_cost = lr_cur_cost
            acc_train = self.score(X_train, y_train)
            acc_val, prfs_val = self.score(X_val, y_val, return_prfs=True)

            print(
                'E:%i, ae_cost:%.4f, lr_cost:%.4f, train_score:%.2f, valid_score:%.2f, ae_badsteps:%i'
                % (i_epoch + 1, ae_cur_cost, lr_cur_cost, acc_train, acc_val,
                   no_improve_steps))

            if (i_epoch % 10 == 0):
                self.dbg_ae_cost_.append(ae_cur_cost)
                self.dbg_lr_cost_.append(lr_cur_cost)

                self.dbg_epochs_.append(i_epoch + 1)
                self.dbg_ae_nonimprovesteps.append(no_improve_steps)
                self.dbg_acc_train_.append(acc_train)
                self.dbg_acc_val_.append(acc_val)
                self.dbg_prfs_.append(prfs_val)

            # if i_epoch > (self.max_epochs - 100):
            param_pool = self.get_param_pool()
            self.params_from_last_iters.append(param_pool)

        total_mins = (time.time() - start_time) / 60
        hs, mins = divmod(total_mins, 60)
        print("Final duration: %i hours and %i minutes" % (hs, mins))

        return self
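
# Sketch (not necessarily the author's version): the self.RMSprop helper called above is
# not shown in this snippet. A typical Theano implementation of the standard rule keeps a
# running average of squared gradients per parameter:
import numpy as np
import theano
import theano.tensor as T

def rmsprop_updates(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    updates = []
    for p, g in zip(params, T.grad(cost, params)):
        acc = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
        acc_new = rho * acc + (1 - rho) * g ** 2          # running mean of squared gradients
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g / T.sqrt(acc_new + epsilon)))
    return updates
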
Example #43
0
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 20
    print_period = 10

    lr = 0.00004
    reg = 0.01

    X = X.astype(np.float32)
    Y = Y.astype(np.float32)
    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain).astype(np.float32)
    Ytest_ind = y2indicator(Ytest).astype(np.float32)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N / batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / 28
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # step 2: define theano variables and expressions
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init.astype(np.float32), 'W1')
    b1 = theano.shared(b1_init.astype(np.float32), 'b1')
    W2 = theano.shared(W2_init.astype(np.float32), 'W2')
    b2 = theano.shared(b2_init.astype(np.float32), 'b2')

    # we can use the built-in theano functions to do relu and softmax
    thZ = relu( thX.dot(W1) + b1 ) # relu is new in Theano 0.7.1, so use your own relu helper in case you don't have it
    thY = T.nnet.softmax( thZ.dot(W2) + b2 )

    # define the cost function and prediction
    cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())
    prediction = T.argmax(thY, axis=1)

    # step 3: training expressions and functions
    # we can just include regularization as part of the cost because it is also automatically differentiated!
    # update_W1 = W1 - lr*(T.grad(cost, W1) + reg*W1)
    # update_b1 = b1 - lr*(T.grad(cost, b1) + reg*b1)
    # update_W2 = W2 - lr*(T.grad(cost, W2) + reg*W2)
    # update_b2 = b2 - lr*(T.grad(cost, b2) + reg*b2)
    update_W1 = W1 - lr*T.grad(cost, W1)
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )

    # create another function for this because we want it over the whole dataset
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    t0 = datetime.now()
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print "Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err)

    print "Training time:", datetime.now() - t0
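
# Sketch (not from the example): the step-3 comment above notes that regularization only
# needs to appear in the cost, because T.grad differentiates the penalty term along with
# everything else. A tiny check of that claim on a made-up scalar cost:
import numpy as np
import theano
import theano.tensor as T

W_reg = theano.shared(np.array([1.0, -2.0]))
x_reg = T.dvector('x')
reg_toy = 0.01

data_cost = T.sum((T.dot(x_reg, W_reg) - 3.0) ** 2)
full_cost = data_cost + reg_toy * T.sum(W_reg ** 2)       # penalty lives only in the cost

auto_grad = theano.function([x_reg], T.grad(full_cost, W_reg))
manual_grad = theano.function([x_reg], T.grad(data_cost, W_reg) + 2 * reg_toy * W_reg)
v = np.array([0.5, 1.5])
print(np.allclose(auto_grad(v), manual_grad(v)))          # True
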
from models import ResNet_FullPreActivation, ResNet_BottleNeck_FullPreActivation
from utils import load_pickle_data_test

BATCHSIZE = 1
'''
Set up all theano functions
'''
X = T.tensor4('X')
Y = T.ivector('y')

# set up theano functions to generate output by feeding data through network, any test outputs should be deterministic
output_layer = ResNet_BottleNeck_FullPreActivation(X, n=18)
output_test = lasagne.layers.get_output(output_layer, deterministic=True)

output_class = T.argmax(output_test, axis=1)

# set up training and prediction functions
predict_proba = theano.function(inputs=[X], outputs=output_test)
predict_class = theano.function(inputs=[X], outputs=output_class)
'''
Load data and make predictions
'''
test_X, test_y = load_pickle_data_test()

# load network weights
f = gzip.open('data/weights/resnet164_fullpreactivation.pklz', 'rb')
all_params = pickle.load(f)
f.close()
helper.set_all_param_values(output_layer, all_params)
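
# Sketch (not part of the snippet, which ends after loading the weights): a plausible
# continuation feeds the test set through predict_class in fixed-size chunks. The helper
# below is generic; the commented usage assumes the snippet's test_X / test_y / BATCHSIZE.
import numpy as np

def predict_in_batches(predict_fn, X, batch_size):
    outputs = []
    for start in range(0, len(X), batch_size):
        outputs.append(predict_fn(X[start:start + batch_size]))
    return np.concatenate(outputs)

# y_hat = predict_in_batches(predict_class, test_X.astype(np.float32), BATCHSIZE)
# print('test accuracy: %.4f' % np.mean(y_hat == test_y))
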
Example #45
0
                print ipt.ndim, 'dimensions'
            except:
                print 'no ndm'
            print min_informative_str(ipt)
    if found > 0:
        print type(node.op), found
        try:
            print '\t', type(node.op.scalar_op)
        except:
            pass

print count

test = CIFAR10(which_set='test', one_hot=True, gcn=55.)

yl = T.argmax(yb, axis=1)

mf1acc = 1. - T.neq(yl, T.argmax(ymf1, axis=1)).mean()
#mfnacc = 1.-T.neq(yl , T.argmax(mfny,axis=1)).mean()

batch_acc = function([Xb, yb], [mf1acc])


def accs():
    mf1_accs = []
    for i in xrange(10000 / batch_size):
        mf1_accs.append(
            batch_acc(
                test.get_topological_view(test.X[i * batch_size:(i + 1) *
                                                 batch_size, :]),
                test.y[i * batch_size:(i + 1) * batch_size, :])[0])
Example #46
0
def main(argv):
    ###-------------------------------- Get files to proccess ----------------------------------------------

    # just read header to get layer dimensions
    #files = glob.glob("B:\\NN_data\\THEANO_DATA\\tmp\\*.tmp")
    context = 5
    in_frame_num = context * 2 + 1
    feature_per_frame = 41
    output_vec_len = 214

    files = glob.glob(argv[0] + "\\*.mfsc")
    #inputs, in_frame_num, feature_per_frame, output_vec_len = read_data_only_inputs(files[0])
    print("\nfeature_per_frame:  {}".format(feature_per_frame))
    print("\noutput_vec_len:  {}".format(output_vec_len))

    ###-------------------------------- Read apriori state ppb ---------------------------------------------
    apriori_ppb_input = read_state_apriori_ppb(argv[2], output_vec_len)
    #print(apriori_ppb)
    #exit(0)
    ###-------------------------------- BUILD Theano functions ----------------------------------------------

    network = None
    layer2reg = None
    total_batch = 0

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    apriori_ppb = T.fmatrix('apriori')
    target_var = T.ivector('targets')
    lrate_var = theano.tensor.scalar('lrate', dtype='float32')
    momentum_var = theano.tensor.scalar('momentum', dtype='float32')

    #create network
    #network, layer2reg = build_cnn(input_var, in_frame_num, feature_per_frame, output_vec_len)
    with open(argv[1], "rb") as f2:
        context = int(f2.readline())
        in_frame_num = context * 2 + 1
        filter_num = int(f2.readline())
        neuron_num = int(f2.readline())
        network, layer2reg = build_cnn(input_var, in_frame_num,
                                       feature_per_frame, output_vec_len,
                                       filter_num, neuron_num)
        lasagne.layers.set_all_param_values(network, np.load(f2))

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    # SGB with momentm and changing learning rate
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(loss,
                                                params,
                                                learning_rate=lrate_var,
                                                momentum=momentum_var)

    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a training function
    train_fn = theano.function(
        [input_var, target_var, lrate_var, momentum_var],
        loss,
        updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
    test_prediction_1 = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.dot(test_prediction_1, apriori_ppb)
    test_result = T.argmax(test_prediction_1, axis=1)

    # Compile a second function computing the validation loss and accuracy:
    predict_fn = theano.function([input_var], [test_result])
    predict_fn_j = theano.function([input_var, apriori_ppb], [test_prediction])
    print("theano functions compiled")

    ###-------------------------------- Final launch ----------------------------------------------

    file_counter = 0
    print("")

    for one_file in files:
        inputs_j = load_mfsc_data_with_context(one_file, context)
        dot_index = one_file.find('.mfsc')
        output_file_name = one_file[0:dot_index] + ".bin"
        print(inputs_j[0][0])
        sample_num = len(inputs_j)
        base_file_name = output_file_name[output_file_name.rfind('\\') + 1:]
        file_counter += 1
        if file_counter % 5 == 0:
            print(base_file_name)
        else:
            print(base_file_name + "   ", end="")

        with open(output_file_name, 'wb') as f:
            f.write(struct.pack('I', swap32(len(inputs_j))))
            f.write(struct.pack('I', swap32(100000)))
            f.write(struct.pack('H', output_vec_len * 4)[::-1])
            f.write(struct.pack('H', 9)[::-1])

            begin = 0
            batch = 8
            while begin < sample_num:
                batch_size = np.minimum(batch, sample_num - begin)
                result = predict_fn_j(
                    inputs_j[begin:begin + batch_size].reshape(
                        batch_size, 3, in_frame_num, feature_per_frame),
                    apriori_ppb_input)

                res = np.log(result)[0]
                for aaa in res:
                    for ppb in aaa:
                        f.write(ppb)
                        #f.write(struct.pack('f',ppb)[::-1])
                begin += batch_size
    print("")
Example #47
0
def generator_step_sm(x_tm1, h_tm1, m_tm1, s_tm1, tau, eps):
    """One step of the generative decoder version."""
    # x_tm1 is `BxT` one-hot, h_tm1 is `batch x ...`
    # m_tm1 is `batch`, tau, eps are scalars

    # collect the inputs
    inputs = {l_decoder_embed: x_tm1.dimshuffle(0, "x", 1),
              l_decoder_mask: m_tm1.dimshuffle(0, "x")}

    # Connect the prev variables to the the hidden and stack state feeds
    j = 0
    for layer in dec_rnn_layers:
        inputs[layer.hid_init] = slice_(h_tm1, j, layer.num_units)
        j += layer.num_units

    j = 0
    for layer in dec_rnn_layers:
        layer = layer.input_layers[1]
        dep, wid = layer.output_shape[-2:]
        stack_slice_ = slice_(s_tm1, j, dep * wid)
        inputs[layer] = stack_slice_.reshape((-1, dep, wid))
        j += dep * wid

    # Get the outputs
    outputs = [l_decoder_reembedder]
    for pair in zip(dec_rnn_layers_sliced, dec_rnn_layers_stack):
        outputs.extend(pair)

    # propagate through the decoder column
    logit_t, *rest = lasagne.layers.get_output(outputs, inputs,
                                               deterministic=True)
    h_t_list, s_t_list = rest[::2], rest[1::2]

    # Pack the hidden and flattened stack states
    h_t = tt.concatenate(h_t_list, axis=-1)
    s_t = tt.concatenate([v.flatten(ndim=2) for v in s_t_list], axis=-1)
    
    # Generate the next symbol: logit_t is `Bx1xV`
    logit_t = logit_t[:, 0]
    prob_t = tt.nnet.softmax(logit_t)

    # Gumbel-softmax sampling: Gumbel (e^{-e^{-x}}) distributed random noise
    gumbel = -tt.log(-tt.log(theano_random_state.uniform(size=logit_t.shape) + eps) + eps)
#     logit_t = theano.ifelse.ifelse(tt.gt(tau, 0), gumbel + logit_t, logit_t)
#     inv_temp = theano.ifelse.ifelse(tt.gt(tau, 0), 1.0 / tau, tt.constant(1.0))
    logit_t = tt.switch(tt.gt(tau, 0), gumbel + logit_t, logit_t)
    inv_temp = tt.switch(tt.gt(tau, 0), 1.0 / tau, tt.constant(1.0))

    # Get the softmax: x_t is `BxV`
    x_t = tt.nnet.softmax(logit_t * inv_temp)

    # Get the best symbol
    c_t = tt.cast(tt.argmax(x_t, axis=-1), "int8")

    # Get the estimated probability of the picked symbol.
    p_t = prob_t[tt.arange(c_t.shape[0]), c_t]

    # Compute the mask and inhibit the propagation on a stop symbol.
    # Recurrent layers return the previous state if m_tm1 is False
    m_t = m_tm1 & tt.gt(c_t, vocab.index("\x03"))
    c_t = tt.switch(m_t, c_t, vocab.index("\x03"))

    # There is no need to freeze the states as they will be frozen by
    # the RNN passthrough according to the mask `m_t`.

    # Embed the current character.
    x_t = tt.dot(x_t, l_embed_char.W)

    return x_t, h_t, m_t, s_t, p_t, c_t
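
# Sketch (not from the example): the generator step above uses the Gumbel-softmax trick:
# Gumbel(0, 1) noise is added to the logits and a temperature-controlled softmax gives a
# differentiable stand-in for sampling from the categorical distribution. The sampling
# arithmetic in numpy, mirroring the -log(-log(u + eps) + eps) expression above:
import numpy as np

def gumbel_softmax_sample(logits, tau=1.0, eps=1e-20, rng=np.random):
    gumbel = -np.log(-np.log(rng.uniform(size=logits.shape) + eps) + eps)
    z = (logits + gumbel) / tau
    e = np.exp(z - z.max(axis=-1, keepdims=True))    # numerically stable softmax
    return e / e.sum(axis=-1, keepdims=True)

logits_toy = np.array([[2.0, 0.5, 0.1]])
print(gumbel_softmax_sample(logits_toy, tau=1.0))    # soft, noisy distribution
print(gumbel_softmax_sample(logits_toy, tau=0.1))    # low temperature -> close to one-hot
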
loss_remember = (I0*lasagne.objectives.squared_error(W0,W_n0)).mean()+ \
                (I1*lasagne.objectives.squared_error(W1,W_n1)).mean()+ \
                (I2*lasagne.objectives.squared_error(W2,W_n2)).mean()
loss = loss_class + 100 * loss_remember

# Get network params, with specifications of manually updated ones
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.sgd(loss, params, learning_rate=0.0001)
#updates = lasagne.updates.adam(loss,params,learning_rate=0.00001)
#updates = lasagne.updates.nesterov_momentum(loss,params,learning_rate=0.01)

test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(
    test_prediction, target_var)
test_loss = test_loss.mean()
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                  dtype=theano.config.floatX)

# Compile theano function computing the training validation loss and accuracy:
train_fn = theano.function([input_var, target_var, W0, W1, W2, I0, I1, I2],
                           [loss, loss_class, loss_remember],
                           updates=updates)
#train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
#gradient_fn = theano.function([input_var, target_var], gradient)

# The training loop
print("Starting training...")
num_epochs = 250
for epoch in range(num_epochs):
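
# Sketch (not from the example): loss_remember above is an importance-weighted squared-error
# penalty between two copies of each weight matrix (W0/W1/W2 vs W_n0/W_n1/W_n2), in the
# style of elastic weight consolidation, so knowledge stored in important weights is kept
# while the new task is learned. The penalty for one weight matrix in numpy, made-up values:
import numpy as np

def remember_penalty(importance, old_w, new_w):
    return np.mean(importance * (old_w - new_w) ** 2)

old_w = np.array([[1.0, 2.0], [0.5, -1.0]])
new_w = np.array([[1.1, 2.0], [0.0, -1.0]])
importance = np.array([[1.0, 1.0], [0.0, 1.0]])    # the masked-out weight is free to move
print(remember_penalty(importance, old_w, new_w))  # 0.0025: only important weights contribute
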
Example #49
0
    def train(self, Xs, Ys, Xv, Yv, mdl,
              data_folder='data/', out_folder='out/'):

        data_folder = os.path.join(data_folder, 'imgs/', 'train/')
        input_var = mdl.input_var
        net = mdl.get_output_layer()
        target_var = T.ivector('targets')

        prediction = lasagne.layers.get_output(net)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()

        params = lasagne.layers.get_all_params(net, trainable=True)

        grads = T.grad(loss, params)

        test_prediction = lasagne.layers.get_output(net, deterministic=True)
        test_loss = lasagne.objectives. \
            categorical_crossentropy(test_prediction, target_var)
        test_loss = test_loss.mean()
        test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                          dtype=theano.config.floatX)

        logger.info("Compiling network functions...")
        grads_fn = theano.function([input_var, target_var], grads)
        train_fn = theano.function([input_var, target_var], loss)
        val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
        predict_proba = theano.function([input_var], test_prediction)

        logger.info("Training...")
        logger.info('GPU Free Mem: %.3f' % gpu_free_mem('gb'))

        # TODO change to steps
        epochs = self.max_iter // len(Xs)

        best_val_loss, best_epoch = None, None
        best_mdl_path = os.path.join(out_folder, 'best_model.npz')
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)

        steps = 0
        for epoch in range(epochs):
            start_time = time.time()
            train_err, train_batches = 0, 0
            data_s = FileSystemData(Xs, Ys, data_folder, self.batch_size,
                                    infinite=False, augment=True, shuffle=True)
            step_err, step_g = 0, None

            for batch in tqdm(data_s, total=data_s.steps, leave=False):
                inputs, targets = batch
                inputs = floatX(np.array([mdl.preprocess(x) for x in inputs]))

                batch_err = train_fn(inputs, targets)
                batch_g = grads_fn(inputs, targets)

                if step_g is None:
                    step_g = batch_g
                else:
                    step_g = [s_g + b_g for s_g, b_g in zip(step_g, batch_g)]
                train_err += batch_err
                step_err += batch_err
                train_batches += 1
                if train_batches % self.iter_size == 0:
                    step_g = [g / np.array(self.iter_size) for g in step_g]

                    if steps == 0:
                        t_prev, m_prev, u_prev = \
                            init_adam(batch_g, params)
                    updates = step_adam(step_g, params, t_prev, m_prev, u_prev,
                                        learning_rate=self.base_lr)
                    for p, new_val in updates.items():
                        p.set_value(new_val)
                    steps += 1
                    step_err, step_g = 0, None

            data_v = FileSystemData(Xv, Yv, data_folder, self.batch_size,
                                    infinite=False, augment=False, shuffle=False)
            val_err, val_acc, val_batches = 0, 0, 0
            for batch in tqdm(data_v, total=data_v.steps, leave=False):
                inputs, targets = batch
                inputs = floatX(np.array([mdl.preprocess(x) for x in inputs]))
                err, acc = val_fn(inputs, targets)
                val_err += err
                val_acc += acc
                val_batches += 1

            train_loss = train_err / train_batches
            val_loss = val_err / val_batches
            val_acc = val_acc / val_batches * 100
            end_time = time.time() - start_time

            if not best_val_loss or val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                np.savez(best_mdl_path,
                         *lasagne.layers.get_all_param_values(net))
            snapshot_path = os.path.join(out_folder, 'snapshot_epoch_%d.npz'
                                         % epoch)
            np.savez(snapshot_path, *lasagne.layers.get_all_param_values(net))

            logger.info("epoch[%d] -- Ls: %.3f | Lv: %.3f | ACCv: %.3f | Ts: %.3f"
                        % (epoch, train_loss, val_loss, val_acc, end_time))

        logger.info("loading best model: epoch[%d]" % best_epoch)
        with np.load(best_mdl_path) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(net, param_values)

        return predict_proba
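
# Sketch (not from the example): the loop above sums gradients over iter_size minibatches,
# divides by iter_size, and only then takes one Adam step, emulating a minibatch iter_size
# times larger than what fits in memory. The accumulation pattern in isolation, with toy
# stand-ins for the gradient and update functions:
import numpy as np

def accumulate_and_step(batches, compute_grads, apply_update, iter_size):
    acc, seen = None, 0
    for batch in batches:
        g = compute_grads(batch)
        acc = g if acc is None else [a + b for a, b in zip(acc, g)]
        seen += 1
        if seen % iter_size == 0:
            apply_update([a / float(iter_size) for a in acc])   # one step per iter_size minibatches
            acc = None

toy_param = np.zeros(2)
def toy_grads(batch):
    return [batch]                       # pretend the batch itself is the gradient
def toy_update(grads):
    global toy_param
    toy_param = toy_param - 0.1 * grads[0]

accumulate_and_step([np.array([1.0, 0.0]), np.array([0.0, 1.0]),
                     np.array([2.0, 2.0]), np.array([0.0, 0.0])],
                    toy_grads, toy_update, iter_size=2)
print(toy_param)   # two averaged SGD-style steps were applied
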
Example #50
0
    def __init__(
        self,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean
         network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the
         mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        xs_var = prob_network.input_layer.input_var
        ys_var = TT.imatrix("ys")
        old_prob_var = TT.matrix("old_prob")

        x_mean_var = theano.shared(np.zeros((1, ) + input_shape),
                                   name="x_mean",
                                   broadcastable=(True, ) +
                                   (False, ) * len(input_shape))
        x_std_var = theano.shared(np.ones((1, ) + input_shape),
                                  name="x_std",
                                  broadcastable=(True, ) +
                                  (False, ) * len(input_shape))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob,
                                {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        predicted = tensor_utils.to_onehot_sym(TT.argmax(prob_var, axis=1),
                                               output_dim)

        self._f_predict = tensor_utils.compile_function([xs_var], predicted)
        self._f_prob = tensor_utils.compile_function([xs_var], prob_var)
        self._prob_network = prob_network
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
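
# Sketch (not part of the class above): the regressor keeps the input mean and std in
# shared variables with a broadcastable leading axis, so (xs - mean) / std normalizes
# every row of a batch. The same normalization in numpy, with made-up data:
import numpy as np

X_norm_toy = np.array([[1.0, 10.0], [3.0, 30.0], [5.0, 50.0]])
x_mean_toy = X_norm_toy.mean(axis=0, keepdims=True)     # shape (1, feature_dim), broadcasts over rows
x_std_toy = X_norm_toy.std(axis=0, keepdims=True)
print((X_norm_toy - x_mean_toy) / (x_std_toy + 1e-8))   # zero-mean, unit-std columns
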
    def fit(self, X, learning_rate=1e-5, mu=0.99, epochs=10, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU, normalize=True):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X')
        thY = T.ivector('Y')

        Z = self.We[thX]
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        # let's return py_x too so we can draw a sample instead
        self.predict_op = theano.function(
            inputs=[thX],
            outputs=[py_x, prediction],
            allow_input_downcast=True,
        )
        
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        dWe = theano.shared(self.We.get_value()*0)
        gWe = T.grad(cost, self.We)
        dWe_update = mu*dWe - learning_rate*gWe
        We_update = self.We + dWe_update
        if normalize:
            We_update /= We_update.norm(2)

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ] + [
            (self.We, We_update), (dWe, dWe_update)
        ]

        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        for i in range(epochs):
            t0 = datetime.now()
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0
            for j in range(N):
                if np.random.random() < 0.01 or len(X[j]) <= 1:
                    input_sequence = [0] + X[j]
                    output_sequence = X[j] + [1]
                else:
                    input_sequence = [0] + X[j][:-1]
                    output_sequence = X[j]
                n_total += len(output_sequence)

                # token 0 marks the start of a sequence and token 1 marks the end
                try:
                    c, p = self.train_op(input_sequence, output_sequence)
                except Exception as e:
                    PYX, pred = self.predict_op(input_sequence)
                    print("input_sequence len:", len(input_sequence))
                    print("PYX.shape:",PYX.shape)
                    print("pred.shape:", pred.shape)
                    raise e
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
                if j % 200 == 0:
                    sys.stdout.write("j/N: %d/%d correct rate so far: %f\r" % (j, N, float(n_correct)/n_total))
                    sys.stdout.flush()
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
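
The updates list above is classical momentum: each parameter keeps a velocity that decays by mu and absorbs the scaled gradient, and the word-embedding matrix is additionally renormalized when normalize=True. A minimal sketch of the same step on plain Python numbers:

def momentum_step(p, dp, g, learning_rate=1e-5, mu=0.99):
    # velocity: decay the old velocity and take a gradient step
    dp_new = mu * dp - learning_rate * g
    # the parameter moves by the new velocity
    return p + dp_new, dp_new

# e.g. momentum_step(1.0, 0.0, 2.0) -> (0.99998, -2e-05)
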
Example #52
0
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    W_word_embedding = snli.weight / \
                       (numpy.linalg.norm(snli.weight, axis=1).reshape(snli.weight.shape[0], 1) + \
                        0.00001)
    del snli

    print("Building network ...")
    ########### input layers ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18),
                              'int32'), numpy.zeros(
                                  (BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 18), dtype='int32'), numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16),
                              'int32'), numpy.zeros(
                                  (BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 16), dtype='int32'), numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)
    ###################################

    # output shape (BSIZE, None, WEDIM)
    l_hypo_embed = lasagne.layers.EmbeddingLayer(
        l_in_h,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    l_prem_embed = lasagne.layers.EmbeddingLayer(
        l_in_p,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=l_hypo_embed.W)

    # EMBEDDING MAPPING: output shape (BSIZE, None, WEMAP)
    l_hypo_reduced_embed = DenseLayer3DInput(l_hypo_embed,
                                             num_units=WEMAP,
                                             b=None,
                                             nonlinearity=None)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_reduced_embed,
                                                     p=DPOUT,
                                                     rescale=True)
    l_prem_reduced_embed = DenseLayer3DInput(l_prem_embed,
                                             num_units=WEMAP,
                                             W=l_hypo_reduced_embed.W,
                                             b=None,
                                             nonlinearity=None)
    l_prem_embed_dpout = lasagne.layers.DropoutLayer(l_prem_reduced_embed,
                                                     p=DPOUT,
                                                     rescale=True)

    # ATTEND
    l_hypo_embed_hid1 = DenseLayer3DInput(
        l_hypo_embed_dpout,
        num_units=EMBDHIDA,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_hypo_embed_hid1,
                                                          p=DPOUT,
                                                          rescale=True)
    l_hypo_embed_hid2 = DenseLayer3DInput(
        l_hypo_embed_hid1_dpout,
        num_units=EMBDHIDB,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_embed_hid1 = DenseLayer3DInput(
        l_prem_embed_dpout,
        num_units=EMBDHIDA,
        W=l_hypo_embed_hid1.W,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_prem_embed_hid1,
                                                          p=DPOUT,
                                                          rescale=True)
    l_prem_embed_hid2 = DenseLayer3DInput(
        l_prem_embed_hid1_dpout,
        num_units=EMBDHIDB,
        W=l_hypo_embed_hid2.W,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    # output dim: (BSIZE, NROWx, NROWy)
    l_e = ComputeEmbeddingPool([l_hypo_embed_hid1, l_prem_embed_hid2])
    # output dim: (BSIZE, NROWy, DIM)
    l_hypo_weighted = AttendOnEmbedding([l_hypo_reduced_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='col')
    # output dim: (BSIZE, NROWx, DIM)
    l_prem_weighted = AttendOnEmbedding([l_prem_reduced_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='row')

    # COMPARE
    # output dim: (BSIZE, NROW, 4*LSTMHID)
    l_hypo_premwtd = lasagne.layers.ConcatLayer(
        [l_hypo_reduced_embed, l_prem_weighted], axis=2)
    l_prem_hypowtd = lasagne.layers.ConcatLayer(
        [l_prem_reduced_embed, l_hypo_weighted], axis=2)

    l_hypo_premwtd_dpout = lasagne.layers.DropoutLayer(l_hypo_premwtd,
                                                       p=DPOUT,
                                                       rescale=True)
    l_hypo_comphid1 = DenseLayer3DInput(
        l_hypo_premwtd_dpout,
        num_units=COMPHIDA,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    l_hypo_comphid1_dpout = lasagne.layers.DropoutLayer(l_hypo_comphid1,
                                                        p=DPOUT,
                                                        rescale=True)
    l_hypo_comphid2 = DenseLayer3DInput(
        l_hypo_comphid1_dpout,
        num_units=COMPHIDB,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_hypowtd_dpout = lasagne.layers.DropoutLayer(l_prem_hypowtd,
                                                       p=DPOUT,
                                                       rescale=True)
    l_prem_comphid1 = DenseLayer3DInput(
        l_prem_hypowtd_dpout,
        num_units=COMPHIDA,
        W=l_hypo_comphid1.W,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_comphid1_dpout = lasagne.layers.DropoutLayer(l_prem_comphid1,
                                                        p=DPOUT,
                                                        rescale=True)
    l_prem_comphid2 = DenseLayer3DInput(
        l_prem_comphid1_dpout,
        num_units=COMPHIDB,
        W=l_hypo_comphid2.W,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    # AGGREGATE
    # output dim: (BSIZE, 4*LSTMHID)
    l_hypo_mean = MeanOverDim(l_hypo_comphid2, mask=l_mask_h, dim=1)
    l_prem_mean = MeanOverDim(l_prem_comphid2, mask=l_mask_p, dim=1)

    l_v1v2 = lasagne.layers.ConcatLayer([l_hypo_mean, l_prem_mean], axis=1)
    l_v1v2_dpout = lasagne.layers.DropoutLayer(l_v1v2, p=DPOUT, rescale=True)

    l_outhid1 = lasagne.layers.DenseLayer(
        l_v1v2_dpout,
        num_units=OUTHID,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid1_dpout = lasagne.layers.DropoutLayer(l_outhid1,
                                                  p=DPOUT,
                                                  rescale=True)

    l_outhid2 = lasagne.layers.DenseLayer(
        l_outhid1_dpout,
        num_units=OUTHID,
        b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    # l_outhid2_dpout = lasagne.layers.DropoutLayer(l_outhid2, p=DPOUT, rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid2,
        num_units=3,
        b=None,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([
        1,
    ] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))

    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output, target_values))
    cost_clean = T.mean(
        T.nnet.categorical_crossentropy(network_output_clean, target_values))

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        all_params.remove(l_hypo_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost, error_rate],
                            updates=updates)
    # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    compute_cost = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost_clean, error_rate_clean])

    # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    print("Done. Evaluating scratch model ...")
    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" %
          (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm,
                               truth) in enumerate(train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if (batches_seen * BSIZE) % 5000 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_error))
                    start = end

                if (batches_seen * BSIZE) % 100000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    print("***dev cost %f, error %f" %
                          (dev_set_cost, dev_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
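
Both evaluate() and the training loop above keep a streaming average rather than summing costs and dividing at the end: after n batches the running value equals the plain mean of the first n batch values. A small sketch of the same recurrence:

def streaming_mean(values):
    mean = 0.0
    for n, v in enumerate(values, 1):
        mean = (1.0 - 1.0 / n) * mean + v / n
    return mean

# streaming_mean([2.0, 4.0, 6.0]) == 4.0
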
Example #53
0
    def __init__(self,
                 environment,
                 rho=0.9,
                 rms_epsilon=0.0001,
                 momentum=0,
                 clip_delta=0,
                 freeze_interval=1000,
                 batch_size=32,
                 update_rule="rmsprop",
                 random_state=np.random.RandomState(),
                 double_Q=False,
                 neural_network=NN):
        """ Initialize environment
        
        """
        QNetwork.__init__(self, environment, batch_size)

        self._rho = rho
        self._rms_epsilon = rms_epsilon
        self._momentum = momentum
        self._clip_delta = clip_delta
        self._freeze_interval = freeze_interval
        self._double_Q = double_Q
        self._random_state = random_state

        self.update_counter = 0

        # list of symbolic variables, one per element of the belief state
        # --> T.tensor4 if the element's observation is a matrix, T.tensor3 if a vector, T.matrix if a scalar
        states = []
        next_states = []  # same as states, but at t+1
        # list of shared variables, one per element of the belief state
        self.states_shared = []
        self.next_states_shared = []  # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._input_dimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))

            elif len(dim) == 1:
                states.append(T.matrix("%s_%s" % ("state", i)))
                next_states.append(T.matrix("%s_%s" % ("next_state", i)))

            self.states_shared.append(
                theano.shared(np.zeros((batch_size, ) + dim,
                                       dtype=theano.config.floatX),
                              borrow=False))
            self.next_states_shared.append(
                theano.shared(np.zeros((batch_size, ) + dim,
                                       dtype=theano.config.floatX),
                              borrow=False))

        print("Number of observations per state: {}".format(
            len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".
              format(self._input_dimensions))

        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)

        Q_net = neural_network(self._batch_size, self._input_dimensions,
                               self._n_actions, self._random_state)
        self.q_vals, self.params, shape_after_conv = Q_net._buildDQN(states)

        print(
            "Number of neurons after spatial and temporal convolution layers: {}"
            .format(shape_after_conv))

        self.next_q_vals, self.next_params, shape_after_conv = Q_net._buildDQN(
            next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))

        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))

        self.terminals_shared = theano.shared(np.zeros((batch_size, 1),
                                                       dtype='int32'),
                                              broadcastable=(False, True))

        if self._double_Q:
            givens_next = {}
            for i, x in enumerate(self.next_states_shared):
                givens_next[states[i]] = x

            self.next_q_vals_current_qnet = theano.function([],
                                                            self.q_vals,
                                                            givens=givens_next)

            next_q_curr_qnet = theano.clone(self.next_q_vals)

            argmax_next_q_vals = T.argmax(next_q_curr_qnet,
                                          axis=1,
                                          keepdims=True)

            max_next_q_vals = self.next_q_vals[T.arange(batch_size),
                                               argmax_next_q_vals.reshape(
                                                   (-1, ))].reshape((-1, 1))

        else:
            max_next_q_vals = T.max(self.next_q_vals, axis=1, keepdims=True)

        not_terminals = T.ones_like(terminals) - terminals

        target = rewards + not_terminals * thediscount * max_next_q_vals

        q_val = self.q_vals[T.arange(batch_size),
                            actions.reshape((-1, ))].reshape((-1, 1))
        # Note: strangely, (target - q_val) leads to problems with Python 3.5, Theano 0.8.0rc and floatX=float32...
        diff = -q_val + target

        if self._clip_delta > 0:
            # This loss function implementation is taken from
            # https://github.com/spragunr/deep_q_rl
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self._clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss_ind = 0.5 * quadratic_part**2 + self._clip_delta * linear_part
        else:
            loss_ind = 0.5 * diff**2

        loss = T.mean(loss_ind)

        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared,  ## actions not needed!
            terminals: self.terminals_shared
        }

        for i, x in enumerate(self.states_shared):
            givens[states[i]] = x
        for i, x in enumerate(self.next_states_shared):
            givens[next_states[i]] = x

        gparams = []
        for p in self.params:
            gparam = T.grad(loss, p)
            gparams.append(gparam)

        updates = []

        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, self.params, gparams, thelr,
                                       self._rho, self._rms_epsilon)
        elif update_rule == 'rmsprop':
            for i, (p, g) in enumerate(zip(self.params, gparams)):
                acc = theano.shared(p.get_value() * 0.)
                acc_new = rho * acc + (1 - self._rho) * g**2
                gradient_scaling = T.sqrt(acc_new + self._rms_epsilon)
                g = g / gradient_scaling
                updates.append((acc, acc_new))
                updates.append((p, p - thelr * g))

        elif update_rule == 'sgd':
            for i, (param, gparam) in enumerate(zip(self.params, gparams)):
                updates.append((param, param - thelr * gparam))
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self._double_Q:
            self._train = theano.function(
                [thediscount, thelr, next_q_curr_qnet],
                [loss, loss_ind, self.q_vals],
                updates=updates,
                givens=givens,
                on_unused_input='warn')
        else:
            self._train = theano.function([thediscount, thelr],
                                          [loss, loss_ind, self.q_vals],
                                          updates=updates,
                                          givens=givens,
                                          on_unused_input='warn')
        givens2 = {}
        for i, x in enumerate(self.states_shared):
            givens2[states[i]] = x

        self._q_vals = theano.function([],
                                       self.q_vals,
                                       givens=givens2,
                                       on_unused_input='warn')
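
The clip_delta branch above is a Huber-style loss: quadratic while |diff| stays inside the clip range and linear outside it, so the gradient magnitude never exceeds clip_delta. A NumPy sketch of the same piecewise form, useful for checking values offline:

import numpy as np

def clipped_loss(diff, clip_delta):
    quadratic_part = np.minimum(np.abs(diff), clip_delta)   # capped at the clip point
    linear_part = np.abs(diff) - quadratic_part             # excess beyond the clip point
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part
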
Example #54
0
    def compile(self,
                optimizer,
                loss,
                class_mode="categorical",
                theano_mode=None):
        self.optimizer = optimizers.get(optimizer)

        self.loss = objectives.get(loss)
        weighted_loss = weighted_objective(objectives.get(loss))

        # input of model
        self.X_train = self.get_input(train=True)
        self.X_test = self.get_input(train=False)

        self.y_train = self.get_output(train=True)
        self.y_test = self.get_output(train=False)

        # target of model
        self.y = T.zeros_like(self.y_train)

        self.weights = T.ones_like(self.y_train)

        if hasattr(self.layers[-1], "get_output_mask"):
            mask = self.layers[-1].get_output_mask()
        else:
            mask = None
        train_loss = weighted_loss(self.y, self.y_train, self.weights, mask)
        test_loss = weighted_loss(self.y, self.y_test, self.weights, mask)

        train_loss.name = 'train_loss'
        test_loss.name = 'test_loss'
        self.y.name = 'y'

        if class_mode == "categorical":
            train_accuracy = T.mean(
                T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_train,
                                                         axis=-1)))
            test_accuracy = T.mean(
                T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_test,
                                                         axis=-1)))

        elif class_mode == "binary":
            train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)),
                                    dtype='float32')
            test_accuracy = T.mean(T.eq(self.y, T.round(self.y_test)),
                                   dtype='float32')
        else:
            raise Exception("Invalid class mode:" + str(class_mode))
        self.class_mode = class_mode
        self.theano_mode = theano_mode

        for r in self.regularizers:
            train_loss = r(train_loss)
        updates = self.optimizer.get_updates(self.params, self.constraints,
                                             train_loss)
        updates += self.updates

        if type(self.X_train) == list:
            train_ins = self.X_train + [self.y, self.weights]
            test_ins = self.X_test + [self.y, self.weights]
            predict_ins = self.X_test
        else:
            train_ins = [self.X_train, self.y, self.weights]
            test_ins = [self.X_test, self.y, self.weights]
            predict_ins = [self.X_test]

        self._train = theano.function(train_ins,
                                      train_loss,
                                      updates=updates,
                                      allow_input_downcast=True,
                                      mode=theano_mode)
        self._train_with_acc = theano.function(train_ins,
                                               [train_loss, train_accuracy],
                                               updates=updates,
                                               allow_input_downcast=True,
                                               mode=theano_mode)
        self._predict = theano.function(predict_ins,
                                        self.y_test,
                                        allow_input_downcast=True,
                                        mode=theano_mode)
        self._test = theano.function(test_ins,
                                     test_loss,
                                     allow_input_downcast=True,
                                     mode=theano_mode)
        self._test_with_acc = theano.function(test_ins,
                                              [test_loss, test_accuracy],
                                              allow_input_downcast=True,
                                              mode=theano_mode)
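
The two accuracy expressions built in compile() compare argmaxes in the categorical mode and rounded outputs in the binary mode. Equivalent NumPy one-liners, as a sketch for sanity-checking the compiled *_with_acc functions:

import numpy as np

def categorical_accuracy(y_true, y_pred):
    # fraction of rows whose predicted class matches the target class
    return np.mean(np.argmax(y_true, axis=-1) == np.argmax(y_pred, axis=-1))

def binary_accuracy(y_true, y_pred):
    # fraction of outputs that round to the correct 0/1 target
    return np.mean(y_true == np.round(y_pred))
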
Example #55
0
def main():

    print("Loading Data")
    X_train, y_train, X_valid, y_valid, X_test, y_test = load_data.load_data_feautre_train(feautre = u"\uBC18\uD314",root_path= "/home/prosurpa/Image/image/",image_size=(28,28))

    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    print("Bulding Model")

    batch_size = 20

    network = build_f_cnn(batch_size ,input_var)


    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9
    )


    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)

    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)

    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    #model_rw.read_model_data(network, "75.0000009934model")

    print("Starting training")


    num_epochs = 1000
    best_acc = 75
    for epoch in range(num_epochs):
        train_err = 0
        train_batches = 0
        start_time = time.time()

        print((len(X_train)/batch_size))
        for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1
            if train_batches%20 == 0:
                print(train_batches)



        val_err = 0
        val_acc = 0
        val_batches = 0

        print((len(X_valid) / batch_size))
        for batch in iterate_minibatches(X_valid, y_valid, batch_size, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1
            if val_batches % 20 == 0:
                print(val_batches)


        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

        test_err = 0
        test_acc = 0
        test_batches = 0

        print((len(X_test) / batch_size))
        for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
            if test_batches % 20 == 0:
                print(test_batches)


        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(
            test_acc / test_batches * 100))

        re_acc = test_acc / test_batches * 100

        if re_acc > best_acc + 0.5:
            best_acc = re_acc
            model_rw.write_model_data(network, str(best_acc) + "model")
            lasagne.objectives.squared_error(prediction, train_prediction_b))
    #        loss=loss+pi_loss
    elif model.network_type == "tempens":
        # Tempens model loss:
        loss = T.mean(loss * mask_train, dtype=theano.config.floatX)
        loss += unsup_weight_var * T.mean(
            lasagne.objectives.squared_error(prediction, z_target_var))
    else:
        loss = T.mean(loss, dtype=theano.config.floatX)

    # regularization: L2 penalty on all network parameters
    l2_penalty = lasagne.regularization.regularize_network_params(
        gru_network, lasagne.regularization.l2) * model.l2_loss
    loss = loss + l2_penalty

    train_acc = T.mean(T.eq(T.argmax(prediction, axis=1),
                            T.argmax(target_var, axis=1)),
                       dtype=theano.config.floatX)
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here we use Adam with a shared
    # learning rate and beta1; Lasagne offers plenty of other update rules.
    params = lasagne.layers.get_all_params(gru_network, trainable=True)
    updates = lasagne.updates.adam(loss,
                                   params,
                                   learning_rate=learning_rate_var,
                                   beta1=adam_beta1_var)
    """
    3.test loss and accuracy
    """
    def fit(self, trees, learning_rate=10e-4, mu=0.99, reg=10e-3, epochs=15, activation=T.nnet.relu, train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        self.f = activation
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.W11 = theano.shared(W11)
        self.W22 = theano.shared(W22)
        self.W12 = theano.shared(W12)
        self.W1 = theano.shared(W1)
        self.W2 = theano.shared(W2)
        self.bh = theano.shared(bh)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh, self.Wo, self.bo]

        words = T.ivector('words')
        left_children = T.ivector('left_children')
        right_children = T.ivector('right_children')
        labels = T.ivector('labels')

        def recurrence(n, hiddens, words, left, right):
            w = words[n]
            # any non-word will have index -1
            hiddens = T.switch(
                T.ge(w, 0),
                T.set_subtensor(hiddens[n], self.We[w]),
                T.set_subtensor(hiddens[n],
                    self.f(
                        hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) +
                        hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) +
                        hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) +
                        hiddens[left[n]].dot(self.W1) +
                        hiddens[right[n]].dot(self.W2) +
                        self.bh
                    )
                )
            )
            return hiddens

        hiddens = T.zeros((words.shape[0], D))

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[hiddens],
            n_steps=words.shape[0],
            sequences=T.arange(words.shape[0]),
            non_sequences=[words, left_children, right_children],
        )

        py_x = T.nnet.softmax(h[:,0,:].dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        
        rcost = reg * T.mean([(p*p).sum() for p in self.params])
        if train_inner_nodes:
            cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost
        else:
            cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        self.cost_predict_op = theano.function(
            inputs=[words, left_children, right_children, labels],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[words, left_children, right_children, labels],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        sequence_indexes = list(range(N))
        if train_inner_nodes:
            n_total = sum(len(words) for words, _, _, _ in trees)
        else:
            n_total = N
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words, left, right, lab = trees[j]
                c, p = self.train_op(words, left, right, lab)
                if np.isnan(c):
                    print "Cost is nan! Let's stop here. Why don't you try decreasing the learning rate?"
                    exit()
                cost += c
                if train_inner_nodes:
                    n_correct += np.sum(p == lab)
                else:
                    n_correct += (p[-1] == lab[-1])
                it += 1
                if it % 1 == 0:
                    sys.stdout.write("j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (it, N, float(n_correct)/n_total, cost))
                    sys.stdout.flush()
            print "i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0)
            costs.append(cost)

        plt.plot(costs)
        plt.show()
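
Each call to recurrence() above either copies a word embedding (leaf node) or composes the two child vectors through the bilinear tensors W11, W22, W12 plus the linear terms. A NumPy sketch of that single-node composition, with np.tanh standing in for the activation argument:

import numpy as np

def compose(h_left, h_right, W11, W22, W12, W1, W2, bh, f=np.tanh):
    # bilinear terms through the (D, D, D) tensors plus linear terms and a bias,
    # mirroring the non-leaf branch of recurrence()
    return f(h_left.dot(W11).dot(h_left)
             + h_right.dot(W22).dot(h_right)
             + h_left.dot(W12).dot(h_right)
             + h_left.dot(W1) + h_right.dot(W2) + bh)
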
output = architecture.buildDCNN()
dcnnParams = lasagne.layers.get_all_params(output)

# SYMBOLIC INPUTS
x = T.imatrix()
y = T.ivector()

# Without L2 Regularization 
loss = lasagne.objectives.aggregate(
    lasagne.objectives.categorical_crossentropy(
        lasagne.layers.get_output(output, x), y), mode = 'mean')
updates = lasagne.updates.adagrad(loss, dcnnParams, learning_rate = 0.1)

# ACCURACY FOR PREDICTIONS
prediction = T.argmax(lasagne.layers.get_output(output, x, deterministic=True), axis=1)
score = T.eq(prediction, y).mean()

# SYMBOLIC FUNCTIONS
trainDCNN = theano.function([x,y], outputs = loss, updates = updates)
validateDCNN = theano.function([x,y], outputs = score)
testDCNN = theano.function([x,y], outputs = score)

# LOAD THE DATA
trainingSentences = loader.loadData('myDataset/train.txt')
trainingLabels = loader.loadData('myDataset/train_label.txt')
validationSentences = loader.loadData('myDataset/dev.txt')
validationLabels = loader.loadData('myDataset/dev_label.txt')
testSentences = loader.loadData('myDataset/test.txt')
testLabels = loader.loadData('myDataset/test_label.txt')
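
The compiled trainDCNN/validateDCNN functions each consume one int32 sentence matrix and one int32 label vector (the x and y symbols above). A hypothetical epoch loop, assuming loader.loadData returns lists of such per-batch arrays:

import numpy as np

for epoch in range(10):
    for xb, yb in zip(trainingSentences, trainingLabels):
        trainDCNN(xb, yb)
    dev_scores = [validateDCNN(xb, yb)
                  for xb, yb in zip(validationSentences, validationLabels)]
    print("epoch %d dev accuracy: %.4f" % (epoch, float(np.mean(dev_scores))))
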
    lasagne.layers.set_all_param_values(net['prob'], params)


n_batches_per_epoch = np.floor(n_training_samples/float(BATCH_SIZE))
n_test_batches = np.floor(n_val_samples/float(BATCH_SIZE))

x_sym = T.tensor4()
y_sym = T.ivector()

l2_loss = lasagne.regularization.regularize_network_params(net['prob'], lasagne.regularization.l2) * 5e-4

prediction_train = lasagne.layers.get_output(net['prob'], x_sym, deterministic=False)
loss = lasagne.objectives.categorical_crossentropy(prediction_train, y_sym)
loss = loss.mean()
loss += l2_loss
acc_train = T.mean(T.eq(T.argmax(prediction_train, axis=1), y_sym), dtype=theano.config.floatX)

prediction_test = lasagne.layers.get_output(net['prob'], x_sym, deterministic=True)
loss_val = lasagne.objectives.categorical_crossentropy(prediction_test, y_sym)
loss_val = loss_val.mean()
loss_val += l2_loss
acc = T.mean(T.eq(T.argmax(prediction_test, axis=1), y_sym), dtype=theano.config.floatX)

params = lasagne.layers.get_all_params(net['prob'], trainable=True)
learning_rate = theano.shared(np.float32(0.001))
updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate)

train_fn = theano.function([x_sym, y_sym], [loss, acc_train], updates=updates)
val_fn = theano.function([x_sym, y_sym], [loss_val, acc])
pred_fn = theano.function([x_sym], prediction_test)
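
Because learning_rate above is a theano shared variable, it can be changed between epochs without recompiling train_fn. A hypothetical schedule that halves it every 30 epochs:

import numpy as np

for epoch in range(100):
    if epoch > 0 and epoch % 30 == 0:
        learning_rate.set_value(np.float32(learning_rate.get_value() * 0.5))
    # ... run the usual minibatch loop with train_fn / val_fn here ...
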
Example #60
0
    return p_y_given_x


w_c1 = init_weights((4, 1, 3, 3))
b_c1 = init_weights((4, ))
w_c2 = init_weights((8, 4, 3, 3))
b_c2 = init_weights((8, ))
w_h3 = init_weights((8 * 4 * 4, 100))
b_h3 = init_weights((100, ))
w_o = init_weights((100, 10))
b_o = init_weights((10, ))

params = [w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o]

p_y_given_x = model(x, *params)
y = T.argmax(p_y_given_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(p_y_given_x, t))

updates = momentum(cost, params, learning_rate=0.01, momentum=0.9)

# compile theano functions
train = theano.function([x, t], cost, updates=updates)
predict = theano.function([x], y)

# train model
batch_size = 50

for i in range(50):
    print "iteration %d" % (i + 1)
    for start in range(0, len(x_train), batch_size):