def create_encoder_decoder_func(layers, apply_updates=False):
    """Compile a Theano function returning the autoencoder's mean squared
    reconstruction loss, optionally training it with Nesterov momentum.

    layers        -- dict of Lasagne layers; must contain 'l_decoder_out'
    apply_updates -- when True the compiled function also applies updates
                     (lr=0.01, momentum=0.9); otherwise it only evaluates
    """
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    # stochastic forward pass through encoder+decoder (noise/dropout active)
    X_hat = get_output(layers['l_decoder_out'], X, deterministic=False)

    # squared reconstruction error, averaged over features then over the batch
    per_sample_error = T.mean(T.sqr(X - X_hat), axis=1)
    encoder_decoder_loss = T.mean(per_sample_error)

    encoder_decoder_updates = None
    if apply_updates:
        # every trainable layer on the path to the decoder output is updated
        trainable = get_all_params(layers['l_decoder_out'], trainable=True)
        encoder_decoder_updates = nesterov_momentum(
            encoder_decoder_loss, trainable, 0.01, 0.9)

    return theano.function(
        inputs=[theano.In(X_batch)],
        outputs=encoder_decoder_loss,
        updates=encoder_decoder_updates,
        givens={X: X_batch},
    )
Beispiel #2
0
def create_updates(loss, network, opt, learning_rate, momentum, beta1, beta2):
    """Build Lasagne parameter updates for *network* minimizing *loss*.

    opt           -- 'adam' or 'momentum' (Nesterov momentum)
    learning_rate -- learning rate for either optimizer
    momentum      -- momentum coefficient (only used when opt == 'momentum')
    beta1, beta2  -- Adam moment-decay rates (only used when opt == 'adam')

    Raises ValueError for an unrecognized optimizer name.
    """
    params = lasagne.layers.get_all_params(network, trainable=True)
    # gradients are taken explicitly so a norm constraint could be inserted
    # between differentiation and the update rule if ever needed
    grads = theano.grad(loss, params)

    if opt == 'adam':
        updates = adam(grads,
                       params=params,
                       learning_rate=learning_rate,
                       beta1=beta1,
                       beta2=beta2)
    elif opt == 'momentum':
        updates = nesterov_momentum(grads,
                                    params=params,
                                    learning_rate=learning_rate,
                                    momentum=momentum)
    else:
        # BUG FIX: corrected the misspelled 'unkown' in the error message
        raise ValueError('unknown optimization algorithm: %s' % opt)

    return updates
Beispiel #3
0
 def __init__(self, C, lr):
     """Build the network and compile feature / prediction / scoring /
     training functions.

     C  -- L2 regularization strength
     lr -- learning rate for Nesterov momentum (fixed momentum 0.9)
     """
     self.C = C
     self.X = T.ftensor4()
     self.Y = T.fmatrix()
     self.net = self._forward()
     # only parameters up to the 'flatten' layer are collected for training
     params = layers.get_all_params(self.net['flatten'], trainable=True)
     netout = layers.get_output(self.net['out'])
     flattenout = layers.get_output(self.net['flatten'])
     # L2 penalty over the feature extractor, normalized by parameter count
     reg = regularization.regularize_network_params(self.net['flatten'],
                                                    regularization.l2)
     reg /= layers.helper.count_params(self.net['flatten'])
     # features from the flatten layer
     self.flattenfn = theano.function([self.X],
                                      flattenout,
                                      allow_input_downcast=True)
     # class probabilities from the output layer
     self.predictfn = theano.function([self.X],
                                      netout,
                                      allow_input_downcast=True)
     accrarcy = myUtils.basic.accuracy(netout, self.Y)  # sic: "accuracy"
     self.scorefn = theano.function([self.X, self.Y],
                                    accrarcy,
                                    allow_input_downcast=True)
     # first parameter of the output layer (its weight matrix)
     self.sharedBeta = self.net['out'].get_params()[0]
     crossentropy = objectives.categorical_crossentropy(netout, self.Y)
     cost = T.mean(crossentropy) + C * reg
     updatesDict = updates.nesterov_momentum(cost, params, lr, 0.9)
     # train starting from randomly initialized parameters
     self.trainfn = theano.function([self.X, self.Y], [cost, accrarcy],
                                    updates=updatesDict,
                                    allow_input_downcast=True)
Beispiel #4
0
    def __build_loss_train__fn__(self):
        """Compile training, prediction and validation functions.

        Training minimizes L2-regularized categorical crossentropy with
        Nesterov momentum; the learning rate lives in the shared variable
        ``self.eta`` so it can be changed without recompiling.
        """
        # create loss function
        prediction = layers.get_output(self.net)
        loss = objectives.categorical_crossentropy(prediction,
                                                   self.__target_var__)
        # mean crossentropy plus a small L2 penalty over all network params
        loss = loss.mean() + 1e-4 * regularization.regularize_network_params(
            self.net, regularization.l2)

        # fraction of argmax predictions matching the integer targets
        val_acc = T.mean(T.eq(T.argmax(prediction, axis=1),
                              self.__target_var__),
                         dtype=theano.config.floatX)

        # create parameter update expressions
        params = layers.get_all_params(self.net, trainable=True)
        # NOTE(review): `sp` presumably aliases scipy/numpy — confirm import
        self.eta = theano.shared(sp.array(sp.float32(0.05), dtype=sp.float32))
        update_rule = updates.nesterov_momentum(loss,
                                                params,
                                                learning_rate=self.eta,
                                                momentum=0.9)

        # compile training function that updates parameters and returns training loss
        self.__train_fn__ = theano.function(
            [self.__input_var__, self.__target_var__],
            loss,
            updates=update_rule)
        # deterministic forward pass for inference (dropout etc. disabled)
        self.__predict_fn__ = theano.function(
            [self.__input_var__],
            layers.get_output(self.net, deterministic=True))
        self.__val_fn__ = theano.function(
            [self.__input_var__, self.__target_var__], [loss, val_acc])
Beispiel #5
0
def build_network(nkwargs):
    """Build a denoising convolutional autoencoder and compile its
    train / validation / prediction / hidden-feature functions.

    nkwargs -- dict of network settings; 'learning_rate' and 'momentum'
               are forwarded to nesterov_momentum

    Returns (train_fn, val_fn, pred_fn, hlayer_fn, network).
    """
    optimizer_args = {key: nkwargs[key] for key in ('learning_rate', 'momentum')}

    input_var = T.tensor4('input_var')
    target_var = T.tensor4('target_var')

    network, hid_layer = build_denoising_convae(input_var, nkwargs)

    # stochastic output for training, deterministic for evaluation
    prediction = get_output(network, deterministic=False)
    test_prediction = get_output(network, deterministic=True)
    hid_layer_output = get_output(hid_layer, deterministic=True)

    # mean squared reconstruction losses
    loss = squared_error(prediction, target_var).mean()
    test_loss = squared_error(test_prediction, target_var).mean()
    # for an autoencoder, "accuracy" is simply the reconstruction loss
    test_acc = test_loss

    params = get_all_params(network, trainable=True)
    updates = nesterov_momentum(loss, params, **optimizer_args)

    train_fn = function([input_var, target_var], loss, updates=updates)
    val_fn = function([input_var, target_var], [test_loss, test_acc])
    pred_fn = function([input_var], test_prediction)
    hlayer_fn = function([input_var], hid_layer_output)

    return train_fn, val_fn, pred_fn, hlayer_fn, network
Beispiel #6
0
 def __init__(self, istrained, name=None, args=None):
     """Build the CNN; either load pretrained weights (istrained=True) or
     set up L2-regularized crossentropy training with Nesterov momentum.

     istrained -- load saved parameters and compile only a predict function
     args      -- (learning_rate, C, momentum) tuple, required when training
     """
     self.istrained = istrained
     self.X = T.tensor4('X')
     self.y = T.ivector('y')
     self.outprob = build_model(self.X)
     if self.istrained:
         # BUG FIX: close the pickle file instead of leaking the handle
         with open(dataset_path + 'plain_cnn.pkl', 'r') as f:
             params = cPickle.load(f)
         layers.set_all_param_values(self.outprob, params)
         self.yFullProb = layers.get_output(self.outprob, deterministic=True)
         self.predfn = makeFunc([self.X, ], [self.yFullProb, ], None)
     else:
         self.lr, self.C, self.momentum = args
         self.params = layers.get_all_params(self.outprob, trainable=True)
         # L2 penalty normalized by the number of parameters
         reg = regularization.regularize_network_params(self.outprob, regularization.l2)
         reg /= layers.helper.count_params(self.outprob)
         # training set: stochastic forward pass (dropout active)
         self.yDropProb = layers.get_output(self.outprob)
         trCrossentropy = objectives.categorical_crossentropy(self.yDropProb, self.y)
         self.trCost = trCrossentropy.mean() + self.C * reg
         # validation / test set: deterministic forward pass
         self.yFullProb = layers.get_output(self.outprob, deterministic=True)
         vateCrossentropy = objectives.categorical_crossentropy(self.yFullProb, self.y)
         self.vateCost = vateCrossentropy.mean() + self.C * reg
         # training function: takes a training batch, returns cost and predictions
         updatesDict = updates.nesterov_momentum(self.trCost, self.params, self.lr, self.momentum)
         self.trainfn = makeFunc([self.X, self.y], [self.trCost, self.yDropProb], updatesDict)
         # validation/test function: returns cost and predictions, applies no updates
         self.vatefn = makeFunc([self.X, self.y], [self.vateCost, self.yFullProb], None)
Beispiel #7
0
 def __init__(self, lr, C, momentum):
     """Build the network and compile train / validation functions using
     L2-regularized categorical crossentropy and Nesterov momentum.

     lr       -- learning rate
     C        -- L2 regularization strength
     momentum -- momentum coefficient
     """
     self.lr = lr
     self.C = C
     self.momentum = momentum
     self.X = T.tensor4('X')
     self.y = T.ivector('y')
     self.network = self._build()
     self.params = layers.get_all_params(self.network, trainable=True)
     # L2 penalty normalized by the number of parameters
     reg = regularization.regularize_network_params(self.network, regularization.l2)
     reg /= layers.helper.count_params(self.network)
     # training set: stochastic forward pass (dropout active)
     yDropProb = layers.get_output(self.network)
     self.trEqs = myUtils.basic.eqs(yDropProb, self.y)
     trCrossentropy = objectives.categorical_crossentropy(yDropProb, self.y)
     self.trCost = trCrossentropy.mean() + C * reg
     # validation / test set: deterministic forward pass
     yFullProb = layers.get_output(self.network, deterministic=True)
     self.vateEqs = myUtils.basic.eqs(yFullProb, self.y)
     vateCrossentropy = objectives.categorical_crossentropy(yFullProb, self.y)
     self.vateCost = vateCrossentropy.mean() + C * reg
     self.yPred = yFullProb
     # training function: takes a training batch, returns cost and match counts
     updatesDict = updates.nesterov_momentum(self.trCost, self.params, lr, momentum)
     self.trainfn = myUtils.basic.makeFunc([self.X, self.y], [self.trCost, self.trEqs], updatesDict)
     # validation/test function: returns cost and match counts, applies no updates
     self.vatefn = myUtils.basic.makeFunc([self.X, self.y], [self.vateCost, self.vateEqs], None)
Beispiel #8
0
def create_encoder_decoder_func(layers, apply_updates=False):
    """Compile a Theano function computing the autoencoder's mean squared
    reconstruction loss; optionally also apply Nesterov-momentum updates.

    layers        -- dict of Lasagne layers; must contain 'l_decoder_out'
    apply_updates -- when True the compiled function trains the network
                     (lr=0.01, momentum=0.9); otherwise it only evaluates

    Returns a function mapping a batch X to the scalar reconstruction loss.
    """
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    # stochastic forward pass (dropout/noise layers active)
    X_hat = get_output(layers['l_decoder_out'], X, deterministic=False)

    # reconstruction loss: mean over features, then over the batch
    encoder_decoder_loss = T.mean(T.mean(T.sqr(X - X_hat), axis=1))

    if apply_updates:
        # all layers that participate in the forward pass should be updated
        encoder_decoder_params = get_all_params(layers['l_decoder_out'],
                                                trainable=True)

        encoder_decoder_updates = nesterov_momentum(encoder_decoder_loss,
                                                    encoder_decoder_params,
                                                    0.01, 0.9)
    else:
        encoder_decoder_updates = None

    encoder_decoder_func = theano.function(
        inputs=[theano.In(X_batch)],
        outputs=encoder_decoder_loss,
        updates=encoder_decoder_updates,
        givens={
            X: X_batch,
        },
    )

    return encoder_decoder_func
Beispiel #9
0
def build_updates(grad, params, optimization, learning_rate):
    """Set up the optimization algorithm.

    grad          -- gradient expressions w.r.t. *params*
    params        -- trainable parameters
    optimization  -- dict with key 'optimizer' (one of 'sgd',
                     'nesterov_momentum', 'adagrad', 'rmsprop', 'adam')
                     plus optional hyperparameter overrides
    learning_rate -- learning rate (scalar or shared variable)

    Raises ValueError for an unrecognized optimizer name.
    """
    optimizer = optimization['optimizer']

    if optimizer == 'sgd':
        update_op = updates.sgd(grad, params, learning_rate=learning_rate)

    elif optimizer == 'nesterov_momentum':
        # BUG FIX: the original tested the undefined name `momenum`
        # (instead of the string 'momentum'), raising NameError whenever
        # this branch was taken.
        momentum = optimization.get('momentum', 0.9)
        update_op = updates.nesterov_momentum(
            grad, params, learning_rate=learning_rate, momentum=momentum)

    elif optimizer == 'adagrad':
        update_op = updates.adagrad(grad, params, learning_rate=learning_rate)

    elif optimizer == 'rmsprop':
        rho = optimization.get('rho', 0.9)
        update_op = updates.rmsprop(
            grad, params, learning_rate=learning_rate, rho=rho)

    elif optimizer == 'adam':
        beta1 = optimization.get('beta1', 0.9)
        beta2 = optimization.get('beta2', 0.999)
        update_op = updates.adam(
            grad, params, learning_rate=learning_rate, beta1=beta1, beta2=beta2)

    else:
        # previously an unknown optimizer fell through and the final
        # `return` raised UnboundLocalError on `update_op`
        raise ValueError('unknown optimizer: %s' % optimizer)

    return update_op
def generate_theano_func(args, network, penalty, input_dict, target_var):
    """Compile Theano train/validation functions for *network*.

    args       -- namespace with .optimizer ('sgd', 'adagrad', 'adadelta',
                  'nesterov', 'rms' or 'adam'), .step (learning rate) and
                  .task ('sts' or 'ent')
    network    -- Lasagne output layer
    penalty    -- symbolic regularization term added to the training loss
    input_dict -- mapping of input layers to symbolic inputs
    target_var -- symbolic target variable

    Returns (train_fn, val_fn).

    NOTE(review): input1_var / input1_mask_var / input2_var /
    input2_mask_var are not defined in this scope — presumably
    module-level symbolic variables; confirm before reuse.
    """
    prediction = get_output(network, input_dict)

    # crossentropy training loss plus the caller-supplied penalty
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # BUG FIX: `raise "..."` raised a bare string, which is itself a
        # TypeError (string exceptions are not supported in Python)
        raise ValueError("Need set optimizer correctly")

    # deterministic pass for evaluation (dropout etc. disabled)
    test_prediction = get_output(network, input_dict, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        # regression-style task: return loss and raw predictions
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )
    elif args.task == "ent":
        # classification task: return loss and accuracy
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )
    else:
        # BUG FIX: previously an unknown task fell through and the final
        # `return` raised NameError on the unbound `val_fn`
        raise ValueError("unknown task: %s" % args.task)

    return train_fn, val_fn
Beispiel #11
0
def get_updates(nnet, train_obj, trainable_params):
    """Build the SGD update rule selected by nnet.solver.

    Falls back to 'nesterov' when the network has no solver attribute or
    names one that is not implemented; the choice actually used is stored
    back on nnet.sgd_solver.
    """
    implemented_solvers = ("nesterov", "adagrad", "adadelta", "adam")

    # resolve the solver, defaulting to nesterov for missing/unknown values
    solver = getattr(nnet, "solver", None)
    if solver not in implemented_solvers:
        solver = "nesterov"
    nnet.sgd_solver = solver

    if solver == "nesterov":
        updates = l_updates.nesterov_momentum(train_obj,
                                              trainable_params,
                                              learning_rate=Cfg.learning_rate,
                                              momentum=0.9)
    elif solver == "adagrad":
        updates = l_updates.adagrad(train_obj,
                                    trainable_params,
                                    learning_rate=Cfg.learning_rate)
    elif solver == "adadelta":
        updates = l_updates.adadelta(train_obj,
                                     trainable_params,
                                     learning_rate=Cfg.learning_rate)
    else:  # "adam" — the only remaining implemented solver
        updates = l_updates.adam(train_obj,
                                 trainable_params,
                                 learning_rate=Cfg.learning_rate)

    return updates
Beispiel #12
0
    def __init__(self, conf):
        """Build a two-input autoencoder from *conf* and compile its
        training, reconstruction and encoding functions.

        conf -- configuration object with .act ('linear'|'sigmoid'|'relu'|
                'tanh'), .lr, .momentum and .load_model (path or None)

        Raises ValueError for an unknown activation name.
        """
        self.conf = conf

        # map the activation name onto the actual nonlinearity callable
        if self.conf.act == "linear":
            self.conf.act = linear
        elif self.conf.act == "sigmoid":
            self.conf.act = sigmoid
        elif self.conf.act == "relu":
            self.conf.act = rectify
        elif self.conf.act == "tanh":
            self.conf.act = tanh
        else:
            raise ValueError("Unknown activation function", self.conf.act)

        input_var_first = T.matrix('inputs1')
        input_var_second = T.matrix('inputs2')
        target_var = T.matrix('targets')

        # create network
        self.autoencoder, encoder_first, encoder_second = self.__create_toplogy__(
            input_var_first, input_var_second)

        self.out = get_output(self.autoencoder)

        # mean squared reconstruction error
        loss = squared_error(self.out, target_var)
        loss = loss.mean()

        params = get_all_params(self.autoencoder, trainable=True)
        updates = nesterov_momentum(loss,
                                    params,
                                    learning_rate=self.conf.lr,
                                    momentum=self.conf.momentum)

        # training function
        self.train_fn = theano.function(
            [input_var_first, input_var_second, target_var],
            loss,
            updates=updates)

        # fuction to reconstruct (deterministic: dropout/noise disabled)
        test_reconstruction = get_output(self.autoencoder, deterministic=True)
        self.reconstruction_fn = theano.function(
            [input_var_first, input_var_second], test_reconstruction)

        # encoding function: returns both encoders' outputs
        test_encode = get_output([encoder_first, encoder_second],
                                 deterministic=True)
        self.encoding_fn = theano.function([input_var_first, input_var_second],
                                           test_encode)

        # utils: fetch the BLAS routines matching the given dtype
        blas = lambda name, ndarray: scipy.linalg.get_blas_funcs(
            (name, ), (ndarray, ))[0]
        self.blas_nrm2 = blas('nrm2', np.array([], dtype=float))
        self.blas_scal = blas('scal', np.array([], dtype=float))

        # load weights if necessary
        if self.conf.load_model is not None:
            self.load_model()
 def set_update(self):
     """Create Nesterov-momentum updates with a schedulable learning rate.

     The learning rate is a shared variable so it can be changed between
     epochs (per self.lr_schedule) without recompiling the train function.
     """
     params = get_all_params(self.model, trainable=True)
     # epoch -> learning rate; NUM_EPOCHS is presumably a module-level
     # constant — confirm it is defined where this class is used
     self.lr_schedule = {
         0: 0.01,
         6: 0.001,
         NUM_EPOCHS-2: 0.0001
     }
     self.lr = theano.shared(np.float32(self.lr_schedule[0]))
     self.updates = nesterov_momentum(self.train_loss, params,
                                      learning_rate=self.lr, momentum=0.9)
Beispiel #14
0
def create_discriminator_func(layers, apply_updates=False):
    """Compile the discriminator's training/evaluation function for an
    adversarial autoencoder.

    layers        -- dict of Lasagne layers; must contain
                     'l_discriminator_out', 'l_prior_in', 'l_encoder_in'
    apply_updates -- when True the compiled function also trains the
                     discriminator (lr=0.1, no momentum)

    Returns a function mapping (X_batch, pz_batch) to the scalar
    binary-crossentropy loss.
    """
    X = T.fmatrix('X')
    pz = T.fmatrix('pz')

    X_batch = T.fmatrix('X_batch')
    pz_batch = T.fmatrix('pz_batch')

    # the discriminator receives samples from q(z|x) and p(z)
    # and should predict to which distribution each sample belongs
    discriminator_outputs = get_output(
        layers['l_discriminator_out'],
        inputs={
            layers['l_prior_in']: pz,
            layers['l_encoder_in']: X,
        },
        deterministic=False,
    )

    # label samples from q(z|x) as 1 and samples from p(z) as 0
    discriminator_targets = T.vertical_stack(
        T.ones((X_batch.shape[0], 1)),
        T.zeros((pz_batch.shape[0], 1))
    )

    discriminator_loss = T.mean(
        T.nnet.binary_crossentropy(
            discriminator_outputs,
            discriminator_targets,
        )
    )

    if apply_updates:
        # only layers that are part of the discriminator should be updated;
        # the discriminator=True keyword selects params carrying that tag
        discriminator_params = get_all_params(
            layers['l_discriminator_out'], trainable=True, discriminator=True)

        discriminator_updates = nesterov_momentum(
            discriminator_loss, discriminator_params, 0.1, 0.0)
    else:
        discriminator_updates = None

    discriminator_func = theano.function(
        inputs=[
            theano.In(X_batch),
            theano.In(pz_batch),
        ],
        outputs=discriminator_loss,
        updates=discriminator_updates,
        givens={
            X: X_batch,
            pz: pz_batch,
        },
    )

    return discriminator_func
Beispiel #15
0
    def trainModel(self, tr_X, va_X, tr_y, va_y, batchSize=64, maxIter=300, verbose=True,
                   start=10, period=2, threshold=10, earlyStopTol=2, totalStopTol=2):
        """Train with mini-batches and early stopping (Python 2 code).

        On each early-stop signal the learning rate is divided by 10 and
        the train function is recompiled; after more than totalStopTol
        such signals, training ends.
        """
        trainfn = self.trainfn
        validatefn = self.vatefn
        lr = self.lr

        earlyStop = myUtils.basic.earlyStopGen(start, period, threshold, earlyStopTol)
        earlyStop.next()  # prime the generator (advance to its first yield)
        totalStopCount = 0
        for epoch in xrange(maxIter):  # every epoch
            # In each epoch, we do a full pass over the training data:
            trAllPred = None
            trRandy = None
            trCostSum = 0.
            startTime = time.time()
            for batch in myUtils.basic.miniBatchGen(tr_X, tr_y, batchSize, shuffle=True):
                Xb, yb = batch
                trCost, trPred = trainfn(Xb, yb)
                trCostSum += trCost
                # accumulate predictions/labels so metrics use the whole epoch
                trAllPred = np.concatenate((trAllPred, trPred), axis=0) \
                    if trAllPred is not None else trPred
                trRandy = np.concatenate((trRandy, yb)) if trRandy is not None else yb
            # number of batches, rounding up for the final partial batch
            trIter = len(tr_X) // batchSize
            if len(tr_X) % batchSize != 0: trIter += 1
            trCostMean = trCostSum / trIter
            trAcc = myUtils.basic.accuracy(trAllPred, trRandy)
            trP, trR = myUtils.basic.precision_recall(trAllPred, trRandy)
            # And a full pass over the validation data:
            vaAllPred = None
            vaCostSum = 0.
            for batch in myUtils.basic.miniBatchGen(va_X, va_y, batchSize, shuffle=False):
                Xb, yb = batch
                vaCost, vaPred = validatefn(Xb, yb)
                vaCostSum += vaCost
                vaAllPred = np.concatenate((vaAllPred, vaPred), axis=0) \
                    if vaAllPred is not None else vaPred
            vaIter = len(va_X) // batchSize
            if len(va_X) % batchSize != 0: vaIter += 1
            vaCostMean = vaCostSum / vaIter
            vaAcc = myUtils.basic.accuracy(vaAllPred, va_y)
            vaP, vaR = myUtils.basic.precision_recall(vaAllPred, va_y)
            if verbose:
                print 'epoch ', epoch, ' time: %.3f' % (time.time() - startTime),
                print 'trcost: %.5f  tracc: %.5f  trp: %.5f  trr: %.5f' % (trCostMean, trAcc, trP, trR),
                print 'vacost: %.5f  vaacc: %.5f  vap: %.5f  var: %.5f' % (vaCostMean, vaAcc, vaP, vaR)
            # Then we decide whether to early stop:
            if earlyStop.send((trCostMean, vaCostMean)):
                lr /= 10  # on an early-stop signal, lower the learning rate and keep iterating
                # recompile the train function with the reduced learning rate
                updatesDict = updates.nesterov_momentum(self.trCost, self.params, lr, self.momentum)
                trainfn = myUtils.basic.makeFunc([self.X, self.y], [self.trCost, self.yDropProb], updatesDict)
                totalStopCount += 1
                if totalStopCount > totalStopTol:  # if early stopping keeps firing after lowering the LR, give up
                    print 'stop'
                    break
                if verbose: print 'learning rate decreases to ', lr
Beispiel #16
0
    def build_train_fn(self):
        """Compile the training function: MSE loss, Nesterov momentum."""
        # stochastic forward pass (dropout/noise layers active in training)
        network_output = get_output(self.network, deterministic=False)

        # mean squared error against the target variable
        mse = squared_error(network_output, self.target_var).mean()

        trainable = get_all_params(self.network, trainable=True)
        update_rules = nesterov_momentum(
            mse,
            trainable,
            learning_rate=self.learning_rate,
            momentum=self.momentum,
        )

        self.train_fn = theano.function(
            [self.input_var, self.target_var], mse, updates=update_rules)
def test_stochastic_layer_network():
    """Smoke-test a StochasticLayer inside a tiny dense network
    (Python 2 code): train on a constant input/target pair and plot
    the running mean loss.
    """
    learning_rate = 0.1
    momentum = 0.9
    num_epoch = 1000
    input = T.fmatrix('input')
    output = T.fmatrix('output')
    print 'FF-Layer: (Batch_size, n_features)'
    print 'Building stochastic layer model'
    l_in = L.InputLayer(shape=(1, 10), input_var=input)
    # softmax layer with zero-initialized weights feeding the stochastic layer
    l_2 = L.DenseLayer(l_in,
                       num_units=10,
                       nonlinearity=lasagne.nonlinearities.softmax,
                       W=lasagne.init.Constant(0.))
    print 'Input Layer shape:  ', L.get_output_shape(l_in)
    print 'Dense Layer shape: ', L.get_output_shape(l_2)
    # 'ST' = straight-through gradient estimator
    l_stochastic_layer = StochasticLayer(l_2, estimator='ST')
    print 'Stochastic Layer shape:  ', L.get_output_shape(l_stochastic_layer)
    l_out = L.DenseLayer(l_stochastic_layer,
                         num_units=10,
                         b=lasagne.init.Constant(0.))
    print 'Final Dense Layer shape: ', L.get_output_shape(l_out)
    network_output = L.get_output(l_out)
    print 'Building loss function...'
    # mean squared error trained with Nesterov momentum
    loss = lasagne.objectives.squared_error(network_output, output)
    loss = loss.mean()
    params = L.get_all_params(l_out, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate, momentum)
    train = theano.function([input, output],
                            loss,
                            updates=updates,
                            allow_input_downcast=True)
    output_fn = theano.function([input],
                                network_output,
                                allow_input_downcast=True)

    # constant all-ones fixture: the loss should shrink as training runs
    test_X = np.ones((1, 10))
    test_Y = np.ones((1, 10))
    losses = []
    mean_losses = []
    for epoch in range(num_epoch):
        print 'Epoch number: ', epoch
        losses.append(train(test_X, test_Y))
        print 'epoch {} mean loss {}'.format(epoch, np.mean(losses))
        print 'Current Output: ', output_fn(test_X)
        mean_losses.append(np.mean(losses))

    plt.title("Mean loss")
    plt.xlabel("Training examples")
    plt.ylabel("Loss")
    plt.plot(mean_losses, label="train")
    plt.grid()
    plt.legend()
    plt.draw()
Beispiel #18
0
def contrastive_loss_iter(embedder, update_params={}):
    """Compile train/valid iterators for a siamese embedder trained with
    an L2-regularized contrastive loss (margin 1).

    embedder      -- list of Lasagne layers; the last is the final embedding
    update_params -- dict with 'l_r' and 'momentum' for Nesterov momentum
                     (NOTE(review): mutable default argument — harmless
                     here since it is only read, but worth fixing)

    Returns {'train': fn, 'valid': fn, 'gradnames': [...]}.
    """
    # assume it's the 1d version
    X_pairs = {
            'te1':T.tensor3(),
            'te2':T.tensor3(),
            }
    y = T.ivector() # basically class labels

    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made)
    # the last prediction layer is a) the end of the network and b) what we ultimately care about
    # however the other prediction layers will be incorporated into the training loss
    predicted_embeds_train = {k:ll.get_output(embedder, X)[-1] for k, X in X_pairs.items()}
    predicted_embeds_valid = {k:ll.get_output(final_emb_layer, X, deterministic=True) for k, X in X_pairs.items()}

    margin = 1

    # if distance is 0 that's bad
    # (the 1e-7 keeps the norm's gradient finite at zero distance)
    distance = lambda pred: (pred['te1'] - pred['te2'] + 1e-7).norm(2, axis=1)
    # classic contrastive loss: pull matches (y=1) together, push
    # non-matches (y=0) at least `margin` apart
    contrastive_loss = lambda pred: T.mean(y*(distance(pred)) + (1 - y)*(margin - distance(pred)).clip(0,np.inf))
    failed_matches = lambda pred: T.switch(T.eq(T.sum(y),0), 0, T.sum((y*distance(pred)) > margin) / T.sum(y))
    # NOTE(review): `1-y*distance(pred)` parses as `1 - (y*distance(pred))`;
    # by symmetry with failed_matches the intent was probably
    # `((1-y)*distance(pred)) < margin` — confirm before relying on this metric
    failed_nonmatches = lambda pred: T.switch(T.eq(T.sum(1-y),0), 0, T.sum((1-y*distance(pred)) < margin) / T.sum(1-y))
    failed_pairs = lambda pred: 0.5*failed_matches(pred) + 0.5*failed_nonmatches(pred)

    # L2 weight decay over the whole network
    decay = 0.001
    reg = regularize_network_params(final_emb_layer, l2) * decay
    losses_reg = lambda pred: contrastive_loss(pred) + reg
    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'CL' # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder]))
    all_params = ll.get_all_params(embedder, trainable=True) # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    print("Compiling network for training")
    tic = time.time()
    # the train iterator also returns every gradient for monitoring
    train_iter = theano.function([X_pairs['te1'], X_pairs['te2'], y], [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X_pairs['te1'], X_pairs['te2'], y], [
                                    contrastive_loss(predicted_embeds_valid),
                                    losses_reg(predicted_embeds_valid),
                                    failed_pairs(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
Beispiel #19
0
def create_generator2_func(layers, apply_updates=False):
    """Compile the second generator's (encoder's) adversarial training
    function: the generator tries to make the discriminator output 0 on
    samples drawn from q(z|x).

    layers        -- dict of Lasagne layers; must contain
                     'l_encoder_out2', 'l_discriminator_out2' and
                     'l_prior_encoder_concat2'
    apply_updates -- when True the compiled function also trains the
                     generator (lr=0.1, no momentum)
    """
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    # no need to pass an input to l_prior_in here
    generator_outputs = get_output(layers['l_encoder_out2'],
                                   X,
                                   deterministic=False)

    # so pass the output of the generator as the output of the concat layer
    discriminator_outputs = get_output(layers['l_discriminator_out2'],
                                       inputs={
                                           layers['l_prior_encoder_concat2']:
                                           generator_outputs,
                                       },
                                       deterministic=False)

    # the discriminator learns to predict 1 for q(z|x),
    # so the generator should fool it into predicting 0
    # NOTE(review): zeros_like of the scalar X_batch.shape[0] yields a 0-d
    # zero that broadcasts against the outputs; T.zeros((X_batch.shape[0], 1))
    # may have been intended — confirm
    generator_targets = T.zeros_like(X_batch.shape[0])

    # so the generator needs to push the discriminator's output to 0
    generator_loss = T.mean(
        T.nnet.binary_crossentropy(
            discriminator_outputs,
            generator_targets,
        ))

    if apply_updates:
        # only layers that are part of the generator (i.e., encoder)
        # should be updated; generator2=True selects params with that tag
        generator_params = get_all_params(layers['l_discriminator_out2'],
                                          trainable=True,
                                          generator2=True)

        generator_updates = nesterov_momentum(generator_loss, generator_params,
                                              0.1, 0.0)
    else:
        generator_updates = None

    generator_func = theano.function(
        inputs=[
            theano.In(X_batch),
        ],
        outputs=generator_loss,
        updates=generator_updates,
        givens={
            X: X_batch,
        },
    )

    return generator_func
def create_generator_func(layers, apply_updates=False):
    """Compile the generator's (encoder's) adversarial training function:
    the generator tries to make the discriminator output 0 on samples
    drawn from q(z|x).

    layers        -- dict of Lasagne layers; must contain 'l_encoder_out',
                     'l_discriminator_out' and 'l_prior_encoder_concat'
    apply_updates -- when True the compiled function also trains the
                     generator (lr=0.1, no momentum)
    """
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    # l_prior_in needs no input here: the generator's output is substituted
    # directly for the concat layer's output below
    generator_outputs = get_output(layers['l_encoder_out'],
                                   X,
                                   deterministic=False)

    discriminator_outputs = get_output(
        layers['l_discriminator_out'],
        inputs={layers['l_prior_encoder_concat']: generator_outputs},
        deterministic=False,
    )

    # the discriminator is trained to output 1 for q(z|x); the generator
    # wins by driving that output toward 0
    generator_targets = T.zeros_like(X_batch.shape[0])

    generator_loss = T.mean(T.nnet.binary_crossentropy(
        discriminator_outputs,
        generator_targets,
    ))

    generator_updates = None
    if apply_updates:
        # update only the generator (encoder) parameters, selected via
        # the generator=True tag
        generator_params = get_all_params(layers['l_discriminator_out'],
                                          trainable=True,
                                          generator=True)
        generator_updates = nesterov_momentum(generator_loss,
                                              generator_params,
                                              0.1, 0.0)

    return theano.function(
        inputs=[theano.In(X_batch)],
        outputs=generator_loss,
        updates=generator_updates,
        givens={X: X_batch},
    )
Beispiel #21
0
def define_updates(output_layer, X, Y):
    """Build training/validation Theano functions for ``output_layer``.

    Returns ``(train_fn, valid_fn, l_r)`` where ``l_r`` is the shared
    learning-rate variable so the LR schedule can adjust it between
    epochs without recompiling.
    """
    # Stochastic (train-time) and deterministic (test-time) outputs.
    out_tr = lasagne.layers.get_output(output_layer)
    out_te = lasagne.layers.get_output(output_layer, deterministic=True)

    # Categorical cross-entropy expects integer labels in Y (not
    # one-hot); the softmax output is clipped to avoid log(0).
    ce_tr = lasagne.objectives.categorical_crossentropy(
        T.clip(out_tr, 0.000001, 0.999999), Y).mean()

    train_acc = T.mean(T.eq(T.argmax(out_tr, axis=1), Y),
                       dtype=theano.config.floatX)

    # L2 weight decay over every layer (used with ResNet-style nets).
    net_layers = lasagne.layers.get_all_layers(output_layer)
    l2_penalty = lasagne.regularization.regularize_layer_params(
        net_layers, lasagne.regularization.l2) * P.L2_LAMBDA
    train_loss = ce_tr + l2_penalty

    # Validation-side loss/accuracy mirror the training expressions.
    ce_te = lasagne.objectives.categorical_crossentropy(
        T.clip(out_te, 0.000001, 0.999999), Y).mean()
    valid_loss = ce_te + l2_penalty
    valid_acc = T.mean(T.eq(T.argmax(out_te, axis=1), Y),
                       dtype=theano.config.floatX)

    # Shared learning rate; SGD with Nesterov momentum.
    l_r = theano.shared(np.array(LR_SCHEDULE[0], dtype=theano.config.floatX))
    params = lasagne.layers.get_all_params(output_layer, trainable=True)
    updates = nesterov_momentum(train_loss, params, learning_rate=l_r,
                                momentum=P.MOMENTUM)

    pred_tr = T.argmax(out_tr, axis=1)
    pred_te = T.argmax(out_te, axis=1)

    # Compiled functions also expose the positive-class probability
    # (column 1 of the softmax output).
    train_fn = theano.function(
        inputs=[X, Y],
        outputs=[train_loss, l2_penalty, train_acc, pred_tr, out_tr[:, 1]],
        updates=updates)
    valid_fn = theano.function(
        inputs=[X, Y],
        outputs=[valid_loss, l2_penalty, valid_acc, pred_te, out_te[:, 1]])

    return train_fn, valid_fn, l_r
Beispiel #22
0
    def trainModel(self, tr_X, va_X, tr_y, va_y, batchSize=128, maxIter=100, verbose=True,
                   start=5, period=2, threshold=10, earlyStopTol=2, totalStopTol=2):
        """Mini-batch training loop with early stopping and LR decay.

        Python 2 code (``xrange``, ``print`` statements, generator
        ``.next()``).  Trains via ``self.trainfn``; when the early-stop
        coroutine signals, the learning rate is divided by 10 and the
        training function is rebuilt, until ``totalStopTol`` consecutive
        early stops end training.
        """
        trainfn = self.trainfn
        validatefn = self.vatefn
        lr = self.lr

        # Early-stop coroutine: fed (train_cost, valid_cost) each epoch.
        earlyStop = myUtils.basic.earlyStopGen(start, period, threshold, earlyStopTol)
        earlyStop.next()  # prime the generator
        totalStopCount = 0
        for epoch in xrange(maxIter):  # every epoch
            # In each epoch, we do a full pass over the training data:
            trEqCount = 0
            trCostSum = 0.
            startTime = time.time()
            for batch in myUtils.basic.miniBatchGen(tr_X, tr_y, batchSize, shuffle=True):
                Xb, yb = batch
                trCost, trEqs = trainfn(Xb, yb)
                trCostSum += trCost
                trEqCount += trEqs
            # Number of mini-batches (round up for a partial final batch).
            trIter = len(tr_X) // batchSize
            if len(tr_X) % batchSize != 0: trIter += 1
            trCostMean = trCostSum / trIter
            trAccuracy = float(trEqCount) / len(tr_X)
            # And a full pass over the validation data:
            vaEqCount = 0
            vaCostSum = 0.
            for batch in myUtils.basic.miniBatchGen(va_X, va_y, batchSize, shuffle=False):
                Xb, yb = batch
                vaCost, vaEqs = validatefn(Xb, yb)
                vaCostSum += vaCost
                vaEqCount += vaEqs
            vaIter = len(va_X) // batchSize
            if len(va_X) % batchSize != 0: vaIter += 1
            vaCostMean = vaCostSum / vaIter
            vaAccuracy = float(vaEqCount) / len(va_X)
            if verbose:
                print 'epoch ', epoch, ' time: %.3f' % (time.time() - startTime),
                print 'trcost: %.5f  tracc: %.5f' % (trCostMean, trAccuracy),
                print 'vacost: %.5f  vaacc: %.5f' % (vaCostMean, vaAccuracy)
            # Then we decide whether to early stop:
            if earlyStop.send((trCostMean, vaCostMean)):
                lr /= 10  # on an early stop, lower the learning rate and keep iterating
                updatesDict = updates.nesterov_momentum(self.trCost, self.params, lr, self.momentum)
                trainfn = myUtils.basic.makeFunc([self.X, self.y], [self.trCost, self.trEqs], updatesDict)
                totalStopCount += 1
                if totalStopCount > totalStopTol:  # if early stops persist after lowering the LR, quit training
                    print 'stop'
                    break
                if verbose: print 'learning rate decreases to ', lr
Beispiel #23
0
    def __init__(self, conf):
        """Build the two-input autoencoder and compile its Theano functions.

        Resolves ``conf.act`` (a string) to a Lasagne nonlinearity,
        constructs the network topology, and compiles training,
        reconstruction and encoding functions.  Optionally restores
        weights when ``conf.load_model`` is set.
        """
        self.conf = conf

        # Resolve the activation name to the actual nonlinearity.
        act_map = {
            "linear": linear,
            "sigmoid": sigmoid,
            "relu": rectify,
            "tanh": tanh,
        }
        if self.conf.act not in act_map:
            raise ValueError("Unknown activation function", self.conf.act)
        self.conf.act = act_map[self.conf.act]

        in_first = T.matrix('inputs1')
        in_second = T.matrix('inputs2')
        targets = T.matrix('targets')

        # Network topology: full autoencoder plus the two encoder halves.
        self.autoencoder, enc_first, enc_second = \
            self.__create_toplogy__(in_first, in_second)

        self.out = get_output(self.autoencoder)

        loss = squared_error(self.out, targets).mean()

        params = get_all_params(self.autoencoder, trainable=True)
        sgd_updates = nesterov_momentum(
            loss, params, learning_rate=self.conf.lr,
            momentum=self.conf.momentum)

        # Training step: returns the mean squared reconstruction error.
        self.train_fn = theano.function(
            [in_first, in_second, targets], loss, updates=sgd_updates)

        # Deterministic reconstruction (no train-time stochasticity).
        recon = get_output(self.autoencoder, deterministic=True)
        self.reconstruction_fn = theano.function([in_first, in_second], recon)

        # Deterministic encodings from both encoder branches.
        codes = get_output([enc_first, enc_second], deterministic=True)
        self.encoding_fn = theano.function([in_first, in_second], codes)

        # BLAS helpers for fast norm/scale on float vectors.
        blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
        self.blas_nrm2 = blas('nrm2', np.array([], dtype=float))
        self.blas_scal = blas('scal', np.array([], dtype=float))

        # Optionally restore previously saved weights.
        if self.conf.load_model is not None:
            self.load_model()
Beispiel #24
0
def get_updates(nnet, train_obj, trainable_params, solver=None):
    """Return Lasagne update rules for ``train_obj`` w.r.t. ``trainable_params``.

    Falls back to 'adam' when ``solver`` is not one of the implemented
    solvers; the resolved choice is recorded on ``nnet.sgd_solver``.
    All hyperparameters come from the module-level ``Cfg`` object.
    """
    implemented_solvers = ("sgd", "momentum", "nesterov", "adagrad", "rmsprop",
                           "adadelta", "adam", "adamax")

    # Record the resolved solver name on the network object.
    nnet.sgd_solver = solver if solver in implemented_solvers else "adam"

    # Dispatch table: solver name -> update-rule factory.  Lambdas keep
    # the Cfg lookups lazy, matching the original branch-per-solver code.
    builders = {
        "sgd": lambda: l_updates.sgd(
            train_obj, trainable_params, learning_rate=Cfg.learning_rate),
        "momentum": lambda: l_updates.momentum(
            train_obj, trainable_params, learning_rate=Cfg.learning_rate,
            momentum=Cfg.momentum),
        "nesterov": lambda: l_updates.nesterov_momentum(
            train_obj, trainable_params, learning_rate=Cfg.learning_rate,
            momentum=Cfg.momentum),
        "adagrad": lambda: l_updates.adagrad(
            train_obj, trainable_params, learning_rate=Cfg.learning_rate),
        "rmsprop": lambda: l_updates.rmsprop(
            train_obj, trainable_params, learning_rate=Cfg.learning_rate,
            rho=Cfg.rho),
        "adadelta": lambda: l_updates.adadelta(
            train_obj, trainable_params, learning_rate=Cfg.learning_rate,
            rho=Cfg.rho),
        "adam": lambda: l_updates.adam(
            train_obj, trainable_params, learning_rate=Cfg.learning_rate),
        "adamax": lambda: l_updates.adamax(
            train_obj, trainable_params, learning_rate=Cfg.learning_rate),
    }

    return builders[nnet.sgd_solver]()
def net_updates(net, loss, lr):
    """Return parameter update rules for ``net`` minimizing ``loss``.

    The optimizer is selected by ``cfg.OPTIMIZER`` ('adam', 'nesterov'
    or 'sgd'); ``lr`` is passed through as the learning rate.

    Raises:
        ValueError: if ``cfg.OPTIMIZER`` names an unsupported optimizer.
            (Previously an unknown value fell through every branch and
            the final ``return`` raised a confusing NameError.)
    """
    # Get all trainable parameters (weights) of our net
    params = l.get_all_params(net, trainable=True)

    # We use the adam update by default, other options are available
    if cfg.OPTIMIZER == 'adam':
        param_updates = updates.adam(loss, params, learning_rate=lr, beta1=0.9)
    elif cfg.OPTIMIZER == 'nesterov':
        param_updates = updates.nesterov_momentum(loss,
                                                  params,
                                                  learning_rate=lr,
                                                  momentum=0.9)
    elif cfg.OPTIMIZER == 'sgd':
        param_updates = updates.sgd(loss, params, learning_rate=lr)
    else:
        # Fail fast with a clear message instead of a NameError below.
        raise ValueError("Unsupported optimizer: %r" % (cfg.OPTIMIZER,))

    return param_updates
 def __build_loss_train__fn__(self):
     """Build and compile the training, prediction and validation functions.

     Loss is mean categorical cross-entropy plus a small (1e-4) L2
     penalty over all network parameters.  ``self.eta`` is a shared
     learning-rate variable so it can be changed without recompiling.
     NOTE(review): this file uses a non-standard 1-space class-body
     indentation; preserved as-is.
     """
     # create loss function: mean cross-entropy + 1e-4 * L2 penalty
     prediction = layers.get_output(self.net)
     loss = objectives.categorical_crossentropy(prediction, self.__target_var__)
     loss = loss.mean() + 1e-4 * regularization.regularize_network_params(self.net, regularization.l2)

     # top-1 accuracy against the integer targets
     val_acc = T.mean(T.eq(T.argmax(prediction, axis=1), self.__target_var__),dtype=theano.config.floatX)

     # create parameter update expressions (Nesterov momentum, shared LR)
     params = layers.get_all_params(self.net, trainable=True)
     self.eta = theano.shared(sp.array(sp.float32(0.05), dtype=sp.float32))
     update_rule = updates.nesterov_momentum(loss, params, learning_rate=self.eta,
                                                 momentum=0.9)

     # compile training function that updates parameters and returns training loss
     self.__train_fn__ = theano.function([self.__input_var__,self.__target_var__], loss, updates=update_rule)
     self.__predict_fn__ = theano.function([self.__input_var__], layers.get_output(self.net,deterministic=True))
     self.__val_fn__ = theano.function([self.__input_var__,self.__target_var__], [loss,val_acc])
Beispiel #27
0
def similarity_iter(output_layer, match_layer, update_params, match_layer_w=0):
    """Build train/valid iteration functions for the siamese matcher.

    The loss is a weighted sum of a contrastive loss on descriptor-pair
    distances and a binary-crossentropy matching loss, with weight
    ``match_layer_w`` on the matching term, plus (currently zero-decay)
    L2 regularization.  Returns a dict with 'train', 'valid' and
    'gradnames'.  NOTE(review): Python 2 code — the ``filter(...)[0]``
    calls below would fail on Python 3, where ``filter`` returns an
    iterator.
    """
    X1 = T.tensor4()
    X2 = T.tensor4()
    y = T.ivector()

    # find the input layers
    # TODO this better
    all_layers = ll.get_all_layers(match_layer)
    # make image of all layers
    imwrite_architecture(all_layers, './layer_rep.png')

    input_1 = filter(lambda x: x.name == 'input1', all_layers)[0]
    input_2 = filter(lambda x: x.name == 'input2', all_layers)[0]

    descriptors_train, match_prob_train = ll.get_output([output_layer, match_layer], {input_1: X1, input_2: X2})
    descriptors_eval, match_prob_eval = ll.get_output([output_layer, match_layer], {input_1: X1, input_2: X2}, deterministic=True)
    #descriptor_shape = ll.get_output_shape(output_layer, {input_1: X1, input_2: X2})
    #print("Network output shape: %r" % (descriptor_shape,))
    # distance minimization: L2 distance between the pair of descriptors
    # (1e-7 added for gradient stability of the norm at zero)
    distance = lambda x: (x[:,0,:] - x[:,1,:] + 1e-7).norm(2, axis=1)
    #distance_eval = (descriptors_eval[:,0,:] - descriptors_eval[:,1,:] + 1e-7).norm(2, axis=1)
    # 9/21 squaring the loss seems to prevent it from getting to 0.5 really quickly (i.e. w/in 3 epochs)
    # let's see if it will learn something good
    margin = 1
    decay = 0
    reg = regularize_network_params(match_layer, l2) * decay
    loss = lambda x, z: ((1-match_layer_w)*T.mean(y*(distance(x)) + (1 - y)*(T.maximum(0, margin - distance(x))))/2 # constrastive loss
            + match_layer_w*T.mean(binary_crossentropy(z.T + 1e-7,y))) # matching loss
    loss_reg = lambda x, z: (loss(x,z) + reg)
    # this loss doesn't work since it just pushes all the descriptors near each other and then predicts 0 all the time for tha matching
    #jason_loss = lambda x, z: T.mean(distance(x)*y + (1-y)*binary_crossentropy(z.T + 1e-7,y))
    #loss_eval = T.mean(y*(distance_eval**2) + (1 - y)*(T.maximum(0, 1 - distance_eval)**2))
    all_params = ll.get_all_params(match_layer) # unsure how I would do this if there were truly two trainable branches...
    loss_train = loss_reg(descriptors_train, match_prob_train)
    loss_train.name = 'combined_loss' # for the names
    grads = T.grad(loss_train, all_params, add_names=True)
    #updates = adam(grads, all_params, **update_params)
    updates = nesterov_momentum(grads, all_params, **update_params)

    # train returns the regularized loss, the raw loss, and every gradient
    train_iter = theano.function([X1, X2, y], [loss_train, loss(descriptors_train, match_prob_train)] + grads, updates=updates)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    valid_iter = theano.function([X1, X2, y], loss(descriptors_eval, match_prob_eval))

    return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
Beispiel #28
0
 def get_train_fn(self, last_only=False):
     """Compile a training function returning (loss, error-rate).

     When ``last_only`` is True only the output layer's own parameters
     are trained (fine-tuning the head); otherwise all trainable
     parameters of the network are updated.
     """
     input_var = self.net['input'].input_var
     target_var = T.ivector('targets')
     prediction = lasagne.layers.get_output(self.output_layer)
     loss = categorical_crossentropy(prediction, target_var)
     loss = loss.mean()
     # misclassification rate on the batch
     error = T.mean(T.neq(T.argmax(prediction, axis=1), target_var),
                    dtype=theano.config.floatX)
     regularization = self.regularizer_amount * regularize_network_params(
         self.output_layer, l2)
     if last_only:
         all_params = self.output_layer.get_params(trainable=True)
     else:
         all_params = lasagne.layers.get_all_params(self.output_layer,
                                                    trainable=True)
     # NOTE(review): no explicit momentum is passed, so the library
     # default applies — confirm this is intended.
     updates = nesterov_momentum(loss + regularization,
                                 all_params,
                                 learning_rate=self.lr)
     return theano.function([input_var, target_var], (loss, error),
                            updates=updates)
def create_train_func(layers):
    """Compile a training step for the affine-transform regressor.

    The loss is the mean squared difference between the grids produced
    by the ground-truth and the predicted 2x3 affine transforms over a
    fixed 20x20 mesh.  The compiled function maps
    (Xa_batch, Xb_batch, Tg_batch) to (predicted transform, loss) while
    applying Nesterov-momentum updates.
    """
    Xa, Xb = T.tensor4('Xa'), T.tensor4('Xb')
    Xa_batch, Xb_batch = T.tensor4('Xa_batch'), T.tensor4('Xb_batch')

    # Predicted transform parameters from the two image inputs.
    Tp = get_output(
        layers['trans'],
        inputs={layers['inputa']: Xa, layers['inputb']: Xb},
        deterministic=False,
    )

    # Ground-truth transform placeholders.
    Tg = T.fmatrix('Tg')
    Tg_batch = T.fmatrix('Tg_batch')

    # Reshape both into batches of 2x3 affine matrices.
    theta_gt = Tg.reshape((-1, 2, 3))
    theta_pr = Tp.reshape((-1, 2, 3))

    # Apply each transform to the sampling grid and compare the results.
    Gg = T.dot(theta_gt, _meshgrid(20, 20))
    Gp = T.dot(theta_pr, _meshgrid(20, 20))
    train_loss = T.mean(T.sqr(Gg - Gp))

    params = get_all_params(layers['trans'], trainable=True)
    updates = nesterov_momentum(train_loss, params, 1e-3, 0.9)

    return theano.function(
        inputs=[theano.In(Xa_batch),
                theano.In(Xb_batch),
                theano.In(Tg_batch)],
        outputs=[Tp, train_loss],
        updates=updates,
        givens={
            Xa: Xa_batch,
            Xb: Xb_batch,  # Ia, Ib
            Tg: Tg_batch,  # transform Ia --> Ib
        })
Beispiel #30
0
def create_updates(loss, network, opt, learning_rate, momentum, beta1, beta2):
    """Create parameter-update rules for ``network`` minimizing ``loss``.

    Args:
        loss: scalar Theano expression to minimize.
        network: Lasagne layer (or list of layers) whose trainable
            parameters are updated.
        opt: optimizer name, 'adam' or 'momentum' (Nesterov momentum).
        learning_rate: learning rate for either optimizer.
        momentum: momentum coefficient (used by 'momentum' only).
        beta1, beta2: Adam moment-decay rates (used by 'adam' only).

    Returns:
        The update dictionary produced by the chosen Lasagne optimizer.

    Raises:
        ValueError: if ``opt`` is not a recognized optimizer name.
    """
    params = lasagne.layers.get_all_params(network, trainable=True)
    # Gradients are computed explicitly so a max-norm constraint could be
    # inserted here before the updates are built.
    grads = theano.grad(loss, params)
    if opt == 'adam':
        updates = adam(grads, params=params, learning_rate=learning_rate,
                       beta1=beta1, beta2=beta2)
    elif opt == 'momentum':
        updates = nesterov_momentum(grads, params=params,
                                    learning_rate=learning_rate,
                                    momentum=momentum)
    else:
        # Fixed typo in the message ('unkown' -> 'unknown').
        raise ValueError('unknown optimization algorithm: %s' % opt)

    return updates
Beispiel #31
0
    def build_train_func(self, lr=0.01, mntm=0.9):
        """Compile the training step.

        Returns a Theano function mapping (x_batch, y_batch) to
        (mean cross-entropy loss, top-1 accuracy) while applying
        Nesterov-momentum updates to every trainable parameter.
        """
        predictions = get_output(
            self.layers['out'], self.x, deterministic=False)

        # Mean cross-entropy and accuracy over the batch.
        loss = T.mean(T.nnet.categorical_crossentropy(predictions, self.y))
        accuracy = T.mean(T.eq(predictions.argmax(axis=1), self.y))

        trainable = get_all_params(self.layers['out'], trainable=True)
        update_rules = nesterov_momentum(loss, trainable, lr, mntm)

        return theano.function(
            inputs=[theano.In(self.x_batch), theano.In(self.y_batch)],
            outputs=[loss, accuracy],
            updates=update_rules,
            givens={
                self.x: self.x_batch,
                self.y: self.y_batch,
            },
        )
Beispiel #32
0
def define_updates(output_layer, X, Y):
    """Compile train/validation functions for ``output_layer``.

    Returns ``(train_fn, valid_fn, l_r)``; ``l_r`` is a shared variable
    so the LR schedule can change the learning rate on the fly.
    """
    out_train = lasagne.layers.get_output(output_layer)
    out_test = lasagne.layers.get_output(output_layer, deterministic=True)

    # Cross-entropy on clipped probabilities (Y holds integer labels,
    # not one-hot vectors).
    loss = lasagne.objectives.categorical_crossentropy(
        T.clip(out_train, 0.000001, 0.999999), Y).mean()

    acc = T.mean(T.eq(T.argmax(out_train, axis=1), Y),
                 dtype=theano.config.floatX)

    # L2 penalty over every layer (ResNet-style regularization).
    l2_penalty = lasagne.regularization.regularize_layer_params(
        lasagne.layers.get_all_layers(output_layer),
        lasagne.regularization.l2) * P.L2_LAMBDA
    loss = loss + l2_penalty

    # Validation-set loss/accuracy using the deterministic output.
    test_loss = lasagne.objectives.categorical_crossentropy(
        T.clip(out_test, 0.000001, 0.999999), Y).mean() + l2_penalty
    test_acc = T.mean(T.eq(T.argmax(out_test, axis=1), Y),
                      dtype=theano.config.floatX)

    # Shared learning rate; SGD with Nesterov momentum.
    l_r = theano.shared(np.array(LR_SCHEDULE[0], dtype=theano.config.floatX))
    params = lasagne.layers.get_all_params(output_layer, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate=l_r,
                                momentum=P.MOMENTUM)

    prediction_binary = T.argmax(out_train, axis=1)
    test_prediction_binary = T.argmax(out_test, axis=1)

    # Both functions also expose the positive-class probability
    # (column 1 of the softmax output).
    train_fn = theano.function(
        inputs=[X, Y],
        outputs=[loss, l2_penalty, acc, prediction_binary,
                 out_train[:, 1]],
        updates=updates)
    valid_fn = theano.function(
        inputs=[X, Y],
        outputs=[test_loss, l2_penalty, test_acc, test_prediction_binary,
                 out_test[:, 1]])

    return train_fn, valid_fn, l_r
Beispiel #33
0
 def __init__(self, lr, C, momentum):
     """Build a classifier with L2-regularized cross-entropy training.

     Args:
         lr: learning rate for Nesterov momentum.
         C: weight of the parameter-count-normalized L2 penalty.
         momentum: momentum coefficient.
     """
     self.lr = lr
     self.C = C
     self.momentum = momentum
     self.X = T.tensor4('X')
     self.y = T.ivector('y')
     self.network = build_model(self.X)
     self.prob = self.network['prob']
     # deterministic features taken from the final pooling layer
     feat = self.network['pool5/7x7_s1']
     featout = layers.get_output(feat, deterministic=True)
     self.params = layers.get_all_params(self.prob, trainable=True)
     # L2 penalty, normalized by the total number of parameters
     reg = regularization.regularize_network_params(self.prob,
                                                    regularization.l2)
     reg /= layers.helper.count_params(self.prob)
     # training set: stochastic (dropout-active) output
     self.yDropProb = layers.get_output(self.prob)
     trCrossentropy = objectives.categorical_crossentropy(
         self.yDropProb, self.y)
     self.trCost = trCrossentropy.mean() + C * reg
     # validation/test sets: deterministic output
     self.yFullProb = layers.get_output(self.prob, deterministic=True)
     vateCrossentropy = objectives.categorical_crossentropy(
         self.yFullProb, self.y)
     self.vateCost = vateCrossentropy.mean() + C * reg
     # training function: takes a training batch, returns cost and probabilities
     updatesDict = updates.nesterov_momentum(self.trCost, self.params, lr,
                                             momentum)
     self.trainfn = myUtils.basic.makeFunc([self.X, self.y],
                                           [self.trCost, self.yDropProb],
                                           updatesDict)
     # validation/test function: returns cost and probabilities, no updates
     self.vatefn = myUtils.basic.makeFunc([self.X, self.y],
                                          [self.vateCost, self.yFullProb],
                                          None)
     # feature-extraction function
     self.featfn = myUtils.basic.makeFunc([self.X], [featout], None)
Beispiel #34
0
def train(files, batch_size, emb_dim_size, save_dir, load_dir, skip_window,
          num_skips):
    """Train a skip-gram word2vec model and report nearest neighbours.

    Python 2 code (``print`` statements, ``dict.iteritems``).  Trains
    with SGD + Nesterov momentum for a fixed number of epochs,
    optionally checkpointing to ``save_dir``, then prints the words
    closest to 'king' and 'queen' by embedding distance and plots the
    mean loss per epoch.
    """
    learning_rate = 0.1
    momentum = 0.9
    num_epochs = 1000

    dataset = Dataset(files,
                      load_dir=load_dir,
                      skip_window=skip_window,
                      num_skips=num_skips)
    data_stream = dataset.data_stream
    if save_dir:
        dataset.save_dictionary(save_dir)

    dictionary = dataset.dictionary
    reverse_dictionary = dict((v, k) for k, v in dictionary.iteritems())
    print 'Dictionary size: ', len(dictionary)

    query_input = T.ivector('query')
    context_target = T.ivector('context')
    word2vec = Word2VecDiscrete(batch_size=batch_size,
                                context_vocab_size=dataset.vocab_size,
                                query_vocab_size=dataset.vocab_size,
                                emb_dim_size=emb_dim_size)
    word2vec.build_model(query_input)

    # standard skip-gram objective: cross-entropy over context vocabulary
    prediction = word2vec.get_output()
    loss = lasagne.objectives.categorical_crossentropy(prediction,
                                                       context_target)
    loss = loss.mean()
    params = word2vec.get_all_params()
    updates = nesterov_momentum(loss, params, learning_rate, momentum)

    train = theano.function([query_input, context_target],
                            loss,
                            updates=updates)

    losses = []
    mean_losses = []
    start = time.time()
    for epoch in range(num_epochs):

        mean_loss = 0
        for i, batch in enumerate(data_stream.get_batches(batch_size)):
            queries, contexts = batch
            losses.append(train(queries, contexts))

            # periodic checkpoint every 10000 batches
            if save_dir and i % 10000 == 0:
                word2vec.save(save_dir)

            # NOTE(review): mean over *all* losses so far (across epochs),
            # not just this epoch's batches — confirm this running mean
            # is intended
            mean_loss = np.mean(losses)

        mean_losses.append(mean_loss)

        if epoch % 1 == 0:
            print('epoch {} mean loss {}'.format(epoch, mean_loss))
        #print 'Embedding for king is: ', word2vec.embed([dictionary['king']])

    if save_dir:
        word2vec.save(save_dir)
    end = time.time()
    print("Time: ", end - start)

    # rank every word by Euclidean distance to the 'king' embedding
    print 'Top similar words: '
    results = [
        (word,
         spatial.distance.euclidean(word2vec.embed([dictionary['king']]),
                                    word2vec.embed([dictionary[word]])))
        for (word, _) in dictionary.iteritems()
    ]
    results.sort(key=operator.itemgetter(1))
    out = [r[0] for r in results]
    print 'closest to {} : {}'.format('king', out)

    # same ranking relative to 'queen'
    print 'Top similar words: '
    results = [
        (word,
         spatial.distance.euclidean(word2vec.embed([dictionary['queen']]),
                                    word2vec.embed([dictionary[word]])))
        for (word, _) in dictionary.iteritems()
    ]
    results.sort(key=operator.itemgetter(1))
    out = [r[0] for r in results]
    print 'closest to {} : {}'.format('queen', out)

    plt.title("Mean loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.plot(mean_losses, label="train")
    plt.grid()
    plt.legend()
    plt.draw()
    plt.show()
def main():
    """Train a CNN on the CSV training set and plot loss/accuracy.

    Python 2 code (``print`` statement below).  Runs up to 45 epochs
    (interruptible with Ctrl-C), then pickles the weights and writes a
    training plot to ``data/training_plot.png``.
    """
    # load the training and validation data sets
    train_X, test_X, train_y, test_y = load_data_cv('data/train.csv')

    X = T.ftensor4()
    Y = T.fmatrix()

    # set up theano functions to generate output by feeding data through network
    output_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    # NOTE(review): get_all_params is called *without* trainable=True here,
    # unlike the other training setups in this file — confirm intended
    params = lasagne.layers.get_all_params(output_layer)
    updates = nesterov_momentum(loss_train, params, learning_rate=0.003, momentum=0.9)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=loss_train, updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=loss_valid, allow_input_downcast=True)
    predict_valid = theano.function(inputs=[X], outputs=pred_valid, allow_input_downcast=True)

    # loop over training functions for however many iterations, print information while training
    train_eval = []
    valid_eval = []
    valid_acc = []
    try:
        for i in range(45):
            train_loss = batch_iterator(train_X, train_y, BATCHSIZE, train)
            train_eval.append(train_loss)
            valid_loss = valid(test_X, test_y)
            valid_eval.append(valid_loss)
            acc = np.mean(np.argmax(test_y, axis=1) == predict_valid(test_X))
            valid_acc.append(acc)
            print 'iter:', i, '| Tloss:', train_loss, '| Vloss:', valid_loss, '| valid acc:', acc

    except KeyboardInterrupt:
        pass

    # save weights
    all_params = helper.get_all_param_values(output_layer)
    f = gzip.open('data/weights.pklz', 'wb')
    pickle.dump(all_params, f)
    f.close()

    # plot loss and accuracy
    train_eval = np.array(train_eval)
    valid_eval = np.array(valid_eval)
    valid_acc = np.array(valid_acc)
    sns.set_style("whitegrid")
    pyplot.plot(train_eval, linewidth = 3, label = 'train loss')
    pyplot.plot(valid_eval, linewidth = 3, label = 'valid loss')
    pyplot.legend(loc = 2)
    pyplot.twinx()
    pyplot.plot(valid_acc, linewidth = 3, label = 'valid accuracy', color = 'r')
    pyplot.grid()
    pyplot.ylim([.9,1])
    pyplot.legend(loc = 1)
    pyplot.savefig('data/training_plot.png')
def main():
    """Train a speech-classification CNN with periodic validation.

    Python 2 code.  Builds a train/validation split from the global
    ``speech.data_dict``, trains with SGD + Nesterov momentum, validates
    every 15 batches, and checkpoints whenever the validation accuracy
    improves on the global ``valid_best``.
    """
    # load the training and validation data sets
    # labels=int(0.7*speech.all_count)
    global valid_best
    data=speech.data_dict
    labels=data.keys()
    # assert (PIXELS,PIXELS)==speechSize
    train_X=np.zeros([num_speechs,1,speechSize[0],speechSize[1]])
    train_y=np.zeros([num_speechs,len(labels)])
    i=0
    # fill train_X/train_y with at most num_speechs one-hot-labelled examples
    for label in (data.keys()):
        for im in data[label]:
            train_X[i,0]=im
            train_y[i]=label_binarize([label],labels)[0]
            i+=1
            if i>=num_speechs:
                break
            if i%500==0:
                print 'idx of speechs:',i
        if i>=num_speechs:
            break
    # shuffle jointly, then split: first `size` examples train, rest validate
    zipp=zip(train_X,train_y)
    random.shuffle(zipp)
    xx=np.array([one[0] for one in zipp])
    yy=np.array([one[1] for one in zipp])
    del train_X,train_y
    train_X=xx[:size]
    train_y=yy[:size]
    valid_X=xx[size:]
    valid_y=yy[size:]
    del xx,yy
    print 'Shuffle finish. Begin to build model.'

    X = T.tensor4()
    Y = T.matrix()

    # set up theano functions to generate output by feeding data through network
    output_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    params = lasagne.layers.get_all_params(output_layer,trainable=True)
    updates = nesterov_momentum(loss_train, params, learning_rate=0.003, momentum=0.9)
    # updates =lasagne.updates.sgd(loss_train, params, learning_rate=0.01)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=[loss_train,pred], updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=[loss_valid,pred_valid], allow_input_downcast=True)
    # predict_valid = theano.function(inputs=[X], outputs=[pred_valid], allow_input_downcast=True)

    # loop over training functions for however many iterations, print information while training
    train_eval = []
    valid_eval = []
    valid_acc = []

    for i in range(450):
        batch_total_number = len(train_X) / BATCHSIZE
        for idx_batch in range (batch_total_number):
            xx_batch = np.float32(train_X[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE])
            yy_batch = np.float32(train_y[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE])

            train_loss ,pred = train(xx_batch,yy_batch)
            count=np.count_nonzero(np.int32(pred ==np.argmax(yy_batch,axis=1)))
            print i,idx_batch,'| Tloss:', train_loss,'| Count:',count,'| Acc:',float(count)/(BATCHSIZE)
            print pred
            print np.argmax(yy_batch,axis=1)
            print "time:",time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

            # validate every 15 batches; checkpoint on a new best accuracy
            if 1 and idx_batch%15==0:
                acc=0
                valid_batch_number=len(valid_X)/BATCHSIZE
                for j in tqdm(range(valid_batch_number)):
                    x_batch = np.float32(valid_X[j* BATCHSIZE:(j+ 1) * BATCHSIZE])
                    y_batch = np.float32(valid_y[j* BATCHSIZE:(j+ 1) * BATCHSIZE])
                    print len(x_batch),len(y_batch)
                    valid_loss,pred = valid(x_batch,y_batch)
                    # pred = predict_valid(x_batch)[0]
                    acc += np.count_nonzero(np.int32(pred ==np.argmax(y_batch,axis=1)))
                acc=float(acc)/(valid_batch_number*BATCHSIZE)
                print 'iter:', i,idx_batch, '| Vloss:',valid_loss,'|Acc:',acc
                if acc>valid_best:
                    print 'new valid_best:',valid_best,'-->',acc
                    valid_best=acc
                    all_params = helper.get_all_param_values(output_layer)
                    f = gzip.open('speech_params/validbest_cnn_allbatchnorm_{}_{}_{}.pklz'.format(i,valid_best,time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
                    pickle.dump(all_params, f)
                    f.close()

        # save weights
        # NOTE(review): 'if i%5' is truthy for every epoch NOT divisible by 5
        # — probably meant 'if i%5==0'; also 'acc' may be unbound here if the
        # validation branch never ran this epoch — confirm and fix upstream
        if i%5:
            all_params = helper.get_all_param_values(output_layer)
            f = gzip.open('speech_params/validbest_cnn_allbatchnorm_{}_{}_{}.pklz'.format(i,acc,time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
            pickle.dump(all_params, f)
            f.close()
def event_span_classifier(args, input_var, input_mask_var, target_var, wordEmbeddings, seqlen):
    """Build a bidirectional-LSTM event span classifier.

    The forward LSTM's last time step and the backward LSTM's first time
    step are concatenated, passed through a sigmoid hidden layer and a
    2-way softmax.

    Args:
        args: namespace with hiddenDim, optimizer and step (learning rate);
            args.lstmDim is overwritten to 150 here.
        input_var: int matrix of word indices, shape (batch, seqlen).
        input_mask_var: mask matrix marking valid (non-padding) positions.
        target_var: targets for binary cross-entropy.
        wordEmbeddings: embedding matrix of shape (wordDim, vocab_size);
            transposed before being handed to the EmbeddingLayer.
        seqlen: sequence length used for the input layer shape.

    Returns:
        (train_fn, val_fn, network): compiled train/validation functions
        and the output layer.

    Raises:
        ValueError: if args.optimizer is not a recognised optimizer name.
    """

    print("Building model with LSTM")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    # gradient-clipping threshold tied to the embedding dimensionality
    GRAD_CLIP = wordDim

    args.lstmDim = 150

    input = InputLayer((None, seqlen), input_var=input_var)
    batchsize, seqlen = input.input_var.shape
    input_mask = InputLayer((None, seqlen), input_var=input_mask_var)

    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb_1.W].remove('trainable')

    # forward and backward LSTMs over the embedded, masked sequence
    lstm = LSTMLayer(emb, num_units=args.lstmDim, mask_input=input_mask, grad_clipping=GRAD_CLIP,
        nonlinearity=tanh)

    lstm_back = LSTMLayer(
        emb, num_units=args.lstmDim, mask_input=input_mask, grad_clipping=GRAD_CLIP,
        nonlinearity=tanh, backwards=True)

    # final state of the forward pass, first state of the backward pass
    slice_forward = SliceLayer(lstm, indices=-1, axis=1) # out_shape (None, args.lstmDim)
    slice_backward = SliceLayer(lstm_back, indices=0, axis=1) # out_shape (None, args.lstmDim)

    concat = ConcatLayer([slice_forward, slice_backward])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    # weighted L2 regularization over embedding, LSTM, hidden and output layers
    lambda_val = 0.5 * 1e-4

    layers = {emb: lambda_val, lstm: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty


    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # BUG FIX: raising a plain string is illegal (TypeError since
        # Python 2.6); raise a proper exception instead
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))


    train_fn = theano.function([input_var, input_mask_var, target_var],
        loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))

    val_fn = theano.function([input_var, input_mask_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
def main():
    # load model and parameters
    output_layer = lasagne_model()
    f = gzip.open('data/weights.pklz', 'rb')
    all_params = pickle.load(f)
    f.close()

    X = T.ftensor4()
    Y = T.fmatrix()

    # set up theano functions to generate output by feeding data through network
    output_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer,
                                             X,
                                             deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    helper.set_all_param_values(output_layer, all_params)
    params = lasagne.layers.get_all_params(output_layer)
    updates = nesterov_momentum(loss_train,
                                params,
                                learning_rate=0.0001,
                                momentum=0.9)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y],
                            outputs=loss_train,
                            updates=updates,
                            allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y],
                            outputs=loss_valid,
                            allow_input_downcast=True)
    predict_valid = theano.function(inputs=[X],
                                    outputs=pred_valid,
                                    allow_input_downcast=True)

    # fine tune network
    train_X, test_X, train_y, test_y = load_data_cv('data/train.csv')
    train_eval = []
    valid_eval = []
    valid_acc = []
    try:
        for i in range(5):
            train_loss = batch_iterator_no_aug(train_X, train_y, BATCHSIZE,
                                               train)
            train_eval.append(train_loss)
            valid_loss = valid(test_X, test_y)
            valid_eval.append(valid_loss)
            acc = np.mean(np.argmax(test_y, axis=1) == predict_valid(test_X))
            valid_acc.append(acc)
            print 'iter:', i, '| Tloss:', train_loss, '| Vloss:', valid_loss, '| valid acc:', acc

    except KeyboardInterrupt:
        pass

    # after training create output for kaggle
    testing_inputs = load_test_data('data/test.csv')
    predictions = []
    for j in range((testing_inputs.shape[0] + BATCHSIZE - 1) // BATCHSIZE):
        sl = slice(j * BATCHSIZE, (j + 1) * BATCHSIZE)
        X_batch = testing_inputs[sl]
        predictions.extend(predict_valid(X_batch))
    out = pd.read_csv('data/convnet_preds.csv')
    out['Label'] = predictions
    out.to_csv('preds/convnet_preds.csv', index=False)
Beispiel #39
0
####################### UPDATES #########################
#we use dynamic learning rates which change after some epochs
#(the rate is a symbolic scalar fed to the train function on every call)
lr_dynamic = T.scalar(name='learning_rate')

#get all trainable parameters (weights) of our net
params = l.get_all_params(NET, trainable=True)

#we use the adam update
#NOTE(review): beta1=0.5 deviates from Adam's usual default -- verify this
#is deliberate for this model
if OPTIMIZER == 'adam':
    param_updates = updates.adam(loss,
                                 params,
                                 learning_rate=lr_dynamic,
                                 beta1=0.5)
elif OPTIMIZER == 'nesterov':
    param_updates = updates.nesterov_momentum(loss,
                                              params,
                                              learning_rate=lr_dynamic,
                                              momentum=0.9)
#NOTE(review): if OPTIMIZER is neither 'adam' nor 'nesterov', param_updates
#is never bound and the theano.function call below raises NameError

#################### TRAIN FUNCTION ######################
#the theano train functions takes images and class targets as input
#(plus the current learning rate); compilation can be slow, so it is timed
print "COMPILING THEANO TRAIN FUNCTION...",
start = time.time()
train_net = theano.function(
    [l.get_all_layers(NET)[0].input_var, targets, lr_dynamic],
    loss,
    updates=param_updates)
print "DONE! (", int(time.time() - start), "s )"

################# PREDICTION FUNCTION ####################
#we need the prediction function to calculate the validation accuracy
#this way we can test the net during/after training
def build_network_2dconv(args, input_var, target_var, wordEmbeddings, maxlen=60):
    """Build a 2D-convolutional sentence classifier (CNN_sentence style).

    Frozen word embeddings are reshaped into a (batch, 1, maxlen, wordDim)
    "image", convolved with 100 filters of height 3 spanning the full
    embedding width, max-pooled over the remaining time axis, and fed
    through a sigmoid hidden layer into a 2-way softmax.

    Args:
        args: namespace with hiddenDim, optimizer and step (learning rate).
        input_var: int matrix of word indices, shape (batch, maxlen).
        target_var: targets for binary cross-entropy.
        wordEmbeddings: embedding matrix of shape (wordDim, vocab_size);
            transposed before use and kept frozen.
        maxlen: maximum sentence length (default 60).

    Returns:
        (train_fn, val_fn): compiled training and validation functions.

    Raises:
        ValueError: if args.optimizer is not a recognised optimizer name.
    """

    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config: filters span the whole embedding dimension, and
    # pooling collapses the remaining time axis to one value per filter
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    input = InputLayer((None, maxlen), input_var=input_var)
    batchsize, seqlen = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb.params[emb.W].remove("trainable")  # keep pretrained embeddings frozen

    reshape = ReshapeLayer(emb, (batchsize, 1, maxlen, wordDim))

    conv2d = Conv2DLayer(
        reshape,
        num_filters=num_filters,
        filter_size=(filter_size),
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, maxlen - 3 + 1, 1)
    maxpool = MaxPool2DLayer(conv2d, pool_size=pool_size)  # (None, 100, 1, 1)

    forward = FlattenLayer(maxpool)  # (None, 100)

    hid = DenseLayer(forward, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    # weighted L2 regularization on conv, hidden and output layers
    lambda_val = 0.5 * 1e-4

    layers = {conv2d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # BUG FIX: raising a plain string is illegal (TypeError since
        # Python 2.6); raise a proper exception instead
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn
# deterministic (test-time) forward pass: dropout/noise layers disabled
output_test = lasagne.layers.get_output(output_layer, deterministic=True)

# set up the loss that we aim to minimize, when using cat cross entropy our Y should be ints not one-hot
loss = lasagne.objectives.categorical_crossentropy(output_train, Y)
loss = loss.mean()

# set up loss functions for validation dataset
valid_loss = lasagne.objectives.categorical_crossentropy(output_test, Y)
valid_loss = valid_loss.mean()

# fraction of argmax predictions matching the integer labels Y
valid_acc = T.mean(T.eq(T.argmax(output_test, axis=1), Y), dtype=theano.config.floatX)

# get parameters from network and set up sgd with nesterov momentum to update parameters
# the learning rate lives in a shared variable so the schedule can change it
# between epochs via l_r.set_value(...)
l_r = theano.shared(np.array(LR_SCHEDULE[0], dtype=theano.config.floatX))
params = lasagne.layers.get_all_params(output_layer, trainable=True)
# NOTE(review): no explicit momentum passed -- relies on nesterov_momentum's
# library default; confirm that is intended
updates = nesterov_momentum(loss, params, learning_rate=l_r)

# set up training and prediction functions
train_fn = theano.function(inputs=[X,Y], outputs=loss, updates=updates)
valid_fn = theano.function(inputs=[X,Y], outputs=[valid_loss, valid_acc])

# set up prediction function
predict_proba = theano.function(inputs=[X], outputs=output_test)

'''
load training data and start training
'''
encoder = LabelEncoder()

# load data (PIXELS selects the cached resolution)
X_train = np.load('data/cache/X_train_%d_f32_clean.npy'%PIXELS)
Beispiel #42
0
    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        """Compile the fine-tuning step and the validation/test scorers.

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: three (x, y) pairs -- train, valid, test -- of
                         Theano shared variables (labels are unused here)

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float (usually a shared variable so it can be updated)
        :param learning_rate: learning rate used during the finetune stage

        :return: (train_function, valid_score, test_score), where
                 train_function(index) performs one update on minibatch
                 `index` and returns the cost, while valid_score() and
                 test_score() return the reconstruction cost on every
                 minibatch of the respective set.
        """
        train_x, _train_y = datasets[0]
        valid_x, _valid_y = datasets[1]
        test_x, _test_y = datasets[2]

        # symbolic index selecting one minibatch
        batch_index = T.lscalar('index')

        def minibatch(data):
            # slice minibatch `batch_index` out of a shared dataset
            return data[batch_index * batch_size: (batch_index + 1) * batch_size]

        updates = nesterov_momentum(self.fine_tune_cost, self.params,
                                    learning_rate=learning_rate, momentum=0.9)

        train_function = theano.function(
                inputs=[batch_index],
                outputs=self.fine_tune_cost,
                updates=updates,
                givens={
                    self.x: minibatch(train_x)
                },
                name='train')

        def make_scorer(data, fn_name):
            # compile a per-minibatch cost function, then wrap it in a
            # callable that scans the entire dataset
            score_one = theano.function(
                    [batch_index],
                    outputs=self.test_cost,
                    givens={
                        self.x: minibatch(data),
                    },
                    name=fn_name)
            n_batches = data.get_value(borrow=True).shape[0] // batch_size
            return lambda: [score_one(i) for i in range(n_batches)]

        valid_score = make_scorer(valid_x, 'valid')
        test_score = make_scorer(test_x, 'test')

        return train_function, valid_score, test_score
Beispiel #43
0
def get_model():
    """Assemble the convnet and compile train/val/predict functions.

    Architecture: three conv-conv-pool-dropout stages (64, 96, 128 filters),
    then flatten -> dense(1024) -> dropout(0.5) -> softmax(600).  The
    1024-unit dense layer is L2-regularised; optimisation is SGD with
    Nesterov momentum (lr=0.01, momentum=0.9).

    Returns:
        [output_layer, train_fn, val_fn, predict_fn]
    """
    # symbolic inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.matrix('targets')

    def _stage(incoming, n_filters):
        """Two batch-normalised 3x3 convs, 2x2 max-pool, 25% dropout."""
        conv = batch_norm(Conv2DLayer(incoming, n_filters, (3, 3),
                                      pad='same', nonlinearity=leaky_rectify))
        conv = batch_norm(Conv2DLayer(conv, n_filters, (3, 3),
                                      pad='valid', nonlinearity=leaky_rectify))
        pooled = MaxPool2DLayer(conv, pool_size=(2, 2), stride=(2, 2),
                                pad=(1, 1))
        return DropoutLayer(pooled, p=0.25)

    # input layer with unspecified batch size, then the three conv stages
    net = InputLayer(shape=(None, 30, 64, 64), input_var=input_var)
    for n_filters in (64, 96, 128):
        net = _stage(net, n_filters)

    # classifier head
    flat = FlattenLayer(net)
    hidden = DenseLayer(flat, 1024, nonlinearity=leaky_rectify)
    dropped = DropoutLayer(hidden, p=0.5)
    output_layer = DenseLayer(dropped, 600, nonlinearity=softmax)

    # training loss: mean squared error plus L2 penalty on the dense layer
    prediction = get_output(output_layer)
    loss = squared_error(prediction, target_var).mean() \
        + regularize_layer_params(hidden, l2)

    # SGD with Nesterov momentum over all trainable parameters
    params = get_all_params(output_layer, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)

    # deterministic pass (dropout disabled) for validation/prediction
    test_prediction = get_output(output_layer, deterministic=True)
    test_loss = squared_error(test_prediction, target_var).mean()

    # compiled training, validation and prediction functions
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], test_loss)
    predict_fn = theano.function([input_var], test_prediction)

    return [output_layer, train_fn, val_fn, predict_fn]
def get_model():
    """Assemble the two-branch (systole / diastole) convnet.

    A shared conv stage (64 filters) feeds two identical branches, each
    with two further conv stages (96, 128 filters) and its own
    flatten -> dense(1024) -> dropout -> softmax(600) head; the two heads
    are concatenated.  Each branch's 1024-unit dense layer is
    L2-regularised; optimisation is SGD with Nesterov momentum.

    Returns:
        [merged_output_layer, train_fn, val_fn, predict_fn]
    """
    # symbolic inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.matrix('targets')

    def _stage(incoming, n_filters):
        """Two batch-normalised 3x3 convs, 2x2 max-pool, 25% dropout."""
        conv = batch_norm(Conv2DLayer(incoming, n_filters, (3, 3),
                                      pad='same', nonlinearity=leaky_rectify))
        conv = batch_norm(Conv2DLayer(conv, n_filters, (3, 3),
                                      pad='valid', nonlinearity=leaky_rectify))
        pooled = MaxPool2DLayer(conv, pool_size=(2, 2), stride=(2, 2),
                                pad=(1, 1))
        return DropoutLayer(pooled, p=0.25)

    # shared trunk
    shared = InputLayer(shape=(None, 30, 64, 64), input_var=input_var)
    shared = _stage(shared, 64)

    # branch stages are created in the same interleaved order as the
    # original code so weight-initialisation draws match
    systole = _stage(shared, 96)
    diastole = _stage(shared, 96)
    systole = _stage(systole, 128)
    diastole = _stage(diastole, 128)

    def _head(incoming):
        """Flatten -> dense(1024) -> dropout -> softmax(600)."""
        dense = DenseLayer(FlattenLayer(incoming), 1024,
                           nonlinearity=leaky_rectify)
        out = DenseLayer(DropoutLayer(dense, p=0.5), 600,
                         nonlinearity=softmax)
        return out, dense

    systole_out, systole_dense = _head(systole)
    diastole_out, diastole_dense = _head(diastole)

    # merge the two branch outputs
    merged = ConcatLayer([systole_out, diastole_out])

    # training loss: MSE plus L2 penalties on both big dense layers
    prediction = get_output(merged)
    loss = squared_error(prediction, target_var).mean() \
        + regularize_layer_params(systole_dense, l2) \
        + regularize_layer_params(diastole_dense, l2)

    # SGD with Nesterov momentum over all trainable parameters
    params = get_all_params(merged, trainable=True)
    updates = nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)

    # deterministic pass (dropout disabled) for validation/prediction
    test_prediction = get_output(merged, deterministic=True)
    test_loss = squared_error(test_prediction, target_var).mean()

    # compiled training, validation and prediction functions
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], test_loss)
    predict_fn = theano.function([input_var], test_prediction)

    return [merged, train_fn, val_fn, predict_fn]
Beispiel #45
0
def main():
    # load the training and validation data sets
    labels=int(0.7*image.all_count)
    data=image.ddd
    train_X=np.zeros([size,1,PIXELS,PIXELS])
    train_y=np.zeros([size,labels])
    for i in range(size):
        label=random.sample(range(labels),1)[0]
        train_X[i,0]=0.01*random.sample(data[label],1)[0]
        train_y[i]=label_binarize([label],range(labels))[0]

    X = T.tensor4()
    Y = T.matrix()

    # set up theano functions to generate output by feeding data through network
    output_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    params = lasagne.layers.get_all_params(output_layer)
    updates = nesterov_momentum(loss_train, params, learning_rate=0.03, momentum=0.9)
    # updates =lasagne.updates.adagrad(loss_train, params, learning_rate=0.003)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=[loss_train,pred_valid], updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=loss_valid, allow_input_downcast=True)
    predict_valid = theano.function(inputs=[X], outputs=pred_valid, allow_input_downcast=True)

    # loop over training functions for however many iterations, print information while training
    train_eval = []
    valid_eval = []
    valid_acc = []

    for i in range(450):
        batch_total_number = len(train_X) / BATCHSIZE
        for idx_batch in range (batch_total_number):
            xx_batch = np.float32(train_X[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE])
            yy_batch = np.float32(train_y[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE])
               
            train_loss ,pred = train(xx_batch,yy_batch)
            print i,idx_batch,'| Tloss:', train_loss,'| Count:',np.count_nonzero(np.int32(pred ==np.argmax(yy_batch,axis=1)))
            print pred
            print np.argmax(yy_batch,axis=1)

        acc=0
        for j in range(batch_total_number):
            x_batch = np.float32(train_X[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE])
            y_batch = np.float32(train_y[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE])
            pred = predict_valid(x_batch)
            acc += np.count_nonzero(np.int32(pred ==np.argmax(y_batch,axis=1)))
        acc=float(acc)/(BATCHSIZE*batch_total_number)
        print 'iter:', i,idx_batch, '| Tloss:', train_loss,'|Acc:',acc


    # save weights
    all_params = helper.get_all_param_values(output_layer)
    f = gzip.open('params/weights.pklz', 'wb')
    pickle.dump(all_params, f)
    f.close()
Beispiel #46
0
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--num_epochs', type=int, default=1000, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--delta', type=float, default=0.0, help='weight for expectation-linear regularization')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', choices=['std', 'recurrent'], help='dropout patten')
    parser.add_argument('--schedule', nargs='+', type=int, help='schedule for learning rate decay')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("Sequence Labeling")
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    num_units = args.num_units
    num_filters = args.num_filters
    regular = args.regular
    grad_clipping = args.grad_clipping
    gamma = args.gamma
    delta = args.delta
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    schedule = args.schedule
    output_predict = args.output_prediction
    dropout = args.dropout
    p = 0.5

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = data_utils.create_alphabets("data/alphabets/",
                                                                                            [train_path, dev_path,
                                                                                             test_path],
                                                                                            40000)
    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())

    num_labels = pos_alphabet.size() - 1

    logger.info("Reading Data")
    data_train = data_utils.read_data(train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    data_dev = data_utils.read_data(dev_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    data_test = data_utils.read_data(test_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)

    num_data = sum([len(bucket) for bucket in data_train])

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    mask_nr_var = T.matrix(name='masks_nr', dtype=theano.config.floatX)
    word_var = T.imatrix(name='inputs')
    char_var = T.itensor3(name='char-inputs')

    network = build_network(word_var, char_var, mask_var, word_alphabet, char_alphabet, dropout, num_units, num_labels,
                            grad_clipping, num_filters, p)

    logger.info("Network structure: hidden=%d, filter=%d, dropout=%s" % (num_units, num_filters, dropout))
    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    num_tokens_nr = mask_nr_var.sum(dtype=theano.config.floatX)

    # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(network)
    energies_train_det = lasagne.layers.get_output(network, deterministic=True)
    energies_eval = lasagne.layers.get_output(network, deterministic=True)

    loss_train_org = chain_crf_loss(energies_train, target_var, mask_var).mean()

    energy_shape = energies_train.shape
    # [batch, length, num_labels, num_labels] --> [batch*length, num_labels*num_labels]
    energies = T.reshape(energies_train, (energy_shape[0] * energy_shape[1], energy_shape[2] * energy_shape[3]))
    energies = nonlinearities.softmax(energies)
    energies_det = T.reshape(energies_train_det, (energy_shape[0] * energy_shape[1], energy_shape[2] * energy_shape[3]))
    energies_det = nonlinearities.softmax(energies_det)
    # [batch*length, num_labels*num_labels] --> [batch, length*num_labels*num_labels]
    energies = T.reshape(energies, (energy_shape[0], energy_shape[1] * energy_shape[2] * energy_shape[3]))
    energies_det = T.reshape(energies_det, (energy_shape[0], energy_shape[1] * energy_shape[2] * energy_shape[3]))

    loss_train_expect_linear = lasagne.objectives.squared_error(energies, energies_det)
    loss_train_expect_linear = loss_train_expect_linear.sum(axis=1)
    loss_train_expect_linear = loss_train_expect_linear.mean()

    loss_train = loss_train_org + delta * loss_train_expect_linear
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(network, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    _, corr_train = chain_crf_accuracy(energies_train, target_var)
    corr_nr_train = (corr_train * mask_nr_var).sum(dtype=theano.config.floatX)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = chain_crf_accuracy(energies_eval, target_var)
    corr_nr_eval = (corr_eval * mask_nr_var).sum(dtype=theano.config.floatX)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = nesterov_momentum(loss_train, params=params, learning_rate=learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([word_var, char_var, target_var, mask_var, mask_nr_var],
                               [loss_train, loss_train_org, loss_train_expect_linear,
                                corr_train, corr_nr_train, num_tokens, num_tokens_nr], updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([word_var, char_var, target_var, mask_var, mask_nr_var],
                              [corr_eval, corr_nr_eval, num_tokens, num_tokens_nr, prediction_eval])

    # Finally, launch the training loop.
    logger.info(
        "Start training: regularization: %s(%f), dropout: %s, delta: %.2f (#training data: %d, batch size: %d, clip: %.1f)..." \
        % (regular, (0.0 if regular == 'none' else gamma), dropout, delta, num_data, batch_size, grad_clipping))

    num_batches = num_data / batch_size + 1
    dev_correct = 0.0
    dev_correct_nr = 0.0
    best_epoch = 0
    test_correct = 0.0
    test_correct_nr = 0.0
    test_total = 0
    test_total_nr = 0
    test_inst = 0
    lr = learning_rate
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_err_org = 0.0
        train_err_linear = 0.0
        train_corr = 0.0
        train_corr_nr = 0.0
        train_total = 0
        train_total_nr = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        for batch in xrange(1, num_batches + 1):
            wids, cids, pids, _, _, masks = data_utils.get_batch(data_train, batch_size)
            masks_nr = np.copy(masks)
            masks_nr[:, 0] = 0
            err, err_org, err_linear, corr, corr_nr, num, num_nr = train_fn(wids, cids, pids, masks, masks_nr)
            train_err += err * wids.shape[0]
            train_err_org += err_org * wids.shape[0]
            train_err_linear += err_linear * wids.shape[0]
            train_corr += corr
            train_corr_nr += corr_nr
            train_total += num
            train_total_nr += num_nr
            train_inst += wids.shape[0]
            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, acc(no root): %.2f%%, time left (estimated): %.2fs' % (
                batch, num_batches, train_err / train_inst, train_err_org / train_inst, train_err_linear / train_inst,
                train_corr * 100 / train_total, train_corr_nr * 100 / train_total_nr, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        assert train_inst == num_batches * batch_size
        assert train_total == train_total_nr + train_inst
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f,  loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, acc(no root): %.2f%%, time: %.2fs' % (
            train_inst, train_inst, train_err / train_inst, train_err_org / train_inst, train_err_linear / train_inst,
            train_corr * 100 / train_total, train_corr_nr * 100 / train_total_nr, time.time() - start_time)

        # evaluate performance on dev data
        dev_corr = 0.0
        dev_corr_nr = 0.0
        dev_total = 0
        dev_total_nr = 0
        dev_inst = 0
        for batch in data_utils.iterate_batch(data_dev, batch_size):
            wids, cids, pids, _, _, masks = batch
            masks_nr = np.copy(masks)
            masks_nr[:, 0] = 0
            corr, corr_nr, num, num_nr, predictions = eval_fn(wids, cids, pids, masks, masks_nr)
            dev_corr += corr
            dev_corr_nr += corr_nr
            dev_total += num
            dev_total_nr += num_nr
            dev_inst += wids.shape[0]
        assert dev_total == dev_total_nr + dev_inst
        print 'dev corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%%' % (
            dev_corr, dev_total, dev_corr * 100 / dev_total, dev_corr_nr, dev_total_nr, dev_corr_nr * 100 / dev_total_nr)

        if dev_correct_nr < dev_corr_nr:
            dev_correct = dev_corr
            dev_correct_nr = dev_corr_nr
            best_epoch = epoch

            # evaluate on test data when better performance detected
            test_corr = 0.0
            test_corr_nr = 0.0
            test_total = 0
            test_total_nr = 0
            test_inst = 0
            for batch in data_utils.iterate_batch(data_test, batch_size):
                wids, cids, pids, _, _, masks = batch
                masks_nr = np.copy(masks)
                masks_nr[:, 0] = 0
                corr, corr_nr, num, num_nr, predictions = eval_fn(wids, cids, pids, masks, masks_nr)
                test_corr += corr
                test_corr_nr += corr_nr
                test_total += num
                test_total_nr += num_nr
                test_inst += wids.shape[0]
            assert test_total + test_total_nr + test_inst
            test_correct = test_corr
            test_correct_nr = test_corr_nr
        print "best dev  corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            dev_correct, dev_total, dev_correct * 100 / dev_total,
            dev_correct_nr, dev_total_nr, dev_correct_nr * 100 / dev_total_nr, best_epoch)
        print "best test corr: %d, total: %d, acc: %.2f%%, no root corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            test_correct, test_total, test_correct * 100 / test_total,
            test_correct_nr, test_total_nr, test_correct_nr * 100 / test_total_nr, best_epoch)

        if epoch in schedule:
            lr = lr * decay_rate
            updates = nesterov_momentum(loss_train, params=params, learning_rate=lr, momentum=momentum)
            train_fn = theano.function([word_var, char_var, target_var, mask_var, mask_nr_var],
                                       [loss_train, loss_train_org, loss_train_expect_linear,
                                        corr_train, corr_nr_train, num_tokens, num_tokens_nr], updates=updates)
def train_setup():
    """Build the CNN model and compile its training/validation functions.

    Constructs the network from the global ``config``, optionally warm-starts
    it from ``config.init_model``, sets up a cross-entropy loss with L1/L2
    penalties, SGD-with-Nesterov-momentum updates under a per-parameter norm
    constraint, and returns ``(network, train_fn, val_fn)``.
    """
    # Symbolic inputs: a batch of images and their integer class labels.
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    print("Building model and compiling functions...")
    print(" with input dimension {0},{1},{2}".format(config.image_height,
                                                     config.image_width,
                                                     config.image_channel))
    network = cnn_archi(input_var,
                        config.image_channel,
                        config.image_height, config.image_width,
                        config.output_length)

    print('Number of parameters : {0}'.format(count_params(network)))

    # Warm-start from a saved .npz parameter dump when one is configured.
    if config.init_model is not None:
        with np.load(config.init_model) as f:
            saved_values = [f['arr_%d' % i] for i in range(len(f.files))]
        set_all_param_values(network, saved_values)

    # Training objective: mean cross-entropy plus L1 and L2 penalties.
    prediction = lasagne.layers.get_output(network)
    ent_loss = categorical_crossentropy(prediction, target_var).mean()
    loss = (ent_loss
            + config.l1_regu * regularize_network_params(network, l1)
            + config.l2_regu * regularize_network_params(network, l2))

    # SGD with Nesterov momentum on all trainable parameters.
    params = get_all_params(network, trainable=True)
    updates = nesterov_momentum(loss, params,
                                learning_rate=config.learning_rate,
                                momentum=config.momentum)

    # Constrain each updated parameter to 5x the mean norm of its current
    # value; vectors (biases) are constrained along axis 0.
    for param in get_all_params(network, regularizable=True):
        axes = [0] if param.ndim == 1 else None
        updates[param] = norm_constraint(updates[param],
                                         5. * compute_norms(param.get_value()).mean(),
                                         norm_axes=axes)

    # Deterministic forward pass (dropout disabled) for validation/testing.
    test_prediction = get_output(network, deterministic=True)
    test_loss = categorical_crossentropy(test_prediction, target_var).mean()
    # Per-sample 0/1 correctness indicator (not averaged here).
    test_acc = T.eq(T.argmax(test_prediction, axis=1), target_var)

    # One mini-batch gradient step; returns the unregularized loss.
    train_fn = theano.function([input_var, target_var],
                               ent_loss,
                               updates=updates,
                               allow_input_downcast=True)

    # Validation: loss, raw predictions, and per-sample accuracy.
    val_fn = theano.function([input_var, target_var],
                             [test_loss, test_prediction, test_acc],
                             allow_input_downcast=True)

    return network, train_fn, val_fn
Beispiel #48
0
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):
    """Build a 1-D convolutional binary classifier over event spans.

    Embeds token/feature ids, applies a Conv1D + max-pool + dense stack with
    a 2-way softmax output, builds an L2-regularized binary cross-entropy
    loss, and compiles training/validation functions with the optimizer
    selected by ``args.optimizer``.

    Returns ``(train_fn, val_fn, network)``.
    """
    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen-kw+1

    #important context words as channels

    input = InputLayer((None, seqlen, num_feats),input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    #emb.params[emb.W].remove('trainable') #(batchsize, seqlen, wordDim)

    # flatten the per-token features into one channel dimension
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats*wordDim))

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()) #nOutputFrame = num_flters, 
                                            #nOutputFrameSize = (num_feats*wordDim-filter_size)/stride +1

    conv1d = DimshuffleLayer(conv1d, (0,2,1))

    pool_size=num_filters

    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size) 

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)
    
    loss = T.mean(binary_crossentropy(prediction,target_var))
    lambda_val = 0.5 * 1e-4

    # NOTE(review): `conv1d` here is the DimshuffleLayer (it was rebound
    # above), which has no weights, so the convolution weights receive no
    # penalty — confirm whether the Conv1DLayer was meant to be penalized.
    layers = {emb:lambda_val, conv1d:lambda_val, hid:lambda_val, network:lambda_val} 
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty


    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # BUG FIX: `raise "..."` (a string exception) is a TypeError on
        # Python >= 2.6; raise a proper exception instead.
        raise ValueError("Need set optimizer correctly")
 
    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction,target_var))

    train_fn = theano.function([input_var, target_var], 
        loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
Beispiel #49
0
    def _prepare(self, X, y, X_valid=None, y_valid=None, sample_weight=None,
                 whole_dataset_in_device=True):
        """Build the Theano training graph and per-epoch training callbacks.

        Encodes class labels (classification), builds the (optionally
        L1/L2-regularized, optionally sample-weighted) loss, compiles the
        per-batch update function — keeping the whole dataset on the compute
        device when ``whole_dataset_in_device`` is True — and returns an
        ``(iter_update, quitter, monitor, observer)`` tuple for the caller's
        training loop (``quitter`` implements patience-based early stopping).
        """
        self._stats = []
        self._class_label_encoder = LabelEncoder()
        if self.is_classification is True:
            self._class_label_encoder.fit(y)
            self.classes_ = self._class_label_encoder.classes_
            y = self._class_label_encoder.transform(y).astype(y.dtype)
            self.y_train_transformed = y
            if y_valid is not None:
                y_valid_transformed = self._class_label_encoder.transform(
                    y_valid).astype(y_valid.dtype)

        self._l_x_in = layers.InputLayer(shape=(None, X.shape[1]))
        batch_index, X_batch, y_batch, batch_slice = get_theano_batch_variables(
            self.batch_size, y_softmax=self.is_classification)

        # A per-sample weight vector when provided, otherwise a scalar 1.
        if sample_weight is not None:
            t_sample_weight = T.vector('sample_weight')
            sample_weight = sample_weight.astype(theano.config.floatX)
        else:
            t_sample_weight = T.scalar('sample_weight')

        if self.is_classification is True:
            y_dim = len(set(y.flatten().tolist()))
        else:
            y_dim = y.shape[1]

        self._prediction_layer = self._build_model(y_dim)
        self._layers = layers.get_all_layers(self._prediction_layer)
        self._build_prediction_functions(X_batch, self._prediction_layer)

        if self.input_noise_function is None:
            output = layers.get_output(self._prediction_layer, X_batch)

        else:
            X_batch_noisy = self.input_noise_function(X_batch)
            output = layers.get_output(self._prediction_layer, X_batch_noisy)

        # Classification: weighted negative log-likelihood of the true class;
        # regression: weighted sum-of-squares error.
        if self.is_classification:
            loss = -T.mean(t_sample_weight * T.log(output)
                           [T.arange(y_batch.shape[0]), y_batch])
        else:
            loss = T.mean(
                t_sample_weight * T.sum((output - y_batch) ** 2, axis=1))

        # Keep the unregularized loss for reporting.
        loss_unreg = loss

        all_params = layers.get_all_params(self._prediction_layer)
        if self._output_softener_coefs is not None:
            all_params.append(self._output_softener_coefs)

        # NOTE(review): get_all_param_values returns *numpy* arrays, so the
        # L1/L2 terms below are constants w.r.t. the graph and contribute no
        # gradient — `layers.get_all_params(..., regularizable=True)` was
        # probably intended. Left unchanged to preserve behavior; confirm.
        W_params = layers.get_all_param_values(
            self._prediction_layer, regularizable=True)

        # regularization
        if self.L1_factor is not None:
            for L1_factor_layer, W in zip(self.L1_factor, W_params):
                loss = loss + L1_factor_layer * T.sum(abs(W))

        if self.L2_factor is not None:
            for L2_factor_layer, W in zip(self.L2_factor, W_params):
                loss = loss + L2_factor_layer * T.sum(W**2)

        # Optimizer dispatch (lasagne.updates).
        if self.optimization_method == 'nesterov_momentum':
            gradient_updates = updates.nesterov_momentum(loss, all_params, learning_rate=self.learning_rate,
                                                         momentum=self.momentum)
        elif self.optimization_method == 'adadelta':
            # don't need momentum there
            gradient_updates = updates.adadelta(
                loss, all_params, learning_rate=self.learning_rate)
        elif self.optimization_method == 'adam':
            # BUG FIX: was `updates.Adam`, which does not exist in
            # lasagne.updates (AttributeError); the function is `adam`.
            gradient_updates = updates.adam(
                loss, all_params, learning_rate=self.learning_rate)
        elif self.optimization_method == 'momentum':
            gradient_updates = updates.momentum(
                loss, all_params, learning_rate=self.learning_rate,
                momentum=self.momentum
            )
        elif self.optimization_method == 'adagrad':
            # BUG FIX: this branch called updates.adadelta, silently using
            # the wrong optimizer when 'adagrad' was requested.
            gradient_updates = updates.adagrad(
                loss, all_params, learning_rate=self.learning_rate)
        elif self.optimization_method == 'rmsprop':
            # BUG FIX: this branch also called updates.adadelta.
            gradient_updates = updates.rmsprop(
                loss, all_params, learning_rate=self.learning_rate)
        elif self.optimization_method == 'sgd':
            gradient_updates = updates.sgd(
                loss, all_params, learning_rate=self.learning_rate,
            )
        else:
            raise Exception("wrong optimization method")

        nb_batches = X.shape[0] // self.batch_size
        if (X.shape[0] % self.batch_size) != 0:
            nb_batches += 1

        X = X.astype(theano.config.floatX)
        if self.is_classification == True:
            y = y.astype(np.int32)
        else:
            y = y.astype(theano.config.floatX)

        if whole_dataset_in_device == True:
            # Keep X/y in shared variables on the device and index batches
            # symbolically via `givens`.
            X_shared = theano.shared(X, borrow=True)
            y_shared = theano.shared(y, borrow=True)

            givens = {
                X_batch: X_shared[batch_slice],
                y_batch: y_shared[batch_slice]
            }

            if sample_weight is not None:
                sample_weight_shared = theano.shared(
                    sample_weight, borrow=True)
                givens[t_sample_weight] = sample_weight_shared[batch_slice]
            else:
                givens[t_sample_weight] = T.as_tensor_variable(
                    np.array(1., dtype=theano.config.floatX))

            iter_update_batch = theano.function(
                [batch_index], loss,
                updates=gradient_updates,
                givens=givens,

            )
        else:
            # Stream batches from host memory instead.
            if sample_weight is None:
                iter_update_gradients = theano.function(
                    [X_batch, y_batch],
                    loss,
                    updates=gradient_updates,
                    givens={t_sample_weight: T.as_tensor_variable(
                        np.array(1., dtype=theano.config.floatX))},

                )

                def iter_update_batch(batch_index):
                    sl = slice(batch_index * self.batch_size,
                               (batch_index + 1) * self.batch_size)
                    return iter_update_gradients(X[sl], y[sl])

            else:
                iter_update_gradients = theano.function(
                    [X_batch, y_batch, t_sample_weight],
                    loss,
                    updates=gradient_updates
                )

                def iter_update_batch(batch_index):
                    sl = slice(batch_index * self.batch_size,
                               (batch_index + 1) * self.batch_size)
                    return iter_update_gradients(X[sl], y[sl], sample_weight[sl])
        self._iter_update_batch = iter_update_batch
        self._get_loss = theano.function(
            [X_batch, y_batch, t_sample_weight], loss_unreg, allow_input_downcast=True)

        def iter_update(epoch):
            # Run one epoch of batch updates and collect train/valid stats.
            losses = []
            #self.learning_rate.set_value(self.learning_rate.get_value() * np.array(0.99, dtype=theano.config.floatX))
            for i in xrange(nb_batches):
                losses.append(self._iter_update_batch(i))
                # max norm
                if self.max_norm is not None:
                    for max_norm_layer, layer in zip(self.max_norm, self._layers):
                        layer.W = updates.norm_constraint(
                            layer.W, self.max_norm)

            losses = np.array(losses)

            d = OrderedDict()
            d["epoch"] = epoch
            #d["loss_train_std"] = losses.std()

            #d["loss_train"] = losses.mean()
            d["loss_train"] = self._get_loss(
                self.X_train, self.y_train_transformed, 1.)

            d["accuracy_train"] = (
                self.predict(self.X_train) == self.y_train).mean()

            if X_valid is not None and y_valid is not None:
                d["loss_valid"] = self._get_loss(
                    X_valid, y_valid_transformed, 1.)

                if self.is_classification == True:
                    d["accuracy_valid"] = (
                        self.predict(X_valid) == y_valid).mean()

            if self.verbose > 0:
                if (epoch % self.report_each) == 0:
                    print(tabulate([d], headers="keys"))
            self._stats.append(d)
            return d

        def quitter(update_status):
            # Patience-based early stopping: keep the best model (by
            # self.patience_stat) and quit after patience_nb_epochs without
            # sufficient improvement.
            cur_epoch = len(self._stats) - 1
            if self.patience_nb_epochs > 0:
                # patience heuristic (for early stopping)
                cur_patience_stat = update_status[self.patience_stat]

                if self.cur_best_patience_stat is None:
                    self.cur_best_patience_stat = cur_patience_stat
                    first_time = True
                else:
                    first_time = False

                thresh = self.patience_progression_rate_threshold
                if cur_patience_stat < self.cur_best_patience_stat * thresh or first_time:

                    if self.verbose >= 2:
                        fmt = "--Early stopping-- good we have a new best value : {0}={1}, last best : epoch {2}, value={3}"
                        print(fmt.format(self.patience_stat, cur_patience_stat,
                                         self.cur_best_epoch, self.cur_best_patience_stat))
                    self.cur_best_epoch = cur_epoch
                    self.cur_best_patience_stat = cur_patience_stat
                    if hasattr(self, "set_state") and hasattr(self, "get_state"):
                        self.cur_best_model = self.get_state()
                    else:
                        self.cur_best_model = pickle.dumps(
                            self.__dict__, protocol=pickle.HIGHEST_PROTOCOL)
                if (cur_epoch - self.cur_best_epoch) >= self.patience_nb_epochs:
                    finish = True
                    if hasattr(self, "set_state") and hasattr(self, "get_state"):
                        self.set_state(self.cur_best_model)
                    else:
                        self.__dict__.update(pickle.loads(self.cur_best_model))

                    self._stats = self._stats[0:self.cur_best_epoch + 1]
                    if self.verbose >= 2:
                        print("out of patience...take the model at epoch {0} and quit".format(
                            self.cur_best_epoch + 1))
                else:
                    finish = False
                return finish
            else:
                return False

        def monitor(update_status):
            pass

        def observer(monitor_output):
            pass

        return (iter_update, quitter, monitor, observer)
def main():
    """Train the speaker-identification CNN (Python 2 script).

    NOTE(review): depends on module-level names defined elsewhere in this
    file (`lasagne_model`, `get_data`, `train_files`, `num_speechs`, `size`,
    `valid_size`, `BATCHSIZE`, `speechSize`, `num_labels_train`,
    `train_load_path`, `valid_best`) — confirm they are initialized before
    this is called.  Periodically snapshots network parameters as gzipped
    pickles under 'speech_params/'.
    """
    # load the training and validation data sets
    # labels=int(0.7*speech.all_count)
    global valid_best

    # symbolic inputs: 4-D feature batch and one-hot label matrix
    X = T.tensor4()
    Y = T.matrix()

    # set up theano functions to generate output by feeding data through network
    output_layer,hidden_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    # deterministic=True disables stochastic layers (e.g. dropout) for validation
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    params = lasagne.layers.get_all_params(output_layer,trainable=True)
    print params
    updates = nesterov_momentum(loss_train, params, learning_rate=0.2, momentum=0.9)
    # updates =lasagne.updates.sgd(loss_train, params, learning_rate=0.1)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=[loss_train,pred], updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=[loss_valid,pred_valid], allow_input_downcast=True)
    # predict_valid = theano.function(inputs=[X], outputs=[pred_valid], allow_input_downcast=True)

    # loop over training functions for however many iterations, print information while training
    train_eval = []
    valid_eval = []
    valid_acc = []

    # disabled branch: warm-start from a previously saved parameter snapshot
    if 0:
        # pre_params = 'speech_params/validbest_cnn_spkall_22_0.95458984375_2017-06-24 00:50:39.pklz'
        pre_params='speech_params/validbest_cnn_fisher_0_idxbatch20000_0.65625_2017-08-17 16:14:04.pklz' # 1 shot: 99.0 after ~2000 iterations
        load_params = pickle.load(gzip.open(pre_params))
        # load_params=load_params[:-2]
        lasagne.layers.set_all_param_values(output_layer,load_params)
        print 'load params: ', pre_params,'\n'

    for i in range(450):
        # draw a fresh train/validation split each epoch
        train_list,valid_list=get_data(train_files,num_speechs,size,valid_size)
        batch_total_number = size / BATCHSIZE
        # cap the validation set so evaluation stays cheap
        if 1:
            valid_list_limit=3000
            valid_list=valid_list[:valid_list_limit]
        print 'batch_total_number:',batch_total_number
        train_acc_aver=0.0
        for idx_batch in range (batch_total_number):
            # assemble one minibatch: entries are '<file>.<index>' keys into .npy feature files
            batch_list = train_list[idx_batch * BATCHSIZE:(idx_batch + 1) * BATCHSIZE]
            xx_batch=np.zeros((BATCHSIZE,1,speechSize[0],speechSize[1]))
            yy_batch=np.zeros((BATCHSIZE,num_labels_train))
            for ind,one_name in enumerate(batch_list):
                aim_file_name,aim_idx=one_name.split('.')
                aim_file_name+='.npy'
                aim_file_name_full=train_load_path+aim_file_name
                xx_batch[ind,0]=np.load(aim_file_name_full)[int(aim_idx)]
                # one-hot label: position of the source file in train_files
                yy_batch[ind]=label_binarize([train_files.index(aim_file_name)],range(num_labels_train))[0]

            train_loss ,pred = train(xx_batch,yy_batch)
            count=np.count_nonzero(np.int32(pred ==np.argmax(yy_batch,axis=1)))
            train_acc = float(count) / (BATCHSIZE)
            print i,idx_batch,'| Tloss:', train_loss,'| Count:',count,'| Acc:',train_acc
            print pred
            print np.argmax(yy_batch,axis=1)
            print "time:",time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            train_acc_aver += train_acc

            # raise EOFError

            # validate only once train accuracy is high, at most every 1000 batches
            if train_acc>0.9 and idx_batch%1000==0 and idx_batch>=0:
                acc=0
                valid_batch_number=len(valid_list)/BATCHSIZE
                for j in tqdm(range(valid_batch_number)):
                    batch_list = valid_list[j * BATCHSIZE:(j + 1) * BATCHSIZE]
                    x_batch=np.zeros((BATCHSIZE,1,speechSize[0],speechSize[1]))
                    y_batch=np.zeros((BATCHSIZE,num_labels_train))
                    for ind,one_name in enumerate(batch_list):
                        aim_file_name,aim_idx=one_name.split('.')
                        aim_file_name+='.npy'
                        aim_file_name_full=train_load_path+aim_file_name
                        x_batch[ind,0]=np.load(aim_file_name_full)[int(aim_idx)]
                        y_batch[ind]=label_binarize([train_files.index(aim_file_name)],range(num_labels_train))[0]
                    valid_loss,pred = valid(x_batch,y_batch)
                    acc += np.count_nonzero(np.int32(pred ==np.argmax(y_batch,axis=1)))
                acc=float(acc)/(valid_batch_number*BATCHSIZE)
                print 'iter:', i,idx_batch, '| Vloss:',valid_loss,'|Acc:',acc
                # snapshot parameters whenever validation accuracy improves
                if acc>valid_best:
                    print 'new valid_best:',valid_best,'-->',acc
                    valid_best=acc
                    all_params = helper.get_all_param_values(output_layer)
                    f = gzip.open('speech_params/validbest_cnn_fisher_{}_validacc{}_{}.pklz'.format(i,valid_best,time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
                    pickle.dump(all_params, f)
                    f.close()

            # unconditional periodic checkpoint every 3000 batches
            if idx_batch%3000==0 and idx_batch>0:
                all_params = helper.get_all_param_values(output_layer)
                f = gzip.open('speech_params/validbest_cnn_fisher_{}_idxbatch{}_{}.pklz'.format(i,idx_batch,time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
                pickle.dump(all_params, f)
                f.close()

        # save weights at the end of every epoch (i%1==0 is always true)
        if i%1==0:
            all_params = helper.get_all_param_values(output_layer)
            f = gzip.open('speech_params/validbest_cnn_fisher_averacc{}_{}_{}.pklz'.format(i,train_acc_aver/batch_total_number,time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())), 'wb')
            pickle.dump(all_params, f)
            f.close()
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats):
    """Build a 1-D convolutional event-span classifier.

    Args:
        args: parsed command-line options; uses `args.hiddenDim`,
            `args.optimizer` and `args.step` (learning rate).
        input_var: symbolic input of word ids, shape (batch, seqlen, num_feats).
        target_var: symbolic binary target variable.
        wordEmbeddings: embedding matrix of shape (wordDim, vocab_size);
            transposed before being handed to EmbeddingLayer.
        seqlen: length of the input sequences.
        num_feats: number of word features per position.

    Returns:
        (train_fn, val_fn, network): compiled training function, compiled
        validation function returning (loss, accuracy), and the output layer.

    Raises:
        ValueError: if `args.optimizer` is not a recognized optimizer name.
    """
    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    # important context words as channels: one filter per window position
    kw = 2
    num_filters = seqlen - kw + 1

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    # emb.params[emb.W].remove('trainable')  # (batchsize, seqlen, wordDim)

    # collapse the per-word features into one long vector per timestep
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))

    conv1d = Conv1DLayer(reshape, num_filters=num_filters, filter_size=wordDim, stride=1,
        nonlinearity=tanh, W=GlorotUniform())  # nOutputFrame = num_filters,
                                               # nOutputFrameSize = (num_feats*wordDim-filter_size)/stride + 1

    # put the filter dimension last so max-pooling runs over the filters
    conv1d = DimshuffleLayer(conv1d, (0, 2, 1))

    pool_size = num_filters
    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))

    # weighted L2 regularization on all learned layers
    lambda_val = 0.5 * 1e-4
    layers = {emb: lambda_val, conv1d: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # raising a bare string is a TypeError in Python; raise a real exception
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var],
        loss, updates=updates, allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn, network
# L2 regularization over every layer of the network (weight 1e-4)
# NOTE(review): `output_layer`, `loss`, `output_test`, `X`, `Y` and
# `LR_SCHEDULE` are defined elsewhere in this file — confirm before reuse.
all_layers = lasagne.layers.get_all_layers(output_layer)
l2_penalty = lasagne.regularization.regularize_layer_params(
    all_layers, lasagne.regularization.l2) * 0.0001
loss = loss + l2_penalty

# set up loss functions for validation dataset
test_loss = lasagne.objectives.categorical_crossentropy(output_test, Y)
test_loss = test_loss.mean()

# accuracy: fraction of samples whose argmax prediction equals Y
# NOTE(review): comparing argmax (an int) against Y suggests Y holds class
# indices here rather than one-hot rows — verify against the callers.
test_acc = T.mean(T.eq(T.argmax(output_test, axis=1), Y),
                  dtype=theano.config.floatX)

# get parameters from network and set up sgd with nesterov momentum to update parameters, l_r is shared var so it can be changed
l_r = theano.shared(np.array(LR_SCHEDULE[0], dtype=theano.config.floatX))
params = lasagne.layers.get_all_params(output_layer, trainable=True)
updates = nesterov_momentum(loss, params, learning_rate=l_r, momentum=0.9)
#updates = adam(loss, params, learning_rate=l_r)

# set up training and prediction functions
train_fn = theano.function(inputs=[X, Y], outputs=loss, updates=updates)
valid_fn = theano.function(inputs=[X, Y], outputs=[test_loss, test_acc])
'''
load training data and start training
'''

# load the training and validation data sets
train_X, test_X, train_y, test_y = load_pickle_data_cv()
print 'Train shape:', train_X.shape, 'Test shape:', test_X.shape
print 'Train y shape:', train_y.shape, 'Test y shape:', test_y.shape
print np.amax(train_X), np.amin(train_X), np.mean(train_X)
def main():
    # load model and parameters
    output_layer = lasagne_model()
    f = gzip.open('data/weights.pklz', 'rb')
    all_params = pickle.load(f)
    f.close()

    X = T.ftensor4()
    Y = T.fmatrix()

    # set up theano functions to generate output by feeding data through network
    output_layer = lasagne_model()
    output_train = lasagne.layers.get_output(output_layer, X)
    output_valid = lasagne.layers.get_output(output_layer, X, deterministic=True)

    # set up the loss that we aim to minimize
    loss_train = T.mean(T.nnet.categorical_crossentropy(output_train, Y))
    loss_valid = T.mean(T.nnet.categorical_crossentropy(output_valid, Y))

    # prediction functions for classifications
    pred = T.argmax(output_train, axis=1)
    pred_valid = T.argmax(output_valid, axis=1)

    # get parameters from network and set up sgd with nesterov momentum to update parameters
    helper.set_all_param_values(output_layer, all_params)
    params = lasagne.layers.get_all_params(output_layer)
    updates = nesterov_momentum(loss_train, params, learning_rate=0.0001, momentum=0.9)

    # set up training and prediction functions
    train = theano.function(inputs=[X, Y], outputs=loss_train, updates=updates, allow_input_downcast=True)
    valid = theano.function(inputs=[X, Y], outputs=loss_valid, allow_input_downcast=True)
    predict_valid = theano.function(inputs=[X], outputs=pred_valid, allow_input_downcast=True)

    # fine tune network
    train_X, test_X, train_y, test_y = load_data_cv('data/train.csv')
    train_eval = []
    valid_eval = []
    valid_acc = []
    try:
        for i in range(5):
            train_loss = batch_iterator_no_aug(train_X, train_y, BATCHSIZE, train)
            train_eval.append(train_loss)
            valid_loss = valid(test_X, test_y)
            valid_eval.append(valid_loss)
            acc = np.mean(np.argmax(test_y, axis=1) == predict_valid(test_X))
            valid_acc.append(acc)
            print 'iter:', i, '| Tloss:', train_loss, '| Vloss:', valid_loss, '| valid acc:', acc

    except KeyboardInterrupt:
        pass

    # after training create output for kaggle
    testing_inputs = load_test_data('data/test.csv')
    predictions = []
    for j in range((testing_inputs.shape[0] + BATCHSIZE -1) // BATCHSIZE):
        sl = slice(j * BATCHSIZE, (j + 1) * BATCHSIZE)
        X_batch = testing_inputs[sl]
        predictions.extend(predict_valid(X_batch))
    out = pd.read_csv('data/convnet_preds.csv')
    out['Label'] = predictions
    out.to_csv('preds/convnet_preds.csv', index = False)
def build_network_2dconv(
    args, input1_var, input1_mask_var, input2_var, intut2_mask_var, target_var, wordEmbeddings, maxlen=36
):
    """Build a Siamese 2-D convolutional sentence-pair model.

    Each input sentence is embedded (frozen embeddings), convolved and
    max-pooled; the two pooled vectors are concatenated and classified by
    a dense softmax head.

    Args:
        args: parsed options; uses `args.hiddenDim`, `args.task`
            ("sts" -> 5-way output, "ent" -> 3-way), `args.optimizer`
            and `args.step` (learning rate).
        input1_var, input2_var: symbolic int matrices of word ids,
            shape (batch, maxlen).
        input1_mask_var, intut2_mask_var: mask variables; currently unused
            (the mask input layers are commented out).
        target_var: symbolic target variable.
        wordEmbeddings: (wordDim, vocab_size) embedding matrix; transposed
            for EmbeddingLayer and marked non-trainable.
        maxlen: fixed sequence length.

    Returns:
        (train_fn, val_fn): compiled theano training/validation functions.

    Raises:
        ValueError: if `args.optimizer` is not a recognized optimizer name.
    """
    print("Building model with 2D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    num_filters = 100
    stride = 1

    # CNN_sentence config: filters span the full embedding width
    filter_size = (3, wordDim)
    pool_size = (maxlen - 3 + 1, 1)

    # --- branch 1 ---
    input_1 = InputLayer((None, maxlen), input_var=input1_var)
    batchsize, seqlen = input_1.input_var.shape
    emb_1 = EmbeddingLayer(input_1, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_1.params[emb_1.W].remove("trainable")  # freeze embeddings (batchsize, maxlen, wordDim)

    reshape_1 = ReshapeLayer(emb_1, (batchsize, 1, maxlen, wordDim))
    conv2d_1 = Conv2DLayer(
        reshape_1,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, maxlen-2, 1)
    maxpool_1 = MaxPool2DLayer(conv2d_1, pool_size=pool_size)  # (None, 100, 1, 1)
    forward_1 = FlattenLayer(maxpool_1)  # (None, 100)

    # --- branch 2 (same architecture, separate conv weights) ---
    input_2 = InputLayer((None, maxlen), input_var=input2_var)
    emb_2 = EmbeddingLayer(input_2, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    emb_2.params[emb_2.W].remove("trainable")

    reshape_2 = ReshapeLayer(emb_2, (batchsize, 1, maxlen, wordDim))
    conv2d_2 = Conv2DLayer(
        reshape_2,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        nonlinearity=rectify,
        W=GlorotUniform(),
    )  # (None, 100, maxlen-2, 1)
    maxpool_2 = MaxPool2DLayer(conv2d_2, pool_size=pool_size)  # (None, 100, 1, 1)
    forward_2 = FlattenLayer(maxpool_2)  # (None, 100)

    # NOTE(review): the mul/sub merge below is immediately overwritten by the
    # plain concatenation, so these layers are dead code in the graph — kept
    # byte-for-byte to preserve original behavior; either delete them or wire
    # them in deliberately.
    mul = ElemwiseMergeLayer([forward_1, forward_2], merge_function=T.mul)
    sub = AbsSubLayer([forward_1, forward_2], merge_function=T.sub)
    concat = ConcatLayer([mul, sub])

    concat = ConcatLayer([forward_1, forward_2])

    hid = DenseLayer(concat, num_units=args.hiddenDim, nonlinearity=sigmoid)

    # output size depends on the task; any other task leaves `network`
    # unbound and raises NameError below (same as the original)
    if args.task == "sts":
        network = DenseLayer(hid, num_units=5, nonlinearity=softmax)
    elif args.task == "ent":
        network = DenseLayer(hid, num_units=3, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(categorical_crossentropy(prediction, target_var))

    # weighted L2 regularization
    lambda_val = 0.5 * 1e-4
    layers = {conv2d_1: lambda_val, hid: lambda_val, network: lambda_val}
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # raising a bare string is a TypeError in Python; raise a real exception
        raise ValueError("Need set optimizer correctly")

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input1_var, input2_var, target_var], loss, updates=updates, allow_input_downcast=True)

    if args.task == "sts":
        # regression-style evaluation: return raw predictions
        val_fn = theano.function(
            [input1_var, input2_var, target_var], [test_loss, test_prediction], allow_input_downcast=True
        )
    elif args.task == "ent":
        # classification evaluation: return accuracy
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function([input1_var, input2_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    return train_fn, val_fn
Beispiel #55
0
def word2vec(
		files=[],
		directories=[],
		skip=[],
		save_dir=None,
		num_epochs=5,
		unigram_dictionary=None,
		noise_ratio=15,
		kernel=[1,2,3,4,5,5,4,3,2,1],
		t = 1.0e-5,
		batch_size = 1000,  # Number of *signal* examples per batch
		num_embedding_dimensions=500,
		word_embedding_init=Normal(),
		context_embedding_init=Normal(),
		learning_rate=0.1,
		momentum=0.9,
		num_processes=3,
		load_dictionary_dir=None,
		min_frequency=10,
		macrobatch_size = 100000,
		max_queue_size=0,
		verbose=True
	):

	'''
	Helper function that handles all concerns involved in training
	A word2vec model using the approach of Mikolov et al.  It surfaces
	all of the options.

	For customizations going beyond simply tweeking existing options and
	hyperparameters, substitute this function by writing your own training
	routine using the provided classes.  This function would be a starting
	point for you.
	'''

	# Make a Word2VecMinibatcher, pass through parameters sent by caller
	reader = DatasetReader(
		files=files,
		directories=directories,
		skip=skip,
		noise_ratio=noise_ratio,
		t=t,
		num_processes=num_processes,
		unigram_dictionary=unigram_dictionary,
		kernel=kernel,
		max_queue_size=max_queue_size,
		macrobatch_size=macrobatch_size,
		verbose=verbose
	)


	# Prepare the minibatch generator
	# (this produces the counter_sampler stats)
	if load_dictionary_dir is None and unigram_dictionary is None:
		if verbose:
			print 'preparing dictionaries...'
		reader.prepare(save_dir=save_dir)

	# If min_frequency was specified, prune the dictionaries
	if min_frequency is not None:
		if verbose:
			print 'prunning dictionaries...'
		reader.prune(min_frequency)

	# Make a symbolic minibatcher
	minibatcher = NoiseContrastiveTheanoMinibatcher(
		batch_size=batch_size,
		noise_ratio=noise_ratio,
		dtype="int32",
		num_dims=2
	)

	# Make a Word2VecEmbedder object, feed it the combined input.
	# Note that the full batch includes noise examples and signal_examples
	# so is larger than batch_size, which is the number of signal_examples
	# only per batch.
	full_batch_size = batch_size * (1 + noise_ratio)
	embedder = Word2VecEmbedder(
		input_var=minibatcher.get_batch(),
		batch_size=full_batch_size,
		vocabulary_size=reader.get_vocab_size(),
		num_embedding_dimensions=num_embedding_dimensions,
		word_embedding_init=word_embedding_init,
		context_embedding_init=context_embedding_init
	)

	# Architectue is ready.  Make the loss function, and use it to create 
	# the parameter updates responsible for learning
	loss = get_noise_contrastive_loss(embedder.get_output(), batch_size)
	updates = nesterov_momentum(
		loss, embedder.get_params(), learning_rate, momentum
	)

	# Include minibatcher updates, which cause the symbolic batch to move
	# through the dataset like a sliding window
	updates.update(minibatcher.get_updates())

	# Use the loss function and the updates to compile a training function.
	# Note that it takes no inputs because the dataset is fully loaded using
	# theano shared variables
	train = function([], loss, updates=updates)

	# Iterate through the dataset, training the embeddings
	for epoch in range(num_epochs):

		if verbose:
			print 'starting epoch %d' % epoch

		macrobatches = reader.generate_dataset_serial()
		macrobatch_num = 0
		for signal_macrobatch, noise_macrobatch in macrobatches:

			macrobatch_num += 1
			if verbose:
				print 'running macrobatch %d' % (macrobatch_num - 1)

			minibatcher.load_dataset(signal_macrobatch, noise_macrobatch)
			losses = []
			for batch_num in range(minibatcher.get_num_batches()):
				if verbose:
					print 'running minibatch', batch_num
				losses.append(train())
			if verbose:
				print '\taverage loss: %f' % np.mean(losses)

	# Save the model (the embeddings) if save_dir was provided
	if save_dir is not None:
		embedder.save(save_dir)

	# Return the trained embedder and the dictionary mapping tokens
	# to ids
	return embedder, reader
# load datasets: pre-extracted fc7 CNN features, pushed into theano shared
# variables so the whole set lives on-device
# NOTE(review): `X_train_fc6`, `X_test_fc6`, `output`, `y_train`, `y_test`,
# `batch_index`, `X_batch_one`, `X_batch_two`, `y_batch`, `total` and
# `batch_size` are defined elsewhere in this file — confirm before reuse.
X_train_fc7 = theano.shared(np.load('/root/proj/MIT_dumped/X_train_fc7.npy').astype(theano.config.floatX))
X_test_fc7 = theano.shared(np.load('/root/proj/MIT_dumped/X_test_fc7.npy').astype(theano.config.floatX))

all_params = layers.get_all_params(output)

# multinomial negative log-likelihood objective over the two-stream input
objective = objectives.Objective(output,loss_function=objectives.multinomial_nll)
loss_train = objective.get_loss([X_batch_one, X_batch_two], target=y_batch)


# hyperparameters
LEARNING_RATE =0.0122
MOMENTUM=0.9
REG = .0009
# L2 weight penalty added to the data loss
reg_loss = regularization.l2(output) * REG
total_loss = loss_train + reg_loss
upds = updates.nesterov_momentum(total_loss, all_params, LEARNING_RATE, MOMENTUM)
# deterministic forward pass for evaluation
pred = T.argmax(
    output.get_output([X_batch_one, X_batch_two], deterministic=True), axis=1)
accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX)

print "begin compiling"
# slice one minibatch out of the shared training arrays per call
givens =    {X_batch_one: X_train_fc6[batch_index*batch_size:(batch_index+1)*batch_size],
            X_batch_two: X_train_fc7[batch_index*batch_size:(batch_index+1)*batch_size],
            y_batch: y_train[batch_index*batch_size:(batch_index+1)*batch_size]}
train = theano.function([batch_index], loss_train, updates=upds, givens=givens)
# evaluation runs on the full test set in one shot
test = theano.function([], accuracy, givens={X_batch_one:X_test_fc6, X_batch_two:X_test_fc7, y_batch:y_test})
num_epochs = 1000
for epoch in range(num_epochs):
    print "epoch %s" % epoch
    for batch in range(total/batch_size):
        loss = train(batch)
Beispiel #57
0
 def update(all_grads, all_params, learning_rate):
     """Return the Nesterov-momentum update dict for the given grads/params.

     NOTE(review): `m` (the momentum coefficient) is a closure variable from
     an enclosing scope that is not visible in this chunk — confirm it is
     bound before this is called.
     """
     return nesterov_momentum(all_grads,
                              all_params,
                              learning_rate,
                              momentum=m)