Example #1
    def _build_model(hyper_parameters,
                     learning_rate,
                     symmetric_double_encoder,
                     params,
                     regularization_methods,
                     model_updates,
                     model_deltas,
                     moving_averages,
                     number_of_batches,
                     strategy='SGDCayley',
                     bias_1=0.9,
                     bias_2=0.999,
                     rho=0.5,
                     eps=1e-8,
                     loss='L2',
                     last_layer=0,
                     autoencoder_x=False,
                     autoencoder_y=False):
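        """Build and compile the Theano training function for the symmetric double encoder.

        Computes the reconstruction loss (L2, cosine, or per-view autoencoder loss),
        adds the active regularization terms, derives the parameter updates for the
        chosen optimization strategy and returns the compiled Theano function.
        """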

        # loss_decision = Tensor.iscalar()
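        # Scalar time step fed to the compiled function; used below for Adam's bias
        # correction and returned as the last output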
        t = Tensor.dscalar()

        # Retrieve the reconstructions of x and y
        x_tilde = symmetric_double_encoder.reconstruct_x()
        y_tilde = symmetric_double_encoder.reconstruct_y()

        x_hidden = symmetric_double_encoder[0].output_forward_x
        y_hidden = symmetric_double_encoder[0].output_forward_y

        var_x = symmetric_double_encoder.var_x
        var_y = symmetric_double_encoder.var_y

        print('Calculating Loss')

        if autoencoder_x or autoencoder_y:
            loss_forward = Tensor.constant(0)
            loss_backward = Tensor.constant(0)

            if autoencoder_x:
                loss_forward = Tensor.mean((var_x - symmetric_double_encoder[-1].reconstruct_x(x_hidden)) ** 2)

            if autoencoder_y:
                loss_backward = Tensor.mean((var_y - symmetric_double_encoder[-1].reconstruct_y(y_hidden)) ** 2)

            loss = loss_forward + loss_backward

        elif loss == 'L2':
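            # Compute the loss of the backward encoding (reconstruction of x) as L2 loss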
            loss_backward = Tensor.mean(
                ((var_x - x_tilde) ** 2).sum(axis=1, dtype=Tensor.config.floatX))

            # Compute the loss of the forward encoding (reconstruction of y) as L2 loss
            loss_forward = Tensor.mean(
                ((var_y - y_tilde) ** 2).sum(axis=1, dtype=Tensor.config.floatX))

            # loss = ifelse(loss_decision, loss_forward, loss_backward)#loss_backward + loss_forward
            loss = (loss_forward + loss_backward)

        elif loss == 'cosine':
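            # Cosine loss: one minus the mean cosine similarity between each sample and its
            # reconstruction; eps keeps the norms away from zero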

            mod_x = Tensor.sqrt(Tensor.sum(var_x ** 2, 1) + eps)
            mod_x_tilde = Tensor.sqrt(Tensor.sum(x_tilde ** 2, 1) + eps)
            loss_backward = 1 - Tensor.mean(Tensor.diag(Tensor.dot(var_x, x_tilde.T)) / (mod_x * mod_x_tilde))

            mod_y = Tensor.sqrt(Tensor.sum(var_y ** 2, 1) + eps)
            mod_y_tilde = Tensor.sqrt(Tensor.sum(y_tilde ** 2, 1) + eps)
            loss_forward = 1 - Tensor.mean(Tensor.diag(Tensor.dot(var_y, y_tilde.T)) / (mod_y * mod_y_tilde))

            loss = Tensor.mean(loss_forward * loss_backward.T)

        else:
            raise Exception('Loss not recognized')

        # loss -= Trainer.add_negative(var_x, x_tilde, None)
        # loss -= Trainer.add_negative(var_y, y_tilde, None)

        print('Adding regularization')

        # Add the regularization method computations to the loss
        regularizations = [regularization_method.compute(symmetric_double_encoder, params)
                           for regularization_method in regularization_methods
                           if regularization_method.weight != 0]

        print('Regularization number = {0}'.format(len(regularizations)))

        if len(regularizations) > 0:
            loss += Tensor.sum(regularizations, dtype=Tensor.config.floatX)

        print('Calculating gradients')

        # Compute the gradients for stochastic gradient descent;
        # the result is one gradient per parameter of the cross encoder
        gradients = Tensor.grad(loss, params)

        if strategy == 'SGD':
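            # Plain stochastic gradient descent, with optional classical momentum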

            if hyper_parameters.momentum > 0:

                print('Adding momentum')

                updates = OrderedDict()
                zipped = zip(params, gradients, model_updates)
                for param, gradient, model_update in zipped:
                    update, delta = Trainer._calc_update(learning_rate, gradient, param,
                                                         last_layer=last_layer)
                    delta = hyper_parameters.momentum * model_update - delta

                    updates[param] = param + delta
                    updates[model_update] = delta

            else:
                # Generate the update for each parameter; each update is a single step of the descent
                updates = OrderedDict()
                for param, gradient in zip(params, gradients):
                    update, delta = Trainer._calc_update(learning_rate, gradient, param,
                                                         last_layer=last_layer)
                    updates[param] = update

        elif strategy == 'adaGrad':
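            # AdaGrad: accumulate squared gradients and divide the learning rate by the
            # square root of the accumulator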
            updates = OrderedDict()
            zipped = zip(params, gradients, model_updates)
            for ndx, (param, gradient, accumulated_gradient) in enumerate(zipped):
                agrad = accumulated_gradient + gradient ** 2
                effective_learning_rate = (learning_rate / (Tensor.sqrt(agrad) + eps))
                update, delta = Trainer._calc_update(effective_learning_rate, gradient, param, last_layer=last_layer)
                # delta = effective_learning_rate * gradient
                updates[param] = update
                updates[accumulated_gradient] = agrad

        elif strategy == 'RMSProp':
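            # RMSProp: exponentially decaying average (decay rho) of the squared gradients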
            updates = OrderedDict()
            zipped = zip(params, gradients, model_updates)
            for ndx, (param, gradient, accumulated_gradient) in enumerate(zipped):
                agrad = rho * accumulated_gradient + (1 - rho) * gradient ** 2
                effective_learning_rate = (learning_rate / (Tensor.sqrt(agrad) + eps))
                update, delta = Trainer._calc_update(effective_learning_rate, gradient, param, last_layer=last_layer)
                # delta = effective_learning_rate * gradient
                updates[param] = update
                updates[accumulated_gradient] = agrad

        elif strategy == 'RMSProp2':
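            # RMSProp variant: the full squared gradient is added on top of the decayed accumulator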
            updates = OrderedDict()
            zipped = zip(params, gradients, model_updates)
            for ndx, (param, gradient, accumulated_gradient) in enumerate(zipped):
                agrad = rho * accumulated_gradient + gradient ** 2
                effective_learning_rate = (learning_rate / (Tensor.sqrt(agrad) + eps))
                update, delta = Trainer._calc_update(effective_learning_rate, gradient, param, last_layer=last_layer)
                # delta = effective_learning_rate * gradient
                updates[param] = update
                updates[accumulated_gradient] = agrad

        elif strategy == 'adaDelta':
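            # AdaDelta: the step size is the square root of the ratio between the accumulated
            # squared deltas and the accumulated squared gradients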
            updates = OrderedDict()
            zipped = zip(params, gradients, model_updates, model_deltas)
            for ndx, (param, gradient, accumulated_gradient, accumulated_delta) in enumerate(zipped):
                agrad = rho * accumulated_gradient + (1 - rho) * gradient ** 2
                # delta = Tensor.sqrt((accumulated_delta + eps) / (agrad + eps)) * gradient
                step_size = Tensor.sqrt((accumulated_delta + eps) / (agrad + eps))
                update, delta = Trainer._calc_update(step_size, gradient, param, last_layer=last_layer)
                updates[param] = update
                updates[accumulated_gradient] = agrad
                updates[accumulated_delta] = rho * accumulated_delta + (1 - rho) * (delta ** 2)

        elif strategy == 'adam':
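            # Adam: bias-corrected estimates of the first and second moments of the gradient;
            # t (the function input) is the current time step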
            updates = OrderedDict()
            zipped = zip(params, gradients, model_updates, model_deltas)
            for ndx, (param, gradient, accumulated_gradient, accumulated_delta) in enumerate(zipped):
                moment_1 = bias_1 * accumulated_gradient + (1 - bias_1) * gradient
                moment_2 = bias_2 * accumulated_delta + (1 - bias_2) * gradient ** 2
                corrected_moment_1 = moment_1 / Tensor.cast((1 - bias_1 ** t), Tensor.config.floatX)
                corrected_moment_2 = moment_2 / Tensor.cast((1 - bias_2 ** t), Tensor.config.floatX)
                g = corrected_moment_1 / (Tensor.sqrt(corrected_moment_2 + eps))
                update, delta = Trainer._calc_update(learning_rate, g, param, last_layer=last_layer)

                updates[param] = update
                updates[accumulated_gradient] = moment_1
                updates[accumulated_delta] = moment_2
        elif strategy == 'SGDCayley':
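            # SGDCayley: the first-layer weight matrices get a 'Cayley' update (presumably to keep
            # them orthogonal), all other parameters get a regular SGD update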
            updates = OrderedDict()
            for param, gradient in zip(params, gradients):

                if param.name == 'Wx_layer0' or param.name == 'Wy_layer0':
                    update, delta = Trainer._calc_update(learning_rate, gradient, param, 'Cayley',
                                                         last_layer=last_layer)
                else:
                    update, delta = Trainer._calc_update(learning_rate, gradient, param, 'Regular',
                                                         last_layer=last_layer)

                updates[param] = update

        else:
            msg = 'Unknown optimization strategy'
            OutputLog().write(msg)
            raise Exception(msg)

        print('Building function')

        variance_hidden_x = Tensor.var(x_hidden, axis=0)
        variance_hidden_y = Tensor.var(y_hidden, axis=0)

        if moving_averages is not None:
            Trainer._add_moving_averages(moving_averages, updates, number_of_batches)

        update_mean = []
        update_var = []
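        # Monitor the mean and variance of the absolute update applied to each weight matrix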
        for param in params:
            if 'W' in param.name:
                OutputLog().write('Adding weight: {0}'.format(param.name))
                update_mean.append(Tensor.mean(abs(updates[param])))
                update_var.append(Tensor.var(abs(updates[param])))

        update_mean = Tensor.stacklists(update_mean)
        update_var = Tensor.stacklists(update_var)

        # Build the Theano function
        # input : the time step t (a scalar)
        # outputs : both losses, the hidden-representation variances and activations,
        #           the update statistics and the regularization terms
        # updates : gradient descent updates for all params

        model = function(inputs=[t],
                         outputs=[Tensor.mean(loss_backward),
                                  Tensor.mean(loss_forward),
                                  Tensor.mean(variance_hidden_x),
                                  Tensor.mean(variance_hidden_y),
                                  x_hidden,
                                  y_hidden, update_mean, update_var] + regularizations + [t],
                         updates=updates)

        return model
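
A minimal usage sketch (assuming Trainer._build_model is reachable as shown and that the double encoder, parameter list, regularization objects and accumulator shared variables are already constructed; every name below is illustrative, not part of the original code):

    # Hypothetical call site -- argument values are assumptions for illustration only.
    model = Trainer._build_model(hyper_parameters, learning_rate, double_encoder, params,
                                 regularization_methods, model_updates, model_deltas,
                                 moving_averages=None, number_of_batches=n_batches,
                                 strategy='adam')

    for step in range(1, number_of_epochs + 1):
        # The single input is the time step t, which drives Adam's bias correction.
        outputs = model(float(step))
        loss_backward_value, loss_forward_value = outputs[0], outputs[1]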