Example #1
def generation_simple(z_list, n_latent, n_out, y):
    logger.info('generate output without MLP')
    hid_to_out = Linear(name='hidDecoder_to_output', input_dim=n_latent, output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')
    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for z in z_list:
        lin_out = hid_to_out.apply(z)
        y_hat = mysigmoid.apply(lin_out)  # reconstruction of y
        logpy_xz = -cross_entropy_loss(y_hat, y)
        agg_logpy_xz += logpy_xz
        agg_y_hat += y_hat
    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
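A hedged usage sketch for generation_simple, assuming the Blocks/Theano context above (Linear, Logistic, initialize and cross_entropy_loss come from this codebase); the dimensions are illustrative:

# Hypothetical call: z_list holds Theano matrices of posterior samples
# with shape (batch, n_latent); y is the (batch, n_out) target matrix.
import theano.tensor as T

z1 = T.matrix('z1')
z2 = T.matrix('z2')
y = T.matrix('targets')
agg_y_hat, agg_logpy_xz = generation_simple([z1, z2], n_latent=100,
                                            n_out=102, y=y)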
Example #2
def softmax_layer_old(h, y, hidden_size, num_targets, cost_fn='softmax'):
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size, output_dim=num_targets)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    y_pred = T.argmax(linear_output, axis=1)
    label_of_predicted = debug_print(y[T.arange(y.shape[0]), y_pred], 'label_of_predicted', False)
    pat1 = T.mean(label_of_predicted)
    updates = {}
    if 'softmax' in cost_fn:
        y_hat = Logistic().apply(linear_output)
        y_hat.name = 'y_hat'
        cost = cross_entropy_loss(y_hat, y)
    else:
        cost, updates = ranking_loss(linear_output, y)
    cost.name = 'cost'
    pat1.name = 'precision@1'
    return cost, pat1, updates
Example #3
def generation(z_list, n_latent, hu_decoder, n_out, y):
    logger.info('in generation: n_latent: %d, hu_decoder: %d', n_latent, hu_decoder)
    if hu_decoder == 0:
        return generation_simple(z_list, n_latent, n_out, y)
    mlp1 = MLP(activations=[Rectifier()], dims=[n_latent, hu_decoder], name='latent_to_hidDecoder')
    initialize([mlp1])
    hid_to_out = Linear(name='hidDecoder_to_output', input_dim=hu_decoder, output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')
    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for z in z_list:
        y_hat = mysigmoid.apply(hid_to_out.apply(mlp1.apply(z)))  # reconstruction of y
        # note: unlike generation_simple, the cross-entropy here is not negated
        agg_logpy_xz += cross_entropy_loss(y_hat, y)
        agg_y_hat += y_hat

    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
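generation adds a Rectifier MLP between the latent code and the output layer, and falls back to generation_simple when hu_decoder == 0. Example #26 below calls it like this (shown here as a sketch; all names are as defined there):

# Sketch of the call site from build_model_new (Example #26):
zl = [T.concatenate([z_prior, feature_vec], axis=1)]
y_hat, logpy_xz = generation(zl, n_latent=n_latent_z + feature_vec_len,
                             hu_decoder=hu_decoder, n_out=num_targets, y=y)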
Example #4
def softmax_layer(h, y, hidden_size, num_targets, cost_fn='cross'):
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size, output_dim=num_targets)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    y_pred = T.argmax(linear_output, axis=1)
    label_of_predicted = debug_print(y[T.arange(y.shape[0]), y_pred], 'label_of_predicted', False)
    pat1 = T.mean(label_of_predicted)
    updates = None
    if 'ranking' in cost_fn:
        cost, updates = ranking_loss(linear_output, y)
        print('using ranking loss function!')
    else:
        y_hat = Logistic().apply(linear_output)
        y_hat.name = 'y_hat'
        cost = cross_entropy_loss(y_hat, y)
    cost.name = 'cost'
    pat1.name = 'precision@1'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(linear_output, 0.5))
    misclassify_rate.name = 'error_rate'
    return cost, pat1, updates, misclassify_rate
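The cost_fn string selects between the sigmoid cross-entropy path and the ranking-loss path. A hedged sketch of both call forms, with illustrative sizes (h and y are the Theano variables fed to the layer):

# Hypothetical calls; 'cross' takes the else-branch, 'ranking' the if-branch.
cost, pat1, updates, err_rate = softmax_layer(h, y, hidden_size=300,
                                              num_targets=102,
                                              cost_fn='cross')
cost, pat1, updates, err_rate = softmax_layer(h, y, hidden_size=300,
                                              num_targets=102,
                                              cost_fn='ranking')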
Example #5
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        self.dim = dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        children = [self.activation, self.gate_activation]
        kwargs.setdefault('children', []).extend(children)
        super(LSTMConv, self).__init__(**kwargs)
Example #6
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        self.dim = dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        children = ([self.activation, self.gate_activation] +
                    kwargs.get('children', []))
        super(LSTM, self).__init__(children=children, **kwargs)
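Examples #5 and #6 merge the activation bricks into the parent's children in two equivalent ways; both keep any children the caller already supplied. A minimal standalone illustration of the setdefault pattern:

# Plain-Python illustration of kwargs.setdefault('children', []).extend(...):
kwargs = {'children': ['caller_brick']}
children = ['activation', 'gate_activation']
kwargs.setdefault('children', []).extend(children)
print(kwargs['children'])  # ['caller_brick', 'activation', 'gate_activation']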
Example #7
    def __init__(self, dim, activation=None, gate_activation=None,
                 **kwargs):
        super(GatedRecurrent, self).__init__(**kwargs)
        self.dim = dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        self.children = [activation, gate_activation]
Example #8
def test_activations():
    x = tensor.vector()
    x_val = numpy.random.rand(8).astype(theano.config.floatX)
    exp_x_val = numpy.exp(x_val)

    assert_allclose(x_val, Identity().apply(x).eval({x: x_val}))
    assert_allclose(numpy.tanh(x_val), Tanh().apply(x).eval({x: x_val}),
                    rtol=1e-06)
    assert_allclose(numpy.log(1 + exp_x_val),
                    Softplus().apply(x).eval({x: x_val}), rtol=1e-6)
    assert_allclose(exp_x_val / numpy.sum(exp_x_val),
                    Softmax().apply(x).eval({x: x_val}).flatten(), rtol=1e-6)
    assert_allclose(1.0 / (1.0 + numpy.exp(-x_val)),
                    Logistic().apply(x).eval({x: x_val}), rtol=1e-6)
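The final assertion checks the logistic sigmoid against its definition; it can also be cross-checked through the identity sigmoid(x) = (tanh(x / 2) + 1) / 2. A numpy-only sketch:

import numpy

x = numpy.linspace(-3.0, 3.0, 7)
sigmoid = 1.0 / (1.0 + numpy.exp(-x))
via_tanh = 0.5 * (numpy.tanh(x / 2.0) + 1.0)
assert numpy.allclose(sigmoid, via_tanh)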
Example #9
    def __init__(self, x_dim, hidden_layers, hidden_act, z_dim, **kwargs):
        super(DVAE, self).__init__([], [], **kwargs)

        inits = {
            #'weights_init': IsotropicGaussian(std=0.1),
            'weights_init': RWSInitialization(factor=1.),
            'biases_init': Constant(0.0),
        }

        hidden_act = [hidden_act] * len(hidden_layers)

        q_mlp = BatchNormalizedMLP(hidden_act + [Logistic()],
                                   [x_dim] + hidden_layers + [z_dim], **inits)
        #q_mlp = MLP(hidden_act+[Logistic()], [x_dim]+hidden_layers+[z_dim], **inits)
        p_mlp = BatchNormalizedMLP(hidden_act + [Logistic()],
                                   [z_dim] + hidden_layers + [x_dim], **inits)
        #p_mlp = MLP(hidden_act+[Logistic()], [z_dim]+hidden_layers+[x_dim], **inits)

        self.q = BernoulliLayer(q_mlp, name="q")
        self.p = BernoulliLayer(p_mlp, name="p")
        self.p_top = BernoulliTopLayer(z_dim, biases_init=Constant(0.0))

        self.children = [self.p_top, self.p, self.q]
Example #10
    def __init__(self, image_dimension, **kwargs):

        layers = []
        
        #############################################
        # a first block with 2 convolutions of 32 (3, 3) filters
        layers.append(Convolutional((3, 3), 32, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 32, border_mode='half'))
        layers.append(Rectifier())

        # maxpool with size=(2, 2)
        layers.append(MaxPooling((2, 2)))

        #############################################
        # a 2nd block with 3 convolutions of 64 (3, 3) filters
        layers.append(Convolutional((3, 3), 64, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 64, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 64, border_mode='half'))
        layers.append(Rectifier())
        
        # maxpool with size=(2, 2)
        layers.append(MaxPooling((2, 2)))

        #############################################
        # a 3rd block with 4 convolutions of 128 (3, 3) filters
        layers.append(Convolutional((3, 3), 128, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 128, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 128, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 128, border_mode='half'))
        layers.append(Rectifier())
        
        # maxpool with size=(2, 2)
        layers.append(MaxPooling((2, 2)))

        self.conv_sequence = ConvolutionalSequence(layers, 3, image_size=image_dimension)

        flattener = Flattener()

        self.top_mlp = BatchNormalizedMLP(activations=[Rectifier(), Logistic()], dims=[500, 1])

        application_methods = [self.conv_sequence.apply, flattener.apply, self.top_mlp.apply]

        super(VGGNet, self).__init__(application_methods, biases_init=Constant(0), weights_init=Uniform(width=.1), **kwargs)
Example #11
    def create_base_model(self, x, y, input_dim):

        # Create the output of the MLP
        mlp = MLP([Tanh(), Tanh(), Logistic()], [input_dim, 100, 100, 1],
                  weights_init=IsotropicGaussian(0.001),
                  biases_init=Constant(0))
        mlp.initialize()
        probs = mlp.apply(x)
        #sq_err = SquaredError()
        cost = T.mean(T.sqr(y.flatten() - probs.flatten()))
        cost.name = 'cost'
        pred_out = probs > 0.5
        mis_cost = T.mean(T.neq(y.flatten(), pred_out.flatten()))
        mis_cost.name = 'MisclassificationRate'
        return mlp, cost, mis_cost
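A hedged sketch of compiling and evaluating the returned graph with Theano, from inside the owning class (input_dim and the data shapes are illustrative):

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('features')
y = T.vector('targets')
mlp, cost, mis_cost = self.create_base_model(x, y, input_dim=20)
f = theano.function([x, y], [cost, mis_cost], allow_input_downcast=True)
cost_val, err_val = f(np.random.rand(8, 20),
                      np.random.randint(0, 2, 8).astype('float32'))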
Example #12
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        self.dim = dim

        self.recurrent_weights_init = None
        self.initial_states_init = None

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        children = [activation, gate_activation] + kwargs.get('children', [])
        super(GatedRecurrent, self).__init__(children=children, **kwargs)
Example #13
    def create_model(self, x, y, input_dim, p):

        # Create the output of the MLP
        mlp = MLP([Tanh(), Tanh(), Logistic()], [input_dim, 200, 100, 1],
                  weights_init=IsotropicGaussian(0.01),
                  biases_init=Constant(0))
        mlp.initialize()
        probs = mlp.apply(x).sum()

        # Create the if-else cost function
        reward = (probs * y * 1.0) / p + (1 - probs) * (1 - y) * 1.0 / (1 - p)
        cost = -reward  # Negative of reward
        cost.name = "cost"

        return mlp, cost
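The reward divides by the class prior p, so a correct positive scores 1 / p and a correct negative 1 / (1 - p); in expectation over data with positive rate p, a perfect predictor therefore earns 1 + 1 = 2. A quick numpy check of that arithmetic:

import numpy as np

p = 0.3                               # assumed positive-class prior
y = (np.random.rand(100000) < p).astype(float)
probs = y                             # a perfect predictor
reward = probs * y / p + (1 - probs) * (1 - y) / (1 - p)
print(reward.mean())                  # approximately 2.0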
Example #14
    def __init__(self, **kwargs):

        children = []

        self.list_simple_joints = [19, 20, 8, 7] + range(16,19)[::-1] + range(4, 7)[::-1] + [1] + [15, 3, 2] + [0]

        self.simple_joints = {}
        for simple_joint_idx in self.list_simple_joints:
            self.simple_joints[simple_joint_idx] = [
                Convolutional(filter_size=(1, 1), num_filters=512,
                              border_mode=(0, 0), use_bias=True,
                              tied_biases=True,
                              name='fconv_' + str(simple_joint_idx),
                              biases_init=Constant(0.),
                              weights_init=IsotropicGaussian(0.01),
                              num_channels=512),
                Rectifier(name='fconv_relu' + str(simple_joint_idx)),
                Convolutional(filter_size=(1, 1), num_filters=1,
                              border_mode=(0, 0), use_bias=True,
                              tied_biases=True,
                              name='fconv1_' + str(simple_joint_idx),
                              biases_init=Constant(0.),
                              weights_init=IsotropicGaussian(0.01),
                              num_channels=512),
                Logistic(name='flogistic_' + str(simple_joint_idx))]
            children += self.simple_joints[simple_joint_idx]

        kwargs.setdefault('children', []).extend(children)
        super(top_direction_block, self).__init__(**kwargs)
Example #15
    def __init__(self, dim, activation=None, gate_activation=None,
                 model_type=6, ogates_zoneout=False, **kwargs):
        self.dim = dim
        self.model_type = model_type
        self.ogates_zoneout = ogates_zoneout

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        children = [self.activation, self.gate_activation]
        kwargs.setdefault('children', []).extend(children)
        super(ZoneoutLSTM, self).__init__(**kwargs)
Example #16
    def __init__(self, mlp, const=1e-5, **kwargs):
        super(PitchGaussianEmitter, self).__init__(**kwargs)
        self.mlp = mlp
        input_dim = self.mlp.output_dim
        self.const = const

        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, 1],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, 1],
                         name=self.name + "_sigma")
        self.binary = MLP(activations=[Logistic()],
                          dims=[input_dim, 1],
                          name=self.name + "_binary")

        self.children = [self.mlp, self.mu, self.sigma, self.binary]
Example #17
    def build_network(self,
                      num_labels,
                      features,
                      max_len=None,
                      hidden_units=None,
                      l2=None,
                      use_cnn=None,
                      cnn_filter_size=None,
                      cnn_pool_size=None,
                      cnn_num_filters=None,
                      cnn_filter_sizes=None,
                      embedding_size=None,
                      DEBUG=False):
        """ Build the neural network used for training.

        :param num_labels:      Number of labels to classify
        :param features:        the input features we use
        :param max_len:     Configured window-size
        :param hidden_units:    Number of units in the MLP's hidden layer
        :returns:               The cost function, the misclassification rate
                                function, the computation graph of the cost
                                function and the prediction function
        """
        logger.info(
            'building the network, with one CNN for left and one for right')
        hidden_units = hidden_units or self._config['hidden_units']
        logger.info('#hidden units: %d', hidden_units)
        # building the feature vector from input.
        mlp_in_e1, mlp_in_e2, mlp_in_dim = self.build_feature_vector_noMention(
            features)
        logger.info('feature vector size: %d', mlp_in_dim)

        mlp = MLP(activations=[Rectifier()],
                  dims=[mlp_in_dim, hidden_units],
                  seed=self.curSeed)
        initialize([mlp])
        before_out_e1 = mlp.apply(mlp_in_e1)
        before_out_e2 = mlp.apply(mlp_in_e2)
        hidden_to_output = Linear(name='hidden_to_output',
                                  input_dim=hidden_units,
                                  output_dim=num_labels)
        initialize([hidden_to_output])
        linear_output_e1 = hidden_to_output.apply(before_out_e1)
        linear_output_e2 = hidden_to_output.apply(before_out_e2)
        linear_output_e1.name = 'linear_output_e1'
        linear_output_e2.name = 'linear_output_e2'

        y_hat_e1 = Logistic(name='logistic1').apply(linear_output_e1)
        y_hat_e2 = Logistic(name='logistic2').apply(linear_output_e2)
        y_hat_e1.name = 'y_hat_e1'
        y_hat_e2.name = 'y_hat_e2'
        y_hat_e1 = debug_print(y_hat_e1, 'y_1', DEBUG)
        return y_hat_e1, y_hat_e2, before_out_e1, before_out_e2
Example #18
    def create_model(self, x, y, input_dim, p):

        # Create the output of the MLP
        mlp = MLP(
            [Rectifier(), Rectifier(), Logistic()], [input_dim, 150, 100, 1],
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(0))
        mlp.initialize()
        probs = 1 - mlp.apply(x)
        y = y.dimshuffle(0, 'x')
        # Create the if-else cost function
        pos_ex = (y * probs) / p
        neg_ex = (1 - y) * (1 - probs) / np.float32(1 - p)
        reward = pos_ex + neg_ex
        cost = reward  # probs was inverted above, so the reward serves directly as the cost
        cost.name = "cost"

        return mlp, cost
Example #19
    def create_model(self):
        x = self.x
        y = self.y
        input_dim = self.input_dim
        p = self.p
        mlp = MLP(
            [Rectifier(), Rectifier(), Logistic()], [input_dim, 100, 80, 1],
            weights_init=IsotropicGaussian(0.001),
            biases_init=Constant(0))
        mlp.initialize()
        self.mlp = mlp
        probs = mlp.apply(x)
        probs.name = "score"
        y = y.dimshuffle(0, 'x')
        # Create the if-else cost function
        pos_ex = (y * probs) / p
        neg_ex = (1 - y) * (1 - probs) / np.float32(1 - p)
        reward = pos_ex + neg_ex
        cost = reward  # note: the reward is used directly as the cost here
        cost.name = "cost"
        return cost, probs
Example #20
    def __init__(self,
                 dim,
                 model_type,
                 update_prob,
                 activation=None,
                 gate_activation=None,
                 **kwargs):
        self.dim = dim
        self.model_type = model_type
        self.update_prob = update_prob

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        children = [self.activation, self.gate_activation]
        kwargs.setdefault('children', []).extend(children)
        super(DropLSTM, self).__init__(**kwargs)
Example #21
    def __init__(self, x_dim, hidden_layers, hidden_act, z_dim, batch_norm=False, **kwargs):
        super(VAE, self).__init__([], [], **kwargs)

        inits = {
            'weights_init': IsotropicGaussian(std=0.1),
            #'weights_init': RWSInitialization(factor=1.),
            'biases_init': Constant(0.0),
        }

        hidden_act = [hidden_act] * len(hidden_layers)

        assert not batch_norm
        q_mlp = MLP(hidden_act, [x_dim] + hidden_layers, **inits)
        p_mlp = MLP(hidden_act + [Logistic()],
                    [z_dim] + hidden_layers + [x_dim], **inits)

        self.q = GaussianLayer(z_dim, q_mlp, **inits)
        self.p = BernoulliLayer(p_mlp, **inits)

        self.prior_log_sigma = numpy.zeros(z_dim)
        self.prior_mu = numpy.zeros(z_dim)

        self.children = [self.p, self.q]
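Example #28 below instantiates this brick; the call looks like the following (the sizes here are illustrative, and batch_norm must stay False per the assert above):

model = VAE(x_dim=784, hidden_layers=[200, 200], hidden_act=Tanh(),
            z_dim=100, batch_norm=False)
model.initialize()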
Example #22
def test_activations():
    x = tensor.vector()
    x_val = numpy.random.rand(8).astype(theano.config.floatX)
    exp_x_val = numpy.exp(x_val)

    assert_allclose(x_val, Identity().apply(x).eval({x: x_val}))
    assert_allclose(numpy.tanh(x_val), Tanh().apply(x).eval({x: x_val}),
                    rtol=1e-06)
    assert_allclose(numpy.log(1 + exp_x_val),
                    Softplus().apply(x).eval({x: x_val}), rtol=1e-6)
    assert_allclose(exp_x_val / numpy.sum(exp_x_val),
                    Softmax().apply(x).eval({x: x_val}).flatten(), rtol=1e-6)
    assert_allclose(1.0 / (1.0 + numpy.exp(-x_val)),
                    Logistic().apply(x).eval({x: x_val}), rtol=1e-6)
    leaky_out_1 = x_val - 0.5
    leaky_out_1[leaky_out_1 < 0] *= 0.01
    assert_allclose(leaky_out_1,
                    LeakyRectifier().apply(x).eval({x: x_val - 0.5}))
    leaky_out_2 = x_val - 0.5
    leaky_out_2[leaky_out_2 < 0] *= 0.05
    assert_allclose(leaky_out_2,
                    LeakyRectifier(leak=0.05).apply(x).eval({x: x_val - 0.5}))
Example #23
    def __init__(self,
                 dim,
                 attended_dim,
                 activation=None,
                 gate_activation=None,
                 **kwargs):
        super(GRU, self).__init__(**kwargs)
        self.dim = dim
        self.attended_dim = attended_dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        self.initial_transformer = MLP(activations=[Tanh()],
                                       dims=[attended_dim, self.dim],
                                       name='state_initializer')

        self.children = [activation, gate_activation, self.initial_transformer]
Example #24
    def __init__(self, embedding_dim, state_dim, **kwargs):
        super(BidirectionalEncoderSigmoid, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        curSeed = 1791095845
        self.rng = numpy.random.RandomState(curSeed)

        self.bidir = BidirectionalWMT15(
            GatedRecurrentWithZerosAtMask(activation=Logistic(),
                                          dim=state_dim))
        self.fwd_fork = Fork([
            name
            for name in self.bidir.prototype.apply.sequences if name != 'mask'
        ],
                             prototype=Linear(),
                             name='fwd_fork')
        self.back_fork = Fork([
            name
            for name in self.bidir.prototype.apply.sequences if name != 'mask'
        ],
                              prototype=Linear(),
                              name='back_fork')

        #self.children = [self.lookup, self.bidir,
        self.children = [self.bidir, self.fwd_fork, self.back_fork]

        self._push_allocation_config()  # maybe not necessary? (maybe only necessary for decoder)

        print "RNN seed: " + str(self.rng.get_state()[1][0])
        # initialization of parameters
        self.weights_init = IsotropicGaussian()
        self.biases_init = Constant(0)
        self.push_initialization_config()
        self.bidir.prototype.weights_init = Orthogonal()
        self.initialize()
Example #25
def test_bernoulli_layer():
    # Setup layer
    dim_y = 50
    dim_x = 100

    mlp = MLP([Logistic()], [dim_y, dim_x], **inits)

    l = BernoulliLayer(mlp, name="layer", **inits)
    l.initialize()

    y = tensor.fmatrix('y')
    x_expected = l.sample_expected(y)
    x, x_log_prob = l.sample(y)

    do = theano.function([y], [x_expected, x, x_log_prob],
                         allow_input_downcast=True)

    y = numpy.eye(50, dtype=numpy.float32)

    x_expected, x, x_log_prob = do(y)

    assert x_expected.shape == (50, dim_x)
    assert x.shape == (50, dim_x)
    assert x_log_prob.shape == (50, )
Example #26
def build_model_new(fea2obj,
                    num_targets,
                    config,
                    kl_weight,
                    entropy_weight,
                    deterministic=False,
                    test=False):
    hidden_size = config['hidden_units'].split()
    use_highway = str_to_bool(
        config['use_highway']) if 'use_highway' in config else False
    use_gaus = str_to_bool(
        config['use_gaus']) if 'use_gaus' in config else False
    use_rec = str_to_bool(config['use_rec']) if 'use_rec' in config else True
    n_latent_z = int(config['n_latent']) if 'use_gaus' in config else 0
    use_noise = str_to_bool(
        config['use_noise']) if 'use_noise' in config else False
    use_vae = str_to_bool(config['use_vae']) if 'use_vae' in config else False
    hu_decoder = int(
        config['hu_decoder']) if 'hu_decoder' in config else hidden_size
    logger.info(
        'use_gaus: %s, use_rec: %s, use_noise: %s, use_vae: %s, hidden_size: %s, n_latent_z: %d, hu_decoder: %s, hu_encoder: %s',
        use_gaus, use_rec, use_noise, use_vae, hidden_size, n_latent_z,
        hu_decoder, hidden_size)
    init_with_type = str_to_bool(
        config['init_with_type']) if 'init_with_type' in config else False
    y = T.matrix('targets', dtype='int32')

    drop_prob = float(config['dropout']) if 'dropout' in config else 0

    #build the feature vector with one model, e.g., with cnn or mean or lstm
    feature_vec, feature_vec_len = build_feature_vec(fea2obj, config)

    #drop out
    if drop_prob > 0:
        mask = T.cast(
            srng.binomial(n=1, p=1 - drop_prob, size=feature_vec.shape),
            'float32')
        if test:
            feature_vec *= (1 - drop_prob)
        else:
            feature_vec *= mask

    #Highway network
    if use_highway:
        g_mlp = MLP(activations=[Rectifier()],
                    dims=[feature_vec_len, feature_vec_len],
                    name='g_mlp')
        t_mlp = MLP(activations=[Logistic()],
                    dims=[feature_vec_len, feature_vec_len],
                    name='t_mlp')
        initialize([g_mlp, t_mlp])
        t = t_mlp.apply(feature_vec)
        z = t * g_mlp.apply(feature_vec) + (1. - t) * feature_vec
        feature_vec = z

    #MLP(s)
    logger.info('feature vec length = %s and hidden layer units = %s',
                feature_vec_len, ' '.join(hidden_size))
    if len(hidden_size) > 1:
        #2 MLP on feature vector
        mlp = MLP(
            activations=[Rectifier(), Rectifier()],
            dims=[feature_vec_len,
                  int(hidden_size[0]),
                  int(hidden_size[1])],
            name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = int(hidden_size[1])
    else:
        hidden_size = int(hidden_size[0])
        mlp = MLP(activations=[Rectifier()],
                  dims=[feature_vec_len, hidden_size],
                  name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = hidden_size

    #compute y_hat initial guess
    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=last_hidden_size,
                              output_dim=num_targets)

    typemfile = None
    if init_with_type:
        typemfile = config['dsdir'] + '/_typematrix.npy'
        #typemfile = config['dsdir'] + '/_typeCooccurrMatrix.npy'

    initialize_lasthid(hidden_to_output, typemfile)
    #         initialize([hidden_to_output])

    y_hat_init = Logistic().apply(hidden_to_output.apply(before_out))
    y_hat_init.name = 'y_hat_init'
    y_hat_init = debug_print(y_hat_init, 'yhat_init', False)
    logpy_xz_init = cross_entropy_loss(y_hat_init, y)
    logpy_xz = logpy_xz_init
    y_hat_recog = y_hat_init
    y_hat = y_hat_init
    KLD = 0

    if use_gaus:
        if use_vae:
            logger.info('using VAE')
            vae_conditional = str_to_bool(config['vae_cond'])
            if vae_conditional:
                y_hat, logpy_xz, KLD, y_hat_recog = build_vae_conditoinal(
                    kl_weight,
                    entropy_weight,
                    y_hat_init,
                    feature_vec,
                    feature_vec_len,
                    config,
                    y,
                    test=test,
                    deterministic=deterministic,
                    num_targets=num_targets,
                    n_latent_z=n_latent_z,
                    hidden_size=hidden_size,
                    hu_decoder=hu_decoder)
            else:
                y_hat, logpy_xz, KLD = build_vae_basic(
                    kl_weight,
                    feature_vec,
                    feature_vec_len,
                    config,
                    y,
                    test=test,
                    deterministic=deterministic,
                    num_targets=num_targets,
                    n_latent_z=n_latent_z,
                    hidden_size=hidden_size,
                    hu_decoder=hu_decoder)
                y_hat_recog = y_hat
        else:
            if use_rec:
                logger.info('Not using VAE... but using recursion')
                prior_in = T.concatenate([feature_vec, y_hat_init], axis=1)
                mu_prior, log_sigma_prior = prior_network(
                    x=prior_in,
                    n_input=feature_vec_len + num_targets,
                    hu_encoder=hidden_size,
                    n_latent=n_latent_z)
                z_prior = sampler(mu_prior,
                                  log_sigma_prior,
                                  deterministic=deterministic,
                                  use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl,
                                             n_latent=n_latent_z +
                                             feature_vec_len,
                                             hu_decoder=hu_decoder,
                                             n_out=num_targets,
                                             y=y)
                y_hat = (y_hat + y_hat_init) / 2.
                logpy_xz = (logpy_xz + logpy_xz_init) / 2.
            else:
                prior_in = T.concatenate([feature_vec], axis=1)
                mu_prior, log_sigma_prior = prior_network(
                    x=prior_in,
                    n_input=feature_vec_len,
                    hu_encoder=hidden_size,
                    n_latent=n_latent_z)
                z_prior = sampler(mu_prior,
                                  log_sigma_prior,
                                  deterministic=deterministic,
                                  use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl,
                                             n_latent=n_latent_z +
                                             feature_vec_len,
                                             hu_decoder=hu_decoder,
                                             n_out=num_targets,
                                             y=y)

            y_hat_recog = y_hat

    y_hat = debug_print(y_hat, 'y_hat', False)

    pat1 = T.mean(y[T.arange(y.shape[0]), T.argmax(y_hat, axis=1)])
    max_type = debug_print(T.argmax(y_hat_recog, axis=1), 'max_type', False)
    pat1_recog = T.mean(y[T.arange(y.shape[0]), max_type])
    mean_cross = T.mean(logpy_xz)
    mean_kld = T.mean(KLD)
    cost = mean_kld + mean_cross
    cost.name = 'cost'
    mean_kld.name = 'kld'
    mean_cross.name = 'cross_entropy_loss'
    pat1.name = 'p@1'
    pat1_recog.name = 'p@1_recog'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(y_hat, 0.5))
    misclassify_rate.name = 'error_rate'

    return cost, pat1, y_hat, mean_kld, mean_cross, pat1_recog, misclassify_rate
Example #27
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                       name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(), presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
Example #28
def main(args):
    """Run experiment. """
    lr_tag = float_tag(args.learning_rate)

    x_dim, train_stream, valid_stream, test_stream = datasets.get_streams(
        args.data, args.batch_size)

    #------------------------------------------------------------
    # Setup model
    deterministic_act = Tanh
    deterministic_size = 1.

    if args.method == 'vae':
        sizes_tag = args.layer_spec.replace(",", "-")
        layer_sizes = [int(i) for i in args.layer_spec.split(",")]
        layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1]

        name = "%s-%s-%s-lr%s-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag)

        if args.activation == "tanh":
            hidden_act = Tanh()
        elif args.activation == "logistic":
            hidden_act = Logistic()
        elif args.activation == "relu":
            hidden_act = Rectifier()
        else:
            raise "Unknown hidden nonlinearity %s" % args.hidden_act

        model = VAE(x_dim=x_dim,
                    hidden_layers=layer_sizes,
                    hidden_act=hidden_act,
                    z_dim=z_dim,
                    batch_norm=args.batch_normalization)
        model.initialize()
    elif args.method == 'dvae':
        sizes_tag = args.layer_spec.replace(",", "-")
        layer_sizes = [int(i) for i in args.layer_spec.split(",")]
        layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1]

        name = "%s-%s-%s-lr%s-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag)

        if args.activation == "tanh":
            hidden_act = Tanh()
        elif args.activation == "logistic":
            hidden_act = Logistic()
        elif args.activation == "relu":
            hidden_act = Rectifier()
        else:
            raise "Unknown hidden nonlinearity %s" % args.hidden_act

        model = DVAE(x_dim=x_dim,
                     hidden_layers=layer_sizes,
                     hidden_act=hidden_act,
                     z_dim=z_dim,
                     batch_norm=args.batch_normalization)
        model.initialize()
    elif args.method == 'rws':
        sizes_tag = args.layer_spec.replace(",", "-")
        qbase = "" if not args.no_qbaseline else "noqb-"

        name = "%s-%s-%s-%slr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, qbase, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(args.layer_spec, x_dim,
                                           args.deterministic_layers,
                                           deterministic_act,
                                           deterministic_size)

        model = ReweightedWakeSleep(
            p_layers,
            q_layers,
            qbaseline=(not args.no_qbaseline),
        )
        model.initialize()
    elif args.method == 'bihm-rws':
        sizes_tag = args.layer_spec.replace(",", "-")
        name = "%s-%s-%s-lr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(args.layer_spec, x_dim,
                                           args.deterministic_layers,
                                           deterministic_act,
                                           deterministic_size)

        model = BiHM(
            p_layers,
            q_layers,
            l1reg=args.l1reg,
            l2reg=args.l2reg,
        )
        model.initialize()
    elif args.method == 'continue':
        import cPickle as pickle
        from os.path import basename, splitext

        with open(args.model_file, 'rb') as f:
            m = pickle.load(f)

        if isinstance(m, MainLoop):
            m = m.model

        model = m.get_top_bricks()[0]
        while len(model.parents) > 0:
            model = model.parents[0]

        assert isinstance(model, (BiHM, ReweightedWakeSleep, VAE))

        mname, _, _ = basename(args.model_file).rpartition("_model.pkl")
        name = "%s-cont-%s-lr%s-spl%s" % (mname, args.name, lr_tag,
                                          args.n_samples)
    else:
        raise ValueError("Unknown training method '%s'" % args.method)

    #------------------------------------------------------------

    x = tensor.matrix('features')

    #------------------------------------------------------------
    # Testset monitoring

    train_monitors = []
    valid_monitors = []
    test_monitors = []
    for s in [1, 10, 100, 1000]:
        log_p, log_ph = model.log_likelihood(x, s)
        log_p = -log_p.mean()
        log_ph = -log_ph.mean()
        log_p.name = "log_p_%d" % s
        log_ph.name = "log_ph_%d" % s

        #train_monitors += [log_p, log_ph]
        #valid_monitors += [log_p, log_ph]
        test_monitors += [log_p, log_ph]

    #------------------------------------------------------------
    # Z estimation
    #for s in [100000]:
    #    z2 = tensor.exp(model.estimate_log_z2(s)) / s
    #    z2.name = "z2_%d" % s
    #
    #    valid_monitors += [z2]
    #    test_monitors += [z2]

    #------------------------------------------------------------
    # Gradient and training monitoring

    if args.method in ['vae', 'dvae']:
        log_p_bound, gradients = model.get_gradients(x, args.n_samples)
        log_p_bound = -log_p_bound.mean()
        log_p_bound.name = "log_p_bound"
        cost = log_p_bound

        train_monitors += [
            log_p_bound,
            named(model.kl_term.mean(), 'kl_term'),
            named(model.recons_term.mean(), 'recons_term')
        ]
        valid_monitors += [
            log_p_bound,
            named(model.kl_term.mean(), 'kl_term'),
            named(model.recons_term.mean(), 'recons_term')
        ]
        test_monitors += [
            log_p_bound,
            named(model.kl_term.mean(), 'kl_term'),
            named(model.recons_term.mean(), 'recons_term')
        ]
    else:
        log_p, log_ph, gradients = model.get_gradients(x, args.n_samples)
        log_p = -log_p.mean()
        log_ph = -log_ph.mean()
        log_p.name = "log_p"
        log_ph.name = "log_ph"
        cost = log_ph

        train_monitors += [log_p, log_ph]
        valid_monitors += [log_p, log_ph]

    #------------------------------------------------------------
    # Detailed monitoring
    """
    n_layers = len(p_layers)

    log_px, w, log_p, log_q, samples = model.log_likelihood(x, n_samples)

    exp_samples = []
    for l in xrange(n_layers):
        e = (w.dimshuffle(0, 1, 'x')*samples[l]).sum(axis=1)
        e.name = "inference_h%d" % l
        e.tag.aggregation_scheme = aggregation.TakeLast(e)
        exp_samples.append(e)

    s1 = samples[1]
    sh1 = s1.shape
    s1_ = s1.reshape([sh1[0]*sh1[1], sh1[2]])
    s0, _ = model.p_layers[0].sample_expected(s1_)
    s0 = s0.reshape([sh1[0], sh1[1], s0.shape[1]])
    s0 = (w.dimshuffle(0, 1, 'x')*s0).sum(axis=1)
    s0.name = "inference_h0^"
    s0.tag.aggregation_scheme = aggregation.TakeLast(s0)
    exp_samples.append(s0)

    # Draw P-samples
    p_samples, _, _ = model.sample_p(100)
    #weights = model.importance_weights(samples)
    #weights = weights / weights.sum()

    for i, s in enumerate(p_samples):
        s.name = "psamples_h%d" % i
        s.tag.aggregation_scheme = aggregation.TakeLast(s)

    #
    samples = model.sample(100, oversample=100)

    for i, s in enumerate(samples):
        s.name = "samples_h%d" % i
        s.tag.aggregation_scheme = aggregation.TakeLast(s)
    """
    cg = ComputationGraph([cost])

    #------------------------------------------------------------

    if args.step_rule == "momentum":
        step_rule = Momentum(args.learning_rate, 0.95)
    elif args.step_rule == "rmsprop":
        step_rule = RMSProp(args.learning_rate)
    elif args.step_rule == "adam":
        step_rule = Adam(args.learning_rate)
    else:
        raise "Unknown step_rule %s" % args.step_rule

    #parameters = cg.parameters[:4] + cg.parameters[5:]
    parameters = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        parameters=parameters,
        gradients=gradients,
        step_rule=CompositeRule([
            #StepClipping(25),
            step_rule,
            #RemoveNotFinite(1.0),
        ]))

    #------------------------------------------------------------

    train_monitors += [
        aggregation.mean(algorithm.total_gradient_norm),
        aggregation.mean(algorithm.total_step_norm)
    ]

    #------------------------------------------------------------

    # Live plotting?
    plotting_extensions = []
    if args.live_plotting:
        plotting_extensions = [
            PlotManager(
                name,
                [
                    Plotter(channels=[[
                        "valid_%s" % cost.name, "valid_log_p"
                    ], ["train_total_gradient_norm", "train_total_step_norm"]],
                            titles=[
                                "validation cost",
                                "norm of training gradient and step"
                            ]),
                    DisplayImage(
                        [
                            WeightDisplay(model.p_layers[0].mlp.
                                          linear_transformations[0].W,
                                          n_weights=100,
                                          image_shape=(28, 28))
                        ]
                        #ImageDataStreamDisplay(test_stream, image_shape=(28,28))]
                    )
                ])
        ]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            ProgressBar(),
            TrainingDataMonitoring(
                train_monitors, prefix="train", after_epoch=True),
            DataStreamMonitoring(
                valid_monitors, data_stream=valid_stream, prefix="valid"),
            DataStreamMonitoring(test_monitors,
                                 data_stream=test_stream,
                                 prefix="test",
                                 after_epoch=False,
                                 after_training=True,
                                 every_n_epochs=10),
            #SharedVariableModifier(
            #    algorithm.step_rule.components[0].learning_rate,
            #    half_lr_func,
            #    before_training=False,
            #    after_epoch=False,
            #    after_batch=False,
            #    every_n_epochs=half_lr),
            TrackTheBest('valid_%s' % cost.name),
            Checkpoint(name + ".pkl", save_separately=['log', 'model']),
            FinishIfNoImprovementAfter('valid_%s_best_so_far' % cost.name,
                                       epochs=args.patience),
            FinishAfter(after_n_epochs=args.max_epochs),
            Printing()
        ] + plotting_extensions)
    main_loop.run()
Example #29
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_transpose_brick(4, 1, 64),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_transpose_brick(4, 2, 32),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_transpose_brick(5, 1, 32),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_transpose_brick(1, 1, 32),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(1, 1, NUM_CHANNELS),
        Logistic()
    ]

    decoder = Decoder(layers=layers,
                      num_channels=(NLAT + NEMB),
                      image_size=(1, 1),
                      weights_init=WEIGHTS_INIT,
                      biases_init=BIASES_INIT)
    decoder.initialize()
    decoder_fun = function([z, y], decoder.apply(z, embeddings))
    out = decoder_fun(z_hat, test_labels)

    # Discriminator

    layers = [
        conv_brick(5, 1, 32),
Example #30
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 att_dim,
                 maxout_dim,
                 representation_dim,
                 attention_strategy='content',
                 attention_sources='s',
                 readout_sources='sfa',
                 memory='none',
                 memory_size=500,
                 seq_len=50,
                 init_strategy='last',
                 theano_seed=None,
                 **kwargs):
        """Creates a new decoder brick without embedding.
        
        Args:
            vocab_size (int): Target language vocabulary size
            embedding_dim (int): Size of feedback embedding layer
            state_dim (int): Number of hidden units
            att_dim (int): Size of attention match vector
            maxout_dim (int): Size of maxout layer
            representation_dim (int): Dimension of source annotations
            attention_strategy (string): Which attention should be used
                                         cf.  ``_initialize_attention``
            attention_sources (string): Defines the sources used by the 
                                        attention model 's' for decoder
                                        states, 'f' for feedback
            readout_sources (string): Defines the sources used in the 
                                      readout network. 's' for decoder
                                      states, 'f' for feedback, 'a' for
                                      attention (context vector)
            memory (string): Which external memory should be used
                             (cf.  ``_initialize_attention``)
            memory_size (int): Size of the external memory structure
            seq_len (int): Maximum sentence length
            init_strategy (string): How to initialize the RNN state
                                    (cf.  ``GRUInitialState``)
            theano_seed: Random seed
        """
        super(NoLookupDecoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(attended_dim=state_dim,
                                          init_strategy=init_strategy,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')

        # Initialize the attention mechanism
        att_dim = att_dim if att_dim > 0 else state_dim
        self.attention, src_names = _initialize_attention(
            attention_strategy, seq_len, self.transition, representation_dim,
            att_dim, attention_sources, readout_sources, memory, memory_size)

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        maxout_dim = maxout_dim if maxout_dim > 0 else state_dim
        readout = Readout(
            source_names=src_names,
            readout_dim=embedding_dim,
            emitter=NoLookupEmitter(initial_output=-1,
                                    readout_dim=embedding_dim,
                                    cost_brick=SquaredError()),
            #                        cost_brick=CategoricalCrossEntropy()),
            feedback_brick=TrivialFeedback(output_dim=embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=maxout_dim, name='maxout_bias').apply,
                Maxout(num_pieces=2, name='maxout').apply,
                Linear(input_dim=maxout_dim // 2,
                       output_dim=embedding_dim,
                       use_bias=False,
                       name='softmax0').apply,
                Logistic(name='softmax1').apply
            ]),
            merged_dim=maxout_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]
Example #31
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name="linear1")
    brick2 = Bias(2, name="bias1")
    activation = Logistic(name="sigm")

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name="W_norm")
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex="W_no.?m")
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name="h2act")
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex="h2a.?t")
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)

    input1 = tensor.matrix("input1")
    input2 = tensor.matrix("input2")
    merge = Merge(["input1", "input2"], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
Example #32
	weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
	weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
l4 = Linear(name='l4', input_dim=500, output_dim=500,
	weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
l5 = Linear(name='l5', input_dim=500, output_dim=10,
	weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))

l1.initialize()
l2.initialize()
l3.initialize()
l4.initialize()
l5.initialize()

a1 = l1.apply(x.flatten(2)/255)
a1.name = 'a1'
n1, M1, S1 = normalize(a1, output_dim = 500)
o1 = Logistic().apply(n1)

a2 = l2.apply(o1)
n2, M2, S2  = normalize(a2, output_dim = 500)
o2 = Logistic().apply(n2)

a3 = l3.apply(o2)
n3, M3, S3  = normalize(a3, output_dim = 500)
o3 = Logistic().apply(n3)

a4 = l4.apply(o3)
n4, M4, S4  = normalize(a4, output_dim = 500)
o4 = Logistic().apply(n4)

a5 = l5.apply(o4)
n5, M5, S5 = normalize(a5, output_dim = 10)
Example #33
def create_model_bricks(z_dim, image_size, depth):

    g_image_size = image_size
    g_image_size2 = g_image_size // 2
    g_image_size3 = g_image_size // 4
    g_image_size4 = g_image_size // 8
    g_image_size5 = g_image_size // 16

    encoder_layers = []
    if depth > 0:
        encoder_layers = encoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=32,
                          name='conv1'),
            SpatialBatchNormalization(name='batch_norm1'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=32,
                          name='conv2'),
            SpatialBatchNormalization(name='batch_norm2'),
            Rectifier(),
            Convolutional(
                filter_size=(2, 2), step=(2, 2), num_filters=32, name='conv3'),
            SpatialBatchNormalization(name='batch_norm3'),
            Rectifier()
        ]
    if depth > 1:
        encoder_layers = encoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=64,
                          name='conv4'),
            SpatialBatchNormalization(name='batch_norm4'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=64,
                          name='conv5'),
            SpatialBatchNormalization(name='batch_norm5'),
            Rectifier(),
            Convolutional(
                filter_size=(2, 2), step=(2, 2), num_filters=64, name='conv6'),
            SpatialBatchNormalization(name='batch_norm6'),
            Rectifier()
        ]
    if depth > 2:
        encoder_layers = encoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=128,
                          name='conv7'),
            SpatialBatchNormalization(name='batch_norm7'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=128,
                          name='conv8'),
            SpatialBatchNormalization(name='batch_norm8'),
            Rectifier(),
            Convolutional(
                filter_size=(2, 2), step=(2, 2), num_filters=128,
                name='conv9'),
            SpatialBatchNormalization(name='batch_norm9'),
            Rectifier()
        ]
    if depth > 3:
        encoder_layers = encoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=256,
                          name='conv10'),
            SpatialBatchNormalization(name='batch_norm10'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=256,
                          name='conv11'),
            SpatialBatchNormalization(name='batch_norm11'),
            Rectifier(),
            Convolutional(filter_size=(2, 2),
                          step=(2, 2),
                          num_filters=256,
                          name='conv12'),
            SpatialBatchNormalization(name='batch_norm12'),
            Rectifier(),
        ]
    if depth > 4:
        encoder_layers = encoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=512,
                          name='conv13'),
            SpatialBatchNormalization(name='batch_norm13'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=512,
                          name='conv14'),
            SpatialBatchNormalization(name='batch_norm14'),
            Rectifier(),
            Convolutional(filter_size=(2, 2),
                          step=(2, 2),
                          num_filters=512,
                          name='conv15'),
            SpatialBatchNormalization(name='batch_norm15'),
            Rectifier()
        ]

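    # The decoder mirrors the encoder stage for stage, replacing each strided
    # convolution with a ConvolutionalTranspose that restores the spatial size.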
    decoder_layers = []
    if depth > 4:
        decoder_layers = decoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=512,
                          name='conv_n3'),
            SpatialBatchNormalization(name='batch_norm_n3'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=512,
                          name='conv_n2'),
            SpatialBatchNormalization(name='batch_norm_n2'),
            Rectifier(),
            ConvolutionalTranspose(
                filter_size=(2, 2),
                step=(2, 2),
                original_image_size=(g_image_size5, g_image_size5),
                num_filters=512,
                name='conv_n1'),
            SpatialBatchNormalization(name='batch_norm_n1'),
            Rectifier()
        ]

    if depth > 3:
        decoder_layers = decoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=256,
                          name='conv1'),
            SpatialBatchNormalization(name='batch_norm1'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=256,
                          name='conv2'),
            SpatialBatchNormalization(name='batch_norm2'),
            Rectifier(),
            ConvolutionalTranspose(
                filter_size=(2, 2),
                step=(2, 2),
                original_image_size=(g_image_size4, g_image_size4),
                num_filters=256,
                name='conv3'),
            SpatialBatchNormalization(name='batch_norm3'),
            Rectifier()
        ]

    if depth > 2:
        decoder_layers = decoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=128,
                          name='conv4'),
            SpatialBatchNormalization(name='batch_norm4'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=128,
                          name='conv5'),
            SpatialBatchNormalization(name='batch_norm5'),
            Rectifier(),
            ConvolutionalTranspose(
                filter_size=(2, 2),
                step=(2, 2),
                original_image_size=(g_image_size3, g_image_size3),
                num_filters=128,
                name='conv6'),
            SpatialBatchNormalization(name='batch_norm6'),
            Rectifier()
        ]

    if depth > 1:
        decoder_layers = decoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=64,
                          name='conv7'),
            SpatialBatchNormalization(name='batch_norm7'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=64,
                          name='conv8'),
            SpatialBatchNormalization(name='batch_norm8'),
            Rectifier(),
            ConvolutionalTranspose(
                filter_size=(2, 2),
                step=(2, 2),
                original_image_size=(g_image_size2, g_image_size2),
                num_filters=64,
                name='conv9'),
            SpatialBatchNormalization(name='batch_norm9'),
            Rectifier()
        ]

    if depth > 0:
        decoder_layers = decoder_layers + [
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=32,
                          name='conv10'),
            SpatialBatchNormalization(name='batch_norm10'),
            Rectifier(),
            Convolutional(filter_size=(3, 3),
                          border_mode=(1, 1),
                          num_filters=32,
                          name='conv11'),
            SpatialBatchNormalization(name='batch_norm11'),
            Rectifier(),
            ConvolutionalTranspose(
                filter_size=(2, 2),
                step=(2, 2),
                original_image_size=(g_image_size, g_image_size),
                num_filters=32,
                name='conv12'),
            SpatialBatchNormalization(name='batch_norm12'),
            Rectifier()
        ]

    decoder_layers = decoder_layers + [
        Convolutional(filter_size=(1, 1), num_filters=3, name='conv_out'),
        Logistic()
    ]

    print("creating model of depth {} with {} encoder and {} decoder layers".
          format(depth, len(encoder_layers), len(decoder_layers)))

    encoder_convnet = ConvolutionalSequence(
        layers=encoder_layers,
        num_channels=3,
        image_size=(g_image_size, g_image_size),
        use_bias=False,
        weights_init=IsotropicGaussian(0.033),
        biases_init=Constant(0),
        name='encoder_convnet')
    encoder_convnet.initialize()

    encoder_filters = numpy.prod(encoder_convnet.get_dim('output'))

    encoder_mlp = MLP(
        dims=[encoder_filters, 1000, z_dim],
        activations=[
            Sequence([BatchNormalization(1000).apply,
                      Rectifier().apply],
                     name='activation1'),
            Identity().apply
        ],
        weights_init=IsotropicGaussian(0.033),
        biases_init=Constant(0),
        name='encoder_mlp')
    encoder_mlp.initialize()

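    # encoder_mlp emits z_dim values while the decoder consumes z_dim // 2,
    # which suggests the encoder output packs two parameters (e.g. a mean and
    # a log-variance) per latent dimension.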
    decoder_mlp = BatchNormalizedMLP(
        activations=[Rectifier(), Rectifier()],
        dims=[encoder_mlp.output_dim // 2, 1000, encoder_filters],
        weights_init=IsotropicGaussian(0.033),
        biases_init=Constant(0),
        name='decoder_mlp')
    decoder_mlp.initialize()

    decoder_convnet = ConvolutionalSequence(
        layers=decoder_layers,
        num_channels=encoder_convnet.get_dim('output')[0],
        image_size=encoder_convnet.get_dim('output')[1:],
        use_bias=False,
        weights_init=IsotropicGaussian(0.033),
        biases_init=Constant(0),
        name='decoder_convnet')
    decoder_convnet.initialize()

    return encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp
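
A minimal usage sketch (the argument values are illustrative, not from the
source; image_size should be divisible by 2 ** depth so that the stride-2
stages downsample and upsample cleanly):

encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp = \
    create_model_bricks(z_dim=128, image_size=64, depth=4)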
Example #34
0
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Logistic(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x, call_id='brick1_call_id')
    h2 = activation.apply(h1, call_id='act')
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name='h2act')
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex='h2a.?t')
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    brick1_apply_variables = [cg.variables[1], cg.variables[8]]
    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    assert brick1_apply_variables == appli_filter(cg.variables)

    # Testing filtering by unbound application
    unbound_appli_filter = VariableFilter(applications=[Linear.apply])
    assert brick1_apply_variables == unbound_appli_filter(cg.variables)

    # Testing filtering by call identifier
    call_id_filter = VariableFilter(call_id='brick1_call_id')
    assert brick1_apply_variables == call_id_filter(cg.variables)

    input1 = tensor.matrix('input1')
    input2 = tensor.matrix('input2')
    merge = Merge(['input1', 'input2'], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(
        roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
Example #35
0
def create_model_brick():
    layers = [
        conv_brick(2, 1, 64),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(7, 2, 128),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(5, 2, 256),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(7, 2, 256),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(4, 1, 512),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(1, 1, 2 * NLAT)
    ]
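    # The final 1x1 convolution emits 2 * NLAT channels, presumably the mean
    # and log-sigma pairs from which GaussianConditional samples z.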
    encoder_mapping = ConvolutionalSequence(layers=layers,
                                            num_channels=NUM_CHANNELS,
                                            image_size=IMAGE_SIZE,
                                            use_bias=False,
                                            name='encoder_mapping')
    encoder = GaussianConditional(encoder_mapping, name='encoder')

    layers = [
        conv_transpose_brick(4, 1, 512),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_transpose_brick(7, 2, 256),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_transpose_brick(5, 2, 256),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_transpose_brick(7, 2, 128),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_transpose_brick(2, 1, 64),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(1, 1, NUM_CHANNELS),
        Logistic()
    ]
    decoder_mapping = ConvolutionalSequence(layers=layers,
                                            num_channels=NLAT,
                                            image_size=(1, 1),
                                            use_bias=False,
                                            name='decoder_mapping')
    decoder = DeterministicConditional(decoder_mapping, name='decoder')

    layers = [
        conv_brick(2, 1, 64),
        LeakyRectifier(leak=LEAK),
        conv_brick(7, 2, 128),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(5, 2, 256),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(7, 2, 256),
        bn_brick(),
        LeakyRectifier(leak=LEAK),
        conv_brick(4, 1, 512),
        bn_brick(),
        LeakyRectifier(leak=LEAK)
    ]
    x_discriminator = ConvolutionalSequence(layers=layers,
                                            num_channels=NUM_CHANNELS,
                                            image_size=IMAGE_SIZE,
                                            use_bias=False,
                                            name='x_discriminator')
    x_discriminator.push_allocation_config()

    layers = [
        conv_brick(1, 1, 1024),
        LeakyRectifier(leak=LEAK),
        conv_brick(1, 1, 1024),
        LeakyRectifier(leak=LEAK)
    ]
    z_discriminator = ConvolutionalSequence(layers=layers,
                                            num_channels=NLAT,
                                            image_size=(1, 1),
                                            use_bias=False,
                                            name='z_discriminator')
    z_discriminator.push_allocation_config()

    layers = [
        conv_brick(1, 1, 2048),
        LeakyRectifier(leak=LEAK),
        conv_brick(1, 1, 2048),
        LeakyRectifier(leak=LEAK),
        conv_brick(1, 1, 1)
    ]
    joint_discriminator = ConvolutionalSequence(
        layers=layers,
        num_channels=(x_discriminator.get_dim('output')[0] +
                      z_discriminator.get_dim('output')[0]),
        image_size=(1, 1),
        name='joint_discriminator')

    discriminator = XZJointDiscriminator(x_discriminator,
                                         z_discriminator,
                                         joint_discriminator,
                                         name='discriminator')

    ali = ALI(encoder,
              decoder,
              discriminator,
              weights_init=GAUSSIAN_INIT,
              biases_init=ZERO_INIT,
              name='ali')
    ali.push_allocation_config()
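    # After the allocation config has propagated, selectively re-enable biases
    # on the layers that are not followed by batch normalization.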
    encoder_mapping.layers[-1].use_bias = True
    encoder_mapping.layers[-1].tied_biases = False
    decoder_mapping.layers[-2].use_bias = True
    decoder_mapping.layers[-2].tied_biases = False
    x_discriminator.layers[0].use_bias = True
    x_discriminator.layers[0].tied_biases = True
    ali.initialize()
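    # Set the decoder's output-layer bias to the log-odds of the per-pixel
    # data marginals so the initial sigmoid output matches the dataset mean.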
    raw_marginals, = next(
        create_celeba_data_streams(500, 500)[0].get_epoch_iterator())
    b_value = get_log_odds(raw_marginals)
    decoder_mapping.layers[-2].b.set_value(b_value)

    return ali
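
get_log_odds is not defined in this excerpt; a sketch of the usual
computation, assuming raw_marginals is a batch of images whose per-pixel mean
is clipped away from 0 and 1 before taking the logit:

import numpy

def get_log_odds(raw_marginals):
    # Empirical per-pixel marginals, clipped for numerical stability.
    marginals = numpy.clip(raw_marginals.mean(axis=0), 1e-7, 1 - 1e-7)
    return numpy.log(marginals / (1 - marginals)).astype('float32')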
Example #36
0
def build_model_new(fea2obj, num_targets, config, kl_weight, entropy_weight, deterministic=False, test=False):
    hidden_size = config['hidden_units'].split()
    use_highway = str_to_bool(config['use_highway']) if 'use_highway' in config else False
    use_gaus = str_to_bool(config['use_gaus']) if 'use_gaus' in config else False
    use_rec = str_to_bool(config['use_rec']) if 'use_rec' in config else True
    n_latent_z = int(config['n_latent']) if 'n_latent' in config else 0
    use_noise = str_to_bool(config['use_noise']) if 'use_noise' in config else False
    use_vae = str_to_bool(config['use_vae']) if 'use_vae' in config else False
    hu_decoder = int(config['hu_decoder']) if 'hu_decoder' in config else hidden_size
    logger.info('use_gaus: %s, use_rec: %s, use_noise: %s, use_vae: %s, hidden_size: %s, n_latent_z: %d, hu_decoder: %s, hu_encoder: %s', use_gaus, use_rec, use_noise, use_vae, hidden_size, n_latent_z, hu_decoder, hidden_size)
    init_with_type = str_to_bool(config['init_with_type']) if 'init_with_type' in config else False
    y = T.matrix('targets', dtype='int32')
    
    drop_prob = float(config['dropout']) if 'dropout' in config else 0
    
    # build the feature vector with one encoder, e.g., a CNN, mean pooling, or an LSTM
    feature_vec, feature_vec_len = build_feature_vec(fea2obj, config)
    
    # dropout (non-inverted): zero out activations with probability drop_prob
    # at train time, scale by the keep probability at test time
    if drop_prob > 0:
        mask = T.cast(srng.binomial(n=1, p=1 - drop_prob, size=feature_vec.shape), 'float32')
        if test:
            feature_vec *= (1 - drop_prob)
        else:
            feature_vec *= mask
            

    # Highway network
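    # Highway gating: the transform gate t in (0, 1) blends the transformed
    # features with the untransformed input, z = t * H(x) + (1 - t) * x.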
    if use_highway:
        g_mlp = MLP(activations=[Rectifier()], dims=[feature_vec_len, feature_vec_len], name='g_mlp')
        t_mlp = MLP(activations=[Logistic()], dims=[feature_vec_len, feature_vec_len], name='t_mlp')
        initialize([g_mlp, t_mlp])
        t = t_mlp.apply(feature_vec)
        z = t * g_mlp.apply(feature_vec) + (1. - t) * feature_vec
        feature_vec = z
        
    # MLP(s)
    logger.info('feature vec length = %s and hidden layer units = %s', feature_vec_len, ' '.join(hidden_size))
    if len(hidden_size) > 1:
        # two-layer MLP on the feature vector
        mlp = MLP(activations=[Rectifier(), Rectifier()], dims=[feature_vec_len, int(hidden_size[0]), int(hidden_size[1])], name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = int(hidden_size[1])
    else:
        hidden_size = int(hidden_size[0])
        mlp = MLP(activations=[Rectifier()], dims=[feature_vec_len, hidden_size], name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = hidden_size

        
    # compute the initial guess for y_hat
    hidden_to_output = Linear(name='hidden_to_output', input_dim=last_hidden_size, output_dim=num_targets)
    
    typemfile = None
    if init_with_type:
        typemfile = config['dsdir'] + '/_typematrix.npy'
        # typemfile = config['dsdir'] + '/_typeCooccurrMatrix.npy'

    initialize_lasthid(hidden_to_output, typemfile)
    # initialize([hidden_to_output])
    
    y_hat_init = Logistic().apply(hidden_to_output.apply(before_out))
    y_hat_init.name = 'y_hat_init'
    y_hat_init = debug_print(y_hat_init, 'yhat_init', False)
    logpy_xz_init = cross_entropy_loss(y_hat_init, y)
    logpy_xz = logpy_xz_init  
    y_hat_recog = y_hat_init
    y_hat = y_hat_init
    KLD = 0
    
    if use_gaus:     
        if use_vae:
            logger.info('using VAE')
            vae_conditional = str_to_bool(config['vae_cond'])
            if vae_conditional:
                y_hat, logpy_xz, KLD, y_hat_recog = build_vae_conditoinal(kl_weight, entropy_weight, y_hat_init, feature_vec, feature_vec_len, config, y,
                    test=test, deterministic=deterministic, num_targets=num_targets, n_latent_z=n_latent_z, hidden_size=hidden_size, hu_decoder=hu_decoder)
            else:
                y_hat, logpy_xz, KLD = build_vae_basic(kl_weight, feature_vec, feature_vec_len, config, y, 
                    test=test, deterministic=deterministic, num_targets=num_targets, n_latent_z=n_latent_z, hidden_size=hidden_size, hu_decoder=hu_decoder)
                y_hat_recog = y_hat
        else:
            if use_rec:
                logger.info('Not using VAE... but using recursion')
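                # condition the prior network on the initial prediction, then
                # average the generated estimate with the initial one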
                prior_in = T.concatenate([feature_vec, y_hat_init], axis=1)
                mu_prior, log_sigma_prior = prior_network(x=prior_in, n_input=feature_vec_len+num_targets, hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior, deterministic=deterministic, use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl, n_latent=n_latent_z+feature_vec_len, hu_decoder=hu_decoder, n_out=num_targets, y=y)
                y_hat = (y_hat + y_hat_init) / 2. 
                logpy_xz = (logpy_xz + logpy_xz_init) / 2.
            else:
                prior_in = T.concatenate([feature_vec], axis=1)
                mu_prior, log_sigma_prior = prior_network(x=prior_in, n_input=feature_vec_len, hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior, deterministic=deterministic, use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl, n_latent=n_latent_z+feature_vec_len, hu_decoder=hu_decoder, n_out=num_targets, y=y)
            
            y_hat_recog = y_hat
                

    y_hat = debug_print(y_hat, 'y_hat', False)

    pat1 = T.mean(y[T.arange(y.shape[0]), T.argmax(y_hat, axis=1)])
    max_type = debug_print(T.argmax(y_hat_recog, axis=1), 'max_type', False)
    pat1_recog = T.mean(y[T.arange(y.shape[0]), max_type])
    mean_cross = T.mean(logpy_xz)
    mean_kld = T.mean(KLD)
    cost = mean_kld + mean_cross 
    cost.name = 'cost'
    mean_kld.name = 'kld'
    mean_cross.name = 'cross_entropy_loss'
    pat1.name = 'p@1'
    pat1_recog.name = 'p@1_recog'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(y_hat, 0.5))
    misclassify_rate.name = 'error_rate'

    return cost, pat1, y_hat, mean_kld, mean_cross, pat1_recog, misclassify_rate
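
The config mapping is consulted with string keys throughout. A hedged sketch
of a minimal config (keys taken from the lookups above; values illustrative):

config = {
    'hidden_units': '512 256',  # one or two space-separated layer sizes
    'use_gaus': 'True',
    'n_latent': '50',
    'hu_decoder': '200',
    'dropout': '0.5',
    'use_highway': 'False',
    'dsdir': '/path/to/dataset',
}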