Ejemplo n.º 1
0
    def __init__(self, readout, transition, dim_dec, attention=None,
                 add_contexts=True, pointer_weight=0.5,
                 transition_with_att_class=None,
                 use_word_annotations=False, **kwargs):
        """Assemble the generator from a readout, a transition and a fork.

        Parameters
        ----------
        readout : brick stored as ``self.readout`` and registered as a child.
        transition : recurrent brick; its ``apply.sequences`` (minus any
            name containing ``'mask'``) become ``self.inputs``.
        dim_dec : stored unchanged as ``self.dim_dec``.
        attention : when truthy, the transition is wrapped with
            ``transition_with_att_class``; otherwise with
            ``FakeAttentionRecurrent``.
        add_contexts : forwarded to ``transition_with_att_class``.
        pointer_weight : stored unchanged as ``self.pointer_weight``.
        transition_with_att_class : callable used to wrap the transition
            when ``attention`` is given.
        use_word_annotations : when truthy, an extra ``Linear`` child named
            ``'input_attention_preprocessor'`` is created.
        **kwargs : forwarded to the parent constructor.
        """
        super(Generator, self).__init__(**kwargs)

        self.dim_dec = dim_dec
        self.pointer_weight = pointer_weight
        self.use_word_annotations = use_word_annotations

        # Every sequence of the transition except the mask(s) is fed
        # through the fork.
        sequence_names = []
        for seq_name in transition.apply.sequences:
            if 'mask' in seq_name:
                continue
            sequence_names.append(seq_name)
        self.inputs = sequence_names

        input_fork = Fork(self.inputs)
        # NOTE(review): kwargs was already consumed by the parent
        # constructor above, so this setdefault has no visible effect here.
        kwargs.setdefault('fork', input_fork)

        if not attention:
            wrapped_transition = FakeAttentionRecurrent(
                transition, name="with_fake_attention")
        else:
            wrapped_transition = transition_with_att_class(
                transition, attention,
                add_contexts=add_contexts, name="att_trans")

        self.readout = readout
        self.transition = wrapped_transition
        self.fork = input_fork

        child_bricks = [self.readout, self.fork, self.transition]
        if use_word_annotations:
            self.word_annotation_preprocessor = Linear(
                name='input_attention_preprocessor', bias=False)
            child_bricks.append(self.word_annotation_preprocessor)
        self.children = child_bricks
Ejemplo n.º 2
0
class CoreNetwork(BaseRecurrent, Initializable):
    """Recurrent core made of a linear projection followed by an LSTM.

    The projection maps ``input_dim`` features to ``4 * dim`` values, which
    are handed to the LSTM together with the previous state and cell.
    """

    def __init__(self, input_dim, dim, **kwargs):
        """Create the projection and LSTM children.

        Parameters
        ----------
        input_dim : size reported for the ``'inputs'`` sequence.
        dim : size reported for ``'state'`` and ``'cell'``.
        **kwargs : forwarded to the parent constructor.
        """
        super(CoreNetwork, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.dim = dim

        # Both children share this brick's initialization schemes.
        shared_init = dict(weights_init=self.weights_init,
                           biases_init=self.biases_init)

        self.lstm = LSTM(dim=dim, name=self.name + '_lstm', **shared_init)
        self.proj = Linear(input_dim=input_dim, output_dim=dim * 4,
                           name=self.name + '_proj', **shared_init)
        self.children = [self.lstm, self.proj]

    def get_dim(self, name):
        """Return the dimension of ``'inputs'``, ``'state'`` or ``'cell'``.

        Raises
        ------
        ValueError
            For any other name.
        """
        if name == 'inputs':
            return self.input_dim
        if name in ('state', 'cell'):
            return self.dim
        raise ValueError

    @recurrent(sequences=['inputs'], states=['state', 'cell'], contexts=[],
               outputs=['state', 'cell'])
    def apply(self, inputs, state, cell):
        """Run one step: project ``inputs`` and advance the LSTM once."""
        projected = self.proj.apply(inputs)
        # iterate=False: only a single LSTM step is computed here; the
        # @recurrent decorator is in charge of the iteration.
        next_state, next_cell = self.lstm.apply(projected, state, cell,
                                                iterate=False)
        return next_state, next_cell
Ejemplo n.º 3
0
class LocationNetwork(Random, Initializable):
    """Predict a location mean from ``hidden_g`` and sample around it.

    An affine layer produces the mean, which is squashed with
    ``hard_tanh``. A sample is then drawn from a normal distribution
    centred on that mean. Its standard deviation is either the fixed
    ``std`` given at construction or, when ``non_hetro`` is true, the
    rectified output of a second affine layer applied to the same input.
    """

    def __init__(self, input_dim, loc_emb, std, non_hetro=False, **kwargs):
        """Create the affine children.

        Parameters
        ----------
        input_dim : size of the ``hidden_g`` input.
        loc_emb : size of the produced location vector.
        std : standard deviation used for sampling when ``non_hetro`` is
            false.
        non_hetro : when true, the standard deviation is predicted from
            the input instead of using ``std``.
        **kwargs : forwarded to the parent constructor.
        """
        super(LocationNetwork, self).__init__(**kwargs)
        # Kept so that get_dim can answer without relying on a child brick.
        self.input_dim = input_dim
        self.loc_emb = loc_emb
        self.std = std
        self.non_hetro = non_hetro
        self.mean_affine = Linear(
                input_dim=input_dim,
                output_dim=loc_emb,
                weights_init=self.weights_init,
                biases_init=self.biases_init)

        if non_hetro:
            self.std_affine = Linear(input_dim=input_dim,
                                     output_dim=loc_emb,
                                     weights_init=self.weights_init,
                                     biases_init=self.biases_init)

            self.children = [self.mean_affine, self.std_affine]
        else:
            self.children = [self.mean_affine]

    def get_dim(self, name):
        """Return the dimension of ``'hidden_g'``, ``'l'`` or ``'l_sample'``.

        Raises
        ------
        ValueError
            For any other name.
        """
        # The previous implementation read ``self.transform``, which this
        # class never defines; use the sizes recorded in __init__ instead.
        if name == 'hidden_g':
            return self.input_dim
        elif name in ['l', 'l_sample']:
            return self.loc_emb
        else:
            raise ValueError

    @application(inputs=['hidden_g'], outputs=['l', 'l_sample'])
    def apply(self, hidden_g):
        """Return ``(l, l_sample)``: the squashed mean and a squashed sample."""
        loc_mean = self.mean_affine.apply(hidden_g)
        loc_u = hard_tanh(loc_mean)
        if self.non_hetro:
            # Input-dependent std; relu keeps it non-negative.
            std = T.nnet.relu(self.std_affine.apply(hidden_g))
        else:
            std = self.std
        loc_sample = self.theano_rng.normal(avg=loc_u,
                                            std=std,
                                            size=loc_mean.shape,
                                            dtype=theano.config.floatX)

        return loc_u, hard_tanh(loc_sample)
Ejemplo n.º 4
0
    def __init__(self, input_dim, loc_emb, std, non_hetro=False, **kwargs):
        """Create the affine layer(s) of the location network.

        Parameters
        ----------
        input_dim : input size of every affine layer.
        loc_emb : output size of every affine layer.
        std : stored as ``self.std``.
        non_hetro : when true, a second affine layer ``std_affine`` is
            created alongside ``mean_affine``.
        **kwargs : forwarded to the parent constructor.
        """
        super(LocationNetwork, self).__init__(**kwargs)
        self.std = std
        self.non_hetro = non_hetro

        # Every affine layer has the same shape and initialization.
        affine_args = dict(input_dim=input_dim,
                           output_dim=loc_emb,
                           weights_init=self.weights_init,
                           biases_init=self.biases_init)

        self.mean_affine = Linear(**affine_args)
        bricks = [self.mean_affine]
        if non_hetro:
            self.std_affine = Linear(**affine_args)
            bricks.append(self.std_affine)
        self.children = bricks
Ejemplo n.º 5
0
    def __init__(self, input_dim, n_classes, multi_object=False, **kwargs):
        """Create the linear transform and its output activation.

        Parameters
        ----------
        input_dim : input size of the linear transform.
        n_classes : output size of the linear transform.
        multi_object : selects ``Logistic`` when true, ``Softmax`` otherwise.
        **kwargs : forwarded both to the parent constructor and to the
            ``Linear`` child.
        """
        super(ActionNetwork, self).__init__(**kwargs)
        self.transform = Linear(
            input_dim=input_dim, output_dim=n_classes, **kwargs)
        self.out = Logistic() if multi_object else Softmax()
        self.children = [self.transform, self.out]
Ejemplo n.º 6
0
    def __init__(self, output_names, input_dim, prototype=None, **kwargs):
        """Set up a fork from one input to several named outputs.

        Parameters
        ----------
        output_names : names of the outputs; also passed to the parent.
        input_dim : stored as ``self.input_dim``.
        prototype : brick passed to the parent as ``prototype``; a default
            ``Linear()`` is used when it is falsy.
        **kwargs : forwarded to the parent constructor; ``child_prefix``
            defaults to ``'fork'``.
        """
        self.output_names = output_names
        self.input_dim = input_dim

        kwargs.setdefault('child_prefix', 'fork')
        super(Fork, self).__init__(
            output_names, prototype=prototype or Linear(), **kwargs)
        self.input_dims = None
Ejemplo n.º 7
0
 def __init__(self,
              input_names,
              input_dims,
              output_dim,
              prototype=None,
              **kwargs):
     """Set up a merge of several named inputs into one output size.

     Parameters
     ----------
     input_names : names of the inputs, forwarded to the parent.
     input_dims : forwarded to the parent.
     output_dim : stored as ``self.output_dim`` and repeated once per
         input name for the parent.
     prototype : brick forwarded to the parent; a bias-free ``Linear`` is
         used when it is falsy.
     **kwargs : forwarded to the parent constructor.
     """
     self.output_dim = output_dim
     branch_prototype = prototype if prototype else Linear(use_bias=False)
     # Every input branch produces the same output dimension.
     per_input_dims = [output_dim for _name in input_names]
     super(Merge, self).__init__(input_names, input_dims,
                                 per_input_dims,
                                 branch_prototype, **kwargs)
Ejemplo n.º 8
0
class ActionNetwork(Initializable):
    """Map ``hidden_g`` to an action through a linear layer and activation.

    The activation is ``Logistic`` when ``multi_object`` is true and
    ``Softmax`` otherwise.
    """

    def __init__(self, input_dim, n_classes, multi_object=False, **kwargs):
        """Create the linear transform and its output activation.

        Parameters
        ----------
        input_dim : input size of the linear transform.
        n_classes : output size of the linear transform.
        multi_object : selects ``Logistic`` when true, ``Softmax`` otherwise.
        **kwargs : forwarded both to the parent constructor and to the
            ``Linear`` child.
        """
        super(ActionNetwork, self).__init__(**kwargs)
        # Kept so that get_dim can answer without querying the child brick.
        self.input_dim = input_dim
        self.transform = Linear(input_dim=input_dim,
                                output_dim=n_classes, **kwargs)
        if multi_object:
            self.out = Logistic()
        else:
            self.out = Softmax()

        self.children = [self.transform, self.out]

    def get_dim(self, name):
        """Return the dimension of ``'hidden_g'``.

        Raises
        ------
        ValueError
            For any other name.
        """
        # Answer from the recorded size rather than asking the Linear child
        # for 'inputs', a name it is not guaranteed to recognise.
        if name == 'hidden_g':
            return self.input_dim
        else:
            raise ValueError

    @application(inputs=['hidden_g'], outputs=['action'])
    def apply(self, hidden_g):
        """Return the activation applied to the linear transform of the input."""
        return self.out.apply(self.transform.apply(hidden_g))
Ejemplo n.º 9
0
    def __init__(self, input_dim, dim, **kwargs):
        """Create the projection and LSTM children.

        Parameters
        ----------
        input_dim : stored as ``self.input_dim``; input size of the
            projection.
        dim : stored as ``self.dim``; LSTM size. The projection outputs
            ``4 * dim`` values.
        **kwargs : forwarded to the parent constructor.
        """
        super(CoreNetwork, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.dim = dim

        # Both children share this brick's initialization schemes.
        shared_init = dict(weights_init=self.weights_init,
                           biases_init=self.biases_init)

        self.lstm = LSTM(dim=dim, name=self.name + '_lstm', **shared_init)
        self.proj = Linear(input_dim=input_dim, output_dim=dim * 4,
                           name=self.name + '_proj', **shared_init)
        self.children = [self.lstm, self.proj]
Ejemplo n.º 10
0
    def __init__(self, dim,
                 n_channels, img_height, img_width, N, sensor=None,
                 n_retina=3, radius=4,
                 activations=None, **kwargs):
        """Create the glimpse sensor and the three linear layers.

        Parameters
        ----------
        dim : output size of every linear layer; stored as
            ``self.output_dim``.
        n_channels, img_height, img_width : image geometry passed to the
            sensor.
        N : passed to ``GlimpseSensorBeta`` (simple sensor only).
        sensor : ``None`` or ``'simple'`` selects ``GlimpseSensorBeta``;
            ``'retina'`` selects ``RetinaGlimpse``.
        n_retina, radius : passed to ``RetinaGlimpse`` (retina sensor only).
        activations : accepted but not used by this constructor.
        **kwargs : forwarded to the parent constructor.

        Raises
        ------
        ValueError
            If ``sensor`` is not one of the supported modes.
        """
        super(GlimpseNetwork, self).__init__(**kwargs)
        if sensor is None or sensor == 'simple':
            self.sensor = GlimpseSensorBeta(channels=n_channels,
                                            img_height=img_height,
                                            img_width=img_width, N=N)
        elif sensor == 'retina':
            self.sensor = RetinaGlimpse(img_width, img_height, n_channels,
                                        n_retina=n_retina, radius=radius)
        else:
            # %-formatting keeps the message identical for string input but
            # no longer fails with TypeError when `sensor` is not a string.
            raise ValueError(
                "sensor mode support [simple]|[retina].Got %s." % (sensor,))

        self.loc_emb = self.sensor.emb_dim

        # Embeds the location vector.
        self.glimpes_0 = Linear(input_dim=self.loc_emb,
                                output_dim=dim,
                                name=self.name + '_glimp_0',
                                weights_init=self.weights_init,
                                biases_init=self.biases_init)

        # Embeds the glimpse read by the sensor.
        self.glimpes_1 = Linear(input_dim=self.sensor.get_dim('glimpse'),
                                output_dim=dim, name=self.name + '_glimp_1',
                                weights_init=self.weights_init,
                                biases_init=self.biases_init)

        # Takes the two `dim`-sized embeddings side by side (hence dim*2).
        self.glimpes_out = Linear(input_dim=dim*2, output_dim=dim,
                                  name=self.name + '_glimp_out',
                                  weights_init=self.weights_init,
                                  biases_init=self.biases_init)

        self.children = [self.glimpes_0, self.glimpes_1, self.glimpes_out]
        self.output_dim = dim
Ejemplo n.º 11
0
    def __init__(self,
                 target_names,
                 source_name,
                 target_dims,
                 source_dim,
                 prototype=None,
                 **kwargs):
        """Set up a distribution from one source to several targets.

        Parameters
        ----------
        target_names : passed to the parent as ``output_names``.
        source_name : stored as ``self.source_name``.
        target_dims : passed to the parent as ``output_dims``.
        source_dim : passed to the parent as ``input_dim``.
        prototype : brick passed to the parent; a bias-free ``Linear`` is
            used when it is falsy.
        **kwargs : forwarded to the parent constructor.
        """
        self.target_names = target_names
        self.source_name = source_name
        self.target_dims = target_dims
        self.source_dim = source_dim

        branch_prototype = prototype or Linear(use_bias=False)
        super(Distribute, self).__init__(
            output_names=target_names,
            output_dims=target_dims,
            input_dim=source_dim,
            prototype=branch_prototype,
            **kwargs)
Ejemplo n.º 12
0
    def build_theano_functions(self, data_mean, data_std) :
        """Build the LSTM + Gaussian-mixture graph and compile a function.

        Four linear transforms (one per name in 'c', 'i', 'f', 'o') are
        applied to ``x`` and concatenated as the LSTM input. The LSTM
        output goes through ``output_transform`` and is split into mixture
        weights ``pis``, widths ``sig`` and means ``mus``.

        ``data_mean`` and ``data_std`` parametrise the initialization of
        the four input transforms (``Uniform(mean=data_mean,
        std=data_std)`` weights, ``Constant(data_mean)`` biases).

        NOTE: as written, this returns only the debugging function
        ``test_expo`` (inputs ``[x, y]``, outputs ``[expo, mus, sig]``).
        Everything after that ``return`` -- the likelihood, the SGD
        updates and the final ``return gradf, f`` -- is unreachable.
        """
        x = T.ftensor3('x') # shape of input : batch X time X value
        y = T.ftensor3('y')

        # before the cell, input, forget and output gates, x needs to
        # be transformed
        linear_transforms = []
        for transform in ['c','i','f','o'] :
            linear_transforms.append(
                Linear(self.input_dim,
                       self.lstm_dim,
                       weights_init=Uniform(mean=data_mean, std=data_std),
                       #weights_init=IsotropicGaussian(mean=1.,std=1),
                       biases_init=Constant(data_mean),
                       name=transform+"_transform")
            )

        for transform in linear_transforms :
            transform.initialize()

        linear_applications = []
        for transform in linear_transforms :
            linear_applications.append(
                transform.apply(x))

        # the four transformed inputs are laid side by side on the last axis
        lstm_input = T.concatenate(linear_applications, axis=2)

        # the lstm wants batch X time X value
        lstm = LSTM(
            dim=self.lstm_dim,
            weights_init=IsotropicGaussian(mean=0.5,std=1),
            biases_init=Constant(1))
        lstm.initialize()
        h, _dummy = lstm.apply(lstm_input)

        # this is where Alex Graves' paper starts
        output_transform = Linear(self.lstm_dim,
                                  self.output_dim,
                                  #weights_init=Uniform(mean=data_mean, std=data_std),
                                  weights_init=IsotropicGaussian(mean=0., std=1),
                                  biases_init=Constant(1),
                                  name="output_transform")
        output_transform.initialize()
        y_hat = output_transform.apply(h)

        # transforms to find each gmm params (mu, pi, sig)
        #pis = NDimensionalSoftmax.apply(y_hat[:,:,0:self.gmm_dim])
        # small hack to softmax a 3D tensor: flatten the first two axes,
        # apply the 2D softmax, then restore the 3D shape
        pis = T.reshape(
                    T.nnet.softmax(
                        T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))
                    , (self.batch_dim, self.time_dim, self.gmm_dim))
        #sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])
        # relu + 0.1 keeps sig at or above 0.1
        sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
        mus = y_hat[:,:,self.gmm_dim*2:]

        # add a trailing axis to the mixture parameters and a mixture axis
        # to the targets so that they broadcast against each other
        pis = pis[:,:,:,np.newaxis]
        mus = mus[:,:,:,np.newaxis]
        sig = sig[:,:,:,np.newaxis]
        y = y[:,:,np.newaxis,:]

        #sig=theano.printing.Print()(sig)

        # sum likelihood with targets
        # sum inside log across mixtures, sum outside log across time
        #LL = -T.log((pis*(1./(T.sqrt(2.*np.pi)*sig))*T.exp(-0.5*((y-mus)**2)/sig**2)).sum(axis=2)).sum()
        expo = T.exp(-0.5*((y-mus)**2)/sig**2)
        # NOTE(review): `y` here is the re-indexed expression from above, not
        # the original ftensor3 -- confirm this is the intended input.
        test_expo = theano.function([x,y],[expo, mus, sig])
        # Debug early exit: the code below this line never runs.
        return test_expo

        coeff = pis*(1./(T.sqrt(2.*np.pi)*sig))
        inside_log = (coeff*expo).sum(axis=2)
        LL = -(T.log(inside_log)).sum()


        model = Model(LL)
        self.model = model
        parameters = model.parameters

        # plain gradient descent: p <- p - lr * dLL/dp
        grads = T.grad(LL, parameters)
        updates = []
        for i in range(len(grads)) :
            updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        gradf = theano.function([x, y],[LL],updates=updates)
        f = theano.function([x],[pis, sig, mus])

        return gradf, f
Ejemplo n.º 13
0
    def build_theano_functions(self):
        """Build the stacked-LSTM mixture model and its training algorithm.

        The input matrix is reshaped to (batch_dim, sequence_dim,
        time_dim); targets ``y`` are the sequence shifted by one step.
        When ``self.image_size`` is set, convolutional features of a
        spectrogram are concatenated to the input. Each LSTM layer is
        preceded by a linear layer, and the outputs of all LSTM layers are
        concatenated before the final ``output_transform``, whose output is
        split into mixture weights ``pis``, widths ``sig`` and means
        ``mus``.

        Returns
        -------
        (algorithm, f)
            ``algorithm`` is a ``GradientDescent`` on the cost ``LL`` with
            an ``Adam`` step rule; ``f`` is a compiled function returning
            ``[pis, sig, mus]``.
        """
        x = T.fmatrix('time_sequence')
        x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim))

        # next-step prediction: targets are the inputs shifted by one
        y = x[:,1:self.sequence_dim,:]
        x = x[:,:self.sequence_dim-1,:]

        # if we try to include the spectrogram features
        spec_dims = 0
        if self.image_size is not None :
            print "Convolution activated"
            self.init_conv()
            spec = T.ftensor4('spectrogram')
            spec_features, spec_dims = self.conv.build_conv_layers(spec)
            print "Conv final dims =", spec_dims
            spec_dims = np.prod(spec_dims)
            spec_features = spec_features.reshape(
                (self.batch_dim, self.sequence_dim-1, spec_dims))
            x = T.concatenate([x, spec_features], axis=2)

        # dims[0] is the input width, dims[1:] are the LSTM layer widths
        layers_input = [x]
        dims =np.array([self.time_dim + spec_dims])
        for dim in self.lstm_layers_dim :
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)) :

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer+1]*4,
                            weights_init=Orthogonal(self.orth_scale),
                            biases_init=Constant(0),
                            name="linear"+str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X sequence X time
            lstm = LSTM(
                dim=dims[layer+1],
                weights_init=IsotropicGaussian(mean=0.,std=0.5),
                biases_init=Constant(1),
                name="lstm"+str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  weights_init=Orthogonal(self.orth_scale),
                                  use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1 :
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else :
            # the output layer sees the hidden states of every LSTM layer
            y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor: flatten the first two axes,
        # apply the 2D softmax, then restore the 3D shape
        pis = T.reshape(
                    T.nnet.softmax(
                        T.reshape(y_hat[:,:,:self.gmm_dim], ((self.sequence_dim-1)*self.batch_dim, self.gmm_dim))),
                    (self.batch_dim, (self.sequence_dim-1), self.gmm_dim))
        # exp keeps sig positive; the small constant keeps it away from zero
        sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6
        mus = y_hat[:,:,self.gmm_dim*2:]

        pis = pis[:,:,:,np.newaxis]
        mus = mus[:,:,:,np.newaxis]
        sig = sig[:,:,:,np.newaxis]
        y = y[:,:,np.newaxis,:]

        # mark the freshly added axes as broadcastable
        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
        expo_term = -0.5*((y-mus)**2)/sig**2
        coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True)

        # log-sum-exp over the mixture axis: subtract the max before
        # exponentiating, then add it back outside the log
        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean()
        LL.name = "summed_likelihood"

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        algorithm = GradientDescent(
            cost=LL,
            parameters=model.parameters,
            step_rule=Adam())

        # NOTE(review): `x` here is the sliced (and possibly concatenated)
        # expression built above, not the original fmatrix, and `spec` is not
        # listed as an input -- confirm this is intended.
        f = theano.function([x],[pis, sig, mus])

        return algorithm, f
Ejemplo n.º 14
0
    def build_theano_functions(self):
        """Build the stacked-LSTM mixture model and compile train/predict.

        Each LSTM layer is preceded by a linear layer; the outputs of all
        LSTM layers are concatenated before the final
        ``output_transform``, whose output is split into mixture weights
        ``pis``, widths ``sig`` and means ``mus``. Training is plain
        gradient descent with the learning rate supplied at call time.

        Returns
        -------
        (gradf, f)
            ``gradf`` takes ``[x, y, lr]``, applies the parameter updates
            and returns ``[LL]`` (or ``[LL, pis, mus, sig]`` when
            ``self.debug`` is set); ``f`` takes ``[x]`` and returns
            ``[pis, sig, mus]``.
        """
        x = T.ftensor3('x')  # shape of input : batch X time X value
        # targets are given as a 4D tensor, so no extra axis is added below
        y = T.ftensor4('y')

        # dims[0] is the input width, dims[1:] are the LSTM layer widths
        layers_input = [x]
        dims = np.array([self.time_dim])
        for dim in self.lstm_layers_dim:
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)):

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(
                dims[layer],
                dims[layer + 1] * 4,
                weights_init=Orthogonal(self.orth_scale),
                #weights_init=IsotropicGaussian(mean=1.,std=1),
                biases_init=Constant(0),
                name="linear" + str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(dim=dims[layer + 1],
                        weights_init=IsotropicGaussian(mean=0., std=0.5),
                        biases_init=Constant(1),
                        name="lstm" + str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale *
                Orthogonal().generate(np.random,
                                      lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(
            dims[1:].sum(),
            self.output_dim,
            weights_init=Orthogonal(self.orth_scale),
            #weights_init=IsotropicGaussian(mean=0., std=1),
            use_bias=False,
            name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1:
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else:
            # the output layer sees the hidden states of every LSTM layer
            y_hat = output_transform.apply(
                T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor
        #pis = T.reshape(
        #            T.nnet.softmax(
        #                T.nnet.sigmoid(
        #                    T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))),
        #            (self.batch_dim, self.time_dim, self.gmm_dim))
        pis = T.reshape(
            T.nnet.softmax(
                T.reshape(y_hat[:, :, :self.gmm_dim],
                          (self.sequence_dim * self.batch_dim, self.gmm_dim))),
            (self.batch_dim, self.sequence_dim, self.gmm_dim))
        # exp keeps sig positive; the small constant keeps it away from zero
        sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6
        #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
        #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:])
        mus = y_hat[:, :, self.gmm_dim * 2:]

        pis = pis[:, :, :, np.newaxis]
        mus = mus[:, :, :, np.newaxis]
        sig = sig[:, :, :, np.newaxis]
        #y = y[:,:,np.newaxis,:]

        # mark the singleton axes as broadcastable
        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
        expo_term = -0.5 * ((y - mus)**2) / sig**2
        coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        log_sequences = T.log(pis + EPS) + T.sum(
            sequences, axis=3, keepdims=True)

        # log-sum-exp over the mixture axis: subtract the max before
        # exponentiating, then add it back outside the log
        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(
            T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))
               ).mean()

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        # plain gradient descent: p <- p - lr * dLL/dp, with lr given per call
        grads = T.grad(LL, parameters)
        updates = []
        lr = T.scalar('lr')
        for i in range(len(grads)):
            #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
            updates.append(
                tuple([parameters[i], parameters[i] - lr * grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        if self.debug:
            # debug mode also exposes the mixture parameters
            gradf = theano.function([x, y, lr], [LL, pis, mus, sig],
                                    updates=updates)
        else:
            #gradf = theano.function([x, y, z],[zLL],updates=updates)
            gradf = theano.function([x, y, lr], [LL], updates=updates)
        f = theano.function([x], [pis, sig, mus])

        return gradf, f
Ejemplo n.º 15
0
class GlimpseNetwork(Initializable):
    """
    GlimpseSensor & Linear + Rectifier
    ----------------------------------

    apply:
        input_shape (batch_size, n_channels * img_width * img_height)
        output_dim (batch_size, dim)

    """
    def __init__(self, dim,
                 n_channels, img_height, img_width, N, sensor=None,
                 n_retina=3, radius=4,
                 activations=None, **kwargs):
        """Create the glimpse sensor and the three linear layers.

        Parameters
        ----------
        dim : output size of every linear layer; stored as
            ``self.output_dim``.
        n_channels, img_height, img_width : image geometry passed to the
            sensor.
        N : passed to ``GlimpseSensorBeta`` (simple sensor only).
        sensor : ``None`` or ``'simple'`` selects ``GlimpseSensorBeta``;
            ``'retina'`` selects ``RetinaGlimpse``.
        n_retina, radius : passed to ``RetinaGlimpse`` (retina sensor only).
        activations : accepted but not used by this constructor.
        **kwargs : forwarded to the parent constructor.

        Raises
        ------
        ValueError
            If ``sensor`` is not one of the supported modes.
        """
        super(GlimpseNetwork, self).__init__(**kwargs)
        if sensor is None or sensor == 'simple':
            self.sensor = GlimpseSensorBeta(channels=n_channels,
                                            img_height=img_height,
                                            img_width=img_width, N=N)
        elif sensor == 'retina':
            self.sensor = RetinaGlimpse(img_width, img_height, n_channels,
                                        n_retina=n_retina, radius=radius)
        else:
            # %-formatting keeps the message identical for string input but
            # no longer fails with TypeError when `sensor` is not a string.
            raise ValueError(
                "sensor mode support [simple]|[retina].Got %s." % (sensor,))

        self.loc_emb = self.sensor.emb_dim

        # Embeds the location vector.
        self.glimpes_0 = Linear(input_dim=self.loc_emb,
                                output_dim=dim,
                                name=self.name + '_glimp_0',
                                weights_init=self.weights_init,
                                biases_init=self.biases_init)

        # Embeds the glimpse read by the sensor.
        self.glimpes_1 = Linear(input_dim=self.sensor.get_dim('glimpse'),
                                output_dim=dim, name=self.name + '_glimp_1',
                                weights_init=self.weights_init,
                                biases_init=self.biases_init)

        # Takes the two `dim`-sized embeddings side by side (hence dim*2).
        self.glimpes_out = Linear(input_dim=dim*2, output_dim=dim,
                                  name=self.name + '_glimp_out',
                                  weights_init=self.weights_init,
                                  biases_init=self.biases_init)

        self.children = [self.glimpes_0, self.glimpes_1, self.glimpes_out]
        self.output_dim = dim

    def get_dim(self, name):
        """Return the dimension of ``'img'`` or ``'l_last'``.

        Raises
        ------
        ValueError
            For any other name.
        """
        if name == 'img':
            return self.sensor.get_dim('img')
        elif name == 'l_last':
            return self.sensor.emb_dim
        else:
            raise ValueError

    @application(contexts=['img'], sequences=[],
                 state=['l_last'], outputs=['hidden_g'])
    def apply(self, img, l_last):
        """
        Params
        ------
        img: image batch handed to the sensor's ``read``
        l_last: previous location vector; converted to attention
            parameters by the sensor's ``nn2att`` and also embedded directly
        ---

        Return
        ------
        h_g : (batch_size, output_dim)
        """
        l_unpack = self.sensor.nn2att(l_last)
        glimpes = self.sensor.read(img, *l_unpack)
        # Embed the location and the glimpse separately, then combine them.
        h0 = T.nnet.relu(self.glimpes_0.apply(l_last))
        h1 = T.nnet.relu(self.glimpes_1.apply(glimpes))
        h_c = T.concatenate([h0, h1], axis=1)
        hidden_g = T.nnet.relu(self.glimpes_out.apply(h_c))
        return hidden_g
Ejemplo n.º 16
0
    def build_theano_functions(self):
        """Build the stacked-LSTM Gaussian model and compile train/predict.

        Each LSTM layer is preceded by a linear layer; the outputs of all
        LSTM layers are concatenated before the final
        ``output_transform``. The first half of its output gives the
        widths ``sig`` and the second half the means ``mus``. Training is
        plain gradient descent with the learning rate supplied at call
        time.

        Returns
        -------
        (gradf, f)
            ``gradf`` takes ``[x, y, lr]``, applies the parameter updates
            and returns ``[LL]`` (or ``[LL, mus, sig]`` when ``self.debug``
            is set); ``f`` takes ``[x]`` and returns ``[sig, mus]``.
        """
        x = T.ftensor3('x')  # shape of input : batch X time X value
        y = T.ftensor3('y')
        # `z` is only referenced by the commented-out code further down
        z = T.ftensor3('z')

        # dims[0] is the input width, dims[1:] are the LSTM layer widths
        layers_input = [x]
        dims = np.array([self.input_dim])
        for dim in self.lstm_layers_dim:
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)):

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(
                dims[layer],
                dims[layer + 1] * 4,
                #weights_init=Uniform(mean=data_mean, std=1),
                weights_init=IsotropicGaussian(mean=1., std=1),
                biases_init=Constant(0),
                name="linear" + str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(dim=dims[layer + 1],
                        weights_init=IsotropicGaussian(mean=0., std=0.5),
                        biases_init=Constant(1),
                        name="lstm" + str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(Orthogonal().generate(
                np.random,
                lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # the idea is to have one gaussian parametrize every frequency bin
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(
            dims[1:].sum(),
            self.output_dim,
            weights_init=IsotropicGaussian(mean=0., std=1),
            biases_init=Constant(0),
            #use_bias=False,
            name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1:
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else:
            # the output layer sees the hidden states of every LSTM layer
            y_hat = output_transform.apply(
                T.concatenate(layers_input[1:], axis=2))

        # NOTE(review): the slice bounds rely on `/` giving an integer, i.e.
        # Python 2 integer division on an int output_dim -- confirm.
        # relu + 0.05 keeps sig at or above 0.05
        sig = T.nnet.relu(y_hat[:, :, :self.output_dim / 2]) + 0.05
        mus = y_hat[:, :, self.output_dim / 2:]

        # sum likelihood with targets
        # sum inside log across mixtures, sum outside log across time
        inside_expo = -0.5 * ((y - mus)**2) / sig**2
        expo = T.exp(inside_expo)
        coeff = 1. / (T.sqrt(2. * np.pi) * sig)
        inside_log = T.log(coeff * expo)
        # log-sum-exp over axis 2: subtract the max before exponentiating,
        # then add it back outside the log
        inside_log_max = T.max(inside_log, axis=2, keepdims=True)
        LL = -(inside_log_max + T.log(
            T.sum(T.exp(inside_log - inside_log_max), axis=2,
                  keepdims=True))).sum()

        #zinside_expo = -0.5*((z-mus)**2)/sig**2
        #zexpo = T.exp(zinside_expo)
        #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig))
        #zinside_log = (zcoeff*zexpo).sum(axis=2)
        #zLL = -(T.log(zinside_log)).sum()

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        # plain gradient descent: p <- p - lr * dLL/dp, with lr given per call
        grads = T.grad(LL, parameters)
        updates = []
        lr = T.scalar('lr')
        for i in range(len(grads)):
            #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
            updates.append(
                tuple([parameters[i], parameters[i] - lr * grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        if self.debug:
            # debug mode also exposes the predicted means and widths
            gradf = theano.function([x, y, lr], [LL, mus, sig],
                                    updates=updates)
        else:
            #gradf = theano.function([x, y, z],[zLL],updates=updates)
            gradf = theano.function([x, y, lr], [LL], updates=updates)
        f = theano.function([x], [sig, mus])

        return gradf, f
Ejemplo n.º 17
0
    batch_size = 5
    input_dim = 6
    output_dim = 8
    n_classes = 10
    test_data = {x: np.random.normal(size=(n_steps, batch_size, input_dim)
                                     ).astype(np.float32),
                 y: np.random.randint(n_classes, size=(batch_size, )
                                      ).astype(np.int32)}
    inits = {
        'weights_init': IsotropicGaussian(0.1),
        'biases_init': Constant(0.),
    }

    core = CoreNetwork(input_dim=input_dim, dim=output_dim, **inits)
    core.initialize()
    proj = Linear(input_dim=output_dim*2, output_dim=n_classes, **inits)
    proj.initialize()
    out = Softmax()

    state, cell = core.apply(x)

    a = T.concatenate([state, cell], axis=2)
    a = a.reshape((a.shape[0]*a.shape[1], a.shape[2]))

    a = proj.apply(a)
    prop = out.apply(a).reshape((n_steps, batch_size, n_classes))
    pred = prop[-1]
    prop = prop.reshape((n_steps * batch_size, n_classes))

    print prop.eval({x: test_data[x]})
    y_reat = T.repeat(y[None, :], n_steps, axis=0).reshape(
Ejemplo n.º 18
0
    def build_theano_functions(self):
        x = T.fmatrix('time_sequence')
        x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim))

        y = x[:, 1:self.sequence_dim, :]
        x = x[:, :self.sequence_dim - 1, :]

        # if we try to include the spectrogram features
        spec_dims = 0
        if self.image_size is not None:
            print "Convolution activated"
            self.init_conv()
            spec = T.ftensor4('spectrogram')
            spec_features, spec_dims = self.conv.build_conv_layers(spec)
            print "Conv final dims =", spec_dims
            spec_dims = np.prod(spec_dims)
            spec_features = spec_features.reshape(
                (self.batch_dim, self.sequence_dim - 1, spec_dims))
            x = T.concatenate([x, spec_features], axis=2)

        layers_input = [x]
        dims = np.array([self.time_dim + spec_dims])
        for dim in self.lstm_layers_dim:
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)):

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer + 1] * 4,
                            weights_init=Orthogonal(self.orth_scale),
                            biases_init=Constant(0),
                            name="linear" + str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X sequence X time
            lstm = LSTM(dim=dims[layer + 1],
                        weights_init=IsotropicGaussian(mean=0., std=0.5),
                        biases_init=Constant(1),
                        name="lstm" + str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale *
                Orthogonal().generate(np.random,
                                      lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  weights_init=Orthogonal(self.orth_scale),
                                  use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1:
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else:
            y_hat = output_transform.apply(
                T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor
        pis = T.reshape(
            T.nnet.softmax(
                T.reshape(
                    y_hat[:, :, :self.gmm_dim],
                    ((self.sequence_dim - 1) * self.batch_dim, self.gmm_dim))),
            (self.batch_dim, (self.sequence_dim - 1), self.gmm_dim))
        sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6
        mus = y_hat[:, :, self.gmm_dim * 2:]

        pis = pis[:, :, :, np.newaxis]
        mus = mus[:, :, :, np.newaxis]
        sig = sig[:, :, :, np.newaxis]
        y = y[:, :, np.newaxis, :]

        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
        expo_term = -0.5 * ((y - mus)**2) / sig**2
        coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        log_sequences = T.log(pis + EPS) + T.sum(
            sequences, axis=3, keepdims=True)

        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(
            T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))
               ).mean()
        LL.name = "summed_likelihood"

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        algorithm = GradientDescent(cost=LL,
                                    parameters=model.parameters,
                                    step_rule=Adam())

        f = theano.function([x], [pis, sig, mus])

        return algorithm, f
Ejemplo n.º 19
0
    def build_theano_functions(self) :
        x = T.ftensor3('x') # shape of input : batch X time X value
        y = T.ftensor4('y')

        layers_input = [x]
        dims =np.array([self.time_dim])
        for dim in self.lstm_layers_dim :
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)) :

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer+1]*4,
                            weights_init=Orthogonal(self.orth_scale),
                            #weights_init=IsotropicGaussian(mean=1.,std=1),
                            biases_init=Constant(0),
                            name="linear"+str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(
                dim=dims[layer+1],
                weights_init=IsotropicGaussian(mean=0.,std=0.5),
                biases_init=Constant(1),
                name="lstm"+str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  weights_init=Orthogonal(self.orth_scale),
                                  #weights_init=IsotropicGaussian(mean=0., std=1),
                                  use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1 :
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else :
            y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor
        #pis = T.reshape(
        #            T.nnet.softmax(
        #                T.nnet.sigmoid(
        #                    T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))),
        #            (self.batch_dim, self.time_dim, self.gmm_dim))
        pis = T.reshape(
                    T.nnet.softmax(
                        T.reshape(y_hat[:,:,:self.gmm_dim], (self.sequence_dim*self.batch_dim, self.gmm_dim))),
                    (self.batch_dim, self.sequence_dim, self.gmm_dim))
        sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6
        #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
        #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:])
        mus = y_hat[:,:,self.gmm_dim*2:]

        pis = pis[:,:,:,np.newaxis]
        mus = mus[:,:,:,np.newaxis]
        sig = sig[:,:,:,np.newaxis]
        #y = y[:,:,np.newaxis,:]

        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
        expo_term = -0.5*((y-mus)**2)/sig**2
        coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True)

        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean()

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        grads = T.grad(LL, parameters)
        updates = []
        lr = T.scalar('lr')
        for i in range(len(grads)) :
            #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
            updates.append(tuple([parameters[i], parameters[i] - lr*grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        if self.debug :
            gradf = theano.function([x, y, lr],[LL, pis, mus, sig],updates=updates)
        else :
            #gradf = theano.function([x, y, z],[zLL],updates=updates)
            gradf = theano.function([x, y, lr],[LL],updates=updates)
        f = theano.function([x],[pis, sig, mus])

        return gradf, f
Ejemplo n.º 20
0
    def build_theano_functions(self, data_mean, data_std):
        """Build a single-LSTM mixture graph and return a debugging function.

        Four ``Linear`` bricks named ``c_transform``, ``i_transform``,
        ``f_transform`` and ``o_transform`` each map ``self.input_dim`` to
        ``self.lstm_dim``; their outputs are concatenated on axis 2 and fed
        to one ``LSTM`` brick, followed by an ``output_transform`` brick
        whose output is split into the mixture parameters (pis, sig, mus).

        Parameters
        ----------
        data_mean : passed as ``mean`` to the ``Uniform`` weight initializer
            and as the constant bias value of the four input transforms.
        data_std : passed as ``std`` to the same ``Uniform`` initializer.

        Returns
        -------
        test_expo : compiled Theano function of ``(x, y)`` returning
            ``[expo, mus, sig]``.

        NOTE(review): the unconditional ``return test_expo`` below looks like
        a debugging leftover.  Everything after it (likelihood, gradient
        updates, ``gradf`` and ``f``) is unreachable, so the
        ``return gradf, f`` at the end is never executed.  Confirm with the
        callers which return value is expected before removing either part.
        """
        x = T.ftensor3('x')  # shape of input : batch X time X value
        y = T.ftensor3('y')

        # before the cell, input, forget and output gates, x needs to
        # be transformed
        linear_transforms = []
        for transform in ['c', 'i', 'f', 'o']:
            linear_transforms.append(
                Linear(
                    self.input_dim,
                    self.lstm_dim,
                    weights_init=Uniform(mean=data_mean, std=data_std),
                    #weights_init=IsotropicGaussian(mean=1.,std=1),
                    biases_init=Constant(data_mean),
                    name=transform + "_transform"))

        for transform in linear_transforms:
            transform.initialize()

        linear_applications = []
        for transform in linear_transforms:
            linear_applications.append(transform.apply(x))

        # the four projections are stacked along the feature axis
        lstm_input = T.concatenate(linear_applications, axis=2)

        # the lstm wants batch X time X value
        lstm = LSTM(dim=self.lstm_dim,
                    weights_init=IsotropicGaussian(mean=0.5, std=1),
                    biases_init=Constant(1))
        lstm.initialize()
        h, _dummy = lstm.apply(lstm_input)

        # this is where Alex Graves' paper starts
        output_transform = Linear(
            self.lstm_dim,
            self.output_dim,
            #weights_init=Uniform(mean=data_mean, std=data_std),
            weights_init=IsotropicGaussian(mean=0., std=1),
            biases_init=Constant(1),
            name="output_transform")
        output_transform.initialize()
        y_hat = output_transform.apply(h)

        # transforms to find each gmm params (mu, pi, sig)
        #pis = NDimensionalSoftmax.apply(y_hat[:,:,0:self.gmm_dim])
        # small hack to softmax a 3D tensor: flatten to 2D, apply the
        # softmax, then reshape back to (batch_dim, time_dim, gmm_dim)
        pis = T.reshape(
            T.nnet.softmax(
                T.reshape(y_hat[:, :, 0:self.gmm_dim],
                          (self.time_dim * self.batch_dim, self.gmm_dim))),
            (self.batch_dim, self.time_dim, self.gmm_dim))
        #sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])
        # the ReLU plus a 0.1 offset keeps sig at or above 0.1
        sig = T.nnet.relu(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 0.1
        mus = y_hat[:, :, self.gmm_dim * 2:]

        # add singleton axes so the parameters (axis 3) and the targets
        # (axis 2) broadcast against each other
        pis = pis[:, :, :, np.newaxis]
        mus = mus[:, :, :, np.newaxis]
        sig = sig[:, :, :, np.newaxis]
        y = y[:, :, np.newaxis, :]

        #sig=theano.printing.Print()(sig)

        # sum likelihood with targets
        # sum inside log across mixtures, sum outside log across time
        #LL = -T.log((pis*(1./(T.sqrt(2.*np.pi)*sig))*T.exp(-0.5*((y-mus)**2)/sig**2)).sum(axis=2)).sum()
        expo = T.exp(-0.5 * ((y - mus)**2) / sig**2)
        test_expo = theano.function([x, y], [expo, mus, sig])
        # NOTE(review): early return -- all the code below is unreachable
        return test_expo

        coeff = pis * (1. / (T.sqrt(2. * np.pi) * sig))
        inside_log = (coeff * expo).sum(axis=2)
        LL = -(T.log(inside_log)).sum()

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        # plain gradient descent with the fixed learning rate self.lr
        grads = T.grad(LL, parameters)
        updates = []
        for i in range(len(grads)):
            updates.append(
                tuple([parameters[i], parameters[i] - self.lr * grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        gradf = theano.function([x, y], [LL], updates=updates)
        f = theano.function([x], [pis, sig, mus])

        return gradf, f
Ejemplo n.º 21
0
    def build_theano_functions(self) :
        x = T.ftensor3('x') # shape of input : batch X time X value
        y = T.ftensor3('y')
        z = T.ftensor3('z')

        layers_input = [x]
        dims =np.array([self.input_dim])
        for dim in self.lstm_layers_dim :
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)) :

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer+1]*4,
                            #weights_init=Uniform(mean=data_mean, std=1),
                            weights_init=IsotropicGaussian(mean=1.,std=1),
                            biases_init=Constant(0),
                            name="linear"+str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(
                dim=dims[layer+1],
                weights_init=IsotropicGaussian(mean=0.,std=0.5),
                biases_init=Constant(1),
                name="lstm"+str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # the idea is to have one gaussian parametrize every frequency bin
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  weights_init=IsotropicGaussian(mean=0., std=1),
                                  biases_init=Constant(0),
                                  #use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1 :
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else :
            y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

        sig = T.nnet.relu(y_hat[:,:,:self.output_dim/2])+0.05
        mus = y_hat[:,:,self.output_dim/2:]

        # sum likelihood with targets
        # sum inside log accross mixtures, sum outside log accross time
        inside_expo = -0.5*((y-mus)**2)/sig**2
        expo = T.exp(inside_expo)
        coeff = 1./(T.sqrt(2.*np.pi)*sig)
        inside_log = T.log(coeff*expo)
        inside_log_max = T.max(inside_log, axis=2, keepdims=True)
        LL = -(inside_log_max + T.log(T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum()

        #zinside_expo = -0.5*((z-mus)**2)/sig**2
        #zexpo = T.exp(zinside_expo)
        #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig))
        #zinside_log = (zcoeff*zexpo).sum(axis=2)
        #zLL = -(T.log(zinside_log)).sum()

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        grads = T.grad(LL, parameters)
        updates = []
        lr = T.scalar('lr')
        for i in range(len(grads)) :
            #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
            updates.append(tuple([parameters[i], parameters[i] - lr*grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        if self.debug :
            gradf = theano.function([x, y, lr],[LL, mus, sig],updates=updates)
        else :
            #gradf = theano.function([x, y, z],[zLL],updates=updates)
            gradf = theano.function([x, y, lr],[LL],updates=updates)
        f = theano.function([x],[sig, mus])

        return gradf, f
Ejemplo n.º 22
0
    def build_theano_functions(self) :
        # shape of theano inpu is time+1 X features
        x = T.fmatrix('frequency_sequence')
        x = x.reshape((self.batch_dim, self.time_dim+1, self.input_dim))

        y = x[:,1:self.time_dim+1,:]
        x = x[:,:self.time_dim,:]

        layers_input = [x]
        dims =np.array([self.input_dim])
        for dim in self.lstm_layers_dim :
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)) :

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer+1]*4,
                            weights_init=Orthogonal(self.orth_scale),
                            #weights_init=IsotropicGaussian(mean=1.,std=1),
                            biases_init=Constant(0),
                            name="linear"+str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(
                dim=dims[layer+1],
                weights_init=IsotropicGaussian(mean=0.,std=0.5),
                biases_init=Constant(1),
                name="lstm"+str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # the idea is to have one gaussian parametrize every frequency bin
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  #weights_init=IsotropicGaussian(mean=0., std=1),
                                  weights_init=Orthogonal(self.orth_scale),
                                  biases_init=Constant(0),
                                  #use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1 :
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else :
            y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

        sig = T.nnet.relu(y_hat[:,:,:self.output_dim/2])+0.05
        mus = y_hat[:,:,self.output_dim/2:]

        # sum likelihood with targets
        # sum inside log accross mixtures, sum outside log accross time
        inside_expo = -0.5*((y-mus)**2)/sig**2
        expo = T.exp(inside_expo)
        coeff = 1./(T.sqrt(2.*np.pi)*sig)
        inside_log = T.log(coeff*expo)
        inside_log_max = T.max(inside_log, axis=2, keepdims=True)
        LL = -(inside_log_max + T.log(T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum()
        LL.name = "summed_likelihood"

        model = Model(LL)
        self.model = model

        algorithm = GradientDescent(
            cost=LL,
            parameters=model.parameters,
            step_rule=AdaGrad())

        f = theano.function([x],[sig, mus])

        return algorithm, f