예제 #1
0
    def __init__(self, readout, transition, dim_dec, attention=None,
                 add_contexts=True, pointer_weight=0.5,
                 transition_with_att_class=None,
                 use_word_annotations=False, **kwargs):
        """Assemble the generator from a readout, a transition and a fork.

        Parameters
        ----------
        readout : brick stored as ``self.readout`` and registered as a child.
        transition : recurrent brick; its ``apply.sequences`` (minus any
            name containing ``'mask'``) become ``self.inputs``.  It is then
            wrapped, see ``attention``.
        dim_dec : stored unchanged as ``self.dim_dec``.
        attention : when truthy, the transition is wrapped with
            ``transition_with_att_class``; otherwise with
            ``FakeAttentionRecurrent``.
        add_contexts : forwarded to ``transition_with_att_class``.
        pointer_weight : stored unchanged as ``self.pointer_weight``.
        transition_with_att_class : callable used to wrap the transition
            when ``attention`` is truthy.
        use_word_annotations : when truthy, an extra ``Linear`` child named
            ``'input_attention_preprocessor'`` is created.
        **kwargs : forwarded to the parent constructor.
        """
        super(Generator, self).__init__(**kwargs)

        # Every sequence argument of the raw transition, masks excluded.
        sequence_names = transition.apply.sequences
        self.inputs = [seq for seq in sequence_names if 'mask' not in seq]
        self.dim_dec = dim_dec
        self.pointer_weight = pointer_weight

        input_fork = Fork(self.inputs)
        # The parent constructor has already been called above, so this
        # only mutates the local kwargs dict.
        kwargs.setdefault('fork', input_fork)

        if not attention:
            wrapped_transition = FakeAttentionRecurrent(
                transition, name="with_fake_attention")
        else:
            wrapped_transition = transition_with_att_class(
                transition, attention,
                add_contexts=add_contexts, name="att_trans")

        self.readout = readout
        self.transition = wrapped_transition
        self.fork = input_fork
        self.children = [self.readout, self.fork, self.transition]

        self.use_word_annotations = use_word_annotations
        if use_word_annotations:
            # NOTE(review): the keyword here is `bias`, not `use_bias` --
            # confirm that this Linear accepts it.
            preprocessor = Linear(name='input_attention_preprocessor',
                                  bias=False)
            self.word_annotation_preprocessor = preprocessor
            self.children.append(self.word_annotation_preprocessor)
예제 #2
0
    def __init__(self, output_names, input_dim, prototype=None, **kwargs):
        """Set up a fork that maps one input to several named outputs.

        Parameters
        ----------
        output_names : names of the outputs; stored on the instance and
            passed to the parent constructor.
        input_dim : stored as ``self.input_dim``.
        prototype : brick passed to the parent as ``prototype``; a default
            ``Linear()`` is used when the argument is falsy.
        **kwargs : forwarded to the parent constructor.  ``child_prefix``
            defaults to ``'fork'`` unless the caller supplies it.
        """
        child_prototype = prototype if prototype else Linear()

        self.output_names = output_names
        self.input_dim = input_dim

        if 'child_prefix' not in kwargs:
            kwargs['child_prefix'] = 'fork'
        super(Fork, self).__init__(
            output_names, prototype=child_prototype, **kwargs)

        # Reset after the parent constructor has run.
        self.input_dims = None
예제 #3
0
 def __init__(self, input_names, input_dims, output_dim, prototype=None,
              **kwargs):
     """Set up a merge of several named inputs into one output width.

     Parameters
     ----------
     input_names : names of the inputs, passed to the parent constructor.
     input_dims : passed to the parent constructor unchanged.
     output_dim : stored as ``self.output_dim``; the parent receives one
         copy of it per entry of ``input_names``.
     prototype : brick passed to the parent; a bias-free ``Linear`` is
         used when the argument is falsy.
     **kwargs : forwarded to the parent constructor.
     """
     child_prototype = prototype if prototype else Linear(use_bias=False)
     self.output_dim = output_dim

     # Every input is given the same output width.
     per_input_output_dims = [output_dim for _name in input_names]
     super(Merge, self).__init__(
         input_names, input_dims, per_input_output_dims,
         child_prototype, **kwargs)
예제 #4
0
    def __init__(self, target_names, source_name, target_dims, source_dim,
                 prototype=None, **kwargs):
        """Set up a brick that distributes one source over several targets.

        Parameters
        ----------
        target_names : stored on the instance; passed to the parent as
            ``output_names``.
        source_name : stored as ``self.source_name``.
        target_dims : stored on the instance; passed to the parent as
            ``output_dims``.
        source_dim : stored on the instance; passed to the parent as
            ``input_dim``.
        prototype : brick passed to the parent; a bias-free ``Linear`` is
            used when the argument is falsy.
        **kwargs : forwarded to the parent constructor.
        """
        child_prototype = prototype if prototype else Linear(use_bias=False)

        self.target_names, self.source_name = target_names, source_name
        self.target_dims, self.source_dim = target_dims, source_dim

        # The parent speaks in terms of one input and several outputs.
        super(Distribute, self).__init__(
            output_names=target_names,
            output_dims=target_dims,
            input_dim=source_dim,
            prototype=child_prototype,
            **kwargs)
예제 #5
0
    def build_theano_functions(self):
        """Build the Theano training and prediction functions of the GMM model.

        One ``Linear`` + ``LSTM`` pair is stacked per entry of
        ``self.lstm_layers_dim``.  The LSTM outputs are projected to
        ``self.output_dim`` values, which are split into mixture weights
        (``pis``), standard deviations (``sig``) and means (``mus``).  The
        cost ``LL`` is the mean negative log-likelihood of ``y`` under that
        mixture, evaluated with a max-shifted log-sum-exp.

        Reads ``self.time_dim``, ``self.lstm_layers_dim``,
        ``self.orth_scale``, ``self.output_dim``, ``self.gmm_dim``,
        ``self.sequence_dim``, ``self.batch_dim`` and ``self.debug``; stores
        the ``Model`` built from the cost in ``self.model``.  ``EPS`` is not
        defined here and is expected to come from the enclosing module.

        Returns:
            A pair ``(gradf, f)``.  ``gradf(x, y, lr)`` applies one plain
            gradient-descent step of size ``lr`` and returns ``[LL]``
            (``[LL, pis, mus, sig]`` when ``self.debug`` is truthy).
            ``f(x)`` returns ``[pis, sig, mus]`` and updates nothing.
        """
        x = T.ftensor3('x')  # shape of input : batch X time X value
        y = T.ftensor4('y')

        # layers_input[k] is the input of layer k; each LSTM output is
        # appended so that it feeds the next layer.
        layers_input = [x]
        # dims = [time_dim, lstm_layers_dim[0], lstm_layers_dim[1], ...]
        dims = np.array([self.time_dim])
        for dim in self.lstm_layers_dim:
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)):

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(
                dims[layer],
                dims[layer + 1] * 4,
                weights_init=Orthogonal(self.orth_scale),
                #weights_init=IsotropicGaussian(mean=1.,std=1),
                biases_init=Constant(0),
                name="linear" + str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(dim=dims[layer + 1],
                        weights_init=IsotropicGaussian(mean=0., std=0.5),
                        biases_init=Constant(1),
                        name="lstm" + str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            # (overwrites the value set by initialize(); draws from the
            # global np.random state)
            lstm.W_state.set_value(
                self.orth_scale *
                Orthogonal().generate(np.random,
                                      lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        # The output projection takes the outputs of all LSTM layers
        # concatenated, hence the summed width.
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(
            dims[1:].sum(),
            self.output_dim,
            weights_init=Orthogonal(self.orth_scale),
            #weights_init=IsotropicGaussian(mean=0., std=1),
            use_bias=False,
            name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1:
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else:
            y_hat = output_transform.apply(
                T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor
        #pis = T.reshape(
        #            T.nnet.softmax(
        #                T.nnet.sigmoid(
        #                    T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))),
        #            (self.batch_dim, self.time_dim, self.gmm_dim))
        # The first gmm_dim outputs are flattened to 2D for the softmax
        # and reshaped back to (batch_dim, sequence_dim, gmm_dim).
        pis = T.reshape(
            T.nnet.softmax(
                T.reshape(y_hat[:, :, :self.gmm_dim],
                          (self.sequence_dim * self.batch_dim, self.gmm_dim))),
            (self.batch_dim, self.sequence_dim, self.gmm_dim))
        # exp keeps sig positive; the 1e-6 offset keeps it away from zero.
        sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6
        #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
        #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:])
        mus = y_hat[:, :, self.gmm_dim * 2:]

        # Add a trailing axis so the parameters line up with the 4D target.
        pis = pis[:, :, :, np.newaxis]
        mus = mus[:, :, :, np.newaxis]
        sig = sig[:, :, :, np.newaxis]
        #y = y[:,:,np.newaxis,:]

        # Mark the axes along which each tensor is broadcast: y over
        # axis 2, mus and sig over axis 3.
        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
        expo_term = -0.5 * ((y - mus)**2) / sig**2
        # Log of the Gaussian normalising constant, floored at EPS before
        # the log is taken.
        coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        # Per-component log-probability: log weight plus the log-densities
        # summed over axis 3.
        log_sequences = T.log(pis + EPS) + T.sum(
            sequences, axis=3, keepdims=True)

        # Log-sum-exp over axis 2, shifted by the maximum for stability.
        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(
            T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))
               ).mean()

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        # Plain gradient descent: p <- p - lr * dLL/dp, with the step size
        # supplied at call time.
        grads = T.grad(LL, parameters)
        updates = []
        lr = T.scalar('lr')
        for i in range(len(grads)):
            #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
            updates.append(
                tuple([parameters[i], parameters[i] - lr * grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        if self.debug:
            # Debug mode also returns the mixture parameters.
            gradf = theano.function([x, y, lr], [LL, pis, mus, sig],
                                    updates=updates)
        else:
            #gradf = theano.function([x, y, z],[zLL],updates=updates)
            gradf = theano.function([x, y, lr], [LL], updates=updates)
        f = theano.function([x], [pis, sig, mus])

        return gradf, f
예제 #6
0
    def build_theano_functions(self):
        x = T.ftensor3('x')  # shape of input : batch X time X value
        y = T.ftensor3('y')
        z = T.ftensor3('z')

        layers_input = [x]
        dims = np.array([self.input_dim])
        for dim in self.lstm_layers_dim:
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)):

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(
                dims[layer],
                dims[layer + 1] * 4,
                #weights_init=Uniform(mean=data_mean, std=1),
                weights_init=IsotropicGaussian(mean=1., std=1),
                biases_init=Constant(0),
                name="linear" + str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(dim=dims[layer + 1],
                        weights_init=IsotropicGaussian(mean=0., std=0.5),
                        biases_init=Constant(1),
                        name="lstm" + str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(Orthogonal().generate(
                np.random,
                lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # the idea is to have one gaussian parametrize every frequency bin
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(
            dims[1:].sum(),
            self.output_dim,
            weights_init=IsotropicGaussian(mean=0., std=1),
            biases_init=Constant(0),
            #use_bias=False,
            name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1:
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else:
            y_hat = output_transform.apply(
                T.concatenate(layers_input[1:], axis=2))

        sig = T.nnet.relu(y_hat[:, :, :self.output_dim / 2]) + 0.05
        mus = y_hat[:, :, self.output_dim / 2:]

        # sum likelihood with targets
        # sum inside log accross mixtures, sum outside log accross time
        inside_expo = -0.5 * ((y - mus)**2) / sig**2
        expo = T.exp(inside_expo)
        coeff = 1. / (T.sqrt(2. * np.pi) * sig)
        inside_log = T.log(coeff * expo)
        inside_log_max = T.max(inside_log, axis=2, keepdims=True)
        LL = -(inside_log_max + T.log(
            T.sum(T.exp(inside_log - inside_log_max), axis=2,
                  keepdims=True))).sum()

        #zinside_expo = -0.5*((z-mus)**2)/sig**2
        #zexpo = T.exp(zinside_expo)
        #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig))
        #zinside_log = (zcoeff*zexpo).sum(axis=2)
        #zLL = -(T.log(zinside_log)).sum()

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        grads = T.grad(LL, parameters)
        updates = []
        lr = T.scalar('lr')
        for i in range(len(grads)):
            #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
            updates.append(
                tuple([parameters[i], parameters[i] - lr * grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        if self.debug:
            gradf = theano.function([x, y, lr], [LL, mus, sig],
                                    updates=updates)
        else:
            #gradf = theano.function([x, y, z],[zLL],updates=updates)
            gradf = theano.function([x, y, lr], [LL], updates=updates)
        f = theano.function([x], [sig, mus])

        return gradf, f
예제 #7
0
    def build_theano_functions(self):
        """Build the Blocks training algorithm and the prediction function.

        The single matrix input is reshaped to
        ``(batch_dim, sequence_dim, time_dim)``; the targets are the input
        shifted one step along axis 1.  When ``self.image_size`` is not
        ``None``, features produced by ``self.conv.build_conv_layers`` from
        a 4D ``'spectrogram'`` tensor are concatenated to the input.  One
        ``Linear`` + ``LSTM`` pair is stacked per entry of
        ``self.lstm_layers_dim``, and the projected output is split into
        mixture weights (``pis``), standard deviations (``sig``) and means
        (``mus``).  The cost is the mean negative log-likelihood of the
        targets under that mixture.

        Reads ``self.batch_dim``, ``self.sequence_dim``, ``self.time_dim``,
        ``self.image_size``, ``self.lstm_layers_dim``, ``self.orth_scale``,
        ``self.output_dim`` and ``self.gmm_dim``; stores the ``Model`` built
        from the cost in ``self.model``.  ``EPS`` is not defined here and is
        expected to come from the enclosing module.

        Returns:
            A pair ``(algorithm, f)``: a ``GradientDescent`` instance using
            the ``Adam`` step rule on the cost, and a Theano function
            returning ``[pis, sig, mus]``.
        """
        x = T.fmatrix('time_sequence')
        x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim))

        # Targets are the inputs shifted one step ahead along axis 1, so
        # both end up with sequence_dim - 1 steps.
        y = x[:, 1:self.sequence_dim, :]
        x = x[:, :self.sequence_dim - 1, :]

        # if we try to include the spectrogram features
        spec_dims = 0
        if self.image_size is not None:
            print "Convolution activated"
            self.init_conv()
            spec = T.ftensor4('spectrogram')
            spec_features, spec_dims = self.conv.build_conv_layers(spec)
            print "Conv final dims =", spec_dims
            # Flatten the conv output dims into one feature width.
            spec_dims = np.prod(spec_dims)
            spec_features = spec_features.reshape(
                (self.batch_dim, self.sequence_dim - 1, spec_dims))
            x = T.concatenate([x, spec_features], axis=2)

        # layers_input[k] is the input of layer k; each LSTM output is
        # appended so that it feeds the next layer.
        layers_input = [x]
        # dims = [time_dim + spec_dims, lstm_layers_dim[0], ...]
        dims = np.array([self.time_dim + spec_dims])
        for dim in self.lstm_layers_dim:
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)):

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer + 1] * 4,
                            weights_init=Orthogonal(self.orth_scale),
                            biases_init=Constant(0),
                            name="linear" + str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X sequence X time
            lstm = LSTM(dim=dims[layer + 1],
                        weights_init=IsotropicGaussian(mean=0., std=0.5),
                        biases_init=Constant(1),
                        name="lstm" + str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            # (overwrites the value set by initialize(); draws from the
            # global np.random state)
            lstm.W_state.set_value(
                self.orth_scale *
                Orthogonal().generate(np.random,
                                      lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        # The output projection takes the outputs of all LSTM layers
        # concatenated, hence the summed width.
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  weights_init=Orthogonal(self.orth_scale),
                                  use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1:
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else:
            y_hat = output_transform.apply(
                T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor
        # (flattened to 2D for the softmax, then reshaped back)
        pis = T.reshape(
            T.nnet.softmax(
                T.reshape(
                    y_hat[:, :, :self.gmm_dim],
                    ((self.sequence_dim - 1) * self.batch_dim, self.gmm_dim))),
            (self.batch_dim, (self.sequence_dim - 1), self.gmm_dim))
        # exp keeps sig positive; the 1e-6 offset keeps it away from zero.
        sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6
        mus = y_hat[:, :, self.gmm_dim * 2:]

        # Insert the axes needed to line everything up as
        # (batch, sequence, mixture, time).
        pis = pis[:, :, :, np.newaxis]
        mus = mus[:, :, :, np.newaxis]
        sig = sig[:, :, :, np.newaxis]
        y = y[:, :, np.newaxis, :]

        # Mark the axes along which each tensor is broadcast: y over
        # axis 2, mus and sig over axis 3.
        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
        expo_term = -0.5 * ((y - mus)**2) / sig**2
        # Log of the Gaussian normalising constant, floored at EPS before
        # the log is taken.
        coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        # Per-component log-probability: log weight plus the log-densities
        # summed over axis 3.
        log_sequences = T.log(pis + EPS) + T.sum(
            sequences, axis=3, keepdims=True)

        # Log-sum-exp over axis 2, shifted by the maximum for stability.
        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(
            T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))
               ).mean()
        LL.name = "summed_likelihood"

        model = Model(LL)
        self.model = model
        # NOTE(review): `parameters` is not used below; the algorithm reads
        # model.parameters directly.
        parameters = model.parameters

        algorithm = GradientDescent(cost=LL,
                                    parameters=model.parameters,
                                    step_rule=Adam())

        # NOTE(review): `x` here is the derived variable (reshaped, sliced,
        # and concatenated with the conv features when enabled), not the
        # original 'time_sequence' matrix -- confirm callers feed f with
        # data in that derived layout.
        f = theano.function([x], [pis, sig, mus])

        return algorithm, f
예제 #8
0
    def build_theano_functions(self, data_mean, data_std):
        """Build a single-layer LSTM mixture model and return a debug function.

        Four ``Linear`` transforms of the input (named ``c``, ``i``, ``f``
        and ``o``) are concatenated and fed to one ``LSTM``.  Its output is
        projected to ``self.output_dim`` values, which are split into
        mixture weights (``pis``), standard deviations (``sig``) and means
        (``mus``).

        As written, the method returns early with a function that evaluates
        the Gaussian exponent term; the cost, the gradient updates and the
        ``(gradf, f)`` pair that follow are never reached, and
        ``self.model`` is never set.

        Args:
            data_mean: used as the ``mean`` of the ``Uniform`` weight
                initialisation and as the constant bias initialisation of
                the four input transforms.
            data_std: used as the ``std`` of that ``Uniform`` initialisation.

        Returns:
            ``test_expo``, a Theano function of ``(x, y)`` returning
            ``[expo, mus, sig]``.
        """
        x = T.ftensor3('x')  # shape of input : batch X time X value
        y = T.ftensor3('y')

        # before the cell, input, forget and output gates, x needs to
        # be transformed
        linear_transforms = []
        for transform in ['c', 'i', 'f', 'o']:
            linear_transforms.append(
                Linear(
                    self.input_dim,
                    self.lstm_dim,
                    weights_init=Uniform(mean=data_mean, std=data_std),
                    #weights_init=IsotropicGaussian(mean=1.,std=1),
                    biases_init=Constant(data_mean),
                    name=transform + "_transform"))

        for transform in linear_transforms:
            transform.initialize()

        linear_applications = []
        for transform in linear_transforms:
            linear_applications.append(transform.apply(x))

        # The four transformed inputs are stacked along the feature axis,
        # giving 4 * lstm_dim features.
        lstm_input = T.concatenate(linear_applications, axis=2)

        # the lstm wants batch X time X value
        lstm = LSTM(dim=self.lstm_dim,
                    weights_init=IsotropicGaussian(mean=0.5, std=1),
                    biases_init=Constant(1))
        lstm.initialize()
        h, _dummy = lstm.apply(lstm_input)

        # this is where Alex Graves' paper starts
        output_transform = Linear(
            self.lstm_dim,
            self.output_dim,
            #weights_init=Uniform(mean=data_mean, std=data_std),
            weights_init=IsotropicGaussian(mean=0., std=1),
            biases_init=Constant(1),
            name="output_transform")
        output_transform.initialize()
        y_hat = output_transform.apply(h)

        # transforms to find each gmm params (mu, pi, sig)
        #pis = NDimensionalSoftmax.apply(y_hat[:,:,0:self.gmm_dim])
        # small hack to softmax a 3D tensor
        # (flattened to 2D for the softmax, then reshaped back)
        pis = T.reshape(
            T.nnet.softmax(
                T.reshape(y_hat[:, :, 0:self.gmm_dim],
                          (self.time_dim * self.batch_dim, self.gmm_dim))),
            (self.batch_dim, self.time_dim, self.gmm_dim))
        #sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])
        # relu + 0.1 keeps sig bounded away from zero.
        sig = T.nnet.relu(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 0.1
        mus = y_hat[:, :, self.gmm_dim * 2:]

        # Insert the axes needed to broadcast the mixture parameters
        # against the targets.
        pis = pis[:, :, :, np.newaxis]
        mus = mus[:, :, :, np.newaxis]
        sig = sig[:, :, :, np.newaxis]
        y = y[:, :, np.newaxis, :]

        #sig=theano.printing.Print()(sig)

        # sum likelihood with targets
        # sum inside log across mixtures, sum outside log across time
        #LL = -T.log((pis*(1./(T.sqrt(2.*np.pi)*sig))*T.exp(-0.5*((y-mus)**2)/sig**2)).sum(axis=2)).sum()
        expo = T.exp(-0.5 * ((y - mus)**2) / sig**2)
        # NOTE(review): `y` was rebound above to the 4D view of the 'y'
        # input, so that derived variable is what is passed as an input
        # here -- confirm this is intended.
        test_expo = theano.function([x, y], [expo, mus, sig])
        # NOTE(review): this early return looks like a debugging leftover.
        # Everything below is unreachable, so the method never builds the
        # cost, never sets self.model and never returns (gradf, f).
        # Confirm with callers before removing it.
        return test_expo

        coeff = pis * (1. / (T.sqrt(2. * np.pi) * sig))
        inside_log = (coeff * expo).sum(axis=2)
        LL = -(T.log(inside_log)).sum()

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        grads = T.grad(LL, parameters)
        updates = []
        for i in range(len(grads)):
            updates.append(
                tuple([parameters[i], parameters[i] - self.lr * grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        gradf = theano.function([x, y], [LL], updates=updates)
        f = theano.function([x], [pis, sig, mus])

        return gradf, f
예제 #9
0
    def build_theano_functions(self) :
        # shape of theano inpu is time+1 X features
        x = T.fmatrix('frequency_sequence')
        x = x.reshape((self.batch_dim, self.time_dim+1, self.input_dim))

        y = x[:,1:self.time_dim+1,:]
        x = x[:,:self.time_dim,:]

        layers_input = [x]
        dims =np.array([self.input_dim])
        for dim in self.lstm_layers_dim :
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)) :

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer+1]*4,
                            weights_init=Orthogonal(self.orth_scale),
                            #weights_init=IsotropicGaussian(mean=1.,std=1),
                            biases_init=Constant(0),
                            name="linear"+str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(
                dim=dims[layer+1],
                weights_init=IsotropicGaussian(mean=0.,std=0.5),
                biases_init=Constant(1),
                name="lstm"+str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # the idea is to have one gaussian parametrize every frequency bin
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  #weights_init=IsotropicGaussian(mean=0., std=1),
                                  weights_init=Orthogonal(self.orth_scale),
                                  biases_init=Constant(0),
                                  #use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1 :
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else :
            y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

        sig = T.nnet.relu(y_hat[:,:,:self.output_dim/2])+0.05
        mus = y_hat[:,:,self.output_dim/2:]

        # sum likelihood with targets
        # sum inside log accross mixtures, sum outside log accross time
        inside_expo = -0.5*((y-mus)**2)/sig**2
        expo = T.exp(inside_expo)
        coeff = 1./(T.sqrt(2.*np.pi)*sig)
        inside_log = T.log(coeff*expo)
        inside_log_max = T.max(inside_log, axis=2, keepdims=True)
        LL = -(inside_log_max + T.log(T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum()
        LL.name = "summed_likelihood"

        model = Model(LL)
        self.model = model

        algorithm = GradientDescent(
            cost=LL,
            parameters=model.parameters,
            step_rule=AdaGrad())

        f = theano.function([x],[sig, mus])

        return algorithm, f