Example #1
    def prediction(self, h, bias):
        srng = RandomStreams(seed=42)

        prop, mean_x, mean_y, std_x, std_y, rho, bernoulli = \
            self.compute_parameters(h, bias)

        mode = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype), axis=1)

        v = T.arange(0, mean_x.shape[0])
        m_x = mean_x[v, mode]
        m_y = mean_y[v, mode]
        s_x = std_x[v, mode]
        s_y = std_y[v, mode]
        r = rho[v, mode]
        # cov = r * (s_x * s_y)

        normal = srng.normal((h.shape[0], 2))
        x = normal[:, 0]
        y = normal[:, 1]

        # x_n = T.shape_padright(s_x * x + cov * y + m_x)
        # y_n = T.shape_padright(s_y * y + cov * x + m_y)

        x_n = T.shape_padright(m_x + s_x * x)
        y_n = T.shape_padright(m_y + s_y * (x * r + y * T.sqrt(1.-r**2)))

        uniform = srng.uniform((h.shape[0],))
        pin = T.shape_padright(T.cast(bernoulli > uniform, floatX))

        return T.concatenate([x_n, y_n, pin], axis=1)
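
The two lines computing x_n and y_n above draw a correlated bivariate normal sample from two independent standard normals. A minimal numpy sketch (a hypothetical standalone helper, not part of this class) of the same identity:

    import numpy as np

    def sample_bivariate(m_x, m_y, s_x, s_y, r, size, rng=np.random):
        # (m_x + s_x*x, m_y + s_y*(r*x + sqrt(1-r^2)*y)) has correlation r
        x = rng.standard_normal(size)
        y = rng.standard_normal(size)
        return m_x + s_x * x, m_y + s_y * (r * x + np.sqrt(1. - r ** 2) * y)

    xs, ys = sample_bivariate(0., 0., 1., 2., 0.7, size=100000)
    print(np.corrcoef(xs, ys)[0, 1])  # close to 0.7
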
 def CTC_train(self):
     CTC_LOSSs = T.cast(T.mean(self.CTC_LOSS(), axis=0), "float32")
     train_data_d = []
     train_data_m = []
     train_data_m_s = []
     learning_rate = T.scalar()
     decay = T.scalar()
     seed = np.random.randint(10e6)
     rng = RandomStreams(seed=seed)
     grad_rate = 0.8
     for data in self.train_data:
         # randomly drop each parameter's gradient with probability 1 - grad_rate
         data_d = rng.binomial((1,), p=grad_rate, dtype="float32")[0] * T.grad(CTC_LOSSs, data)
         train_data_d.append(data_d)
         # shared accumulator for the running mean of squared gradients
         data_m_s = theano.shared(np.zeros(data.get_value().shape).astype(np.float32))
         train_data_m_s.append(data_m_s)
         data_m = data_m_s * decay + (1 - decay) * data_d ** 2
         train_data_m.append(data_m)
     #self.grad_test = theano.function([self.X, self.Y], train_data_d[-4])
     #self.data_d_print = theano.function([self.X,self.Y],train_data_d[0][0])
     #upd = [(d,d-learning_rate*d_d)for d,d_d in zip(self.train_data,train_data_d)]
     # RMSProp-style parameter updates plus accumulator updates
     upd = [(d, d - learning_rate * d_d / T.sqrt(d_m + 1e-4))
            for d, d_d, d_m in zip(self.train_data, train_data_d, train_data_m)]
     upd1 = [(d_m_s, decay * d_m_s + (1 - decay) * d_d ** 2)
             for d_m_s, d_d in zip(train_data_m_s, train_data_d)]
     upd += upd1
     #self.test = theano.function([self.X,self.Y],train_data_d[0])
     self.sgd_train = theano.function([self.X, self.Y, learning_rate, decay],
                                      [],
                                      updates=upd)
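
The updates built above implement an RMSProp-style rule with an extra stochastic gradient mask (grad_rate). A minimal numpy sketch of the unmasked rule encoded by `upd` and `upd1` (hyperparameter names are assumptions):

    import numpy as np

    def rmsprop_step(param, grad, ms, learning_rate, decay, eps=1e-4):
        ms_new = decay * ms + (1. - decay) * grad ** 2                     # accumulator (upd1)
        param_new = param - learning_rate * grad / np.sqrt(ms_new + eps)   # parameter step (upd)
        return param_new, ms_new
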
Example #3
    def _negative_sampling(self, num_negative_samples, target_indices):
        assert num_negative_samples > 0

        logging.debug('Stochastically sampling %d negative instances '
                      'out of %d classes (%.2f%%).',
                      num_negative_samples, self.num_entities,
                      100.0 *
                      float(num_negative_samples) / self.num_entities)

        from theano.tensor.shared_randomstreams import RandomStreams

        srng = RandomStreams(
            seed=np.random.randint(low=0, high=(1 << 30)))

        rng_sample_size = (self.batch_size, num_negative_samples,)

        logging.debug(
            'Using %s for random sample generation of %s tensors.',
            RandomStreams, rng_sample_size)

        logging.debug('For every batch %d random integers are sampled.',
                      np.prod(rng_sample_size))

        random_negative_indices = srng.choice(
            rng_sample_size,
            a=self.num_entities,
            p=self.clazz_distribution)

        if self.__DEBUG__:
            random_negative_indices = theano.printing.Print(
                'random_negative_indices')(random_negative_indices)

        return random_negative_indices
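
A minimal numpy sketch (a hypothetical standalone equivalent, not part of this class) of the same sampling step: draw num_negative_samples class indices per batch row from the class distribution.

    import numpy as np

    def sample_negatives(batch_size, num_negative_samples, class_probs, rng=np.random):
        return rng.choice(len(class_probs),
                          size=(batch_size, num_negative_samples),
                          p=class_probs)
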
    def common_init(self, mr, vr, sr, di, ce, node_id):
        """
        Initialization function used by the base class and subclasses.
        """

        self.MEANRATE = mr
        self.VARRATE = vr
        self.STARVRATE = sr
        self.DIMS = di
        self.CENTS = ce
        self.ID = node_id
        srng = RandomStreams(seed=100)
        rv_u = srng.uniform((self.CENTS, self.DIMS))
        f = function([], rv_u)
        self.mean = 2*f()
        #print self.mean
        var1 = T.dscalar('var1')
        var2 = T.dmatrix('var2')
        var3 = T.mul
        self.var = theanoScaMatMul(0.001,np.ones((self.CENTS, self.DIMS)))
        self.starv = np.ones((self.CENTS, 1))
        self.belief = np.zeros((1, self.CENTS))
        self.children = []
        self.last = np.zeros((1, self.DIMS))
        self.whitening = False
    def test_binomial_vector(self):
        random = RandomStreams(utt.fetch_seed())
        n = tensor.lvector()
        prob = tensor.vector()
        out = random.binomial(n=n, p=prob)
        assert out.ndim == 1
        f = function([n, prob], out)

        n_val = [1, 2, 3]
        prob_val = numpy.asarray([.1, .2, .3], dtype=config.floatX)
        seed_gen = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

        # Arguments of size (3,)
        val0 = f(n_val, prob_val)
        numpy_val0 = numpy_rng.binomial(n=n_val, p=prob_val)
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        val1 = f(n_val[:-1], prob_val[:-1])
        numpy_val1 = numpy_rng.binomial(n=n_val[:-1], p=prob_val[:-1])
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = function([n, prob], random.binomial(n=n, p=prob, size=(3,)))
        val2 = g(n_val, prob_val)
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
        numpy_val2 = numpy_rng.binomial(n=n_val, p=prob_val, size=(3,))
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, n_val[:-1], prob_val[:-1])
    def test_uniform_vector(self):
        random = RandomStreams(utt.fetch_seed())
        low = tensor.dvector()
        high = tensor.dvector()
        out = random.uniform(low=low, high=high)
        assert out.ndim == 1
        f = function([low, high], out)

        low_val = [.1, .2, .3]
        high_val = [1.1, 2.2, 3.3]
        seed_gen = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

        # Arguments of size (3,)
        val0 = f(low_val, high_val)
        numpy_val0 = numpy_rng.uniform(low=low_val, high=high_val)
        print('THEANO', val0)
        print('NUMPY', numpy_val0)
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        val1 = f(low_val[:-1], high_val[:-1])
        numpy_val1 = numpy_rng.uniform(low=low_val[:-1], high=high_val[:-1])
        print('THEANO', val1)
        print('NUMPY', numpy_val1)
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = function([low, high], random.uniform(low=low, high=high, size=(3,)))
        val2 = g(low_val, high_val)
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
        numpy_val2 = numpy_rng.uniform(low=low_val, high=high_val, size=(3,))
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, low_val[:-1], high_val[:-1])
    def test_random_integers_vector(self):
        random = RandomStreams(utt.fetch_seed())
        low = tensor.lvector()
        high = tensor.lvector()
        out = random.random_integers(low=low, high=high)
        assert out.ndim == 1
        f = function([low, high], out)

        low_val = [100, 200, 300]
        high_val = [110, 220, 330]
        seed_gen = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

        # Arguments of size (3,)
        val0 = f(low_val, high_val)
        numpy_val0 = numpy.asarray([numpy_rng.randint(low=lv, high=hv+1)
            for lv, hv in zip(low_val, high_val)])
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        val1 = f(low_val[:-1], high_val[:-1])
        numpy_val1 = numpy.asarray([numpy_rng.randint(low=lv, high=hv+1)
            for lv, hv in zip(low_val[:-1], high_val[:-1])])
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = function([low, high], random.random_integers(low=low, high=high, size=(3,)))
        val2 = g(low_val, high_val)
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
        numpy_val2 = numpy.asarray([numpy_rng.randint(low=lv, high=hv+1)
            for lv, hv in zip(low_val, high_val)])
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, low_val[:-1], high_val[:-1])
Example #8
    def __init__(self, n_visible, n_hidden, weights=None, hidden_bias=None, visible_bias=None, random_on_gpu=False,
                 seed=69, activation=T.nnet.sigmoid):
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if random_on_gpu:
            self.t_rng = GPU_RandomStreams(seed)
        else:
            self.t_rng = RandomStreams(seed)

        if weights is None:
            weights = np.asarray(
                np.random.normal(
                    scale=0.01,
                    size=(self.n_visible, self.n_hidden)),
                dtype=theano.config.floatX)
        self.ts_weights = theano.shared(value=weights, name='W', borrow=True)

        if hidden_bias is None:
            hidden_bias = np.zeros(n_hidden, dtype=theano.config.floatX)

        self.ts_hidden_bias = theano.shared(value=hidden_bias, name='hb', borrow=True)

        if visible_bias is None:
            visible_bias = np.zeros(n_visible, dtype=theano.config.floatX)

        self.ts_visible_bias = theano.shared(value=visible_bias, name='vb', borrow=True)

        self.x = T.matrix(name='x')

        self.activation = activation

        self.params = [self.ts_weights, self.ts_hidden_bias, self.ts_visible_bias]
    def test_default_dtype(self):
        random = RandomStreams(utt.fetch_seed())
        low = tensor.dscalar()
        high = tensor.dscalar()

        # Should not silently downcast from low and high
        out0 = random.uniform(low=low, high=high, size=(42,))
        assert out0.dtype == 'float64'
        f0 = function([low, high], out0)
        val0 = f0(-2.1, 3.1)
        assert val0.dtype == 'float64'

        # Should downcast, since asked explicitly
        out1 = random.uniform(low=low, high=high, size=(42,), dtype='float32')
        assert out1.dtype == 'float32'
        f1 = function([low, high], out1)
        val1 = f1(-1.1, 1.1)
        assert val1.dtype == 'float32'

        # Should use floatX
        lowf = tensor.fscalar()
        highf = tensor.fscalar()
        outf = random.uniform(low=lowf, high=highf, size=(42,))
        assert outf.dtype == config.floatX
        ff = function([lowf, highf], outf)
        valf = ff(numpy.float32(-0.1), numpy.float32(0.3))
        assert valf.dtype == config.floatX
    def test_multinomial_vector(self):
        random = RandomStreams(utt.fetch_seed())
        n = tensor.lvector()
        pvals = tensor.matrix()
        out = random.multinomial(n=n, pvals=pvals)
        assert out.ndim == 2
        f = function([n, pvals], out)

        n_val = [1, 2, 3]
        pvals_val = [[.1, .9], [.2, .8], [.3, .7]]
        pvals_val = numpy.asarray(pvals_val, dtype=config.floatX)
        seed_gen = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

        # Arguments of size (3,)
        val0 = f(n_val, pvals_val)
        numpy_val0 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
            for nv, pv in zip(n_val, pvals_val)])
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        val1 = f(n_val[:-1], pvals_val[:-1])
        numpy_val1 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
            for nv, pv in zip(n_val[:-1], pvals_val[:-1])])
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = function([n, pvals], random.multinomial(n=n, pvals=pvals, size=(3,)))
        val2 = g(n_val, pvals_val)
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
        numpy_val2 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
            for nv, pv in zip(n_val, pvals_val)])
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, n_val[:-1], pvals_val[:-1])
    def theano_sentence_prediction(self, Sentence, Chars, WordLengths):

        input_lstm_res_f = self.input_lstm_forward_layer.function(Sentence, Chars, WordLengths)
        input_lstm_res_b = self.input_lstm_backward_layer.function(Sentence, Chars, WordLengths)
        input_combined = T.concatenate((input_lstm_res_f, input_lstm_res_b), axis=1)

        #Make pairwise features. This is really just "tensor product with concatenation instead of multiplication". Is there a command for that?
        full_matrix, _ = theano.scan(fn=self.__pairwise_features,
                                  outputs_info=None,
                                  sequences=input_combined,
                                  non_sequences=[input_combined, Sentence.shape[0]])

        if len(self.lstm_layers) > 0 and self.lstm_layers[0].training:
            srng = RandomStreams(seed=12345)
            full_matrix = T.switch(srng.binomial(size=(Sentence.shape[0], Sentence.shape[0]+1, self.hidden_dimension*4), p=0.5), full_matrix, 0)
        else:
            full_matrix = 0.5 * full_matrix

        full_matrix = self.transition_layer.function(full_matrix)
            
        for layer in self.lstm_layers:
            if layer.training:
                print("hah-train")
                full_matrix = T.switch(srng.binomial(size=(Sentence.shape[0], Sentence.shape[0]+1, self.hidden_dimension*4), p=0.5), full_matrix, 0)
            else:
                print("heh-notrain")
                full_matrix = 0.5 * full_matrix
            
            
            full_matrix = layer.function(full_matrix)
        
        final_matrix = self.output_convolution.function(full_matrix)

        return T.nnet.softmax(final_matrix)
    def _dropout_from_layer(self, layer):

        stream = RandomStreams(self.numpy_range.randint(999999))

        mask = stream.binomial(size=layer.shape, n=1, p=(1-self._p), dtype=theano.config.floatX)

        return layer * Tensor.cast(mask, theano.config.floatX)
Example #13
    def __init__(self, rng, train_input, test_input, n_in, n_out):

        # self.input = input.flatten(2)

        self.W = theano.shared(
            value=numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )

        self.b = theano.shared(
            value=numpy.zeros((n_out,), dtype=theano.config.floatX),
            name='b',
            borrow=True
        )

        p = 0.5

        tmp_output = T.nnet.relu(T.dot(train_input.flatten(2), self.W) + self.b)
        srng = RandomStreams(rng.randint(1234))
        mask = (srng.uniform(size=tmp_output.shape) < p)/p

        self.train_output = tmp_output * mask
        self.test_output = T.nnet.relu(T.dot(test_input.flatten(2), self.W) + self.b)
        self.params = [self.W, self.b]
Example #14
    def __init__(self, classifier, args, noise_dist):
        self.y = T.ivector("y")

        ## Cost function
        #  Sum over minibatch instances of: log( u(w|c) / (u(w|c) + k * p_n(w)) ) + sum over noise samples of log( k * p_n(x) / (u(x|c) + k * p_n(x)) )

        # Generating noise samples
        srng = RandomStreams(seed=1234)
        noise_samples = srng.choice(
            size=(self.y.shape[0], args.num_noise_samples), a=args.num_classes, p=noise_dist, dtype="int32"
        )

        log_noise_dist = theano.shared(np.log(noise_dist.get_value()), borrow=True)
        # log_num_noise_samples = theano.shared(math.log(args.num_noise_samples)).astype(theano.config.floatX)
        log_num_noise_samples = theano.shared(np.log(args.num_noise_samples, dtype=theano.config.floatX))
        # Data Part of Cost Function: log ( u(w|c) / (u(w|c) + k * p_n(w))
        data_scores = classifier.output[T.arange(self.y.shape[0]), self.y]
        data_denom = self.logadd(data_scores, log_num_noise_samples + log_noise_dist[self.y])
        data_prob = data_scores - data_denom
        # Summation of the noise part of the cost function: sum over noise samples of log( k * p_n(x) / (u(x|c) + k * p_n(x)) )
        noise_mass = (
            log_num_noise_samples + log_noise_dist[noise_samples]
        )  # log(k) + log(p_n(x)) for all noise samples (Shape: #instances x k)
        noise_scores = classifier.output[T.arange(noise_samples.shape[0]).reshape((-1, 1)), noise_samples]
        noise_denom = self.logadd(noise_scores, noise_mass)
        noise_prob_sum = T.sum(noise_mass - noise_denom, axis=1)

        self.cost = -T.mean(data_prob + noise_prob_sum)
        self.test = T.sum(data_scores)
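
`self.logadd` is not shown in this example; a minimal sketch, assuming it is a numerically stable elementwise log(exp(a) + exp(b)) as its use above suggests:

    def logadd(a, b):
        # stable elementwise log(exp(a) + exp(b))
        m = T.maximum(a, b)
        return m + T.log(T.exp(a - m) + T.exp(b - m))
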
    def test_normal_vector(self):
        random = RandomStreams(utt.fetch_seed())
        avg = tensor.dvector()
        std = tensor.dvector()
        out = random.normal(avg=avg, std=std)
        assert out.ndim == 1
        f = function([avg, std], out)

        avg_val = [1, 2, 3]
        std_val = [.1, .2, .3]
        seed_gen = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

        # Arguments of size (3,)
        val0 = f(avg_val, std_val)
        numpy_val0 = numpy_rng.normal(loc=avg_val, scale=std_val)
        assert numpy.allclose(val0, numpy_val0)

        # arguments of size (2,)
        val1 = f(avg_val[:-1], std_val[:-1])
        numpy_val1 = numpy_rng.normal(loc=avg_val[:-1], scale=std_val[:-1])
        assert numpy.allclose(val1, numpy_val1)

        # Specifying the size explicitly
        g = function([avg, std], random.normal(avg=avg, std=std, size=(3,)))
        val2 = g(avg_val, std_val)
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
        numpy_val2 = numpy_rng.normal(loc=avg_val, scale=std_val, size=(3,))
        assert numpy.allclose(val2, numpy_val2)
        self.assertRaises(ValueError, g, avg_val[:-1], std_val[:-1])
Example #16
    def __init__(self, neurons, dimensions, count = 1, max_rate = (200, 300),
            intercept = (-1.0, 1.0), t_ref = 0.002, t_rc = 0.02, seed = None,
            type = 'lif', dt = 0.001, encoders = None, name = None, address = "localhost"):
        self.seed = seed
        self.neurons = neurons
        self.dimensions = dimensions
        self.count = count
        self.name = name
        self.address = address
        self.ticker_conn = None

        # create the neurons
        # TODO: handle different neuron types, which may have different parameters to pass in
        self.neuron = neuron.names[type]((count, self.neurons), t_rc = t_rc, t_ref = t_ref, dt = dt)

        # compute alpha and bias
        srng = RandomStreams(seed=seed)
        max_rates = srng.uniform([neurons], low=max_rate[0], high=max_rate[1])
        threshold = srng.uniform([neurons], low=intercept[0], high=intercept[1])
        alpha, self.bias = theano.function([], self.neuron.make_alpha_bias(max_rates,threshold))()
        self.bias = self.bias.astype('float32')

        # compute encoders
        self.encoders = make_encoders(neurons, dimensions, srng, encoders=encoders)
        self.encoders = (self.encoders.T * alpha).T

        # make default origin
        self.origin = dict(X=origin.Origin(self))
        self.accumulator = {}
Example #17
    def theano_sentence_prediction(self, Vs):
        #Make pairwise features. This is really just "tensor product with concatenation instead of multiplication". Is there a command for that?
        pairwise_vs, _ = theano.scan(fn=self.__pairwise_features,
                                  outputs_info=None,
                                  sequences=Vs,
                                  non_sequences=[Vs, Vs.shape[0]])

        if self.input_lstm_layer.training:
            srng = RandomStreams(seed=12345)
        
        full_matrix = self.input_lstm_layer.function(pairwise_vs)

        for layer in self.lstm_layers:            
            if self.input_lstm_layer.training:
                print("hah-train")
                full_matrix = T.switch(srng.binomial(size=(Vs.shape[0], Vs.shape[0]+1, self.hidden_dimension*4), p=0.5), full_matrix, 0)
            else:
                print("heh-notrain")
                full_matrix = 0.5 * full_matrix
            
            full_matrix = layer.function(full_matrix)

        if self.input_lstm_layer.training:
            print("hah-train")
            full_matrix = T.switch(srng.binomial(size=(Vs.shape[0], Vs.shape[0]+1, self.hidden_dimension*4), p=0.5), full_matrix, 0)
        else:
            print("heh-notrain")
            full_matrix = 0.5 * full_matrix
            
        final_matrix = self.output_convolution.function(full_matrix)

        return T.nnet.softmax(final_matrix)
def kmeans(train_set_x):

    if train_set_x is None:
        train_set_x = T.matrix('train_set_x')

    ########################
    # Normalize the inputs #
    ########################

    epsilon_norm = 10
    epsilon_zca = 0.015
    K = 500

    train_set_x = (train_set_x - T.mean(train_set_x, axis=0)) / T.sqrt(T.var(train_set_x, axis=0) + epsilon_norm)

    #####################
    # Whiten the inputs #
    #####################

    # a simple choice of whitening transform is the ZCA whitening transform
    # epsilon_zca is a small constant
    # for contrast-normalized data, setting epsilon_zca to 0.01 for 16-by-16 pixel patches,
    #                                                 or to  0.1 for 8-by-8   pixel patches
    # is a good starting point
    cov = T.dot(train_set_x, T.transpose(train_set_x)) / train_set_x.shape[1]
    U, S, V = linalg.svd(cov)
    tmp = T.dot(U, T.diag(1/T.sqrt(S + epsilon_zca)))
    tmp = T.dot(tmp, T.transpose(U))
    whitened_x = T.dot(tmp, train_set_x)

    ######################
    # Training the Model #
    ######################

    # Initialization
    dimension_size = whitened_x.shape[0]
    num_samples = whitened_x.shape[1]
    srng = RandomStreams(seed=234)

    D = srng.normal(size=(dimension_size, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    # typically 10 iterations is enough
    num_iteration = 15

    # compute new centroids, D_new
    for i in xrange(num_iteration):

        dx = T.dot(D.T, whitened_x)
        arg_max_dx = T.argmax(dx, axis=0)
        s = dx[arg_max_dx, T.arange(num_samples)]

        S = T.zeros((K, num_samples))
        S = T.set_subtensor(S[arg_max_dx, T.arange(num_samples)], s)
        D = T.dot(whitened_x, T.transpose(S)) + D

        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    return D
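
The whitening step above can be checked outside the Theano graph; a minimal numpy sketch of the same ZCA transform (assuming data laid out with samples in columns, as in the code):

    import numpy as np

    def zca_whiten(X, epsilon_zca=0.015):
        # X: dimensions x samples, already mean-normalized
        cov = X.dot(X.T) / X.shape[1]
        U, S, _ = np.linalg.svd(cov)
        W = U.dot(np.diag(1. / np.sqrt(S + epsilon_zca))).dot(U.T)
        return W.dot(X)
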
Example #19
def build_model(tparams, options):
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    if options['use_target_as_input']:
        x = tensor.tensor3('x', dtype='float32')
    else:
        x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x dim
    ctx = tensor.matrix('ctx', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # word embedding
    if options['use_target_as_input']:
        emb = x
    else:
        emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
    # decoder
    if options.setdefault('feedforward', False):
        proj_h = tensor.dot(emb, tparams['Wff'])
        proj_h = (proj_h * mask[:,:,None]).sum(axis=0)
        proj_h = proj_h / mask.sum(axis=0)[:,None]
    elif options.setdefault('regress', False):
        proj_h = (emb * mask[:,:,None]).sum(axis=0)
        proj_h = tensor.dot(proj_h, tparams['Wff'])
        proj_h = proj_h / mask.sum(axis=0)[:,None]
    else:
        proj = get_layer('lstm')[1](tparams, emb, options, 
                                    prefix='encoder', 
                                    mask=mask)
        proj_h = proj[0]
        if options['use_mean']:
            proj_h = (proj_h * mask[:,:,None]).sum(axis=0)
            proj_h = proj_h / mask.sum(axis=0)[:,None]
        else:
            proj_h = proj_h[-1]

    if 'n_layers' in options:
        for lidx in xrange(1, options['n_layers']):
            proj_h = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_out_%d'%lidx, activ='tanh')
    out = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_out', activ='linear')

    # cost
    if options['loss_type'] == 'cosine':
        out = out / tensor.sqrt((out ** 2).sum(1))[:,None]
        cost = 1. - (out * ctx).sum(1)
    elif options['loss_type'] == 'ranking':
        out = out / tensor.sqrt((out ** 2).sum(1))[:,None]
        rndidx = trng.permutation(n=ctx.shape[0])
        ctx_rnd = ctx[rndidx]
        cost = tensor.maximum(0., 1 - (out * ctx).sum(1) + (out * ctx_rnd).sum(1))
    else:
        raise Exception('Unknown loss function')

    return trng, use_noise, x, mask, ctx, cost
Example #20
    def __init__(self,X,mask,shape,is_train=1,p=0.5,state_pre=None):
        prefix="GRU"
        self.in_size,self.hidden_size=shape

        self.W_xr=theano.shared(value=np.asarray((np.random.randn(self.in_size,self.hidden_size) * 0.1),dtype=theano.config.floatX),
                                name=prefix+'_W_xr')
        self.W_hr=theano.shared(value=np.asarray((np.random.randn(self.hidden_size,self.hidden_size) * 0.1),dtype=theano.config.floatX),
                                name=prefix+'_W_hr')
        self.b_r=theano.shared(value=np.asarray(np.zeros(self.hidden_size),dtype=theano.config.floatX),
                               name=prefix+'_b_r')

        self.W_xz=theano.shared(value=np.asarray((np.random.randn(self.in_size,self.hidden_size) * 0.1),dtype=theano.config.floatX),
                                name=prefix+'_W_xz')
        self.W_hz=theano.shared(value=np.asarray((np.random.randn(self.hidden_size,self.hidden_size) * 0.1),dtype=theano.config.floatX),
                                name=prefix+'_W_hz')
        self.b_z=theano.shared(value=np.asarray(np.zeros(self.hidden_size),dtype=theano.config.floatX),
                               name=prefix+'_b_z')

        self.W_xh=theano.shared(value=np.asarray((np.random.randn(self.in_size,self.hidden_size) * 0.1),dtype=theano.config.floatX),
                                name=prefix+'_W_xh')
        self.W_hh=theano.shared(value=np.asarray((np.random.randn(self.hidden_size,self.hidden_size) * 0.1),dtype=theano.config.floatX),
                                name=prefix+'_W_hh')
        self.b_h=theano.shared(value=np.asarray(np.zeros(self.hidden_size),dtype=theano.config.floatX),
                               name=prefix+'_b_h')

        self.X=X
        self.mask=mask


        batch_size=self.X.shape[1]
        if state_pre is None:
            state_pre=T.zeros((batch_size,self.hidden_size),dtype=theano.config.floatX)

        def _step(x,m,h_tm1):
            r=T.nnet.sigmoid(T.dot(x,self.W_xr) + T.dot(h_tm1,self.W_hr) +self.b_r)
            z=T.nnet.sigmoid(T.dot(x,self.W_xz) + T.dot(h_tm1,self.W_hz) +self.b_z)

            gh=T.tanh(T.dot(x , self.W_xh) + T.dot(r * h_tm1 , self.W_hh) + self.b_h)

            h_t=z * h_tm1 + (T.ones_like(z) - z) * gh

            h_t = h_t * m[:,None]

            return h_t

        h,_=theano.scan(fn=_step,
                        sequences=[self.X,self.mask],
                        outputs_info=state_pre)
        self.h=h
        if p>0:
            trng=RandomStreams(12345)
            drop_mask=trng.binomial(n=1,p=1-p,size=h.shape,dtype=theano.config.floatX)
            self.activation=T.switch(T.eq(is_train,1),h*drop_mask,h*(1-p))
        else:
            self.activation=T.switch(T.eq(is_train,1),h,h)

        self.params=[self.W_xr,self.W_hr,self.b_r,
                     self.W_xz,self.W_hz,self.b_z,
                     self.W_xh,self.W_hh,self.b_h]
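
A minimal usage sketch for the layer above (the class name GRU and the sizes are assumptions; the snippet only shows __init__):

    in_size, hidden_size = 100, 200
    X = T.tensor3('X')        # (timesteps, batch, in_size)
    mask = T.matrix('mask')   # (timesteps, batch)
    gru = GRU(X, mask, shape=(in_size, hidden_size), is_train=1, p=0.5)
    hidden = gru.activation   # (timesteps, batch, hidden_size)
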
  def __init__(self, input, filter_shape, corruption_level = 0.1, 
               shared_W = None, shared_b = None, image_shape = None, 
               poolsize = (2,2)):

    theano_rng = RandomStreams()
    
    fan_in = numpy.prod(filter_shape[1:])
    fan_out = filter_shape[0] * numpy.prod(filter_shape[2:])

    center = theano.shared(value = 1, name="center")
    scale = theano.shared(value = 2, name="scale")

    if shared_W is not None and shared_b is not None:
        self.W = shared_W
        self.b = shared_b
    else:
        initial_W = numpy.asarray( numpy.random.uniform(
              low = -numpy.sqrt(6./(fan_in+fan_out)),
              high = numpy.sqrt(6./(fan_in+fan_out)),
              size = filter_shape), dtype = theano.config.floatX)
        initial_b = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
        self.W = theano.shared(value = initial_W, name = "W")
        self.b = theano.shared(value = initial_b, name = "b")
    
 
    initial_b_prime= numpy.zeros((filter_shape[1],),dtype=theano.config.floatX)

    self.b_prime = theano.shared(value = initial_b_prime, name = "b_prime")
 
    self.x = input

    self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level,dtype=theano.config.floatX) * self.x

    conv1_out = conv.conv2d(self.tilde_x, self.W, filter_shape=filter_shape,
                            image_shape=image_shape, border_mode='valid')
    
    self.y = T.tanh(conv1_out + self.b.dimshuffle('x', 0, 'x', 'x'))
    
    da_filter_shape = [ filter_shape[1], filter_shape[0], 
                        filter_shape[2], filter_shape[3] ]
    initial_W_prime =  numpy.asarray( numpy.random.uniform( \
              low = -numpy.sqrt(6./(fan_in+fan_out)), \
              high = numpy.sqrt(6./(fan_in+fan_out)), \
              size = da_filter_shape), dtype = theano.config.floatX)
    self.W_prime = theano.shared(value = initial_W_prime, name = "W_prime")

    conv2_out = conv.conv2d(self.y, self.W_prime,
                            filter_shape = da_filter_shape,
                            border_mode='full')

    self.z =  (T.tanh(conv2_out + self.b_prime.dimshuffle('x', 0, 'x', 'x'))+center) / scale

    scaled_x = (self.x + center) / scale

    self.L = - T.sum( scaled_x*T.log(self.z) + (1-scaled_x)*T.log(1-self.z), axis=1 )

    self.cost = T.mean(self.L)

    self.params = [ self.W, self.b, self.b_prime ] 
Example #22
def dropout(rng, x, p=0.5):
    """ Zero-out random values in x with probability p using rng """
    if p > 0. and p < 1.:
        seed = rng.randint(2 ** 30)
        srng = RandomStreams(seed)
        mask = srng.binomial(n=1, p=1.-p, size=x.shape, dtype=theano.config.floatX)
        return x * mask
    return x
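
A minimal usage sketch (the symbolic variable `hidden` is hypothetical): the mask is applied only in the training graph, and the conventional test-time counterpart is to scale the unmasked activations by the keep probability 1 - p.

    rng = np.random.RandomState(1234)
    hidden_train = dropout(rng, hidden, p=0.5)   # training graph: random mask
    hidden_test = (1. - 0.5) * hidden            # test graph: scale by keep probability
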
Example #23
def dropout(random_state, X, keep_prob=0.5):
    if keep_prob > 0. and keep_prob < 1.:
        seed = random_state.randint(2 ** 30)
        srng = RandomStreams(seed)
        mask = srng.binomial(n=1, p=keep_prob, size=X.shape,
                             dtype=theano.config.floatX)
        return X * mask
    return X
Example #24
    def __init__(self, input, rescale, recentre):
        srng = RandomStreams(seed=234)

        self.input = input

        dequantize_input = input + srng.uniform(size=input.shape, low=-0.5/255, high=0.5/255)

        self.output = rescale * (dequantize_input - recentre)
Example #25
    def __init__(self, rng, x, n_in, n_h, p, training, rnn_batch_training=False):
        """ This is to initialise a standard RNN hidden unit

        :param rng: random state, fixed value for random state for reproducible objective results
        :param x: input data to current layer
        :param n_in: dimension of input data
        :param n_h: number of hidden units/blocks
        :param p: the probability of dropout
        :param training: a binary value to indicate training or testing (for dropout training)
        """
        self.input = x

        if p > 0.0:
            if training==1:
                srng = RandomStreams(seed=123456)
                self.input = T.switch(srng.binomial(size=x.shape,p=p), x, 0)
            else:
                self.input =  (1-p) * x #(1-p) *

        self.n_in = int(n_in)
        self.n_h  = int(n_h)

        self.rnn_batch_training = rnn_batch_training

        # random initialisation
        Wx_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_in), size=(n_in, n_h)), dtype=config.floatX)
        Wh_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_h), size=(n_h, n_h)), dtype=config.floatX)

        # Input gate weights
        self.W_xi = theano.shared(value=Wx_value, name='W_xi')
        self.W_hi = theano.shared(value=Wh_value, name='W_hi')

        # bias
        self.b_i = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='b_i')


        # initial value of hidden and cell state
        if self.rnn_batch_training:
            self.h0 = theano.shared(value=np.zeros((1, n_h), dtype = config.floatX), name = 'h0')
            self.c0 = theano.shared(value=np.zeros((1, n_h), dtype = config.floatX), name = 'c0')

            self.h0 = T.repeat(self.h0, x.shape[1], 0)
            self.c0 = T.repeat(self.c0, x.shape[1], 0)
        else:
            self.h0 = theano.shared(value=np.zeros((n_h, ), dtype = config.floatX), name = 'h0')
            self.c0 = theano.shared(value=np.zeros((n_h, ), dtype = config.floatX), name = 'c0')


        self.Wix = T.dot(self.input, self.W_xi)

        [self.h, self.c], _ = theano.scan(self.recurrent_as_activation_function, sequences = [self.Wix],
                                                                      outputs_info = [self.h0, self.c0])

        self.output = self.h

        self.params = [self.W_xi, self.W_hi, self.b_i]

        self.L2_cost = (self.W_xi ** 2).sum() + (self.W_hi ** 2).sum()
Example #26
 def expr(self, model, data, **kwargs):
     """
     Overwrites the Cost.expr so we can inject our theano.Op.  
     """
     space,source = self.get_data_specs(model)
     space.validate(data)
     # really no point in using these random values; could be zeros
     srng = RandomStreams(seed=234)
     return OverwriteOp(self.cost,model)(srng.uniform(low=0.0,high=1000.0,dtype=theano.config.floatX),data)
    def test_symbolic_shape(self):
        random = RandomStreams(utt.fetch_seed())
        shape = tensor.lvector()
        f = function([shape], random.uniform(size=shape, ndim=2))

        assert f([2, 3]).shape == (2, 3)
        assert f([4, 8]).shape == (4, 8)
        self.assertRaises(ValueError, f, [4])
        self.assertRaises(ValueError, f, [4, 3, 4, 5])
def Kmeans(X_train=None, K=300, epsilon_whitening=0.015):

    if X_train is None:
        X_train = T.matrix("X_train")

    ########################
    # Normalize the inputs #
    ########################

    # A constant added to the variance to avoid division by zero
    epsilon_norm = 10

    # We normalize each training sample (each column in X_train): subtract its mean
    # and divide by the (regularized) standard deviation
    X_train = (X_train - T.mean(X_train, axis=0)) / T.sqrt(T.var(X_train, axis=0) + epsilon_norm)

    #####################
    # Whiten the inputs #
    #####################

    sigma = T.dot(X_train, T.transpose(X_train)) / X_train.shape[1]
    U, s, V = linalg.svd(sigma, full_matrices=False)
    tmp = T.dot(U, T.diag(1 / T.sqrt(s + epsilon_whitening)))
    tmp = T.dot(tmp, T.transpose(U))
    X_Whitened = T.dot(tmp, X_train)

    ######################
    # Training the Model #
    ######################

    # Initialization
    dimensions = X_Whitened.shape[0]
    samples = X_Whitened.shape[1]
    srng = RandomStreams(seed=234)

    # We initialize the centroids by sampling them from a normal
    # distribution, and then normalizing them to unit length
    # D \in R^{n \times k}
    D = srng.normal(size=(dimensions, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    iterations = 30

    for i in xrange(iterations):

        # Initialize new point representations
        # for every pass of the algorithm
        S = T.zeros((K, samples))

        tmp = T.dot(D.T, X_Whitened)
        res = T.argmax(tmp, axis=0)
        max_values = tmp[res, T.arange(samples)]
        S = T.set_subtensor(S[res, T.arange(samples)], max_values)

        D = T.dot(X_Whitened, T.transpose(S))
        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    return D
Example #29
    def __init__(self, n_v, n_h, inputs, vbias=None,
                 hbias=None, initial_W=None, v_unit='BIN',
                 unit_type='LOG'):
        '''
        v_unit: str, optional, default: 'BIN'
        This variable controls the output unit of our RBM.
        The possible values are ['BIN', 'LOG', 'GAUSS'].

        unit_type: str, optional, default: 'LOG'
        This variable controls the activation function of the unit:
        'LIN' -> W.h + b
        'LOG' -> sig(W.h + b)
        '''
        self.type = unit_type
        
        if initial_W is None:
            initial_W = np.asarray(np.random.uniform(
                            low=-4*np.sqrt(6. / (n_v + n_h)),
                            high=4*np.sqrt(6. / (n_v + n_h)),
                            size=(n_v, n_h)),
                            dtype=theano.config.floatX)
        if hbias is None:
            hbias = theano.shared(value=np.zeros(n_h,
                                dtype=theano.config.floatX), name='hbias')
        if vbias is None:
            vbias = theano.shared(value=np.zeros(n_v,
                                dtype=theano.config.floatX), name='vbias')

        e1 = np.zeros((n_v, n_h), dtype=theano.config.floatX)
        e2 = np.zeros((n_h, n_v), dtype=theano.config.floatX)

        self.inputs = inputs
        self.shape = (n_v, n_h)

        self.W = theano.shared(value=initial_W, name='W')
        self.eps_up = theano.shared(value=e1, name='eps_u')
        self.eps_down = theano.shared(value=e2, name='eps_d')
        self.vbias = vbias
        self.hbias = hbias

        np_rng = np.random.RandomState()
        theano_rng = RandomStreams(np_rng.randint(2**30))

        self.v_type = v_unit
        if v_unit == 'LOG':
            theano_rng.v_unit = self.log_sample
        elif v_unit == 'GAUSS':
            theano_rng.v_unit = self.gauss_sample
        else:
            theano_rng.v_unit = theano_rng.binomial
        self.theano_rng = theano_rng

        self.params = [self.W, self.vbias, self.hbias]
        self.params_ft = [self.eps_up, self.eps_down]

        self.hid = theano.function([self.inputs], self.up(self.inputs))
def maxout(Z, stop_dropout, archi, dropout_rate, seed=5432):
    th.config.floatX = 'float32'
    Z_out = T.maximum(Z[:, :int(archi / 2)], Z[:, int(archi / 2):])
    prob = (1 - dropout_rate)
    srng = RandomStreams(seed=seed)

    return ifelse(T.lt(stop_dropout, 1.05),
                  Z_out * srng.binomial(size=T.shape(Z_out),
                                        p=prob).astype('float32'),
                  Z_out)
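
A minimal usage sketch for the unit above (the symbolic variables and sizes are assumptions):

    Z = T.matrix('Z')                        # pre-activations, width = archi
    stop_dropout = T.scalar('stop_dropout')  # < 1.05 selects the dropout branch
    out = maxout(Z, stop_dropout, archi=512, dropout_rate=0.2)
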
        import download_datasets.mnist
        filename = r'./data/mnist.pkl.gz'

    train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)

    if args.reinit:
        init_batch_size = min(64, size)
        init_batch = train_x[:size][-init_batch_size:].reshape(
            init_batch_size, 784)
    else:
        init_batch = None

    if args.model == 'BHN_MLPWN':
        model = MLPWeightNorm_BHN(lbda=lbda,
                                  perdatapoint=perdatapoint,
                                  srng=RandomStreams(seed=args.seed + 2000),
                                  prior=prior,
                                  coupling=coupling,
                                  n_hiddens=n_hiddens,
                                  n_units=n_units,
                                  flow=args.flow,
                                  noise_distribution=args.noise_distribution,
                                  init_batch=init_batch)
    elif args.model == 'BHN_MLPCD':
        model = MLPConcreteDropout_BHN(
            lbda=lbda,
            alpha=args.alpha,
            beta=args.beta,
            perdatapoint=perdatapoint,
            srng=RandomStreams(seed=args.seed + 2000),
            prior=prior,
Example #32
    def __init__(
        self,
        numpy_rng,
        n_visible,
        n_hidden,
        theano_rng=None,
        input=None,
        theta=None,
        bvis=None
    ):
        """
        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random generator used to generate weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                     generated based on a seed drawn from `rng`
        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.input = input

        # create a Theano random generator that gives symbolic random values
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # initialize theta = (W,b); W gets the shape (n_visible, n_hidden),
        # while b is a vector of n_hidden elements, making theta a vector of
        # n_visible*n_hidden + n_hidden elements
        if not theta:
            theta_values = numpy.asarray(
                numpy_rng.uniform(
                    low=-4 * numpy.sqrt(6. / (n_hidden + n_visible + 1)),
                    high=4 * numpy.sqrt(6. / (n_hidden + n_visible + 1)),
                    size=(n_visible * n_hidden + n_hidden)
                ),
                dtype=theano.config.floatX
            )
            theta = theano.shared(
                value=theta_values,
                name='theta',
                borrow=True
            )
        self.theta = theta
        
        # W is represented by the first n_visible*n_hidden elements of theta
        W = self.theta[0:n_visible * n_hidden].reshape((n_visible, n_hidden))
        # b is the rest (last n_hidden elements)
        bhid = self.theta[n_visible * n_hidden:n_visible * n_hidden + n_hidden]

        if not bvis:
            bvis_values = numpy.asarray(
                numpy_rng.uniform(size=(self.n_visible,)),
                dtype=theano.config.floatX
            )
            bvis = theano.shared(
                value=bvis_values,
                borrow=True
            )

        self.W = W
        # b corresponds to the bias of the hidden
        self.b = bhid
        # b_prime corresponds to the bias of the visible
        self.b_prime = bvis
        # tied weights, therefore W_prime is W transpose
        self.W_prime = self.W.T
        self.theano_rng = theano_rng
        # if no input is given, generate a variable representing the input
        if input is None:
            self.x = T.matrix(name='input')
        else:
            self.x = input

        self.params = [self.theta, self.b_prime]
        
        self.train_cost_array=[]
        self.valid_error_array = []
        self.epoch=0
Example #33
class SS_ReconsSRBM:
    def reset_rng(self):

        self.rng = N.random.RandomState([12, 9, 2])
        self.theano_rng = RandomStreams(self.rng.randint(2**30))
        if self.initialized:
            self.redo_theano()
    #

    def __getstate__(self):
        d = copy.copy(self.__dict__)

        #remove everything set up by redo_theano

        for name in self.names_to_del:
            if name in d:
                del d[name]

        print "WARNING: not pickling random number generator!!!!"
        del d['theano_rng']

        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        #self.redo_theano()      # todo: make some way of not running this, so it's possible to just open something up and look at its weights fast without recompiling it

    def weights_format(self):
        return ['v','h']

    def get_dimensionality(self):
        return 0

    def important_error(self):
        return 2

    def __init__(self, nvis, nhid,
                learning_rate,
                irange,
                init_c, mean_field_iters,
                q_damping_factor,
                s_default_damping_factor,
                tau,
                fancy_damp,
                no_damp_iters,
                persistent_chains, init_a, init_alpha, init_beta,  gibbs_iters,
                enc_weight_decay,
                use_cd, instrumented = False):
        self.initialized = False
        self.fancy_damp = fancy_damp
        if fancy_damp:
            assert q_damping_factor == s_default_damping_factor
        self.s_default_damping_factor = s_default_damping_factor
        self.tau = tau
        assert type(tau) == type(1.)
        self.reset_rng()
        self.nhid = nhid
        self.nvis = nvis
        self.learning_rate = learning_rate
        self.ERROR_RECORD_MODE_MONITORING = 0
        self.error_record_mode = self.ERROR_RECORD_MODE_MONITORING
        self.init_weight_mag = irange
        self.force_batch_size = 0
        self.init_c = init_c
        self.init_a = init_a
        self.init_alpha = init_alpha
        self.persistent_chains = persistent_chains
        self.mean_field_iters = mean_field_iters
        self.no_damp_iters = no_damp_iters
        self.gibbs_iters = gibbs_iters
        self.q_damping_factor = q_damping_factor
        self.enc_weight_decay = N.cast[floatX](enc_weight_decay)
        self.names_to_del = []
        self.use_cd = use_cd
        self.init_beta = init_beta
        self.instrumented = instrumented
        self.redo_everything()

    def set_error_record_mode(self, mode):
        self.error_record_mode = mode

    def set_size_from_dataset(self, dataset):
        self.nvis = dataset.get_output_dim()
        self.redo_everything()
        self.b.set_value( dataset.get_marginals(), borrow=False)
    #

    def get_input_dim(self):
        return self.nvis

    def get_output_dim(self):
        return self.nhid

    def redo_everything(self):
        self.initialized = True

        self.error_record = []
        if self.instrumented:
            self.instrument_record = InstrumentRecord()
        #
        self.examples_seen = 0
        self.batches_seen = 0

        self.W = shared( N.cast[floatX](self.rng.uniform(-self.init_weight_mag, self.init_weight_mag, (self.nvis, self.nhid ) ) ))
        self.W.name = 'W'

        self.c = shared( N.cast[floatX](N.zeros(self.nhid) + self.init_c) )
        self.c.name = 'c'

        self.b = shared( N.cast[floatX](N.zeros(self.nvis)))
        self.b.name = 'b'

        self.chains = shared ( N.cast[floatX]( N.zeros((self.persistent_chains,self.nvis))) )
        self.chains.name = 'chains'

        self.a = shared(N.cast[floatX](N.zeros(self.nhid)+self.init_a))
        self.a.name = 'a'
        self.alpha = shared(N.cast[floatX] (N.zeros(self.nhid)+self.init_alpha))
        self.alpha.name = 'alpha'
        self.beta = shared(N.cast[floatX] (N.zeros(self.nvis)+self.init_beta))
        self.beta.name = 'beta'

        self.params = [ self.W, self.a, self.b, self.c, self.alpha, self.beta ]
        self.clip = [ 0, 0, 0, 0, 1, 1 ]

        self.redo_theano()
    #


    def expected_energy(self, V, Q, Mu1):

        name = V.name
        #V = Print('V.'+V.name,attrs=['min','mean','max'])(V); V.name = name
        #Q = #Print('Q.'+V.name,attrs=['min','mean','max'])(Q)
        #Mu1 = #Print('Mu1.'+V.name,attrs=['min','mean','max'])(Mu1)

        ugly = Q*(1/self.gamma+T.sqr(Mu1)) - T.sqr(Q)*T.sqr(Mu1)
        #ugly = #Print('ugly',attrs=['shape'])(ugly)
        ugly.name = 'ugly'
        term_1 = 0.5 * T.dot(self.w, T.mean(ugly,axis=0))
        term_1.name = 'term_1'
        #term_1 = #Print('term_1')(term_1)

        recons = T.dot(Q*Mu1,self.W.T)
        #recons = #Print('recons',attrs=['shape'])(recons)
        recons.name = 'recons'
        iterm = 0.5*self.nvis*T.mean(T.sqr(recons)*self.beta)
        #iterm = #Print('iterm',attrs=['shape'])(iterm)
        #iterm = #Print('iterm')(iterm)
        iterm.name = 'iterm'

        normalized_vis = self.beta * (V-self.b)
        main_term = - self.nvis * T.mean(normalized_vis*recons)
        #main_term = #Print('main_term',attrs=['shape'])(main_term)
        #main_term = #Print('main_term')(main_term)
        normalized_vis.name = 'normalized_vis'
        #normalized_vis = #Print('normalized_vis',attrs=['shape'])(normalized_vis)
        main_term.name = 'main_term'

        S = (1-Q)*(T.sqr(self.a)/T.sqr(self.alpha)+1./self.alpha) + Q*(T.sqr(Mu1)+1./self.gamma)
        #S = #Print('S',attrs=['shape'])(S)
        #S = #Print('S.'+V.name)(S)
        S.name = 'S'

        contain_s = 0.5 * T.mean(T.dot(S,self.alpha))
        #contain_s = #Print('contain_s',attrs=['shape'])(contain_s)
        #contain_s = #Print('contain_s')(contain_s)
        contain_s.name = 'contain_s'

        vis_bias = - self.nvis * T.mean(normalized_vis)
        #vis_bias = #Print('vis_bias',attrs=['shape'])(vis_bias)
        #vis_bias = #Print('vis_bias')(vis_bias)
        vis_bias.name = 'vis_bias'

        contain_v = 0.5 * T.mean(T.dot(T.sqr(V),self.beta))
        #contain_v = #Print('contain_v',attrs=['shape'])(contain_v)
        #contain_v = #Print('contain_v')(contain_v)
        contain_v.name = 'contain_v'

        hid_bias = -T.mean(T.dot(Q,self.c))
        #hid_bias = #Print('hid_bias',attrs=['shape'])(hid_bias)
        #hid_bias = #Print('his_bias')(hid_bias)
        hid_bias.name = 'hid_bias'

        s_bias = -T.mean(T.dot(Q*Mu1+(1.-Q)*(self.a/self.alpha),self.a))
        #s_bias = #Print('s_bias',attrs=['s_bias'])(s_bias)
        #s_bias = #Print('s_bias')(s_bias)
        s_bias.name = 's_bias'

        rval =   term_1 + iterm + main_term + contain_s + vis_bias \
                + contain_v + hid_bias + s_bias
        rval.name = 'rval'

        assert len(rval.type().broadcastable) == 0

        return rval

    def redo_theano(self):

        init_names = dir(self)

        if 'theano_rng' not in dir(self):
            assert self.initialized
            print "WARNING: pickle did not contain theano_rng, starting from default one"
            self.reset_rng()
            return


        self.W_T = self.W.T

        self.w = T.sum(self.beta * T.sqr(self.W).T,axis=1)
        self.w.name = 'w'

        #self.alpha = #Print('alpha',attrs=['min','mean','max'])(self.alpha)
        #self.w = #Print('w',attrs=['min','mean','max'])(self.w)

        self.gamma = self.alpha + self.w
        #self.gamma = #Print('gamma',attrs=['min','mean','max'])(self.gamma)

        lr = T.scalar()

        X = T.matrix()
        X.name = 'X'

        pos_Q, pos_Mu1 = self.infer_Q_Mu1(X)
        pos_Q.name = 'pos_Q'
        pos_Mu1.name = 'pos_Mu1'

        self.H_exp_func = function([X],pos_Q)
        self.Mu1_func = function([X],pos_Mu1)
        self.hid_exp_func = function([X],pos_Q*pos_Mu1)

        if self.use_cd:
            samples = [ X ]
        else:
            samples = [ self.chains ]

        outside_pos_Q = shared(N.cast[floatX](N.zeros((1,1))))
        outside_neg_Q = shared(N.cast[floatX](N.zeros((1,1))))
        outside_pos_Mu1 = shared(N.cast[floatX](N.zeros((1,1))))
        outside_neg_Mu1 = shared(N.cast[floatX](N.zeros((1,1))))

        for i in xrange(self.gibbs_iters):
            if i == 0 and not self.use_cd:
                #if using SML, the first Q of gibbs sampling was already computed during the
                #previous call to learn_mini_batch
                samples.append(self.gibbs_step( Q = outside_neg_Q, Mu1 = outside_neg_Mu1) )
            else:
                samples.append(self.gibbs_step( V = samples[-1]))
            #
        #

        #if using SML, this needs to be called on the first mini batch to make sure outside_neg_Q is initialized
        first_Q, first_Mu1 = self.infer_Q_Mu1(self.chains)
        self.set_up_sampler = function([],updates=[
            (outside_neg_Q, first_Q),
            (outside_neg_Mu1, first_Mu1)])
        self.first_mini_batch = True

        final_sample = samples[-1]
        final_sample.name = 'final_sample'

        neg_Q, neg_Mu1 = self.infer_Q_Mu1(final_sample)
        neg_Q.name = 'neg_Q'
        neg_Mu1.name = 'neg_Mu1'

        sampling_updates = [ (outside_pos_Q, pos_Q), (outside_neg_Q, neg_Q),
                             (outside_pos_Mu1, pos_Mu1), (outside_neg_Mu1, neg_Mu1) ]

        if not self.use_cd:
            sampling_updates.append((self.chains,final_sample))

        self.run_sampling = function([X], updates = sampling_updates, name = 'run_sampling')

        obj = self.expected_energy(X,outside_pos_Q, outside_pos_Mu1) \
            - self.expected_energy(self.chains,outside_neg_Q, outside_neg_Mu1) \
            + self.enc_weight_decay * T.mean(T.sqr(self.W))


        grads = [ T.grad(obj,param) for param in self.params ]

        learning_updates = []

        for i in xrange(len(self.params)):
            update = self.params[i] - lr * grads[i]
            #update = #Print(self.params[i].name+' preclip',attrs=['min','mean','max'])(update)
            if self.clip[i]:
                update = T.clip(update,.1,1000)
            #
            learning_updates.append((self.params[i],update))
        #

        self.learn_from_samples = function([X, lr], updates =
        learning_updates , name='learn_from_samples')

        self.recons_func = function([X], self.gibbs_step_exp(X) , name = 'recons_func')

        self.sample = function([X], self.gibbs_step(X), name = 'sample_func')

        if self.instrumented:
            self.make_instruments()
        #

        final_names = dir(self)

        self.names_to_del = [ name for name in final_names if name not in init_names ]

    def learn(self, dataset, batch_size):
        self.learn_mini_batch(dataset.get_batch_design(batch_size))


    def error_func(self, x):
        return N.square( x - self.recons_func(x)).mean()

    def record_monitoring_error(self, dataset, batch_size, batches):
        print 'running on monitoring set'
        assert self.error_record_mode == self.ERROR_RECORD_MODE_MONITORING

        w = self.W.get_value(borrow=True)

        #alpha = self.alpha.get_value(borrow=True)
        beta = self.beta.get_value(borrow=True)
        #print "alpha summary: "+str( (alpha.min(),alpha.mean(),alpha.max()))
        print "beta summary: "+str( (beta.min(), beta.mean(), beta.max()))

        if N.any(N.isnan(w)):
            raise Exception("Nan")
        print 'weights summary: '+str( (w.min(),w.mean(),w.max()))

        errors = []

        if self.instrumented:
            self.clear_instruments()

        for i in xrange(batches):
            x = dataset.get_batch_design(batch_size)
            error = self.error_func(x)
            errors.append( error )
            if self.instrumented:
                self.update_instruments(x)
            #
        #


        self.error_record.append( (self.examples_seen, self.batches_seen, N.asarray(errors).mean() ) )


        if self.instrumented:
            self.instrument_record.begin_report(examples_seen = self.examples_seen, batches_seen = self.batches_seen)
            self.make_instrument_report()
            self.instrument_record.end_report()
            self.clear_instruments()
        #
        print 'monitoring set done'
    #


    def recons_from_Q_Mu1(self,Q,Mu1):
        return self.b + T.dot(Q*Mu1, self.W.T)
    #

    def recons_err_from_Q_Mu1(self,Q,Mu1,V):
        return T.mean(T.sqr(V-self.recons_from_Q_Mu1(Q,Mu1)))

    def binary_entropy(self,Q):
        mod_Q = 1e-6 + (1.-2e-6)*Q

        return -(mod_Q * T.log(mod_Q) + (1.-mod_Q)*T.log(1.-mod_Q))

    def make_instruments(self):
        assert not self.use_cd #currently just supports PCD

        recons_outputs = []
        ave_act_outputs = []
        cond_ent_outputs = []
        neg_chains_recons_outputs = []
        neg_chains_ave_act_outputs = []
        neg_chains_cond_ent_outputs = []

        self.instrument_X = T.matrix()

        for max_iters in xrange(1,self.mean_field_iters+1):
            pos_Q, pos_Mu1 = self.infer_Q_Mu1(self.instrument_X, max_iters = max_iters)
            neg_Q, neg_Mu1 = self.infer_Q_Mu1(self.chains, max_iters = max_iters)

            recons_outputs.append(self.recons_err_from_Q_Mu1(pos_Q,pos_Mu1,self.instrument_X))
            neg_chains_recons_outputs.append(self.recons_err_from_Q_Mu1(neg_Q,neg_Mu1,self.chains))

            ave_act_outputs.append(T.mean(pos_Q, axis=0))
            neg_chains_ave_act_outputs.append(T.mean(neg_Q, axis=0))

            cond_ent_outputs.append(T.mean(self.binary_entropy(pos_Q),axis=0))
            neg_chains_cond_ent_outputs.append(T.mean(self.binary_entropy(neg_Q),axis=0))
        #

        self.neg_chains_recons_after_mean_field   = function([],neg_chains_recons_outputs)
        self.neg_chains_ave_act_after_mean_field  = function([],neg_chains_ave_act_outputs)
        self.neg_chains_cond_ent_after_mean_field = function([],neg_chains_cond_ent_outputs)

        self.recons_after_mean_field_func = function([self.instrument_X],recons_outputs)
        self.ave_act_after_mean_field_func = function([self.instrument_X],ave_act_outputs)
        self.cond_ent_after_mean_field_func = function([self.instrument_X],cond_ent_outputs)

        neg_chain_norms = T.sqrt(T.sum(T.sqr(self.chains),axis=1))
        self.neg_chain_norms_summary = function([], [neg_chain_norms.min(),neg_chain_norms.mean(),neg_chain_norms.max()])

        weight_norms = T.sqrt(T.sum(T.sqr(self.W),axis=0))
        self.weight_norms_summary = function([], [weight_norms.min(),weight_norms.mean(),weight_norms.max()])

        self.hid_bias_summary = function([],[self.c.min(),self.c.mean(),self.c.max()])
        self.vis_bias_summary = function([],[self.b.min(),self.b.mean(),self.b.max()])

        self.beta_func = function([],self.beta)
    #

    def clear_instruments(self):

        self.cond_ent_after_mean_field = [[] for i in xrange(self.mean_field_iters)]
        self.recons_after_mean_field = [[] for i in xrange(self.mean_field_iters)]
        self.ave_act_after_mean_field = [[] for i in xrange(self.mean_field_iters)]
    #

    def update_instruments(self, X):
        ce = self.cond_ent_after_mean_field_func(X)
        re = self.recons_after_mean_field_func(X)

        aa = self.ave_act_after_mean_field_func(X)

        for fr, to in [ (ce,self.cond_ent_after_mean_field),
                        (re, self.recons_after_mean_field),
                        (aa, self.ave_act_after_mean_field) ]:
            assert len(to) == self.mean_field_iters
            assert len(fr) == self.mean_field_iters

            for fr_elem, to_elem in zip(fr,to):
                to_elem.append(fr_elem)
            #
        #
    #


    def make_instrument_report(self):
        r = self.instrument_record

        neg_chains_recons = self.neg_chains_recons_after_mean_field()
        neg_chains_ave_act = self.neg_chains_ave_act_after_mean_field()
        neg_chains_cond_ent = self.neg_chains_cond_ent_after_mean_field()

        for i in xrange(1,self.mean_field_iters+1):
            re = N.asarray(self.recons_after_mean_field[i-1]).mean()
            r.report(('recons_err_after_mean_field',i),re)
            r.report(('neg_recons_err_after_mean_field',i),neg_chains_recons[i-1])

            aa_mat = N.asarray(self.ave_act_after_mean_field[i-1])
            assert len(aa_mat.shape) == 2
            assert aa_mat.shape[1] == self.nhid

            aa_vec = aa_mat.mean(axis=0)
            aa_min = aa_vec.min()
            aa_mean = aa_vec.mean()
            aa_max = aa_vec.max()
            naa_vec = neg_chains_ave_act[i-1]
            naa_min = naa_vec.min()
            naa_mean = naa_vec.mean()
            naa_max = naa_vec.max()
            r.report(('ave_act_after_mean_field_min',i),aa_min)
            r.report(('ave_act_after_mean_field_mean',i),aa_mean)
            r.report(('ave_act_after_mean_field_max',i),aa_max)
            r.report(('neg_ave_act_after_mean_field_min',i),naa_min)
            r.report(('neg_ave_act_after_mean_field_mean',i),naa_mean)
            r.report(('neg_ave_act_after_mean_field_max',i),naa_max)

            ce_mat = N.asarray(self.cond_ent_after_mean_field[i-1])
            assert len(ce_mat.shape) == 2
            assert ce_mat.shape[1] == self.nhid
            ce_vec = ce_mat.mean(axis=0)
            ce_min, ce_mean, ce_max = ce_vec.min(), ce_vec.mean(), ce_vec.max()
            nce_vec = neg_chains_cond_ent[i-1]
            nce_min, nce_mean, nce_max = nce_vec.min(), nce_vec.mean(), nce_vec.max()
            r.report(('cond_ent_after_mean_field_min',i),ce_min)
            r.report(('cond_ent_after_mean_field_mean',i),ce_mean)
            r.report(('cond_ent_after_mean_field_max',i),ce_max)
            r.report(('neg_cond_ent_after_mean_field_min',i),nce_min)
            r.report(('neg_cond_ent_after_mean_field_mean',i),nce_mean)
            r.report(('neg_cond_ent_after_mean_field_max',i),nce_max)
        #


        neg_chain_norms_min, neg_chain_norms_mean, neg_chain_norms_max  = self.neg_chain_norms_summary()
        r.report('neg_chain_norms_min', neg_chain_norms_min)
        r.report('neg_chain_norms_mean', neg_chain_norms_mean)
        r.report('neg_chain_norms_max', neg_chain_norms_max)

        weight_norms_min, weight_norms_mean, weight_norms_max = self.weight_norms_summary()
        r.report('weight_norms_min', weight_norms_min)
        r.report('weight_norms_mean', weight_norms_mean)
        r.report('weight_norms_max', weight_norms_max)


        hid_bias_min, hid_bias_mean, hid_bias_max = self.hid_bias_summary()
        r.report('hid_bias_min', hid_bias_min)
        r.report('hid_bias_mean', hid_bias_mean)
        r.report('hid_bias_max', hid_bias_max)

        vis_bias_min, vis_bias_mean, vis_bias_max = self.vis_bias_summary()
        r.report('vis_bias_min', vis_bias_min)
        r.report('vis_bias_mean', vis_bias_mean)
        r.report('vis_bias_max', vis_bias_max)


        r.report('beta',self.beta_func())

    #


    def reconstruct(self, x, use_noise):
        assert x.shape[0] == 1

        print 'x summary: '+str((x.min(),x.mean(),x.max()))

        #this method is mostly a hack to make the formatting work the same as the denoising autoencoder
        self.truth_shared = shared(x.copy())

        if use_noise:
            self.vis_shared = shared(x.copy() + 0.15 *  N.cast[floatX](self.rng.randn(*x.shape)))
        else:
            self.vis_shared = shared(x.copy())

        self.reconstruction = self.recons_func(self.vis_shared.get_value())

        print 'recons summary: '+str((self.reconstruction.min(),self.reconstruction.mean(),self.reconstruction.max()))


    def gibbs_step_exp(self, V = None, Q = None, Mu1 = None):
        if V is not None:
            assert Q is None
            assert Mu1 is None

            base_name = V.name

            if base_name is None:
                base_name = 'anon'

            Q, Mu1 = self.infer_Q_Mu1(V)
        else:
            assert Q is not None
            assert Mu1 is not None

            Q_name = Q.name

            if Q_name is None:
                Q_name = 'anon'

            base_name = 'from_Q_'+Q_name
        #


        H, S = self.sample_hid(Q, Mu1)

        H.name =  base_name + '->hid_sample'


        sample =  self.b + T.dot(H*S,self.W_T)

        sample.name = base_name + '->sample_expectation'

        return sample


    def gibbs_step(self, V = None, Q = None, Mu1 = None):

        if V is not None:
            assert Q is None

            base_name = V.name

            if base_name is None:
                base_name = 'anon'
            #

        else:
            assert Q is not None
            Q_name = Q.name

            if Q_name is None:
                Q_name = 'anon'
            #

            base_name = 'from_Q_'+Q_name

        #

        m = self.gibbs_step_exp(V, Q, Mu1)

        assert m.dtype == floatX
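        # beta plays the role of the visible-unit precision here, so the conditional std is 1/sqrt(beta)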
        std = T.sqrt(1./self.beta)
        #std = #Print('vis_std',attrs=['min','mean','max'])(std)
        sample = self.theano_rng.normal(size = m.shape, avg = m,
                                    std = std, dtype = m.dtype)

        sample.name = base_name + '->sample'

        return sample

    def sample_hid(self, Q, Mu1):
        H =  self.theano_rng.binomial(size = Q.shape, n = 1, p = Q,
                                dtype = Q.dtype)
        std = T.sqrt(1./self.gamma)
        #std = #Print('hid_std',attrs=['min','mean','max'])(std)
        S = self.theano_rng.normal(size = Mu1.shape, avg = Mu1, std = std, dtype = Mu1.dtype)

        return H, S

    def infer_Q_Mu1(self, V, max_iters = 0):

        if max_iters > 0:
            iters = min(max_iters, self.mean_field_iters)
        else:
            iters = self.mean_field_iters
        #

        base_name = V.name

        if base_name is None:
            base_name = 'anon'

        first_Q, first_Mu1 = self.init_mean_field_step(V)
        Q =  [ first_Q ]
        Mu1 = [ first_Mu1 ]

        no_damp = 0

        for i in xrange(iters - 1):
            damp = i + 1 < self.mean_field_iters - self.no_damp_iters
            no_damp += (damp == False)
            new_Q, new_Mu1 = self.damped_mean_field_step(V,Q[-1],Mu1[-1],damp)
            Q.append ( new_Q )
            Mu1.append( new_Mu1)
        #

        if max_iters == 0:
            assert no_damp == self.no_damp_iters
        else:
            assert self.no_damp_iters is not None
            assert self.mean_field_iters is not None
            assert max_iters is not None
            assert no_damp == max(0, self.no_damp_iters - (self.mean_field_iters - max_iters))
        #

        for i in xrange(len(Q)):
            Q[i].name = base_name + '->Q ('+str(i)+')'

        assert len(Q[-1].type().broadcastable) == 2
        assert len(Mu1[-1].type().broadcastable) == 2

        return Q[-1], Mu1[-1]

    def Q_from_A(self, A):
        assert len(A.type().broadcastable) == 2
        return T.nnet.sigmoid(0.5*(T.sqr(A)/self.gamma-T.sqr(self.a)/self.alpha)+self.c-0.5*T.log(self.gamma/self.alpha))


    def mean_field_step(self, V, P, Mu):

        assert len(V.type().broadcastable) == 2
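        # One mean field update (intent read off the code, not stated by the author):
        # main_term is the bottom-up drive from the precision-weighted visibles, iterm subtracts
        # the contribution of the current hidden configuration fed back through W, and
        # self.w * P * Mu appears to restore each unit's own (diagonal) contribution.
        # Mu1 = A / gamma is the slab mean and Q the spike probability (names assumed from the
        # ssRBM-style parameterization used throughout this class).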

        iterm = T.dot(T.dot(P*Mu,self.W.T*self.beta),self.W)

        normalized_V = self.beta * (V-self.b)
        main_term = T.dot(normalized_V, self.W)

        A = self.w * P*Mu - iterm + main_term + self.a
        Mu1 = A / self.gamma
        Q = self.Q_from_A( A)

        assert len(Q.type().broadcastable) == 2

        return Q, Mu1
    #


    def mean_field_fancy_step(self, V, P, Mu):

        iterm = T.dot(T.dot(P*Mu,self.W.T*self.beta),self.W)

        normalized_V = self.beta * (V-self.b)
        main_term = T.dot(normalized_V, self.W)


        iA = self.w * P*Mu - iterm

        full_A = iA + main_term+self.a
        Mu1 = full_A / self.gamma

        Q = self.Q_from_A( full_A)

        iMu = iA / self.gamma

        #if this is negative, we are amplifying, so we use the default damping
        #if this is positive, we are flipping, so we use max(0, lambda(tau))
        discriminant = T.sgn(Mu-iMu) * Mu/(1e-10+abs(Mu-iMu))

        Lambda = self.tau * discriminant -  T.sgn(Mu-iMu) * iMu/(1e-10+abs(Mu-iMu))

        mask = discriminant <= 0

        fancy_damp = mask*self.s_default_damping_factor + (1.-mask)*T.maximum(0.,Lambda)


        return Q, Mu1, fancy_damp


    def init_mean_field_step(self, V, damp = True):
        #return self.damped_mean_field_step(V, T.nnet.sigmoid(self.c-0.5*T.log(self.gamma/self.alpha)), self.a/self.alpha, damp)
        return self.damped_mean_field_step(V, T.zeros_like(T.dot(V,self.W)), T.zeros_like(T.dot(V,self.W)), damp)

    def damped_mean_field_step(self, V, P, Mu, damp):

        if self.fancy_damp:
            Q, Mu1, fancy_damp = self.mean_field_fancy_step(V,P,Mu)
        else:
            Q, Mu1 = self.mean_field_step(V,P,Mu)
        #

        if damp:
            r_Q =  self.q_damping_factor * P + (1.0 - self.q_damping_factor) * Q
            if self.fancy_damp:
                r_Mu = fancy_damp * Mu + (1.0-fancy_damp) * Mu1
            else:
                r_Mu = self.s_default_damping_factor * Mu + (1.0-self.s_default_damping_factor) * Mu1
            #
        else:
            r_Q = Q
            r_Mu = Mu1
        #

        assert len(r_Q.type().broadcastable) == 2

        return r_Q, r_Mu
    #

    def debug_dump(self, x):

        print "making debug dump"

        print 'x: '+str((x.min(),x.mean(),x.max()))
        W = self.W.get_value()
        print 'W: '+str((W.min(),W.mean(),W.max()))
        w = function([],self.w)()
        print 'w: '+str((w.min(),w.mean(),w.max()))
        alpha = self.alpha.get_value()
        print 'alpha: '+str((alpha.min(),alpha.mean(),alpha.max()))
        beta = self.beta.get_value()
        print 'beta: '+str((beta.min(),beta.mean(),beta.max()))


        prior_Q = function([],T.nnet.sigmoid(self.c-0.5*T.log(self.gamma/self.alpha)))()
        print 'prior_Q: '+str((prior_Q.min(),prior_Q.mean(),prior_Q.max()))

        prior_Mu = function([],self.a/self.alpha)()
        print 'prior_Mu: '+str((prior_Mu.min(),prior_Mu.mean(),prior_Mu.max()))


        var = T.matrix()
        var.name = 'debug_x'
        for i in xrange(1,self.mean_field_iters+1):
            outputs = self.infer_Q_Mu1(var,max_iters=i)
            Q, Mu = function([var],outputs)(x)
            print 'after '+str(i)+' mean field steps:'
            print '\tQ: '+str((Q.min(),Q.mean(),Q.max()))
            print '\tMu: '+str((Mu.min(),Mu.mean(),Mu.max()))
        #

        assert False


    def learn_mini_batch(self, x):

        #t1 = time.time()

        if self.first_mini_batch:
            self.first_mini_batch = False
            if not self.use_cd:
                self.set_up_sampler()
            #
        #

        #Mu1 = self.Mu1_func(x)
        #if Mu1.max() > 500.:
        #    self.debug_dump(x)


        #print '\nrun_sampling\n'

        self.run_sampling(x)

        #print '\nlearn_from_samples\n'

        self.learn_from_samples(x,self.learning_rate)

        #pos_Q, neg_Q = self.run_sampling(x)
        #self.learn_from_samples(x, pos_Q, neg_Q, self.learning_rate)

        #t2 = time.time()

        #print 'batch took '+str(t2-t1)+' sec'

        self.examples_seen += x.shape[0]
        self.batches_seen += 1
Exemple #34
0
    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[1000, 1000, 1000]):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: sizes of the intermediate layers; must
                               contain at least one value
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        self.layer_sizes = hidden_layers_sizes

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.x_mask = T.matrix('x_mask')    # For partial information.

        # end-snippet-1
        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)


            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)
            self.params.extend(rbm_layer.params)

        # And build the upside-down network.  This shares parameters with the
        # forward network, except that the weights are transposed.

        # The "isolated" layers let you run only the upside-down part of the
        # network, for generation.  The non-isolated layers are connected to
        # the forward, compressing part of the network, and are used for
        # training.
        reverse_input = self.sigmoid_layers[-1].output
        self.isolated_reverse_input = theano.shared(
            numpy.zeros([10, hidden_layers_sizes[-1]]))
        isolated_input = self.isolated_reverse_input
        self.reverse_layers = [None] * self.n_layers
        self.isolated_reverse = [None] * self.n_layers
        for i in reversed(xrange(self.n_layers)):    
            if i == 0:
                out_size = n_ins
            else:
                out_size = hidden_layers_sizes[i-1]
            reverse_sigmoid = HiddenLayer(rng=numpy_rng,
                input=reverse_input,
                n_in=hidden_layers_sizes[i],
                n_out=out_size,
                W=self.sigmoid_layers[i].W.T,
                b=self.rbm_layers[i].vbias,
                activation=T.nnet.sigmoid
            )
            isolated_sigmoid = HiddenLayer(rng=numpy_rng,
                input=isolated_input,
                n_in=hidden_layers_sizes[i],
                n_out=out_size,
                W=self.sigmoid_layers[i].W.T,
                b=self.rbm_layers[i].vbias,
                activation=T.nnet.sigmoid
            )
            
            reverse_input = reverse_sigmoid.output
            isolated_input = isolated_sigmoid.output
            self.reverse_layers[i] = reverse_sigmoid
            self.isolated_reverse[i] = isolated_sigmoid


        # The fine-tune cost is the reconstruction error of the entire net.
        self.finetune_cost = ((self.x - self.reverse_layers[0].output)**2).sum()

        # The cost for training the generative net - in this case, self.x is
        # completely disconnected, and we feed a pattern into the reverse net.
        self.generative_cost = ((self.x - self.isolated_reverse[0].output)**2).sum()

        # The l1 cost is for generating constrained samples of the input.  (Aka
        # harmonizing a melody.)  Given a melody in self.x and a mask
        # self.x_mask of which parts of self.x actually matter, it computes the
        # error between the generated sample and the melody.
        self.l1_cost = (((self.x - self.isolated_reverse[0].output) * self.x_mask)**2).sum()
Exemple #35
0
def test_rbm(learning_rate=0.1,
             training_epochs=15,
             dataset='../../data/mnist.pkl.gz',
             batch_size=20,
             n_chains=20,
             n_samples=10,
             output_folder='rbm_plots',
             n_hidden=500):
    """
    Demonstrate how to train an RBM and afterwards sample from it using Theano.

    This is demonstrated on MNIST.

    :param learning_rate: learning rate used for training the RBM

    :param training_epochs: number of epochs used for training

    :param dataset: path to the pickled dataset

    :param batch_size: size of a batch used to train the RBM

    :param n_chains: number of parallel Gibbs chains to be used for sampling

    :param n_samples: number of samples to plot for each chain

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    # initialize storage for the persistent chain (state = hidden
    # layer of chain)
    persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden),
                                                 dtype=theano.config.floatX),
                                     borrow=True)

    # construct the RBM class
    rbm = RBM(input=x,
              n_visible=28 * 28,
              n_hidden=n_hidden,
              numpy_rng=rng,
              theano_rng=theano_rng)

    # get the cost and the gradient corresponding to one step of CD-15
    #cost, updates = rbm.get_cost_updates(lr=learning_rate,
    #                                     persistent=persistent_chain, k=1)
    cost, updates = rbm.get_cost_updates(lr=learning_rate,
                                         persistent=None,
                                         k=1)

    #################################
    #     Training the RBM          #
    #################################
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    # it is ok for a theano function to have no output
    # the purpose of train_rbm is solely to update the RBM parameters
    train_rbm = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]},
        name='train_rbm')

    plotting_time = 0.
    start_time = time.clock()

    # go through training epochs
    for epoch in xrange(training_epochs):

        # go through the training set
        mean_cost = []
        for batch_index in xrange(n_train_batches):
            mean_cost += [train_rbm(batch_index)]

        print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost)

        # Plot filters after each training epoch
        plotting_start = time.clock()
        # Construct image from the weight matrix
        image = PIL.Image.fromarray(
            tile_raster_images(X=rbm.W.get_value(borrow=True).T,
                               img_shape=(28, 28),
                               tile_shape=(10, 10),
                               tile_spacing=(1, 1)))
        image.save('filters_at_epoch_%i.png' % epoch)
        plotting_stop = time.clock()
        plotting_time += (plotting_stop - plotting_start)

    end_time = time.clock()

    pretraining_time = (end_time - start_time) - plotting_time

    print('Training took %f minutes' % (pretraining_time / 60.))

    #################################
    #     Sampling from the RBM     #
    #################################
    # find out the number of test samples
    number_of_test_samples = test_set_x.get_value(borrow=True).shape[0]

    # pick random test examples, with which to initialize the persistent chain
    test_idx = rng.randint(number_of_test_samples - n_chains)
    persistent_vis_chain = theano.shared(
        numpy.asarray(test_set_x.get_value(borrow=True)[test_idx:test_idx +
                                                        n_chains],
                      dtype=theano.config.floatX))

    plot_every = 1000
    # define one step of Gibbs sampling (mf = mean-field) define a
    # function that does `plot_every` steps before returning the
    # sample for plotting
    [presig_hids, hid_mfs, hid_samples, presig_vis,
     vis_mfs, vis_samples], updates =  \
                        theano.scan(rbm.gibbs_vhv,
                                outputs_info=[None,  None, None, None,
                                              None, persistent_vis_chain],
                                n_steps=plot_every)

    # add to updates the shared variable that takes care of our persistent
    # chain.
    updates.update({persistent_vis_chain: vis_samples[-1]})
    # construct the function that implements our persistent chain.
    # we generate the "mean field" activations for plotting and the actual
    # samples for reinitializing the state of our persistent chain
    sample_fn = theano.function([], [vis_mfs[-1], vis_samples[-1]],
                                updates=updates,
                                name='sample_fn')

    # create a space to store the image for plotting ( we need to leave
    # room for the tile_spacing as well)
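    # each 28x28 tile plus 1 pixel of tile_spacing gives a 29-pixel pitch per sample/chain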
    image_data = numpy.zeros((29 * n_samples + 1, 29 * n_chains - 1),
                             dtype='uint8')
    for idx in xrange(n_samples):
        # generate `plot_every` intermediate samples that we discard,
        # because successive samples in the chain are too correlated
        vis_mf, vis_sample = sample_fn()
        print ' ... plotting sample ', idx
        image_data[29 * idx:29 * idx + 28, :] = tile_raster_images(
            X=vis_mf,
            img_shape=(28, 28),
            tile_shape=(1, n_chains),
            tile_spacing=(1, 1))
        # construct image

    image = PIL.Image.fromarray(image_data)
    image.save('samples.png')
    os.chdir('../')
Exemple #36
0
class AELADLSTMS(object):
    def __init__(self, wordlist, argv, aspect_num=0):
        parser = argparse.ArgumentParser()
        parser.add_argument('--name', type=str, default='lstm')
        parser.add_argument('--rseed', type=int, default=int(1000 * time.time()) % 19491001)
        parser.add_argument('--dim_word', type=int, default=300)
        parser.add_argument('--dim_hidden', type=int, default=300)
        parser.add_argument('--dim_aspect', type=int, default=300)
        parser.add_argument('--grained', type=int, default=3, choices=[3])
        parser.add_argument('--regular', type=float, default=0.001)
        parser.add_argument('--word_vector', type=str, default='data/glove.840B.300d.txt')
        args, _ = parser.parse_known_args(argv)

        self.name = args.name
        self.srng = RandomStreams(seed=args.rseed)
        self.dim_word, self.dim_hidden = args.dim_word, args.dim_hidden
        self.dim_aspect = args.dim_aspect
        self.grained = args.grained
        self.regular = args.regular
        self.num = len(wordlist) + 1
        self.init_param()
        self.load_word_vector(args.word_vector, wordlist)
        self.init_function()

    def init_param(self):
        def shared_matrix(dim, name, u=0, b=0):
            matrix = self.srng.uniform(dim, low=-u, high=u, dtype=theano.config.floatX) + b
            f = theano.function([], matrix)
            return theano.shared(f(), name=name)

        u = lambda x: 1 / np.sqrt(x)
        dimc, dimh, dima = self.dim_word, self.dim_hidden, self.dim_aspect
        dim_lstm_para = dimh + dimc
        self.Vw = shared_matrix((self.num, dimc), 'Vw', 0.01)
        self.Wi = shared_matrix((dimh, dim_lstm_para), 'Wi', u(dimh))
        self.Wo = shared_matrix((dimh, dim_lstm_para), 'Wo', u(dimh))
        self.Wf = shared_matrix((dimh, dim_lstm_para), 'Wf', u(dimh))

        self.bi = shared_matrix((dimh,), 'bi', 0.)
        self.bo = shared_matrix((dimh,), 'bo', 0.)
        self.bf = shared_matrix((dimh,), 'bf', 0.)

        self.Wc = shared_matrix((dimh, dim_lstm_para), 'Wc', u(dimh))
        self.bc = shared_matrix((dimh,), 'bc', 0.)

        self.Ws = shared_matrix((dimh + dimh, self.grained), 'Ws', u(dimh))
        self.bs = shared_matrix((self.grained,), 'bs', 0.)

        self.h0, self.c0 = np.zeros(dimh, dtype=theano.config.floatX), np.zeros(dimc,
                                                                                dtype=theano.config.floatX)
        self.params = [self.Wi, self.Wo, self.Wf, self.Wc, self.bi, self.bo, self.bf, self.bc, self.Ws,
                       self.bs]
        self.Wp_L = shared_matrix((dimh, dimh), 'Wp', u(dimh))
        self.Wx_L = shared_matrix((dimh, dimh), 'Wx', u(dimh))
        self.Wp_R = shared_matrix((dimh, dimh), 'Wp', u(dimh))
        self.Wx_R = shared_matrix((dimh, dimh), 'Wx', u(dimh))
        self.params.extend([self.Wp_L, self.Wx_L, self.Wp_R, self.Wx_R])

        self.alpha_h_W_L = shared_matrix((dimh, dimh + dimh), 'alpha_h_W_L', u(dimh * 2))
        self.alpha_h_W_R = shared_matrix((dimh, dimh + dimh), 'alpha_h_W_R', u(dimh * 2))
        self.params.extend([self.alpha_h_W_L, self.alpha_h_W_R])

        self.a_for_left = theano.shared(1.0, name='a_for_left')
        self.a_for_middle = theano.shared(1.0, name='a_for_middle')
        self.b_for_left = theano.shared(0.0, name='b_for_left')

        self.a_back_right = theano.shared(1.0, name='a_back_right')
        self.b_back_right = theano.shared(0.0, name='b_back_right')

        self.params.extend([self.a_for_left, self.a_for_middle, self.b_for_left])
        self.params.extend([self.a_back_right, self.b_back_right])

    def init_function(self):

        self.seq_loc = T.lvector()
        self.seq_idx = T.lvector()
        self.target = T.lvector()
        self.target_content_index = T.lscalar()
        self.seq_len = T.lscalar()
        self.solution = T.matrix()
        self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0)

        self.all_tar_vector = T.take(self.Vw, self.target, axis=0)
        self.tar_vector = T.mean(self.all_tar_vector, axis=0)
        self.target_vector_dim = self.tar_vector.dimshuffle('x', 0)
        self.seq_matrix = T.concatenate([self.seq_matrix[0:self.target_content_index], self.target_vector_dim,
                                         self.seq_matrix[self.target_content_index + 1:]], axis=0)
        h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(self.bc,
                                                                               dtype=theano.config.floatX)

        def rnn(X, aspect):
            def encode_forward(x_t, h_fore, c_fore):
                v = T.concatenate([h_fore, x_t])
                f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
                i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
                o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
                c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
                h_next = o_t * T.tanh(c_next)
                return h_next, c_next

            def encode_backward(x_t, h_fore, c_fore):
                v = T.concatenate([h_fore, x_t])
                f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
                i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
                o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
                c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
                h_next = o_t * T.tanh(c_next)
                return h_next, c_next

            loc_for = T.zeros_like(self.seq_loc) + self.target_content_index
            al_for = self.a_for_left * T.exp(
                -self.b_for_left * T.abs_(
                    self.seq_loc[0:self.target_content_index] - loc_for[0:self.target_content_index]))
            am_for = self.a_for_middle * [1]
            a_for = T.concatenate([al_for, am_for])
            locate_for = T.zeros_like(self.seq_matrix[0:self.target_content_index + 1],
                                      dtype=T.config.floatX) + T.reshape(a_for, [-1, 1])
            loc_back = T.zeros_like(self.seq_loc) + self.target_content_index
            ar_back = self.a_back_right * T.exp(
                -self.b_back_right * T.abs_(
                    self.seq_loc[self.target_content_index + 1:] - loc_back[self.target_content_index + 1:]))
            ar_back = ar_back[::-1]
            a_back = T.concatenate([am_for, ar_back])
            locate_back = T.zeros_like(self.seq_matrix[self.target_content_index:], dtype=T.config.floatX) + T.reshape(
                a_back, [-1, 1])

            scan_result_forward, _forward = theano.scan(fn=encode_forward,
                                                        sequences=locate_for * X[0:self.target_content_index + 1],
                                                        outputs_info=[h, c])
            scan_result_backward, _backward = theano.scan(fn=encode_backward,
                                                          sequences=locate_back * X[self.target_content_index:][::-1],
                                                          outputs_info=[h, c])
            embedding_l = scan_result_forward[0]
            embedding_r = scan_result_backward[0]
            h_target_for = embedding_l[-1]
            h_target_back = embedding_r[-1]

            attention_h_target_l = embedding_l
            cont_l = T.concatenate([h_target_for, h_target_back])
            yuyi_l = T.transpose(cont_l)
            alpha_h_l = T.dot(T.dot(attention_h_target_l, self.alpha_h_W_L), yuyi_l)
            alpha_tmp_l = T.nnet.softmax(alpha_h_l)
            r_l = T.dot(alpha_tmp_l, embedding_l)
            h_star_L = T.tanh(T.dot(r_l, self.Wp_L))

            attention_h_target_r = embedding_r
            cont_r = T.concatenate([h_target_for, h_target_back])
            yuyi_r = T.transpose(cont_r)

            alpha_h_r = T.dot(T.dot(attention_h_target_r, self.alpha_h_W_R), yuyi_r)
            alpha_tmp_r = T.nnet.softmax(alpha_h_r)
            r_r = T.dot(alpha_tmp_r, embedding_r)
            h_star_R = T.tanh(T.dot(r_r, self.Wp_R))
            embedding = T.concatenate([h_star_L, h_star_R],
                                      axis=1)
            return embedding

        embedding = rnn(self.seq_matrix, self.tar_vector)
        embedding_for_train = embedding * self.srng.binomial(embedding.shape, p=0.5, n=1, dtype=embedding.dtype)
        embedding_for_test = embedding * 0.5
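        # standard (non-inverted) dropout: train-time activations are masked with keep
        # probability 0.5, so test-time activations are scaled by the same 0.5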

        self.pred_for_train = T.nnet.softmax(T.dot(embedding_for_train, self.Ws) + self.bs)
        self.pred_for_test = T.nnet.softmax(T.dot(embedding_for_test, self.Ws) + self.bs)

        self.l2 = sum([T.sum(param ** 2) for param in self.params]) - T.sum(self.Vw ** 2)
        self.loss_sen = -T.tensordot(self.solution, T.log(self.pred_for_train), axes=2)
        self.loss_l2 = 0.5 * self.l2 * self.regular
        self.loss = self.loss_sen + self.loss_l2

        grads = T.grad(self.loss, self.params)
        self.updates = collections.OrderedDict()
        self.grad = {}
        for param, grad in zip(self.params, grads):
            g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \
                                         dtype=theano.config.floatX))
            self.grad[param] = g
            self.updates[g] = g + grad

        self.func_train = theano.function(
            inputs=[self.seq_idx, self.target, self.solution,
                    self.target_content_index, self.seq_loc, self.seq_len,
                    theano.In(h, value=self.h0),
                    theano.In(c, value=self.c0)],
            outputs=[self.loss, self.loss_sen, self.loss_l2],
            updates=self.updates,
            on_unused_input='warn')

        self.func_test = theano.function(
            inputs=[self.seq_idx, self.target, self.target_content_index, self.seq_loc, self.seq_len,
                    theano.In(h, value=self.h0),
                    theano.In(c, value=self.c0)],
            outputs=self.pred_for_test,
            on_unused_input='warn')

    def load_word_vector(self, fname, wordlist):
        loader = WordLoader()
        dic = loader.load_word_vector(fname, wordlist, self.dim_word)
        not_found = 0
        Vw = self.Vw.get_value()
        for word, index in wordlist.items():
            try:
                Vw[index] = dic[word]
            except:
                not_found += 1
        print 'not_found:', not_found
        self.Vw.set_value(Vw)
Exemple #37
0
class BaseModel(object):

  def init_start(self, config):
    self._params = {}
    self._is_training = tt.iscalar('is_training')
    self._np_rng = np.random.RandomState(config.seed // 2 + 123)
    if config.device == 'cpu':
      from theano.tensor.shared_randomstreams import RandomStreams          # works on cpu
    else:
      from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams # works on gpu
    self._theano_rng = RandomStreams(config.seed // 2 + 321)
    self._init_scale = config.init_scale
    self._pre_epoch_hooks = []

  def register_pre_epoch_hook(self, func, for_train=False, for_eval=False):
    assert for_train or for_eval
    self._pre_epoch_hooks.append((func, for_train, for_eval))

  def invoke_pre_epoch_hooks(self, is_train=False, is_eval=False):
    assert is_train or is_eval
    for func, for_train, for_eval in self._pre_epoch_hooks:
      if (for_train and is_train) or (for_eval and is_eval):
        func()

  ################### Constructing shared vars ###################

  def get_param_init(self, shape, init_scheme, init_scale=None):
    if isinstance(init_scheme, numbers.Number):
      value = np.full(shape, float(init_scheme))
    elif init_scheme == 'identity':
      assert len(shape) == 2 and shape[0] == shape[1]
      value = np.eye(shape[0])
    elif init_scheme == 'uniform':
      scale = init_scale or self._init_scale
      value = self._np_rng.uniform(low=-scale, high=scale, size=shape)
    elif init_scheme == 'gaussian':
      scale = init_scale or self._init_scale
      value = self._np_rng.normal(loc=0., scale=scale, size=shape)
    elif init_scheme == 'glorot_uniform':
      assert len(shape) == 2
      s = np.sqrt(6.0 / (shape[0] + shape[1]))
      value = self._np_rng.uniform(low=-s, high=s, size=shape)
    elif init_scheme == 'orthogonal':
      assert len(shape) == 2
      u, _, v = np.linalg.svd(self._np_rng.normal(0.0, 1.0, shape), full_matrices=False)
      #assert u.shape == shape
      value = u if u.shape == shape else v
      scale = init_scale or 1.1
      value *= scale
    else:
      raise AssertionError('unrecognized init scheme')
    return value

  def make_param_from_value(self, name, value):
    if name in self._params:
      param = self._params[name]
      if value.shape != param.get_value().shape:
        raise AssertionError('parameter {} re-use attempt with mis-matching shapes: '
          'existing shape {}, requested shape {}'.format(
            name, param.get_value().shape, value.shape))
      return param
    param = get_shared_floatX(value, name)
    self._params[name] = param
    return param

  def make_param(self, name, shape, init_scheme, init_scale=None):
    value = self.get_param_init(shape, init_scheme, init_scale)
    return self.make_param_from_value(name, value)

  def make_concat_param(self, name, shapes, init_schemes, axis):
    if len(shapes) != len(init_schemes):
      raise AssertionError('number of shapes and number of init schemes are incompatible')
    if len(set([shape[:axis] + shape[axis+1:] for shape in shapes])) != 1:
      raise AssertionError('all shapes should be identical on all axes except given axis')
    val = np.concatenate([self.get_param_init(shape, init_scheme)
      for shape, init_scheme in zip(shapes, init_schemes)], axis=axis)
    w = self.make_param_from_value(name, val)
    return w

  ################### I/O ###################

  def save(self, filename):
    logging.getLogger().info('Saving model weights to {}'.format(filename))
    verify_dir_exists(filename)
    param_dict = {name : param.get_value() for name, param in self._params.iteritems()}
    with open(filename, 'wb') as f:
      cPickle.dump(param_dict, f, protocol=cPickle.HIGHEST_PROTOCOL)

  def load_if_exists(self, filename, allow_mismatch=False):
    if not os.path.isfile(filename):
      return False
    logger = logging.getLogger()
    logger.info('Loading model weights found in {}'.format(filename))
    with open(filename, 'rb') as f:
      param_dict = cPickle.load(f)
      param_names, loaded_param_names = set(self._params.keys()), set(param_dict.keys())
      if param_names != loaded_param_names:
        msg = ('Parameter names loaded from {} do not match model\'s parameter names.\n'
          'param names only found in model: {}\n'
          'param names only found in loaded model: {}').format(
            filename, param_names.difference(loaded_param_names),
            loaded_param_names.difference(param_names))
        if allow_mismatch:
          logger.info(msg)
          param_dict = {param_name: param_dict[param_name] for param_name \
            in param_names.intersection(loaded_param_names)}
        else:
          raise AssertionError(msg)
      for name, value in param_dict.iteritems():
        self._params[name].set_value(value)
    return True

  ################### Dropout ###################

  def get_dropout_noise(self, shape, dropout_p):
    if dropout_p == 0:
      return 1
    keep_p = 1 - dropout_p
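    # inverted dropout: the binomial keep-mask is scaled by 1/keep_p at train time,
    # so no rescaling of activations is needed at test time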
    return cast_floatX_np(1. / keep_p) * self._theano_rng.binomial(
      size=shape, p=keep_p, n=1, dtype=floatX)

  def apply_dropout_noise(self, x, noise):
    return ifelse(self._is_training, noise * x, x)

  def dropout(self, x, dropout_p):
    return self.apply_dropout_noise(x, self.get_dropout_noise(x.shape, dropout_p))

  ################### Misc ###################

  def get_param_sizes(self):
    param_sizes = {name: param.get_value().size for name, param in self._params.iteritems()}
    return sum(param_sizes.values()), param_sizes

  ################### Simple layers ###################

  def linear(self, name, x, input_dim, output_dim, with_bias=True, w_init='uniform', bias_init=0):
    # x                 (..., input_dim)
    n = namer(name)
    W = self.make_param(n('W'), (input_dim, output_dim), w_init)
    y = tt.dot(x, W)     # (..., output_dim)
    if with_bias:
      b = self.make_param(n('b'), (output_dim,), bias_init)
      y += b
    return y
        
  def ff(self, name, x, dims, activation, dropout_ps, **kwargs):
    assert len(dims) >= 2
    if dropout_ps:
      if isinstance(dropout_ps, numbers.Number):
        dropout_ps = [dropout_ps] * (len(dims) - 1)
      else:
        assert len(dropout_ps) == len(dims) - 1
    n = namer(name)
    h = x
    if activation == 'relu':
      f = tt.nnet.relu
    elif activation == 'sigmoid':
      f = tt.nnet.sigmoid
    elif activation == 'tanh':
      f = tt.tanh
    else:
      raise AssertionError('unrecognized activation function')
    for i, (input_dim, output_dim) in enumerate(zip(dims[:-1], dims[1:])):
      if dropout_ps:
        h = self.dropout(h, dropout_ps[i])
      h = f(self.linear(n('l%d' % (i+1)), h, input_dim, output_dim, **kwargs))
    return h

  ################### LSTM ###################

  def stacked_bi_lstm(self, name,
    x, x_mask, num_layers, input_dim, hidden_dim, drop_x, drop_h, **kwargs):
    n = namer(name)
    h = x
    for l in range(1, num_layers+1):
      h = self.bi_lstm(n('l%d' % l),
        h, x_mask, input_dim if l == 1 else 2*hidden_dim, hidden_dim, drop_x, drop_h, **kwargs)
    return h    # (timesteps, batch_size, 2*hidden_dim)

  def bi_lstm(self, name, x, x_mask, input_dim, hidden_dim, drop_x, drop_h, **kwargs):
    n = namer(name)
    fwd_h = self.lstm(n('fwd'),
      x, x_mask, input_dim, hidden_dim, drop_x, drop_h, backward=False, **kwargs)
    bck_h = self.lstm(n('bck'),
      x, x_mask, input_dim, hidden_dim, drop_x, drop_h, backward=True, **kwargs)
    bi_h = tt.concatenate([fwd_h, bck_h], axis=2)     # (timesteps, batch_size, 2*hidden_dim)
    return bi_h

  def lstm(self, name,
    x, x_mask,
    input_dim, hidden_dim,
    drop_x, drop_h,
    backward=False, couple_i_and_f=False, learn_initial_state=False,
    tie_x_dropout=True, sep_x_dropout=False,
    sep_h_dropout=False,
    w_init='uniform', u_init='orthogonal', forget_bias_init=1, other_bias_init=0):
    """Customizable uni-directional LSTM layer.
    Handles masks, can learn initial state, input and forget gate can be coupled,
    with recurrent dropout, no peephole connections.
    Args:
      x:                    Theano tensor, shape (timesteps, batch_size, input_dim)
      x_mask:               Theano tensor, shape (timesteps, batch_size)
      input_dim:            int, dimension of input vectors
      hidden_dim:           int, dimension of hidden state
      drop_x:               float, dropout rate to apply to inputs
      drop_h:               float, dropout rate to apply to hidden state
      backward:             boolean, whether to recur over timesteps in reversed order
      couple_i_and_f:       boolean, whether to have input gate = 1 - forget gate
      learn_initial_state:  boolean, whether to have initial cell state and initial hidden state
                            as learnt parameters
      tie_x_dropout:        boolean, whether to have the same dropout masks across timesteps
                            for input
      sep_x_dropout:        boolean, if True dropout is applied over weights of lin. trans. of
                            input; otherwise it is applied over input activations
      sep_h_dropout:        boolean, if True dropout is applied over weights of lin. trans. of
                            hidden state; otherwise it is applied over hidden state activations
      w_init:               string, initialization scheme for weights of lin. trans. of input
      u_init:               string, initialization scheme for weights of lin. trans. of hidden state
      forget_bias_init:     number or string, initialization scheme for forget gate's bias
      other_bias_init:      number or string, initialization scheme for other biases
    Note:
      Proper variational dropout (Gal 2015) is:
        tie_x_dropout=True, sep_x_dropout=True, sep_h_dropout=True
      A faster alternative is:
        tie_x_dropout=True, sep_x_dropout=False, sep_h_dropout=False
    Returns:
      h:                    Theano variable, recurrent hidden states at each timestep,
                            shape (timesteps, batch_size, hidden_dim)
    """
    n = namer(name)
    timesteps, batch_size = x.shape[0], x.shape[1]

    num_non_lin = 3 if couple_i_and_f else 4
    num_gates = num_non_lin - 1

    W = self.make_concat_param(n('W'),            # (input_dim, [3|4]*hidden_dim)
      num_non_lin*[(input_dim, hidden_dim)], num_non_lin*[w_init], axis=1)
    b = self.make_concat_param(n('b'),            # ([3|4]*hidden_dim,)
      num_non_lin*[(hidden_dim,)], [forget_bias_init] + num_gates*[other_bias_init], axis=0)
    U = self.make_concat_param(n('U'),            # (hidden_dim, [3|4]*hidden_dim)
      num_non_lin*[(hidden_dim, hidden_dim)], num_non_lin*[u_init], axis=1) 

    if not sep_x_dropout:
      if tie_x_dropout:
        x = self.apply_dropout_noise(x, self.get_dropout_noise((batch_size, input_dim), drop_x))
      else:
        x = self.dropout(x, drop_x)
      lin_x = tt.dot(x, W) + b                    # (timesteps, batch_size, [3|4]*hidden_dim)
    else:
      if tie_x_dropout:
        x_for_f = self.apply_dropout_noise(
          x, self.get_dropout_noise((batch_size, input_dim), drop_x))
        x_for_o = self.apply_dropout_noise(
          x, self.get_dropout_noise((batch_size, input_dim), drop_x))
        if num_gates == 3:
          x_for_i = self.apply_dropout_noise(
            x, self.get_dropout_noise((batch_size, input_dim), drop_x))
        x_for_g = self.apply_dropout_noise(
          x, self.get_dropout_noise((batch_size, input_dim), drop_x))
      else:
        x_for_f = self.dropout(x, drop_x)
        x_for_o = self.dropout(x, drop_x)
        if num_gates == 3:
          x_for_i = self.dropout(x, drop_x)
        x_for_g = self.dropout(x, drop_x)
      lin_x_tensors = [tt.dot(x_for_f, W[:,:hidden_dim]),
        tt.dot(x_for_o, W[:,hidden_dim:2*hidden_dim])]
      if num_gates == 3:
        lin_x_tensors.append(tt.dot(x_for_i, W[:,2*hidden_dim:3*hidden_dim]))
      lin_x_tensors.append(tt.dot(x_for_g, W[:,num_gates*hidden_dim:]))
      lin_x = tt.concatenate(lin_x_tensors, axis=2) + b # (timesteps, batch_size, [3|4]*hidden_dim)

    def step_fn(lin_x_t, x_mask_t, h_tm1, c_tm1, h_noise, U):
      # lin_x_t       (batch_size, [3|4]*hidden_dim)
      # x_mask_t      (batch_size, 1)
      # h_tm1         (batch_size, hidden_dim)
      # c_tm1         (batch_size, hidden_dim)
      # h_noise       (batch_size, [1|3|4]*hidden_dim)
      #               1 if not sep_h_dropout, otherwise: 3 or 4 depending on num_non_lin
      # U             (hidden_dim, [3|4]*hidden_dim)

      if not sep_h_dropout:
        h_tm1 = self.apply_dropout_noise(h_tm1, h_noise)
        lin_h_tm1 = tt.dot(h_tm1, U)                    # (batch_size, [3|4]*hidden_dim)
      else:
        h_tm1_for_f = self.apply_dropout_noise(h_tm1, h_noise[:,:hidden_dim])
        h_tm1_for_o = self.apply_dropout_noise(h_tm1, h_noise[:,hidden_dim:2*hidden_dim])
        if num_gates == 3:
          h_tm1_for_i = self.apply_dropout_noise(h_tm1, h_noise[:,2*hidden_dim:3*hidden_dim])
        h_tm1_for_g = self.apply_dropout_noise(h_tm1, h_noise[:,num_gates*hidden_dim:])
        lin_h_tm1_tensors = [tt.dot(h_tm1_for_f, U[:,:hidden_dim]),
          tt.dot(h_tm1_for_o, U[:,hidden_dim:2*hidden_dim])]
        if num_gates == 3:
          lin_h_tm1_tensors.append(tt.dot(h_tm1_for_i, U[:,2*hidden_dim:3*hidden_dim]))
        lin_h_tm1_tensors.append(tt.dot(h_tm1_for_g, U[:,num_gates*hidden_dim:]))
        lin_h_tm1 = tt.concatenate(lin_h_tm1_tensors, axis=1)             # (batch_size, [3|4]*hidden_dim)

      lin = lin_x_t + lin_h_tm1                                           # (batch_size, [3|4]*hidden_dim)

      gates = tt.nnet.sigmoid(lin[:, :num_gates*hidden_dim])              # (batch_size, [3|4]*hidden_dim)
      f_gate = gates[:, :hidden_dim]                                      # (batch_size, hidden_dim)
      o_gate = gates[:, hidden_dim:2*hidden_dim]                          # (batch_size, hidden_dim)
      i_gate = gates[:, 2*hidden_dim:] if num_gates == 3 else 1 - f_gate  # (batch_size, hidden_dim)
      g = tt.tanh(lin[:, num_gates*hidden_dim:])                          # (batch_size, hidden_dim)

      c_t = f_gate * c_tm1 + i_gate * g
      h_t = o_gate * tt.tanh(c_t)

      h_t = tt.switch(x_mask_t, h_t, h_tm1)
      c_t = tt.switch(x_mask_t, c_t, c_tm1)

      return h_t, c_t
      # end of step_fn

    if learn_initial_state:
      h0 = self.make_param(n('h0'), (hidden_dim,), 0)
      c0 = self.make_param(n('c0'), (hidden_dim,), 0)
      batch_h0 = tt.extra_ops.repeat(h0[None,:], batch_size, axis=0)
      batch_c0 = tt.extra_ops.repeat(c0[None,:], batch_size, axis=0)
    else:
      batch_h0 = batch_c0 = tt.zeros((batch_size, hidden_dim))

    x_mask = tt.shape_padright(x_mask)    # (timesteps, batch_size, 1)

    original_x_mask = x_mask
    if backward:
      lin_x = lin_x[::-1]
      x_mask = x_mask[::-1]

    h_noise = self.get_dropout_noise(
      (batch_size, hidden_dim if not sep_h_dropout else num_non_lin*hidden_dim), drop_h)

    results, _ = theano.scan(step_fn,
        sequences = [lin_x, x_mask],
        outputs_info = [batch_h0, batch_c0],
        non_sequences = [h_noise, U],
        name = n('scan'))

    h = results[0]    # (timesteps, batch_size, hidden_dim)
    if backward:
      h = h[::-1]
    h *= original_x_mask
    return h
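
  # A minimal usage sketch (hypothetical names and dimensions, not from the original source),
  # assuming a BaseModel subclass whose init_start() has already been called:
  #
  #   x = tt.tensor3('x')            # (timesteps, batch_size, input_dim)
  #   x_mask = tt.matrix('x_mask')   # (timesteps, batch_size)
  #   h = model.stacked_bi_lstm('encoder', x, x_mask, num_layers=2,
  #                             input_dim=300, hidden_dim=128, drop_x=0.2, drop_h=0.3,
  #                             tie_x_dropout=True, sep_x_dropout=True, sep_h_dropout=True)
  #   # h: (timesteps, batch_size, 2*hidden_dim); the last three flags give the variational
  #   # dropout configuration described in the lstm docstring above.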
Exemple #38
0
 def __init__(self, L1, ratio):
     self.random_stream = RandomStreams(seed=1)
     self.L1 = L1
     self.one_ratio = ratio
Exemple #39
0
import theano.tensor as T
from theano import function
from theano.tensor.shared_randomstreams import RandomStreams
import numpy
from theano.printing import pydotprint

random = RandomStreams(seed=42)

a = random.normal((1, 3))
b = T.dmatrix('a')

f1 = a * b

g1 = function([b], f1)
pydotprint(g1, outfile="s9.png", var_with_name_simple=True)
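
# every call to g1 draws fresh samples for `a`, because the RandomStreams state update
# is compiled into the function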

print "Invocation 1:", g1(numpy.ones((1, 3)))
print "Invocation 2:", g1(numpy.ones((1, 3)))
print "Invocation 3:", g1(numpy.ones((1, 3)))

# Invocation 1: [[ 1.25614218 -0.53793023 -0.10434045]]
# Invocation 2: [[ 0.66992188 -0.70813926  0.99601177]]
# Invocation 3: [[ 0.0724739  -0.66508406  0.93707751]]
Exemple #40
0
def train_dA(
    learning_rate,
    training_epochs,
    window_size,
    corruption_level,
    n_hidden,
    train_set,
    output_folder,
    train_algo="sgd"):

    """
    This dA is tested on ICHI_Data

    :type learning_rate: float
    :param learning_rate: learning rate used for training the Denoising
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type window_size: int
    :param window_size: size of window used for training

    :type corruption_level: float
    :param corruption_level: corruption_level used for training the Denoising
                          AutoEncoder

    :type n_hidden: int
    :param n_hidden: count of nodes in hidden layer

    :type output_folder: string
    :param output_folder: folder for cost and error graphics with results

    """
    
    start_time = time.clock()
    
    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))
    
    x = T.vector('x')    

    da = dA(
        numpy_rng=rng,
        theano_rng=theano_rng,
        input=x,
        n_visible=window_size,
        n_hidden=n_hidden
    )
    
    if train_algo == "sgd":
        updated_da = train_da_sgd(
            learning_rate=learning_rate,
            window_size=window_size,
            training_epochs=training_epochs,
            corruption_level=corruption_level,
            train_set=train_set,
            da=da
        )
        base_folder = "da_sgd"
    else:
        updated_da = train_da_cg(
            da=da,
            train_set=train_set,
            window_size=window_size,
            corruption_level=corruption_level,
            training_epochs=training_epochs
        )
        base_folder = "da_cg"
    
    visualize_da(train_cost=updated_da.train_cost_array,
                 window_size=window_size,
                 learning_rate=learning_rate,
                 corruption_level=corruption_level,
                 n_hidden=n_hidden,
                 output_folder=output_folder,
                 base_folder=base_folder)
    
    end_time = time.clock()
    training_time = (end_time - start_time)
    
    print >> sys.stderr, (('The code for file ' +
                           os.path.split(__file__)[1] +
                           ' with corruption %f ran for %.2fm') %
                          (corruption_level, training_time / 60.))
Exemple #41
0
import numpy
import numpy.random
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
numpy_rng = numpy.random.RandomState(1)
theano_rng = RandomStreams(1)
from scipy import misc

from hyperParams import *


def loadFrames():

    frames_train = numpy.load(data_path + 'train/' +
                              'frames.npy')[:numframes_train, :]
    frames_train = numpy.reshape(frames_train,
                                 (numseqs_train, seq_dim)).astype('float32')

    frames_test = numpy.load(data_path + 'test/' +
                             'frames.npy')[:numframes_test, :]
    frames_test = numpy.reshape(frames_test,
                                (numseqs_test, seq_dim)).astype('float32')

    return (frames_train, frames_test)


def loadOpticalFlow():

    ofx_train = numpy.load(data_path + 'train/' +
                           'ofx.npy')[:numframes_train, :]
    def __init__(self, D, M, Q, Domain_number, m, pre_params, Hiddenlayerdim1,
                 Hiddenlayerdim2):

        self.Xlabel = T.matrix('Xlabel')

        self.X = T.matrix('X')
        N = self.X.shape[0]

        self.Weight = T.matrix('Weight')

        ker = kernel(Q)
        mmd = MMD(M, Domain_number)

        mu_value = np.random.randn(M, D)
        Sigma_b_value = np.zeros((M, M)) + np.log(0.01)

        Z_value = m[:M]
        self.test = Z_value
        ls_value = np.zeros(Domain_number) + np.log(0.1)

        self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
        self.Sigma_b = theano.shared(value=Sigma_b_value,
                                     name='Sigma_b',
                                     borrow=True)
        self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
        self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

        self.params = [self.mu, self.Sigma_b, self.Z, self.ls]

        self.hiddenLayer_x = HiddenLayer(rng=rng,
                                         input=self.X,
                                         n_in=D,
                                         n_out=Hiddenlayerdim1,
                                         activation=T.nnet.relu,
                                         number='_x')
        self.hiddenLayer_hidden = HiddenLayer(rng=rng,
                                              input=self.hiddenLayer_x.output,
                                              n_in=Hiddenlayerdim1,
                                              n_out=Hiddenlayerdim2,
                                              activation=T.nnet.relu,
                                              number='_h')
        self.hiddenLayer_m = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_hidden.output,
                                         n_in=Hiddenlayerdim2,
                                         n_out=Q,
                                         activation=T.nnet.relu,
                                         number='_m')
        self.hiddenLayer_S = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_hidden.output,
                                         n_in=Hiddenlayerdim2,
                                         n_out=Q,
                                         activation=T.nnet.relu,
                                         number='_S')

        self.loc_params = []
        self.loc_params.extend(self.hiddenLayer_x.params)
        self.loc_params.extend(self.hiddenLayer_hidden.params)
        self.loc_params.extend(self.hiddenLayer_m.params)
        self.loc_params.extend(self.hiddenLayer_S.params)

        self.local_params = {}
        for i in self.loc_params:
            self.local_params[str(i)] = i

        self.params.extend(ker.params)
        self.params.extend(mmd.params)

        self.hyp_params = {}
        for i in [self.mu, self.Sigma_b, self.ls]:
            self.hyp_params[str(i)] = i

        self.Z_params = {}
        for i in [self.Z]:
            self.Z_params[str(i)] = i

        self.global_params = {}
        for i in self.params:
            self.global_params[str(i)] = i

        self.params.extend(self.hiddenLayer_x.params)
        self.params.extend(self.hiddenLayer_hidden.params)
        self.params.extend(self.hiddenLayer_m.params)
        self.params.extend(self.hiddenLayer_S.params)

        self.wrt = {}
        for i in self.params:
            self.wrt[str(i)] = i

        for i, j in pre_params.items():
            self.wrt[i].set_value(j)

        m = self.hiddenLayer_m.output
        S_0 = self.hiddenLayer_S.output
        S_1 = T.exp(S_0)
        S = T.sqrt(S_1)

        from theano.tensor.shared_randomstreams import RandomStreams
        srng = RandomStreams(seed=234)
        eps_NQ = srng.normal((N, Q))
        eps_M = srng.normal((M, D))  # the mean and the variance need different random draws, so they are defined separately

        beta = T.exp(self.ls)
        # u is not diagonal, so a triangular matrix has to be built, e.g. via a Cholesky-style decomposition

        Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) +
                       T.diag(T.exp(T.diag(self.Sigma_b))))

        # scale transformation
        mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

        Xtilda = m + S * eps_NQ
        self.U = mu_scaled + Sigma_scaled.dot(eps_M)

        Kmm = ker.RBF(self.Z)
        Kmm = mmd.MMD_kenel_Xonly(mmd.Zlabel_T, Kmm, self.Weight)
        KmmInv = sT.matrix_inverse(Kmm)

        Kmn = ker.RBF(self.Z, Xtilda)
        Kmn = mmd.MMD_kenel_ZX(self.Xlabel, Kmn, self.Weight)

        Knn = ker.RBF(Xtilda)
        Knn = mmd.MMD_kenel_Xonly(self.Xlabel, Knn, self.Weight)

        Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))

        Kinterval = T.dot(KmmInv, Kmn)

        mean_U = T.dot(Kinterval.T, self.U)
        betaI = T.diag(T.dot(self.Xlabel, beta))
        Covariance = betaI

        self.LL = (self.log_mvn(self.X, mean_U, Covariance) -
                   0.5 * T.sum(T.dot(betaI, Ktilda)))
        self.KL_X = -self.KLD_X(m, S)
        self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
Exemple #43
0
import pickle
import sys
import time

import numpy
import theano
import theano.sparse
import theano.tensor as T
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from theano.tensor.shared_randomstreams import RandomStreams

import dl_utils as ut
import sampling_based_gaussian_binary_rbm_sparse as gbrbm

srng = RandomStreams(seed=234)
rng = numpy.random
rng.seed(1234)
batch_size = 1000  # batch size
lr = 0.0006  # learning rate
lambda1 = 0.0001  # regularisation rate (was .01)
hidden0 = 300  # hidden layer 0
hidden1 = 300  # hidden layer 1
hidden2 = 100  # hidden layer 2
acti_type = 'tanh'  # activation type
epoch = 100  # number of epochs
advertiser = '2997'
if len(sys.argv) > 1:
    advertiser = sys.argv[1]
train_file = '../data/train.fm.txt'  # training file
test_file = '../data/test.fm.txt'  # test file
Exemple #44
0
    def __init__(self,
                 numargs,
                 embed_size,
                 pred_vocab_size,
                 arg_vocab_size,
                 initial_pred_rep=None,
                 initial_arg_rep=None,
                 margin=5,
                 lr=0.01,
                 activation=T.nnet.sigmoid):
        numpy_rng = numpy.random.RandomState(12345)
        theano_rng = RandomStreams(54321)
        self.lr = lr
        #margin = 5
        # Initializing predicate representations
        if initial_pred_rep is not None:
            num_preds, pred_dim = initial_pred_rep.shape
            assert pred_vocab_size == num_preds, "Initial predicate representation is not the same size as pred_vocab_size"
            assert embed_size == pred_dim, "Initial predicate representation does not have the same dimensionality as embed_size"
        else:
            initial_pred_rep_range = 4 * numpy.sqrt(
                6. / (pred_vocab_size + embed_size))
            initial_pred_rep = numpy.asarray(
                numpy_rng.uniform(low=-initial_pred_rep_range,
                                  high=initial_pred_rep_range,
                                  size=(pred_vocab_size, embed_size)))

        self.pred_rep = theano.shared(value=initial_pred_rep, name='P')

        # Initializing argument representations
        if initial_arg_rep is not None:
            arg_rep_len, arg_dim = initial_arg_rep.shape
            assert arg_vocab_size == arg_rep_len, "Initial argument representation is not the same size as arg_vocab_size"
            assert embed_size == arg_dim, "Initial argument representation does not have the same dimensionality as embed_size"
        else:
            initial_arg_rep_range = 4 * numpy.sqrt(
                6. / (arg_vocab_size + embed_size))
            initial_arg_rep = numpy.asarray(
                numpy_rng.uniform(low=-initial_arg_rep_range,
                                  high=initial_arg_rep_range,
                                  size=(arg_vocab_size, embed_size)))

        self.arg_rep = theano.shared(value=initial_arg_rep, name='A')

        # Initialize scorer
        scorer_dim = embed_size * (numargs + 1)  # Predicate is +1
        initial_scorer_range = 4 * numpy.sqrt(6. / scorer_dim)
        initial_scorer = numpy.asarray(
            numpy_rng.uniform(low=-initial_scorer_range,
                              high=initial_scorer_range,
                              size=scorer_dim))
        self.scorer = theano.shared(value=initial_scorer, name='s')

        # Initialize indicator
        indicator_dim = embed_size * (numargs + 1)  # Predicate is +1
        initial_indicator_range = 4 * numpy.sqrt(6. /
                                                 (indicator_dim + numargs))
        initial_indicator = numpy.asarray(
            numpy_rng.uniform(low=-initial_indicator_range,
                              high=initial_indicator_range,
                              size=(indicator_dim, numargs)))
        self.indicator = theano.shared(value=initial_indicator, name='I')

        # Define symbolic pred-arg
        self.pred_ind = T.iscalar('p')
        self.arg_inds = T.iscalars(numargs)
        pred = self.pred_rep[self.pred_ind].reshape((1, embed_size))
        args = self.arg_rep[self.arg_inds].reshape((1, embed_size * numargs))
        pred_arg = activation(T.concatenate([pred, args], axis=1))

        # Define symbolic rand pred-arg for training scorer
        rand_pred_ind = theano_rng.random_integers(low=0,
                                                   high=pred_vocab_size - 1)
        rand_arg_inds = theano_rng.random_integers([1, numargs],
                                                   low=0,
                                                   high=arg_vocab_size - 1)
        rand_pred = self.pred_rep[rand_pred_ind].reshape((1, embed_size))
        rand_args = self.arg_rep[rand_arg_inds].reshape(
            (1, embed_size * numargs))
        rand_pred_arg = activation(
            T.concatenate([rand_pred, rand_args], axis=1))

        # Define symbolic pred_rand-arg for training indicator
        pred_rand_arg = activation(T.concatenate([pred, rand_args], axis=1))

        # Define scores and loss
        self.corr_score = T.sum(T.dot(pred_arg, self.scorer))
        rand_score = T.sum(T.dot(rand_pred_arg, self.scorer))
        self.margin_loss = T.maximum(0, margin - self.corr_score + rand_score)

        # Define indicator values and loss
        orig_ind_labels = T.constant(numpy.zeros(numargs))
        self.indicator_pred = T.nnet.sigmoid(T.dot(pred_arg, self.indicator))
        rand_ind_labels = T.constant(numpy.ones(numargs))
        rand_indicator_pred = T.nnet.sigmoid(
            T.dot(pred_rand_arg, self.indicator))
        self.indicator_loss = T.mean(
            (self.indicator_pred - orig_ind_labels)**2) + T.mean(
                (rand_indicator_pred - rand_ind_labels)**2)

        # Define params and inputs
        self.score_params = [self.pred_rep, self.arg_rep, self.scorer]
        self.indicator_params = [self.pred_rep, self.arg_rep, self.indicator]
        self.score_ind_inputs = [self.pred_ind] + list(self.arg_inds)
Exemple #45
0
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 cfg=None,
                 dnn_shared=None,
                 shared_layers=[]):

        self.layers = []
        self.dropout_layers = []
        self.params = []
        self.delta_params = []

        self.cfg = cfg
        self.n_ins = cfg.n_ins
        self.n_outs = cfg.n_outs
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation

        self.do_maxout = cfg.do_maxout
        self.pool_size = cfg.pool_size
        self.input_dropout_factor = cfg.input_dropout_factor
        self.dropout_factor = cfg.dropout_factor

        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        for i in xrange(self.hidden_layers_number):
            # construct the hidden layer
            if i == 0:
                input_size = self.n_ins
                layer_input = self.x
                if self.input_dropout_factor > 0.0:
                    dropout_layer_input = _dropout_from_layer(
                        theano_rng, self.x, self.input_dropout_factor)
                else:
                    dropout_layer_input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                layer_input = (
                    1 - self.dropout_factor[i - 1]) * self.layers[-1].output
                dropout_layer_input = self.dropout_layers[-1].dropout_output

            W = None
            b = None
            if (i in shared_layers):
                W = dnn_shared.layers[i].W
                b = dnn_shared.layers[i].b

            if self.do_maxout == False:
                dropout_layer = DropoutHiddenLayer(
                    rng=numpy_rng,
                    input=dropout_layer_input,
                    n_in=input_size,
                    n_out=self.hidden_layers_sizes[i],
                    W=W,
                    b=b,
                    activation=self.activation,
                    dropout_factor=self.dropout_factor[i])
                hidden_layer = HiddenLayer(rng=numpy_rng,
                                           input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i],
                                           activation=self.activation,
                                           W=dropout_layer.W,
                                           b=dropout_layer.b)
            else:
                dropout_layer = DropoutHiddenLayer(
                    rng=numpy_rng,
                    input=dropout_layer_input,
                    n_in=input_size,
                    n_out=self.hidden_layers_sizes[i] * self.pool_size,
                    W=W,
                    b=b,
                    activation=(lambda x: 1.0 * x),
                    dropout_factor=self.dropout_factor[i],
                    do_maxout=True,
                    pool_size=self.pool_size)
                hidden_layer = HiddenLayer(rng=numpy_rng,
                                           input=layer_input,
                                           n_in=input_size,
                                           n_out=self.hidden_layers_sizes[i] *
                                           self.pool_size,
                                           activation=(lambda x: 1.0 * x),
                                           W=dropout_layer.W,
                                           b=dropout_layer.b,
                                           do_maxout=True,
                                           pool_size=self.pool_size)
            # add the layer to our list of layers
            self.layers.append(hidden_layer)
            self.dropout_layers.append(dropout_layer)
            self.params.extend(dropout_layer.params)
            self.delta_params.extend(dropout_layer.delta_params)
        # We now need to add a logistic layer on top of the MLP
        self.dropout_logLayer = LogisticRegression(
            input=self.dropout_layers[-1].dropout_output,
            n_in=self.hidden_layers_sizes[-1],
            n_out=self.n_outs)

        self.logLayer = LogisticRegression(
            input=(1 - self.dropout_factor[-1]) * self.layers[-1].output,
            n_in=self.hidden_layers_sizes[-1],
            n_out=self.n_outs,
            W=self.dropout_logLayer.W,
            b=self.dropout_logLayer.b)

        self.dropout_layers.append(self.dropout_logLayer)
        self.layers.append(self.logLayer)
        self.params.extend(self.dropout_logLayer.params)
        self.delta_params.extend(self.dropout_logLayer.delta_params)

        # compute the cost
        self.finetune_cost = self.dropout_logLayer.negative_log_likelihood(
            self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l1_reg * (abs(W).sum())

        if self.l2_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()
    def __init__(
        self,
        numpy_rng,
        theano_rng=None,
        n_ins=23,
        hidden_layers_size=[128,32],
        corruption_levels=[0.0,0.0],
        v_h_learning_rates=[0.1,0.1],
        h_v_learning_rates=[0.1,0.1]
    ):
        #self.hidden_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_size)
        self.v_h_learning_rates = v_h_learning_rates
        self.h_v_learning_rates = h_v_learning_rates
        self.corruption_levels = corruption_levels

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.x = T.matrix('x') 

        # The SdA is an MLP, for which all weights of intermediate layers
        # are shared with a different denoising autoencoder.
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer.
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well).
        # During fine-tuning we will finish training the SdA by doing
        # stochastic gradient descent on the MLP.

        for i in range(self.n_layers):
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_size[i - 1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.dA_layers[-1].output

            '''
            hidden_layer = HiddenLayer(rng=numpy_rng,
                                        input = layer_input,
                                        n_in = input_size,
                                        n_out=hidden_layers_size[i],
                                        activation=T.nnet.sigmoid)

            self.hidden_layers.append(hidden_layer)
            '''

            if i == 0:
                dA_layer = dA(numpy_rng=numpy_rng,
                              theano_rng = theano_rng,
                              input=layer_input,
                              n_visible=input_size,
                              n_hidden=hidden_layers_size[i],#W=hidden_layer.W,bhid=hidden_layer.b,
                              v_h_active = T.nnet.sigmoid)
            else:
                dA_layer = dA(numpy_rng=numpy_rng,
                              theano_rng = theano_rng,
                              input=layer_input,
                              n_visible=input_size,
                              n_hidden=hidden_layers_size[i],#W=hidden_layer.W,bhid=hidden_layer.b,
                              v_h_active = T.nnet.sigmoid,
                              h_v_active = T.nnet.sigmoid
                              )
            self.dA_layers.append(dA_layer)
            self.params.extend(dA_layer.params)


        #reconstructed_input = self.x
        #reconstructed_input = self.dA_layers[0].x
        #for i in range(self.n_layers):
        #    reconstructed_input = self.dA_layers[i].get_hidden_values(reconstructed_input)
        #reconstructed_input = [self.x]
        #for i in range(self.n_layers):
        #    temp = self.dA_layers[i].get_hidden_values(reconstructed_input[-1])
        #    reconstructed_input.append(temp)
        reconstructed_input = []
        reconstructed_input.append(self.dA_layers[-1].output)
        for i in range(self.n_layers - 1, -1, -1):
            temp = self.dA_layers[i].get_reconstructed_input(reconstructed_input[-1])
            reconstructed_input.append(temp)

        self.finetune_cost = self.dA_layers[0].get_error(reconstructed_input[-1])
def adam(params,
         cost=None,
         gradients=None,
         learningrate=0.0002,
         beta1=0.9,
         beta2=0.999,
         epsilon=1e-8,
         eta=0.,
         gamma=0.55,
         iterstart=0):
    """
    Computes the updates for ADAM.

    :type params: list
    :param params: Network parameters.

    :type cost: theano.tensor.var.TensorVariable
    :param cost: Cost variable (scalar). Optional if the gradient is provided.

    :type gradients: list
    :param gradients: Gradient of a cost w.r.t. parameters. Optional if the cost is provided.

    :type learningrate: theano.tensor.var.TensorVariable or float
    :param learningrate: Learning rate of SGD. Can be a float (static) or a dynamic theano variable.

    :type beta1: float
    :param beta1: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980

    :type beta2: float
    :param beta2: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980

    :type epsilon: float
    :param epsilon: See Kingma and Ba 2014: http://arxiv.org/abs/1412.6980

    :type eta: float
    :param eta: Eta for noisy gradient. See Neelakantan et al. 2015: http://arxiv.org/pdf/1511.06807v1.pdf

    :type gamma: float
    :param gamma: Gamma for noisy gradient. See Neelakantan et al. 2015: http://arxiv.org/pdf/1511.06807v1.pdf

    :type iterstart: int or float
    :param iterstart: Adam anneals the learning rate with iterations. This parameter specifies the initial value of the
                      iteration count, such that the learning rate is scaled appropriately (or the model might jump out
                      of the potential minimum where it's at).

    :return: List of updates
    """

    # Validate input
    assert not (cost is None and gradients is None), "Update function adam requires either a cost scalar or a list of " \
                                                     "gradients."

    # Compute gradients if requested
    if gradients is None and cost is not None:
        pdC = T.grad(cost, wrt=params)
        # Kill gradients if cost is nan
        dC = [
            th.ifelse.ifelse(T.isnan(cost), T.zeros_like(dparam), dparam)
            for dparam in pdC
        ]
    else:
        dC = gradients

    updates = []

    # Gradient noising
    if not (eta == 0):
        # RNG
        srng = RandomStreams()
        # Iteration counter
        itercount = th.shared(np.asarray(iterstart, dtype=th.config.floatX))
        # Add noise
        dC = [
            dparam + srng.normal(size=dparam.shape,
                                 std=T.sqrt(eta / (1 + itercount)**gamma),
                                 dtype='floatX') for dparam in dC
        ]
        # Update itercount
        updates.append((itercount, itercount + 1))

    # Implementation as in reference paper, nothing spectacular here...
    tm1 = th.shared(np.asarray(iterstart, dtype=th.config.floatX))
    t = tm1 + 1
    at = learningrate * T.sqrt(1 - beta2**t) / (1 - beta1**t)

    for param, dparam in zip(params, dC):
        paramshape = param.get_value().shape

        mtm1 = th.shared(np.zeros(paramshape, dtype=th.config.floatX))
        vtm1 = th.shared(np.zeros(paramshape, dtype=th.config.floatX))

        mt = beta1 * mtm1 + (1 - beta1) * dparam
        vt = beta2 * vtm1 + (1 - beta2) * dparam**2
        u = at * mt / (T.sqrt(vt) + epsilon)

        updates.append((mtm1, mt))
        updates.append((vtm1, vt))
        updates.append((param, param - u))

    updates.append((tm1, t))

    return updates
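
# ---------------------------------------------------------------------------
# Minimal usage sketch for adam() above. The toy model (a linear regression on
# 5 features), the variable names and the learning rate are illustrative
# assumptions; they are not part of the original snippet.
# ---------------------------------------------------------------------------
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')  # inputs, shape (batch, 5)
t = T.vector('t')  # targets, shape (batch,)
w = theano.shared(np.zeros(5, dtype=theano.config.floatX), name='w')
b = theano.shared(np.asarray(0., dtype=theano.config.floatX), name='b')
cost = T.mean((T.dot(x, w) + b - t) ** 2)

# adam() returns a list of (shared_variable, update_expression) pairs that can
# be passed directly to theano.function.
updates = adam([w, b], cost=cost, learningrate=1e-3)
train_step = theano.function([x, t], cost, updates=updates)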
Exemple #48
0
 def __init__(self, train, test, alpha_lambda, n_user, n_item, n_in,
              n_hidden):
     """
     Build the model parameters.
     :param train: training data, already padded with masks
     :param test: test data, already padded with masks
     :param n_user: actual number of users
     :param n_item: actual number of items; __init__() appends one extra item as a padding symbol
     :param n_in: dimensionality of the RNN input vectors
     :param n_hidden: dimensionality of the RNN hidden vectors
     :return:
     """
      # Taken from the dAE part of the official Theano tutorial.
      rng = np.random.RandomState(123)
      self.thea_rng = RandomStreams(rng.randint(2**30))  # its random functions can run on the GPU.
      # train/test sequences already padded with masks
     tra_buys_masks, tra_masks, tra_buys_neg_masks = train
     tes_buys_masks, tes_masks, tes_buys_neg_masks = test
     self.tra_buys_masks = theano.shared(borrow=True,
                                         value=np.asarray(tra_buys_masks,
                                                          dtype='int32'))
     self.tes_buys_masks = theano.shared(borrow=True,
                                         value=np.asarray(tes_buys_masks,
                                                          dtype='int32'))
     self.tra_masks = theano.shared(borrow=True,
                                    value=np.asarray(tra_masks,
                                                     dtype='int32'))
     self.tes_masks = theano.shared(borrow=True,
                                    value=np.asarray(tes_masks,
                                                     dtype='int32'))
     self.tra_buys_neg_masks = theano.shared(borrow=True,
                                             value=np.asarray(
                                                 tra_buys_neg_masks,
                                                 dtype='int32'))
     self.tes_buys_neg_masks = theano.shared(borrow=True,
                                             value=np.asarray(
                                                 tes_buys_neg_masks,
                                                 dtype='int32'))
      # put the hyperparameters into shared variables
     self.alpha_lambda = theano.shared(borrow=True,
                                       value=np.asarray(
                                           alpha_lambda,
                                           dtype=theano.config.floatX))
      # initialisation: define local variables first, then promote them to instance attributes via self.
     rang = 0.5
     lt = uniform(-rang, rang,
                   (n_item + 1, n_in))  # one extra row (the padding symbol) used to pad purchase sequences / items that do not exist
     ui = uniform(-rang, rang, (4, n_hidden, n_hidden))
     wh = uniform(-rang, rang, (4, n_hidden, n_hidden))
     c0 = np.zeros((n_hidden, ), dtype=theano.config.floatX)
     h0 = np.zeros((n_hidden, ), dtype=theano.config.floatX)
     bi = np.zeros((4, n_hidden), dtype=theano.config.floatX)
      # build the shared parameters.
     self.lt = theano.shared(borrow=True,
                             value=lt.astype(theano.config.floatX))
     self.ui = theano.shared(borrow=True,
                             value=ui.astype(theano.config.floatX))
     self.wh = theano.shared(borrow=True,
                             value=wh.astype(theano.config.floatX))
     self.c0 = theano.shared(borrow=True, value=c0)
     self.h0 = theano.shared(borrow=True, value=h0)
     self.bi = theano.shared(borrow=True, value=bi)
      # store the trained user/item representations, used to score all users against all items: users * items
     trained_items = uniform(-rang, rang, (n_item + 1, n_hidden))
     trained_users = uniform(-rang, rang, (n_user, n_hidden))
     self.trained_items = theano.shared(borrow=True,
                                        value=trained_items.astype(
                                            theano.config.floatX))
     self.trained_users = theano.shared(borrow=True,
                                        value=trained_users.astype(
                                            theano.config.floatX))
Exemple #49
0
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10):
        """This class is made to support a variable number of layers. 

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial 
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is 
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layer sizes, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels

        # The DBN is an MLP, for which all weights of intermediate layers are shared with a
        # different RBM.  We will first construct the DBN as a deep multilayer perceptron, and
        # when constructing each sigmoidal layer we also construct an RBM that shares weights
        # with that layer. During pretraining we will train these RBMs (which will lead
        # to changing the weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of the layer below or
            # the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden layer below or the
            # input of the DBN if you are on the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question... but we are going to only declare that
            # the parameters of the sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(\
                         input = self.sigmoid_layers[-1].output,\
                         n_in = hidden_layers_sizes[-1], n_out = n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
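
# Construction sketch for the class above (only its __init__ is shown in this
# snippet). The class name "DBN" and the MNIST-like sizes are assumptions made
# here for illustration.
import numpy

numpy_rng = numpy.random.RandomState(123)
dbn = DBN(numpy_rng=numpy_rng,
          n_ins=28 * 28,                    # e.g. rasterized MNIST images
          hidden_layers_sizes=[500, 500],
          n_outs=10)
# dbn.params collects the sigmoid-layer and logistic-layer parameters;
# dbn.rbm_layers holds one RBM per hidden layer for layer-wise pretraining.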
Exemple #50
0
 def __init__(self, rate):
     self.p = numpy.array(1 - rate).astype(theano.config.floatX)
     self.rng = RandomStreams(numpy.random.randint(1234))
Exemple #51
0
    def __init__(self, input=None, n_visible=784, n_hidden=500, \
        W=None, hbias=None, vbias=None, numpy_rng=None,
        theano_rng=None):
        """
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN network; in a DBN,
        the weights are shared between RBMs and layers of a MLP

        :param hbias: None for standalone RBMs or symbolic variable pointing
        to a shared hidden units bias vector in case RBM is part of a
        different network

        :param vbias: None for standalone RBMs or a symbolic variable
        pointing to a shared visible units bias
        """

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            # create a number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        if W is None:
            # W is initialized with `initial_W` which is uniformly
            # sampled from -4*sqrt(6./(n_visible+n_hidden)) and
            # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so
            # that the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                size=(n_visible, n_hidden)),
                                      dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            W = theano.shared(value=initial_W, name='W', borrow=True)

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(
                n_hidden, dtype=theano.config.floatX),
                                  name='hbias',
                                  borrow=True)

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(
                n_visible, dtype=theano.config.floatX),
                                  name='vbias',
                                  borrow=True)

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input
        if not input:
            self.input = T.matrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
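
# Construction sketch for the constructor above, used as a standalone RBM
# (the class name RBM follows its usage elsewhere in these snippets; the sizes
# below are just the documented defaults).
import numpy
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

numpy_rng = numpy.random.RandomState(1234)
theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
x = T.matrix('x')
rbm = RBM(input=x,
          n_visible=784,
          n_hidden=500,
          numpy_rng=numpy_rng,
          theano_rng=theano_rng)
# rbm.params == [rbm.W, rbm.hbias, rbm.vbias]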
Exemple #52
0
    def __init__(self, nin, nout, nhidden, ngauss, nvar):
        self.nin = nin
        self.nout = nout
        self.nhidden = nhidden
        self.ngauss = ngauss
        self.nu = nu = (self.nout * (self.nout + 1)) // 2

        # In/out:
        x = T.dmatrix("x")
        theta = T.dmatrix("theta")

        # Parameters:
        W = theano.shared(np.random.rand(nin, nhidden), name="W")
        b = theano.shared(np.random.rand(nhidden), name="b")
        y = T.dot(x, W) + b

        W_alpha = theano.shared(1e-8 * np.random.rand(nhidden, ngauss),
                                name="W_alpha")
        b_alpha = theano.shared(np.zeros(ngauss), name="b_alpha")
        alpha = T.nnet.softmax(T.dot(y, W_alpha) + b_alpha)

        W_mk = theano.shared(1e-8 * np.random.rand(nhidden, ngauss * nout),
                             name="W_mk")
        b_mk = theano.shared(np.zeros((ngauss * nout)), name="b_mk")

        W_u = theano.shared(1e-8 * np.random.rand(nhidden, ngauss * nu),
                            name="W_u")
        b_u = theano.shared(np.zeros((ngauss * nu)), name="b_u")

        # Compute the Gaussian cost using a reduce:
        Uvals = T.dot(y, W_u) + b_u
        mkvals = T.dot(y, W_mk) + b_mk

        def apply_gaussian(Uv, mk, a, th, current):
            for i in range(ngauss):
                arg = T.exp(Uv[i * nu:i * nu + nout])
                current += T.sum(arg)
                U = T.diag(arg)
                U = T.set_subtensor(U[np.triu_indices(nout, 1)],
                                    Uv[i * nu + nout:(i + 1) * nu])
                r = th - mk[i * nout:(i + 1) * nout]
                r2 = T.dot(r, T.dot(U.T, T.dot(U, r)))
                current += T.log(a[i]) - 0.5 * r2
            return current

        outputs_info = T.as_tensor_variable(np.asarray(0.0, float))
        lnprob, _ = theano.reduce(apply_gaussian,
                                  [Uvals, mkvals, alpha, theta], outputs_info)
        cost = -lnprob

        self.params = [W, b, W_alpha, b_alpha, W_mk, b_mk, W_u, b_u]
        self.grads = T.grad(cost, self.params)
        updates = get_adam_updates(cost, self.params)

        self.update_step = theano.function([x, theta],
                                           outputs=cost,
                                           updates=updates)
        self.cost_func = theano.function([x, theta], outputs=cost)

        # Stochastic objective:
        ntot = np.sum([np.prod(np.shape(p.get_value())) for p in self.params])
        rng = RandomStreams()
        u = rng.normal((nvar, ntot))
        phi_m = theano.shared(np.zeros(ntot), name="phi_m")
        phi_s = theano.shared(np.zeros(ntot), name="phi_s")
        phi = (phi_m + T.exp(0.5 * phi_s))[None, :] * u

        print(theano.function([], outputs=phi)().shape)
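
# Hypothetical usage sketch: the class name "MDN" is an assumption, since only
# __init__ is shown above. x has shape (n_samples, nin), theta has shape
# (n_samples, nout); update_step() performs one step of the updates built by
# get_adam_updates() and returns the current cost (negative log-likelihood).
import numpy as np

mdn = MDN(nin=10, nout=3, nhidden=64, ngauss=5, nvar=16)
x_batch = np.random.rand(128, 10)
theta_batch = np.random.rand(128, 3)
cost_value = mdn.update_step(x_batch, theta_batch)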
Exemple #53
0
    def reset_rng(self):

        self.rng = N.random.RandomState([12.,9.,2.])
        self.theano_rng = RandomStreams(self.rng.randint(2**30))
        if self.initialized:
            self.redo_theano()
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=110,
                 hidden_layers_sizes=[30, 5],
                 n_outs=2,
                 corruption_levels=[0.1, 0.2]):

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer
            # the size of the input is either the number of hidden units
            # of the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shared weights with this layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)

            self.dA_layers.append(dA_layer)

        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        # compute the cost for second phase of training
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)
        self.y_pred = self.logLayer.y_pred
    def __init__(self, numpy_rng, theano_rng=None, input=None,
                 n_visible=784, n_hidden=500,
                 W=None, bhid=None, bvis=None):
        """
        Initialize the dA class by specifying the number of visible units (the
        dimension d of the input ), the number of hidden units ( the dimension
        d' of the latent or hidden space ) and the corruption level. The
        constructor also receives symbolic variables for the input, weights and
        bias. Such symbolic variables are useful when, for example, the input
        is the result of some computations, or when weights are shared between
        the dA and an MLP layer. When dealing with SdAs this always happens,
        the dA on layer 2 gets as input the output of the dA on layer 1,
        and the weights of the dA are used in the second stage of training
        to construct an MLP.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to generate weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                     generated based on a seed drawn from `rng`

        :type input: theano.tensor.TensorType
        :param input: a symbolic description of the input or None for
                      standalone dA

        :type n_visible: int
        :param n_visible: number of visible units

        :type n_hidden: int
        :param n_hidden:  number of hidden units

        :type W: theano.tensor.TensorType
        :param W: Theano variable pointing to a set of weights that should be
                  shared between the dA and another architecture; if dA should
                  be standalone set this to None

        :type bhid: theano.tensor.TensorType
        :param bhid: Theano variable pointing to a set of biases values (for
                     hidden units) that should be shared between the dA and another
                     architecture; if dA should be standalone set this to None

        :type bvis: theano.tensor.TensorType
        :param bvis: Theano variable pointing to a set of biases values (for
                     visible units) that should be shared between the dA and another
                     architecture; if dA should be standalone set this to None


        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # create a Theano random generator that gives symbolic random values
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # note : W' was written as `W_prime` and b' as `b_prime`
        if not W:
            # W is initialized with `initial_W` which is uniformly sampled
            # from -4*sqrt(6./(n_visible+n_hidden)) and
            # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
            # converted using asarray to dtype
            # theano.config.floatX so that the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                      low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                      high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                      size=(n_visible, n_hidden)), dtype=theano.config.floatX)
            W = theano.shared(value=initial_W, name='W', borrow=True)

        if not bvis:
            bvis = theano.shared(value=numpy.zeros(n_visible,
                                         dtype=theano.config.floatX),
                                 borrow=True)

        if not bhid:
            bhid = theano.shared(value=numpy.zeros(n_hidden,
                                                   dtype=theano.config.floatX),
                                 name='b',
                                 borrow=True)

        self.W = W
        # b corresponds to the bias of the hidden
        self.b = bhid
        # b_prime corresponds to the bias of the visible
        self.b_prime = bvis
        # tied weights, therefore W_prime is W transpose
        self.W_prime = self.W.T
        self.theano_rng = theano_rng
        # if no input is given, generate a variable representing the input
        if input is None:
            # we use a matrix because we expect a minibatch of several
            # examples, each example being a row
            self.x = T.dmatrix(name='input')
        else:
            self.x = input

        self.params = [self.W, self.b, self.b_prime]
    extra_in_scp_file = arguments['extra_in_scp_file']
    lstm_param_file = arguments['lstm_param_file']
    lstm_cfg_file = arguments['lstm_cfg_file']
    layer_index = int(arguments['layer_index'])

    # network structure
    cfg = cPickle.load(open(lstm_cfg_file, 'r'))
    cfg.init_activation()

    kaldiread = KaldiReadIn(in_scp_file)
    extra_kaldiread = KaldiReadIn(extra_in_scp_file)
    kaldiwrite = KaldiWriteOut(out_ark_file)

    log('> ... setting up the Phase ATTEND LSTM layers')
    rng = numpy.random.RandomState(89677)
    theano_rng = RandomStreams(rng.randint(2**30))

    lstm = PhaseATTEND_LSTM(numpy_rng=rng, theano_rng=theano_rng, cfg=cfg)
    _file2nnet(layers=lstm.layers,
               set_layer_num=len(lstm.layers),
               filename=lstm_param_file)
    out_function = lstm.build_extract_feat_function()

    while True:
        uttid, in_matrix = kaldiread.read_next_utt()
        extra_uttid, extra_in_matrix = extra_kaldiread.read_next_utt()
        if uttid == '':
            break
        print 'in_matrix:' + str(in_matrix.shape)
        final_matrix = numpy.zeros((in_matrix.shape[0], cfg.n_outs),
                                   dtype=theano.config.floatX)
def test_dA(learning_rate=0.1, training_epochs=15,
            dataset='mnist.pkl.gz',
            batch_size=20, output_folder='dA_plots'):

    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the DeNoising
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset

    """
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)
    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_da = theano.function([index], cost, updates=updates,
         givens={x: train_set_x[index * batch_size:
                                (index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((training_time) / 60.))
    image = PIL.Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function([index], cost, updates=updates,
         givens={x: train_set_x[index * batch_size:
                                  (index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The 30% corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % (training_time / 60.))

    image = PIL.Image.fromarray(tile_raster_images(
        X=da.W.get_value(borrow=True).T,
        img_shape=(28, 28), tile_shape=(10, 10),
        tile_spacing=(1, 1)))
    image.save('filters_corruption_30.png')

    os.chdir('../')
Exemple #58
0
class SampledMeanBinaryCrossEntropy(DefaultDataSpecsMixin, Cost):
    """
    .. todo::

        WRITEME properly

    CE cost that goes with sparse autoencoder with L1 regularization on activations

    For theory:
    Y. Dauphin, X. Glorot, Y. Bengio. ICML2011
    Large-Scale Learning of Embeddings with Reconstruction Sampling

    Parameters
    ----------
    L1 : WRITEME
    ratio : WRITEME
    """
    def __init__(self, L1, ratio):
        self.random_stream = RandomStreams(seed=1)
        self.L1 = L1
        self.one_ratio = ratio

    def expr(self, model, data, **kwargs):
        """
        .. todo::

            WRITEME
        """
        self.get_data_specs(model)[0].validate(data)
        X = data
        # X is theano sparse
        X_dense = theano.sparse.dense_from_sparse(X)
        noise = self.random_stream.binomial(size=X_dense.shape,
                                            n=1,
                                            prob=self.one_ratio,
                                            ndim=None)

        # a random pattern that indicates to reconstruct all the 1s and some of the 0s in X
        P = noise + X_dense
        P = theano.tensor.switch(P > 0, 1, 0)
        P = tensor.cast(P, theano.config.floatX)

        # L1 penalty on activations
        reg_units = theano.tensor.abs_(model.encode(X)).sum(axis=1).mean()

        # penalty on weights, optional
        # params = model.get_params()
        # W = params[2]

        # there is a numerical problem when using
        # tensor.log(1 - model.reconstruct(X, P))
        # Pascal fixed it.
        before_activation = model.reconstruct_without_dec_acti(X, P)

        cost = (
            1 * X_dense *
            tensor.log(1 + tensor.exp(-1 * before_activation)) +
            (1 - X_dense) *
            tensor.log(1 + tensor.exp(before_activation)))

        cost = (cost * P).sum(axis=1).mean()

        cost = cost + self.L1 * reg_units

        return cost
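
# Construction sketch for the cost above. The numeric values are illustrative
# only; the snippet does not show how the cost is wired into a pylearn2
# training algorithm, so only the constructor and the inputs expected by
# expr() are sketched here.
sampled_ce = SampledMeanBinaryCrossEntropy(L1=1e-4, ratio=0.05)
# sampled_ce.expr(model, X_sparse) expects:
#   - X_sparse: a theano sparse matrix of binary inputs, one example per row
#   - model:    an autoencoder exposing encode() and
#               reconstruct_without_dec_acti(), as used in expr() above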
Exemple #59
0
 def __init__(self, dropout_rate):
     Layer.__init__(self)
     self.dropout_rate = dropout_rate
     numpy_rng = np.random.RandomState(123)
     self.theano_rng = RandomStreams(numpy_rng.randint(2**30))
Exemple #60
0
class LstmBasic(object):
    def __init__(self, train, test, alpha_lambda, n_user, n_item, n_in,
                 n_hidden):
        """
        Build the model parameters.
        :param train: training data, already padded with masks
        :param test: test data, already padded with masks
        :param n_user: actual number of users
        :param n_item: actual number of items; __init__() appends one extra item as a padding symbol
        :param n_in: dimensionality of the RNN input vectors
        :param n_hidden: dimensionality of the RNN hidden vectors
        :return:
        """
        # Taken from the dAE part of the official Theano tutorial.
        rng = np.random.RandomState(123)
        self.thea_rng = RandomStreams(rng.randint(2**30))  # its random functions can run on the GPU.
        # train/test sequences already padded with masks
        tra_buys_masks, tra_masks, tra_buys_neg_masks = train
        tes_buys_masks, tes_masks, tes_buys_neg_masks = test
        self.tra_buys_masks = theano.shared(borrow=True,
                                            value=np.asarray(tra_buys_masks,
                                                             dtype='int32'))
        self.tes_buys_masks = theano.shared(borrow=True,
                                            value=np.asarray(tes_buys_masks,
                                                             dtype='int32'))
        self.tra_masks = theano.shared(borrow=True,
                                       value=np.asarray(tra_masks,
                                                        dtype='int32'))
        self.tes_masks = theano.shared(borrow=True,
                                       value=np.asarray(tes_masks,
                                                        dtype='int32'))
        self.tra_buys_neg_masks = theano.shared(borrow=True,
                                                value=np.asarray(
                                                    tra_buys_neg_masks,
                                                    dtype='int32'))
        self.tes_buys_neg_masks = theano.shared(borrow=True,
                                                value=np.asarray(
                                                    tes_buys_neg_masks,
                                                    dtype='int32'))
        # put the hyperparameters into shared variables
        self.alpha_lambda = theano.shared(borrow=True,
                                          value=np.asarray(
                                              alpha_lambda,
                                              dtype=theano.config.floatX))
        # initialisation: define local variables first, then promote them to instance attributes via self.
        rang = 0.5
        lt = uniform(-rang, rang,
                     (n_item + 1, n_in))  # one extra row (the padding symbol) used to pad purchase sequences / items that do not exist
        ui = uniform(-rang, rang, (4, n_hidden, n_hidden))
        wh = uniform(-rang, rang, (4, n_hidden, n_hidden))
        c0 = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        h0 = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        bi = np.zeros((4, n_hidden), dtype=theano.config.floatX)
        # build the shared parameters.
        self.lt = theano.shared(borrow=True,
                                value=lt.astype(theano.config.floatX))
        self.ui = theano.shared(borrow=True,
                                value=ui.astype(theano.config.floatX))
        self.wh = theano.shared(borrow=True,
                                value=wh.astype(theano.config.floatX))
        self.c0 = theano.shared(borrow=True, value=c0)
        self.h0 = theano.shared(borrow=True, value=h0)
        self.bi = theano.shared(borrow=True, value=bi)
        # store the trained user/item representations, used to score all users against all items: users * items
        trained_items = uniform(-rang, rang, (n_item + 1, n_hidden))
        trained_users = uniform(-rang, rang, (n_user, n_hidden))
        self.trained_items = theano.shared(borrow=True,
                                           value=trained_items.astype(
                                               theano.config.floatX))
        self.trained_users = theano.shared(borrow=True,
                                           value=trained_users.astype(
                                               theano.config.floatX))
        # Built-in predict function. Do not call it here; call it in the subclass, otherwise the subclass cannot override it.
        # self.__theano_predict__(n_in, n_hidden)

    def update_neg_masks(self, tra_buys_neg_masks, tes_buys_neg_masks):
        # negative samples are refreshed every epoch
        self.tra_buys_neg_masks.set_value(np.asarray(tra_buys_neg_masks,
                                                     dtype='int32'),
                                          borrow=True)
        self.tes_buys_neg_masks.set_value(np.asarray(tes_buys_neg_masks,
                                                     dtype='int32'),
                                          borrow=True)
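
    # A usage sketch (an assumption, not from the original code): resample the
    # negative items once per epoch before training; sample_neg_items() is a
    # hypothetical helper that draws one unseen item per observed item.
    # for epoch in range(n_epochs):
    #     model.update_neg_masks(sample_neg_items(tra_buys_masks),
    #                            sample_neg_items(tes_buys_masks))
    #     ...  # run this epoch's mini-batch updates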

    def update_trained_items(self):
        # Update the final item representations.
        lt = self.lt.get_value(borrow=True)  # self.lt is a shared variable, so use get_value().
        self.trained_items.set_value(np.asarray(lt,
                                                dtype=theano.config.floatX),
                                     borrow=True)  # update

    def update_trained_users(self, all_hus):
        # Computed externally and passed in; just update the shared variable.
        self.trained_users.set_value(np.asarray(all_hus,
                                                dtype=theano.config.floatX),
                                     borrow=True)  # update

    def compute_sub_all_scores(self, start_end):  # could equally take plain (start, end) integers
        # Compute users * items: every user's score for every item (the padding item is dropped).
        sub_all_scores = T.dot(self.trained_users[start_end],
                               self.trained_items[:-1].T)
        return sub_all_scores.eval()  # shape=(sub_n_user, n_item)
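
    # A usage sketch (an assumption, not from the original code): rank items for a
    # batch of users, assuming start_end is an array of user indices.
    # scores = model.compute_sub_all_scores(np.arange(0, 128))  # shape=(128, n_item)
    # top10 = np.argsort(-scores, axis=1)[:, :10]               # 10 highest-scored items per user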

    def compute_sub_auc_preference(self, start_end):
        # items.shape=(n_item+1, 20): the padding item is needed because the inputs are masked.
        # Note the fancy indexing of the matrices below.
        tes_items = self.trained_items[self.tes_buys_masks[
            start_end]]  # shape=(sub_n_user, len(tes_mask[0]), n_hidden)
        tes_items_neg = self.trained_items[self.tes_buys_neg_masks[start_end]]
        users = self.trained_users[start_end]
        shp0, shp2 = users.shape  # shape=(sub_n_user, n_hidden)
        # Broadcasting: (n_user, 1, n_hidden) * (n_user, len, n_hidden) = (n_user, len, n_hidden).
        # Then np.sum((n_user, len, n_hidden), axis=2) = (n_user, len),
        #   i.e. each user's preference for the positive over the negative test items.
        all_upqs = T.sum(users.reshape(
            (shp0, 1, shp2)) * (tes_items - tes_items_neg),
                         axis=2)
        all_upqs *= self.tes_masks[start_end]  # keep only the preferences at valid test-item positions
        return all_upqs.eval() > 0  # entries > 0 become True, i.e. 1
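
    # A minimal sketch (an assumption, not from the original code): turn the boolean
    # preference matrix into a per-batch AUC estimate by averaging over the valid
    # (non-padding) test positions only.
    # prefs = model.compute_sub_auc_preference(np.arange(0, 128))
    # masks = model.tes_masks.get_value(borrow=True)[0:128]
    # auc = prefs[masks > 0].mean()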

    def get_corrupted_input_whole(self, inp, corruption_prob):
        # 2D matrices: randomly set whole feature rows to zero. Matrix.shape=(n, m)
        # Denoising variant 0: zero out selected image/text feature rows as a whole.
        # E.g. a sequence's image features are (num, 1024); draw a 0/1 matrix of shape (num, 1), T.Rebroadcast it, then multiply.
        # if corruption_prob < 0. or corruption_prob >= 1.:
        #     raise Exception('Drop prob must be in interval [0, 1)')
        retain_prob = 1. - corruption_prob
        randoms = self.thea_rng.binomial(
            size=(inp.shape[0], 1),  # shape=(num, 1)
            n=1,
            p=retain_prob,  # p is the probability of drawing a 1.
            dtype=theano.config.floatX)
        randoms = T.Rebroadcast((1, True))(randoms)
        return inp * randoms  # shape=(num, 1024)

    def get_corrupted_input_whole_minibatch(self, inp, corruption_prob):
        # Empirically, corrupting the data before the sequence model gives better and more stable results.
        # 3D matrices.
        retain_prob = 1. - corruption_prob
        randoms = self.thea_rng.binomial(
            size=(inp.shape[0], inp.shape[1],
                  1),  # shape=(seq_length, batch_size, 1)
            n=1,
            p=retain_prob,  # p is the probability of drawing a 1.
            dtype=theano.config.floatX)
        randoms = T.Rebroadcast((2, True))(randoms)
        return inp * randoms  # shape=(seq_length, batch_size, 1024)
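
    # A minimal sketch (an assumption, not from the original code): corrupt the
    # embedded purchase sequences before they enter the recurrent layer.
    # ps = self.lt[pidxs].dimshuffle(1, 0, 2)                 # shape=(seq_length, batch_size, n_in)
    # ps = self.get_corrupted_input_whole_minibatch(ps, 0.2)  # 0.2 is a hypothetical corruption prob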

    def dropout(self, inp, drop_prob):
        # Vectors: randomly set some positions to zero. Vector.shape=(n, )
        # E.g. a 20-dim vector has 20 positions, i.e. 20 neurons.
        # Apply dropout at training time; keep the full connections at test time.
        # if drop_prob < 0. or drop_prob >= 1.:
        #     raise Exception('Drop prob must be in interval [0, 1)')
        retain_prob = 1. - drop_prob  # 0.5 is usually a good choice.
        randoms = self.thea_rng.binomial(
            size=inp.shape,  # a 0/1 vector with the same shape as inp
            n=1,  # one trial per neuron
            p=retain_prob,  # each neuron is kept (*1) with prob retain_prob and dropped (*0) with prob drop_prob
            dtype=theano.config.floatX)  # keep floatX, consistent with the corruption methods above
        inp *= randoms  # zero out the dropped neurons
        inp /= retain_prob  # rescale so the expected sum of inp is unchanged by dropout
        return inp  # symbolic ops build a new graph node rather than modifying the input, so use the return value, e.g. x = self.dropout(x, 0.5)
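
    # With inverted dropout the expected activation is preserved (each kept unit is
    # scaled by 1/retain_prob at train time), so no rescaling is needed at test time.
    # A usage sketch (an assumption, not from the original code):
    # h_train = self.dropout(h, drop_prob=0.5)  # training graph
    # h_test = h                                # test graph: full connections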

    def __theano_predict__(self, n_in, n_hidden):
        """
        测试阶段再跑一遍训练序列得到各个隐层。用全部数据一次性得出所有用户的表达
        """
        ui, wh = self.ui, self.wh

        tra_mask = T.imatrix()
        actual_batch_size = tra_mask.shape[0]
        seq_length = T.max(T.sum(tra_mask,
                                 axis=1))  # the longest sequence length in the mini-batch is used as seq_length

        c0 = T.alloc(self.c0, actual_batch_size, n_hidden)  # shape=(n, 20)
        h0 = T.alloc(self.h0, actual_batch_size, n_hidden)  # shape=(n, 20)
        bi = T.alloc(self.bi, actual_batch_size, 4,
                     n_hidden)  # shape=(n, 4, 20), the original dims go last
        bi = bi.dimshuffle(1, 2, 0)  # shape=(4, 20, n)

        # The hidden layer is a single LSTM cell; this unified layout works for other gated units too.
        pidxs = T.imatrix()
        ps = self.trained_items[
            pidxs]  # shape=(actual_batch_size, seq_length, n_hidden)
        ps = ps.dimshuffle(
            1, 0, 2)  # shape=(seq_length, batch_size, n_hidden)=(157, n, 20)

        def recurrence(p_t, c_t_pre1, h_t_pre1):
            # Features and hidden states are both shaped (batch_size, n_hidden)=(n, 20).
            gates = T.dot(ui, p_t.T) + T.dot(
                wh, h_t_pre1.T) + bi  # shape=(4, 20, n)
            i, f, g, o = sigmoid(gates[0]).T, sigmoid(gates[1]).T, tanh(
                gates[2]).T, sigmoid(gates[3]).T
            c_t = f * c_t_pre1 + i * g  # shape=(n, 20)
            h_t = o * tanh(c_t)  # shape=(n, 20)
            return [c_t, h_t]

        [c, h], _ = theano.scan(  # h.shape=(157, n, 20)
            fn=recurrence,
            sequences=ps,
            outputs_info=[c0, h0],
            n_steps=seq_length)

        # batch_hts.shape=(n, 20): the representation ht of each user in this batch.
        # T.sum() is required here; otherwise the index cannot be built into the Theano graph and a "length not known" error is raised.
        hs = h.dimshuffle(1, 0, 2)  # shape=(batch_size, seq_length, n_hidden)
        hts = hs[  # shape=(n, n_hidden)
            T.arange(actual_batch_size),  # rows: fancy indexing a[[1,2,3],[2,5,6]] needs both row and column indices
            T.sum(tra_mask, axis=1) - 1]  # columns: the last valid position; the mask must be 'int32'

        # Feed the data through givens.
        start_end = T.ivector()
        self.seq_predict = theano.function(
            inputs=[start_end],
            outputs=hts,
            givens={
                pidxs: self.tra_buys_masks[start_end],  # TensorType(int32, matrix)
                tra_mask: self.tra_masks[start_end]
            })

    def predict(self, idxs):
        return self.seq_predict(idxs)
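
    # An end-to-end usage sketch (an assumption, not from the original code), assuming
    # a subclass has built __theano_predict__; batch_size and n_user are hypothetical.
    # all_hus = np.concatenate(
    #     [model.predict(np.arange(s, min(s + batch_size, n_user), dtype='int32'))
    #      for s in range(0, n_user, batch_size)], axis=0)
    # model.update_trained_users(all_hus)     # store every user's hidden state
    # model.update_trained_items()            # copy the item embeddings
    # scores = model.compute_sub_all_scores(np.arange(0, batch_size))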