Example #1
    def get_output_for(self, input, **kwargs):
        '''
        Computes 2D FFT. Input layer must have dimension [n, 2, nx, ny]
        '''
        if self.is_3d:

            n, nc, nx, ny, nt = self.data_shape
            lin = T.transpose(input, axes=(0, 4, 1, 2, 3))
            lin = lin.reshape((-1, nc, nx, ny))
            lout, updates = theano.scan(self.transform, sequences=lin)
            lout = lout.reshape((-1, nt, nc, nx, ny))
            out = T.transpose(lout, axes=(0, 2, 3, 4, 1))
            return out

            # def loop_over_n(i, arr):
            #     out, updates = theano.scan(self.transform,
            #                                sequences=arr[:, :, i])[0]
            #     return out

            # nt = self.data_shape[-1]
            # out, updates = theano.scan(loop_over_n,
            #                            non_sequences=input,
            #                            sequences=xrange(nt))
            # return out

        out, updates = theano.scan(self.transform, sequences=input)
        return out
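For reference, a NumPy sketch of the shape bookkeeping done by the 3D branch above, with np.fft.fft2 standing in for self.transform (an assumption made only for illustration; the layer's actual transform is not shown here):

import numpy as np

n, nc, nx, ny, nt = 2, 2, 8, 8, 4
x = np.random.randn(n, nc, nx, ny, nt)
lin = x.transpose(0, 4, 1, 2, 3).reshape(-1, nc, nx, ny)        # (n*nt, nc, nx, ny)
lout = np.array([np.fft.fft2(frame) for frame in lin])          # "scan" over the leading axis
out = lout.reshape(n, nt, nc, nx, ny).transpose(0, 2, 3, 4, 1)  # back to (n, nc, nx, ny, nt)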
Example #2
    def cost_seq(self, start, end, A, tagger_out, targets):
        # compute the gold sequence's score using A and tagger_out
        gold_seq = T.argmax(targets, axis=1)

        seq_score = start[gold_seq[0]]

        seq_score += end[gold_seq[-1]]

        # tagger_out_scores
        tout_chooser = lambda gold_i, i, tagger_out: tagger_out[i][gold_i]
        tout_seq_scores, updates = theano.scan(
            fn=tout_chooser,
            sequences=[gold_seq, T.arange(gold_seq.shape[0])],
            non_sequences=[tagger_out],
            outputs_info=None
        )
        seq_score += tout_seq_scores.sum()

        # A matrix scores
        A_chooser = lambda i, next_i, A: A[i][next_i]
        A_seq_scores, updates = theano.scan(
            fn=A_chooser,
            sequences=[gold_seq[:-1], gold_seq[1:]],
            non_sequences=[A],
            outputs_info=None
        )
        seq_score += A_seq_scores.sum()

        return seq_score
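The two scans above only gather per-position emission scores and per-transition scores along the gold path. A minimal NumPy sketch of the same score using plain fancy indexing (toy shapes, hypothetical helper, not part of the original class):

import numpy as np

def gold_seq_score(start, end, A, tagger_out, targets):
    # targets: (seq_len, n_tags) one-hot; tagger_out: (seq_len, n_tags) emission scores
    # A: (n_tags, n_tags) transition scores; start/end: (n_tags,) boundary scores
    gold = targets.argmax(axis=1)
    score = start[gold[0]] + end[gold[-1]]
    score += tagger_out[np.arange(len(gold)), gold].sum()   # emission scores
    score += A[gold[:-1], gold[1:]].sum()                    # transition scores
    return score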
Example #3
    def test_scan_err1(self):
        # This test should fail when building fx for the first time
        orig_compute_test_value = theano.config.compute_test_value
        try:
            theano.config.compute_test_value = 'raise'

            k = T.iscalar("k")
            A = T.matrix("A")
            k.tag.test_value = 3
            A.tag.test_value = numpy.random.rand(5,3).astype(config.floatX)

            def fx(prior_result, A):
                return T.dot(prior_result, A)

            # Since we have to inspect the traceback,
            # we cannot simply use self.assertRaises()
            try:
                theano.scan(
                        fn=fx,
                        outputs_info=T.ones_like(A),
                        non_sequences=A,
                        n_steps=k)
                assert False
            except ValueError, e:
                # Get traceback
                tb = sys.exc_info()[2]
                # Get frame info 4 layers up
                frame_info = traceback.extract_tb(tb)[-5]
                # We should be in the "fx" function defined above
                assert os.path.split(frame_info[0])[1] == 'test_compute_test_value.py'
                assert frame_info[2] == 'fx'

        finally:
            theano.config.compute_test_value = orig_compute_test_value
Example #4
    def output_h_vals(self, train=False):
        if self.inputs_dict.has_key('input_single'):
            input = self.get_input('input_single', train) #(nb_sample, input_dim)
            X = TU.repeat(input, self.input_length) # (input_length, nb_sample, input_dim)    
            mask = None
        else:
            input = self.get_input('input_sequence', train)  # (nb_sample, input_length, input_dim)
            X = input.dimshuffle((1, 0, 2))  # (input_length, nb_sample, input_dim) 
            mask = self.get_input_mask('input_sequence',train) # (nb_sample, input_length)    
            if mask is not None:
                mask = T.cast(mask, dtype='int8').dimshuffle((1, 0, 'x')) # (input_length, nb_sample, 1)

        #h_0 = T.zeros((X.shape[1], self.output_dim), X.dtype)  # (nb_samples, output_dim)
        h_0 = self._get_initial_state(X)

        if mask is not None:
            h_vals, _ = theano.scan( self.step,
                                            sequences=[mask, X],
                                            outputs_info=h_0,
                                            non_sequences=[self.W, self.U, self.b],
                                            truncate_gradient=self.truncate_gradient,
                                            go_backwards=self.go_backwards,
                                            strict=True)
        else:
            h_vals, _ = theano.scan( self.step_no_mask,
                                sequences=[X],
                                outputs_info=h_0,
                                non_sequences=[self.W, self.U, self.b],
                                truncate_gradient=self.truncate_gradient,
                                go_backwards=self.go_backwards,
                                strict=True)
            
        return h_vals #(input_length, nb_samples, output_dim)
Example #5
 def apply(self , src , mask_length , tgt):
     """
         viterbi algorithm
     """
     result , updates = theano.scan(
         fn = self.train_step,
         sequences = src,
         outputs_info = [self.A_start, None] ,
         non_sequences = self.A ,
         n_steps = mask_length
     )
     # the score of best path
     best_path_score = result[0][-1].max()
     idx = T.argmax(result[0][-1])
     #backtracking
     res2 , _ = theano.scan(
         fn = lambda dps , idx , idx2 : [dps[idx] , idx],
         sequences = result[1][::-1],
         outputs_info = [idx , idx],
         n_steps = mask_length
     )
     # the path of best score
     best_path = res2[1]
     #if len(best_path) < seq_len:
     #    best_path.extend((seq_len - len(best_path)) * [2])
     # the score of tgt path
     tgt_score = self.decode(src , mask_length , tgt)
     # max_margin
     max_margin = T.sum(T.neq(tgt[:mask_length] , best_path))
     cost = best_path_score + max_margin - tgt_score
     return T.switch(T.lt(cost , T.alloc(numpy.float32(0.)))
                     , T.alloc(numpy.float32(0.))
                     , cost
                     ),best_path
Example #6
    def __init__(self, layers, num_possible_characters):
        print("Building the model...")
        self.rng = theano.tensor.shared_randomstreams.RandomStreams()

        self.model = StackedCells(num_possible_characters, layers=layers, activation=T.tanh, celltype=LSTM)
        self.model.layers[0].in_gate2.activation = lambda x: x
        self.model.layers.append(Layer(layers[-1], num_possible_characters, lambda x: T.nnet.softmax(x)[0]))

        num_steps = T.scalar(dtype='int32')
        # function to put into scan to fire the network recurrently
        def step(prev_char, *prev_hiddens):
            new_hiddens = self.model.forward(int_to_onehot(T.cast(prev_char, 'int32'), num_possible_characters), prev_hiddens)
            dist = new_hiddens[-1]
            next_char = self.rng.choice(size=[1], a=num_possible_characters, p=dist)
            return [T.cast(next_char, 'int32')] + new_hiddens[:-1]

        results, updates = theano.scan(step, n_steps=num_steps,
        outputs_info=[dict(initial=np.int32([-1]),taps=[-1])] 
        + [dict(initial=layer.initial_hidden_state, taps=[-1]) for layer in self.model.layers if hasattr(layer, 'initial_hidden_state')])

        self.forward_pass = theano.function([num_steps], [results[0].dimshuffle((1,0))[0]], updates=updates, allow_input_downcast=True)

        training_data = T.matrix('training data') # list of character values less than num_possible_characters

        def step_inner(prev_char, desired_output, *prev_hiddens):
            new_hiddens = self.model.forward(int_to_onehot(prev_char, num_possible_characters), prev_hiddens)
            prob_correct = new_hiddens[-1][desired_output]
            return [prob_correct] + new_hiddens[:-1]

        # I have no idea whether nesting scan will work at all
        def step_outer(training_sample, *initial_states):
            print(list(initial_states))
            # different call to scan that uses the training data as prior timesteps
            results_inner, updates_inner = theano.scan(step_inner, n_steps=training_sample.shape[0], sequences=[dict(input=T.cast(T.concatenate(([0], training_sample)), 'int32'), taps=[0,1])],
            outputs_info=[None] + list(initial_states))
            return results_inner, updates_inner

        results_outer, updates_outer = theano.scan(step_outer, n_steps=training_data.shape[0], 
            sequences=[training_data], 
            non_sequences=[layer.initial_hidden_state for layer in self.model.layers if hasattr(layer, 'initial_hidden_state')],
            outputs_info=[None, None])

        results_inner = results[0] # this should be a list of each "results" from step_inner
        updates_inner = results[1] # this should be a list of updates

        # I want to find the zero position of each results vector in results_inner

        prob_correct_v = results_inner[:][0] # should be a matrix of probabilities between 0 and 1
        cost = -T.mean(T.log(prob_correct_v)) # mean should take the average across all dimensions

        u, gsums, xsums, lr, max_norm = create_optimization_updates(cost, self.model.params, method='adadelta')

        # combine all the updates into one dictionary
        all_updates = {}
        for d in updates_inner:
            all_updates.update(d)
        all_updates.update(updates_outer)

        self.training_pass = theano.function([training_data], [cost], updates=all_updates + u, allow_input_downcast=True)
        self.validation_pass = theano.function([training_data], [cost], updates=all_updates, allow_input_downcast=True)
Example #7
File: draw.py Project: samim23/seya
    def get_output(self, train=False):
        self._train_state = train
        X, eps = self.get_input(train).values()
        eps = eps.dimshuffle(1, 0, 2)
        canvas, init_enc, init_dec = self._get_initial_states(X)

        if self.inner_rnn == 'gru':
            outputs, updates = scan(self._step,
                                    sequences=eps,
                                    outputs_info=[canvas, init_enc, init_dec, None],
                                    non_sequences=[X, ] + self.params,
                                    # n_steps=self.n_steps,
                                    truncate_gradient=self.truncate_gradient)

        elif self.inner_rnn == 'lstm':
            outputs, updates = scan(self._step_lstm,
                                    sequences=eps,
                                    outputs_info=[0*canvas, 0*init_enc, 0*init_enc,
                                                  0*init_dec, 0*init_dec, None],
                                    non_sequences=[X, ] + self.params,
                                    truncate_gradient=self.truncate_gradient)

        kl = outputs[-1].sum(axis=0).mean()
        if train:
            # self.updates = updates
            self.regularizers = [SimpleCost(kl), ]
        if self.return_sequences:
            return [outputs[0].dimshuffle(1, 0, 2, 3, 4), kl]
        else:
            return [outputs[0][-1], kl]
Example #8
    def build_rnnrbm(self, n_visible, n_hidden, n_hidden_recurrent):
        u0 = T.zeros((self.n_hidden_recurrent,))  # initial value for the RNN hidden

        def recurrence(v_t, u_tm1):
            bv_t = self.bv + T.dot(u_tm1, self.Wuv)
            bh_t = self.bh + T.dot(u_tm1, self.Wuh)
            generate = v_t is None
            if generate:
                v_t, _, _, updates = self.build_rbm(T.zeros((n_visible,)), self.W, bv_t, bh_t, k=25)
            u_t = T.tanh(self.bu + T.dot(v_t, self.Wvu) + T.dot(u_tm1, self.Wuu))
            return ([v_t, u_t], updates) if generate else [u_t, bv_t, bh_t]

        (u_t, bv_t, bh_t), updates_train = theano.scan(
            lambda v_t, u_tm1, *_: recurrence(v_t, u_tm1),
            sequences=self.v,
            outputs_info=[u0, None, None],
            non_sequences=self.params,
        )
        v_sample, cost, monitor, updates_rbm = self.build_rbm(self.v, self.W, bv_t[:], bh_t[:], k=15)

        updates_bh_t = updates_train.copy()

        updates_train.update(updates_rbm)

        # symbolic loop for sequence generation
        (v_t, u_t), updates_generate = theano.scan(
            lambda u_tm1, *_: recurrence(None, u_tm1), outputs_info=[None, u0], non_sequences=self.params, n_steps=1
        )

        return (self.v, v_sample, cost, monitor, self.params, updates_train, v_t, updates_generate, bh_t, updates_bh_t)
Example #9
def nin(X, param):
    w1, w2, w3, b1, b2, b3 = param
    X = X.dimshuffle(0, 1, 'x', 2, 3)  # (n,32,1,r,c)
    w1 = w1.dimshuffle(0, 1, 2, 'x', 3, 4)  # (64,32,16,1,3,3)
    w2 = w2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    w3 = w3.dimshuffle(0, 1, 2, 'x', 'x')  # (64,2,32,1,1)
    b1 = b1.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    b2 = b2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,1,1,1)
    b3 = b3.dimshuffle(0, 'x', 1, 'x', 'x')  # (64,1,2,1,1)
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    indexi = T.repeat(indexi, w1.shape[1], axis=0)
    indexj = T.arange(w1.shape[1], dtype='int32')  # (0:32)
    indexj = T.tile(indexj, w1.shape[0])
    results, updates = scan(fn=metaOp1,
                            sequences=[indexi, indexj],
                            outputs_info=None,
                            non_sequences=[X, w1, w2, b1, b2],
                            strict=True)  # (64*32,n,1,r,c)
    metaShape1 = results.shape[-4], results.shape[-2], results.shape[-1]
    reshaped1 = results.reshape((w1.shape[0], w1.shape[1]) + metaShape1)  # (64,32,n,r,c)
    permuted1 = T.transpose(reshaped1, axes=(0, 2, 1, 3, 4))  # (64,n,32,r,c)
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    results, updates = scan(fn=metaOp2,
                            sequences=[indexi],
                            outputs_info=None,
                            non_sequences=[permuted1, w3, b3],
                            strict=True)  # (64,n,2,r,c)
    permuted2 = T.transpose(results, axes=(1, 0, 2, 3, 4))  # (n,64,2,r,c)
    metaShape2 = permuted2.shape[-2], permuted2.shape[-1]
    reshaped2 = permuted2.reshape((permuted2.shape[0], -1) + metaShape2)  # (n,128,r,c)
    return reshaped2
Example #10
    def __theano_build__(self):
        params = self.params
        param_names = self.param_names
        hidden_dim = self.hidden_dim

        x1  = T.imatrix('x1')    # first sentence
        x2  = T.imatrix('x2')    # second sentence
        x1_mask = T.fmatrix('x1_mask')    #mask
        x2_mask = T.fmatrix('x2_mask')
        y   = T.ivector('y')     # label
        y_c = T.ivector('y_c')   # class weights 
        
        # Embedding words
        _E1 = params["E"].dot(params["W"][0]) + params["B"][0]
        _E2 = params["E"].dot(params["W"][1]) + params["B"][1]
        statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim])
        statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim])
        
        def rnn_cell(x, mx, ph, Wh):
            h = T.tanh(ph.dot(Wh) + x)
            h = mx[:, None] * h + (1-mx[:, None]) * ph
            return [h] 
            
        [h1], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex1, x1_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))],
            non_sequences=params["W"][2])
        
        [h2], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex2, x2_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=h1[-1])],
            non_sequences=params["W"][3])
       
        #predict
        _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0]) + h2[-1].dot(params["lrW"][1]) + params["lrb"])
        _p = T.argmax(_s, axis=1)
        _c = T.nnet.categorical_crossentropy(_s, y)
        _c = T.sum(_c * y_c)
        _l = T.sum(params["lrW"]**2)
        _cost = _c + 0.01 * _l
        
        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')
        
        # Gradients and updates
        _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay)
        
        # Assign functions
        self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads)
        self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c)
        self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s)
        self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p)
        self.sgd_step = theano.function(
            [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay],
            updates=_updates)
Example #11
def loss_fn_per_context(word_position,context):
	# sum up the global vectors of the context
	context_vector = T.sum(W_g[context], axis = 0)
	# start with -1 with none of the words disambiguated
	start = -1*T.ones_like(context)

	output_alg, updates = theano.scan(l2C, sequences = [context, T.arange(4)], outputs_info = [start, context_vector])

	disambiguated_senses = output_alg[0][-1]
	augmented_context_vector = output_alg[1][-1]


	sense_of_actual_word = disambiguated_senses[word_position]
	#return T.argsort(T.dot(context_vector, W_s[actual_word].T)), T.dot(context_vector, W_s[actual_word].T)

	actual_word = context[word_position]
	# Compute loss to update the global word vectors ignoring the word itself
	def score(i):
		return T.switch(T.eq(i, actual_word), 0, T.log(T.nnet.sigmoid(T.dot(W_g[actual_word], W_g[i]))))

	scores, ignore_updates  = theano.scan(score, sequences = [context])

	def calc_score(context_word, sense_of_context_word):
	 	return T.switch(T.eq(context_word, actual_word), 0, T.log(T.nnet.sigmoid(T.dot(W_s[actual_word][sense_of_actual_word], W_s[context_word][sense_of_context_word] ))))

	sense_scores, ignore_updates_ = theano.scan(calc_score, sequences = [context, disambiguated_senses])
	loss_this_example = T.sum(scores, axis = 0) + T.sum(sense_scores, axis = 0)
	return loss_this_example
Example #12
File: mlp.py Project: zbxzc35/cws
    def __init__(self, ne, de, cs, nh, nc, L2_reg = 0.0, rng = np.random.RandomState()):
	self.nc = nc
	self.hiddenLayer = Layer(de*cs, nh, rng = rng)
	self.outputLayer = Layer(nh, nc)
	self.emb = theano.shared(rng.normal(loc = 0.0, scale = 0.01, size = (ne, de)).astype(theano.config.floatX))
	A = rng.normal(loc = 0.0, scale = 0.01, size = (nc, nc)).astype(theano.config.floatX)
	self.A = theano.shared(value = A, name = 'A', borrow = True)

	self.params = self.hiddenLayer.params + self.outputLayer.params + [self.emb, self.A]
	self.names = ['Wh', 'bh', 'w', 'b', 'emb', 'A']

	idxs = T.imatrix('idxs')
	x = self.emb[idxs].reshape((idxs.shape[0], de*cs))
	y = T.bvector('y')
	ans = T.bvector('ans')

	INF = 1e9
	result, updates1 = theano.scan(fn = self.one_step, sequences = x, outputs_info = [theano.shared(0.0), theano.shared(-INF), theano.shared(-INF), theano.shared(-INF), None, None, None, None])
	self.decode = theano.function(inputs = [idxs], outputs = result, updates = updates1)

	score, updates2 = theano.scan(fn = self.two_step, sequences = [x, dict(input = y, taps = [-1, 0]), dict(input = ans, taps = [-1, 0])], outputs_info = theano.shared(0.0))

	cost = score[-1]
	gradients = T.grad(cost, self.params)
	lr = T.scalar('lr')
	for p, g in zip(self.params, gradients):
	    updates2[p] = p + lr * g

	self.fit = theano.function(inputs = [idxs, y, ans, lr], outputs = cost, updates = updates2)
	self.normalize = theano.function(inputs = [], updates = {self.emb: self.emb / T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0, 'x')})
Example #13
    def function(self, input_tensor):
        init_hs = T.zeros((input_tensor.shape[1], self.output_neurons))
        init_cs = T.zeros((input_tensor.shape[1], self.output_neurons))

        lstm_out_1, _ = theano.scan(fn=lambda a,b,c: self.__lstm_wrapper(a,b,c,self.d_forward, go_forwards=True),
                                      outputs_info=[init_hs,init_cs],
                                      sequences=input_tensor,
                                      non_sequences=None)
        
        lstm_out_2, _ = theano.scan(fn=lambda a,b,c: self.__lstm_wrapper(a,b,c,self.d_backward, go_forwards=False),
                                      outputs_info=[init_hs,init_cs],
                                      sequences=input_tensor,
                                      non_sequences=None)
        
        lstm_out_3, _ = theano.scan(fn=lambda a,b,c: self.__lstm_wrapper(a,b,c,self.u_forward, go_forwards=True),
                                      outputs_info=[init_hs,init_cs],
                                      sequences=input_tensor,
                                      non_sequences=None,
                                      go_backwards=True)

        lstm_out_4, _ = theano.scan(fn=lambda a,b,c: self.__lstm_wrapper(a,b,c,self.u_backward, go_forwards=False),
                                      outputs_info=[init_hs,init_cs],
                                      sequences=input_tensor,
                                      non_sequences=None,
                                      go_backwards=True)


        return T.concatenate((lstm_out_1[0],
                              lstm_out_2[0],
                              lstm_out_3[0][::-1],
                              lstm_out_4[0][::-1]), axis=2)
Example #14
def get_square_norm_gradients_scan(D_by_layer, cost, accum = 0):

    # This returns a theano variable that will be of shape (minibatch_size, ).
    # It will contain, for each training example, the associated square-norm of the total gradient.
    # If you take the element-wise square-root afterwards, you will get
    # the associated 2-norms, which is what you want for importance sampling.

    for (layer_name, D) in D_by_layer.items():

        backprop_output = tensor.grad(cost, D['output'])

        if D.has_key('weight'):
            A = D['input']
            B = backprop_output
            S, _ =  theano.scan(fn=lambda A, B: tensor.sqr(tensor.outer(A,B)).sum(),
                                        sequences=[A,B])
            accum = accum + S

        if D.has_key('bias'):

            B = backprop_output
            S, _ =  theano.scan(fn=lambda B: tensor.sqr(B).sum(),
                                        sequences=[B])
            accum = accum + S
        
    return accum
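A sketch of how this helper might be wired up for a single softmax layer (all names and shapes below are hypothetical, not taken from the original code):

import numpy as np
import theano
from theano import tensor

x = tensor.matrix('x')                      # (minibatch_size, n_in)
y = tensor.ivector('y')                     # integer targets
W = theano.shared(np.zeros((5, 3), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='b')
pre = x.dot(W) + b                          # the layer's pre-activation output
cost = tensor.nnet.categorical_crossentropy(tensor.nnet.softmax(pre), y).mean()

D_by_layer = {'softmax': {'input': x, 'output': pre, 'weight': W, 'bias': b}}
sq_norms = get_square_norm_gradients_scan(D_by_layer, cost)
per_example_norms = tensor.sqrt(sq_norms)   # per-example 2-norms for importance sampling
f = theano.function([x, y], per_example_norms)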
Example #15
def mvNormal_logp(mu, tau, value):
    """
    This logp function is for multivariate normal distribution

    Inputs:
    -------
    mu    = mu values assumed for each observation (num_obs x dims)
    tau   = tau values assumed for each observation (num_obs x dim x dim)
    value = observed values (num_obs x dims)

    Output:
    -------
    output = log likelihood 
    """

    dim = mu.shape[-1]
    k = tau.shape[1]
    n_count = value.shape[0]
    delta = value - mu

    # first function
    long_sum1, updates = theano.scan(lambda n: tt.log(1.0 / tt.nlinalg.det(n)), sequences=[tau], strict=True)

    # second function
    long_sum2, updates = theano.scan(lambda t, d: d.reshape((1, -1)).dot(t).dot(d), sequences=[tau, delta], strict=True)

    output = k * tt.log(2 * np.pi)
    output += long_sum1
    output += long_sum2

    output *= -1 / 2.0

    return output
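The two scans compute, per observation n, the terms log(1/det(tau_n)) and delta_n' * tau_n * delta_n of the multivariate normal log density. A minimal NumPy check of the per-observation value, for illustration only:

import numpy as np

def mv_normal_logp_single(mu, tau, value):
    # log N(value | mu, tau^-1) for one observation; tau is the precision matrix
    k = mu.shape[-1]
    delta = value - mu
    return -0.5 * (k * np.log(2 * np.pi)
                   + np.log(1.0 / np.linalg.det(tau))
                   + delta.dot(tau).dot(delta))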
Example #16
    def inference(self):
        # A bit hacky
        # Re-initialize the visible unit (avoid copying useless dimshuffle
        # part of the graph computation of v)
        self.v = self.v_init
        # We have to dimshuffle so that time is the first dimension
        self.v = self.v.dimshuffle((1,0,2))

        # Write the recurrence to get the bias for the RBM
        (_, bv_t, bh_t), updates_inference = theano.scan(
            fn=self.recurrence,
            sequences=self.v, outputs_info=[self.u0, None, None])

        # Reshuffle the variables
        self.bv_dynamic = bv_t.dimshuffle((1,0,2))
        self.bh_dynamic = bh_t.dimshuffle((1,0,2))
        self.v = self.v.dimshuffle((1,0,2))

        # Train the RBMs by blocks
        # Perform k-step gibbs sampling
        v_chain, updates_rbm = theano.scan(
            fn=lambda v,bv,bh: self.gibbs_step(v,bv,bh)[1],
            outputs_info=[self.v],
            non_sequences=[self.bv_dynamic, self.bh_dynamic],
            n_steps=self.k
        )

        # Add updates of the rbm
        updates_inference.update(updates_rbm)

        # Get last sample of the gibbs chain
        v_sample = v_chain[-1]
        mean_v = self.gibbs_step(v_sample,self.bv_dynamic,self.bh_dynamic)[0]

        return v_sample, mean_v, updates_inference
Example #17
 def gibbs_all(self, sample, W, vBias, hBias, countSteps, function_mode):
     if function_mode < 3:
         gibbsOne_format = lambda sample: self.list_function_for_gibbs[function_mode](sample, W, vBias, hBias);
         format, updates = theano.scan(fn=gibbsOne_format, \
                                       outputs_info=sample, \
                                       n_steps=countSteps)
         return format, updates
     else:
         if function_mode == MODE_WITH_COIN_EXCEPT_LAST:
             gibbsOne_format = lambda sample: self.list_function_for_gibbs[MODE_WITH_COIN](sample, W, vBias, hBias);
             format, updates = theano.scan(fn=gibbsOne_format, \
                                       outputs_info=sample, \
                                       n_steps=countSteps - 1)
             gibbsOne_format = lambda sample: self.list_function_for_gibbs[MODE_WITHOUT_COIN](sample, W, vBias, hBias);
             res = gibbsOne_format(format[-1])
             res = T.concatenate([format, [res]])
             return res, updates
         else:
             gibbsOne_format = lambda sample: self.list_function_for_gibbs[MODE_WITHOUT_COIN](sample, W, vBias, hBias);
             format, updates = theano.scan(fn=gibbsOne_format, \
                                           outputs_info=sample, \
                                           n_steps=countSteps - 1)
             gibbsOne_format = lambda sample: self.list_function_for_gibbs[MODE_WITH_COIN](sample, W, vBias, hBias);
             res = gibbsOne_format(format[-1])
             res = T.concatenate([format, [res]])
             return res, updates
Example #18
    def call(self, x, mask=None):
        def _step(v1, v2):
            cosine_score = T.tensordot(v1 / T.sqrt(T.sum(T.sqr(v1), axis=2, keepdims=True) + 1e-6),
                                       (v2) / T.sqrt(T.sum(T.sqr(v2), axis=2, keepdims=True) + 1e-6),
                                       [[2], [2]])
            return cosine_score

        l_s = x[0]  # n_b x n_s x n_w_s x D
        l_a = x[1]  # n_b x 4 x n_w_qa x D
        # get cosine similarity for ALL word pairs
        output, _ = theano.scan(_step, sequences=[l_s, l_a], outputs_info=None)  # n_b x n_s x n_w_s x 4 x n_w_qa
        # return T.max(T.max(output, axis=4), axis=2)
        output = output.dimshuffle(2, 1, 0, 3, 4)  # n_w_s x n_s x n_b x 4 x n_w_qa

        def slide_max(i, X):
            size = self.window_size
            M = X[i:i + size]
            W = self.w_gaussian
            return T.max((W * M.T).T, axis=0), theano.scan_module.until(i >= X.shape[0] - size + 1)

        output, _ = theano.scan(slide_max,
                                sequences=[
                                    T.arange(0, stop=(output.shape[0] - self.window_size + 1), step=3, dtype='int32')],
                                non_sequences=output)
        if self.use_qa_idf:
            average = weighted_average(output.dimshuffle(2, 1, 0, 3, 4), x[2], axis=4)
        else:
            average = masked_mean(output.dimshuffle(2, 1, 0, 3, 4), axis=4)
        return T.max(average, axis=2) * self.alpha
Example #19
def renet_layer_ud(X, Wx, Wh, Wo, Bh, Bo, H0, w, h, wp, hp):
    def recurrence(x_t, h_tm1):
        dot = T.dot(Wx, x_t)
        h_t = T.tanh(dot + T.dot(h_tm1, Wh) + Bh)
        s_t = T.tanh(T.dot(h_t, Wo) + Bo)
        return [h_t, s_t]

    list_of_images = []
    for j in xrange(w/wp):
        # x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten(ndim=2)
        # reshape the row into a 2-D matrix to be fed into scan
        x = X[:,:,j*wp:(j*wp + wp)].dimshuffle((2, 0, 1)).flatten().reshape((h/hp, X.shape[0]*wp*hp))
        [h1, s1], _ = theano.scan(
            fn=recurrence,
            sequences=x,
            outputs_info=[H0, None],
            n_steps=x.shape[0]
        )
        [h2, s2], _ = theano.scan(
            fn=recurrence,
            sequences=x,
            outputs_info=[H0, None],
            n_steps=x.shape[0],
            go_backwards=True
        )
        # combine the last values of s1 and s2 into an image
        img = T.concatenate([s1.T, s2.T])
        list_of_images.append(img)

    return T.stacklists(list_of_images).dimshuffle((1, 0, 2))
Example #20
  def get_output(self, train=False):
    input = self.get_input(train)
    proj_input = self.activation(T.tensordot(input, self.att_proj, axes=(3,0)))
    #else:
    #  proj_fun = lambda proj_i, inp: T.tensordot(inp, proj_i, axes=((1,3), (0,1)))
    #  lin_proj_input, _ = theano.scan(fn=proj_fun, sequences=self.att_proj, non_sequences=input)
    #  proj_input = self.activation(lin_proj_input.dimshuffle((1,0,2,3)))
    if self.context == 'word':
      att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 0))
    elif self.context == 'clause':
      #att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 1)).sum(axis=2)
      def step(a_t, h_tm1, W_in, W, sc):
        h_t = T.tanh(T.tensordot(a_t, W_in, axes=(2,0)) + T.tensordot(h_tm1, W, axes=(2,0)))
        s_t = T.tensordot(h_t, sc, axes=(2,0))
        return h_t, s_t
      [_, scores], _ = theano.scan(step, sequences=[proj_input.dimshuffle(2,0,1,3)], outputs_info=[T.zeros((proj_input.shape[0], self.td1, self.rec_hid_dim)), None], non_sequences=[self.rec_in_weights, self.rec_hid_weights, self.att_scorer])
      att_scores = scores.dimshuffle(1,2,0)
    elif self.context == 'para':
      att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 2)).sum(axis=(1, 2))
    # Nested scans. For shame!
    def get_sample_att(sample_input, sample_att):
      sample_att_inp, _ = theano.scan(fn=lambda s_att_i, s_input_i: T.dot(s_att_i, s_input_i), sequences=[T.nnet.softmax(sample_att), sample_input])
      return sample_att_inp

    att_input, _ = theano.scan(fn=get_sample_att, sequences=[input, att_scores])
    return att_input
Example #21
    def __init__(self, cell, rng, layer_id, shape, X, mask, is_train = 1, batch_size = 1, p = 0.5):
        prefix = "SentDecoderLayer_"
        layer_id = "_" + layer_id
        self.in_size, self.out_size = shape
        self.X = X
        self.summs = batch_size
        
        self.W_hy = init_weights((self.in_size, self.out_size), prefix + "W_hy" + layer_id)
        self.b_y = init_bias(self.out_size, prefix + "b_y" + layer_id)

        if cell == "gru":
            self.decoder = GRULayer(rng, prefix + layer_id, shape, self.X, mask, is_train, 1, p)
            def _active(pre_h, x):
                h = self.decoder._active(x, pre_h)
                y = T.tanh(T.dot(h, self.W_hy) + self.b_y)
                return h, y
            [h, y], updates = theano.scan(_active, n_steps = self.summs, sequences = [],
                                      outputs_info = [{'initial':self.X, 'taps':[-1]},
                                                      T.alloc(floatX(0.), 1, self.out_size)])
        elif cell == "lstm":
            self.decoder = LSTMLayer(rng, prefix + layer_id, shape, self.X, mask, is_train, 1, p)
            def _active(pre_h, pre_c, x):
                h, c = self.decoder._active(x, pre_h, pre_c)
                y = T.tanh(T.dot(h, self.W_hy) + self.b_y)
                return h, c, y
            [h, c, y], updates = theano.scan(_active, n_steps = self.summs, sequences = [],
                                             outputs_info = [{'initial':self.X, 'taps':[-1]},
                                                             {'initial':self.X, 'taps':[-1]},
                                                             T.alloc(floatX(0.), 1, self.out_size)])
       
        y = T.reshape(y, (self.summs, self.out_size))
        self.activation = y

        self.params = self.decoder.params + [self.W_hy, self.b_y]
Example #22
    def get_output_for(self,net_input,**kwargs):
        if 'unary' in kwargs and kwargs['unary']==True:
            return net_input

        logger.info('Initializing the messages')
        Wp=self.W
        unary_sequence = net_input.dimshuffle(1,0,2)    #Reshuffling the batched unary potential shape so that it can be used for word level iterations in theano.scan

        def forward_scan1(unary_sequence,forward_sm,Wp):
            forward_sm=forward_sm+unary_sequence
            forward_sm=theano_logsumexp(forward_sm.dimshuffle(0,1,'x')+Wp,1)
            return forward_sm

        def backward_scan1(unary_sequence,forward_sm,Wp):
            forward_sm=forward_sm+unary_sequence
            forward_sm=theano_logsumexp(forward_sm.dimshuffle(0,1,'x')+Wp.T,1)
            return forward_sm


        forward_results,_=theano.scan(fn=forward_scan1,sequences=[unary_sequence],outputs_info=T.zeros_like(unary_sequence[0]),non_sequences=[Wp],n_steps=unary_sequence.shape[0]-1)
        backward_results,_=theano.scan(fn=backward_scan1,sequences=[unary_sequence[::-1]],outputs_info=T.zeros_like(unary_sequence[0]),non_sequences=[Wp],n_steps=unary_sequence.shape[0]-1)

        backward_results=T.concatenate([backward_results[::-1],T.zeros_like(backward_results[:1])],axis=0)
        forward_results=T.concatenate([T.zeros_like(forward_results[:1]),forward_results],axis=0)

        unnormalized_prob = forward_results+unary_sequence+backward_results
        marginal_results = theano_logsumexp(unnormalized_prob,axis=2)
        normalized_prob = unnormalized_prob - marginal_results.dimshuffle(0,1,'x')
        # provided for debugging purposes.
        #marginal_all = theano.function([l_in.input_var,l_mask.input_var],marginal_results)
        #probs=theano.function([l_in.input_var,l_mask.input_var],normalized_prob.dimshuffle(1,0,2))
        if 'normalized' in kwargs and kwargs['normalized']==True:
            return normalized_prob.dimshuffle(1,0,2)
        else:
            return unnormalized_prob.dimshuffle(1,0,2)
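The forward/backward recursions above assume a theano_logsumexp helper that is not shown in this snippet. A common numerically stable definition (an assumption about the missing helper, not necessarily the exact one from the original repository) is:

import theano.tensor as T

def theano_logsumexp(x, axis=None):
    # stable log(sum(exp(x), axis)): subtract the max before exponentiating
    x_max = T.max(x, axis=axis, keepdims=True)
    return T.log(T.sum(T.exp(x - x_max), axis=axis)) + T.max(x, axis=axis)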
Example #23
    def fprop(self, data):
        if self.use_ground_truth:
            self.input_space.validate(data)
            features, phones = data

            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda f, p, h, o: self.fprop_step(f, p, h, o)

            ((h, out), updates) = theano.scan(fn=fn,
                                              sequences=[features, phones],
                                              outputs_info=[dict(initial=init_h,
                                                                 taps=[-1]),
                                                            init_out])
            return out
        else:
            self.input_space.validate(data)
            features, phones = data

            init_in = features[0]
            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda t, p, f, h, o: self.fprop_step_prime(t, p, f, h, o)

            ((f, h, out), updates) = theano.scan(fn=fn,
                                                 sequences=[features, phones],
                                                 outputs_info=[init_in,
                                                               dict(initial=init_h,
                                                                    taps=[-1]),
                                                               init_out])
            return out
Example #24
 def For_MMD_Sub_class(self,target,data,omega,num_FF,Xlabel):
     
     Num=T.sum(Xlabel,0)
     D_num=Xlabel.shape[1]
     N=data.shape[0]
     
     F_times_Omega = T.dot(data, omega)#minibatch_size*n_rff
     Phi = (self.sf2**0.5 /num_FF**0.5 ) * T.concatenate([T.cos(F_times_Omega), T.sin(F_times_Omega)],1)
     
      # each RFF feature is a column vector of length 2*n_rff
     Phi_total=T.sum(Phi.T,-1)/N
     
      # a (num_domains x 2*n_rff) matrix
     Phi_each_domain, updates = theano.scan(fn=lambda a,b: T.switch(T.neq(b,0), Phi.T*a/b, 0),
                           sequences=[Xlabel.T,Num])
     each_Phi=T.sum(Phi_each_domain,-1)
      # first, the inner product of each domain mean with itself; the result is a D-dimensional vector
     each_domain_sum=T.sum(each_Phi*each_Phi,-1)
     
      # inner product of the overall mean with itself
     tot_sum=T.dot(Phi_total,Phi_total)
     
      # cross inner products between the overall mean and each domain mean
     tot_domain_sum, updates=theano.scan(fn=lambda a: a*Phi_total,
                           sequences=[each_Phi])
     
      # compute the MMD
     MMD_central=T.sum(each_domain_sum)+D_num*tot_sum-2*T.sum(tot_domain_sum)
     
     return MMD_central     
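The three terms combined in MMD_central follow the expansion sum_d ||phi_d - phi_tot||^2 = sum_d ||phi_d||^2 + D * ||phi_tot||^2 - 2 * sum_d <phi_d, phi_tot>, i.e. the summed squared distance between each domain's mean RFF feature and the overall mean. A quick NumPy check of that identity (toy shapes, purely illustrative):

import numpy as np

rng = np.random.RandomState(0)
Phi_d = rng.randn(4, 16)       # hypothetical per-domain mean features, shape (D, 2*n_rff)
phi_tot = rng.randn(16)        # overall mean feature vector
direct = ((Phi_d - phi_tot) ** 2).sum()
expanded = (Phi_d ** 2).sum() + Phi_d.shape[0] * phi_tot.dot(phi_tot) \
           - 2 * Phi_d.dot(phi_tot).sum()
assert np.allclose(direct, expanded)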
Example #25
 def apply(self):
     result , updates = theano.scan(
         fn = self.train_step,
         sequences = self.f,
         outputs_info = [self.A_start , None],
         non_sequences = self.A ,
         n_steps = self.tgt.shape[0]
     )
     best_path_score = result[0][-1].max()
     idx = T.argmax(result[0][-1])
     res2 , _ = theano.scan(
         fn = lambda dps , idx : [dps[idx] , idx],
         sequences = result[1][::-1],
         outputs_info = [idx , None]
     )
     best_path = res2[1]
     tgt_score = self.decode()
     max_margin = T.sum(T.neq(self.tgt , best_path))
     self.cost = best_path_score + max_margin - tgt_score
     #if T.lt(self.cost , T.alloc(numpy.int64(0))):
     #    self.cost = T.alloc(numpy.int64(0))
     #return T.argmax(result[-1])
     #self.cost = T.mean(T.nnet.categorical_crossentropy(self.p_y_given_x , tgt))
     #return best_path_score
     #return best_path
     return self.cost
Example #26
    def __dealWithOneDoc(self, DocSentenceCount0, oneDocSentenceCount1, \
                         docs, corpusPos, oneDocSentenceWordCount, docW, docB, sentenceW, sentenceB, posW, posB):
#         t = T.and_((shareRandge < oneDocSentenceCount1 + 1),  (shareRandge >= DocSentenceCount0)).nonzero()
        oneDocSentenceWordCount = oneDocSentenceWordCount[DocSentenceCount0:oneDocSentenceCount1 + 1]
        
        sentenceResults0, _ = theano.scan(fn=self.__dealWithSentence,
                            non_sequences=[docs, sentenceW, sentenceB],
                             sequences=[dict(input=oneDocSentenceWordCount, taps=[-1, 0])],
                             strict=True)
        sentenceResults1, _ = theano.scan(fn=self.__dealWithSentence,
                            non_sequences=[corpusPos, posW, posB],
                             sequences=[dict(input=oneDocSentenceWordCount, taps=[-1, 0])],
                             strict=True)
        sentenceResults = T.concatenate([sentenceResults0, sentenceResults1], axis=1)
#         p = printing.Print('docPool')
#         docPool = p(docPool)
#         p = printing.Print('sentenceResults')
#         sentenceResults = p(sentenceResults)
#         p = printing.Print('doc_out')
#         doc_out = p(doc_out)
        doc_out = conv.conv2d(input=sentenceResults, filters=docW)
        docPool = downsample.max_pool_2d(doc_out, (self.__MAXDIM, 1), mode=self.__pooling_mode, ignore_border=False)
        docOutput = T.tanh(docPool + docB.dimshuffle([0, 'x', 'x']))
        doc_embedding = docOutput.flatten(1)
        return doc_embedding
Example #27
    def call(self, x, mask=None):
        maxlen = x.shape[1]

        hidden0 = x
        # shape: (batch_size, maxlen, hidden_dim) 
        pyramid, _ = theano.scan(fn=self.build_pyramid, 
                                 sequences=T.arange(maxlen-1),
                                 outputs_info=[hidden0],
                                 non_sequences=maxlen)
        # shape: (maxlen-1, batch_size, maxlen, hidden_dim)

        hidden0 = K.expand_dims(hidden0, dim=0)
        # shape: (1, batch_size, maxlen, hidden_dim)

        pyramid = K.concatenate([hidden0, pyramid], axis=0)
        # shape: (maxlen, batch_size, maxlen, hidden_dim)

        hierarchy, _ = theano.scan(fn=self.compress_pyramid,
                                   sequences=[T.arange(maxlen, 0, -1), 
                                              pyramid])
        # shape: (maxlen, batch_size, hidden_dim)

        hierarchy = K.permute_dimensions(hierarchy, (1, 0, 2))
        # shape: (batch_size, maxlen, hidden_dim)
        
        return hierarchy
Example #28
    def _build_model(self, input, options, layers, params, go_backwards=False):

        def _step1(x_, t_, layer_):
            layer_ = str(layer_.data)
            v = layers['conv_' + layer_ + '_v'].conv(x_)
            t = layers['conv_' + layer_ + '_t'].conv(t_)
            h = v + t

            return x_, h

        def _step2(h, r_, layer_):
            layer_ = str(layer_.data)
            o = h + params['b_' + layer_].dimshuffle('x', 0, 'x', 'x')
            if layer_ != str(len(options['filter_shape']) - 1):
                r = layers['conv_' + layer_ + '_r'].conv(r_)
                o = tensor.nnet.relu(o + r)
            return o

        rval = input
        if go_backwards:
            rval = rval[::-1]
        for i in range(len(options['filter_shape'])):
            rval, _ = theano.scan(_step1, sequences=[rval],
                                  outputs_info=[rval[0], None],
                                  non_sequences=[i],
                                  name='rnn_layers_k_' + str(i))
            rval = rval[1]
            rval, _ = theano.scan(_step2, sequences=[rval],
                                  outputs_info=[rval[-1]],
                                  non_sequences=[i],
                                  name='rnn_layers_q_' + str(i))
        proj = rval

        return proj
Example #29
    def layers(self, n_layers=1):
        layers = []
        params = []
        layer_output = []
        for i in xrange(n_layers):
            if i == 0:
                layer_input = self.x.reshape((self.batch_size, self.n_words, self.n_in)).dimshuffle(1, 0, 2)  # 100 * 10 * 32
                layer = FirstLayer(n_i=self.n_in)
            else:
                layer_input = layer_output[-1][::-1]
                layer = Layer(n_i=self.n_in)
            [h, c], _ = theano.scan(fn=layer.forward,
                                    sequences=layer_input,
                                    outputs_info=[self.h0, self.c0])
            layers.append(layer)
            params.extend(layer.params)
            layer_output.append(h)

        layer_input = layer_output[-1]
        layer = LastLayer(n_i=self.n_in, n_h=self.n_y)
        y, _ = theano.scan(fn=layer.forward,
                           sequences=layer_input,
                           outputs_info=[None])
        layers.append(layer)
        params.extend(layer.params)
        layer_output.append(y)
        return layers, params, layer_output
Example #30
	def predict(self, input):   #input is an array of vectors (2D np.array)
		self.input = input
		padw = int(self.window/2)
		if padw>0:
			padding = np.asarray([np.zeros((self.dim_in,), dtype=theano.config.floatX)] * (padw))
			inp = T.concatenate((padding, input, padding), axis=0)
		else:
			inp = self.input
		seq = T.arange(T.shape(inp)[0]-self.window+1)
		self.input, _ = theano.scan(lambda v: inp[v : v+self.window].flatten(), sequences=seq)

		# initialize the hidden state
		out = theano.shared(np.zeros((self.dim_out,), dtype=theano.config.floatX))

		# recurrent step: compute the new hidden state from the input and the previous hidden state
		def rnn_step(x, h_prev):
			if self.use_bias:
				out = T.nnet.sigmoid(T.dot(x, self.Wx) + T.dot(h_prev, self.Wh) + self.b)
			else:
				out = T.nnet.sigmoid(T.dot(x, self.Wx) + T.dot(h_prev, self.Wh))
			return out

		self.output, _ = theano.scan(fn=rnn_step, 
								  sequences = dict(input=self.input, taps=[0]), 
								  outputs_info = [out])
		if self.use_last_output:
			self.output = self.output[-1]
		if self.pooling is not None:
			self.output = self.pooling(self.output)
		return self.output
Example #31
def restrictedBoltzmannMachines(learning_rate, training_epochs, dataset,
                                batch_size, n_chains, n_samples, output_folder,
                                n_hidden, destination_file):
    """
    Demonstrate how to train an RBM and afterwards sample from it using Theano.

    This is demonstrated on MNIST.

    :param learning_rate: learning rate used for training the RBM

    :param training_epochs: number of epochs used for training

    :param dataset: path to the pickled dataset

    :param batch_size: size of a batch used to train the RBM

    :param n_chains: number of parallel Gibbs chains to be used for sampling

    :param n_samples: number of samples to plot for each chain

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    print " "
    print "###################"
    print "# BUILD THE MODEL #"
    print "###################"
    print " "
    print "Building the model ..."

    # initialize storage for the persistent chain (state = hidden
    # layer of chain)
    persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden),
                                                 dtype=theano.config.floatX),
                                     borrow=True)

    # construct the RBM class
    rbm = RBM(input=x,
              n_visible=28 * 28,
              n_hidden=n_hidden,
              numpy_rng=rng,
              theano_rng=theano_rng)

    # get the cost and the gradient corresponding to one step of CD-15
    cost, updates = rbm.get_cost_updates(lr=learning_rate,
                                         persistent=persistent_chain,
                                         k=15)

    #################################
    #     Training the RBM          #
    #################################

    print " "
    print "####################"
    print "# TRAINING THE RBM #"
    print "####################"
    print " "
    print "Training the RBM ..."

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    # start-snippet-5
    # it is ok for a theano function to have no output
    # the purpose of train_rbm is solely to update the RBM parameters
    train_rbm = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]},
        name='train_rbm')

    plotting_time = 0.
    start_time = time.clock()

    # go through training epochs
    for epoch in xrange(training_epochs):

        # go through the training set
        mean_cost = []
        for batch_index in xrange(n_train_batches):
            mean_cost += [train_rbm(batch_index)]

        print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost)

        # Plot filters after each training epoch
        plotting_start = time.clock()
        # Construct image from the weight matrix
        image = Image.fromarray(
            tile_raster_images(X=rbm.W.get_value(borrow=True).T,
                               img_shape=(28, 28),
                               tile_shape=(10, 10),
                               tile_spacing=(1, 1)))
        image.save('filters_at_epoch_%i.png' % epoch)
        plotting_stop = time.clock()
        plotting_time += (plotting_stop - plotting_start)

    end_time = time.clock()

    pretraining_time = (end_time - start_time) - plotting_time

    print('Training took %f minutes' % (pretraining_time / 60.))
    # end-snippet-5 start-snippet-6
    #################################
    #     Sampling from the RBM     #
    #################################

    print " "
    print "####################################"
    print "# EXTRACT THE SAMPLES FROM THE RBM #"
    print "####################################"
    print " "
    print "Extracting the samples from the RBM ..."
    # find out the number of test samples
    number_of_test_samples = test_set_x.get_value(borrow=True).shape[0]

    # pick random test examples, with which to initialize the persistent chain
    test_idx = rng.randint(number_of_test_samples - n_chains)
    persistent_vis_chain = theano.shared(
        numpy.asarray(test_set_x.get_value(borrow=True)[test_idx:test_idx +
                                                        n_chains],
                      dtype=theano.config.floatX))
    # end-snippet-6 start-snippet-7
    plot_every = 1000
    # define one step of Gibbs sampling (mf = mean-field) define a
    # function that does `plot_every` steps before returning the
    # sample for plotting
    ([presig_hids, hid_mfs, hid_samples, presig_vis, vis_mfs,
      vis_samples], updates) = theano.scan(
          rbm.gibbs_vhv,
          outputs_info=[None, None, None, None, None, persistent_vis_chain],
          n_steps=plot_every)

    # add to updates the shared variable that takes care of our persistent
    # chain :.
    updates.update({persistent_vis_chain: vis_samples[-1]})
    # construct the function that implements our persistent chain.
    # we generate the "mean field" activations for plotting and the actual
    # samples for reinitializing the state of our persistent chain
    sample_fn = theano.function([], [vis_mfs[-1], vis_samples[-1]],
                                updates=updates,
                                name='sample_fn')

    # create a space to store the image for plotting ( we need to leave
    # room for the tile_spacing as well)
    image_data = numpy.zeros((29 * n_samples + 1, 29 * n_chains - 1),
                             dtype='uint8')
    for idx in xrange(n_samples):
        # generate `plot_every` intermediate samples that we discard,
        # because successive samples in the chain are too correlated
        vis_mf, vis_sample = sample_fn()
        print 'Plotting sample ...', idx
        image_data[29 * idx:29 * idx + 28, :] = tile_raster_images(
            X=vis_mf,
            img_shape=(28, 28),
            tile_shape=(1, n_chains),
            tile_spacing=(1, 1))

    # construct image
    image = Image.fromarray(image_data)
    image.save(destination_file)
    # end-snippet-7
    os.chdir('../')
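A sketch of how this training script might be invoked (the argument values below are illustrative defaults, not taken from the original code):

restrictedBoltzmannMachines(learning_rate=0.1,
                            training_epochs=15,
                            dataset='mnist.pkl.gz',
                            batch_size=20,
                            n_chains=20,
                            n_samples=10,
                            output_folder='rbm_plots',
                            n_hidden=500,
                            destination_file='samples.png')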
Example #32
    def build_decoder(self,
                      inputs,
                      source,
                      target,
                      smask=None,
                      tmask=None,
                      context=None):
        """
        Build the Pointer Network Decoder Computational Graph
        """
        # inputs : (nb_samples, source_num, ptr_embedd_dim)
        # source : (nb_samples, source_num, source_dim)
        # smask  : (nb_samples, source_num)
        # target : (nb_samples, target_num)
        # tmask  : (nb_samples, target_num)
        # context: (nb_sample, context_dim)

        # initialized hidden state.
        assert context is not None
        Init_h = self.Initializer(context)

        # target is the source inputs.
        X = self.grab_source(inputs,
                             target)  # (nb_samples, target_num, source_dim)

        nb_dim = X.shape[0]
        tg_num = X.shape[1]
        sc_dim = X.shape[2]

        # since it changes to two pointers once a time:
        # concatenate + reshape
        def _get_ht(A, mask=False):
            if A.ndim == 2:
                B = A[:, -1:]
                if mask:
                    B *= 0.
                A = T.concatenate([A, B], axis=1)
                return A[:, ::2], A[:, 1::2]
            else:
                B = A[:, -1:, :]
                print B.ndim
                if mask:
                    B *= 0.
                A = T.concatenate([A, B], axis=1)
                return A[:, ::2, :], A[:, 1::2, :]

        Xh, Xt = _get_ht(X)
        Th, Tt = _get_ht(target)
        Mh, Mt = _get_ht(tmask, mask=True)

        Xa = Xh + Xt
        Xa = T.concatenate(
            [alloc_zeros_matrix(nb_dim, 1, sc_dim), Xa[:, :-1, :, :]], axis=1)
        Xa = Xa.dimshuffle((1, 0, 2))

        # eat by recurrent net
        def _recurrence(x, prev_h, c, s, s_mask):
            # RNN read-out
            x_out = self.RNN(x, mask=None, C=c, init_h=prev_h, one_step=True)
            h_out = self.att_head(x_out, s, s_mask, return_log=True)
            t_out = self.att_tail(x_out, s, s_mask, return_log=True)

            return x_out, h_out, t_out

        outputs, _ = theano.scan(_recurrence,
                                 sequences=[Xa],
                                 outputs_info=[Init_h, None, None],
                                 non_sequences=[context, source, smask])
        log_prob_head = outputs[1].dimshuffle((1, 0, 2))
        log_prob_tail = outputs[2].dimshuffle((1, 0, 2))

        log_prob      = T.sum(self.grab_prob(log_prob_head, Th) * Mh, axis=1) \
                      + T.sum(self.grab_prob(log_prob_tail, Tt) * Mt, axis=1)
        return log_prob
Example #33
    def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, ctx):
        xinp_h1_t, xgate_h1_t = inp_to_h1.proj(x_tm1)
        xinp_h2_t, xgate_h2_t = inp_to_h2.proj(x_tm1)
        xinp_h3_t, xgate_h3_t = inp_to_h3.proj(x_tm1)

        attinp_h1, attgate_h1 = att_to_h1.proj(w_tm1)

        h1_t = cell1.step(xinp_h1_t + attinp_h1, xgate_h1_t + attgate_h1,
                          h1_tm1)
        h1inp_h2, h1gate_h2 = h1_to_h2.proj(h1_t)
        h1inp_h3, h1gate_h3 = h1_to_h3.proj(h1_t)

        a_t = h1_t.dot(h1_to_att_a)
        b_t = h1_t.dot(h1_to_att_b)
        k_t = h1_t.dot(h1_to_att_k)

        a_t = tensor.exp(a_t)
        b_t = tensor.exp(b_t)
        k_t = k_tm1 + tensor.exp(k_t)

        ss_t = calc_phi(k_t, a_t, b_t, u)
        # calculate and return stopping criteria
        sh_t = calc_phi(k_t, a_t, b_t, u_max)
        ss5 = ss_t.dimshuffle(0, 1, 'x')
        ss6 = ss5 * ctx.dimshuffle(1, 0, 2)
        w_t = ss6.sum(axis=1)

        attinp_h2, attgate_h2 = att_to_h2.proj(w_t)
        attinp_h3, attgate_h3 = att_to_h3.proj(w_t)

        h2_t = cell2.step(xinp_h2_t + h1inp_h2 + attinp_h2,
                          xgate_h2_t + h1gate_h2 + attgate_h2, h2_tm1)

        h2inp_h3, h2gate_h3 = h2_to_h3.proj(h2_t)

        h3_t = cell3.step(xinp_h3_t + h1inp_h3 + h2inp_h3 + attinp_h3,
                          xgate_h3_t + h1gate_h3 + h2gate_h3 + attgate_h3,
                          h3_tm1)
        out_t = h1_t.dot(h1_to_outs) + h2_t.dot(h2_to_outs) + h3_t.dot(
            h3_to_outs)

        out_t = out_t.dimshuffle(1, 0, 'x')
        counter = tensor.arange(out_t.shape[0])
        switch = out_t.shape[0] // 2

        def sample_out_step(c_t, o_tm1, x_tm1, v_h1_tm1):
            j_tm1 = tensor.concatenate((x_tm1, o_tm1), axis=1)
            vinp_h1_t, vgate_h1_t = inp_to_v_h1.proj(j_tm1)

            v_h1_t = v_cell1.step(vinp_h1_t, vgate_h1_t, v_h1_tm1)
            o = v_h1_t.dimshuffle('x', 0, 'x', 1)
            mu_mag, sigma_mag, coeff_mag = _slice_outs(o)
            mu_phase, sigma_phase, coeff_phase = _slice_outs(o)
            # Filthiest of the filthy hacks
            s = tensor.ge(switch, c_t)
            mu = s * (mu_mag) + (1 - s) * (mu_phase)
            sigma = s * (sigma_mag) + (1 - s) * (sigma_phase)
            coeff = s * (coeff_mag) + (1 - s) * (coeff_phase)
            mu = mu[0].dimshuffle(0, 'x', 1)
            sigma = sigma[0].dimshuffle(0, 'x', 1)
            coeff = coeff[0]
            samp_mag = sample_single_dimensional_gmms(mu, sigma, coeff, srng)
            samp_phase = sample_single_dimensional_gmms(mu, sigma, coeff, srng)
            samp_phase = tensor.mod(samp_phase + np.pi, 2 * np.pi) - np.pi
            samp = s * samp_mag + (1 - s) * samp_phase
            return samp, v_h1_t

        init_corr_out = tensor.zeros((out_t.shape[1], n_density))
        init_samp_out = tensor.zeros((out_t.shape[1], 1))
        r, isupdates = theano.scan(fn=sample_out_step,
                                   sequences=[counter, out_t],
                                   outputs_info=[init_samp_out, init_corr_out])
        corr_out_t = r[0]
        x_t = corr_out_t.dimshuffle(2, 1, 0)[0]
        return x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t, isupdates
Ejemplo n.º 34
0
    """
    # Old multistep code which doesn't work with updates in the internal scan
    n_steps_sym = tensor.iscalar()
    n_steps_sym.tag.test_value = 10
    (sampled, h1_s, h2_s, h3_s, k_s, w_s, stop_s, stop_h), supdates = theano.scan(
        fn=sample_step,
        n_steps=n_steps_sym,
        sequences=[],
        outputs_info=[init_x, init_h1, init_h2, init_h3,
                      init_kappa, init_w, None, None],
        non_sequences=[context])
    """

    (h1, h2, h3, kappa, w), updates = theano.scan(
        fn=step,
        sequences=[inp_h1, inpgate_h1, inp_h2, inpgate_h2, inp_h3, inpgate_h3],
        outputs_info=[init_h1, init_h2, init_h3, init_kappa, init_w],
        non_sequences=[context])

    outs = h1.dot(h1_to_outs) + h2.dot(h2_to_outs) + h3.dot(h3_to_outs)

    orig_shapes = outs.shape
    outs = outs.dimshuffle(2, 1, 0)
    outs = outs.reshape((orig_shapes[2], orig_shapes[1] * orig_shapes[0], 1))

    shuff_inpt_shapes = inpt.shape
    shuff_inpt = inpt.dimshuffle(2, 1, 0)
    shuff_inpt = shuff_inpt.reshape(
        (shuff_inpt_shapes[2], shuff_inpt_shapes[1] * shuff_inpt_shapes[0], 1))

    def out_step(x_tm1, o_tm1, v_h1_tm1):

Ejemplo n.º 35
0
input2 = T.dtensor4()

left2 = T.ivector()
right2 = T.ivector()
Slen2 = T.ivector()

input1 = T.dtensor3()

left1 = T.iscalar()
right1 = T.iscalar()
Slen1 = T.iscalar()

# ok = atData(input1, left1, right1, Slen1)
ok, _ = theano.scan(atData, sequences=[input2, left2, right2, Slen2])

myfunc = theano.function([input2, left2, right2, Slen2], ok, on_unused_input='ignore')

input2_init = np.reshape(np.arange(2 * 5280, dtype=theano.config.floatX), (2, 1, 88, 60))
left2_init = np.asarray([1, 2], dtype='int32')
right2_init = np.asarray([60, 59], dtype='int32')
Slen2_init = np.asarray([70, 69], dtype='int32')

input1_init = np.reshape(np.arange(5280, dtype=theano.config.floatX), (1, 88, 60))
left1_init = 1
right1_init = 60
Slen1_init = 70

# ok1, ok2, ok3 = myfunc(input_init, left_init, right_init)
#
Ejemplo n.º 36
0
def _zoom(a_lo,
          a_hi,
          phi_lo,
          phi_hi,
          derphi_lo,
          phi,
          derphi,
          phi0,
          derphi0,
          c1,
          c2,
          n_iters=10,
          profile=False):
    """
    WRITEME

    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Parameters
    ----------
    a_lo : float
        Step size
    a_hi : float
        Step size
    phi_lo : float
        Value of f at a_lo
    phi_hi : float
        Value of f at a_hi
    derphi_lo : float
        Value of derivative at a_lo
    phi : callable
        Generates computational graph
    derphi : callable
        Generates computational graph
    phi0 : float
        Value of f at 0
    derphi0 : float
        Value of the derivative at 0
    c1 : float
        Wolfe parameter
    c2 : float
        Wolfe parameter
    profile : bool
        True if you want printouts of profiling information
    """

    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo,
                   a_star, val_star, valprime):
        # Interpolate to find a trial step length between a_lo and a_hi.
        # We need to choose the interpolation scheme here: use cubic
        # interpolation first; if the result is within delta * dalpha of the
        # end points or outside the interval bounded by a_lo and a_hi, fall
        # back to quadratic interpolation; if that is still too close, use
        # bisection.
        dalpha = a_hi - a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi, a_rec,
                              phi_rec)
        # quadratic interpolation
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq', TT.isnan(a_j_quad), a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q, a_lo +
                             numpy.asarray(0.5, dtype=theano.config.floatX) * \
                             dalpha, a_j_quad)

        # pick between the two ..
        cond_c = lazy_or(
            'condc', TT.isnan(a_j_cubic),
            TT.bitwise_or(a_j_cubic > b - cchk, a_j_cubic < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        #a_j = ifelse(cond_c, a_j_quad,  a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and(
            'stop',
            TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                           phi_aj < phi_lo),
            abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse(cond1,
                         phi_hi,
                         TT.switch(cond2, phi_hi, phi_lo),
                         name='phi_rec')
        a_rec = ifelse(cond1, a_hi, TT.switch(cond2, a_hi, a_lo), name='a_rec')
        a_hi = ifelse(cond1, a_j, TT.switch(cond2, a_lo, a_hi), name='a_hi')
        phi_hi = ifelse(cond1,
                        phi_aj,
                        TT.switch(cond2, phi_lo, phi_hi),
                        name='phi_hi')

        a_lo = TT.switch(cond1, a_lo, a_j)
        phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1,
                          nan,
                          TT.switch(cond2, derphi_aj, nan),
                          name='valprime')

        return ([
            phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo, a_star,
            val_star, valprime
        ], theano.scan_module.scan_utils.until(stop))

    maxiter = n_iters
    # cubic interpolant check
    delta1 = TT.constant(numpy.asarray(0.2, dtype=theano.config.floatX))
    # quadratic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1, dtype=theano.config.floatX))
    phi_rec = phi0
    a_rec = zero

    # Initial iteration

    dalpha = a_hi - a_lo
    a = TT.switch(dalpha < zero, a_hi, a_lo)
    b = TT.switch(dalpha < zero, a_lo, a_hi)
    #a = ifelse(dalpha < 0, a_hi, a_lo)
    #b = ifelse(dalpha < 0, a_lo, a_hi)

    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection

    # quadratic interpolation
    qchk = delta2 * dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q', TT.isnan(a_j),
                     TT.bitwise_or(a_j > b - qchk, a_j < a + qchk))

    a_j = TT.switch(cond_q, a_lo +
                    numpy.asarray(0.5, dtype=theano.config.floatX) * \
                    dalpha, a_j)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0, phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    phi_rec = ifelse(cond1,
                     phi_hi,
                     TT.switch(cond2, phi_hi, phi_lo),
                     name='mphirec')
    a_rec = ifelse(cond1, a_hi, TT.switch(cond2, a_hi, a_lo), name='marec')
    a_hi = ifelse(cond1, a_j, TT.switch(cond2, a_lo, a_hi), name='mahi')
    phi_hi = ifelse(cond1,
                    phi_aj,
                    TT.switch(cond2, phi_lo, phi_hi),
                    name='mphihi')

    onlyif = lazy_and(
        'only_if',
        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0, phi_aj < phi_lo),
        abs(derphi_aj) <= -c2 * derphi0)

    a_lo = TT.switch(cond1, a_lo, a_j)
    phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo_main')
    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'
    vderphi_aj = ifelse(cond1,
                        nan,
                        TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')
    states = [
        phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo, zero, zero, zero
    ]

    # print'while_zoom'
    outs, updates = scan(while_zoom,
                         outputs_info=states,
                         n_steps=maxiter,
                         name='while_zoom',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    # print 'done_while'
    a_star = ifelse(onlyif, a_j, outs[7][-1], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][-1], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][-1], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
Ejemplo n.º 37
0
def scalar_search_wolfe2(phi,
                         derphi,
                         phi0=None,
                         old_phi0=None,
                         derphi0=None,
                         n_iters=20,
                         c1=1e-4,
                         c2=0.9,
                         profile=False):
    """
    Find alpha that satisfies strong Wolfe conditions.

    alpha > 0 is assumed to be a descent direction.

    Parameters
    ----------
    phi : callable f(x)
        Objective scalar function.
    derphi : callable f'(x)
        Objective function derivative (can be None)
    phi0 : float, optional
        Value of phi at s=0
    old_phi0 : float, optional
        Value of phi at previous point
    derphi0 : float, optional
        Value of derphi at s=0
    c1 : float
        Parameter for Armijo condition rule.
    c2 : float
        Parameter for curvature condition rule.
    profile : flag (boolean)
        True if you want printouts of profiling information

    Returns
    -------
    alpha_star : float
        Best alpha
    phi_star : WRITEME
        phi at alpha_star
    phi0 : WRITEME
        phi at 0
    derphi_star : WRITEME
        derphi at alpha_star

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions.  See Wright and Nocedal, 'Numerical Optimization',
    1999, pg. 59-60.

    For the zoom phase it uses an algorithm by [...].

    """

    if phi0 is None:
        phi0 = phi(zero)
    else:
        phi0 = phi0

    if derphi0 is None and derphi is not None:
        derphi0 = derphi(zero)
    else:
        derphi0 = derphi0

    alpha0 = zero
    alpha0.name = 'alpha0'
    if old_phi0 is not None:
        alpha1 = TT.minimum(one,
                            numpy.asarray(1.01, dtype=theano.config.floatX) *
                            numpy.asarray(2, dtype=theano.config.floatX) * \
                            (phi0 - old_phi0) / derphi0)
    else:
        old_phi0 = nan
        alpha1 = one

    alpha1 = TT.switch(alpha1 < zero, one, alpha1)
    alpha1.name = 'alpha1'

    # This shouldn't happen. Perhaps the increment has slipped below
    # machine precision? For now, set the return variables, skip the
    # useless while loop, and raise warnflag=2 due to possible imprecision.
    phi0 = TT.switch(TT.eq(alpha1, zero), old_phi0, phi0)
    # I need a lazyif for alpha1 == 0 !!!
    phi_a1 = ifelse(TT.eq(alpha1, zero), phi0, phi(alpha1), name='phi_a1')
    phi_a1.name = 'phi_a1'

    phi_a0 = phi0
    phi_a0.name = 'phi_a0'
    derphi_a0 = derphi0
    derphi_a0.name = 'derphi_a0'
    # Make sure variables are tensors otherwise strange things happen
    c1 = TT.as_tensor_variable(c1)
    c2 = TT.as_tensor_variable(c2)
    maxiter = n_iters

    def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                     alpha_star, phi_star, derphi_star):
        derphi_a1 = derphi(alpha1)
        cond1 = TT.bitwise_or(phi_a1 > phi0 + c1 * alpha1 * derphi0,
                              TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
        cond2 = abs(derphi_a1) <= -c2 * derphi0
        cond3 = derphi_a1 >= zero
        alpha_star_c1, phi_star_c1, derphi_star_c1 = \
                _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
                      phi, derphi, phi0, derphi0, c1, c2,
                     profile=profile)
        alpha_star_c3, phi_star_c3, derphi_star_c3 = \
                _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi,
                      derphi, phi0, derphi0, c1, c2,
                     profile=profile)
        nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
        nw_phi = phi(nw_alpha1)
        alpha_star, phi_star, derphi_star = \
                ifelse(cond1,
                          (alpha_star_c1, phi_star_c1, derphi_star_c1),
                ifelse(cond2,
                          (alpha1, phi_a1, derphi_a1),
                ifelse(cond3,
                          (alpha_star_c3, phi_star_c3, derphi_star_c3),
                           (nw_alpha1, nw_phi, nan),
                      name='alphastar_c3'),
                      name='alphastar_c2'),
                      name='alphastar_c1')

        return ([
            alpha1, nw_alpha1, phi_a1,
            ifelse(lazy_or('allconds', cond1, cond2, cond3),
                   phi_a1,
                   nw_phi,
                   name='nwphi1'),
            ifelse(cond1, derphi_a0, derphi_a1, name='derphi'), i_t + one,
            alpha_star, phi_star, derphi_star
        ],
                theano.scan_module.scan_utils.until(
                    lazy_or('until_cond_', TT.eq(nw_alpha1, zero), cond1,
                            cond2, cond3)))

    states = [alpha0, alpha1, phi_a0, phi_a1, derphi_a0]
    # i_t
    states.append(zero)
    # alpha_star
    states.append(zero)
    # phi_star
    states.append(zero)
    # derphi_star
    states.append(zero)
    # print 'while_search'
    outs, updates = scan(while_search,
                         outputs_info=states,
                         n_steps=maxiter,
                         name='while_search',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    # print 'done_while_search'
    out3 = outs[-3][-1]
    out2 = outs[-2][-1]
    out1 = outs[-1][-1]
    alpha_star, phi_star, derphi_star = \
            ifelse(TT.eq(alpha1, zero),
                        (nan, phi0, nan),
                        (out3, out2, out1), name='main_alphastar')
    return alpha_star, phi_star, phi0, derphi_star
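A hedged usage sketch (not from the source): phi and derphi must be callables that build Theano graphs for f(x + alpha * p) and its directional derivative, and the module-level helpers used above (zero, one, nan, lazy_or, ifelse, scan) are assumed to be in scope. For a one-dimensional quadratic it could look like this:

import numpy
import theano
import theano.tensor as TT

x = theano.shared(numpy.asarray(3.0, dtype=theano.config.floatX), name='x')
# descent direction for f(x) = x**2 at x = 3
p = TT.constant(numpy.asarray(-1.0, dtype=theano.config.floatX))

def phi(alpha):
    # graph of f(x + alpha * p)
    return (x + alpha * p) ** 2

def derphi(alpha):
    # graph of d/d(alpha) f(x + alpha * p)
    return 2.0 * (x + alpha * p) * p

alpha_star, phi_star, phi0, derphi_star = scalar_search_wolfe2(phi, derphi)
line_search = theano.function([], [alpha_star, phi_star, derphi_star])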
Ejemplo n.º 38
0
    def __theano_build__(self):
        parameters = [E, V, U, W, b,
                      c] = self.E, self.V, self.U, self.W, self.b, self.c
        x = T.imatrix('x')
        y = T.imatrix('y')
        coversion_ones = T.ones((self.mini_batch_size, 1))

        def forward_prop_step(x_t, s_prev1, s_prev2, s_prev3):
            # Embedding layer
            x_e = E[:, x_t]

            def GRU(i, U, W, b, x_0, s_previous):
                b1 = T.specify_shape((coversion_ones * b[i * 3, :]).T,
                                     T.shape(x_0))
                b2 = T.specify_shape((coversion_ones * b[i * 3 + 1, :]).T,
                                     T.shape(x_0))
                b3 = T.specify_shape((coversion_ones * b[i * 3 + 2, :]).T,
                                     T.shape(x_0))

                z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) +
                                        W[i * 3 + 0].dot(s_previous) + b1)
                r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) +
                                        W[i * 3 + 1].dot(s_previous) + b2)
                s_candidate = T.tanh(U[i * 3 + 2].dot(x_0) +
                                     W[i * 3 + 2].dot(s_previous * r) + b3)

                return (T.ones_like(z) - z) * s_candidate + z * s_previous

            # GRU Layer 1
            s1 = GRU(0, U, W, b, x_e, s_prev1)

            # GRU Layer 2
            s2 = GRU(1, U, W, b, s1, s_prev2)

            # GRU Layer 3
            s3 = GRU(2, U, W, b, s2, s_prev3)

            # Final output calculation
            c_matrix = (coversion_ones * c).T
            juju = V.dot(s3) + c_matrix

            o_t = T.nnet.softmax(juju.T).T

            return [o_t, s1, s2, s3]

        # p_o = printing.Print('prediction')
        [o, s1, s2, s3], updates = theano.scan(
            forward_prop_step,
            sequences=x.T,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[
                None,
                dict(initial=T.zeros((self.hidden_dim, self.mini_batch_size))),
                dict(initial=T.zeros((self.hidden_dim, self.mini_batch_size))),
                dict(initial=T.zeros((self.hidden_dim, self.mini_batch_size)))
            ])

        def p(j, name):
            return printing.Print(name)(j)

        prediction = T.argmax(o, axis=1)

        e = ((prediction - y.T)**
             2) / (T.shape(prediction)[0] * T.shape(prediction)[1])
        cost_batch = self.calculate_ce_vector(o, y)
        mse_cost_batch = self.calculate_mean_squared_error_vector(
            prediction, y)
        # Total cost
        cost = (1 / self.mini_batch_size) * self.calculate_error(o, y)

        # Gradients
        derivatives = self.calculate_gradients(cost, parameters)

        # Assign functions
        self.predict = theano.function([x], [o])
        self.predict_class = theano.function([x, y], [prediction, e],
                                             allow_input_downcast=True)
        self.error = theano.function([x, y], e)
        self.calculate_loss_vector = theano.function([x, y],
                                                     cost_batch,
                                                     allow_input_downcast=True)
        self.calculate_mse_vector = theano.function([x, y],
                                                    mse_cost_batch,
                                                    allow_input_downcast=True)
        self.ce_error = theano.function([x, y],
                                        cost,
                                        allow_input_downcast=True)
        self.bptt = theano.function([x, y],
                                    derivatives,
                                    allow_input_downcast=True)

        # SGD parameters

        # rmsprop cache updates
        self.update_RMSPROP(cost, parameters, derivatives, x, y)
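A hedged usage sketch (the class name and constructor below are illustrative, not from the source); because the scan above iterates over x.T, the compiled functions expect integer matrices of shape (mini_batch_size, n_steps):

import numpy as np

model = GRULanguageModel(word_dim=8000, hidden_dim=128, mini_batch_size=32)  # hypothetical constructor
seq_len = 20
x = np.random.randint(0, 8000, size=(32, seq_len)).astype('int32')
y = np.random.randint(0, 8000, size=(32, seq_len)).astype('int32')

o, = model.predict(x)                  # per-step softmax outputs
pred, err = model.predict_class(x, y)  # argmax predictions plus the squared-error proxy e
loss = model.ce_error(x, y)            # cost scaled by 1 / mini_batch_size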
Ejemplo n.º 39
0
    def __init__(self,
                 rng,
                 input,
                 n_in,
                 n_out,
                 n_attendout,
                 initial_hidden=None,
                 W_rec=None,
                 activation=T.tanh):
        self.input = input
        self.n_in = n_in
        self.n_out = n_out
        self.n_attendout = n_attendout
        self.n_attendin = 100

        self.type = 'attendrnn'

        if initial_hidden is None:
            initial_hidden_values_s = numpy.zeros((n_out, ),
                                                  dtype=theano.config.floatX)
            initial_hidden_s = theano.shared(value=initial_hidden_values_s,
                                             name='s0',
                                             borrow=True)
        self.s0 = initial_hidden_s

        if W_rec is None:
            W_type1 = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)),
                                    dtype=theano.config.floatX)
            W_type2 = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_out, n_out)),
                                    dtype=theano.config.floatX)
            W_type3 = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(self.n_attendin, self.n_attendout)),
                                    dtype=theano.config.floatX)
            W_type4 = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(self.n_in, self.n_attendin)),
                                    dtype=theano.config.floatX)
            W_type5 = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(self.n_out, self.n_attendin)),
                                    dtype=theano.config.floatX)
            b_values = numpy.zeros((n_out, ), dtype=theano.config.floatX)

            W_ic = theano.shared(value=W_type1, name='W_ic', borrow=True)
            W_rec = theano.shared(value=W_type2, name='W_rec', borrow=True)
            W_outattend = theano.shared(value=W_type3,
                                        name='W_outattend',
                                        borrow=True)
            W_inattend_feat = theano.shared(value=W_type4,
                                            name='W_inattend_feat',
                                            borrow=True)
            W_inattend_prevstate = theano.shared(value=W_type5,
                                                 name='W_inattend_prevstate',
                                                 borrow=True)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W_ic = W_ic
        self.W_rec = W_rec
        self.W_outattend = W_outattend
        self.W_inattend_feat = W_inattend_feat
        self.W_inattend_prevstate = W_inattend_prevstate
        self.b = b

        self.delta_W_ic = theano.shared(value=numpy.zeros(
            (n_in, n_out), dtype=theano.config.floatX),
                                        name='delta_W_ic')
        self.delta_W_rec = theano.shared(value=numpy.zeros(
            (n_out, n_out), dtype=theano.config.floatX),
                                         name='delta_W_rec')
        self.delta_W_outattend = theano.shared(value=numpy.zeros(
            (self.n_attendin, self.n_attendout), dtype=theano.config.floatX),
                                               name='delta_W_outattend')
        self.delta_W_inattend_feat = theano.shared(
            value=numpy.zeros((n_in, self.n_attendin),
                              dtype=theano.config.floatX),
            name='delta_W_inattend_feat')
        self.delta_W_inattend_prevstate = theano.shared(
            value=numpy.zeros((n_out, self.n_attendin),
                              dtype=theano.config.floatX),
            name='delta_W_inattend_prevstate')
        self.delta_b = theano.shared(value=numpy.zeros_like(
            self.b.get_value(borrow=True), dtype=theano.config.floatX),
                                     name='delta_b')

        self.test8 = numpy.zeros((8, ), dtype=theano.config.floatX)

        # sequences: h_l
        # prior results: s_tm1
        # non sequences: W_outattend, W_inattend_prevstate, W_ic, W_rec, b, W_inattend_feat
        def one_step(h_l, s_tm1, W_outattend, W_inattend_prevstate, W_ic,
                     W_rec, b, W_inattend_feat):
            e_tl = T.dot(
                T.tanh(
                    T.dot(s_tm1, W_inattend_prevstate) +
                    T.dot(h_l, W_inattend_feat)), W_outattend)
            a_tl = T.exp(e_tl) / (T.exp(e_tl)).sum(0, keepdims=True)
            c_t = T.dot(a_tl, self.input)
            s_t = T.tanh(T.dot(c_t, W_ic) + T.dot(s_tm1, W_rec) + b)
            return s_t

        self.y_vals, _ = theano.scan(fn=one_step,
                                     sequences=self.input,
                                     outputs_info=self.s0,
                                     non_sequences=[
                                         self.W_outattend,
                                         self.W_inattend_prevstate, self.W_ic,
                                         self.W_rec, self.b,
                                         self.W_inattend_feat
                                     ])

        # parameters of the model
        self.params = [
            self.W_outattend, self.W_inattend_prevstate, self.W_ic, self.W_rec,
            self.b, self.W_inattend_feat
        ]
        self.delta_params = [
            self.delta_W_outattend, self.delta_W_inattend_prevstate,
            self.delta_W_ic, self.delta_W_rec, self.delta_b,
            self.delta_W_inattend_feat
        ]

        sigma = lambda x: 1 / (1 + T.exp(-x))
        self.output = sigma(self.y_vals)
Ejemplo n.º 40
0
    def get_cost_updates(self, lr=0.1, persistent=None, k=1):
        """This functions implements one step of CD-k or PCD-k

        :param lr: learning rate used to train the RBM

        :param persistent: None for CD. For PCD, shared variable
            containing old state of Gibbs chain. This must be a shared
            variable of size (batch size, number of hidden units).

        :param k: number of Gibbs steps to do in CD-k/PCD-k

        Returns a proxy for the cost and the updates dictionary. The
        dictionary contains the update rules for weights and biases but
        also an update of the shared variable used to store the persistent
        chain, if one is used.

        """

        # compute positive phase
        pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent
        # end-snippet-2
        # perform actual negative phase
        # in order to implement CD-k/PCD-k we need to scan over the
        # function that implements one gibbs step k times.
        # Read Theano tutorial on scan for more information :
        # http://deeplearning.net/software/theano/library/scan.html
        # the scan will return the entire Gibbs chain
        ([
            pre_sigmoid_nvs, nv_means, nv_samples, pre_sigmoid_nhs, nh_means,
            nh_samples
        ], updates) = theano.scan(
            self.gibbs_hvh,
            # the Nones are placeholders, saying that
            # chain_start is the initial state corresponding to the
            # 6th output
            outputs_info=[None, None, None, None, None, chain_start],
            n_steps=k)
        # start-snippet-3
        # determine gradients on RBM parameters
        # note that we only need the sample at the end of the chain
        chain_end = nv_samples[-1]

        cost = T.mean(self.free_energy(self.input)) - T.mean(
            self.free_energy(chain_end))
        # We must not compute the gradient through the gibbs sampling
        gparams = T.grad(cost, self.params, consider_constant=[chain_end])
        # end-snippet-3 start-snippet-4
        # constructs the update dictionary
        for gparam, param in zip(gparams, self.params):
            # make sure that the learning rate is of the right dtype
            updates[param] = param - gparam * T.cast(
                lr, dtype=theano.config.floatX)
        if persistent:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = nh_samples[-1]
            # pseudo-likelihood is a better proxy for PCD
            monitoring_cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            monitoring_cost = self.get_reconstruction_cost(
                updates, pre_sigmoid_nvs[-1])

        return monitoring_cost, updates
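A hedged usage sketch (the RBM constructor signature and the shared dataset are assumptions; only get_cost_updates comes from the code above): the returned updates dictionary is what actually performs CD-k learning once handed to theano.function.

import numpy
import theano
import theano.tensor as T

x = T.matrix('x')
rbm = RBM(input=x, n_visible=784, n_hidden=500)   # hypothetical constructor
cost, updates = rbm.get_cost_updates(lr=0.1, persistent=None, k=1)

train_x = theano.shared(numpy.zeros((10000, 784), dtype=theano.config.floatX))
index = T.lscalar('index')
batch_size = 20
train_rbm = theano.function(
    [index], cost, updates=updates,
    givens={x: train_x[index * batch_size:(index + 1) * batch_size]})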
Ejemplo n.º 41
0
    def build_minibatch(self, batch_size):
        '''
            dimension:  n_steps * batch_size * embed_dim
        :return:
        '''
        V, U, W, b, c = self.V, self.U, self.W, self.b, self.c

        x = T.tensor3('x')
        y = T.matrix('y')
        m = T.matrix('mask')
        self.batch_size = batch_size

        def forward_prop_step(x_t, m_t, s_t_prev):
            # This is how we calculated the hidden state in a simple RNN. No longer!
            # s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))

            # GRU Layer
            z_t = T.nnet.hard_sigmoid(T.dot(x_t, U[0]) + T.dot(s_t_prev, W[0]) + b[0])
            r_t = T.nnet.hard_sigmoid(T.dot(x_t, U[1]) + T.dot(s_t_prev, W[1]) + b[1])
            c_t = T.tanh(T.dot(x_t, U[2]) + T.dot((s_t_prev*r_t), W[2]) + b[2])
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
            s_t = m_t[:, None] * s_t + (1.0 - m_t)[:, None] * s_t_prev
            return s_t


        s, _ = theano.scan(
            forward_prop_step,
            sequences=[x, m],
            truncate_gradient=self.bptt_truncate,
            outputs_info=[dict(initial=T.zeros((batch_size, self.hidden_dim)))])

        # Final output calculation
        # Theano's softmax returns a matrix with one row, we only need the row
        p_y = T.nnet.softmax(T.dot(s[-1], V) + c)  # [0]
        prediction = T.argmax(p_y, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(p_y, y))/self.batch_size

        # Total cost (could add regularization here)
        self.cost = o_error

        # Assign functions
        self.predict = theano.function([x, m], p_y)
        self.predict_class = theano.function([x, m], prediction)
        self.ce_error = theano.function([x, y, m], self.cost)

        # Gradients
        dU = T.grad(self.cost, U)
        dW = T.grad(self.cost, W)
        db = T.grad(self.cost, b)
        dV = T.grad(self.cost, V)
        dc = T.grad(self.cost, c)

        self.bptt = theano.function([x, y, m], [dU, dW, db, dV, dc])

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        mU = decay * self.mU + (1 - decay) * dU ** 2
        mW = decay * self.mW + (1 - decay) * dW ** 2
        mV = decay * self.mV + (1 - decay) * dV ** 2
        mb = decay * self.mb + (1 - decay) * db ** 2
        mc = decay * self.mc + (1 - decay) * dc ** 2

        self.f_update = theano.function(
            [x, y, m, learning_rate, theano.In(decay, value=0.9)],
            [],
            updates=[
                     (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                     (self.mU, mU),
                     (self.mW, mW),
                     (self.mV, mV),
                     (self.mb, mb),
                     (self.mc, mc)
                    ])
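A hedged usage sketch (the wrapping instance `model` and the sizes below are illustrative); per the docstring, x is laid out as n_steps x batch_size x embed_dim and m masks padded steps with zeros.

import numpy as np
import theano

n_steps, batch_size, embed_dim, n_classes = 20, 32, 100, 5   # illustrative sizes
model.build_minibatch(batch_size)                            # `model` assumed constructed elsewhere

x = np.zeros((n_steps, batch_size, embed_dim), dtype=theano.config.floatX)  # embedded inputs
m = np.ones((n_steps, batch_size), dtype=theano.config.floatX)              # 1 = real step, 0 = padding
y = np.zeros((batch_size, n_classes), dtype=theano.config.floatX)           # one-hot targets
y[np.arange(batch_size), 0] = 1.0

probs = model.predict(x, m)      # softmax over the last hidden state
model.f_update(x, y, m, 0.01)    # one rmsprop step; decay defaults to 0.9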
Ejemplo n.º 42
0
#This is just a template: it does not learn anything, and always returns the class "0":
W0 = theano.shared(numpy.ones((n_words, word_embedding_size)), 'W0')
W1 = theano.shared(numpy.ones((n_words, word_embedding_size)), 'W1')


def rnn_step(x, h_prev, W0, W1):
    b = theano.tensor.dot(W0, x)
    a = theano.tensor.dot(W1, h_prev)
    c = a + b
    return theano.tensor.tanh(c)


initial_context_vector = theano.tensor.alloc(
    numpy.array(0, dtype=theano.config.floatX), n_words)
activations, other_info = theano.scan(rnn_step,
                                      sequences=input_vectors,
                                      outputs_info=initial_context_vector,
                                      non_sequences=[W0, W1])
activations = activations[-1]
predicted_class = theano.tensor.argmax(activations)
output = theano.tensor.nnet.softmax(activations)[0]
cost = -theano.tensor.log(output[target_class])
updates = [(word_embeddings,
            word_embeddings - .1 * theano.tensor.grad(cost, word_embeddings)),
           (W0, W0 - .1 * theano.tensor.grad(cost, W0)),
           (W1, W1 - .1 * theano.tensor.grad(cost, W1))]
theano.config.on_unused_input = 'ignore'
Accuracy = -cost
#Change this to something meaningful and it will work!

train = theano.function([input_indices, target_class],
                        [Accuracy, predicted_class],
                        updates=updates)
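The Accuracy = -cost placeholder above is the part the final comment asks to change; one hedged possibility, assuming target_class is a scalar class index as the cost expression suggests, is a 1/0 correctness indicator:

# hedged sketch only: report correctness of the single prediction instead of -cost
Accuracy = theano.tensor.eq(predicted_class, target_class)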
Ejemplo n.º 43
0
def build_model(tparams, options):
    # MIKE: why is this not a shared variable as in
    # trng = theano.tensor.shared_randomstreams.RandomStreams(1234)
    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    xt = tensor.matrix('xt', dtype=config.floatX)
    y = tensor.matrix('y', dtype='int64')
    yt = tensor.matrix('yt', dtype=config.floatX)

    n_timesteps = x.shape[0]
    n_examples = x.shape[1]

    if (options['arch_remap_input']):
       emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                   n_examples,
                                                   options['n_hid']])
    else:
       Wemb = theano.shared( numpy.concatenate(
                      (numpy.zeros((1,options['n_hid']),dtype=config.floatX),
                       numpy.identity(options['n_hid'],dtype=config.floatX)),
                       axis=0), name='Wemb')
       emb = Wemb[x.flatten()].reshape([n_timesteps,
                                        n_examples,
                                        options['n_hid']])

    # this is the call to either lstm_layer or hpm_layer
    if (options['encoder'] == 'lstm'):
        proj = get_layer(options['encoder'])[1](
                                            tparams, emb, xt, yt, options,
                                            prefix=options['encoder'],
                                            mask=mask)
        h = c = None
    else:
        proj, h, c = get_layer(options['encoder'])[1](
                                            tparams, emb, xt, yt, options,
                                            prefix=options['encoder'],
                                            mask=mask)

    # proj has dim n_timesteps X n_examples X n_hid
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)
    
    def _step(proj_step):
        if (options['arch_output_fn'] == 'softmax'):
            pred_prob_step = tensor.nnet.softmax(
                             tensor.dot(proj_step, tparams['U']) + tparams['b'])
        elif (options['arch_output_fn'] == 'logistic'):
            pred_prob_step = tensor.nnet.sigmoid(
                             tensor.dot(proj_step, tparams['U']) + tparams['b'])
        else: # '1-1'
            pred_prob_step = (proj_step+1.0e-6) / tensor.sum(proj_step+1.0e-6,axis=1,keepdims=True)
            # No longer needed if there's no '0' output
            #pred_prob_step = tensor.concatenate([tensor.alloc(0,n_examples,1),
            #                                     pred_prob_step], axis=1)
        return pred_prob_step
        # pred_prob_step should have dim n_examples X n_outputs
        # pred_prob has dim n_timesteps x n_examples x n_outputs
        # pred_step should have dim n_examples
 
    pred_prob, updates = theano.scan(_step,
                                     sequences=proj,
                                     outputs_info=None,
                                     non_sequences=None,
                                     n_steps=n_timesteps)

    def _cost_step_norm(pred_prob_step, y_step):
        # tgt_prob_step should have dim n_examples 
        tgt_prob_step = tensor.switch(tensor.eq(y_step, 0), 1.0, 
                             pred_prob_step[tensor.arange(n_examples),y_step-1])

        pred_ix_step = tensor.argmax(pred_prob_step,axis=1) + 1
        if (options['type_token_sim']): 
            corr_step = tensor.switch(tensor.eq(y_step, 0), 0,
                      tensor.switch(tensor.eq((y_step-1)//5, 
                                     (pred_ix_step-1)//5), 1, -1))
        else:
            corr_step = tensor.switch(tensor.eq(y_step, 0), 0,
                      tensor.switch(tensor.eq(y_step,pred_ix_step), 1, -1))
        return tgt_prob_step, corr_step

    # cost function for predicting target value of a specific event
    # tgt_prob_step should have dim n_examples
    def _cost_step_tgt(pred_prob_step, y_step):
        tgt_prob_step = tensor.switch(tensor.eq(y_step, 0), 1.0, 
                     tensor.switch(tensor.gt(y_step, 0),
                        pred_prob_step[tensor.arange(n_examples),y_step-1],
                        1.0-pred_prob_step[tensor.arange(n_examples),-y_step-1]))
        corr_step = tensor.switch(tensor.eq(y_step, 0), 0,
                     tensor.switch(tensor.gt(tgt_prob_step, 0.5), 1, -1))
        return tgt_prob_step, corr_step

    if (options['signed_out']):
        cost_fn = _cost_step_tgt
    else:
        cost_fn = _cost_step_norm

    (tgt_prob, corr), updates = theano.scan(cost_fn,
                                            sequences=[pred_prob, y],
                                            outputs_info=None,
                                            non_sequences=None,
                                            n_steps=n_timesteps)

    off = 1e-8
    if tgt_prob.dtype == 'float16':
        off = 1e-6
    # tgt_prob: probability correct (dimensions n_timesteps X n_examples)
    cost = -tensor.sum(tensor.log(tgt_prob.clip(off, 1.0))) 
    # Note: not dividing by count because it will reweight minibatch by size
    #         / tensor.sum(tensor.gt(y,0))

    return use_noise, x, xt, y, yt, mask, pred_prob, corr, cost, proj, h, c, tgt_prob
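A hedged sketch of consuming build_model's return values (construction of tparams and options is assumed to happen elsewhere in the source):

import theano

(use_noise, x, xt, y, yt, mask,
 pred_prob, corr, cost, proj, h, c, tgt_prob) = build_model(tparams, options)

f_pred_prob = theano.function([x, xt, yt, mask], pred_prob, name='f_pred_prob')
f_cost = theano.function([x, xt, y, yt, mask], cost, name='f_cost')
use_noise.set_value(0.)   # turn dropout off for evaluation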
Ejemplo n.º 44
0
    def create_gradientfunctions(self,data):
        """This function takes as input the whole dataset and creates the entire model"""
        def encodingstep(x_t, h_t):
            return T.tanh(self.params["W_xhe"].dot(x_t) + self.params["W_hhe"].dot(h_t) + self.params["b_he"])

        x = T.tensor3("x")

        h0_enc = T.matrix("h0_enc")
        result, _ = theano.scan(encodingstep, 
                sequences = x, 
                outputs_info = h0_enc)

        h_encoder = result[-1]

        #log sigma encoder is squared
        mu_encoder = T.dot(self.params["W_hmu"],h_encoder) + self.params["b_hmu"]
        log_sigma_encoder = T.dot(self.params["W_hsigma"],h_encoder) + self.params["b_hsigma"]

        #Use a very wide prior to make it possible to learn something with Z
        logpz = 0.005 * T.sum(1 + log_sigma_encoder - mu_encoder**2 - T.exp(log_sigma_encoder), axis = 0)

        seed = 42
        
        if "gpu" in theano.config.device:
            srng = theano.sandbox.cuda.rng_curand.CURAND_RandomStreams(seed=seed)
        else:
            srng = T.shared_randomstreams.RandomStreams(seed=seed)

        #Reparametrize Z
        eps = srng.normal((self.latent_variables,self.batch_size), avg = 0.0, std = 1.0, dtype=theano.config.floatX)
        z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps

        h0_dec = T.tanh(self.params["W_zh"].dot(z) + self.params["b_zh"])

        def decodingstep(x_t, h_t):
            h = T.tanh(self.params["W_hhd"].dot(h_t) + self.params["W_xhd"].dot(x_t) + self.params["b_hd"])
            x = T.nnet.sigmoid(self.params["W_hx"].dot(h) + self.params["b_hx"])

            return x, h

        x0 = T.matrix("x0")
        [y, _], _ = theano.scan(decodingstep,
                n_steps = x.shape[0], 
                outputs_info = [x0, h0_dec])

        # Clip y to avoid NaNs, necessary when lowerbound goes to 0
        y = T.clip(y, 1e-6, 1 - 1e-6)
        logpxz = T.sum(-T.nnet.binary_crossentropy(y,x), axis = 1)

        logpxz = T.mean(logpxz, axis = 0)

        #Average over time dimension
        logpx = T.mean(logpxz + logpz)

        #Compute all the gradients
        gradients = T.grad(logpx, self.params.values())

        #Let Theano handle the updates on parameters for speed
        updates = OrderedDict()
        epoch = T.iscalar("epoch")
        gamma = T.sqrt(1 - (1 - self.b2)**epoch)/(1 - (1 - self.b1)**epoch)

        #Adam
        for parameter, gradient, m, v in zip(self.params.values(), gradients, self.m.values(), self.v.values()):
            new_m = self.b1 * gradient + (1 - self.b1) * m
            new_v = self.b2 * (gradient**2) + (1 - self.b2) * v

            updates[parameter] = parameter + self.learning_rate * gamma * new_m / (T.sqrt(new_v)+ 1e-8)
            updates[m] = new_m
            updates[v] = new_v

        batch = T.iscalar('batch')

        givens = {
            h0_enc: np.zeros((self.hidden_units_encoder,self.batch_size)).astype(theano.config.floatX), 
            x0:     np.zeros((self.features,self.batch_size)).astype(theano.config.floatX),
            x:      data[:,:,batch*self.batch_size:(batch+1)*self.batch_size]
        }

        self.updatefunction = theano.function([batch,epoch], logpx, updates=updates, givens=givens, allow_input_downcast=True)

        return True
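A hedged training-loop sketch (the class name RecurrentVAE and its hyperparameters are illustrative): because givens slices the last axis symbolically, data should be a shared variable of shape (n_steps, features, n_samples).

import numpy as np
import theano

data = theano.shared(np.random.rand(50, 88, 1000).astype(theano.config.floatX))
model = RecurrentVAE(features=88, hidden_units_encoder=200,
                     latent_variables=20, batch_size=100)  # hypothetical constructor
model.create_gradientfunctions(data)

n_batches = data.get_value(borrow=True).shape[2] // model.batch_size
for epoch in range(1, 11):    # start at 1: the Adam bias correction divides by (1 - (1 - b1)**epoch)
    for batch in range(n_batches):
        lowerbound = model.updatefunction(batch, epoch)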
Ejemplo n.º 45
0
def lstm_layer(tparams, state_below, xt, yt, options, prefix='lstm', mask=None):
    # xt and yt are used as additional inputs

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, delta_t_input, delta_t_output, h_, c_):
        # h_ has dim n_training_examples  X  n_lstm
        # delta_t_input has dim n_training_examples

        # include input and output delta_t values as LSTM input features
        if (options['arch_lstm_include_delta_t']):
            h_aug = tensor.concatenate([h_, delta_t_input[:,None], 
                                     delta_t_output[:,None]], axis=1)
        else:
            h_aug = h_

        preact = tensor.dot(h_aug, tparams[_p(prefix, 'U')])
        preact += x_

        c = tensor.tanh(_slice(preact, 3, options['n_hid']))

        # original code:   c = f * c_ + i * c

        if (options['arch_lstm_include_input_gate']):
            i = tensor.nnet.sigmoid(_slice(preact, 0, options['n_hid']))
            c = i * c
        if (options['arch_lstm_include_forget_gate']):
            f = tensor.nnet.sigmoid(_slice(preact, 1, options['n_hid']))
            c = c + f * c_ 
        else:
            c = c + c_ 

        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        if (options['arch_lstm_include_output_gate']):
            o = tensor.nnet.sigmoid(_slice(preact, 2, options['n_hid']))
            h = o * tensor.tanh(c)
        else:
            h = tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    n_hid = options['n_hid']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below, xt, yt],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           n_hid),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           n_hid)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]
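The gates above are switched on per option key; an illustrative options dict (keys taken from the code, values arbitrary) could look like this:

options = {
    'n_hid': 128,
    'arch_lstm_include_delta_t': True,    # if True, tparams[_p(prefix, 'U')] needs two extra input rows
    'arch_lstm_include_input_gate': True,
    'arch_lstm_include_forget_gate': True,
    'arch_lstm_include_output_gate': True,
}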
Ejemplo n.º 46
0
    def call(self, x, mask=None):
        # TODO: validate input shape

        assert (len(x) == 3)
        L_flat = x[0]
        mu = x[1]
        a = x[2]

        if self.mode == 'full':
            # Create L and L^T matrix, which we use to construct the positive-definite matrix P.
            L = None
            LT = None
            if K.backend() == 'theano':
                import theano.tensor as T
                import theano

                def fn(x, L_acc, LT_acc):
                    x_ = K.zeros((self.nb_actions, self.nb_actions))
                    x_ = T.set_subtensor(x_[np.tril_indices(self.nb_actions)], x)
                    diag = K.exp(T.diag(x_)) + K.epsilon()
                    x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], diag)
                    return x_, x_.T

                outputs_info = [
                    K.zeros((self.nb_actions, self.nb_actions)),
                    K.zeros((self.nb_actions, self.nb_actions)),
                ]
                results, _ = theano.scan(fn=fn, sequences=L_flat, outputs_info=outputs_info)
                L, LT = results
            elif K.backend() == 'tensorflow':
                import tensorflow as tf

                # Number of elements in a triangular matrix.
                nb_elems = (self.nb_actions * self.nb_actions + self.nb_actions) // 2

                # Create mask for the diagonal elements in L_flat. This is used to exponentiate
                # only the diagonal elements, which is done before gathering.
                diag_indeces = [0]
                for row in range(1, self.nb_actions):
                    diag_indeces.append(diag_indeces[-1] + (row + 1))
                diag_mask = np.zeros(1 + nb_elems)  # +1 for the leading zero
                diag_mask[np.array(diag_indeces) + 1] = 1
                diag_mask = K.variable(diag_mask)

                # Add leading zero element to each element in the L_flat. We use this zero
                # element when gathering L_flat into a lower triangular matrix L.
                nb_rows = tf.shape(L_flat)[0]
                zeros = tf.expand_dims(tf.tile(K.zeros((1,)), [nb_rows]), 1)
                try:
                    # Old TF behavior.
                    L_flat = tf.concat(1, [zeros, L_flat])
                except TypeError:
                    # New TF behavior
                    L_flat = tf.concat([zeros, L_flat], 1)

                # Create mask that can be used to gather elements from L_flat and put them
                # into a lower triangular matrix.
                tril_mask = np.zeros((self.nb_actions, self.nb_actions), dtype='int32')
                tril_mask[np.tril_indices(self.nb_actions)] = range(1, nb_elems + 1)

                # Finally, process each element of the batch.
                init = [
                    K.zeros((self.nb_actions, self.nb_actions)),
                    K.zeros((self.nb_actions, self.nb_actions)),
                ]

                def fn(a, x):
                    # Exponentiate everything. This is much easier than only exponentiating
                    # the diagonal elements, and, usually, the action space is relatively low.
                    x_ = K.exp(x) + K.epsilon()
                    # Only keep the diagonal elements.
                    x_ *= diag_mask
                    # Add the original, non-diagonal elements.
                    x_ += x * (1. - diag_mask)
                    # Finally, gather everything into a lower triangular matrix.
                    L_ = tf.gather(x_, tril_mask)
                    return [L_, tf.transpose(L_)]

                tmp = tf.scan(fn, L_flat, initializer=init)
                if isinstance(tmp, (list, tuple)):
                    # TensorFlow 0.10 now returns a tuple of tensors.
                    L, LT = tmp
                else:
                    # Old TensorFlow < 0.10 returns a shared tensor.
                    L = tmp[:, 0, :, :]
                    LT = tmp[:, 1, :, :]
            else:
                raise RuntimeError('Unknown Keras backend "{}".'.format(K.backend()))
            assert L is not None
            assert LT is not None
            P = K.batch_dot(L, LT)
        elif self.mode == 'diag':
            if K.backend() == 'theano':
                import theano.tensor as T
                import theano

                def fn(x, P_acc):
                    x_ = K.zeros((self.nb_actions, self.nb_actions))
                    x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], x)
                    return x_

                outputs_info = [
                    K.zeros((self.nb_actions, self.nb_actions)),
                ]
                P, _ = theano.scan(fn=fn, sequences=L_flat, outputs_info=outputs_info)
            elif K.backend() == 'tensorflow':
                import tensorflow as tf

                # Create mask that can be used to gather elements from L_flat and put them
                # into a diagonal matrix.
                diag_mask = np.zeros((self.nb_actions, self.nb_actions), dtype='int32')
                diag_mask[np.diag_indices(self.nb_actions)] = range(1, self.nb_actions + 1)

                # Add leading zero element to each element in the L_flat. We use this zero
                # element when gathering L_flat into a lower triangular matrix L.
                nb_rows = tf.shape(L_flat)[0]
                zeros = tf.expand_dims(tf.tile(K.zeros((1,)), [nb_rows]), 1)
                try:
                    # Old TF behavior.
                    L_flat = tf.concat(1, [zeros, L_flat])
                except TypeError:
                    # New TF behavior
                    L_flat = tf.concat([zeros, L_flat], 1)

                # Finally, process each element of the batch.
                def fn(a, x):
                    x_ = tf.gather(x, diag_mask)
                    return x_

                P = tf.scan(fn, L_flat, initializer=K.zeros((self.nb_actions, self.nb_actions)))
            else:
                raise RuntimeError('Unknown Keras backend "{}".'.format(K.backend()))
        assert P is not None
        assert K.ndim(P) == 3

        # Combine a, mu and P into a scalar (over the batches). What we compute here is
        # -.5 * (a - mu)^T * P * (a - mu), where * denotes the dot-product. Unfortunately
        # TensorFlow handles vector * P slightly suboptimal, hence we convert the vectors to
        # 1xd/dx1 matrices and finally flatten the resulting 1x1 matrix into a scalar. All
        # operations happen over the batch size, which is dimension 0.
        prod = K.batch_dot(K.expand_dims(a - mu, 1), P)
        prod = K.batch_dot(prod, K.expand_dims(a - mu, -1))
        A = -.5 * K.batch_flatten(prod)
        assert K.ndim(A) == 2
        return A
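A small NumPy-only sketch (illustrative, batch axis dropped) of what the 'full' mode computes: P = L L^T from a lower-triangular L whose diagonal has been exponentiated, then A = -0.5 * (a - mu)^T P (a - mu).

import numpy as np

nb_actions = 3
rng = np.random.RandomState(0)
L = np.tril(rng.randn(nb_actions, nb_actions))
np.fill_diagonal(L, np.exp(np.diag(L)))          # diagonal is exponentiated, as above
P = L.dot(L.T)                                   # positive (semi-)definite by construction
a = rng.randn(nb_actions)
mu = rng.randn(nb_actions)
A = -0.5 * (a - mu).dot(P).dot(a - mu)           # scalar advantage term, always <= 0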
Ejemplo n.º 47
0
def lstm_decoder_layer(tparams_all, input_state, options, maxlen, dp, prefix="lstm_decoder_layer"):

    tparams_d = tparams_all[0]
    tparams_g = tparams_all[1]

    #rng = numpy.random.RandomState(4567)
    trng = RandomStreams(SEED)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(x_, m_, h_, c_):

        preact = tensor.dot(x_, tparams_g[_p(prefix, 'W')]) + tparams_g[_p(prefix, 'b')] + \
                 tensor.dot(h_, tparams_g[_p(prefix, 'U')])
        
        i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')]))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')]))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')]))
        c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')]))

        c = f * c_ + i * c
        
        h = o * tensor.tanh(c)

        s = tensor.nnet.softmax(tensor.dot(h, tparams_g['to_idx_emb']))

        #x_t = tensor.dot((s / s.max(axis=1)[:,None]).astype('int32').astype(theano.config.floatX), tparams_d['Wemb'])
        x_t = tensor.dot(tensor.switch(s < s.max(axis=1)[:,None], 0.0, 1.0).astype(theano.config.floatX), 
                         tparams_d['Wemb'])

        x_out = s.argmax(axis=1)

        m = tensor.switch(tensor.eq(x_out, 10), 0.0, 1.0).astype(theano.config.floatX) * m_
        
        #x_t = tensor.dot(h_, tparams[_p(prefix, 'W_x')]) + tparams[_p(prefix, 'b_x')]

        return x_out, x_t, m, h, c


    ##############################################################################################
    rval, updates = theano.scan(_step,
                                outputs_info=[None,
                                              input_state,
                                              tensor.alloc(numpy_floatX(1.), input_state.shape[0]),
                                              tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n']),
                                              tensor.alloc(numpy_floatX(0.), input_state.shape[0], options['lstm_decoder_layer_n'])],
                                name=_p(prefix, '_layers'),
                                n_steps=maxlen)


    #proj_0 = rval[1]#tensor.tanh(rval[0])

    m22 = trng.binomial(size=(input_state.shape[0],), p=dp, n=1, dtype=theano.config.floatX)
    
    #return rval[0]*m2, rval[1]*m2[:,None], rval[2]*m2

    if(tensor.gt(maxlen, 4) == 1):
        x2 = tensor.alloc(numpy.asarray(0, dtype='int32'), maxlen - 4, input_state.shape[0])
        x2 = tensor.concatenate((tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy.asarray(options['end_idx'], dtype='int32'), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy.asarray(7, dtype='int32'), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy.asarray(10, dtype='int32'), input_state.shape[0])[None, :],
                                 x2),
                                 axis=0)


        m2 = tensor.alloc(numpy_floatX(0.), maxlen - 3, input_state.shape[0])
        m2 = tensor.concatenate((tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :],
                                 tensor.alloc(numpy_floatX(1.), input_state.shape[0])[None, :],
                                 m2), 
                                 axis=0)
    
        xt2 = tparams_d['Wemb'][x2]

        return rval[0]*m22+x2*(1-m22), rval[1]*m22[:,None]+xt2*(1-m22[:,None]), rval[2]*m22+m2*(1-m22)

    else:
        return rval[0]*m22, rval[1]*m22[:,None], rval[2]*m22
Ejemplo n.º 48
0
def hpm_layer(tparams, state_below, xt, yt, options, prefix='hpm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_examples = state_below.shape[1]
    else:
        n_examples = 1

    assert mask is not None

    n_hid = options['n_hid']
    timescales = options['timescales']
    gamma = (1.0 / numpy_floatX(timescales)).reshape((1, -1, 1))
    n_timescales = len(timescales)

    alpha0 = tensor.nnet.sigmoid(
                           tparams[_p(prefix,'alpha0')]).dimshuffle(('x','x',0))
    if (options['arch_hpm_gamma_scaled_alpha']):
        gamma_exp_for_alpha = (tensor.nnet.sigmoid(
            tparams[_p(prefix, 'gamma_exp_for_alpha')])).dimshuffle(('x','x',0))
        alpha = alpha0 * gamma ** gamma_exp_for_alpha
                            #* numpy.min(gamma) ** (1.0-gamma_exp_for_alpha)
    else:
        alpha = alpha0 * gamma

    # determine asymptotic (stationary) rate from mu and alpha0
    stationary_rate = tensor.nnet.softplus(
        tparams[_p(prefix, 'mu')]).dimshuffle(('x','x',0)) / (1.0 - alpha0)
    if (options['arch_hpm_gamma_scaled_mu']):
        gamma_exp_for_mu = (tensor.nnet.softplus(
               tparams[_p(prefix, 'gamma_exp_for_mu')])).dimshuffle(('x','x',0))
        stationary_rate *= gamma ** gamma_exp_for_mu


    #agratio = 1.0 / (1.0 - tensor.nnet.sigmoid(
    #        tparams[_p(prefix, 'alpha_gamma_ratio')]).dimshuffle(('x','x',0)))
            
    eta = tensor.nnet.softplus(tparams[_p(prefix,'eta')]).dimshuffle(('x',0))

    def _timescale_posterior(likelihood, prior):
        # likelihood, prior, posterior have dimensions: 
        #               n_training_examples  X  n_timescales  X  n_hid
        # Guard against zero likelihoods so the normalization below never divides by zero
        off = 1e-30
        if prior.dtype == 'float16':
            off = 1e-5
        posterior = prior * likelihood + off
        # This doesn't work and I don't know why
        #posterior = tensor.switch(
        #            tensor.gt(tensor.max(posterior,axis=1,keepdims=True), 0.0),
        #                      posterior, off)
        posterior = posterior / tensor.sum(posterior,axis=1,keepdims=True)
        return posterior

    def _marginalize_timescale(quantity, timescale_prob):
        q = quantity.dimshuffle([1, 0, 2]).flatten(ndim=2).dimshuffle([1, 0])
        t = timescale_prob.dimshuffle([1, 0, 2]).flatten(ndim=2).dimshuffle([1, 0])
        return tensor.batched_dot(q, t).reshape([n_examples, n_hid])

    def _event_prob(intensity, delta_t):
        # remember that stationary_rate is subtracted from all intensities

        new_intensity = (intensity * 
                                  tensor.exp(-gamma * delta_t * (1.0 - alpha0)))
        return new_intensity

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m, state_below, delta_t_input, delta_t_output, h_, c_, yhat_):

        h  = _event_prob(h_, delta_t_input[:,None,None])

        if (not options['arch_remap_input']):
            event = state_below
        else:
            net = tparams[_p(prefix,'b')]
            if (options['arch_hpm_recurrent']):
                h_with_sr = h + stationary_rate
                # event versus ghost event?
                scaled_intensity_input = h_with_sr / (eta * alpha + h_with_sr)
                marginal_intensity_input = _marginalize_timescale(scaled_intensity_input, c_)
                net += tensor.dot(
                              marginal_intensity_input, tparams[_p(prefix,'U')])
                # marginal_intensity_input: n_examples X n_hid
                # U: n_hid X 2n_hid (with gated) or n_hid X n_hid (without)
            if (options['arch_hpm_gated']):
                # state_below: n_examples X n_hid
                # W:           n_hid X 2n_hid
                # b:                   2n_hid [broadcasting is R to L]
                net += tensor.dot(state_below, tparams[_p(prefix, 'W')])
                gate  = tensor.nnet.sigmoid(_slice(net, 1, n_hid))
                ungated_event = tensor.nnet.sigmoid(_slice(net, 0, n_hid))
                event = gate * ungated_event
            else:
                net += state_below
                event = tensor.nnet.sigmoid(net)

        # dimensions: n_training_examples X n_timescales X n_hid
        event = event.dimshuffle((0, 'x', 1))

        # credit assignment across timescales
        c = (event * _timescale_posterior(h + stationary_rate,c_) 
                                                             + (1.0-event) * c_)

        # update intensity 
        h += alpha * event

        # clear out updates after end of sequence
        c = m[:, None, None] * c + (1. - m)[:, None, None] * c_
        h = m[:, None, None] * h + (1. - m)[:, None, None] * h_

        # predict next event conditioned on timescale
        hhat_with_sr = _event_prob(h, delta_t_output[:,None,None]) + stationary_rate

        # event versus ghost event?
        scaled_intensity_output = hhat_with_sr / (eta * alpha + hhat_with_sr)
        # expectation of intensity 
        marginal_intensity_output = _marginalize_timescale(scaled_intensity_output, c)
                            # has dimensions n_training_examples X n_hid
        # event versus ghost event?
        marginal_intensity_output = (marginal_intensity_output / 
                               (eta + marginal_intensity_output))

        return h, c, marginal_intensity_output

    h = tensor.tensor3('h', dtype=config.floatX)
    # dimensions: n_training_examples  X  n_timescales  X  n_hid
    c = tensor.tensor3('c', dtype=config.floatX)
    # dimensions: n_training_examples  X  n_timescales  X  n_hid

    if (options['arch_hpm_prior_exp']):
        c0 = gamma ** tparams[_p(prefix,'priorexp')].dimshuffle(('x','x',0))
        c0 = c0 / tensor.sum(c0, axis=1)
    else:
        c0 = 1.0 / numpy_floatX(n_timescales)

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below, xt, yt],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),  # h
                                                           n_examples,
                                                           n_timescales,
                                                           n_hid),
                                              tensor.alloc(c0,  # c
                                                           n_examples,
                                                           n_timescales,
                                                           n_hid),
                                              tensor.alloc(numpy_floatX(0.),  # yhat
                                                           n_examples,
                                                           n_hid)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[2], rval[0], rval[1] # return yhat, h, c
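
A minimal shape sketch for calling hpm_layer. The shapes below are read off the scan allocations above; the concrete tparams keys depend on the arch_* options, so treat this as an illustration rather than the original repo's driver code.

# Hypothetical call; tparams must already contain the _p(prefix, ...) weights used above
# ('alpha0', 'mu', 'eta', 'b', and, depending on options, 'U', 'W', 'gamma_exp_for_alpha', ...).
# state_below: (n_steps, n_examples, n_hid)   per-step input events/features
# xt, yt:      (n_steps, n_examples)          time since last input / until next output event
# mask:        (n_steps, n_examples)          1 while the sequence is still active
yhat, h, c = hpm_layer(tparams, state_below, xt, yt, options, prefix='hpm', mask=mask)
# yhat: (n_steps, n_examples, n_hid)                  predicted event probabilities
# h, c: (n_steps, n_examples, n_timescales, n_hid)    intensities and timescale posteriors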
Ejemplo n.º 49
0
    def rnn(self,
            step_function,
            inputs,
            initial_states,
            go_backwards=False,
            mask=None,
            unroll=False,
            input_length=None):
        '''Iterates over the time dimension of a tensor.

        # Arguments
            inputs: tensor of temporal data of shape (samples, time, ...)
                (at least 3D).
            step_function:
                Parameters:
                    input: tensor with shape (samples, ...) (no time dimension),
                        representing input for the batch of samples at a certain
                        time step.
                    states: list of tensors.
                Returns:
                    output: tensor with shape (samples, ...) (no time dimension),
                    new_states: list of tensors, same length and shapes
                        as 'states'.
            initial_states: list of tensors, each of shape (samples, ...) (no time dimension),
                containing the initial values for the states used in
                the step function.
            go_backwards: boolean. If True, do the iteration over
                the time dimension in reverse order.
            mask: binary tensor with shape (samples, time),
                with a zero for every element that is masked.
            unroll: whether to unroll the RNN or to use a symbolic loop (`scan`).
            input_length: must be specified if using `unroll`.

        # Returns
            A tuple (last_output, outputs, new_states).
                last_output: the latest output of the rnn, of shape (samples, ...)
                outputs: tensor with shape (samples, time, ...) where each
                    entry outputs[s, t] is the output of the step function
                    at time t for sample s.
                new_states: list of tensors, latest states returned by
                    the step function, of shape (samples, ...).
        '''
        ndim = inputs.ndim
        assert ndim >= 3, 'Input should be at least 3D.'

        if unroll:
            if input_length is None:
                raise Exception(
                    'When specifying `unroll=True`, an `input_length` '
                    'must be provided to `rnn`.')

        axes = [1, 0] + list(range(2, ndim))
        inputs = inputs.dimshuffle(axes)

        if mask is not None:
            if mask.ndim == ndim - 1:
                mask = self.expand_dims(mask)
            assert mask.ndim == ndim
            mask = mask.dimshuffle(axes)

            if unroll:
                indices = list(range(input_length))
                if go_backwards:
                    indices = indices[::-1]

                successive_outputs = []
                successive_states = []
                states = initial_states
                for i in indices:
                    output, new_states = step_function(inputs[i], states)

                    if len(successive_outputs) == 0:
                        prev_output = self.zeros_like(output)
                    else:
                        prev_output = successive_outputs[-1]

                    output = T.switch(mask[i], output, prev_output)
                    kept_states = []
                    for state, new_state in zip(states, new_states):
                        kept_states.append(T.switch(mask[i], new_state, state))
                    states = kept_states

                    successive_outputs.append(output)
                    successive_states.append(states)

                outputs = T.stack(*successive_outputs)
                states = []
                for i in range(len(successive_states[-1])):
                    states.append(
                        T.stack(*[
                            states_at_step[i]
                            for states_at_step in successive_states
                        ]))
            else:
                # build an all-zero tensor of shape (samples, output_dim)
                initial_output = step_function(inputs[0],
                                               initial_states)[0] * 0
                # Theano gets confused by broadcasting patterns in the scan op
                initial_output = T.unbroadcast(initial_output, 0, 1)

                def _step(input, mask, output_tm1, *states):
                    output, new_states = step_function(input, states)
                    # output previous output if masked.
                    output = T.switch(mask, output, output_tm1)
                    return_states = []
                    for state, new_state in zip(states, new_states):
                        return_states.append(T.switch(mask, new_state, state))
                    return [output] + return_states

                results, _ = theano.scan(_step,
                                         sequences=[inputs, mask],
                                         outputs_info=[initial_output] +
                                         initial_states,
                                         go_backwards=go_backwards)

                # deal with Theano API inconsistency
                if type(results) is list:
                    outputs = results[0]
                    states = results[1:]
                else:
                    outputs = results
                    states = []
        else:
            if unroll:
                indices = list(range(input_length))
                if go_backwards:
                    indices = indices[::-1]

                successive_outputs = []
                successive_states = []
                states = initial_states
                for i in indices:
                    output, states = step_function(inputs[i], states)
                    successive_outputs.append(output)
                    successive_states.append(states)
                outputs = T.stack(*successive_outputs)
                states = []
                for i in range(len(successive_states[-1])):
                    states.append(
                        T.stack(*[
                            states_at_step[i]
                            for states_at_step in successive_states
                        ]))

            else:

                def _step(input, *states):
                    output, new_states = step_function(input, states)
                    return [output] + new_states

                results, _ = theano.scan(_step,
                                         sequences=inputs,
                                         outputs_info=[None] + initial_states,
                                         go_backwards=go_backwards)

                # deal with Theano API inconsistency
                if type(results) is list:
                    outputs = results[0]
                    states = results[1:]
                else:
                    outputs = results
                    states = []

        outputs = T.squeeze(outputs)
        last_output = outputs[-1]

        axes = [1, 0] + list(range(2, outputs.ndim))
        outputs = outputs.dimshuffle(axes)
        states = [T.squeeze(state[-1]) for state in states]
        return last_output, outputs, states
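
A minimal usage sketch for the rnn helper above. Here `backend` stands for an instance of the class that defines rnn (the name is not shown in the snippet), and the step function, parameter shapes, and sizes are illustrative assumptions, not part of the original code.

import numpy as np
import theano
import theano.tensor as T

num_features, num_units = 8, 16
floatX = theano.config.floatX
W_xh = theano.shared(0.01 * np.random.randn(num_features, num_units).astype(floatX))
W_hh = theano.shared(0.01 * np.random.randn(num_units, num_units).astype(floatX))
b_h = theano.shared(np.zeros(num_units, dtype=floatX))

def simple_step(x_t, states):
    # x_t: (samples, num_features); states: [h_tm1] with h_tm1: (samples, num_units)
    h_t = T.tanh(T.dot(x_t, W_xh) + T.dot(states[0], W_hh) + b_h)
    return h_t, [h_t]

inputs = T.tensor3('inputs')                    # (samples, time, num_features)
init_h = T.zeros((inputs.shape[0], num_units))  # (samples, num_units)
last_output, outputs, states = backend.rnn(simple_step, inputs, [init_h])
# outputs: (samples, time, num_units); last_output is the output at the final time step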
Ejemplo n.º 50
0

def rnn_step(X, H, U, b, W):
    """ One RNN step for all examples in a batch in parallel

    X: shape (batch_size, n_features) -> features at same time step for all examples in batch
    H: shape (batch_size, n_state) -> state at previous time step for all examples in batch
    U, b, W: RNN parameters
    
    returns: (batch_size, n_state) -> new state value at time step for all examples in batch
    """
    return T.tanh(b + T.dot(X, U) + T.dot(H, W))


results, updates = theano.scan(fn=rnn_step,
                               outputs_info=T.zeros_like(initial_state),
                               sequences=X.dimshuffle(1, 0, 2),
                               non_sequences=[U, b, W])
# results: (n_step, batch_size, n_state)


def pred_step(H, V, c):
    return T.nnet.sigmoid(c + T.dot(H, V))


preds, pupds = theano.scan(fn=pred_step,
                           outputs_info=None,
                           sequences=results,
                           non_sequences=[V, c])
# preds: (n_step, batch_size, n_out)
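
For the scan snippet above to build and compile, the symbolic inputs and parameters it references have to exist. A minimal setup sketch follows; every name not already used in the snippet, and all sizes, are illustrative assumptions.

import numpy as np
import theano
import theano.tensor as T

n_features, n_state, n_out = 4, 8, 1
floatX = theano.config.floatX

X = T.tensor3('X')                         # (batch_size, n_step, n_features)
initial_state = T.matrix('initial_state')  # (batch_size, n_state)

def shared_uniform(shape):
    scale = np.sqrt(6.0 / sum(shape))
    return theano.shared(np.random.uniform(-scale, scale, shape).astype(floatX))

U = shared_uniform((n_features, n_state))
W = shared_uniform((n_state, n_state))
b = theano.shared(np.zeros(n_state, dtype=floatX))
V = shared_uniform((n_state, n_out))
c = theano.shared(np.zeros(n_out, dtype=floatX))

# After the two scans above, per-step predictions can be compiled directly:
# predict = theano.function([X, initial_state], preds.dimshuffle(1, 0, 2))  # (batch, n_step, n_out)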

# ## SGD machinery
Ejemplo n.º 51
0
    def _run(self, num_features, num_timesteps, batch_size, mode):
        # determine shapes of inputs and targets depending on the batch size
        if batch_size == 1:
            inputs_size = (num_timesteps, num_features)
            targets_size = (num_timesteps, 1)
        else:
            inputs_size = (num_timesteps, batch_size, num_features)
            targets_size = (num_timesteps, batch_size, 1)

        # make inputs and targets shared variables
        inputs = theano.shared(self.rng.uniform(size=inputs_size).astype(
            config.floatX),
                               borrow=True)
        targets = theano.shared(self.rng.uniform(size=targets_size).astype(
            config.floatX),
                                borrow=True)

        # create symbolic inputs and targets variables
        if batch_size == 1:
            x = T.matrix('inputs')
            t = T.matrix('targets')
        else:
            x = T.tensor3('inputs')
            t = T.tensor3('targets')
        x.tag.test_value = inputs.get_value(borrow=True)
        t.tag.test_value = targets.get_value(borrow=True)

        # create a set of parameters for a simple RNN
        W_xh = theano.shared(
            (0.01 * self.rng.uniform(size=(num_features, 10))).astype(
                config.floatX),
            borrow=True)
        W_hh = theano.shared(
            (0.01 * self.rng.uniform(size=(10, 10))).astype(config.floatX),
            borrow=True)
        W_hy = theano.shared(
            (0.01 * self.rng.uniform(size=(10, 1))).astype(config.floatX),
            borrow=True)
        b_h = theano.shared(np.zeros(10).astype(config.floatX), borrow=True)
        b_y = theano.shared(np.zeros(1).astype(config.floatX), borrow=True)

        params = [W_xh, W_hh, W_hy, b_h, b_y]

        # recurrent function
        def step(x_t, h_tm1):
            h = T.tanh(T.dot(h_tm1, W_hh) + T.dot(x_t, W_xh) + b_h)
            return h

        # build recurrent graph
        if batch_size == 1:
            h_0 = T.alloc(0.0, 10).astype(config.floatX)
        else:
            h_0 = T.alloc(0.0, batch_size, 10).astype(config.floatX)
        h, updates = theano.scan(step, sequences=[x], outputs_info=[h_0])
        # network output
        y = T.dot(h, W_hy) + b_y

        # Create Gauss-Newton-Matrix object. Not really of any use here, but I
        # need it for Hessian-Free optimization.
        gn = GaussNewtonMatrix(y)

        # compute MSE
        cost = ((t - y)**2).sum(axis=1).mean()

        # Compute the cost at some other point in the parameter
        # space. Not really of any use here, but this is how I do it
        # during certain iterations of CG in the HF algorithm. There,
        # it's in fact `pi + current update proposal`.  For simplicity,
        # I just multiply by 2 here.
        cost_ = theano.clone(cost,
                             replace=dict([(pi, 2 * pi) for pi in params]))

        # Compute Gauss-Newton-Matrix times some vector `v` which is `p` in CG,
        # but for simplicity, I just take the parameters vector because it's
        # already there.
        Gv = gn(v=params, cost=cost, parameters=params, damp=T.constant(1.0))

        # compile Theano function
        f = theano.function([], [cost_] + Gv,
                            givens={
                                x: inputs,
                                t: targets
                            },
                            mode=mode)
        # execute
        f()
Ejemplo n.º 52
0
def lstm_layer(tparams, input_state, mask, options, prefix='lstm_layer'):

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step_f(m_, x_, h_, c_):
        preact = tensor.dot(x_, tparams[_p(prefix, 'Wf')]) + tparams[_p(prefix, 'bf')] + \
                 tensor.dot(h_, tparams[_p(prefix, 'Uf')])

        i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')]))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')]))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')]))
        c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')]))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    def _step_b(m_, x_, h_, c_):
        preact = tensor.dot(x_, tparams[_p(prefix, 'Wb')]) + tparams[_p(prefix, 'bb')] + \
                 tensor.dot(h_, tparams[_p(prefix, 'Ub')])

        i = tensor.nnet.sigmoid(_slice(preact, 0, options[_p(prefix, 'n')]))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options[_p(prefix, 'n')]))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options[_p(prefix, 'n')]))
        c = tensor.tanh(_slice(preact, 3, options[_p(prefix, 'n')]))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c


    dim_proj = options[_p(prefix, 'n')]
    ##############################################################################################
    rval_f, updates_f = theano.scan(_step_f,
                                sequences=[mask, input_state],
                                outputs_info=[tensor.alloc(numpy_floatX(0.), input_state.shape[1], dim_proj),
                                              tensor.alloc(numpy_floatX(0.), input_state.shape[1], dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=input_state.shape[0])

    rval_b, updates_b = theano.scan(_step_b,
                                sequences=[mask, input_state],
                                outputs_info=[tensor.alloc(numpy_floatX(0.), input_state.shape[1], dim_proj),
                                              tensor.alloc(numpy_floatX(0.), input_state.shape[1], dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=input_state.shape[0],
                                go_backwards=True)


    proj_0 = rval_f[0] + rval_b[0][::-1]

    # Attention
    y_0 = (tensor.tanh(proj_0) * mask[:, :, None]) * tparams[_p(prefix, 'V')]
    y_0 = y_0.sum(axis=2).transpose()
    alpha = tensor.nnet.softmax(y_0).transpose()
    proj_0 = proj_0 * alpha[:, :, None]#(proj_0 * mask[:, :, None])

    proj_0 = proj_0.sum(axis=0)#(proj_0 * mask[:, :, None])
    ##############################################################################################


    proj_0 = tensor.tanh(proj_0)

    return proj_0
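
A minimal shape sketch for the bidirectional LSTM encoder above. The shapes are read off the scan calls; the parameter keys listed are the _p(prefix, ...) names used in _step_f/_step_b and their shapes are inferred, so treat this as an assumption about the surrounding code.

# tparams must hold, for the chosen prefix: 'Wf', 'Uf', 'bf', 'Wb', 'Ub', 'bb', 'V',
# with Wf/Wb: (dim_in, 4*n), Uf/Ub: (n, 4*n), bf/bb: (4*n,), and V: (n,) for attention scores,
# where n = options[_p(prefix, 'n')].
# input_state: (n_steps, n_samples, dim_in)   embedded input sequence
# mask:        (n_steps, n_samples)           1 for real tokens, 0 for padding
sentence_vec = lstm_layer(tparams, input_state, mask, options, prefix='lstm_layer')
# sentence_vec: (n_samples, n) attention-weighted, tanh-squashed summary of the sequence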
Ejemplo n.º 53
0
def _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, c_g, c_l, n_mcsamples,
            random_seed):
    """Return expression of approximate ELBO based on Monte Carlo sampling.
    """
    if random_seed is None:
        r = MRG_RandomStreams(gen_random_state())
    else:
        r = MRG_RandomStreams(seed=random_seed)

    normal_const = floatX(1 + np.log(2.0 * np.pi))

    elbo = 0

    # Sampling local variational parameters
    if uw_l is not None:
        l_l = (uw_l.size / 2).astype('int32')
        u_l = uw_l[:l_l]
        w_l = uw_l[l_l:]
        ns_l = r.normal(size=(n_mcsamples, inarray_l.tag.test_value.shape[0]))
        zs_l = ns_l * tt.exp(w_l) + u_l
        elbo += tt.sum(c_l * (w_l + 0.5 * normal_const))
    else:
        zs_l = None

    # Sampling global variational parameters
    if uw_g is not None:
        l_g = (uw_g.size / 2).astype('int32')
        u_g = uw_g[:l_g]
        w_g = uw_g[l_g:]
        ns_g = r.normal(size=(n_mcsamples, inarray_g.tag.test_value.shape[0]))
        zs_g = ns_g * tt.exp(w_g) + u_g
        elbo += tt.sum(c_g * (w_g + 0.5 * normal_const))
    else:
        zs_g = None

    if (zs_l is not None) and (zs_g is not None):

        def logp_(z_g, z_l):
            return theano.clone(logp,
                                OrderedDict({
                                    inarray_g: z_g,
                                    inarray_l: z_l
                                }),
                                strict=False)

        sequences = [zs_g, zs_l]

    elif zs_l is not None:

        def logp_(z_l):
            return theano.clone(logp,
                                OrderedDict({inarray_l: z_l}),
                                strict=False)

        sequences = [zs_l]

    else:

        def logp_(z_g):
            return theano.clone(logp,
                                OrderedDict({inarray_g: z_g}),
                                strict=False)

        sequences = [zs_g]

    logps, _ = theano.scan(fn=logp_, outputs_info=None, sequences=sequences)
    elbo += tt.mean(logps)

    return elbo
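
The expression built above is the standard reparameterized Monte Carlo ELBO estimate. A small NumPy sketch of the same computation for a single, global-only factorized Gaussian posterior (purely illustrative; log_joint is a user-supplied function and the c_g/c_l scaling factors are taken to be 1):

import numpy as np

def elbo_estimate(u, w, log_joint, n_mcsamples=100, rng=np.random):
    """u, w: mean and log-std of q(z) = N(u, exp(w)^2); log_joint(z) = log p(x, z)."""
    d = u.shape[0]
    eps = rng.normal(size=(n_mcsamples, d))
    zs = u + np.exp(w) * eps                                   # reparameterized samples z ~ q
    expected_logp = np.mean([log_joint(z) for z in zs])        # MC estimate of E_q[log p(x, z)]
    entropy = np.sum(w + 0.5 * (1.0 + np.log(2.0 * np.pi)))    # entropy of the Gaussian q
    return expected_logp + entropy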
Ejemplo n.º 54
0
    def test_machine_translation(self):
        """
        This test case comes from https://github.com/rizar/scan-grad-speed and
        is an example of actual computation done with scan in the context of
        machine translation

        'dim' has been reduced from 1000 to 5 to make the test run faster
        """

        # Parameters from an actual machine translation run
        batch_size = 80
        seq_len = 50
        n_words = 80 * 50
        dim = 5

        # Weight matrices
        U = theano.shared(
            np.random.normal(size=(dim, dim),
                             scale=0.0001).astype(config.floatX))
        U.name = 'U'
        V = theano.shared(U.get_value())
        V.name = 'V'
        W = theano.shared(U.get_value())
        W.name = 'W'

        # Variables and their values
        x = T.tensor3('x')
        x_value = np.random.normal(size=(seq_len, batch_size, dim),
                                   scale=0.0001).astype(config.floatX)

        ri = T.tensor3('ri')
        ri_value = x_value

        zi = T.tensor3('zi')
        zi_value = x_value

        init = T.alloc(np.cast[config.floatX](0), batch_size, dim)

        def rnn_step1(
                # sequences
                x,
                ri,
                zi,
                # outputs_info
                h):
            pre_r = ri + h.dot(U)
            pre_z = zi + h.dot(V)
            r = T.nnet.sigmoid(pre_r)
            z = T.nnet.sigmoid(pre_z)

            after_r = r * h
            pre_h = x + after_r.dot(W)
            new_h = T.tanh(pre_h)

            res_h = z * new_h + (1 - z) * h
            return res_h

        # Compile the function twice, once with the optimization and once
        # without
        opt_mode = mode.including("scan")
        h, _ = theano.scan(rnn_step1,
                           sequences=[x, ri, zi],
                           n_steps=seq_len,
                           outputs_info=init,
                           name='fpass1',
                           mode=opt_mode)
        cost = h[-1].sum()
        grad1 = T.grad(cost, [U, V, W])
        f_opt = theano.function(inputs=[x, ri, zi],
                                outputs=grad1,
                                mode=opt_mode)

        no_opt_mode = mode.excluding("scanOp_pushout_output")
        h, _ = theano.scan(rnn_step1,
                           sequences=[x, ri, zi],
                           n_steps=seq_len,
                           outputs_info=init,
                           name='fpass1',
                           mode=no_opt_mode)
        cost = h[-1].sum()
        grad1 = T.grad(cost, [U, V, W])
        f_no_opt = theano.function(inputs=[x, ri, zi],
                                   outputs=grad1,
                                   mode=no_opt_mode)

        # Validate that the optimization has been applied
        scan_node_grad = [
            node for node in f_opt.maker.fgraph.toposort()
            if isinstance(node.op, Scan)
        ][1]

        for output in scan_node_grad.op.outputs:
            assert not (
                isinstance(output.owner.op, T.elemwise.Elemwise)
                and any([isinstance(i, T.Dot) for i in output.owner.inputs]))

        # Compare the outputs of the two functions on the same input data.
        f_opt_output = f_opt(x_value, ri_value, zi_value)
        f_no_opt_output = f_no_opt(x_value, ri_value, zi_value)
        utt.assert_allclose(f_opt_output, f_no_opt_output)
Ejemplo n.º 55
0
def plot_zero_crossing(K = 600):
    x = T.ftensor3()

    def f(X):
        X_ = T.zeros_like(X)
        X_ = T.set_subtensor(X_[:,:,0], (6.0 / (mass * length * length)) * \
                                        ((2.0 * X[:,:,2] - 3.0 * T.cos(X[:,:,0] - X[:,:,1]) * X[:,:,3]) / \
                                        (16.0 - 9.0 * T.square(T.cos(X[:,:,0] - X[:,:,1])))))
        X_ = T.set_subtensor(X_[:,:,1], (6.0 / (mass * length * length)) * \
                                        (8.0 * X[:,:,3] - 3.0 * T.cos(X[:,:,0] - X[:,:,1]) * X[:,:,2]) / \
                                        (16.0 - 9.0 * T.square(T.cos(X[:,:,0] - X[:,:,1]))))
        X_ = T.set_subtensor(X_[:,:,2], -0.5 * mass * length * length * (X_[:,:,0] * X_[:,:,1] * T.sin(X[:,:,0] - X[:,:,1]) + \
                                                                     3.0 * gravity / length * T.sin(X[:,:,0])))
        X_ = T.set_subtensor(X_[:,:,3], -0.5 * mass * length * length * (-X_[:,:,0] * X_[:,:,1] * T.sin(X[:,:,0] - X[:,:,1]) + \
                                                                         gravity / length * T.sin(X[:,:,1])))
        return X_

    def step(X):
        k1 = h * f(X)
        k2 = h * f(X + 0.5 * k1)
        k3 = h * f(X + 0.5 * k2)
        k4 = h * f(X + k3)

        X_ = X + (1.0 / 6.0) * k1 + (1.0 / 3.0) * k2 + (1.0 / 3.0) * k3 + (1.0 / 6.0) * k4

        return X_

    result, _ = theano.scan(fn=step,
                            outputs_info=x,
                            n_steps=N)

    RK4 = theano.function([x,h,mass,length,gravity,N],
                          result,
                          allow_input_downcast=True)

    l      = 1.0
    m      = 1.0
    g      = 9.81

    theta1_, theta2_ = np.meshgrid(np.linspace(-np.pi, np.pi, K),
                                   np.linspace(-np.pi, np.pi, K))

    initial_states = np.stack((theta1_, theta2_, np.zeros_like(theta1_), np.zeros_like(theta2_)), axis=2)

    state_array = RK4(initial_states, 0.025, m, l, g, 1000)

    min_crossing_time = []

    for i in range(state_array.shape[1]):
        for j in range(state_array.shape[2]):
            theta_diff    = np.mod(state_array[:,i,j,0] - state_array[:,i,j,1] - np.pi, 2.0 * np.pi)
            crossings     = np.abs(np.diff(theta_diff)) > np.pi

            if np.sum(crossings) == 0:
                min_crossing_time.append(np.nan)
            else:
                min_crossing_time.append(np.min(np.where(crossings)))

    min_crossing_time = np.array(min_crossing_time)

    ax = plt.subplot(111)
    ax.imshow(np.log(min_crossing_time.reshape(K,K)),
              cmap='Blues_r',
              origin='lower',
              extent=[np.min(theta1_), np.max(theta1_), np.min(theta2_), np.max(theta2_)])
    ax.set_aspect('equal')
    plt.savefig(os.path.join(__file__.split('.')[0], 'TimeToFlip.png'), dpi=400)
    plt.show()
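
The step function above is the classical fourth-order Runge-Kutta update. A small self-contained NumPy version of the same integrator, useful as a reference when checking the scan-based one (function and variable names here are illustrative, not from the original script):

import numpy as np

def rk4_step(f, X, h):
    """One RK4 step of dX/dt = f(X) with step size h."""
    k1 = h * f(X)
    k2 = h * f(X + 0.5 * k1)
    k3 = h * f(X + 0.5 * k2)
    k4 = h * f(X + k3)
    return X + (k1 + 2.0 * k2 + 2.0 * k3 + k4) / 6.0

# Example: integrate dx/dt = -x from x0 = 1 for 1000 steps of size 0.01;
# the result should be very close to exp(-10).
x = np.array([1.0])
for _ in range(1000):
    x = rk4_step(lambda v: -v, x, 0.01)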
Ejemplo n.º 56
0
def convolve1d_4D_conv2d(input, W, mode='full'):
  conv_out, _ = theano.scan(fn=lambda i: conv2d(input[:,:,:,i:i+1], W[:,:,:,i:i+1], border_mode=mode),
                                outputs_info=None,
                                sequences=[T.arange(0, W.shape[3])])
  conv_out = conv_out.flatten(ndim=4).dimshuffle(1,2,3,0)
  return conv_out
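
A minimal usage sketch for the helper above. It applies an independent 1D convolution along axis 2 for every position along axis 3, so W's last dimension is assumed to match the input's last dimension; conv2d is assumed to be theano.tensor.nnet.conv2d (or the older conv.conv2d) imported in the helper's module, and all sizes here are illustrative.

import numpy as np
import theano
import theano.tensor as T

batch, n_in, length, width = 2, 3, 7, 5
n_filters, filter_len = 4, 3
floatX = theano.config.floatX

x = T.tensor4('x')  # (batch, n_in, length, width)
W = theano.shared(np.random.randn(n_filters, n_in, filter_len, width).astype(floatX))

out = convolve1d_4D_conv2d(x, W, mode='full')
f = theano.function([x], out)
y = f(np.random.randn(batch, n_in, length, width).astype(floatX))
# y.shape == (batch, n_filters, length + filter_len - 1, width) for mode='full'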
Ejemplo n.º 57
0
    def __init__(self, nh, nc, ne, de, cs):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size
        '''
        #assert st in ['proba', 'argmax']

        self.emb = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
                   (ne+1, de)).astype(theano.config.floatX)) # add one for PADDING at the end
        self.Wx  = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
                   (de * cs, nh)).astype(theano.config.floatX))
        self.Ws  = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
                   (nc, nh)).astype(theano.config.floatX))
        self.W   = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
                   (nh, nc)).astype(theano.config.floatX))
        self.bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))
        self.b = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))
        self.s0 = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))

        # bundle
        self.params = [
            self.emb, self.Wx, self.Ws, self.W, self.bh, self.b, self.s0
        ]
        self.names = ['embeddings', 'Wx', 'Wh', 'W', 'bh', 'b', 's0']
        idxs = T.imatrix(
        )  # as many columns as context window size/lines as words in the sentence
        x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
        y = T.iscalar('y')  # label

        def recurrence(x_t, s_tm1):
            h_t = T.nnet.sigmoid(T.dot(x_t, self.Wx) + \
                                 T.dot(s_tm1, self.Ws) + self.bh)
            s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)[0]
            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence, \
            sequences=x, outputs_info=[None, self.s0], \
            n_steps=x.shape[0])

        p_y_given_x_lastword = s[-1, :]
        p_y_given_x_sentence = s
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        nll = -T.mean(T.log(p_y_given_x_lastword)[y])
        gradients = T.grad(nll, self.params)
        updates = OrderedDict(
            (p, p - lr * g) for p, g in zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)

        self.train = theano.function(inputs=[idxs, y, lr],
                                     outputs=nll,
                                     updates=updates)

        self.normalize = theano.function( inputs = [],
                         updates = {self.emb:\
                         self.emb/T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0,'x')})
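
A minimal usage sketch for the class above, assuming it is named ElmanRNN (the class name is not shown in the snippet) and that the context-window matrix idxs is built elsewhere; all sizes are illustrative.

import numpy as np

# nh=hidden size, nc=classes, ne=vocab size, de=embedding dim, cs=context window
model = ElmanRNN(nh=50, nc=3, ne=1000, de=25, cs=5)

# one sentence of 7 words, each represented by a context window of 5 word indices
idxs = np.random.randint(0, 1000, size=(7, 5)).astype('int32')
label = 2                                # class associated with the last word

loss = model.train(idxs, label, 0.01)    # one SGD step at learning rate 0.01
model.normalize()                        # re-normalize the embedding rows
pred = model.classify(idxs)              # per-word argmax predictions, shape (7,)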
Ejemplo n.º 58
0
 def forward_pass(x, dropout):
     if dropout != 0.0:
         x *= theano_rng.binomial(
             n=1,
             p=1 - dropout,
             size=x.shape,
             dtype=theano.config.floatX) / (1 - dropout)
     for i in range(Nlayers):
         h = (x.dimshuffle((1, 0, 2)).dot(self.Win)
              if i == 0 else h.dot(self.Wup[i - 1])) + self.Bhid[i]
         rep = lambda x: T.extra_ops.repeat(
             x.reshape((1, -1)), h.shape[1], axis=0)
         if Ah != "lstm":
             h = T.concatenate([
                 theano.scan(
                     fn=step_rnn,
                     sequences=[
                         h[:, :, Nh * d:Nh * (d + 1)], mask_float[d]
                     ],
                     outputs_info=[rep(self.h0[i, d])],
                     non_sequences=[
                         self.Wrec[i, d],
                         rep(self.h0[i, d])
                     ],
                     go_backwards=(d == 1),
                 )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)
             ],
                               axis=2)
         else:
             h = T.concatenate([
                 theano.scan(
                     fn=step_lstm,
                     sequences=[
                         h[:, :, Nh * 4 * d:Nh * 4 *
                           (d + 1)], mask_float[d]
                     ],
                     outputs_info=[
                         rep(self.c0[i, d]),
                         rep(self.h0[i, d])
                     ],
                     non_sequences=[
                         self.Wrec[i, d],
                         rep(self.c0[i, d]),
                         rep(self.h0[i, d])
                     ],
                     go_backwards=(d == 1),
                 )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)
             ],
                               axis=2)
         if dropout != 0.0:
             h *= theano_rng.binomial(
                 n=1,
                 p=1 - dropout,
                 size=h.shape,
                 dtype=theano.config.floatX) / (1 - dropout)
     h = h.dimshuffle((1, 0, 2))
     if predictPer == "sequence":
         h = T.concatenate([
             h[mask_int[1 - d]][:, Nh * d:Nh * (d + 1)]
             for d in range(Ndirs)
         ],
                           axis=1)
     return ACTIVATION[Ay](h.dot(self.Wout) + self.Bout)
    def get_output_for(self, inputs, **kwargs):
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        #input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        # We will always pass the hidden-to-hidden layer params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)

            hid_pre += input_n

            # Clip gradients
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = T.switch(mask_n, hid, hid_previous)
            return [hid_out]

        if mask is not None:
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])[0]
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            #hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[::-1,:]

        return hid_out
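
The comment above about repeating hid_init via a dot product refers to the usual ones-vector tiling trick. A tiny standalone sketch of just that piece (shapes are illustrative):

import numpy as np
import theano
import theano.tensor as T

num_units = 3
hid_init = theano.shared(np.arange(num_units, dtype=theano.config.floatX))  # (num_units,)
num_batch = T.iscalar('num_batch')

# (num_batch, 1) dot (1, num_units) -> (num_batch, num_units); same result as T.repeat
tiled = T.dot(T.ones((num_batch, 1)), hid_init.dimshuffle('x', 0))
f = theano.function([num_batch], tiled)
print(f(2))   # [[0. 1. 2.], [0. 1. 2.]]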
Ejemplo n.º 60
0
                                                                 gravity / length * T.sin(X[1])))
    return X_


def step(X):
    k1 = h * f(X)
    k2 = h * f(X + 0.5 * k1)
    k3 = h * f(X + 0.5 * k2)
    k4 = h * f(X + k3)

    X_ = X + (1.0 / 6.0) * k1 + (1.0 / 3.0) * k2 + (1.0 / 3.0) * k3 + (1.0 / 6.0) * k4

    return X_

result, _ = theano.scan(fn=step,
                        outputs_info=x,
                        n_steps=N)

RK4 = theano.function([x,h,mass,length,gravity,N],
                      result,
                      allow_input_downcast=True)




def plot_path():
    theta1 = np.random.uniform(0.0, 2.0 * np.pi)
    theta2 = np.random.uniform(0.0, 2.0 * np.pi)
    l      = 1.0
    m      = 1.0
    g      = 9.81