Example No. 1
def stack_and_shared(_input):
    """
    This will take a list of input variables, turn them into theano shared variables, and return them stacked
    in a single tensor.

    :param _input: list of input variables
    :type _input: list, object, or None

    :return: symbolic tensor of the input variables stacked, or None
    :rtype: Tensor or None
    """
    if _input is None:
        return None
    elif isinstance(_input, list):
        shared_ins = []
        for _in in _input:
            try:
                shared_ins.append(theano.shared(_in))
            except TypeError as _:
                shared_ins.append(_in)
        return T.stack(shared_ins)
    else:
        try:
            _output = [theano.shared(_input)]
        except TypeError as _:
            _output = [_input]
        return T.stack(_output)
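A minimal usage sketch of the helper above (assuming theano and theano.tensor as T are imported as in the snippet; the input values are hypothetical):

import numpy as np
import theano
import theano.tensor as T

# Hypothetical call: three NumPy vectors become theano shared variables
# stacked into a single (3, 2) symbolic tensor.
vectors = [np.ones(2), np.zeros(2), np.arange(2.0)]
stacked = stack_and_shared(vectors)
print(stacked.eval())  # evaluates to a (3, 2) array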
Example No. 2
        def _step(x_, h_, c_, pred_, prob_):
            h_a = []
            c_a = []
            for it in range(self.n_levels):
                preact = T.dot(h_[it], self.U[it])
                preact += T.dot(x_, self.W[it]) + self.b[it]

                i = T.nnet.sigmoid(_slice(preact, 0, self.n_dim))
                f = T.nnet.sigmoid(_slice(preact, 1, self.n_dim))
                o = T.nnet.sigmoid(_slice(preact, 2, self.n_dim))
                c = T.tanh(_slice(preact, 3, self.n_dim))

                c = f * c_[it] + i * c
                h = o * T.tanh(c)

                h_a.append(h)
                c_a.append(c)

                x_ = h

            q = T.dot(h, self.L) + self.b0
            prob = T.nnet.softmax(q)
            pred = T.argmax(prob, axis=1)

            return T.stack(h_a).squeeze(), T.stack(c_a).squeeze(), pred, prob
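The step function above relies on a _slice helper that is not shown; a common definition (as in the classic Theano LSTM tutorial, and assumed here) cuts the n-th block of width dim out of the last axis of the preactivation:

def _slice(_x, n, dim):
    # Return the n-th slice of width `dim` along the last axis of _x.
    if _x.ndim == 3:
        return _x[:, :, n * dim:(n + 1) * dim]
    return _x[:, n * dim:(n + 1) * dim]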
Example No. 3
    def predict_K(self, x, z, params):
        # s_mean, s_x for computing mean from s_x
        Ks = []
        Ks_new = []
        offset = 0
        for kern, slice_k in zip(self.kernels, self.slices):
            params_k = params[offset: offset + kern.n_params]
            K_k, K_new_k = kern.predict_K(
                x[:, slice_k], z[:, slice_k], params_k)
            Ks.append(K_k)
            Ks_new.append(K_new_k)
            offset += kern.n_params

        log_weights = TT.concatenate((np.asarray([0]),
                                      params[offset:offset + self.n_my_params]))
        weights = TT.exp(log_weights) / TT.exp(log_weights).sum()

        if len(self.kernels) == 1:
            return Ks[0], Ks_new[0]
        else:
            # XXX: log_K, should be logadd here (#11)
            wK = TT.sum(
                weights[:, None, None] * TT.stack(*Ks), axis=0)
            wK_new = TT.sum(
                weights[:, None, None] * TT.stack(*Ks_new), axis=0)
            return wK, wK_new
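A NumPy stand-in for the weighted combination at the end of predict_K (values are hypothetical); it shows how broadcasting the normalized weights over the stacked Gram matrices yields the blended kernel:

import numpy as np

Ks = [np.eye(3), np.ones((3, 3))]        # hypothetical per-kernel Gram matrices
log_w = np.array([0.0, 0.5])             # first log-weight pinned to 0
w = np.exp(log_w) / np.exp(log_w).sum()  # normalized weights
wK = np.sum(w[:, None, None] * np.stack(Ks), axis=0)  # (3, 3) combined kernel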
Example No. 4
    def _for_step(self,
                  xi_t, xf_t, xo_t, xc_t, mask_t,
                  h_tm1, c_tm1,
                  context, context_mask, context_att_trans,
                  hist_h, hist_h_att_trans,
                  b_u):

        # context: (batch_size, context_size, context_dim)

        # (batch_size, att_layer1_dim)
        h_tm1_att_trans = T.dot(h_tm1, self.att_h_W1)

        # (batch_size, context_size, att_layer1_dim)
        att_hidden = T.tanh(context_att_trans + h_tm1_att_trans[:, None, :])

        # (batch_size, context_size, 1)
        att_raw = T.dot(att_hidden, self.att_W2) + self.att_b2

        # (batch_size, context_size)
        ctx_att = T.exp(att_raw).reshape((att_raw.shape[0], att_raw.shape[1]))

        if context_mask:
            ctx_att = ctx_att * context_mask

        ctx_att = ctx_att / T.sum(ctx_att, axis=-1, keepdims=True)

        # (batch_size, context_dim)
        ctx_vec = T.sum(context * ctx_att[:, :, None], axis=1)

        ##### attention over history #####

        if hist_h:
            hist_h = T.stack(hist_h).dimshuffle((1, 0, 2))
            hist_h_att_trans = T.stack(hist_h_att_trans).dimshuffle((1, 0, 2))
            h_tm1_hatt_trans = T.dot(h_tm1, self.hatt_h_W1)

            hatt_hidden = T.tanh(hist_h_att_trans + h_tm1_hatt_trans[:, None, :])
            hatt_raw = T.dot(hatt_hidden, self.hatt_W2) + self.hatt_b2
            hatt_raw = hatt_raw.flatten(2)
            h_att_weights = T.nnet.softmax(hatt_raw)

            # (batch_size, output_dim)
            h_ctx_vec = T.sum(hist_h * h_att_weights[:, :, None], axis=1)
        else:
            h_ctx_vec = T.zeros_like(h_tm1)

        ##### attention over history #####

        i_t = self.inner_activation(xi_t + T.dot(h_tm1 * b_u[0], self.U_i) + T.dot(ctx_vec, self.C_i) + T.dot(h_ctx_vec, self.H_i))
        f_t = self.inner_activation(xf_t + T.dot(h_tm1 * b_u[1], self.U_f) + T.dot(ctx_vec, self.C_f) + T.dot(h_ctx_vec, self.H_f))
        c_t = f_t * c_tm1 + i_t * self.activation(xc_t + T.dot(h_tm1 * b_u[2], self.U_c) + T.dot(ctx_vec, self.C_c) + T.dot(h_ctx_vec, self.H_c))
        o_t = self.inner_activation(xo_t + T.dot(h_tm1 * b_u[3], self.U_o) + T.dot(ctx_vec, self.C_o) + T.dot(h_ctx_vec, self.H_o))
        h_t = o_t * self.activation(c_t)

        h_t = (1 - mask_t) * h_tm1 + mask_t * h_t
        c_t = (1 - mask_t) * c_tm1 + mask_t * c_t

        # ctx_vec = theano.printing.Print('ctx_vec')(ctx_vec)

        return h_t, c_t, ctx_vec
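A small NumPy sketch of the context-attention normalization used in _for_step (shapes and values are made up): exponentiate the raw scores, zero out masked positions, renormalize, and take the attention-weighted sum of the context vectors:

import numpy as np

att_raw = np.random.randn(4, 7)          # (batch_size, context_size)
context = np.random.randn(4, 7, 16)      # (batch_size, context_size, context_dim)
mask = np.ones((4, 7)); mask[:, 5:] = 0  # hypothetical padding mask

ctx_att = np.exp(att_raw) * mask
ctx_att = ctx_att / ctx_att.sum(axis=-1, keepdims=True)
ctx_vec = (context * ctx_att[:, :, None]).sum(axis=1)  # (batch_size, context_dim)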
Example No. 5
def stack_and_shared(input):
    """
    This will take a list of input variables, turn them into theano shared variables, and return them stacked
    in a single tensor.

    Parameters
    ----------
    input : list or object
        List of input variables to stack into a single shared tensor.

    Returns
    -------
    tensor
        Symbolic tensor of the input variables stacked, or None if input was None.
    """
    if input is None:
        return None
    elif isinstance(input, list):
        shared_ins = []
        for _in in input:
            try:
                shared_ins.append(theano.shared(_in))
            except TypeError as _:
                shared_ins.append(_in)
        return T.stack(shared_ins)
    else:
        try:
            _output = [theano.shared(input)]
        except TypeError as _:
            _output = [input]
        return T.stack(_output)
Example No. 6
        def forward_prop_step_stack(x_t, masks, h_prevs, c_prevs, stack_prevs, ptrs_to_top_prevs):
            # determine, for all layers, if this input was a push/pop
            is_push, is_pop = map_push_pop(x_t, self.PUSH, self.POP)
            is_null = get_is_null(x_t, self.NULL)

            nonsymbolic_hs = []
            nonsymbolic_cs = []
            nonsymbolic_stacks = []
            nonsymbolic_ptrs_to_tops = []

            h = x_t
            for i,layer in enumerate(self.layers):
                h, c, stack, ptrs_to_top = layer.forward_prop_stack(h, h_prevs[i,:,:], c_prevs[i,:,:], stack_prevs[i,:,:,:], ptrs_to_top_prevs[i,:,:,:], is_push, is_pop, is_null)
                h = h*masks[:,:,i] / self.dropout # inverted dropout for scaling

                nonsymbolic_hs.append(h)
                nonsymbolic_cs.append(c)
                nonsymbolic_stacks.append(stack)
                nonsymbolic_ptrs_to_tops.append(ptrs_to_top)
            
            h_s = T.stack(nonsymbolic_hs)
            c_s = T.stack(nonsymbolic_cs)
            stack_s = T.stack(nonsymbolic_stacks)
            ptrs_to_top_s = T.stack(nonsymbolic_ptrs_to_tops)

            o_t = self.W_hy.dot(h)
            
            return o_t, h_s, c_s, stack_s, ptrs_to_top_s
Example No. 7
 def tangent2ambient(self, X, Z):
     U = tensor.stack((X.U.dot(Z.M) + Z.Up, X.U), 0).reshape((-1, X.U.shape[1]))
     #U = np.hstack((X.U.dot(Z.M) + Z.Up, X.U))
     S = tensor.eye(2*self._k)
     V = tensor.stack((X.V, Z.Vp), 1).reshape((X.V.shape[0], -1))
     #V = np.vstack((X.V, Z.Vp))
     return ManifoldElementShared.from_vars((U, S, V), shape=(self._m, self._n), r=self._k)
 def tangent2ambient(self, X, Z):
     U = tensor.stack((X.U.dot(Z.M) + Z.Up, X.U), 0).reshape((-1, X.U.shape[1]))
     #U = np.hstack((X.U.dot(Z.M) + Z.Up, X.U))
     S = tensor.eye(2*self._k)
     V = tensor.stack((X.V, Z.Vp), 1).reshape((X.V.shape[0], -1))
     #V = np.vstack((X.V, Z.Vp))
     return (U, S, V)
    def retr(self, X, Z, t=None):
        U, S, V = X
        Up, M, Vp = Z
        if t is None:
            t = 1.0
        Qu, Ru = tensor.nlinalg.qr(Up)

        # we need rq decomposition here
        Qv, Rv = tensor.nlinalg.qr(Vp[::-1].T)
        Rv = Rv.T[::-1]
        Rv = Rv[:, ::-1]
        Qv = Qv.T[::-1]

        # now we have rq decomposition (Rv @ Qv = Z.Vp)
        #Rv, Qv = rq(Z.Vp, mode='economic')


        zero_block = tensor.zeros((Ru.shape[0], Rv.shape[1]))
        block_mat = tensor.stack(
            (
                tensor.stack((S + t * M, t * Rv), 1).reshape((Rv.shape[0], -1)),
                tensor.stack((t * Ru, zero_block), 1).reshape((Ru.shape[0], -1))
            )
        ).reshape((-1, Ru.shape[1] + Rv.shape[1]))

        Ut, St, Vt = tensor.nlinalg.svd(block_mat, full_matrices=False)

        U_res = tensor.stack((U, Qu), 1).reshape((Qu.shape[0], -1)).dot(Ut[:, :self._k])
        V_res = Vt[:self._k, :].dot(tensor.stack((V, Qv), 0).reshape((-1, Qv.shape[1])))
        # add some machine eps to get a slightly perturbed element of a manifold
        # even if we have some zeros in S
        S_res = tensor.diag(St[:self._k]) + tensor.diag(np.spacing(1) * tensor.ones(self._k))
        return (U_res, S_res, V_res)
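A NumPy check of the reversed-QR trick used above (the comment notes Rv @ Qv = Z.Vp): taking a QR of the row-reversed transpose and flipping the factors gives an RQ decomposition. The matrix below is hypothetical:

import numpy as np

Vp = np.random.randn(3, 5)
Q, R = np.linalg.qr(Vp[::-1].T)
R = R.T[::-1]
R = R[:, ::-1]
Q = Q.T[::-1]
assert np.allclose(R.dot(Q), Vp)  # R is upper triangular, so this is an RQ decomposition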
Example No. 10
    def generate(self, h_, c_, x_):
        h_a = []
        c_a = []
        for it in range(self.n_levels):
            preact = T.dot(x_, self.W[it])
            preact += T.dot(h_[it], self.U[it]) + self.b[it]

            i = T.nnet.sigmoid(self.slice(preact, 0, self.n_dim))
            f = T.nnet.sigmoid(self.slice(preact, 1, self.n_dim))
            o = T.nnet.sigmoid(self.slice(preact, 2, self.n_dim))
            c = T.tanh(self.slice(preact, 3, self.n_dim))

            c = f * c_[it] + i * c
            h = o * T.tanh(c)

            h_a.append(h)
            c_a.append(c)

            x_ = h

        q = T.dot(h, self.L) + self.b0
        # mask = T.concatenate([T.alloc(np_floatX(1.), q.shape[0] - 1), T.alloc(np_floatX(0.), 1)])
        prob = T.nnet.softmax(q / 1)

        return prob, T.stack(h_a).squeeze(), T.stack(c_a)[0].squeeze()
Example No. 11
 def func(chol_vec, delta):
     chol = tt.stack([
         tt.stack([tt.exp(0.1 * chol_vec[0]), 0]),
         tt.stack([chol_vec[1], 2 * tt.exp(chol_vec[2])]),
     ])
     cov = tt.dot(chol, chol.T)
     return MvNormalLogp()(cov, delta)
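A NumPy sketch of what func builds (parameter values are hypothetical): the nested stacks form a lower-triangular 2x2 Cholesky factor, and chol.dot(chol.T) is the covariance passed to MvNormalLogp:

import numpy as np

chol_vec = np.array([0.1, 2.0, 3.0])
chol = np.array([
    [np.exp(0.1 * chol_vec[0]), 0.0],
    [chol_vec[1], 2.0 * np.exp(chol_vec[2])],
])
cov = chol.dot(chol.T)  # symmetric positive definite covariance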
Example No. 12
    def finetune_cost_updates(self, center, mu, learning_rate):
        """ This function computes the cost and the updates ."""

        # note : we sum over the size of a datapoint; if we are using
        #        minibatches, L will be a vector, with one entry per
        #        example in the minibatch
        network_output = self.get_output()
        temp = T.pow(center - network_output, 2)    
        
        L =  T.sum(temp, axis=1) 
        # Add the network reconstruction error 
        z = self.get_network_reconst()
        reconst_err = T.sum(T.pow(self.x - z, 2), axis = 1)            
        L = self.beta*L + self.lbd*reconst_err
        
        cost1 = T.mean(L)
        cost2 = self.lbd*T.mean(reconst_err)  
        cost3 = cost1 - cost2

        # compute the gradients of the cost of the `dA` with respect
        # to its parameters
        gparams = T.grad(cost1, self.params)  
        # generate the list of updates
        updates = []
        grad_values = []
        param_norm = []
        for param, delta, gparam in zip(self.params, self.delta, gparams):
            updates.append( (delta, mu*delta - learning_rate * gparam) )
            updates.append( (param, param + mu*mu*delta - (1+mu)*learning_rate*gparam ))
            grad_values.append(gparam.norm(L=2))
            param_norm.append(param.norm(L=2))
        
        grad_ = T.stack(*grad_values)
        param_ = T.stack(*param_norm)
        return ((cost1, cost2, cost3, grad_, param_), updates)
Example No. 13
	def _setOutputs(self) :
		inps = []
		for l in self.network.inConnections[self] :
			inps.append(l.outputs)

		self.outputs = tt.stack(inps).reshape((-1, self.nbChannels, self.height, self.width))
		self.testOutputs = tt.stack(inps).reshape((-1, self.nbChannels, self.height, self.width))
    def _batch_vectorization(self,**args):
        fun_in = args["fun"]
        symbolic_X_list = args["symbolic_X_list"]
        if "symbolic_c_inp_list" in args and "t" in args:
            t = args["t"]
            symbolic_c_inp_list = args["symbolic_c_inp_list"]
            fun = lambda x,y: fun_in(x,y,t)
        elif "symbolic_c_inp_list" in args and "t" not in args:
            symbolic_c_inp_list = args["symbolic_c_inp_list"]
            fun = fun_in
        elif "symbolic_c_inp_list" not in args and "t" in args:
            t = args["t"]
            symbolic_c_inp_list = []
            fun = lambda x,y: fun_in(x,t)

        fun_list = []
        for i in np.arange(self.number_of_rollouts):
            symbolic_X_list_i = [a[i] for a in symbolic_X_list]
            symbolic_c_inp_list_i = [a[i] for a in symbolic_c_inp_list]
            out_list = fun(symbolic_X_list_i, symbolic_c_inp_list_i)
            fun_list.append(out_list)
        if type(fun_list[0]) != list:
            return T.stack(fun_list,axis = 0)
        else:
            ziped_list = [list(a) for a in zip(*fun_list)]
            return [T.stack(a,axis = 0) for a in ziped_list]
Example No. 15
    def retr(self, X, Z, t=None):
        if t is None:
            t = 1.0
        Qu, Ru = tensor.nlinalg.qr(Z.Up)

        # we need rq decomposition here
        Qv, Rv = tensor.nlinalg.qr(Z.Vp[::-1].T)
        Rv = Rv.T[::-1]
        Rv = Rv[:, ::-1]
        Qv = Qv.T[::-1]

        # now we have rq decomposition (Rv @ Qv = Z.Vp)
        #Rv, Qv = rq(Z.Vp, mode='economic')


        zero_block = tensor.zeros((Ru.shape[0], Rv.shape[1]))
        block_mat = tensor.stack(
            (
                tensor.stack((X.S + t * Z.M, t * Rv), 1).reshape((Rv.shape[0], -1)),
                tensor.stack((t * Ru, zero_block), 1).reshape((Ru.shape[0], -1))
            )
        ).reshape((-1, Ru.shape[1] + Rv.shape[1]))

        Ut, St, Vt = tensor.nlinalg.svd(block_mat, full_matrices=False)

        U = tensor.stack((X.U, Qu), 1).reshape((Qu.shape[0], -1)).dot(Ut[:, :self._k])
        V = Vt[:self._k, :].dot(tensor.stack((X.V, Qv), 0).reshape((-1, Qv.shape[1])))
        # add some machine eps to get a slightly perturbed element of a manifold
        # even if we have some zeros in S
        S = tensor.diag(St[:self._k]) + tensor.diag(np.spacing(1) * tensor.ones(self._k))
        return ManifoldElementShared.from_vars((U, S, V), shape=(self._m, self._n), r=self._k)
Example No. 16
def local_gpu_sum(node):
    if isinstance(node.op, tensor.elemwise.CAReduce):
        if node.op.scalar_op == scal.add:
            x, = node.inputs
            if x.owner and x.owner.op == host_from_gpu:
                if node.op.axis is None:
                    reduce_mask = [1] * x.type.ndim
                else:
                    reduce_mask = [0] * x.type.ndim
                    for a in node.op.axis:
                        assert reduce_mask[a] == 0
                        reduce_mask[a] = 1
                gsum = GpuSum(reduce_mask)
                pattern = "".join(str(i) for i in reduce_mask)
                if hasattr(gsum, "c_code_reduce_%s" % pattern):
                    rval = host_from_gpu(gsum(gpu_from_host(x)))
                    if rval.type == node.outputs[0].type:
                        return [rval]
                    else:
                        print >>sys.stderr, "WARNING: local_gpu_sum got type wrong"
                        return None
                else:

                    # Try to make a simpler pattern based on reshaping
                    # The principle is that if two adjacent dimensions have the same value in
                    # the reduce_mask, then we can reshape to make them a single dimension, do
                    # the sum, and then reshape to get them back.

                    shape_of = node.env.shape_feature.shape_of

                    x_shape = shape_of[x]

                    new_in_shp = [x_shape[0]]
                    new_mask = [reduce_mask[0]]
                    for i in range(1, x.type.ndim):
                        if reduce_mask[i] == reduce_mask[i - 1]:
                            new_in_shp[-1] *= x_shape[i]
                        else:
                            new_mask.append(reduce_mask[i])
                            new_in_shp.append(x_shape[i])

                    pattern = "".join(str(i) for i in new_mask)
                    new_gsum = GpuSum(new_mask)
                    if hasattr(new_gsum, "c_code_reduce_%s" % pattern):
                        reshaped_x = x.reshape(tensor.stack(*new_in_shp))
                        sum_reshaped_x = host_from_gpu(new_gsum(gpu_from_host(reshaped_x)))

                        if sum_reshaped_x.ndim != node.outputs[0].ndim:
                            unreshaped_sum = sum_reshaped_x.reshape(tensor.stack(*shape_of[node.outputs[0]]))
                        else:
                            unreshaped_sum = sum_reshaped_x
                        if unreshaped_sum.type == node.outputs[0].type:
                            return [unreshaped_sum]
                        else:
                            print >>sys.stderr, "WARNING: local_gpu_sum got type wrong"
                            return None

                        raise Exception("GpuSum don't have implemented the pattern", pattern)
    return False
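A plain-Python sketch of the reshape simplification described in the comments above (the shapes are made up): adjacent dimensions sharing the same reduce_mask value are merged, so a (2, 3, 4, 5) input with mask [0, 1, 1, 0] is reduced as a (2, 12, 5) input with mask [0, 1, 0]:

reduce_mask = [0, 1, 1, 0]
x_shape = [2, 3, 4, 5]

new_in_shp = [x_shape[0]]
new_mask = [reduce_mask[0]]
for i in range(1, len(x_shape)):
    if reduce_mask[i] == reduce_mask[i - 1]:
        new_in_shp[-1] *= x_shape[i]     # merge with the previous dimension
    else:
        new_mask.append(reduce_mask[i])
        new_in_shp.append(x_shape[i])

assert new_in_shp == [2, 12, 5] and new_mask == [0, 1, 0]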
Example No. 17
	def new_attn_step(self,c_t,g_tm1,m_im1,q):
		cWq = T.stack([T.dot(T.dot(c_t, self.Wb), q)])
		cWm = T.stack([T.dot(T.dot(c_t, self.Wb), m_im1)])
		z = T.concatenate([c_t,m_im1,q,c_t*q,c_t*m_im1,T.abs_(c_t-q),T.abs_(c_t-m_im1),cWq,cWm],axis=0)
		l_1 = T.dot(self.W1, z) + self.b1
		l_1 = T.tanh(l_1)
		l_2 = T.dot(self.W2,l_1) + self.b2
		return l_2[0]
Example No. 18
 def new_attention_step(self, ct, prev_g, mem, q_q):
     cWq = T.stack([T.dot(T.dot(ct, self.W_b), q_q)])
     cWm = T.stack([T.dot(T.dot(ct, self.W_b), mem)])
     z = T.concatenate([ct, mem, q_q, ct * q_q, ct * mem, (ct - q_q) ** 2, (ct - mem) ** 2, cWq, cWm])        
     l_1 = T.dot(self.W_1, z) + self.b_1
     l_1 = T.tanh(l_1)
     l_2 = T.dot(self.W_2, l_1) + self.b_2
     G = T.nnet.sigmoid(l_2)[0]
     return G
Example No. 19
def max_pool(images, imgshp, maxpoolshp):
    """Implements a max pooling layer

    Takes as input a 2D tensor of shape batch_size x img_size and
    performs max pooling.  Max pooling downsamples by taking the max
    value in a given area, here defined by maxpoolshp. Outputs a 2D
    tensor of shape batch_size x output_size.

    :param images: 2D tensor containing images on which to apply convolution.
                   Assumed to be of shape batch_size x img_size
    :param imgshp: tuple containing image dimensions
    :param maxpoolshp: tuple containing shape of area to max pool over

    :return: out1, symbolic result (2D tensor)
    :return: out2, logical shape of the output
    """
    N = numpy
    poolsize = N.int64(N.prod(maxpoolshp))

    # imgshp contains either 2 entries (height,width) or 3 (nfeatures,h,w)
    # in the first case, default nfeatures to 1
    if N.size(imgshp) == 2:
        imgshp = (1,) + imgshp

    # construct indices and index pointers for sparse matrix, which,
    # when multiplied with input images will generate a stack of image
    # patches
    indices, indptr, spmat_shape, sptype, outshp = \
            convolution_indices.conv_eval(imgshp, maxpoolshp,
                                          maxpoolshp, mode='valid')

#    print 'XXXXXXXXXXXXXXXX MAX POOLING LAYER XXXXXXXXXXXXXXXXXXXX'
#    print 'imgshp = ', imgshp
#    print 'maxpoolshp = ', maxpoolshp
#    print 'outshp = ', outshp

    # build sparse matrix, then generate stack of image patches
    csc = theano.sparse.CSM(sptype)(N.ones(indices.size), indices,
                                    indptr, spmat_shape)
    patches = sparse.structured_dot(csc, images.T).T

    pshape = tensor.stack([images.shape[0] *\
                               tensor.as_tensor(N.prod(outshp)),
                           tensor.as_tensor(imgshp[0]),
                           tensor.as_tensor(poolsize)])
    patch_stack = tensor.reshape(patches, pshape, ndim=3)

    out1 = tensor.max(patch_stack, axis=2)

    pshape = tensor.stack([images.shape[0],
                           tensor.as_tensor(N.prod(outshp)),
                           tensor.as_tensor(imgshp[0])])
    out2 = tensor.reshape(out1, pshape, ndim=3)

    out3 = tensor.DimShuffle(out2.broadcastable, (0, 2, 1))(out2)

    return tensor.flatten(out3, 2), outshp
Example No. 20
    def cost(self, readouts, outputs):
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(self.coeff.apply(state), extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k))
            freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k))
            freq_coeff = freq_coeff.reshape((-1, self.k))
            # mu,sigma: shape (-1,fs,k)
            # coeff: shape (-1,k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            index = self.frnn_step_size
            freq_inputs = inputs[
                tuple([slice(0, None)] * (inputs.ndim - 1) + [slice(index, index + self.frnn_step_size)])
            ]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state)
                    + self.frnn_linear_transition_input.apply(freq_inputs)
                )

        mus = tensor.stack(mus, axis=-2)
        sigmas = tensor.stack(sigmas, axis=-2)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        sigmas = sigmas.reshape((-1, self.frnn_step_size * self.number_of_steps, self.k))
        coeffs = coeffs.repeat(self.frnn_step_size, axis=-2)

        mus = mus[tuple([slice(0, None)] * (mus.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2) + [slice(0, self.frame_size)] + [slice(0, None)])]
        # actually prob not necessary
        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff, frame_size=self.frame_size, k=self.k)
Example No. 21
    def _grad_single(self, ct, s, lnC2, GAMMI2):
        lnC = lnC2
        GAMMI = GAMMI2
        v = self.v#T.as_tensor(self.v)[:,ct:]
        v0 = T.as_tensor(v[v[:,0]==0, :])
        v1 = T.as_tensor(v[v[:,0]==1, :])

        cnp = v.shape[0]

        # Gradient of fE wrt the priors over final state
        [ofE, oxS], upd_fE_single = th.scan(fn=self._free_energy,
                                   sequences=v,
                                   non_sequences=[s,self.h,lnC,self.b])
        ofE0 = ofE[v0].sum()
        ofE1 = ofE[v1].sum()

        dFE0dlnC = T.jacobian(ofE0, lnC)
        dFE1dlnC = T.jacobian(ofE1, lnC)
        dFEdlnC  = T.jacobian(ofE,  lnC)
        ofE_ = T.vector()
        ofE_.tag.test_value = ofE.tag.test_value

        # Gradient of Gamma with respect to its initial condition:
        GAMMA, upd_GAMMA = th.scan(fn=self._upd_gamma,
               outputs_info=[GAMMI],
               non_sequences=[ofE, self.lambd, self.alpha, self.beta, cnp],
               n_steps=4)
        dGdg = T.grad(GAMMA[-1], GAMMI)

        dGdfE = T.jacobian(GAMMA[-1], ofE)
        dGdlnC = dGdfE.dot(dFEdlnC)

        out1 = ofE0
        out2 = ofE1
        maxout = T.max([out1, out2])

        exp_out1 = T.exp(GAMMA[-1]*(out1 - maxout))
        exp_out2 = T.exp(GAMMA[-1]*(out2 - maxout))
        norm_const = exp_out1 + exp_out2

        # Derivative wrt the second output (gammi):
        Jac1_gammi = (-(out1-out2)*dGdg*
                T.exp(GAMMA[-1]*(out1+out2 - 2*maxout))/(norm_const**2))
        Jac2_gammi = -Jac1_gammi
#        dfd1_tZ = Jac1_gammi*dCdf[1][0]+ Jac2_gammi*dCdf[1][1]

        # Derivative wrt first input (lnc)
        Jac1_lnC = (T.exp(GAMMA[-1]*(out1 + out2 - 2*maxout))/(norm_const**2)*
                  (-dGdlnC*(out1 - out2) - GAMMA[-1]*(dFE0dlnC - dFE1dlnC)))
        Jac2_lnC = -Jac1_lnC

        Jac1 = T.concatenate([T.stack(Jac1_gammi), Jac1_lnC])
        Jac2 = T.concatenate([T.stack(Jac2_gammi), Jac2_lnC])
        self.debug = [Jac1_lnC, Jac2_lnC, Jac2_gammi, Jac1_gammi, dFE0dlnC,
                      dFE1dlnC, dGdg, out1, out2, v0, v1, v, ct]
        return Jac1, Jac2
Example No. 22
 def underdamped():
     Q = self.Q
     f = tt.sqrt(tt.maximum(1.0 - 4.0*Q**2, self.eps))
     return (
         0.5*self.S0*self.w0*Q*tt.stack([1.0+1.0/f, 1.0-1.0/f]),
         0.5*self.w0/Q*tt.stack([1.0-f, 1.0+f]),
         tt.zeros(0, dtype=self.dtype),
         tt.zeros(0, dtype=self.dtype),
         tt.zeros(0, dtype=self.dtype),
         tt.zeros(0, dtype=self.dtype),
     )
def gen_stats(params, infos, what_stats):
    if not what_stats:
        return []
    results = []
    for stat in what_stats:
        print len(params), len(infos)
        res = stat.comp_all(params, infos)
        print stat
        print res
        results.append(T.stack(*res))
    return T.stack(*results)
Example No. 24
File: opt.py Project: chagge/Theano
def local_gpua_careduce(node):
    if (isinstance(node.op.scalar_op, scalar.basic.Add) or
        isinstance(node.op.scalar_op, scalar.basic.Mul)):
        x, = node.inputs
        greduce = GpuCAReduceCuda(node.op.scalar_op, axis=node.op.axis)
        if x.dtype != "float32":
            return
        gvar = greduce(x)
        #We need to have the make node called, otherwise the mask can
        #be None
        if gvar.owner.op.supports_c_code([gpu_from_host(x)]):
            return greduce
        else:
            # Try to make a simpler pattern based on reshaping
            # The principle is that if two adjacent dimensions have
            # the same value in the reduce_mask, then we can reshape
            # to make them a single dimension, do the reduction, and
            # then reshape to get them back.

            if node.op.axis is None:
                reduce_mask = [1] * x.type.ndim
            else:
                reduce_mask = [0] * x.type.ndim
                for a in node.op.axis:
                    assert reduce_mask[a] == 0
                    reduce_mask[a] = 1

            shape_of = node.fgraph.shape_feature.shape_of

            x_shape = shape_of[x]

            new_in_shp = [x_shape[0]]
            new_mask = [reduce_mask[0]]
            for i in xrange(1, x.type.ndim):
                if reduce_mask[i] == reduce_mask[i - 1]:
                    new_in_shp[-1] *= x_shape[i]
                else:
                    new_mask.append(reduce_mask[i])
                    new_in_shp.append(x_shape[i])

            new_greduce = GpuCAReduceCuda(node.op.scalar_op, reduce_mask=new_mask)
            reshaped_x = x.reshape(tensor.stack(*new_in_shp))
            gpu_reshaped_x = gpu_from_host(reshaped_x)
            reshaped_gpu_inputs = [gpu_reshaped_x]
            if new_greduce.supports_c_code(reshaped_gpu_inputs):
                reduce_reshaped_x = host_from_gpu(
                    new_greduce(gpu_reshaped_x))

                if reduce_reshaped_x.ndim != node.outputs[0].ndim:
                    unreshaped_reduce = reduce_reshaped_x.reshape(
                        tensor.stack(*shape_of[node.outputs[0]]))
                else:
                    unreshaped_reduce = reduce_reshaped_x
                return [unreshaped_reduce]
Example No. 25
 def test_hessian(self):
     chol_vec = tt.vector('chol_vec')
     chol_vec.tag.test_value = np.array([0.1, 2, 3])
     chol = tt.stack([
         tt.stack([tt.exp(0.1 * chol_vec[0]), 0]),
         tt.stack([chol_vec[1], 2 * tt.exp(chol_vec[2])]),
     ])
     cov = tt.dot(chol, chol.T)
     delta = tt.matrix('delta')
     delta.tag.test_value = np.ones((5, 2))
     logp = MvNormalLogp()(cov, delta)
     g_cov, g_delta = tt.grad(logp, [cov, delta])
     tt.grad(g_delta.sum() + g_cov.sum(), [delta, cov])
Example No. 26
 def get_output_for(self, input, **kwargs):
     if input.ndim > 2:
         # if the input has more than two dimensions, flatten it into a
         # batch of feature vectors.
         input = input.flatten(2)
     cores = tuple(getattr(self, attr_name) for attr_name in self.attr_names)
     unitary_input = tensor.reshape(input, (input.shape[0], 2, self.num_inputs))
     IR, II = unitary_input[:, 0, :], unitary_input[:, 1, :]
     I = tensor.stack([IR, II], axis=0)
     output = comp_wtt_image(I, cores, self.nd, self.ranks)
     output = tensor.stack([output[0, ...], output[1, ...]], axis=1)
     output = output.reshape((input.shape[0], -1))
     return output
Example No. 27
def t_mk_pool_ready(t_pool_input, t_pool_shape):
    """
    Prepare pooling input
    :param t_pool_input: 4D theano tensor batch_sz x channels x height x width
    :param t_pool_shape: theano lvector pool_ch x pool_h x pool_w
    :return: aux. sizes and input reshaped for pooling
    """
    # sizes
    # input
    t_batch_sz = t_pool_input.shape[0]
    t_in_ch = t_pool_input.shape[1]
    t_in_h = t_pool_input.shape[2]
    t_in_w = t_pool_input.shape[3]
    # pooling
    t_pool_ch = t_pool_shape[0]
    t_pool_h = t_pool_shape[1]
    t_pool_w = t_pool_shape[2]
    # output
    t_out_ch = (t_in_ch + t_pool_ch - 1) // t_pool_ch
    t_out_h = (t_in_h + t_pool_h - 1) // t_pool_h
    t_out_w = (t_in_w + t_pool_w - 1) // t_pool_w

    # we will need to pad input (probably), so here's the padded shape:
    t_padded_ch = t_out_ch * t_pool_ch
    t_padded_h = t_out_h * t_pool_h
    t_padded_w = t_out_w * t_pool_w
    t_padded_pool_in_z = T.zeros(T.stack([t_batch_sz, t_padded_ch, t_padded_h, t_padded_w]))
    t_padded_pool_in = T.inc_subtensor(t_padded_pool_in_z[:t_batch_sz, :t_in_ch, :t_in_h, :t_in_w], t_pool_input)

    # below is all computed
    # spatial pooling
    t_sp_pooled = images2neibs(t_padded_pool_in, T.stack([t_pool_h, t_pool_w]))
    # spatial pooling output shape
    # has size (B * C * H/h * W/w) x (h*w)
    t_sp_pooled_dims = t_sp_pooled.shape
    # lines per channel
    # H*W / (h*w)
    t_lpc = (t_padded_h * t_padded_w) // (t_pool_h * t_pool_w)
    # shape to collect channels
    t_ch_pool_prep_dims_1 = T.stack([t_sp_pooled_dims[0] // t_lpc, t_lpc, t_sp_pooled_dims[1]])
    # preparing pooling by channels
    # reshape to collect channels in a separate dimension
    t_ch_pool_prep_1 = T.reshape(t_sp_pooled, t_ch_pool_prep_dims_1)
    t_ch_pool_prep_2 = T.shape_padleft(T.transpose(t_ch_pool_prep_1, [1, 0, 2]))
    # prepare for channel pooling
    t_ch_pool_dims = T.stack([t_pool_ch, t_ch_pool_prep_dims_1[-1]])
    t_pool_ready = images2neibs(t_ch_pool_prep_2, t_ch_pool_dims)
    return t_batch_sz, t_in_ch, t_in_h, t_in_w, t_out_ch, t_out_h, t_out_w, t_pool_ready
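A worked example of the ceil-division output sizes computed at the top of t_mk_pool_ready (the concrete sizes are hypothetical): a 5-channel 7x9 input pooled with a (2, 3, 3) shape is padded to 6 channels and a 9x9 spatial extent, giving 3 output cells along each axis:

in_ch, in_h, in_w = 5, 7, 9
pool_ch, pool_h, pool_w = 2, 3, 3

out_ch = (in_ch + pool_ch - 1) // pool_ch  # 3
out_h = (in_h + pool_h - 1) // pool_h      # 3
out_w = (in_w + pool_w - 1) // pool_w      # 3
padded = (out_ch * pool_ch, out_h * pool_h, out_w * pool_w)  # (6, 9, 9)
assert (out_ch, out_h, out_w) == (3, 3, 3)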
Example No. 28
    def symbolic_call(self,x,u):

        u = TT.clip(u, -self.max_force, self.max_force) #pylint: disable=E1111

        dt = self.dt

        z = TT.take(x,0,axis=x.ndim-1)
        zdot = TT.take(x,1,axis=x.ndim-1)    
        th = TT.take(x,2,axis=x.ndim-1)
        thdot = TT.take(x,3,axis=x.ndim-1)
        u0 = TT.take(u,0,axis=u.ndim-1)

        th1 = np.pi - th

        g = 10.
        mc = 1. # mass of cart
        mp = .1 # mass of pole
        muc = .0005 # coeff friction of cart
        mup = .000002 # coeff friction of pole
        l = 1. # length of pole

        def sign(x):
            return TT.switch(x>0, 1, -1)

        thddot = -(-g*TT.sin(th1)
         + TT.cos(th1) * (-u0 - mp * l *thdot**2 * TT.sin(th1) + muc*sign(zdot))/(mc+mp)
          - mup*thdot / (mp*l))  \
        / (l*(4/3. - mp*TT.cos(th1)**2 / (mc + mp)))
        zddot = (u0 + mp*l*(thdot**2 * TT.sin(th1) - thddot * TT.cos(th1)) - muc*sign(zdot))  \
            / (mc+mp)

        newzdot = zdot + dt*zddot
        newz = z + dt*newzdot
        newthdot = thdot + dt*thddot
        newth = th + dt*newthdot

        done = (z > self.max_cart_pos) | (z < -self.max_cart_pos) | (th > self.max_pole_angle) | (th < -self.max_pole_angle) 

        ucost = 1e-5*(u**2).sum(axis=u.ndim-1)
        xcost = 1-TT.cos(th)
        # notdone = TT.neg(done) #pylint: disable=W0612,E1111
        notdone = 1-done
        costs = TT.stack((done-1)*10., notdone*xcost, notdone*ucost).T #pylint: disable=E1103


        newx = TT.stack(newz, newzdot, newth, newthdot).T #pylint: disable=E1103

        return [newx,newx,costs,done]
def normout_actfun(input, pool_size, filt_count):
    """Apply (L2) normout over non-overlapping sets of values."""
    l_start = filt_count - pool_size
    relu_vals = T.stack(\
        *[input[:,i:(l_start+i+1):pool_size] for i in range(pool_size)])
    pooled_vals = T.sqrt(T.mean(relu_vals**2.0, axis=0))
    return pooled_vals
 def get_reconstructed_input(self, hidden):
     """ Computes the reconstructed input given the values of the hidden layer """
     repeated_conv = conv.conv2d(input = hidden, filters = self.W_prime, border_mode='full')
     multiple_conv_out = [repeated_conv.flatten()] * numpy.prod(self.poolsize)
     stacked_conv_neibs = T.stack(*multiple_conv_out).T
     stretch_unpooling_out = neibs2images(stacked_conv_neibs, self.pl, self.x.shape) 
     return ReLU(stretch_unpooling_out + self.b_prime.dimshuffle('x', 0, 'x', 'x'))
Example No. 31
def convolve(kerns,
             kshp,
             nkern,
             images,
             imgshp,
             step=(1, 1),
             bias=None,
             mode='valid',
             flatten=True):
    """Convolution implementation by sparse matrix multiplication.

    :note: For best speed, put the matrix which you expect to be
           smaller as the 'kernel' argument

    "images" is assumed to be a matrix of shape batch_size x img_size,
    where the second dimension represents each image in raster order

    If flatten is "False", the output feature map will have shape:

    .. code-block:: python

        batch_size x number of kernels x output_size

    If flatten is "True", the output feature map will have shape:

    .. code-block:: python

        batch_size x number of kernels * output_size

    .. note::

        IMPORTANT: note that this means that each feature map (image
        generate by each kernel) is contiguous in memory. The memory
        layout will therefore be: [ <feature_map_0> <feature_map_1>
        ... <feature_map_n>], where <feature_map> represents a
        "feature map" in raster order

    kerns is a 2D tensor of shape nkern x N.prod(kshp)

    :param kerns: 2D tensor containing kernels which are applied at every pixel
    :param kshp: tuple containing actual dimensions of kernel (not symbolic)
    :param nkern: number of kernels/filters to apply.
                  nkern=1 will apply one common filter to all input pixels
    :param images: tensor containing images on which to apply convolution
    :param imgshp: tuple containing image dimensions
    :param step: determines number of pixels between adjacent receptive fields
                 (tuple containing dx,dy values)
    :param mode: 'full', 'valid' see CSM.evaluate function for details
    :param sumdims: dimensions over which to sum for the tensordot operation.
                    By default ((2,),(1,)) assumes kerns is a nkern x kernsize
                    matrix and images is a batchsize x imgsize matrix
                    containing flattened images in raster order
    :param flatten: flatten the last 2 dimensions of the output. By default,
                    instead of generating a batchsize x outsize x nkern tensor,
                    will flatten to batchsize x outsize*nkern

    :return: out1, symbolic result
    :return: out2, logical shape of the output img (nkern, height, width)

    :TODO: test for 1D and think of how to do n-d convolutions
    """
    N = numpy
    # start by computing output dimensions, size, etc
    kern_size = N.int64(N.prod(kshp))

    # inshp contains either 2 entries (height,width) or 3 (nfeatures,h,w)
    # in the first case, default nfeatures to 1
    if N.size(imgshp) == 2:
        imgshp = (1, ) + imgshp

    # construct indices and index pointers for sparse matrix, which,
    # when multiplied with input images will generate a stack of image
    # patches
    indices, indptr, spmat_shape, sptype, outshp = \
            convolution_indices.conv_eval(imgshp, kshp, step, mode)

    # build sparse matrix, then generate stack of image patches
    csc = theano.sparse.CSM(sptype)(N.ones(indices.size), indices, indptr,
                                    spmat_shape)
    patches = (sparse.structured_dot(csc, images.T)).T

    # compute output of linear classifier
    pshape = tensor.stack(images.shape[0] * tensor.as_tensor(N.prod(outshp)),\
                          tensor.as_tensor(imgshp[0] * kern_size))
    patch_stack = tensor.reshape(patches, pshape, ndim=2)

    # kern is of shape: nkern x ksize*number_of_input_features
    # output is thus of shape: bsize*outshp x nkern
    output = tensor.dot(patch_stack, kerns.T)

    # add bias across each feature map (more efficient to do it now)
    if bias is not None:
        output += bias

    # now to have feature maps in raster order ...
    # go from bsize*outshp x nkern to bsize x nkern*outshp
    newshp = tensor.stack(images.shape[0],\
                          tensor.as_tensor(N.prod(outshp)),\
                          tensor.as_tensor(nkern))
    tensout = tensor.reshape(output, newshp, ndim=3)
    output = tensor.DimShuffle((False, ) * tensout.ndim, (0, 2, 1))(tensout)
    if flatten:
        output = tensor.flatten(output, 2)

    return output, N.hstack((nkern, outshp))
Example No. 32
def build_graph(FLAGS):
    """Define training graph.
  """
    tparams = OrderedDict()
    trng = RandomStreams(
        np.random.RandomState(np.random.randint(1024)).randint(
            np.iinfo(np.int32).max))
    print("Building the computational graph")
    # Define bunch of shared variables
    init_state = np.zeros((3, 2, FLAGS.batch_size, FLAGS.n_hidden),
                          dtype=np.float32)
    tstate = sharedX(init_state, name='rnn_state')
    # Graph input
    inp = tensor.matrix('inp', dtype='int64')
    inp_mask = tensor.matrix('inp_mask', dtype='float32')
    inp.tag.test_value = np.zeros((FLAGS.max_seq_len, FLAGS.batch_size),
                                  dtype='int64')
    inp_mask.tag.test_value = np.ones((FLAGS.max_seq_len, FLAGS.batch_size),
                                      dtype='float32')
    x, y = inp[:-1], inp[1:]
    y_mask = inp_mask[1:]
    # Define input embedding layer
    _i_embed = LinearCell(FLAGS.n_class,
                          FLAGS.n_input_embed,
                          prefix='i_embed',
                          bias=False,
                          input_is_int=True)
    tparams = merge_dict(tparams, _i_embed._params)
    # Call input embedding layer
    h_i_emb_3d = _i_embed(x)
    # Define the first LSTM module
    _rnn_1 = LSTMModule(FLAGS.n_input_embed, FLAGS.n_hidden, prefix='lstm_1')
    tparams = merge_dict(tparams, _rnn_1._params)
    # Call the first LSTM module
    (h_rnn_1_3d, c_rnn_1_3d), last_state_1 = _rnn_1(h_i_emb_3d, tstate[0])
    # Define the second LSTM module
    _rnn_2 = LSTMModule(FLAGS.n_hidden, FLAGS.n_hidden, prefix='lstm_2')
    tparams = merge_dict(tparams, _rnn_2._params)
    # Call the second LSTM module
    (h_rnn_2_3d, c_rnn_2_3d), last_state_2 = _rnn_2(h_rnn_1_3d, tstate[1])
    # Define the third LSTM module
    _rnn_3 = LSTMModule(FLAGS.n_hidden, FLAGS.n_hidden, prefix='lstm_3')
    tparams = merge_dict(tparams, _rnn_3._params)
    # Call the third LSTM module
    (h_rnn_3_3d, c_rnn_3_3d), last_state_3 = _rnn_3(h_rnn_2_3d, tstate[2])
    # Define output gating layer
    _o_gate = LinearCell([FLAGS.n_hidden] * 3,
                         3,
                         prefix='o_gate',
                         activation=tensor.nnet.sigmoid)
    tparams = merge_dict(tparams, _o_gate._params)
    # Call output gating layer
    h_o_gate = _o_gate([h_rnn_1_3d, h_rnn_2_3d, h_rnn_3_3d])
    # Define output embedding layer
    _o_embed = LinearCell([FLAGS.n_hidden] * 3,
                          FLAGS.n_output_embed,
                          prefix='o_embed',
                          activation=tensor.nnet.relu)
    tparams = merge_dict(tparams, _o_embed._params)
    # Call output embedding layer
    h_o_embed = _o_embed([
        h_rnn_1_3d * h_o_gate[:, :, 0][:, :, None],
        h_rnn_2_3d * h_o_gate[:, :, 1][:, :, None],
        h_rnn_3_3d * h_o_gate[:, :, 2][:, :, None]
    ])
    # Define output layer
    _output = LinearCell(FLAGS.n_output_embed, FLAGS.n_class, prefix='output')
    tparams = merge_dict(tparams, _output._params)
    # Call output layer
    h_logit = _output([h_o_embed])
    logit_shape = h_logit.shape
    logit = h_logit.reshape([logit_shape[0] * logit_shape[1], logit_shape[2]])
    logit = logit - logit.max(axis=1).dimshuffle(0, 'x')
    probs = logit - tensor.log(
        tensor.exp(logit).sum(axis=1).dimshuffle(0, 'x'))
    # Compute the cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * FLAGS.n_class + y_flat
    cost = -probs.flatten()[y_flat_idx]
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)
    cost_len = y_mask.sum(0)
    last_state = tensor.stack([last_state_1, last_state_2, last_state_3],
                              axis=0)
    f_prop_updates = OrderedDict()
    f_prop_updates[tstate] = last_state
    states = [tstate]
    # Later use for visualization
    inps = [inp, inp_mask]
    print("Building f_log_prob function")
    f_log_prob = theano.function(inps, [cost, cost_len],
                                 updates=f_prop_updates)
    cost = cost.mean()
    # If the flag is on, apply L2 regularization on weights
    if FLAGS.weight_decay > 0.:
        weights_norm = 0.
        for k, v in tparams.iteritems():
            weights_norm += (v**2).sum()
        cost += weights_norm * FLAGS.weight_decay
    #print("Computing the gradients")
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    grads = gradient_clipping(grads, tparams, 1.)
    # Compile the optimizer, the actual computational graph
    learning_rate = tensor.scalar(name='learning_rate')
    gshared = [
        theano.shared(p.get_value() * 0., name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    gsup = OrderedDict(izip(gshared, grads))
    print("Building f_prop function")
    f_prop = theano.function(inps, [cost],
                             updates=merge_dict(gsup, f_prop_updates))
    opt_updates, opt_tparams = adam(learning_rate, tparams, gshared)
    if FLAGS.start_from_ckpt and os.path.exists(opt_file_name):
        opt_params = np.load(opt_file_name)
        zipp(opt_params, opt_tparams)
    print("Building f_update function")
    f_update = theano.function([learning_rate], [],
                               updates=opt_updates,
                               on_unused_input='ignore')
    #print("Building f_debug function")
    f_debug = theano.function(inps, [h_rnn_1_3d, h_rnn_2_3d, h_rnn_3_3d],
                              updates=f_prop_updates,
                              on_unused_input='ignore')
    return f_prop, f_update, f_log_prob, f_debug, tparams, opt_tparams, states, None
Example No. 33
    def RHNLayer(self, inputs, depth, batch_size, hidden_size, drop_i, drop_s,
                 init_T_bias, init_H_bias, tied_noise):
        """Variational Recurrent Highway Layer (Theano implementation).

    References:
      Zilly, J, Srivastava, R, Koutnik, J, Schmidhuber, J., "Recurrent Highway Networks", 2016
    Args:
      inputs: Theano variable, shape (num_steps, batch_size, hidden_size).
      depth: int, the number of RHN inner layers i.e. the number of micro-timesteps per timestep.
      drop_i: float, probability of dropout over inputs.
      drop_s: float, probability of dropout over recurrent hidden state.
      init_T_bias: a valid bias_init argument for linear(), initialization of bias of transform gate T.
      init_H_bias: a valid bias_init argument for linear(), initialization of bias of non-linearity H.
      tied_noise: boolean, whether to use the same dropout masks when calculating H and when calculating T.
    Returns:
      y: Theano variable, recurrent hidden states at each timestep. Shape (num_steps, batch_size, hidden_size).
      sticky_state_updates: a list of (shared variable, new shared variable value).
    """
        # We first compute the linear transformation of the inputs over all timesteps.
        # This is done outside of scan() in order to speed up computation.
        # The result is then fed into scan()'s step function, one timestep at a time.
        noise_i_for_H = self.get_dropout_noise((batch_size, hidden_size),
                                               drop_i)
        noise_i_for_T = self.get_dropout_noise(
            (batch_size,
             hidden_size), drop_i) if not tied_noise else noise_i_for_H

        i_for_H = self.apply_dropout(inputs, noise_i_for_H)
        i_for_T = self.apply_dropout(inputs, noise_i_for_T)

        i_for_H = self.linear(i_for_H,
                              in_size=hidden_size,
                              out_size=hidden_size,
                              bias=True,
                              bias_init=init_H_bias)
        i_for_T = self.linear(i_for_T,
                              in_size=hidden_size,
                              out_size=hidden_size,
                              bias=True,
                              bias_init=init_T_bias)

        # Dropout noise for recurrent hidden state.
        noise_s = self.get_dropout_noise((batch_size, hidden_size), drop_s)
        if not tied_noise:
            noise_s = tt.stack(
                noise_s,
                self.get_dropout_noise((batch_size, hidden_size), drop_s))

        def step_fn(i_for_H_t, i_for_T_t, y_tm1, noise_s):
            """
      Args:
        Elements of sequences given to scan():
          i_for_H_t: linear trans. of inputs for calculating non-linearity H at timestep t. Shape (batch_size, hidden_size).
          i_for_T_t: linear trans. of inputs for calculating transform gate T at timestep t. Shape (batch_size, hidden_size).
        Result of previous step function invocation (equals the outputs_info given to scan() on first timestep):
          y_tm1: Shape (batch_size, hidden_size).
        Non-sequences given to scan() (these are the same at all timesteps):
          noise_s: (batch_size, hidden_size) or (2, batch_size, hidden_size), depending on value of tied_noise.
      """
            tanh, sigm = tt.tanh, tt.nnet.sigmoid
            noise_s_for_H = noise_s if tied_noise else noise_s[0]
            noise_s_for_T = noise_s if tied_noise else noise_s[1]

            s_lm1 = y_tm1
            for l in range(depth):
                s_lm1_for_H = self.apply_dropout(s_lm1, noise_s_for_H)
                s_lm1_for_T = self.apply_dropout(s_lm1, noise_s_for_T)
                if l == 0:
                    # On the first micro-timestep of each timestep we already have bias
                    # terms summed into i_for_H_t and into i_for_T_t.
                    H = tanh(i_for_H_t + self.linear(s_lm1_for_H,
                                                     in_size=hidden_size,
                                                     out_size=hidden_size,
                                                     bias=False))
                    T = sigm(i_for_T_t + self.linear(s_lm1_for_T,
                                                     in_size=hidden_size,
                                                     out_size=hidden_size,
                                                     bias=False))
                else:
                    H = tanh(
                        self.linear(s_lm1_for_H,
                                    in_size=hidden_size,
                                    out_size=hidden_size,
                                    bias=True,
                                    bias_init=init_H_bias))
                    T = sigm(
                        self.linear(s_lm1_for_T,
                                    in_size=hidden_size,
                                    out_size=hidden_size,
                                    bias=True,
                                    bias_init=init_T_bias))
                s_l = (H - s_lm1) * T + s_lm1
                s_lm1 = s_l

            y_t = s_l
            return y_t

        # The recurrent hidden state of the RHN is sticky (the last hidden state of one batch is carried over to the next batch,
        # to be used as an initial hidden state).  These states are kept in a shared variable.
        y_0 = theano.shared(np.zeros((batch_size, hidden_size), floatX))
        self.reset_hidden_state = lambda: y_0.set_value(
            np.zeros_like(y_0.get_value()))  # invoked before every epoch.

        y, _ = theano.scan(step_fn,
                           sequences=[i_for_H, i_for_T],
                           outputs_info=[y_0],
                           non_sequences=[noise_s])

        y_last = y[-1]
        sticky_state_updates = [(y_0, y_last)]

        return y, sticky_state_updates
Example No. 34
    def init_opt(self):
        obs_var = ext.new_tensor(
            'obs', ndim=2, dtype=theano.config.floatX)  # todo: check the dtype

        manager_obs_var = ext.new_tensor('manager_obs',
                                         ndim=2,
                                         dtype=theano.config.floatX)

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every time the manager makes a decision
        manager_advantage_var = ext.new_tensor('manager_advantage',
                                               ndim=1,
                                               dtype=theano.config.floatX)

        skill_advantage_var = ext.new_tensor('skill_advantage',
                                             ndim=1,
                                             dtype=theano.config.floatX)

        latent_var_sparse = ext.new_tensor('sparse_latent',
                                           ndim=2,
                                           dtype=theano.config.floatX)

        latent_var = ext.new_tensor('latents',
                                    ndim=2,
                                    dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(
            manager_obs_var)['prob']
        old_latent_probs = self.old_policy.manager.dist_info_sym(
            manager_obs_var)['prob']

        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        old_actual_latent_probs = TT.sum(old_latent_probs * latent_var_sparse,
                                         axis=1)
        lr = TT.exp(
            TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs))
        manager_surr_loss_vector = TT.minimum(
            lr * manager_advantage_var,
            TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) *
            manager_advantage_var)
        manager_surr_loss = -TT.mean(manager_surr_loss_vector)

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        probs = TT.stack([
            self.diagonal.log_likelihood_sym(action_var, dist_info)
            for dist_info in dist_info_vars
        ],
                         axis=1)
        actual_action_log_probs = TT.sum(
            probs * latent_var,
            axis=1)  # todo: verify that dist_info_vars is in order

        # old policy stuff
        old_dist_info_vars = self.old_policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        old_probs = TT.stack([
            self.diagonal.log_likelihood_sym(action_var, dist_info)
            for dist_info in old_dist_info_vars
        ],
                             axis=1)
        old_actual_action_log_probs = TT.sum(old_probs * latent_var, axis=1)
        skill_lr = TT.exp(actual_action_log_probs -
                          old_actual_action_log_probs)

        skill_surr_loss_vector = TT.minimum(
            skill_lr * skill_advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) *
            skill_advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = manager_surr_loss / self.average_period + skill_surr_loss

        input_list = [
            obs_var, manager_obs_var, action_var, manager_advantage_var,
            skill_advantage_var, latent_var, latent_var_sparse
        ]

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()
Example No. 35
 def get_output(self, train):
     X = self.get_input(train)
     tensors = [T.roll(X, off, axis=self.axis) for off in self.offsets]
     return T.stack(tensors, axis=self.offset_axis)
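A NumPy sketch of what get_output produces (the offsets and axes here are made up): each offset yields a rolled copy of the input along axis, and the copies are stacked along a new offset_axis:

import numpy as np

X = np.arange(6).reshape(2, 3)
offsets, axis, offset_axis = (0, 1), 1, 0

out = np.stack([np.roll(X, off, axis=axis) for off in offsets], axis=offset_axis)
# out.shape == (2, 2, 3); out[0] is X, out[1] is X shifted by one column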
Example No. 36
def any_to_tensor_and_labels(x, labels=None):
    """Util for converting input x to tensor trying to
    create labels for columns if they are not provided.

    Default names for columns are ['x0', 'x1', ...], for mappable
    arrays (e.g. pd.DataFrame) their names are treated as labels.
    You can override them with `labels` argument.

    If you have tensor input you should provide labels, as we
    cannot get its shape directly.

    If you pass dict input we cannot rely on label order, so dict
    keys are treated as labels anyway.

    Parameters
    ----------
    x : np.ndarray | pd.DataFrame | tt.Variable | dict | list
    labels : list - names for columns of output tensor

    Returns
    -------
    (x, labels) - tensor and labels for its columns
    """
    if isinstance(labels, six.string_types):
        labels = [labels]
    # pandas.DataFrame
    # labels can come from here
    # we can override them
    if isinstance(x, pd.DataFrame):
        if not labels:
            labels = x.columns
        x = x.as_matrix()

    # pandas.Series
    # there can still be a label
    # we can override labels
    elif isinstance(x, pd.Series):
        if not labels:
            labels = [x.name]
        x = x.as_matrix()[:, None]

    # dict
    # labels are keys,
    # cannot override them
    elif isinstance(x, dict):
        # try to do it via pandas
        try:
            x = pd.DataFrame.from_dict(x)
            labels = x.columns
            x = x.as_matrix()
        # some types fail there
        # another approach is to construct
        # variable by hand
        except (PandasError, TypeError):
            res = []
            labels = []
            for k, v in x.items():
                res.append(v)
                labels.append(k)
            x = tt.stack(res, axis=1)
            if x.ndim == 1:
                x = x[:, None]
    # case when it can appear to be some
    # array like value like lists of lists
    # numpy deals with it
    elif not isinstance(x, tt.Variable):
        x = np.asarray(x)
        if x.ndim == 0:
            raise ValueError('Cannot use scalars')
        elif x.ndim == 1:
            x = x[:, None]
    # something really strange goes here,
    # but the user passed labels, so trusting
    # them seems to be a good option
    elif labels is not None:
        x = tt.as_tensor_variable(x)
        if x.ndim == 0:
            raise ValueError('Cannot use scalars')
        elif x.ndim == 1:
            x = x[:, None]
    else:  # trust input
        pass
    # we should check that we can extract labels
    if labels is None and not isinstance(x, tt.Variable):
        labels = ['x%d' % i for i in range(x.shape[1])]
    # for theano variables we should have labels from user
    elif labels is None:
        raise ValueError('Please provide labels as '
                         'we cannot infer shape of input')
    else:  # trust labels, user knows what he is doing
        pass
    # it's time to check shapes if we can
    if not isinstance(x, tt.Variable):
        if not len(labels) == x.shape[1]:
            raise ValueError('Please provide full list '
                             'of labels for coefficients, '
                             'got len(labels)=%d instead of %d' %
                             (len(labels), x.shape[1]))
    else:
        # trust labels, as we raised an
        # error in bad case, we have labels
        pass
    # convert labels to list
    if isinstance(labels, pd.RangeIndex):
        labels = ['x%d' % i for i in labels]
    # maybe it was a tuple or whatever
    elif not isinstance(labels, list):
        labels = list(labels)
    # as output we need tensor
    if not isinstance(x, tt.Variable):
        x = tt.as_tensor_variable(x)
        # finally check dimensions
        if x.ndim == 0:
            raise ValueError('Cannot use scalars')
        elif x.ndim == 1:
            x = x[:, None]
    return x, labels
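
A small usage sketch of the helper above; it relies on the deprecated DataFrame.as_matrix, so this assumes an older pandas where that method still exists:

import numpy as np
import pandas as pd

# DataFrame input: column names become labels
df = pd.DataFrame({'age': [1., 2.], 'income': [3., 4.]})
xt, labels = any_to_tensor_and_labels(df)
print(labels)            # ['age', 'income']

# dict input: keys become labels, values are stacked column-wise
d = {'a': np.ones(3), 'b': np.zeros(3)}
xt, labels = any_to_tensor_and_labels(d)
print(sorted(labels))    # ['a', 'b']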
Esempio n. 37
0
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, dim, mode, answer_module, input_mask_mode,
                 memory_hops, l2, normalize_attention, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.vocab = {}
        self.ivocab = {}

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.l2 = l2
        self.normalize_attention = normalize_attention

        self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = len(self.vocab)

        self.input_var = T.matrix('input_var')
        self.q_var = T.matrix('question_var')
        self.answer_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')

        print "==> building input module"
        self.W_inp_res_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_upd_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_hid_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=self.input_var,
                                       outputs_info=T.zeros_like(
                                           self.b_inp_hid))

        self.inp_c = inp_c_history.take(self.input_mask_var, axis=0)

        self.q_q, _ = theano.scan(fn=self.input_gru_step,
                                  sequences=self.q_var,
                                  outputs_info=T.zeros_like(self.b_inp_hid))

        self.q_q = self.q_q[-1]

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem = memory[-1]

        print "==> building answer module"
        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size, self.dim))

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
                a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                    self.W_ans_res_in, self.W_ans_res_hid,
                                    self.b_ans_res, self.W_ans_upd_in,
                                    self.W_ans_upd_hid, self.b_ans_upd,
                                    self.W_ans_hid_in, self.W_ans_hid_hid,
                                    self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]

            # TODO: add conditional ending
            dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))
            results, updates = theano.scan(
                fn=answer_step,
                outputs_info=[last_mem, T.zeros_like(dummy)],
                n_steps=1)
            self.prediction = results[1][-1]

        else:
            raise Exception("invalid answer_module")

        print "==> collecting all parameters"
        self.params = [
            self.W_inp_res_in,
            self.W_inp_res_hid,
            self.b_inp_res,
            self.W_inp_upd_in,
            self.W_inp_upd_hid,
            self.b_inp_upd,
            self.W_inp_hid_in,
            self.W_inp_hid_hid,
            self.b_inp_hid,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(
            self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0]

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.input_mask_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[
            self.input_var, self.q_var, self.answer_var, self.input_mask_var
        ],
                                       outputs=[self.prediction, self.loss])
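
The loss construction above wraps a single prediction vector and a scalar label into batch form before calling categorical_crossentropy. A standalone sketch of just that piece:

import numpy as np
import theano
import theano.tensor as T

prediction = T.vector('prediction')      # softmax output over the vocabulary
answer = T.iscalar('answer')             # gold word index

# wrap both into a batch of size 1, as the DMN loss above does
loss = T.nnet.categorical_crossentropy(
    prediction.dimshuffle('x', 0), T.stack([answer]))[0]

f = theano.function([prediction, answer], loss, allow_input_downcast=True)
print(f(np.array([0.1, 0.7, 0.2]), 1))   # -log(0.7)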
Esempio n. 38
0
    def __init__(self,
                 babi_train_raw,
                 babi_test_raw,
                 word2vec,
                 word_vector_size,
                 dim,
                 mode,
                 answer_module,
                 input_mask_mode,
                 memory_hops,
                 l2,
                 normalize_attention,
                 answer_vec,
                 debug,
                 sentEmbdLoadState,
                 sentEmbdType="basic",
                 **kwargs):
        self.vocab = {}
        self.ivocab = {}
        self.debug = debug

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.answer_vec = answer_vec
        self.sentEmbdType = sentEmbdType
        if (self.mode != 'deploy'):
            self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(
                babi_train_raw)
            self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(
                babi_test_raw)
            self.vocab_size = len(self.vocab)
            print(self.vocab_size)
        elif self.mode == 'deploy':
            self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(
                babi_train_raw)
            self.vocab_size = len(self.vocab)
            print(self.vocab_size)
            # print(self.train_input.shape)
            # print(self.train_q.shape)
            # print(self.train_input_mask.shape)

        #Setting up pre-trained Sentence Embedder for question and input module:
        if self.mode != 'deploy':
            print("==> Setting up pre-trained Sentence Embedder")
        if self.sentEmbdType == "basic":
            self.sent_embd = SentEmbd.SentEmbd_basic(self.word_vector_size,
                                                     self.dim)
        else:
            dep_tags = utils.load_dep_tags
            self.sent_embd = SentEmbd.SentEmbd_syntactic(
                50, hid_dim, len(dep_tags))  #TODO: Dependency Tags
        self.sent_embd.load_params(sentEmbdLoadState)

        self.input_var = T.matrix('input_var')
        self.q_var = T.vector('question_var')
        if self.answer_vec == 'word2vec':
            self.answer_var = T.vector('answer_var')
        else:
            self.answer_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')

        if self.answer_vec == 'one_hot' or self.answer_vec == 'index':
            self.answer_size = self.vocab_size
        elif self.answer_vec == 'word2vec':
            self.answer_size = self.word_vector_size
        else:
            raise Exception("Invalid answer_vec type")

        #Setting up Untrained Memory module
        if self.mode != 'deploy':
            print("==> Creating parameters for memory module")
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 2))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        if self.mode != 'deploy':
            print(
                "==> Building episodic memory module (fixed number of steps: %d)"
                % self.memory_hops)
        memory = [self.q_var.copy()]
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem = memory[-1]

        if self.mode != 'deploy': print("==> Building answer module")

        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.answer_size, self.dim))

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))
        # elif self.answer_module == 'recurrent':
        #     self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.answer_size))
        #     self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        #     self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        #     self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.answer_size))
        #     self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        #     self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        #     self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.answer_size))
        #     self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        #     self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        #     def answer_step(prev_a, prev_y):
        #         a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
        #                           self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
        #                           self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
        #                           self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid)
        #         y = T.dot(self.W_a, a)
        #         if self.answer_vec == 'one_hot' or self.answer_vec == 'index':
        #             y = nn_utils.softmax(y)
        #         return [a, y]

        #     # TODO: add conditional ending
        #     dummy = theano.shared(np.zeros((self.answer_size, ), dtype=floatX))
        #     results, updates = theano.scan(fn=answer_step,
        #         outputs_info=[last_mem, T.zeros_like(dummy)],
        #         n_steps=1)
        #     self.prediction = results[1][-1]

        else:
            raise Exception("invalid answer_module")

        if self.mode != 'deploy':
            print("==> Collecting all parameters to be trained")
        self.params = [
            self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
            self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
            self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b,
            self.W_1, self.W_2, self.b_1, self.b_2, self.W_a
        ]

        # if self.answer_module == 'recurrent':
        #     self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
        #                       self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
        #                       self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid]

        if self.mode != 'deploy':
            print("==> Building loss layer and computing updates")
        if debug:
            print('Prediction dim:', self.prediction.dimshuffle('x', 0).ndim)
            print('Answer dim:', self.answer_var.ndim)
        if self.answer_vec == 'word2vec':
            self.loss_ce = nn_utils.cosine_proximity_loss(
                self.prediction.dimshuffle('x', 0),
                T.stack([self.answer_var]))[0][0]
        else:
            self.loss_ce = T.nnet.categorical_crossentropy(
                self.prediction.dimshuffle('x', 0),
                T.stack([self.answer_var]))[0]
        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        if debug: print(self.loss.ndim)
        # if self.debug: print(self.loss.eval({self.input_var:self.train_input,self.q_var:self.train_q,self.answer_var:self.train_answer,self.input_mask_var:self.train_input_mask}))
        updates = lasagne.updates.adadelta(self.loss, self.params)

        if self.mode == 'deploy':
            self.deploy_fn = theano.function(
                inputs=[self.input_var, self.q_var], outputs=[self.prediction])

        else:
            if self.mode == 'train':
                print("==> compiling train_fn")
                self.train_fn = theano.function(
                    inputs=[self.input_var, self.q_var, self.answer_var],
                    outputs=[self.prediction, self.loss],
                    updates=updates)

            print("==> compiling test_fn")
            self.test_fn = theano.function(
                inputs=[self.input_var, self.q_var, self.answer_var],
                outputs=[
                    self.prediction, self.loss, self.input_var, self.q_var,
                    last_mem
                ])

            if self.mode == 'train':
                print("==> computing gradients (for debugging)")
                gradient = T.grad(self.loss, self.params)
                self.get_gradient_fn = theano.function(
                    inputs=[self.input_var, self.q_var, self.answer_var],
                    outputs=gradient)
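
nn_utils.cosine_proximity_loss is defined elsewhere in the surrounding project; a plausible minimal version, assuming row-wise predictions and targets, could look like this:

import theano.tensor as T

def cosine_proximity_loss(pred, target, eps=1e-8):
    """Negative cosine similarity between corresponding rows of pred and target."""
    pred_n = pred / (T.sqrt(T.sum(T.sqr(pred), axis=1, keepdims=True)) + eps)
    target_n = target / (T.sqrt(T.sum(T.sqr(target), axis=1, keepdims=True)) + eps)
    # shape (batch, 1), matching the [0][0] indexing used above
    return -T.sum(pred_n * target_n, axis=1, keepdims=True)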
Esempio n. 39
0
    def get_output(self, input_, mask_, hidden_init):
        """
        This function overrides the parent's one.
        Creates a symbolic expression computing the output from the input and the
        previous output (hidden state).

        Math Expression
        -------------------
        Y[t] = out_activation(dot(X[t], W) + dot(Y[t-1], U) + b)

        If precompute is True, dot(X[t], W) is computed for all steps first.
        Where the mask is 0, the previous state is kept: Y[t] = Y[t-1].

        Parameters
        ----------
        input_: TensorVariable
        mask_: TensorVariable
        hidden_init: TensorVariable

        Returns
        -------
        TensorVariable
        """
        # input_ are (n_batch, n_timesteps, n_features)
        # change to (n_timesteps, n_batch, n_features)
        input_ = input_.dimshuffle(1, 0, 2)
        # mask_ are (n_batch, n_timesteps)
        mask_ = mask_.dimshuffle(1, 0, 'x')
        sequence_length = input_.shape[0]
        batch_num = input_.shape[1]

        # precompute input
        if self.precompute:
            additional_dims = tuple(
                input_.shape[k] for k in range(2, input_.ndim))  # (output_dim,)
            input_ = T.reshape(input_, (sequence_length * batch_num, ) +
                               additional_dims)
            input_ = T.dot(input_, self.W)
            additional_dims = tuple(
                input_.shape[k] for k in range(1, input_.ndim))  # (output_dim,)
            input_ = T.reshape(input_, (
                sequence_length,
                batch_num,
            ) + additional_dims)

        # step function
        def step(input_, hidden):
            if self.precompute:
                return self.out_activation.get_output(input_ +
                                                      T.dot(hidden, self.U) +
                                                      self.b)
            else:
                return self.out_activation.get_output(
                    T.dot(input_, self.W) + T.dot(hidden, self.U) + self.b)

        # step function, with mask
        def step_masked(input_, mask_, hidden):
            hidden_computed = step(input_, hidden)
            return T.switch(mask_, hidden_computed, hidden)

        # main operation
        if self.unroll:
            counter = range(self.gradient_steps)
            if self.backward:
                counter = counter[::-1]  # reversed index
            iter_output = []
            outputs_info = [hidden_init]
            for index in counter:
                step_input = [input_[index], mask_[index]] + outputs_info
                step_output = step_masked(*step_input)
                iter_output.append(step_output)
                outputs_info = [iter_output[-1]]
            hidden_output = T.stack(iter_output, axis=0)

        else:
            hidden_output = theano.scan(
                fn=step_masked,
                sequences=[input_, mask_],
                outputs_info=[hidden_init],
                go_backwards=self.backward,
                n_steps=None,
                truncate_gradient=self.gradient_steps)[
                    0]  # only need outputs, not updates

        # computed output are (n_timesteps, n_batch, n_features)
        # select only required
        if self.output_return_index is None:
            hidden_output_return = hidden_output
        else:
            hidden_output_return = hidden_output[self.output_return_index]
        # change to (n_batch, n_timesteps, n_features)
        hidden_output_return = hidden_output_return.dimshuffle(
            1, 0, *range(2, hidden_output_return.ndim))

        # backward order straight
        if self.backward:
            hidden_output_return = hidden_output_return[:, ::-1]

        return hidden_output_return
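
The masked step above carries the previous hidden state wherever the mask is 0. A compact scan-based sketch of the same pattern, with made-up sizes and parameter names:

import numpy as np
import theano
import theano.tensor as T

n_in, n_hid = 3, 4
W = theano.shared(np.random.randn(n_in, n_hid).astype(theano.config.floatX))
U = theano.shared(np.random.randn(n_hid, n_hid).astype(theano.config.floatX))
b = theano.shared(np.zeros(n_hid, dtype=theano.config.floatX))

x = T.tensor3('x')        # (n_batch, n_steps, n_in)
mask = T.matrix('mask')   # (n_batch, n_steps)

x_t = x.dimshuffle(1, 0, 2)
mask_t = mask.dimshuffle(1, 0, 'x')

def step(x_, m_, h_):
    h_new = T.tanh(T.dot(x_, W) + T.dot(h_, U) + b)
    return T.switch(m_, h_new, h_)   # carry the old state where the mask is 0

h0 = T.zeros((x.shape[0], n_hid))
h, _ = theano.scan(step, sequences=[x_t, mask_t], outputs_info=[h0])
f = theano.function([x, mask], h.dimshuffle(1, 0, 2))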
Esempio n. 40
0
def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
    """
    Function :func:`neibs2images <theano.sandbox.neighbours.neibs2images>`
    performs the inverse operation of
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`. It takes
    the output of :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`
    and reconstructs its input.

    :param neibs: matrix like the one obtained by
                  :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`
    :param neib_shape: `neib_shape` that was used in
                  :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`
    :param original_shape: original shape of the 4d tensor given to
                  :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`

    :return: Reconstructs the input of
                  :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`,
                  a 4d tensor of shape `original_shape`.

    .. note:: Currently, the function doesn't support tensors created with
       `neib_step` different from the default value. This means that it may be
       impossible to compute the gradient of a variable gained by
       :func:`images2neibs <theano.sandbox.neighbours.images2neibs>` w.r.t.
       its inputs in this case, because it uses
       :func:`images2neibs <theano.sandbox.neighbours.images2neibs>` for
       gradient computation.

    Example, which uses a tensor gained in the example for
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`:

    .. code-block:: python

        im_new = neibs2images(neibs, (5, 5), im_val.shape)
        # Theano function definition
        inv_window = theano.function([neibs], im_new)
        # Function application
        im_new_val = inv_window(neibs_val)

    .. note:: The code will output the initial image array.
    """
    neibs = T.as_tensor_variable(neibs)
    neib_shape = T.as_tensor_variable(neib_shape)
    original_shape = T.as_tensor_variable(original_shape)

    new_neib_shape = T.stack(original_shape[-1] // neib_shape[1],
                             neib_shape[1])
    output_2d = images2neibs(neibs.dimshuffle('x', 'x', 0, 1),
                             new_neib_shape, mode=mode)

    if mode == 'ignore_borders':
        valid_shape = list(original_shape)
        valid_shape[2] = (valid_shape[2] // neib_shape[0]) * neib_shape[0]
        valid_shape[3] = (valid_shape[3] // neib_shape[1]) * neib_shape[1]
        output_4d = output_2d.reshape(valid_shape)
        # padding the borders with zeros
        for d in [2, 3]:
            pad_shape = list(output_4d.shape)
            pad_shape[d] = original_shape[d] - valid_shape[d]
            output_4d = T.concatenate([output_4d, T.zeros(pad_shape)], axis=d)
    elif mode == 'valid':
        # TODO: we do not implement all mode with this code.
        # Add a check for the good cases.
        output_4d = output_2d.reshape(original_shape)
    else:
        raise NotImplementedError("neibs2images does not support mode=%s" % mode)

    return output_4d
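
A runnable version of the docstring example; on recent Theano releases the functions live in theano.tensor.nnet.neighbours (older releases expose them via theano.sandbox.neighbours):

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet.neighbours import images2neibs, neibs2images

images = T.tensor4('images')
neibs = images2neibs(images, neib_shape=(5, 5))

im_val = np.arange(100., dtype=theano.config.floatX).reshape(1, 1, 10, 10)

# invert the operation, as in the docstring example above
im_new = neibs2images(neibs, (5, 5), im_val.shape)
inv_window = theano.function([images], im_new)
print(np.allclose(inv_window(im_val), im_val))   # True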
Esempio n. 41
0
    def __init__(self,
                 cooccurrence,
                 z_k,
                 opt,
                 initializer,
                 initial_pz_weight=None,
                 initial_b=None,
                 pz_regularizer=None,
                 tao0=5.,
                 tao_min=0.25,
                 tao_decay=1e-6,
                 eps=1e-9):
        cooccurrence = cooccurrence.astype(np.float32)
        self.cooccurrence = cooccurrence
        self.z_k = z_k
        self.opt = opt
        x_k = cooccurrence.shape[0]
        self.x_k = x_k

        # cooccurrence matrix
        n = np.sum(cooccurrence, axis=None)
        _co = cooccurrence / n
        co = T.constant(_co, name="co")  # (x_k, x_k)
        _co_m = np.sum(_co, axis=1, keepdims=True)
        co_m = T.constant(_co_m, name="co_m")  # (x_k,1)
        _co_c = _co / (eps + _co_m)
        _co_h = np.sum(_co * -np.log(eps + _co_c), axis=1, keepdims=True)  # (x_k, 1)
        print "COh: {}".format(np.sum(_co_h))
        co_h = T.constant(_co_h, name="co_h")

        if initial_pz_weight is None:
            initial_pz_weight = initializer((x_k, z_k))
        pz_weight = K.variable(initial_pz_weight)
        pz = softmax_nd(pz_weight)

        srng = RandomStreams(123)
        rnd = srng.uniform(low=0., high=1., dtype='float32', size=(x_k, z_k))
        gumbel = -T.log(eps + T.nnet.relu(-T.log(eps + rnd)))

        iteration = K.variable(0, dtype='int32')
        temp = T.max(T.stack((tao_min, tao0 / (1. + (tao_decay * iteration)))))

        z = softmax_nd((pz_weight + gumbel) / (eps + temp))
        # z = pz
        w = K.variable(initializer((z_k, x_k)))
        if initial_b is None:
            initial_b = initializer((x_k,))
        b = K.variable(initial_b)
        y = softmax_nd(T.dot(z, w) + b)

        self.params = [pz_weight, w, b]

        nll_loss = -T.sum(co * T.log(eps + y), axis=None)
        reg_loss = T.constant(0.)
        if pz_regularizer:
            reg_loss = pz_regularizer(pz)
        total_loss = nll_loss + reg_loss

        decay_updates = [(iteration, iteration + 1)]

        encoding = T.argmax(pz, axis=1)
        one_hot_encoding = tensor_one_hot(encoding, z_k)  # (x_k, z_k)

        pb = T.dot(T.transpose(one_hot_encoding, (1, 0)), co)
        m = T.sum(pb, axis=1, keepdims=True)
        c = pb / (m + eps)
        validation_nll = -T.sum(pb * T.log(eps + c), axis=None)

        utilization = T.sum(T.gt(T.sum(one_hot_encoding, axis=0), 0), axis=0)
        updates = opt.get_updates(loss=total_loss, params=self.params)

        self.val_fun = theano.function([], validation_nll)
        self.encodings_fun = theano.function([], encoding)
        self.train_fun = theano.function([], [reg_loss, nll_loss, utilization, temp],
                                         updates=updates + decay_updates)
        self.weights = self.params + opt.weights + [iteration]
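
The temperature schedule above floors an annealed value at tao_min via T.stack and T.max. The same piece in isolation:

import numpy as np
import theano
import theano.tensor as T

tao0, tao_min, tao_decay = 5., 0.25, 1e-6
iteration = theano.shared(np.int32(0), name='iteration')
temp = T.max(T.stack((tao_min, tao0 / (1. + tao_decay * iteration))))
step = theano.function([], temp, updates=[(iteration, iteration + 1)])
print(step())   # 5.0 on the first call, decaying towards 0.25 afterwards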
Esempio n. 42
0
def _infer_ndim_bcast(ndim, shape, *args):
    """
    Infer the number of dimensions from the shape or the other arguments.

    :rtype: (int, variable, tuple) triple, where the variable is an integer
    vector, and the tuple contains Booleans.
    :returns: the first element returned is the inferred number of dimensions.
    The second element is the shape inferred (combining symbolic and constant
    information from shape and args).
    The third element is a broadcasting pattern corresponding to that shape.
    """

    # Find the minimum value of ndim required by the *args
    if args:
        args_ndim = max(arg.ndim for arg in args)
    else:
        args_ndim = 0

    if isinstance(shape, (tuple, list)):
        # there is a convention that -1 means the corresponding shape of a
        # potentially-broadcasted symbolic arg
        #
        # This case combines together symbolic and non-symbolic shape
        # information
        shape_ndim = len(shape)
        if ndim is None:
            ndim = shape_ndim
        else:
            if shape_ndim != ndim:
                raise ValueError(
                    'ndim should be equal to len(shape), but\n',
                    'ndim = %s, len(shape) = %s, shape = %s' %
                    (ndim, shape_ndim, shape))

        bcast = []
        pre_v_shape = []
        for i, s in enumerate(shape):
            if hasattr(s, 'type'):  # s is symbolic
                bcast.append(False)  # todo - introspect further
                pre_v_shape.append(s)
            else:
                if s >= 0:
                    pre_v_shape.append(tensor.as_tensor_variable(s))
                    bcast.append((s == 1))
                elif s == -1:
                    n_a_i = 0
                    for a in args:
                        # ndim: _   _   _   _   _   _
                        # ashp:         s0  s1  s2  s3
                        #           i
                        if i >= ndim - a.ndim:
                            n_a_i += 1
                            a_i = i + a.ndim - ndim
                            if not a.broadcastable[a_i]:
                                pre_v_shape.append(a.shape[a_i])
                                bcast.append(False)
                                break
                    else:
                        if n_a_i == 0:
                            raise ValueError(
                                ('Auto-shape of -1 must overlap '
                                 'with the shape of one of the broadcastable '
                                 'inputs'))
                        else:
                            pre_v_shape.append(tensor.as_tensor_variable(1))
                            bcast.append(True)
                else:
                    raise ValueError('negative shape', s)
        # post-condition: shape may still contain both symbolic and
        # non-symbolic things
        if len(pre_v_shape) == 0:
            v_shape = tensor.constant([], dtype='int32')
        else:
            v_shape = tensor.stack(*pre_v_shape)

    elif shape is None:
        # The number of drawn samples will be determined automatically,
        # but we need to know ndim
        if not args:
            raise TypeError(('_infer_ndim_bcast cannot infer shape without'
                             ' either shape or args'))
        template = reduce(lambda a, b: a + b, args)
        v_shape = template.shape
        bcast = template.broadcastable
        ndim = template.ndim
    else:
        v_shape = tensor.as_tensor_variable(shape)
        if ndim is None:
            ndim = tensor.get_vector_length(v_shape)
        bcast = [False] * ndim

    if (not (v_shape.dtype.startswith('int')
             or v_shape.dtype.startswith('uint'))):
        raise TypeError('shape must be an integer vector or list',
                        v_shape.dtype)

    if args_ndim > ndim:
        raise ValueError(
            'ndim should be at least as big as required by args value',
            (ndim, args_ndim), args)

    assert ndim == len(bcast)
    return ndim, tensor.cast(v_shape, 'int32'), tuple(bcast)
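
The stacked pre_v_shape above becomes a symbolic shape vector. A tiny sketch of the same idea, using the stacked shape to reshape a tensor:

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
n = T.iscalar('n')
new_shape = T.stack([n, x.shape[0] // n])   # symbolic shape vector
y = x.reshape(new_shape, ndim=2)
f = theano.function([x, n], y, allow_input_downcast=True)
print(f(np.arange(6.), 2).shape)            # (2, 3)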
Esempio n. 43
0
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, dim, mode, answer_module, input_mask_mode,
                 memory_hops, l2, normalize_attention, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.vocab = {}
        self.ivocab = {}

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        #self.batch_size = 1
        self.l2 = l2
        self.normalize_attention = normalize_attention

        self.train_input, self.train_q, self.train_answer, self.train_choices, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_choices, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = 2  # number of answer choices

        self.inp_var = T.matrix('input_var')
        self.q_var = T.matrix('question_var')
        self.ca_var = T.matrix('ca_var')
        self.cb_var = T.matrix('cb_var')
        #self.cc_var = T.matrix('cc_var')
        #self.cd_var = T.matrix('cd_var')
        self.ans_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')

        print "==> building input module"
        self.W_inp_res_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.word_vector_size)),
                                          borrow=True)
        self.W_inp_res_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_inp_res = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_inp_upd_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.word_vector_size)),
                                          borrow=True)
        self.W_inp_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_inp_upd = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_inp_hid_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.word_vector_size)),
                                          borrow=True)
        self.W_inp_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_inp_hid = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=self.inp_var,
                                       outputs_info=T.zeros_like(
                                           self.b_inp_hid))

        self.inp_c = inp_c_history.take(self.input_mask_var, axis=0)

        self.q_q, _ = theano.scan(fn=self.input_gru_step,
                                  sequences=self.q_var,
                                  outputs_info=T.zeros_like(self.b_inp_hid))

        self.q_q = self.q_q[-1]

        self.c_vecs = []
        #for choice in [self.ca_var, self.cb_var, self.cc_var, self.cd_var]:
        for choice in [self.ca_var, self.cb_var]:
            history, _ = theano.scan(fn=self.input_gru_step,
                                     sequences=choice,
                                     outputs_info=T.zeros_like(self.b_inp_hid))
            self.c_vecs.append(history[-1])

        self.c_vecs = T.stack(self.c_vecs).transpose((1, 0))  # (dim, 2)
        self.inp_c = T.stack([self.inp_c] * 2).transpose(
            (1, 2, 0))  # (fact_cnt, dim, 2)
        self.q_q = T.stack([self.q_q] * 2).transpose((1, 0))  # (dim, 2)

        print "==> creating parameters for memory module"
        self.W_mem_res_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                          borrow=True)
        self.W_mem_res_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_mem_res = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_mem_upd_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                          borrow=True)
        self.W_mem_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_mem_upd = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_mem_hid_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                          borrow=True)
        self.W_mem_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_mem_hid = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_b = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                 borrow=True)
        self.W_1 = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, 10 * self.dim + 3)),
                                 borrow=True)
        self.W_2 = theano.shared(lasagne.init.Normal(0.1).sample(
            (1, self.dim)),
                                 borrow=True)
        self.b_1 = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                 borrow=True)
        self.b_2 = theano.shared(lasagne.init.Constant(0.0).sample((1, )),
                                 borrow=True)

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]  # (dim, 2)
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update_batch(memory[iter - 1], current_episode,
                                      self.W_mem_res_in, self.W_mem_res_hid,
                                      self.b_mem_res, self.W_mem_upd_in,
                                      self.W_mem_upd_hid, self.b_mem_upd,
                                      self.W_mem_hid_in, self.W_mem_hid_hid,
                                      self.b_mem_hid))

        last_mem = memory[-1].flatten()

        print "==> building answer module"
        self.W_a = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.vocab_size, 2 * self.dim)),
                                 borrow=True)

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
                a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                    self.W_ans_res_in, self.W_ans_res_hid,
                                    self.b_ans_res, self.W_ans_upd_in,
                                    self.W_ans_upd_hid, self.b_ans_upd,
                                    self.W_ans_hid_in, self.W_ans_hid_hid,
                                    self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]

            # TODO: add conditional ending
            dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))
            results, updates = theano.scan(
                fn=answer_step,
                outputs_info=[last_mem, T.zeros_like(dummy)],
                n_steps=1)
            self.prediction = results[1][-1]

        print "==> collecting all parameters"
        self.params = [
            self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res,
            self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd,
            self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid,
            self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
            self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
            self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b,
            self.W_1, self.W_2, self.b_1, self.b_2, self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(
            self.prediction.dimshuffle('x', 0), T.stack([self.ans_var]))[0]
        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.inp_var,
                    self.q_var,
                    self.ans_var,
                    self.ca_var,
                    self.cb_var,  # self.cc_var, self.cd_var,
                    self.input_mask_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[
                self.inp_var,
                self.q_var,
                self.ans_var,
                self.ca_var,
                self.cb_var,  # self.cc_var, self.cd_var,
                self.input_mask_var
            ],
            outputs=[
                self.prediction, self.loss, self.inp_c, self.q_q, last_mem
            ])

        if self.mode == 'train':
            print "==> computing gradients (for debugging)"
            gradient = T.grad(self.loss, self.params)
            self.get_gradient_fn = theano.function(
                inputs=[
                    self.inp_var,
                    self.q_var,
                    self.ans_var,
                    self.ca_var,
                    self.cb_var,  # self.cc_var, self.cd_var,
                    self.input_mask_var
                ],
                outputs=gradient)
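
The multiple-choice variant above replicates each encoding once per answer choice with T.stack and a transpose. In isolation:

import numpy as np
import theano
import theano.tensor as T

q = T.vector('q')                       # a single question encoding (dim,)
n_choices = 2
# replicate the encoding per choice and move choices to the last axis,
# mirroring the T.stack(...).transpose((1, 0)) idiom above
q_batch = T.stack([q] * n_choices).transpose((1, 0))   # (dim, n_choices)
f = theano.function([q], q_batch, allow_input_downcast=True)
print(f(np.arange(4.)).shape)           # (4, 2)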
Esempio n. 44
0
def generate_hierarchical_model_parameters(parameter, n_subjects, design,
                                           mu_lower, mu_upper, sd_lower,
                                           sd_upper, bound_lower, bound_upper,
                                           val, testval):

    if (design['conditions'] is not None):
        if val is None:
            mu = tt.stack([
                pm.Uniform('{}_{}_mu'.format(parameter, condition),
                           mu_lower,
                           mu_upper,
                           testval=testval)
                for condition in design['conditions']
            ])
            sd = tt.stack([
                pm.Uniform('{}_{}_sd'.format(parameter, condition),
                           sd_lower,
                           sd_upper,
                           testval=testval)
                for condition in design['conditions']
            ])
            bounded = pm.Bound(pm.Normal, bound_lower, bound_upper)
            parms = []
            n_subjects_per_condition = []
            for c, condition in enumerate(design['conditions']):
                n_subjects_in_condition = np.unique(design['subject_index'][
                    design['condition_index'] == c]).size
                n_subjects_per_condition.append(n_subjects_in_condition)
                parms_tmp = bounded('{}_{}'.format(parameter, condition),
                                    mu=mu[c],
                                    sd=sd[c],
                                    shape=(n_subjects_in_condition))
                parms_tmp = tt.concatenate([tt.zeros(1), parms_tmp])
                parms.append(parms_tmp[design['D'][:, c]][:, None])
            parms = tt.concatenate(parms, axis=1)

        else:
            parms = []
            n_subjects_per_condition = []
            for c, condition in enumerate(design['conditions']):
                n_subjects_in_condition = np.unique(design['subject_index'][
                    design['condition_index'] == c]).size
                n_subjects_per_condition.append(n_subjects_in_condition)
                if len(val) == len(design['conditions']):
                    parms.append(
                        pm.Deterministic(
                            '{}_{}'.format(parameter, condition),
                            tt.ones((n_subjects_in_condition, 1)) * val[c]))
                else:
                    raise ValueError(
                        'Number of values in {}_val does not match the number of specified {}-conditions.'
                        .format(parameter, parameter))
            # make sure all elements in parms have same size
            for set_i, parm_set in enumerate(parms):
                if n_subjects_per_condition[set_i] < n_subjects:
                    parms[set_i] = tt.concatenate([
                        parm_set,
                        tt.zeros(
                            (n_subjects - n_subjects_per_condition[set_i], 1))
                    ],
                                                  axis=0)
            parms = tt.concatenate(parms, axis=1)

    else:
        if val is None:
            mu = pm.Uniform('{}_mu'.format(parameter),
                            mu_lower,
                            mu_upper,
                            testval=testval)
            sd = pm.Uniform('{}_sd'.format(parameter),
                            sd_lower,
                            sd_upper,
                            testval=testval)
            bounded = pm.Bound(pm.Normal, bound_lower, bound_upper)
            parms = bounded(parameter, mu=mu, sd=sd, shape=(n_subjects, 1))
        else:
            parms = pm.Deterministic(parameter, tt.ones((n_subjects, 1)) * val)

    return parms
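
A minimal sketch of the per-condition prior stacking used above, with hypothetical condition names and bounds; it assumes PyMC3 and its Theano backend:

import pymc3 as pm
import theano.tensor as tt

conditions = ['easy', 'hard']    # hypothetical condition labels
with pm.Model():
    # one group-level mean and sd per condition, collected into vectors
    mu = tt.stack([pm.Uniform('v_%s_mu' % c, -5., 5.) for c in conditions])
    sd = tt.stack([pm.Uniform('v_%s_sd' % c, 0.01, 2.) for c in conditions])
    bounded_normal = pm.Bound(pm.Normal, -10., 10.)
    v_easy = bounded_normal('v_easy', mu=mu[0], sd=sd[0], shape=3)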
Esempio n. 45
0
    # The number of spindles per epoch:
    num_spindles_per_epoch = pm.Categorical('num_spindles_per_epoch',
                                            p=pm.Dirichlet(
                                                'spindle_num_prior',
                                                a=spindle_number_prior),
                                            testval=1)

    # ----Tracking whether a rater's spindle marker is real or a contaminant-----
    # if the number of spindles in an epoch (z) is greater than 0, then use conf to determine if a spindle is real or not
    #spindle_chance = data['conf']  # pm.math.switch(num_spindles_per_epoch[data['epoch_i']] > 0, data['conf'], 0)
    spindle_chance_prior = pm.Beta('spindle_chance_prior', alpha=2, beta=1)
    marker_is_from_real_spindle = pm.Bernoulli('marker_is_from_real_spindle',
                                               p=spindle_chance_prior,
                                               shape=n_data)
    marker_is_from_real_spindle_stacked = tt.stack(
        [marker_is_from_real_spindle, 1 - marker_is_from_real_spindle],
        axis=1)  # stack theta for use in mixture model

    # ----Mapping between rater's spindles and real spindles (w)----
    ## Handy matrix to compare z too
    compare = np.arange(0, max_spindles_per_epoch + 1)  # [0 1 2 3 4 5]*epochs

    # Actual prior for "mapping_marker_to_true_spindle"
    # shape=[n_epochs, max_spindles_per_epoch],
    # e.g. mapping_marker_to_true_spindle_prior for a single epoch will be like [1 1 1 0 0 0 0],
    # and therefore the mapping_marker_to_true_spindle's can only be from [0-2]-1 = [-1, 0, 1], where -1=no mapping
    mapping_marker_to_true_spindle_prior = pm.math.where(
        compare - num_spindles_per_epoch <= 0, 1, 0)
    # no_spindles_prior = np.zeros((n_data, 6))
    # no_spindles_prior[:, 0] = 1
    # mapping_prior = tt.switch(marker_is_from_real_spindle.reshape((n_data, 1)), mapping_marker_to_true_spindle_prior, no_spindles_prior)
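
A minimal sketch of the two-column mixture weights built above with tt.stack, assuming PyMC3:

import pymc3 as pm
import theano.tensor as tt

n_data = 10
with pm.Model():
    p_real = pm.Beta('p_real', alpha=2, beta=1)
    is_real = pm.Bernoulli('is_real', p=p_real, shape=n_data)
    # weight matrix of shape (n_data, 2) for a two-component mixture
    w = tt.stack([is_real, 1 - is_real], axis=1)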
Esempio n. 46
0
    def __init__(self, model, algo='fisher', c_lambd_inv=1e-3, rate=1.05,
                 over_sampling=1, rescale='momentum'):
        """ Init self.

        Args:
            model,
            algo,
            c_lambd_inv: Start value of \lambda regularizer (used in matrix
                inversion and in F*v computation).
            rate: Change per iteration for \lambda.
            over_sampling: For Fisher-like methods, use multiple random
                vectors per one sample from dataset.
            rescale: Can be either False, True or 'momentum'.

        Implemented algos:
            'gn' - Gauss-Newton matrix,
            'fisher' - Fisher matrix,
            'kr' - Khatri-Rao matrix,
            'kr_diag' - block-diagonal KR matrix.
        """
        self.model = model
        self.algo = algo

        self.x = self.model.x
        self.y = T.ivector('y')
        self.outc = T.matrix('outc')

        # due to theano bugs
        self.x_d = shared_empty(2)
        self.y_d = shared_empty(1, dtype='int32')
        self.outc_d = shared_empty(2)
        self.rand_outc_d = shared_empty(3)
        # ---

        self.rand_outc = T.tensor3('rand_outc')
        self.lambd_inv = T.scalar('lambd_inv')

        self.c_lambd_inv = c_lambd_inv
        self.rate = rate
        self.over_sampling = over_sampling
        self.rescale = rescale

        # -- target def --
        self.f_loss = 0
        self.f_loss_samples = 0
        for i in range(self.over_sampling):
            self.f_loss += get_loss(self.model.a, self.rand_outc[i] + my_consider_constant(self.model.a)) * scalar_floatX(self.model.a.shape[0])
            self.f_loss_samples += get_loss_samples(self.model.a, self.rand_outc[i] + my_consider_constant(self.model.a))

        self.loss = get_loss(self.model.a, self.outc)
        self.err = get_error(get_pred(self.model.a), self.y)

        self.updates = OrderedDict()

        self.grad = sum(([T.grad(self.loss, p)] for p in self.model.params), [])
        self.grad_vec = T.concatenate([g.flatten() for g in self.grad])

        def get_fisher_mat():
            grad2d = []
            for p in self.model.params:
                grad2d += [T.jacobian(self.f_loss_samples, p)]
                if grad2d[-1].ndim == 2:
                    grad2d[-1] = grad2d[-1].dimshuffle(0, 1, 'x')

            grad2d_vec = T.concatenate([g.flatten(2).T for g in grad2d]).T

            # tensor wise: F_p,i,j = sum_k grad2d[p,i,k]*grad2d[p,k,j]
            # just a slow reference implementation of what is below
            # F = T.mean(T.batched_dot(grad2d_vec.dimshuffle(0, 1, 'x'), grad2d_vec.dimshuffle(0, 'x', 1)), 0)/self.over_sampling
            F = T.dot(grad2d_vec.T, grad2d_vec)/T.cast(grad2d_vec.shape[0], theano.config.floatX)/self.over_sampling
            return F

        if self.algo == 'fisher':
            self.grad2d = []
            for p in self.model.params:
                self.grad2d += [T.jacobian(self.f_loss_samples, p)]
                if self.grad2d[-1].ndim == 2:
                    self.grad2d[-1] = self.grad2d[-1].dimshuffle(0, 1, 'x')

            self.grad2d_vec = T.concatenate([g.flatten(2).T for g in self.grad2d]).T

            # tensor wise: F_p,i,j = sum_k grad2d[p,i,k]*grad2d[p,k,j]
            # just a slow reference implementation of what is below
            # F = T.mean(T.batched_dot(grad2d_vec.dimshuffle(0, 1, 'x'), grad2d_vec.dimshuffle(0, 'x', 1)), 0)/self.over_sampling
            self.F = T.dot(self.grad2d_vec.T, self.grad2d_vec)/T.cast(self.grad2d_vec.shape[0], theano.config.floatX)/self.over_sampling
        elif self.algo == 'gn':
            self.grad2d = []
            for p in self.model.params:
                self.grad2d += [T.jacobian(self.model.a.flatten(), p)]
                new_shape = (self.model.a.shape[0], self.model.a.shape[1], -1)
                self.grad2d[-1] = self.grad2d[-1].reshape(new_shape)


            self.grad2d_vec = T.concatenate([g.flatten(3) for g in self.grad2d], 2)

            # just a slow reference implementation of what is below
            # self.F = T.mean(T.batched_dot(self.grad2d_vec.dimshuffle(0, 2, 1),
            #                               self.grad2d_vec.dimshuffle(0, 1, 2)), axis=0)

            self.F = T.tensordot(self.grad2d_vec.dimshuffle(0, 2, 1),
                                 self.grad2d_vec.dimshuffle(0, 1, 2), [(0, 2), (0, 1)])/T.cast(self.grad2d_vec.shape[0], theano.config.floatX)
        elif self.algo.startswith('kr'):
            self.grads = []
            # self.acts = [T.concatenate([self.model.x, T.ones((self.model.x.shape[0], 1))], axis=1)]
            self.acts = [self.model.x]
            for l in self.model.layers:
                cg = T.grad(self.f_loss, l.s)
                self.grads.append(cg)
                # self.acts.append(T.concatenate([l.a, T.ones((l.a.shape[0], 1))], axis=1))
                self.acts.append(l.a)

            self.G = []
            self.A = []
            self.F_block = []
            self.F = []

            cnt = T.cast(self.grads[0].shape[0], theano.config.floatX)
            for i in range(len(self.grads)):
                self.G += [[]]
                self.A += [[]]
                for j in range(len(self.grads)):
                    # self.G[-1] += [T.mean(T.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)]
                    # self.A[-1] += [T.mean(T.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)]

                    # self.G[-1] += [T.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1))]
                    # self.A[-1] += [T.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1))]

                    self.G[-1] += [self.grads[i].T.dot(self.grads[j]).dimshuffle('x', 0, 1)/cnt]
                    self.A[-1] += [self.acts[i].T.dot(self.acts[j]).dimshuffle('x', 0, 1)/cnt]

                    if self.algo.endswith('diag'):
                        self.G[-1][-1] *= float(i==j)
                        self.A[-1][-1] *= float(i==j)


            for i in range(len(self.grads)):
                self.F_block += [[]]
                for j in range(len(self.grads)):
                    # depends on whether you want to compute the real fisher with this or the kr approximation
                    # since numpy-based fast_kron somehow computes 3d tensors faster than theano

                    # cblock = fast_kron(self.A[i][j], self.G[i][j])
                    cblock = native_kron(self.A[i][j], self.G[i][j])

                    cblock = cblock.reshape(cblock.shape[1:], ndim=2)
                    self.F_block[i] += [cblock]
                self.F.append(T.concatenate(self.F_block[-1], axis=1))
            self.F = T.concatenate(self.F, axis=0)
            self.F = (self.F+self.F.T)/2


        self.Fdamp = self.F+T.identity_like(self.F)*self.lambd_inv

        # There're 3+ different ways of computing F^-1*v in theano,
        # and it seems like solve_sym_pos is quite neutral in terms
        # of performance + it throws an exception if the provided matrix
        # is singular.

        # self.new_grad_vec = theano.tensor.slinalg.solve(self.Fdamp, self.grad_vec.dimshuffle(0, 'x'))
        self.new_grad_vec = solve_sym_pos(self.Fdamp, self.grad_vec)
        # self.new_grad_vec = gpu_solve(self.Fdamp, self.grad_vec.dimshuffle(0, 'x'))

        pcount = sum(p.get_value().size for p in self.model.params)
        self.ch_history = theano.shared(np.zeros((pcount,), dtype=theano.config.floatX))

        if self.rescale == 'momentum':
            self.real_fish = get_fisher_mat() + T.identity_like(self.F)*self.lambd_inv

            FT = self.real_fish.dot(self.new_grad_vec)
            FM = self.real_fish.dot(self.ch_history)

            TFT = self.new_grad_vec.T.dot(FT)
            MFT = self.ch_history.T.dot(FT)
            MFM = self.ch_history.T.dot(FM)

            GT = self.grad_vec.T.dot(self.new_grad_vec)
            GM = self.grad_vec.T.dot(self.ch_history)


            tmp1 = T.stack([TFT.reshape(()), MFT.reshape(())], 0).dimshuffle('x', 0)
            tmp2 = T.stack([MFT.reshape(()), MFM.reshape(())], 0).dimshuffle('x', 0)

            A = T.concatenate([tmp1, tmp2], 0)
            A_pinv = T.nlinalg.MatrixPinv()(A)
            b = T.stack([GT.reshape(()), GM.reshape(())], 0).dimshuffle(0, 'x')

            res = A_pinv.dot(b).flatten()

            alpha = res[0]
            beta = res[1]

            self.new_grad_vec = self.new_grad_vec * alpha.reshape(()) + self.ch_history * beta.reshape(())
            self.F = self.real_fish

            self.updates[self.ch_history] = self.new_grad_vec
        elif self.rescale:
            self.real_fish = get_fisher_mat() + T.identity_like(self.F)*self.lambd_inv
            lin_fac = self.grad_vec.T.dot(self.new_grad_vec)
            quad_fac = self.new_grad_vec.T.dot(self.real_fish.dot(self.new_grad_vec))

            alpha = lin_fac/quad_fac
            beta = 0 * alpha

            self.new_grad_vec *= alpha.reshape(())
            self.F = self.real_fish
            # self.Fdamp = self.F+T.identity_like(self.F)*self.lambd_inv

        # alpha = T.as_tensor_variable(1)

        def _apply_gradient_vec(params, new_grad_vec, updates):
            new_grad = []
            offset = 0
            for p in params:
                pval = p.get_value()
                new_grad += [new_grad_vec[offset:offset+pval.size].reshape(pval.shape)]
                offset += pval.size

                updates[p] = p - new_grad[-1]

            return new_grad

        self.new_grad = _apply_gradient_vec(self.model.params, self.new_grad_vec, self.updates)

        self.get_params = theano.function(
            inputs=[],
            outputs=self.model.params,
            on_unused_input='warn'
        )

        self.quad_est_loss = self.new_grad_vec.T.dot(self.F.dot(self.new_grad_vec))/2
        self.est_loss = self.quad_est_loss + self.grad_vec.dot(self.new_grad_vec)

        self.print_pls = {}
        self.print_pls.update({'shape': self.F.shape[0], 'rank': rank(self.F*10000)})
        self.print_pls.update({'grad_mean': T.mean(self.grad_vec**2)**0.5})
        self.print_pls.update({'alpha': alpha, 'beta': beta})
        # self.print_pls += [self.F]
        # self.print_pls += [self.real_fish]

        self.train = theano.function(
            inputs=[self.lambd_inv],
            outputs=[self.est_loss, self.loss, self.err] + list(self.print_pls.values()),
            updates=self.updates,
            givens={
                self.x: self.x_d,
                self.y: self.y_d,
                self.outc: self.outc_d,
                self.rand_outc: self.rand_outc_d
            },
            on_unused_input='warn',
            allow_input_downcast=True,
            # profile=True
        )

        self.eva = theano.function(
            inputs=[],
            outputs=[self.loss, self.err],
            givens={
                self.x: self.x_d,
                self.y: self.y_d,
                self.outc: self.outc_d
            },
            on_unused_input='warn',
            allow_input_downcast=True
        )

    def step(self, X, y, outc):
        """Perform single train iteration.

        Args:
            X: input vectors
            y: target labels.
            outc: target vectors.

        Returns:
            Dict consisting of 'loss', 'err', 'est_loss', 'rho', 'delta_ll' and
            parameters from self.print_pls.

        """
        self.x_d.set_value(X)
        self.y_d.set_value(y)
        self.outc_d.set_value(outc)
        self.rand_outc_d.set_value(floatX(nprng.randn(self.over_sampling, *outc.shape)))

        old_params = self.get_params()
        while True:
            # reset params to saved
            for op, p in zip(old_params, self.model.params):
                p.set_value(op)

            try:
                t_r = self.train(self.c_lambd_inv)

                print_pls_vals = t_r[-len(self.print_pls):]
                self.print_pls_res = {k: v for k, v in zip(self.print_pls.keys(), print_pls_vals)}
            except numpy.linalg.linalg.LinAlgError:
                t_r = [1e20, 1e10, 10] + [None] * len(self.print_pls)
                self.print_pls_res = {k: None for k in self.print_pls.keys()}

            e_v = self.eva()
            delta_ll = t_r[1] - e_v[0]
            rho = delta_ll/float(t_r[0])

            print()
            print('lambda:', round(self.c_lambd_inv, 7), 'rho:', round(rho, 2), 'old loss:',  t_r[1], 'new loss:', e_v[0])
            if rho < 0:
                self.c_lambd_inv *= self.rate * 2
                continue
            elif rho < 0.5:
                self.c_lambd_inv *= self.rate
                # self.c_lambd_inv = min(self.c_lambd_inv, 0.02)
            elif rho > 0.5:
                self.c_lambd_inv /= self.rate
            else:
                pass
            break

        # self.train.profiler.print_summary()
        res = {'rho': rho, 'est_loss': t_r[0], 'loss': t_r[1], 'err': t_r[2], 'delta_ll': delta_ll}
        res.update(self.print_pls_res)

        return res

    def evaluate(self, X_test, y_test, outc_test):
        """Return loss and error for provided dataset.

        Args:
            X_test: input vectors,
            y_test: target labels,
            outc_test: target vectors.

        Returns:
            Dict consisting of 'test_loss', 'test_err'.
        """
        self.x_d.set_value(X_test)
        self.y_d.set_value(y_test)
        self.outc_d.set_value(outc_test)

        te_v = self.eva()
        test_loss = te_v[0]
        test_err = te_v[1]

        return {'test_loss': test_loss, 'test_err': test_err}

    def _check_gv_matrix_correctness(self):
        v = T.vector('v')
        get_Fv = theano.function(
            inputs=[v],
            outputs=[self.F.dot(v)],
            givens={
                self.x: self.x_d,
                self.outc: self.outc_d
            },
            allow_input_downcast=True
        )

        grad_at = theano.function(
            inputs=[],
            outputs=sum(([T.grad(self.loss, p)] for p in self.model.params), []),
            givens={
                self.x: self.x_d,
                self.outc: self.outc_d
            },
            allow_input_downcast=True
        )
        grads0 = grad_at()

        vec = []

        EPS = 1e-5
        for p in self.model.params:
            vec += [nprng.randn(*p.get_value().shape).astype(theano.config.floatX)]
            p.set_value(p.get_value()+vec[-1]*EPS)
        grads1 = grad_at()

        vec_vec = np.concatenate([p.flatten() for p in vec])
        F_vec = get_Fv(vec_vec)
        F_vec_vec = np.concatenate([f.flatten() for f in F_vec])

        grads0_vec = np.concatenate([p.flatten() for p in grads0])
        grads1_vec = np.concatenate([p.flatten() for p in grads1])

        F_vec_emp = (grads1_vec-grads0_vec)/EPS

        print(np.mean(F_vec_emp**2)**0.5, np.mean(F_vec_vec**2)**0.5)
        print(np.max(np.abs(F_vec_emp-F_vec_vec)))
        exit(0)
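
As a side note, the empirical Fisher estimate assembled in get_fisher_mat() above is just the average outer product of the per-sample gradient rows; a minimal NumPy sketch (random numbers stand in for the rows of grad2d_vec):

import numpy as np

n_samples, n_params = 8, 3
per_sample_grads = np.random.randn(n_samples, n_params)   # stand-in for the rows of grad2d_vec

F = per_sample_grads.T.dot(per_sample_grads) / n_samples
# identical to averaging the per-sample outer products, which is what the
# commented-out batched_dot reference implementation computes
assert np.allclose(F, np.mean([np.outer(g, g) for g in per_sample_grads], axis=0))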
Esempio n. 47
0
def conv2d(
        input,
        filters,
        image_shape=None,
        filter_shape=None,
        border_mode="valid",
        subsample=(1, 1),
        **kargs,
):
    """
    signal.conv.conv2d performs a basic 2D convolution of the input with the
    given filters. The input parameter can be a single 2D image or a 3D tensor,
    containing a set of images. Similarly, filters can be a single 2D filter or
    a 3D tensor, corresponding to a set of 2D filters.

    Shape parameters are optional and will result in faster execution.

    Parameters
    ----------
    input   : Symbolic theano tensor for images to be filtered.
              Dimensions: ([num_images], image height, image width)
    filters : Symbolic theano tensor for convolution filter(s).
              Dimensions: ([num_filters], filter height, filter width)
    border_mode: {'valid', 'full'}
        See scipy.signal.convolve2d.
    subsample
        Factor by which to subsample output.
    image_shape : tuple of length 2 or 3
        ([num_images,] image height, image width).
    filter_shape : tuple of length 2 or 3
        ([num_filters,] filter height, filter width).
    kwargs
        See theano.tensor.nnet.conv.conv2d.

    Returns
    -------
    symbolic 2D,3D or 4D tensor
        Tensor of filtered images, with shape
        ([number images,] [number filters,] image height, image width).

    """
    assert input.ndim in (2, 3)
    assert filters.ndim in (2, 3)

    # use shape information if it is given to us ###
    if filter_shape and image_shape:
        if input.ndim == 3:
            bsize = image_shape[0]
        else:
            bsize = 1
        imshp = (1, ) + tuple(image_shape[-2:])

        if filters.ndim == 3:
            nkern = filter_shape[0]
        else:
            nkern = 1
        kshp = filter_shape[-2:]
    else:
        nkern, kshp = None, None
        bsize, imshp = None, None

    # reshape tensors to 4D, for compatibility with ConvOp ###
    if input.ndim == 3:
        sym_bsize = input.shape[0]
    else:
        sym_bsize = 1

    if filters.ndim == 3:
        sym_nkern = filters.shape[0]
    else:
        sym_nkern = 1

    new_input_shape = tensor.join(0, tensor.stack([sym_bsize, 1]),
                                  input.shape[-2:])
    input4D = tensor.reshape(input, new_input_shape, ndim=4)

    new_filter_shape = tensor.join(0, tensor.stack([sym_nkern, 1]),
                                   filters.shape[-2:])
    filters4D = tensor.reshape(filters, new_filter_shape, ndim=4)

    # perform actual convolution ###
    op = conv.ConvOp(
        output_mode=border_mode,
        dx=subsample[0],
        dy=subsample[1],
        imshp=imshp,
        kshp=kshp,
        nkern=nkern,
        bsize=bsize,
        **kargs,
    )

    output = op(input4D, filters4D)

    # flatten to 3D tensor if convolving with single filter or single image
    if input.ndim == 2 and filters.ndim == 2:
        if config.warn__signal_conv2d_interface:
            warnings.warn(
                "theano.tensor.signal.conv2d() now outputs a 2d tensor when both"
                " inputs are 2d. To disable this warning, set the Theano flag"
                " warn__signal_conv2d_interface to False",
                stacklevel=3,
            )

        output = tensor.flatten(output.T, ndim=2).T
    elif input.ndim == 2 or filters.ndim == 2:
        output = tensor.flatten(output.T, ndim=3).T

    return output
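
A minimal usage sketch, assuming the conv2d defined above is in scope: a stack of images filtered by a single 2D kernel.

import numpy as np
import theano
import theano.tensor as T

images = T.tensor3('images')   # (num_images, image height, image width)
kernel = T.matrix('kernel')    # (filter height, filter width)

filtered = conv2d(images, kernel, border_mode='valid')
f = theano.function([images, kernel], filtered)

out = f(np.random.randn(4, 28, 28).astype(theano.config.floatX),
        np.ones((3, 3), dtype=theano.config.floatX))
# with 'valid' mode the result should have shape (4, 26, 26)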
Esempio n. 48
0
def rnn(step_function,
        inputs,
        initial_states,
        go_backwards=False,
        mask=None,
        constants=None,
        unroll=False,
        input_length=None):
    '''Iterates over the time dimension of a tensor.

    # Arguments
        inputs: tensor of temporal data of shape (samples, time, ...)
            (at least 3D).
        step_function:
            Parameters:
                input: tensor with shape (samples, ...) (no time dimension),
                    representing input for the batch of samples at a certain
                    time step.
                states: list of tensors.
            Returns:
                output: tensor with shape (samples, ...) (no time dimension),
                new_states: list of tensors, same length and shapes
                    as 'states'.
        initial_states: tensor with shape (samples, ...) (no time dimension),
            containing the initial values for the states used in
            the step function.
        go_backwards: boolean. If True, do the iteration over
            the time dimension in reverse order.
        mask: binary tensor with shape (samples, time),
            with a zero for every element that is masked.
        constants: a list of constant values passed at each step.
        unroll: whether to unroll the RNN or to use a symbolic loop (`scan`).
        input_length: must be specified if using `unroll`.

    # Returns
        A tuple (last_output, outputs, new_states).
            last_output: the latest output of the rnn, of shape (samples, ...)
            outputs: tensor with shape (samples, time, ...) where each
                entry outputs[s, t] is the output of the step function
                at time t for sample s.
            new_states: list of tensors, latest states returned by
                the step function, of shape (samples, ...).
    '''
    ndim = inputs.ndim
    assert ndim >= 3, 'Input should be at least 3D.'

    if unroll:
        if input_length is None:
            raise Exception('When specifying `unroll=True`, an `input_length` '
                            'must be provided to `rnn`.')

    axes = [1, 0] + list(range(2, ndim))
    inputs = inputs.dimshuffle(axes)

    if constants is None:
        constants = []

    if mask is not None:
        if mask.ndim == ndim - 1:
            mask = expand_dims(mask)
        assert mask.ndim == ndim
        mask = mask.dimshuffle(axes)

        if unroll:
            indices = list(range(input_length))
            if go_backwards:
                indices = indices[::-1]

            successive_outputs = []
            successive_states = []
            states = initial_states
            for i in indices:
                output, new_states = step_function(inputs[i],
                                                   states + constants)

                if len(successive_outputs) == 0:
                    prev_output = zeros_like(output)
                else:
                    prev_output = successive_outputs[-1]

                output = T.switch(mask[i], output, prev_output)
                kept_states = []
                for state, new_state in zip(states, new_states):
                    kept_states.append(T.switch(mask[i], new_state, state))
                states = kept_states

                successive_outputs.append(output)
                successive_states.append(states)

            outputs = T.stack(*successive_outputs)
            states = []
            for i in range(len(successive_states[-1])):
                states.append(
                    T.stack(*[
                        states_at_step[i]
                        for states_at_step in successive_states
                    ]))
        else:
            # build an all-zero tensor of shape (samples, output_dim)
            initial_output = step_function(inputs[0],
                                           initial_states + constants)[0] * 0
            # Theano gets confused by broadcasting patterns in the scan op
            initial_output = T.unbroadcast(initial_output, 0, 1)

            def _step(input, mask, output_tm1, *states):
                output, new_states = step_function(input, states)
                # output previous output if masked.
                output = T.switch(mask, output, output_tm1)
                return_states = []
                for state, new_state in zip(states, new_states):
                    return_states.append(T.switch(mask, new_state, state))
                return [output] + return_states

            results, _ = theano.scan(_step,
                                     sequences=[inputs, mask],
                                     outputs_info=[initial_output] +
                                     initial_states,
                                     non_sequences=constants,
                                     go_backwards=go_backwards)

            # deal with Theano API inconsistency
            if type(results) is list:
                outputs = results[0]
                states = results[1:]
            else:
                outputs = results
                states = []
    else:
        if unroll:
            indices = list(range(input_length))
            if go_backwards:
                indices = indices[::-1]

            successive_outputs = []
            successive_states = []
            states = initial_states
            for i in indices:
                output, states = step_function(inputs[i], states + constants)
                successive_outputs.append(output)
                successive_states.append(states)
            outputs = T.stack(*successive_outputs)
            states = []
            for i in range(len(successive_states[-1])):
                states.append(
                    T.stack(*[
                        states_at_step[i]
                        for states_at_step in successive_states
                    ]))

        else:

            def _step(input, *states):
                output, new_states = step_function(input, states)
                return [output] + new_states

            results, _ = theano.scan(_step,
                                     sequences=inputs,
                                     outputs_info=[None] + initial_states,
                                     non_sequences=constants,
                                     go_backwards=go_backwards)

            # deal with Theano API inconsistency
            if type(results) is list:
                outputs = results[0]
                states = results[1:]
            else:
                outputs = results
                states = []

    outputs = T.squeeze(outputs)
    last_output = outputs[-1]

    axes = [1, 0] + list(range(2, outputs.ndim))
    outputs = outputs.dimshuffle(axes)
    states = [T.squeeze(state[-1]) for state in states]
    return last_output, outputs, states
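
A small sketch of the step-function contract, assuming the rnn above is in scope; the recurrence keeps a running sum of its inputs as its single state:

import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')                                     # (samples, time, features)
initial_states = [T.zeros((x.shape[0], x.shape[2]))]   # one state: the running sum

def step(x_t, states):
    s_t = states[0] + x_t
    return s_t, [s_t]

last_output, outputs, new_states = rnn(step, x, initial_states)
f = theano.function([x], [last_output, outputs])
last, seq = f(np.ones((2, 5, 3), dtype=theano.config.floatX))
# last should be all 5s (the sum over 5 time steps); seq has shape (2, 5, 3)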
Esempio n. 49
0
 def get_output_mask(self, train=False):
     X = self.get_input_mask(train)
     if X is None:
         return None
     tensors = [T.roll(X, off, axis=self.axis) for off in self.offsets]
     return T.stack(tensors, axis=self.offset_axis)
Esempio n. 50
0
def identity(n):
    return tensor.stack([tensor.eye(n), tensor.zeros((n, n))], axis=0)
Esempio n. 51
0
 def __init__(self, train_raw, test_raw, dim, mode, l2, l1,
              batch_norm, dropout, batch_size,
              ihm_C, los_C, ph_C, decomp_C,
              partition, nbins, **kwargs):
             
     print "==> not used params in network class:", kwargs.keys()
     self.train_raw = train_raw
     self.test_raw = test_raw
     
     self.dim = dim
     self.mode = mode
     self.l2 = l2
     self.l1 = l1
     self.batch_norm = batch_norm
     self.dropout = dropout
     self.batch_size = batch_size
     self.ihm_C = ihm_C
     self.los_C = los_C
     self.ph_C = ph_C
     self.decomp_C = decomp_C
     self.nbins = nbins
     
     if (partition == 'log'):
         self.get_bin = metrics.get_bin_log
         self.get_estimate = metrics.get_estimate_log
     else:
         assert self.nbins == 10
         self.get_bin = metrics.get_bin_custom
         self.get_estimate = metrics.get_estimate_custom
     
     self.train_batch_gen = self.get_batch_gen(self.train_raw)
     self.test_batch_gen = self.get_batch_gen(self.test_raw)    
     
     self.input_var = T.tensor3('X')
     self.input_lens = T.ivector('L')
     
     self.ihm_pos = T.ivector('ihm_pos')
     self.ihm_mask = T.ivector('ihm_mask')
     self.ihm_label = T.ivector('ihm_label')
     
     self.los_mask = T.imatrix('los_mask')
     self.los_label = T.matrix('los_label') # for regression
     #self.los_label = T.imatrix('los_label')
     
     self.ph_label = T.imatrix('ph_label')
     
     self.decomp_mask = T.imatrix('decomp_mask')
     self.decomp_label = T.imatrix('decomp_label')
     
     print "==> Building neural network"
     
     # common network
     network = layers.InputLayer((None, None, self.train_raw[0][0].shape[1]), 
                                 input_var=self.input_var)
     
     if (self.dropout > 0):
         network = layers.DropoutLayer(network, p=self.dropout)
     
     network = layers.LSTMLayer(incoming=network, num_units=dim,
                                only_return_final=False,
                                grad_clipping=10,
                                ingate=lasagne.layers.Gate(
                                     W_in=Orthogonal(),
                                     W_hid=Orthogonal(),
                                     W_cell=Normal(0.1)),
                                forgetgate=lasagne.layers.Gate(
                                     W_in=Orthogonal(),
                                     W_hid=Orthogonal(),
                                     W_cell=Normal(0.1)),
                                cell=lasagne.layers.Gate(W_cell=None,
                                     nonlinearity=lasagne.nonlinearities.tanh,
                                     W_in=Orthogonal(),
                                     W_hid=Orthogonal()),
                                outgate=lasagne.layers.Gate(
                                     W_in=Orthogonal(),
                                     W_hid=Orthogonal(),
                                     W_cell=Normal(0.1)))
     
     if (self.dropout > 0):
         network = layers.DropoutLayer(network, p=self.dropout)
     
     lstm_output = layers.get_output(network)
     self.params = layers.get_all_params(network, trainable=True)
     self.reg_params = layers.get_all_params(network, regularizable=True)
     
     # for each example in minibatch take the last output
     last_outputs = []
     for index in range(self.batch_size):
         last_outputs.append(lstm_output[index, self.input_lens[index]-1, :])
     last_outputs = T.stack(last_outputs)
     
     # take 48h outputs for fixed mortality task
     mid_outputs = []
     for index in range(self.batch_size):
         mid_outputs.append(lstm_output[index, self.ihm_pos[index], :])
     mid_outputs = T.stack(mid_outputs)
     
     
     # in-hospital mortality related network
     ihm_network = layers.InputLayer((None, dim), input_var=mid_outputs)
     ihm_network = layers.DenseLayer(incoming=ihm_network, num_units=2,
                                    nonlinearity=softmax)
     self.ihm_prediction = layers.get_output(ihm_network)
     self.ihm_det_prediction = layers.get_output(ihm_network, deterministic=True)
     self.params += layers.get_all_params(ihm_network, trainable=True)
     self.reg_params += layers.get_all_params(ihm_network, regularizable=True)
     self.ihm_loss = (self.ihm_mask * categorical_crossentropy(self.ihm_prediction, 
                                                       self.ihm_label)).mean()
     
     
     # length of stay related network
     # Regression
     los_network = layers.InputLayer((None, None, dim), input_var=lstm_output)
     los_network = layers.ReshapeLayer(los_network, (-1, dim))
     los_network = layers.DenseLayer(incoming=los_network, num_units=1,
                                     nonlinearity=rectify)
     los_network = layers.ReshapeLayer(los_network, (lstm_output.shape[0], -1))
     self.los_prediction = layers.get_output(los_network)
     self.los_det_prediction = layers.get_output(los_network, deterministic=True)
     self.params += layers.get_all_params(los_network, trainable=True)
     self.reg_params += layers.get_all_params(los_network, regularizable=True)
     self.los_loss = (self.los_mask * squared_error(self.los_prediction,
                                                   self.los_label)).mean(axis=1).mean(axis=0)
     
     
     # phenotype related network
     ph_network = layers.InputLayer((None, dim), input_var=last_outputs)
     ph_network = layers.DenseLayer(incoming=ph_network, num_units=25,
                                    nonlinearity=sigmoid)
     self.ph_prediction = layers.get_output(ph_network)
     self.ph_det_prediction = layers.get_output(ph_network, deterministic=True)
     self.params += layers.get_all_params(ph_network, trainable=True)
     self.reg_params += layers.get_all_params(ph_network, regularizable=True)
     self.ph_loss = nn_utils.multilabel_loss(self.ph_prediction, self.ph_label)
             
     
     # decompensation related network
     decomp_network = layers.InputLayer((None, None, dim), input_var=lstm_output)
     decomp_network = layers.ReshapeLayer(decomp_network, (-1, dim))
     decomp_network = layers.DenseLayer(incoming=decomp_network, num_units=2,
                                    nonlinearity=softmax)
     decomp_network = layers.ReshapeLayer(decomp_network, (lstm_output.shape[0], -1, 2))
     self.decomp_prediction = layers.get_output(decomp_network)[:, :, 1]
     self.decomp_det_prediction = layers.get_output(decomp_network, deterministic=True)[:, :, 1]
     self.params += layers.get_all_params(decomp_network, trainable=True)
     self.reg_params += layers.get_all_params(decomp_network, regularizable=True)
     self.decomp_loss = nn_utils.multilabel_loss_with_mask(self.decomp_prediction,
                                                       self.decomp_label,
                                                       self.decomp_mask)
     
     """
     data = next(self.train_batch_gen)
     print max(data[1])
     print lstm_output.eval({self.input_var:data[0]}).shape
     exit()
     """
     
     
     if self.l2 > 0: 
         self.loss_l2 = self.l2 * nn_utils.l2_reg(self.reg_params)
     else: 
         self.loss_l2 = T.constant(0)
     
     if self.l1 > 0: 
         self.loss_l1 = self.l1 * nn_utils.l1_reg(self.reg_params)
     else: 
         self.loss_l1 = T.constant(0)
     
     self.reg_loss = self.loss_l1 + self.loss_l2
     
     self.loss = (ihm_C * self.ihm_loss + los_C * self.los_loss + 
                  ph_C * self.ph_loss + decomp_C * self.decomp_loss + 
                  self.reg_loss)
           
     #updates = lasagne.updates.adadelta(self.loss, self.params,
     #                                    learning_rate=0.001)
     #updates = lasagne.updates.momentum(self.loss, self.params,
     #                                    learning_rate=0.00003)
     #updates = lasagne.updates.adam(self.loss, self.params)
     updates = lasagne.updates.adam(self.loss, self.params, beta1=0.5,
                                    learning_rate=0.0001) # from DCGAN paper
     #updates = lasagne.updates.nesterov_momentum(loss, params, momentum=0.9,
     #                                             learning_rate=0.001,
     
     all_inputs = [self.input_var, self.input_lens,
                   self.ihm_pos, self.ihm_mask, self.ihm_label,
                   self.los_mask, self.los_label,
                   self.ph_label,
                   self.decomp_mask, self.decomp_label]
     
     train_outputs = [self.ihm_prediction, self.los_prediction,
                      self.ph_prediction, self.decomp_prediction,
                      self.loss,
                      self.ihm_loss, self.los_loss,
                      self.ph_loss, self.decomp_loss,
                      self.reg_loss]
                      
     test_outputs = [self.ihm_det_prediction, self.los_det_prediction,
                     self.ph_det_prediction, self.decomp_det_prediction,
                     self.loss,
                     self.ihm_loss, self.los_loss,
                     self.ph_loss, self.decomp_loss,
                     self.reg_loss]
     
     ## compiling theano functions
     if self.mode == 'train':
         print "==> compiling train_fn"
         self.train_fn = theano.function(inputs=all_inputs,
                                         outputs=train_outputs,
                                         updates=updates)
     
     print "==> compiling test_fn"
     self.test_fn = theano.function(inputs=all_inputs,
                                    outputs=test_outputs)
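
The two per-example gather loops above (last output and the 48h output) can also be written without a Python loop; a NumPy sketch of the same indexing:

import numpy as np

batch_size, max_len, dim = 4, 7, 3
lstm_output = np.random.randn(batch_size, max_len, dim)
input_lens = np.array([7, 5, 2, 6])

last_outputs = lstm_output[np.arange(batch_size), input_lens - 1]   # (batch_size, dim)
assert last_outputs.shape == (batch_size, dim)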
Esempio n. 52
0
    def __init__(self,
                 n_out,
                 collapse_output=False,
                 directions=4,
                 projection='average',
                 base=None,
                 **kwargs):
        if base is None:
            base = []
        super(TwoDLSTMLayer, self).__init__(n_out, **kwargs)
        assert len(self.sources) == 1
        source = self.sources[0]
        n_in = source.attrs['n_out']
        X = source.output
        assert X.ndim == 4
        sizes = source.output_sizes
        self.output_sizes = sizes
        assert directions in [1, 2,
                              4], "only 1, 2 or 4 directions are supported"
        assert projection in ['average', 'concat'], "invalid projection"

        if base:
            self.b1 = self.add_param(base[0].b1)
            self.b2 = self.add_param(base[0].b2)
            if directions >= 1:
                self.b3 = self.add_param(base[0].b3)
                self.b4 = self.add_param(base[0].b4)
            self.W1, self.V_h1, self.V_v1 = self.add_param(
                base[0].W1), self.add_param(base[0].V_h1), self.add_param(
                    base[0].V_v1)
            self.W2, self.V_h2, self.V_v2 = self.add_param(
                base[0].W2), self.add_param(base[0].V_h2), self.add_param(
                    base[0].V_v2)
            if directions >= 1:
                self.W3, self.V_h3, self.V_v3 = self.add_param(
                    base[0].W3), self.add_param(base[0].V_h3), self.add_param(
                        base[0].V_v3)
                self.W4, self.V_h4, self.V_v4 = self.add_param(
                    base[0].W4), self.add_param(base[0].V_h4), self.add_param(
                        base[0].V_v4)
            #self.mass = base[0].mass
            #self.masks = base[0].masks
            #self.b1 = base[0].b1
            #self.b2 = base[0].b2
            #if directions >= 1:
            #  self.b3 = base[0].b3
            #  self.b4 = base[0].b4
            #self.W1, self.V_h1, self.V_v1 = base[0].W1, base[0].V_h1, base[0].V_v1
            #self.W2, self.V_h2, self.V_v2 = base[0].W2, base[0].V_h2, base[0].V_v2
            #if directions >= 1:
            #  self.W3, self.V_h3, self.V_v3 = base[0].W3, base[0].V_h3, base[0].V_v3
            #  self.W4, self.V_h4, self.V_v4 = base[0].W4, base[0].V_h4, base[0].V_v4
            self.mass = base[0].mass
            self.masks = base[0].masks
        else:
            self.b1 = self.create_and_add_bias(n_out, "1")
            self.b2 = self.create_and_add_bias(n_out, "2")
            if directions >= 1:
                self.b3 = self.create_and_add_bias(n_out, "3")
                self.b4 = self.create_and_add_bias(n_out, "4")

            self.W1, self.V_h1, self.V_v1 = self.create_and_add_2d_lstm_weights(
                n_in, n_out, "1")
            self.W2, self.V_h2, self.V_v2 = self.create_and_add_2d_lstm_weights(
                n_in, n_out, "2")
            if directions >= 1:
                self.W3, self.V_h3, self.V_v3 = self.create_and_add_2d_lstm_weights(
                    n_in, n_out, "3")
                self.W4, self.V_h4, self.V_v4 = self.create_and_add_2d_lstm_weights(
                    n_in, n_out, "4")

        # dropout
        assert len(self.masks) == 1
        mask = self.masks[0]
        if mask is not None:
            X = self.mass * mask * X

        if str(theano.config.device).startswith('cpu'):
            Y = T.zeros_like(X)
            if projection == 'concat':
                Y = Y.repeat(directions, axis=-1)
                n_out *= directions
        else:
            if directions <= 2:
                Y = BidirectionalTwoDLSTMOpInstance(X, self.W1, self.W2,
                                                    self.V_h1, self.V_h2,
                                                    self.V_v1, self.V_v2,
                                                    self.b1, self.b2, sizes)
            else:
                Y = MultiDirectionalTwoDLSTMOpInstance(
                    X, self.W1, self.W2, self.W3, self.W4, self.V_h1,
                    self.V_h2, self.V_h3, self.V_h4, self.V_v1, self.V_v2,
                    self.V_v3, self.V_v4, self.b1, self.b2, self.b3, self.b4,
                    sizes)

            if directions > 1:
                Y = T.stack(Y[:directions], axis=-1)
                if projection == 'average':
                    Y = Y.mean(axis=-1)
                elif projection == 'concat':
                    Y = Y.reshape((Y.shape[0], Y.shape[1], Y.shape[2],
                                   Y.shape[3] * Y.shape[4]))
                    n_out *= directions
            else:
                Y = Y[0]

        Y.name = 'Y'
        self.set_attr('n_out', n_out)
        self.set_attr('collapse_output', collapse_output)
        self.set_attr('directions', directions)
        self.set_attr('projection', projection)

        #index handling
        def index_fn(index, size):
            return T.set_subtensor(index[:size], numpy.cast['int8'](1))

        index_init = T.zeros((Y.shape[2], Y.shape[1]), dtype='int8')
        self.index, _ = theano.scan(
            index_fn, [index_init, T.cast(sizes[:, 1], "int32")])
        self.index = self.index.dimshuffle(1, 0)

        if collapse_output == 'sum' or collapse_output == True:
            Y = Y.sum(axis=0)
        elif collapse_output == 'mean':
            Y = Y.mean(axis=0)
        elif collapse_output == 'conv':
            from TheanoUtil import circular_convolution
            Y, _ = theano.scan(lambda x_i, x_p: circular_convolution(x_i, x_p),
                               Y, Y[0])
            Y = Y[-1]
        elif collapse_output == 'flatten':
            self.index = T.ones((Y.shape[0] * Y.shape[1], Y.shape[2]),
                                dtype='int8')
            Y = Y.reshape((Y.shape[0] * Y.shape[1], Y.shape[2], Y.shape[3]))
        elif str(collapse_output).startswith('pad_'):
            pad = numpy.int32(collapse_output.split('_')[-1])
            Y = ifelse(
                T.lt(Y.shape[0], pad),
                T.concatenate([
                    Y,
                    T.zeros(
                        (pad - Y.shape[0], Y.shape[1], Y.shape[2], Y.shape[3]),
                        'float32')
                ],
                              axis=0), ifelse(T.gt(Y.shape[0], pad), Y[:pad],
                                              Y))
            Y = Y.dimshuffle(1, 2, 3, 0).reshape(
                (Y.shape[1], Y.shape[2], Y.shape[3] * Y.shape[0]))
            self.attrs['n_out'] *= pad
        elif collapse_output != False:
            assert False, "invalid collapse mode"

        if self.attrs['batch_norm']:
            Y = self.batch_norm(
                Y,
                self.attrs['n_out'],
                index=sizes if not collapse_output else self.index,
                force_sample=False)

        self.output = Y
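
A NumPy sketch of the 'average' vs. 'concat' projections applied to the stacked direction outputs above (shapes are illustrative):

import numpy as np

height, width, batch, n_out, directions = 5, 6, 2, 4, 4
Y_dirs = [np.random.randn(height, width, batch, n_out) for _ in range(directions)]

Y = np.stack(Y_dirs, axis=-1)                                    # (..., n_out, directions)
Y_average = Y.mean(axis=-1)                                      # 'average': (..., n_out)
Y_concat = Y.reshape(height, width, batch, n_out * directions)   # 'concat'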
Esempio n. 53
0
    def __init__(self, config):

        self._params = []
        self._np_rng = np.random.RandomState(config.seed // 2 + 123)
        self._theano_rng = RandomStreams(
            config.seed // 2 + 321)  # generates random numbers directly on GPU
        self._init_scale = config.init_scale
        self._is_training = tt.iscalar('is_training')
        self._lr = theano.shared(cast_floatX(config.learning_rate), 'lr')

        input_data = tt.imatrix('input_data')  # (batch_size, num_steps)
        targets = tt.imatrix('targets')  # (batch_size, num_steps)
        noise_x = tt.matrix('noise_x')  # (batch_size, num_steps)

        # Embed input words and apply variational dropout (for each sample, the embedding of
        # a dropped word-type consists of all zeros at all occurrences of word-type in sample).
        embedding = self.make_param((config.vocab_size, config.hidden_size),
                                    'uniform')
        inputs = embedding[
            input_data.T]  # (num_steps, batch_size, hidden_size)
        inputs = self.apply_dropout(inputs, tt.shape_padright(noise_x.T))

        rhn_updates = []
        for _ in range(config.num_layers):
            # y shape: (num_steps, batch_size, hidden_size)
            y, sticky_state_updates = self.RHNLayer(
                inputs, config.depth, config.batch_size, config.hidden_size,
                config.drop_i, config.drop_s, config.init_T_bias,
                config.init_other_bias, config.tied_noise)
            rhn_updates += sticky_state_updates
            inputs = y

        noise_o = self.get_dropout_noise(
            (config.batch_size, config.hidden_size), config.drop_o)
        outputs = self.apply_dropout(
            y,
            tt.shape_padleft(noise_o))  # (num_steps, batch_size, hidden_size)

        # logits
        softmax_w = embedding.T if config.tied_embeddings else self.make_param(
            (config.hidden_size, config.vocab_size), 'uniform')
        softmax_b = self.make_param((config.vocab_size, ),
                                    config.init_other_bias)
        logits = tt.dot(
            outputs,
            softmax_w) + softmax_b  # (num_steps, batch_size, vocab_size)

        # probabilities and prediction loss
        flat_logits = logits.reshape(
            (config.batch_size * config.num_steps, config.vocab_size))
        flat_probs = tt.nnet.softmax(flat_logits)
        flat_targets = targets.T.flatten()  # (batch_size * num_steps,)
        xentropies = tt.nnet.categorical_crossentropy(
            flat_probs, flat_targets)  # (batch_size * num_steps,)
        pred_loss = xentropies.sum() / config.batch_size

        # weight decay
        l2_loss = 0.5 * tt.sum(tt.stack([tt.sum(p**2) for p in self._params]))

        loss = pred_loss + config.weight_decay * l2_loss
        grads = theano.grad(loss, self._params)

        # gradient clipping
        global_grad_norm = tt.sqrt(
            tt.sum(tt.stack([tt.sum(g**2) for g in grads])))
        clip_factor = ifelse(
            global_grad_norm < config.max_grad_norm, cast_floatX(1),
            tt.cast(config.max_grad_norm / global_grad_norm, floatX))

        param_updates = [(p, p - self._lr * clip_factor * g)
                         for p, g in zip(self._params, grads)]

        self.train = theano.function([input_data, targets, noise_x],
                                     loss,
                                     givens={self._is_training: np.int32(1)},
                                     updates=rhn_updates + param_updates)

        self.evaluate = theano.function(
            [input_data, targets],
            loss,
            # Note that noise_x is unused in computation graph of this function since _is_training is false.
            givens={
                self._is_training: np.int32(0),
                noise_x: tt.zeros((config.batch_size, config.num_steps))
            },
            updates=rhn_updates)

        self._num_params = np.sum(
            [param.get_value().size for param in self._params])

        if config.load_model:
            self.load(config.load_model)
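
The gradient clipping above rescales every gradient by a single factor whenever the global norm exceeds max_grad_norm; a plain NumPy sketch of that rule:

import numpy as np

def global_clip_factor(grads, max_grad_norm):
    global_norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    return 1.0 if global_norm < max_grad_norm else max_grad_norm / global_norm

grads = [np.full((2, 2), 3.0), np.full((4,), 3.0)]      # global norm = sqrt(72) ~ 8.49
factor = global_clip_factor(grads, max_grad_norm=1.0)   # ~ 0.118
clipped = [factor * g for g in grads]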
Esempio n. 54
0
 def e_hat(y, X, *es):
     e = tt.stack(es[:-2])
     return y - tt.dot(X, es[-2]) - tt.dot(e, es[-1])
Esempio n. 55
0
def Unskew(padded):
    """
    input.shape: (batch size, HEIGHT, 2*WIDTH - 1, dim)
    """
    return T.stack([padded[:, i, i:i + WIDTH, :] for i in range(HEIGHT)],
                   axis=1)
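
A NumPy sketch of the skew/unskew round trip that Unskew undoes, with hypothetical HEIGHT = WIDTH = 3 (the real values are module-level constants not shown here):

import numpy as np

HEIGHT = WIDTH = 3   # hypothetical values for this sketch
x = np.arange(HEIGHT * WIDTH).reshape(1, HEIGHT, WIDTH, 1)

# "skew": shift row i to the right by i positions inside a (HEIGHT, 2*WIDTH - 1) canvas
padded = np.zeros((1, HEIGHT, 2 * WIDTH - 1, 1), dtype=x.dtype)
for i in range(HEIGHT):
    padded[:, i, i:i + WIDTH, :] = x[:, i, :, :]

# Unskew reads the shifted windows back out
unskewed = np.stack([padded[:, i, i:i + WIDTH, :] for i in range(HEIGHT)], axis=1)
assert np.array_equal(unskewed, x)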
Esempio n. 56
0
def local_gpua_careduce(node, context_name):
    if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
                                      scalar.Maximum, scalar.Minimum)):
        ctx = get_context(context_name)
        if ctx.kind == b'opencl':
            op = GpuCAReduceCPY
            if node.op.scalar_op not in [scalar.add, scalar.mul]:
                # We don't support yet all reduction with cpy code.
                return
        elif ctx.kind == b'cuda':
            op = GpuCAReduceCuda
        else:
            return False
        x, = node.inputs

        greduce = op(
            node.op.scalar_op, axis=node.op.axis,
            dtype=getattr(node.op, 'dtype', None),
            acc_dtype=getattr(node.op, 'acc_dtype', None))
        gvar = greduce(x)
        # We need to have the make node called, otherwise the mask can
        # be None
        if (op is GpuCAReduceCPY or
                gvar.owner.op.supports_c_code([
                    as_gpuarray_variable(x, context_name)])):
            return greduce
        else:
            # Try to make a simpler pattern based on reshaping
            # The principle is that if two adjacent dimensions have
            # the same value in the reduce_mask, then we can reshape
            # to make them a single dimension, do the reduction, and
            # then reshape to get them back.

            if node.op.axis is None:
                reduce_mask = [1] * x.type.ndim
            else:
                reduce_mask = [0] * x.type.ndim
                for a in node.op.axis:
                    assert reduce_mask[a] == 0
                    reduce_mask[a] = 1

            shape_of = node.fgraph.shape_feature.shape_of

            x_shape = shape_of[x]

            new_in_shp = [x_shape[0]]
            new_mask = [reduce_mask[0]]
            for i in xrange(1, x.type.ndim):
                if reduce_mask[i] == reduce_mask[i - 1]:
                    new_in_shp[-1] *= x_shape[i]
                else:
                    new_mask.append(reduce_mask[i])
                    new_in_shp.append(x_shape[i])
            new_axis = []
            for idx, m in enumerate(new_mask):
                if m == 1:
                    new_axis.append(idx)
            greduce = op(
                node.op.scalar_op,
                axis=new_axis, reduce_mask=new_mask,
                dtype=getattr(node.op, 'dtype', None),
                acc_dtype=getattr(node.op, 'acc_dtype', None))

            reshaped_x = x.reshape(tensor.stack(new_in_shp))
            gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
            gvar = greduce(gpu_reshaped_x)
            # We need to have the make node called, otherwise the mask can
            # be None
            reshaped_gpu_inputs = [gpu_reshaped_x]
            if greduce.supports_c_code(reshaped_gpu_inputs):
                reduce_reshaped_x = host_from_gpu(
                    greduce(gpu_reshaped_x))

                if reduce_reshaped_x.ndim != node.outputs[0].ndim:
                    unreshaped_reduce = reduce_reshaped_x.reshape(
                        tensor.stack(shape_of[node.outputs[0]]))
                else:
                    unreshaped_reduce = reduce_reshaped_x
                return [unreshaped_reduce]
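
A NumPy sketch of the reshape trick described in the comments above: adjacent dimensions that share the same reduce flag can be merged before reducing without changing the result.

import numpy as np

x = np.random.randn(2, 3, 4, 5)

direct = x.sum(axis=(1, 2))                   # reduce_mask = [0, 1, 1, 0]
merged = x.reshape(2, 3 * 4, 5).sum(axis=1)   # merge the two reduced axes, reduce once
assert np.allclose(direct, merged)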
Esempio n. 57
0
def pack(x):
    return T.stack(*x)
Esempio n. 58
0
def build_model(prepared_data, clamp_L0=0.4, eeg_column_i=None, **kwargs):
    # ##########
    # STEP1: order the data properly so that we can read from it sequentially
    # when training the model

    subject_x, skill_x, correct_y, start_x, eeg_x, eeg_table, stim_pairs, train_idx, valid_idx = prepared_data
    N = len(correct_y)
    train_mask = idx_to_mask(train_idx, N)
    valid_mask = idx_to_mask(valid_idx, N)

    # sort data by subject and skill
    sorted_i = sorted(xrange(N),
                      key=lambda i: (subject_x[i], skill_x[i], start_x[i]))
    skill_x = skill_x[sorted_i]
    subject_x = subject_x[sorted_i]
    correct_y = correct_y[sorted_i]
    start_x = start_x[sorted_i]
    train_mask = train_mask[sorted_i]
    valid_mask = valid_mask[sorted_i]
    train_idx = np.nonzero(train_mask)[0]
    valid_idx = np.nonzero(valid_mask)[0]

    n_skills = np.max(skill_x) + 1
    n_subjects = np.max(subject_x) + 1

    # binarize eeg
    eeg_single_x = np.zeros(N)
    if eeg_column_i is not None:
        eeg_column = eeg_table[eeg_x, eeg_column_i]
        above_median = np.greater(eeg_column, np.median(eeg_column))
        eeg_single_x[above_median] = 1

    # prepare parameters
    p_T = 0.5
    p_G = 0.1
    p_S = 0.2
    p_L0 = 0.7
    if clamp_L0 is None:
        p_L0 = 0.7
    else:
        p_L0 = clamp_L0
    # eeg_single_x = np.zeros(N)
    parameter_base = np.ones(n_skills)
    tp_L0, t_L0 = make_probability(parameter_base * p_L0, name='L0')
    tp_T, t_T = make_probability(np.ones((n_skills, 2)) * p_T, name='p(T)')
    tp_G, t_G = make_probability(p_G, name='p(G)')
    tp_S, t_S = make_probability(p_S, name='p(S)')

    # declare and prepare variables for theano
    i = T.ivector('i')
    dummy_float = make_shared(0, name='dummy')
    skill_i, subject_i = T.iscalars('skill_i', 'subject_i')
    correct_y = make_shared(correct_y, to_int=True)
    eeg_single_x = make_shared(eeg_single_x, to_int=True)

    def step(correct_i, eeg, prev_L, prev_p_C, P_T, P_S, P_G):
        Ln = prev_L + (1 - prev_L) * P_T[eeg]
        p_C = prev_L * (1 - P_S) + (1 - prev_L) * P_G
        return Ln, p_C

    # set up theano functions
    ((results, p_C),
     updates) = theano.scan(fn=step,
                            sequences=[correct_y[i], eeg_single_x[i]],
                            outputs_info=[tp_L0[skill_i], dummy_float],
                            non_sequences=[tp_T[skill_i], tp_G, tp_S])

    p_y = T.stack(1 - p_C, p_C)
    loss = neg_log_loss(p_y, correct_y[i])

    learning_rate = T.fscalar('learning_rate')
    if clamp_L0 is None:
        params = [t_T, t_L0]
    else:
        params = [t_T]
    update_parameters = [(param, param - learning_rate * T.grad(loss, param))
                         for param in params]

    tf_train = theano.function(inputs=[i, skill_i, learning_rate],
                               updates=update_parameters,
                               outputs=[loss, results, i],
                               allow_input_downcast=True)
    tf_valid = theano.function(inputs=[i, skill_i],
                               outputs=[loss, results, i],
                               allow_input_downcast=True)

    def f_train((i, (subject_i, skill_i)), learning_rate):
        return tf_train(i, skill_i, learning_rate)
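
A plain-Python walk-through of the knowledge-tracing update implemented by step above, ignoring the EEG-conditioned transition for simplicity (the probability values are illustrative, not fitted):

p_T, p_G, p_S, L = 0.5, 0.1, 0.2, 0.7   # transition, guess, slip, initial mastery

trace = []
for _ in range(3):
    p_C = L * (1 - p_S) + (1 - L) * p_G   # probability the current answer is correct
    L = L + (1 - L) * p_T                 # updated probability that the skill is mastered
    trace.append((round(p_C, 3), round(L, 3)))
# trace is approximately [(0.59, 0.85), (0.695, 0.925), (0.748, 0.963)]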
Esempio n. 59
0
u0 = T.dscalar("u0")
u1 = T.dscalar("u1")
u2 = T.dscalar("u2")

x_inputs = [x0_v, x1_v, x0_h, x1_h, u0, u1]

u_inputs = [u2]

# Discrete dynamics model definition.
f = T.stack([
    x0_v + (x1_v * dt),
    x1_v + (((U0 / (T0**2)) * alpha_v * u2 +
             (U0 / T0) * beta_v * u1 + gamma_v * U0 * u0 - phi_v *
             (X0 / T0) * x1_v - xi_v * X0 * x0_v) / (X0 / (T0**2))) * dt,
    # x2_v + ,
    x0_h + (x1_h * dt),
    x1_h + (((U0 / (T0**2)) * alpha_h * u2 +
             (U0 / T0) * beta_h * u1 + gamma_h * U0 * u0 - phi_h *
             (X0 / T0) * x1_h - xi_h * X0 * x0_h) / (X0 / (T0**2))) * dt,
    # x2_h + ,
    u0 + (u1 * dt),
    u1 + (u2 * dt)
])

dynamics = AutoDiffDynamics(f, x_inputs, u_inputs)
# dynamics = FiniteDiffDynamics(f, 6, 1)
# dynamics = BatchAutoDiffDynamics(f, state_size, action_size)

# Q = np.eye(dynamics.state_size)#state error

# cost = transpose(x) * Q * x + transpose(u) * R * u
Esempio n. 60
0
def neibs2images(neibs, neib_shape, original_shape, mode="valid"):
    """
    Function :func:`neibs2images <theano.sandbox.neighbours.neibs2images>`
    performs the inverse operation of
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`. It takes
    the output of :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`
    and reconstructs its input.

    Parameters
    ----------
    neibs : 2d tensor
        Like the one obtained by
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.
    neib_shape
        `neib_shape` that was used in
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.
    original_shape
        Original shape of the 4d tensor given to
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.

    Returns
    -------
    object
        Reconstructs the input of
        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`,
        a 4d tensor of shape `original_shape`.

    Notes
    -----
    Currently, the function doesn't support tensors created with a
    `neib_step` different from the default value. This means that it may be
    impossible to compute the gradient of a variable gained by
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>` w.r.t.
    its inputs in this case, because it uses
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>` for
    gradient computation.

    Examples
    --------
    Example, which uses a tensor obtained in the example for
    :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`:

    .. code-block:: python

        im_new = neibs2images(neibs, (5, 5), im_val.shape)
        # Theano function definition
        inv_window = theano.function([neibs], im_new)
        # Function application
        im_new_val = inv_window(neibs_val)

    .. note:: The code will output the initial image array.

    """
    neibs = tt.as_tensor_variable(neibs)
    neib_shape = tt.as_tensor_variable(neib_shape)
    original_shape = tt.as_tensor_variable(original_shape)

    new_neib_shape = tt.stack(
        [original_shape[-1] // neib_shape[1], neib_shape[1]])
    output_2d = images2neibs(neibs.dimshuffle("x", "x", 0, 1),
                             new_neib_shape,
                             mode=mode)

    if mode == "ignore_borders":
        # We use set_subtensor so that we accept a symbolic original_shape
        # even when we can't infer the shape, and still raise an error when
        # it doesn't have the right shape.
        valid_shape = original_shape
        valid_shape = tt.set_subtensor(
            valid_shape[2], (valid_shape[2] // neib_shape[0]) * neib_shape[0])
        valid_shape = tt.set_subtensor(
            valid_shape[3], (valid_shape[3] // neib_shape[1]) * neib_shape[1])
        output_4d = output_2d.reshape(valid_shape, ndim=4)
        # padding the borders with zeros
        for d in [2, 3]:
            pad_shape = list(output_4d.shape)
            pad_shape[d] = original_shape[d] - valid_shape[d]
            output_4d = tt.concatenate(
                [output_4d, tt.zeros(pad_shape)], axis=d)
    elif mode == "valid":
        # TODO: we do not implement all modes with this code.
        # Add a check for the supported cases.
        output_4d = output_2d.reshape(original_shape, ndim=4)
    else:
        raise NotImplementedError(f"neibs2images do not support mode={mode}")

    return output_4d
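
A small round-trip sketch in the spirit of the docstring example above; the import path for images2neibs is an assumption (older releases expose it as theano.sandbox.neighbours.images2neibs):

import numpy as np
import theano
import theano.tensor as tt
from theano.tensor.nnet.neighbours import images2neibs   # assumed location

images = tt.tensor4('images')
neibs_in = tt.matrix('neibs')

window = theano.function([images], images2neibs(images, neib_shape=(5, 5)))
inv_window = theano.function([neibs_in], neibs2images(neibs_in, (5, 5), (1, 1, 10, 10)))

im_val = np.arange(100, dtype=theano.config.floatX).reshape(1, 1, 10, 10)
assert np.allclose(inv_window(window(im_val)), im_val)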