Example #1
    def marginalize_over_v_z(self, h):
        # energy = \sum_{i=1}^{|h|} [h_i*b_i - \beta * ln(1 + e^{b_i})]

        # In theory we should use the following line:
        # energy = (h * self.b).T
        # However, when broadcasting is involved, Theano's element-wise multiplication of np.NaN and 0 gives 0 instead of np.NaN,
        # so we use T.tensordot and T.diagonal as a workaround.
        # See Theano issue #3848 (https://github.com/Theano/Theano/issues/3848)
        energy = T.tensordot(h, self.b, axes=0)
        energy = T.diagonal(energy, axis1=1, axis2=2).T

        if self.penalty == "softplus_bi":
            energy = energy - self.beta * T.log(1 + T.exp(self.b))[:, None]

        elif self.penalty == "softplus0":
            energy = energy - self.beta * T.log(1 + T.exp(0))[:, None]

        else:
            raise NameError("Invalid penalty term")

        energy = T.set_subtensor(energy[(T.isnan(energy)).nonzero()], 0)  # Remove NaN
        energy = T.sum(energy, axis=0, keepdims=True).T

        ener = T.tensordot(h, self.W, axes=0)
        ener = T.diagonal(ener, axis1=1, axis2=2)
        ener = T.set_subtensor(ener[(T.isnan(ener)).nonzero()], 0)
        ener = T.sum(ener, axis=2) + self.c[None, :]
        ener = T.sum(T.log(1 + T.exp(ener)), axis=1, keepdims=True)

        return -(energy + ener)
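A minimal NumPy sketch of the outer-product-plus-diagonal trick used above (the (B, D) shape for h and the (D,) shape for self.b are assumptions for illustration; NumPy preserves NaN either way, the workaround only matters for Theano's broadcasted multiply):

import numpy as np

h = np.array([[1.0, 2.0], [3.0, np.nan]])            # assumed hidden batch, shape (B, D)
b = np.array([0.5, -1.0])                            # assumed biases, shape (D,)

direct = (h * b).T                                   # the commented-out one-liner, shape (D, B)
outer = np.tensordot(h, b, axes=0)                   # outer product, shape (B, D, D)
workaround = np.diagonal(outer, axis1=1, axis2=2).T  # same (D, B) result
print(np.allclose(direct, workaround, equal_nan=True))  # True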
Example #2
    def test_transfer(self):
        tensor1 = self.rng.rand(20, 10, 5, 8).astype("float32")
        tensor2 = self.rng.rand(5, 8, 20).astype("float32")
        tensor3 = self.rng.rand(8, 20, 5).astype("float32")

        x = tensor.ftensor4("x")
        y = tensor.ftensor3("y")

        tdot1 = tensor.tensordot(x, y, 2)
        f1 = theano.function([x, y], tdot1, mode=mode_with_gpu)
        topo1 = f1.maker.fgraph.toposort()
        assert topo1[-1].op == cuda.host_from_gpu
        # Let DebugMode debug
        f1(tensor1, tensor2)

        tdot2 = tensor.tensordot(x, y, axes=[(0, 3), (1, 0)])
        f2 = theano.function([x, y], tdot2, mode=mode_with_gpu)
        topo2 = f2.maker.fgraph.toposort()
        assert topo2[-1].op == cuda.host_from_gpu
        f2(tensor1, tensor3)

        tdot3 = tensor.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
        f3 = theano.function([x, y], tdot3, mode=mode_with_gpu)
        topo3 = f3.maker.fgraph.toposort()
        assert topo3[-1].op == cuda.host_from_gpu
        f3(tensor1, tensor3)
Example #3
  def get_output(self, train=False):
    input = self.get_input(train)
    proj_input = self.activation(T.tensordot(input, self.att_proj, axes=(3,0)))
    #else:
    #  proj_fun = lambda proj_i, inp: T.tensordot(inp, proj_i, axes=((1,3), (0,1)))
    #  lin_proj_input, _ = theano.scan(fn=proj_fun, sequences=self.att_proj, non_sequences=input)
    #  proj_input = self.activation(lin_proj_input.dimshuffle((1,0,2,3)))
    if self.context == 'word':
      att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 0))
    elif self.context == 'clause':
      #att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 1)).sum(axis=2)
      def step(a_t, h_tm1, W_in, W, sc):
        h_t = T.tanh(T.tensordot(a_t, W_in, axes=(2,0)) + T.tensordot(h_tm1, W, axes=(2,0)))
        s_t = T.tensordot(h_t, sc, axes=(2,0))
        return h_t, s_t
      [_, scores], _ = theano.scan(step, sequences=[proj_input.dimshuffle(2,0,1,3)], outputs_info=[T.zeros((proj_input.shape[0], self.td1, self.rec_hid_dim)), None], non_sequences=[self.rec_in_weights, self.rec_hid_weights, self.att_scorer])
      att_scores = scores.dimshuffle(1,2,0)
    elif self.context == 'para':
      att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 2)).sum(axis=(1, 2))
    # Nested scans. For shame!
    def get_sample_att(sample_input, sample_att):
      sample_att_inp, _ = theano.scan(fn=lambda s_att_i, s_input_i: T.dot(s_att_i, s_input_i), sequences=[T.nnet.softmax(sample_att), sample_input])
      return sample_att_inp

    att_input, _ = theano.scan(fn=get_sample_att, sequences=[input, att_scores])
    return att_input
Example #4
    def shade(self, shape, lights, camera):
        # See: http://en.wikipedia.org/wiki/Phong_reflection_model#Description

        # Since our material params are 1d we calculate bw shadings first and
        # convert to color after
        light = lights[0]
        material = shape.material
        normals = shape.normals(camera.rays)

        ambient_light = material.ka

        # diffuse (lambertian)
        diffuse_shadings = material.kd*T.tensordot(normals, -light.normed_dir(), 1)

        # specular
        rm = 2.0*(T.tensordot(normals, -light.normed_dir(), 1).dimshuffle(
            0, 1, 'x'))*normals + light.normed_dir()
        specular_shadings = material.ks*(T.tensordot(rm, camera.look_at, 1) ** material.shininess)

        # phong
        phong_shadings = ambient_light + diffuse_shadings + specular_shadings

        colorized = phong_shadings.dimshuffle(0, 1, 'x') * material.color.dimshuffle('x', 'x', 0) * light.intensity.dimshuffle('x', 'x', 0)
        clipped = T.clip(colorized, 0, 1)
        distances = shape.distance(camera.rays)
        return broadcasted_switch(T.isinf(distances), [0., 0., 0.], clipped)
Example #5
    def sym_mask_logdensity_estimator_intermediate(self, x, mask):
        non_linearity_name = self.parameters["nonlinearity"].get_name()
        assert non_linearity_name == "sigmoid" or non_linearity_name == "RLU"
        x = x.T  # BxD
        mask = mask.T  # BxD
        output_mask = constantX(1) - mask  # BxD
        D = constantX(self.n_visible)
        d = mask.sum(1)  # d is the 1-based index of the dimension whose value to infer (not the size of the context)
        masked_input = x * mask  # BxD
        h = self.nonlinearity(T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) + self.b1)  # BxH
        for l in xrange(self.n_layers - 1):
            h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH
        z_alpha = T.tensordot(h, self.V_alpha, [[1], [1]]) + T.shape_padleft(self.b_alpha)
        z_mu = T.tensordot(h, self.V_mu, [[1], [1]]) + T.shape_padleft(self.b_mu)
        z_sigma = T.tensordot(h, self.V_sigma, [[1], [1]]) + T.shape_padleft(self.b_sigma)
        temp = T.exp(z_alpha)  # + 1e-6
        # temp += T.shape_padright(temp.sum(2)/1e-3)
        Alpha = temp / T.shape_padright(temp.sum(2))  # BxDxC
        Mu = z_mu  # BxDxC
        Sigma = T.exp(z_sigma)  # + 1e-6 #BxDxC

        # Alpha = Alpha * T.shape_padright(output_mask) + T.shape_padright(mask)
        # Mu = Mu * T.shape_padright(output_mask)
        # Sigma = Sigma * T.shape_padright(output_mask) + T.shape_padright(mask)
        # Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x*output_mask)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2*np.pi)) #BxDxC

        Phi = (
            -constantX(0.5) * T.sqr((Mu - T.shape_padright(x)) / Sigma)
            - T.log(Sigma)
            - constantX(0.5 * np.log(2 * np.pi))
        )  # BxDxC
        logdensity = (log_sum_exp(Phi + T.log(Alpha), axis=2) * output_mask).sum(1) * D / (D - d)
        return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)
Example #6
def test_tensordot_reshape():
    '''Test that the tensordot implementation using dimshuffle, reshape and dot
    gives the same results as the default (numpy) version'''
    # define some tensors
    a = numpy.arange(20, dtype=theano.config.floatX) / 20.0
    b = numpy.arange(10, dtype=theano.config.floatX) / 10.0
    c = numpy.arange(5, dtype=theano.config.floatX) / 5.0
    d = numpy.arange(8, dtype=theano.config.floatX) / 8.0
    
    tensor1 = numpy.tensordot(a, numpy.tensordot(b, numpy.tensordot(c, d, 0), 0), 0)
    tensor2 = numpy.tensordot(c, numpy.tensordot(d, a, 0), 0)
    tensor3 = tensor2.swapaxes(1, 2).swapaxes(0, 2) # d, a, c
    
    x = T.tensor4('x')
    y = T.tensor3('y')
    
    # case 1: number of axes to sum over
    default1 = theano.function([x,y], T.tensordot(x, y, 2))(tensor1, tensor2)
    reshape1 = theano.function([x,y], B.tensordot(x, y, 2))(tensor1, tensor2)
    assert numpy.allclose(default1, reshape1)
    
    # case 2: axis pairs
    default2 = theano.function([x,y], T.tensordot(x, y, axes=[(0, 3), (1, 0)]))(tensor1, tensor3)
    reshape2 = theano.function([x,y], B.tensordot(x, y, axes=[(0, 3), (1, 0)]))(tensor1, tensor3)
    assert numpy.allclose(default2, reshape2)

    default3 = theano.function([x,y], T.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)]))(tensor1, tensor3)
    reshape3 = theano.function([x,y], B.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)]))(tensor1, tensor3)
    assert numpy.allclose(default3, reshape3)
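For reference, a small NumPy sketch of the equivalence this test exercises: contracting the last two axes of x with the first two of y is a single reshape-dot-reshape (shapes taken from the arrays above):

import numpy
x = numpy.random.rand(20, 10, 5, 8)
y = numpy.random.rand(5, 8, 20)
# tensordot(x, y, 2) collapses to one matrix product after reshaping.
via_dot = x.reshape(20 * 10, 5 * 8).dot(y.reshape(5 * 8, 20)).reshape(20, 10, 20)
assert numpy.allclose(numpy.tensordot(x, y, 2), via_dot)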
Example #7
    def sym_masked_neg_loglikelihood_gradient(self, x, mask):
        """ x is a matrix of column datapoints (DxB) D = n_visible, Bfloat = batch size """
        logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h = self.sym_mask_logdensity_estimator_intermediate(
            x, mask
        )

        #        nnz = output_mask.sum(0)
        #        sparsity_multiplier = T.shape_padright(T.shape_padleft((B+1e-6)/(nnz+1e-6)))

        #        wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0)) #BxDxC
        #        lp_current = log_sum_exp(wPhi, axis = 2) * output_mask #BxD
        #        lp_current_sum = (lp_current.sum(1) * D / (D-d)).sum() #1

        loglikelihood = logdensity.mean(dtype=floatX)
        loss = -loglikelihood

        dp_dz_alpha = T.grad(loss, z_alpha)  # BxDxC
        gb_alpha = dp_dz_alpha.sum(0)  # DxC
        gV_alpha = T.tensordot(h.T, dp_dz_alpha, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC

        dp_dz_mu = T.grad(loss, z_mu)  # BxDxC
        dp_dz_mu = dp_dz_mu * Sigma  # Heuristic
        gb_mu = dp_dz_mu.sum(0)  # DxC
        gV_mu = T.tensordot(h.T, dp_dz_mu, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC

        dp_dz_sigma = T.grad(loss, z_sigma)  # BxDxC
        gb_sigma = dp_dz_sigma.sum(0)  # DxC
        gV_sigma = T.tensordot(h.T, dp_dz_sigma, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC

        if self.n_layers > 1:
            gWs, gbs, gW1, gWflags, gb1 = T.grad(loss, [self.Ws, self.bs, self.W1, self.Wflags, self.b1])
            gradients = {
                "V_alpha": gV_alpha,
                "b_alpha": gb_alpha,
                "V_mu": gV_mu,
                "b_mu": gb_mu,
                "V_sigma": gV_sigma,
                "b_sigma": gb_sigma,
                "Ws": gWs,
                "bs": gbs,
                "W1": gW1,
                "b1": gb1,
                "Wflags": gWflags,
            }
        else:
            gW1, gWflags, gb1 = T.grad(loss, [self.W1, self.Wflags, self.b1])
            gradients = {
                "V_alpha": gV_alpha,
                "b_alpha": gb_alpha,
                "V_mu": gV_mu,
                "b_mu": gb_mu,
                "V_sigma": gV_sigma,
                "b_sigma": gb_sigma,
                "W1": gW1,
                "b1": gb1,
                "Wflags": gWflags,
            }
        # Gradients
        return (loss, gradients)
Example #8
    def output(self, input_vectors):
        """
        Calculate the n_output dot product scalars of this layer
        @param input_vectors: n_input vectors (actual shape should be (n_batch, n_input, n_dimension)
        """

        return T.sum(T.tensordot(input_vectors, self.W1, [[1], [0]]) *
                     T.tensordot(input_vectors, self.W2, [[1], [0]]), axis=1)
Example #9
    def output(self, train):
        X = self.get_input(train)
        X = X.dimshuffle((1,0,2))


        if self.is_entity:
            Entity = X[-1:].dimshuffle(1,0,2)
            X = X[:-1]

        b_y = self.b_y
        b_yn = T.repeat(T.repeat(b_y.reshape((1,self.output_dim)),X.shape[0],axis=0).reshape((1,X.shape[0],self.output_dim)), X.shape[1], axis=0)

        xif = T.dot(X, self.W_if) + self.b_if
        xib = T.dot(X, self.W_ib) + self.b_ib

        xff = T.dot(X, self.W_ff) + self.b_ff
        xfb = T.dot(X, self.W_fb) + self.b_fb

        xcf = T.dot(X, self.W_cf) + self.b_cf
        xcb = T.dot(X, self.W_cb) + self.b_cb

        xof = T.dot(X, self.W_of) + self.b_of
        xob = T.dot(X, self.W_ob) + self.b_ob

        [outputs_f, memories_f], updates_f = theano.scan(
            self._step,
            sequences=[xif, xff, xof, xcf],
            outputs_info=[
                alloc_zeros_matrix(X.shape[1], self.output_dim),
                alloc_zeros_matrix(X.shape[1], self.output_dim)
            ],
            non_sequences=[self.U_if, self.U_ff, self.U_of, self.U_cf],
            truncate_gradient=self.truncate_gradient
        )
        [outputs_b, memories_b], updates_b = theano.scan(
            self._step,
            sequences=[xib, xfb, xob, xcb],
            outputs_info=[
                alloc_zeros_matrix(X.shape[1], self.output_dim),
                alloc_zeros_matrix(X.shape[1], self.output_dim)
            ],
            non_sequences=[self.U_ib, self.U_fb, self.U_ob, self.U_cb],
            truncate_gradient=self.truncate_gradient
        )
        if self.return_sequences:
            y = T.add(T.add(
                    T.tensordot(outputs_f.dimshuffle((1,0,2)), self.W_yf, [[2],[0]]),
                    T.tensordot(outputs_b[::-1].dimshuffle((1,0,2)), self.W_yb, [[2],[0]])),
                b_yn)
            # y = T.add(T.tensordot(
            #     T.add(outputs_f.dimshuffle((1, 0, 2)),
            #           outputs_b[::-1].dimshuffle((1,0,2))),
            #     self.W_y,[[2],[0]]),b_yn)
            if self.is_entity:
                return T.concatenate([y, Entity], axis=1)
            else:
                return y
        return T.concatenate((outputs_f[-1], outputs_b[0]))
Example #10
def complex_tensordot(a, b, axes=2):
    AR, AI = a[0, ...], a[1, ...]
    BR, BI = b[0, ...], b[1, ...]

    output = tensor.stack([
        tensor.tensordot(AR, BR, axes=axes) - tensor.tensordot(AI, BI, axes=axes),
        tensor.tensordot(AR, BI, axes=axes) + tensor.tensordot(AI, BR, axes=axes),
    ], axis=0)
    return output
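A NumPy sketch of the real/imaginary bookkeeping in complex_tensordot above, checked against a native complex tensordot (the matrix shapes and axes=1 are chosen only for illustration):

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(4, 5) + 1j * rng.randn(4, 5)
B = rng.randn(5, 3) + 1j * rng.randn(5, 3)

a = np.stack([A.real, A.imag])   # leading axis: 0 = real part, 1 = imaginary part
b = np.stack([B.real, B.imag])

out = np.stack([
    np.tensordot(a[0], b[0], axes=1) - np.tensordot(a[1], b[1], axes=1),
    np.tensordot(a[0], b[1], axes=1) + np.tensordot(a[1], b[0], axes=1),
])
ref = np.tensordot(A, B, axes=1)
print(np.allclose(out[0], ref.real), np.allclose(out[1], ref.imag))  # True True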
Example #11
def apply_mat_to_kron(x, a, b, arg_type="numpy"):
    X = x.reshape((x.shape[0], a.shape[0], b.shape[0]))
    if arg_type == "numpy":
        result = np.tensordot(np.tensordot(X, a, axes=([1], [0])), b, axes=([1], [0]))
    elif arg_type == "theano":
        result = T.tensordot(T.tensordot(X, a, axes=([1], [0])), b, axes=([1], [0]))
    else:
        raise ValueError("arg_type must be 'numpy' or 'theano'")
    return result.reshape((x.shape[0], -1))
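A possible usage sketch for apply_mat_to_kron above, checking the NumPy path against an explicitly formed Kronecker product (the shapes below are arbitrary):

import numpy as np

rng = np.random.RandomState(0)
a = rng.randn(3, 4)
b = rng.randn(5, 2)
x = rng.randn(7, 3 * 5)               # rows of length a.shape[0] * b.shape[0]

fast = apply_mat_to_kron(x, a, b)     # never materializes the Kronecker product
slow = x.dot(np.kron(a, b))
print(np.allclose(fast, slow))        # True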
Example #12
 def output(self, input_value):
     if self.size is not None:
         if self.dotdim is None:
             input_value = T.tensordot(input_value, self.weight, axes = [input_value.ndim - 1, 0]) + self.bias
         else:
             input_value = T.tensordot(input_value, self.weight, axes = [self.dotdim + 1, 0]) + self.bias
             if self.dotdim + 1 < input_value.ndim - 1:
                 input_value = input_value.swapaxes(input_value.ndim - 1, self.dotdim + 1)
     return self.activation_function(input_value)
Example #13
 def contrastive_divergence_1(self, v1):
     '''Determine the weight updates according to CD-1'''
     h1 = self.sample_h_given_v(v1)
     v2 = self.sample_v_given_h(h1)
     h2p = self.propup(v2)
     updates = T.tensordot(v1, h1, [[0],[0]]) - T.tensordot(v2, h2p, [[0],[0]])
     f = 1.0 / self.minibatch_size
     return (updates * f,
             T.sum(v1 - v2, axis=0) * f,
             T.sum(h1 - h2p, axis=0) * f)
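As a sanity check on the CD-1 statistics above: contracting the batch axis of the visible and hidden activations is the summed outer product, i.e. v.T.dot(h). A NumPy sketch with made-up batch and layer sizes:

import numpy as np

v = np.random.rand(10, 6)   # assumed visible batch (minibatch_size, n_visible)
h = np.random.rand(10, 4)   # assumed hidden batch (minibatch_size, n_hidden)
print(np.allclose(np.tensordot(v, h, [[0], [0]]), v.T.dot(h)))  # True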
Example #14
    def get_output_for(self, inputs, **kwargs):
        """

        :param inputs: inputs: list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.
        :return: theano.TensorType
            Symbolic output variable.
        """
        input = inputs[0]
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # compute the bi-affine part
        # first via tensor dot ([batch, length, dim] * [dim, dim, num_label])
        # output shape = [batch, length, dim, num_label]
        out = T.tensordot(input, self.U, axes=[[2], [0]])
        # second via tensor dot ([batch, length, dim, num_label] * [batch, dim, length)
        # output shape = [batch, length, length, num_label]
        out = T.batched_tensordot(out, input.dimshuffle(0, 2, 1), axes=([2], [1]))
        out = out.dimshuffle(0, 1, 3, 2)

        # compute head bias part by tensor dot ([batch, length, dim] * [dim, num_label])
        # the shape of s_h should be [batch, length, num_label]
        if self.W_h is not None:
            s_h = T.tensordot(input, self.W_h, axes=[[2], [0]])
            out = out + s_h.dimshuffle(0, 1, 'x', 2)

        # compute child part by tensor dot ([batch, length, dim] * [dim, num_label]
        # the shape of s_c should be [batch, length, num_label]
        if self.W_c is not None:
            s_c = T.tensordot(input, self.W_c, axes=[[2], [0]])
            out = out + s_c.dimshuffle(0, 'x', 1, 2)

        # add bias part.
        if self.b is not None:
            out = out + self.b.dimshuffle('x', 'x', 'x', 0)

        if mask is not None:
            mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
            out = out * mask_shuffled
            mask_shuffled = mask.dimshuffle(0, 'x', 1, 'x')
            out = out * mask_shuffled
        return out
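A NumPy shape sketch of the two contractions commented above; batch, length, dim and num_label are placeholder sizes, and np.einsum stands in for T.batched_tensordot:

import numpy as np

batch, length, dim, num_label = 2, 5, 3, 4
inp = np.random.rand(batch, length, dim)
U = np.random.rand(dim, dim, num_label)

out = np.tensordot(inp, U, axes=[[2], [0]])   # (batch, length, dim, num_label)
# batched contraction of the remaining dim axis against the transposed input
out = np.einsum('bldn,bdm->blnm', out, inp.transpose(0, 2, 1))
print(out.shape)   # (2, 5, 4, 5); the subsequent dimshuffle(0, 1, 3, 2) then gives (batch, length, length, num_label)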
Example #15
    def function(self, xs, h_prevs, c_prevs):
        biases = T.shape_padright(T.ones_like(xs[:,0]))
        input_vector = T.concatenate((xs, h_prevs, biases), axis=1)

        forget_gate = T.nnet.sigmoid(T.tensordot(input_vector, self.W_forget_theano, axes=[[1],[1]]))
        input_gate = T.nnet.sigmoid(T.tensordot(input_vector, self.W_input_theano, axes=[[1],[1]]))
        candidate_vector = T.tanh(T.tensordot(input_vector, self.W_candidate_theano, axes=[[1],[1]]))
        cell_state = forget_gate*c_prevs + input_gate * candidate_vector

        output = T.nnet.sigmoid(T.tensordot(input_vector, self.W_output_theano, axes=[[1],[1]]))
        h = output * T.tanh(cell_state)
        return h, cell_state
Example #16
    def __init__(self, word_context, char_context, V, K, word_context_sz, char_context_sz, rng):
        """
        Initialize the parameters of the language model
        """
        # word training contexts
        self.word_context = word_context
        # character training contexts
        self.char_context = char_context

        # initialize context word embedding matrix Rw of shape (V, K)
        Rw_values = np.asarray(rng.uniform(-0.01, 0.01, size=(V, K)), 
                              dtype=theano.config.floatX)
        self.Rw = theano.shared(value=Rw_values, name='Rw', borrow=True)
        # initialize context character embedding matrix Rc of shape (V, K)
        Rc_values = np.asarray(rng.uniform(-0.01, 0.01, size=(V, K)), 
                              dtype=theano.config.floatX)
        self.Rc = theano.shared(value=Rc_values, name='Rc', borrow=True)

        # initialize target word embedding matrix Q of shape (V, K)
        Q_values = np.asarray(rng.uniform(-0.01, 0.01, size=(V, K)), 
                              dtype=theano.config.floatX)
        self.Q = theano.shared(value=Q_values, name='Q', borrow=True)
        # initialize word weight tensor Cw of shape (word_context_sz, K, K)
        Cw_values = np.asarray(rng.normal(0, math.sqrt(0.1), 
                                          size=(word_context_sz, K, K)), 
                              dtype=theano.config.floatX)
        self.Cw = theano.shared(value=Cw_values, name='Cw', borrow=True)
        # initialize character weight tensor Cc of shape (char_context_sz, K, K)
        Cc_values = np.asarray(rng.normal(0, math.sqrt(0.1), 
                                          size=(char_context_sz, K, K)), 
                               dtype=theano.config.floatX)
        self.Cc = theano.shared(value=Cc_values, name='Cc', borrow=True)
        # initialize bias vector 
        b_values = np.asarray(rng.normal(0, math.sqrt(0.1), size=(V,)), 
                              dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, name='b', borrow=True)
        # context word representations
        self.r_w = self.Rw[word_context]
        # context character representations
        self.r_c = self.Rc[char_context]
        # predicted word representation for target word by word context
        self.qw_hat = T.tensordot(self.Cw, self.r_w, axes=[[0,1], [1,2]])
        # predicted word representation for target word by character context
        self.qc_hat = T.tensordot(self.Cc, self.r_c, axes=[[0,1], [1,2]])
        # combine word and character predictions
        self.q_hat = self.qw_hat + self.qc_hat
        # similarity score between predicted word and all target words
        self.s = T.transpose(T.dot(self.Q, self.q_hat) + T.reshape(self.b, (V,1)))
        # softmax activation function
        self.p_w_given_h = T.nnet.softmax(self.s)
        # parameters of the model
        self.params = [self.Rw, self.Rc, self.Q, self.Cw, self.Cc, self.b]
Example #17
 def get_output(self, train=False):
     [X_w, X_t] = self.get_input(train)
     t_w = self.W_t[X_w[:,:, 0]] # doc_l, n_tags*n_samples, n_dim
     w_w = self.W_w[X_w[:,:, 1]]
     dot_tw = T.sum(w_w * t_w, axis=2)
     inter_1 = T.tensordot(w_w, self.S, axes = [[2],[2]])
     inter_2 = T.tensordot(t_w, self.P, axes = [[2],[2]]) # doc_l, n_tags*n_samples, 2,5
     inter = T.sum(inter_1 * inter_2, axis = 3)
     sim_tw = T.tensordot(inter + T.shape_padleft(self.B, 2), self.U, axes=[[2],[0]]) 
     sim_tw = T.reshape(sim_tw, (X_w.shape[0], X_w.shape[1]))
     dot_sum_w = T.sum(dot_tw * T.nnet.sigmoid(sim_tw), axis = 0)/(X_w.shape[0])
     dot_w = theano.tensor.reshape(dot_sum_w, (X_w.shape[1], 1))
     return self.activation(dot_w)
Example #18
    def __init__(self, model, glm, latent):
        """ Initialize the filtered stim model
        """
        self.model = model
        self.bkgd_model = model["bkgd"]
        self.n = glm.n
        self.tuningcurves = latent[self.bkgd_model["tuningcurves"]]
        self.spatial_basis = self.tuningcurves.spatial_basis
        self.tc_spatial_shape = self.tuningcurves.spatial_shape
        self.tc_spatial_ndim = self.tuningcurves.spatial_ndim
        self.temporal_basis = self.tuningcurves.temporal_basis
        self.Bx = self.tuningcurves.Bx
        self.Bt = self.tuningcurves.Bt
        self.w_x = self.tuningcurves.w_x[:, self.tuningcurves.Y[self.n]]
        self.w_t = self.tuningcurves.w_t[:, self.tuningcurves.Y[self.n]]

        # Create a shared variable for the filtered stimulus. This is a 4D
        # tensor with dimensions:
        #   - time
        #   - location (pixel)
        #   - spatial basis
        #   - temporal basis
        # To get a stimulus current we need to select a location and take a
        # weighted sum along both the spatial and temporal axes.
        self.filtered_stim = theano.shared(name="stim", value=np.ones((1, 1, 1, 1)))

        self.locations = latent[self.bkgd_model["locations"]]
        self.L = self.locations.Lmatrix[self.n, :]
        self.loc_index = self.locations.location_prior.ravel_index(self.L)

        # Expose outputs to the Glm class

        # It matters that we do the dot products in order of outermost
        # to innermost dimension. This improves memory efficiency.
        # Compute the spatially filtered stimulus
        # Result is T x L x B_t
        self.I_stim_t = T.tensordot(self.filtered_stim, self.w_t, axes=[[3], [0]])
        self.I_stim_t.name = "I_stim_t"

        # Take dot product with temporal basis coefficients
        # Result is T x L (where L is number of locations)
        self.I_stim_xt = T.tensordot(self.I_stim_t, self.w_x, axes=[[2], [0]])
        self.I_stim_xt.name = "I_stim_xt"

        self.I_stim = self.I_stim_xt[:, self.loc_index]
        self.I_stim.name = "I_stim"

        # There are no latent variables in this class. They all belong
        # to global latent variables.
        self.log_p = T.constant(0.0)
Example #19
 def learningstep(self, Y, L, W, epsilon, threshold):
     s = self._activation(Y,L,W,threshold)
     s.name = 's_%d.%d[t]'%(self._nmultilayer,self._nlayer)
     W_new = W + epsilon*(T.tensordot(s,Y,axes=[0,0]) -
                          T.sum(s,axis=0)[:,np.newaxis]*W)
     W_new.name = 'W_%d.%d[t]'%(self._nmultilayer,self._nlayer)
     return s, W_new
Example #20
def gram_mat(vecs):
    # theano gram matrix
    
    vecs = vecs.flatten(ndim = 3)
    gram = T.tensordot(vecs, vecs, axes=([2], [2]))
    
    return gram
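For orientation, a NumPy sketch of the shapes the tensordot above produces: for flattened feature maps of shape (batch, channels, pixels) the result is (batch, channels, batch, channels), whose diagonal blocks are the usual per-sample Gram matrices (sizes are arbitrary):

import numpy as np

x = np.random.rand(2, 3, 16)                 # (batch, channels, flattened spatial dims)
g = np.tensordot(x, x, axes=([2], [2]))      # shape (2, 3, 2, 3)
print(np.allclose(g[0, :, 0, :], x[0].dot(x[0].T)))  # True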
Example #21
 def forwardrankrel(self, x, y):
     """Forward function in the special case of relation ranking to avoid a
     broadcast problem. @TODO: think about a workaround."""
     xW = T.tensordot(x, self.W, axes=([1], [0]))
     xW = xW.reshape((1, xW.shape[1], xW.shape[2]))
     xWy = ((y.reshape((y.shape[0], y.shape[1], 1))) * xW).sum(1)
     return self.act(xWy + self.b)
Example #22
 def make_consensus(self, networks, axis=2):
   cns = self.attrs['consensus']
   if cns == 'max':
     return T.max(networks, axis=axis)
   elif cns == 'min':
     return T.min(networks, axis=axis)
   elif cns == 'mean':
     return T.mean(networks, axis=axis)
   elif cns == 'flat':
     if self.depth == 1:
       return networks
     if axis == 2:
       return networks.flatten(ndim=3)
       #return T.reshape(networks, (networks.shape[0], networks.shape[1], T.prod(networks.shape[2:]) ))
     else:
       return networks.flatten(ndim=2) # T.reshape(networks, (networks.shape[0], T.prod(networks.shape[1:]) ))
   elif cns == 'sum':
     return T.sum(networks, axis=axis, acc_dtype=theano.config.floatX)
   elif cns == 'prod':
     return T.prod(networks, axis=axis)
   elif cns == 'var':
     return T.var(networks, axis=axis)
   elif cns == 'project':
     p = self.add_param(self.create_random_uniform_weights(self.attrs['n_out'], 1, self.attrs['n_out'] + self.depth + 1))
     return T.tensordot(p, networks, [[1], [axis]])
   elif cns == 'random':
     idx = self.rng.random_integers(size=(1,), low=0, high=self.depth)
     if axis == 0: return networks[idx]
     if axis == 1: return networks[:,idx]
     if axis == 2: return networks[:,:,idx]
     if axis == 3: return networks[:,:,:,idx]
     assert False, "axis too large"
   else:
     assert False, "consensus method unknown: " + cns
Example #23
    def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)
        
        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]
        
        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)
Example #24
    def __init__(self, rng, input, n_in, n_in2, n_out,
                 activation, W=None, b=None,
                 use_bias=False):

        self.input = input
        self.activation = activation

        if W is None:
            W_values = np.asarray(0.01 * rng.standard_normal(
                size=(n_out, n_in, 1, n_in2)), dtype=theano.config.floatX)
            W = theano.shared(value=W_values, name='W')
        
        if b is None:
            b_values = np.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b')

        self.W = W
        self.b = b

        if use_bias:
            lin_output = T.dot(input, self.W,) + self.b
        else:
            lin_output = T.tensordot(input, self.W, axes = [[1,2,3],[1,2,3]])

        self.output = (lin_output if activation is None else activation(lin_output))
    
        # parameters of the model
        if use_bias:
            self.params = [self.W, self.b]
        else:
            self.params = [self.W]            
Example #25
	def _step(self,xg_t, xo_t, xc_t, mask_tm1,h_tm1, c_tm1, u_g, u_o, u_c):

		h_mask_tm1 = mask_tm1 * h_tm1
		c_mask_tm1 = mask_tm1 * c_tm1
		act = T.tensordot( xg_t + h_mask_tm1, u_g , [[1],[2]])
		gate = T.nnet.softmax(act.reshape((-1, act.shape[-1]))).reshape(act.shape)

		c_tilda = self.activation(xc_t + T.dot(h_mask_tm1, u_c))

		sigma_se = self.k_parameters[0]
		sigma_per = self.k_parameters[1]
		sigma_b_lin = self.k_parameters[2]
		sigma_v_lin = self.k_parameters[3]
		sigma_rq = self.k_parameters[4]

		l_se = self.k_parameters[5]
		l_per = self.k_parameters[6]
		l_lin = self.k_parameters[7]
		l_rq = self.k_parameters[8]

		alpha_rq = self.k_parameters[9]
		p_per = self.k_parameters[10]

		k_se = T.pow(sigma_se,2) * T.exp( -T.pow(c_mask_tm1 - c_tilda,2) / (2* T.pow(l_se,2) + self.EPS))
		k_per = T.pow(sigma_per,2) * T.exp( -2*T.pow(T.sin( math.pi*(c_mask_tm1 - c_tilda)/ (p_per + self.EPS) ),2)	 / ( T.pow(l_per,2) + self.EPS ))
		k_lin = T.pow(sigma_b_lin,2) + T.pow(sigma_v_lin,2)	 * (c_mask_tm1 - l_lin) * (c_tilda - l_lin )
		k_rq = T.pow(sigma_rq,2) * T.pow( 1 + T.pow( (c_mask_tm1 - c_tilda),2)	/ ( 2 * alpha_rq * T.pow(l_rq,2) + self.EPS), -alpha_rq)

		ops = [c_mask_tm1,c_tilda,k_se, k_per, k_lin,k_rq]
		yshuff = T.as_tensor_variable( ops, name='yshuff').dimshuffle(1,2,0)
		c_t = (gate.reshape((-1,gate.shape[-1])) * yshuff.reshape((-1,yshuff.shape[-1]))).sum(axis = 1).reshape(gate.shape[:2])
		o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1, u_o))
		h_t = o_t * self.activation(c_t)
		return h_t, c_t
Example #26
 def next_state_fn(self, a, last_state, U, u):
     U_act = U[a]
     u_act = u[a]
     return T.tensordot(
         U_act,
         (last_state), [[0], [0]]
     ) + u_act
Example #27
    def fprop(self, state_below):

        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below, self.desired_space)

        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size "+str(self.dbm.batch_size)+" but has "+str(value.shape[0]))

        self.desired_space.validate(state_below)
        assert state_below.ndim == 2

        assert self.W.ndim == 3

        Z = T.tensordot(state_below, self.W, axes=[[1],[0]]) + self.b

        rval = batched_softmax(Z)

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval
Example #28
    def get_output(self, train=False):
        X = self.get_input()
        Wx = T.tensordot(X, self.W, axes=(2, 0)).dimshuffle(1, 0, 2, 3)
        s_init = T.zeros((X.shape[0], self.output_dim))
        u_init = T.ones((X.shape[0], self.causes_dim)) / self.causes_dim
        outputs, updates = scan(
            self._step,
            sequences=[Wx],
            outputs_info=[s_init, u_init],
            non_sequences=[self.b] + self.hid2output.params,
            truncate_gradient=self.truncate_gradient)

        if self.return_mode == 'both':
            return T.concatenate([outputs[0], outputs[1]],
                                 axis=-1)
        elif self.return_mode == 'states':
            out = outputs[0]
        elif self.return_mode == 'causes':
            out = outputs[1]
        else:
            raise ValueError("return_model {0} not valid. Choose "
                             "'both', 'states' or 'causes'".format(
                                 self.return_mode))

        if self.return_sequences:
            return out.dimshuffle(1, 0, 2)
        else:
            return out[-1]
Example #29
    def getScores(self, args1, args2, l, n, relationProbs, neg1, neg2, entropy):
        argembed1 = self.A[args1]
        argembed2 = self.A[args2]

        weightedC = T.tensordot(relationProbs, self.C, axes=[[1], [2]])
        one = self.factorization(batchSize=l,
                                 argsEmbA=argembed1,
                                 argsEmbB=argembed2,
                                 wC=weightedC)  # [l,n]

        u = T.concatenate([one + self.Ab[args1], one + self.Ab[args2]])

        logScoresP = T.log(T.nnet.sigmoid(u))

        allScores = logScoresP
        allScores = T.concatenate([allScores, entropy, entropy])


        negembed1 = self.A[neg1.flatten()].reshape((n, l, self.k))
        negembed2 = self.A[neg2.flatten()].reshape((n, l, self.k))
        negOne = self.negFactorization1(batchSize=l,
                                        negEmbA=negembed1,
                                        argsEmbB=argembed2,
                                        wC=weightedC)

        negTwo = self.negFactorization2(batchSize=l,
                                        argsEmbA=argembed1,
                                        negEmbB=negembed2,
                                        wC=weightedC)

        g = T.concatenate([negOne + self.Ab[neg1].dimshuffle(1, 0),
                           negTwo + self.Ab[neg2].dimshuffle(1, 0)])
        logScores = T.log(T.nnet.sigmoid(-g))
        allScores = T.concatenate([allScores, logScores.flatten()])
        return allScores
Example #30
def hessian(objective, argument):
    """
    Compute the directional derivative of the gradient
    (which is equal to the hessian multiplied by direction).
    """
    g = T.grad(objective, argument)

    # Create a new tensor A, which has the same type (i.e. same dimensionality)
    # as argument.
    A = argument.type()

    try:
        # First attempt efficient 'R-op', this directly calculates the
        # directional derivative of the gradient, rather than explicitly
        # calculating the hessian and then multiplying.
        R = T.Rop(g, argument, A)
    except NotImplementedError:
        shp = T.shape(argument)
        H = T.jacobian(g.flatten(), argument).reshape(
                                        T.concatenate([shp, shp]), 2*A.ndim)
        R = T.tensordot(H, A, A.ndim)

    try:
        hess = theano.function([argument, A], R, on_unused_input='raise')
    except theano.compile.UnusedInputError:
        warn('Theano detected unused input - suggests hessian may be zero or '
             'constant.')
        hess = theano.function([argument, A], R, on_unused_input='ignore')
    return hess
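A possible usage sketch for the hessian helper above, assuming Theano is importable and hessian is in scope; a cubic is used so the Hessian actually depends on the evaluation point:

import theano.tensor as T

x = T.vector('x')
cost = T.sum(x ** 3)                           # Hessian is diag(6 * x)
hess = hessian(cost, x)                        # compiled function of (point, direction)
print(hess([1.0, 2.0, 3.0], [1.0, 1.0, 0.0]))  # approx. [6., 12., 0.]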
Example #31
 def apply(self, input0_, input1_):
     W, b, W_Linear = self.parameters
     output = T.tensordot(input0_, W, axes=[[1, 2], [0, 1]]) + T.dot(
         input1_, W_Linear) + b
     return output
Example #32
 def H(q, p):
     return 0.5 * T.tensordot(p, T.tensordot(Ker(q, q), p, [[1], [0]]),
                              [[0, 1], [0, 1]]) + met(q, p)
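The double contraction in H above amounts to 0.5 * sum(p * Ker(q, q).dot(p)) plus the met term; a NumPy sketch with a stand-in kernel matrix (sizes are arbitrary and met is omitted):

import numpy as np

n, d = 5, 2
K = np.random.rand(n, n)            # stand-in for Ker(q, q)
p = np.random.rand(n, d)

kinetic = 0.5 * np.tensordot(p, np.tensordot(K, p, [[1], [0]]), [[0, 1], [0, 1]])
print(np.isclose(kinetic, 0.5 * np.sum(p * K.dot(p))))  # True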
Example #33
    def sym_masked_neg_loglikelihood_gradient(self, x, mask):
        """ x is a matrix of column datapoints (DxB) D = n_visible, Bfloat = batch size """
        logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h = self.sym_mask_logdensity_estimator_intermediate(
            x, mask)

        #        nnz = output_mask.sum(0)
        #        sparsity_multiplier = T.shape_padright(T.shape_padleft((B+1e-6)/(nnz+1e-6)))

        #        wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0)) #BxDxC
        #        lp_current = log_sum_exp(wPhi, axis = 2) * output_mask #BxD
        #        lp_current_sum = (lp_current.sum(1) * D / (D-d)).sum() #1

        loglikelihood = logdensity.mean(dtype=floatX)
        loss = -loglikelihood

        dp_dz_alpha = T.grad(loss, z_alpha)  # BxDxC
        gb_alpha = dp_dz_alpha.sum(0)  # DxC
        gV_alpha = T.tensordot(h.T, dp_dz_alpha, [[1], [0]]).dimshuffle(
            (1, 0, 2))  # DxHxC

        dp_dz_mu = T.grad(loss, z_mu)  # BxDxC
        dp_dz_mu = dp_dz_mu * Sigma  # Heuristic
        gb_mu = dp_dz_mu.sum(0)  # DxC
        gV_mu = T.tensordot(h.T, dp_dz_mu, [[1], [0]]).dimshuffle(
            (1, 0, 2))  # DxHxC

        dp_dz_sigma = T.grad(loss, z_sigma)  # BxDxC
        gb_sigma = dp_dz_sigma.sum(0)  # DxC
        gV_sigma = T.tensordot(h.T, dp_dz_sigma, [[1], [0]]).dimshuffle(
            (1, 0, 2))  # DxHxC

        if self.n_layers > 1:
            gWs, gbs, gW1, gWflags, gb1 = T.grad(
                loss, [self.Ws, self.bs, self.W1, self.Wflags, self.b1])
            gradients = {
                "V_alpha": gV_alpha,
                "b_alpha": gb_alpha,
                "V_mu": gV_mu,
                "b_mu": gb_mu,
                "V_sigma": gV_sigma,
                "b_sigma": gb_sigma,
                "Ws": gWs,
                "bs": gbs,
                "W1": gW1,
                "b1": gb1,
                "Wflags": gWflags
            }
        else:
            gW1, gWflags, gb1 = T.grad(loss, [self.W1, self.Wflags, self.b1])
            gradients = {
                "V_alpha": gV_alpha,
                "b_alpha": gb_alpha,
                "V_mu": gV_mu,
                "b_mu": gb_mu,
                "V_sigma": gV_sigma,
                "b_sigma": gb_sigma,
                "W1": gW1,
                "b1": gb1,
                "Wflags": gWflags
            }
        # Gradients
        return (loss, gradients)
Example #34
def conv1d_sd(input, filters, image_shape, filter_shape, border_mode='valid',
              subsample=(1,)):
    """
    Using a single dot product.

    border_mode has to be 'valid' at the moment.
    """
    if border_mode != 'valid':
        log.error("Unsupported border_mode for conv1d_sd: "
                  "%s" % border_mode)
        raise RuntimeError("Unsupported border_mode for conv1d_sd: "
                           "%s" % border_mode)

    batch_size, num_input_channels, input_length = image_shape
    num_filters, num_input_channels_, filter_length = filter_shape
    stride = subsample[0]

    if filter_length % stride > 0:
        raise RuntimeError("Filter length (%d) is not a multiple of the "
                           "stride (%d)" % (filter_length, stride))

    num_steps = filter_length // stride
    output_length = (input_length - filter_length + stride) // stride

    # pad the input so all the shifted dot products fit inside.
    # shape is (b, c, l)
    padded_length = ((input_length // filter_length) * filter_length +
                     (num_steps - 1) * stride)

    # at this point, it is possible that the padded_length is SMALLER than the
    # input size. so then we have to truncate first.
    truncated_length = min(input_length, padded_length)
    input_truncated = input[:, :, :truncated_length]

    input_padded_shape = (batch_size, num_input_channels, padded_length)
    input_padded = T.zeros(input_padded_shape)
    input_padded = T.set_subtensor(input_padded[:, :, :truncated_length],
                                   input_truncated)

    inputs = []
    for num in range(num_steps):
        shift = num * stride
        length = (padded_length - shift) // filter_length

        r_input_shape = (batch_size, num_input_channels, length, filter_length)
        r_input = input_padded[
            :, :, shift:length * filter_length + shift].reshape(r_input_shape)

        inputs.append(r_input)

    inputs_stacked = T.stack(*inputs)  # shape is (n, b, c, w, f)
    filters_flipped = filters[:, :, ::-1]

    r_conved = T.tensordot(inputs_stacked, filters_flipped,
                           numpy.asarray([[2, 4], [1, 2]]))
    # resulting shape is (n, b, w, n_filters)
    # output needs to be (b, n_filters, w * n)
    r_conved = r_conved.dimshuffle(1, 3, 2, 0)  # (b, n_filters, w, n)
    conved = r_conved.reshape((r_conved.shape[0], r_conved.shape[1],
                               r_conved.shape[2] * r_conved.shape[3]))
    # result is (b, n_f, l)

    # remove padding
    return conved[:, :, :output_length]
Example #35
x_train_filt_T = theano.shared(x_train_filt.transpose(2, 0, 1))
x_test_filt_T = theano.shared(x_test_filt.transpose(2, 0, 1))
y_train_T = T.cast(theano.shared(y_train[:, 0]), 'int32')
y_test_T = T.cast(theano.shared(y_test[:, 0]), 'int32')

# lr         = 0.01 # learning rate
lr = T.scalar('lr')
batch_size = y_train.size / 4
epochs = 2500
index = T.lscalar('index')
y = T.ivector('y')
X = T.tensor3('X')
csp_w = theano.shared(W)
avg_v = theano.shared(V)
proj_csp = T.tensordot(X, csp_w, axes=[2, 0])
layer0_out = T.pow(proj_csp, 2)

variance = T.tensordot(layer0_out, avg_v, axes=[1, 0])

layer1_out = T.log((variance))[:, :, 0]
layer2 = LogisticRegression(input=layer1_out, n_in=5, n_out=2)
cost = layer2.negative_log_likelihood(y) + .01 * T.sum(T.pow(
    avg_v, 2)) - 1000 * (T.sgn(T.min(avg_v)) - 1) * T.pow(T.min(avg_v), 2)

params = [csp_w, avg_v] + layer2.params

grads = T.grad(cost, params)
updates = []
for param_i, grad_i in zip(params, grads):
    updates.append((param_i, param_i - lr * grad_i))
Example #36
from itertools import product
from warnings import warn
from time import time

##===================Theano expressions and functions===================

##-----model space-----

#theano
rf_stack_tnsr = tnsr.tensor3('rf_stack_tnsr')  ##G x stim_size x stim_size
feature_map_tnsr = tnsr.tensor4(
    'feature_map_tnsr')  ##T x D x stim_size x stim_size

apply_rf_to_feature_maps = function(inputs=[rf_stack_tnsr, feature_map_tnsr],
                                    outputs=tnsr.tensordot(rf_stack_tnsr,
                                                           feature_map_tnsr,
                                                           axes=[[1, 2],
                                                                 [2, 3]]))

#example python use case
#model_space = apply_rf_to_feature_maps(rf_stack, feature_maps)

##-----prediction menu----- (uses batched_tensordot. not sure why this is necessary, but memory error if normal tensordot is used.)
model_space_tnsr = tnsr.tensor3('X')  ##model-space tensor: G x T x D
feature_weight_tnsr = tnsr.tensor3('NU')  ##feature weight tensor: G x D x V
prediction_menu_tnsr = tnsr.batched_tensordot(
    model_space_tnsr, feature_weight_tnsr,
    axes=[[2], [1]])  ##prediction tensor: G x T x V
bigmult = function([model_space_tnsr, feature_weight_tnsr],
                   prediction_menu_tnsr)

##example python use case
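A NumPy shape sketch of apply_rf_to_feature_maps above: contracting both spatial axes applies every receptive field to every feature map, giving the G x T x D model-space tensor (the sizes below are made up):

import numpy as np

G, T_, D, S = 3, 5, 4, 8
rf_stack = np.random.rand(G, S, S)
feature_maps = np.random.rand(T_, D, S, S)
model_space = np.tensordot(rf_stack, feature_maps, axes=[[1, 2], [2, 3]])
print(model_space.shape)            # (3, 5, 4)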
Example #37
    def discriminative_free_energy(self, input=None):
        """
        discriminative_free_energy func
            The correct output is p(y|x)

        Parameters
        ----------
        self : RBM class object

        input : `[T.tensors]`, optional
            Used when calculating free energy of gibbs chain sampling

        Returns
        -------
        F(y|x) :
            A `list[]` of vectors of the discriminative model free energy
            for each output node. Negative loglikelihood can be used as the
            objective function.

        Notes
        -----
        The free energy for the discriminative model is computed as:

        :math:
        `F(y,x,h) = (xWh + yWh + yBx + vbias*x + hbias*h + cbias*y)`\n
        `    wx_b = xW_{ik} + yW_{jk} + hbias`\n
        `  F(y,x) = {cbias*y + yBx + sum_k[ln(1+exp(wx_b))]}`\n
        `  F(y|x) = {cbias + Bx + sum_k[ln(1+exp(wx_b)]}`\n
        `  F(y|x) = {cbias + Bx + hbias + yWh}`\n

        :params: used are W^1, W^2, B, c, h biases

        """
        # amend input if given an input. e.g. free_energy(chain_end)
        if input is None:
            visibles = self.input
        else:
            visibles = input
        hbias = self.hbias[0]
        cbiases = self.cbias
        vbias = self.vbias
        xWh_params = self.V_params
        hWy_params = self.U_params  # (items, outs, hiddens)
        B_params = self.B_params

        # rebroadcast hidden unit biases
        # (hiddens,) broadcast(T, F, T) --> ('x', hiddens, 'x')
        wx_b = hbias.dimshuffle('x', 0, 'x')
        utility = []

        for cbias in cbiases:
            # (items, outs) --> ('x', outs)
            # utility = [cbias,...]  ('x', outs)
            cbias = T.sum(cbias, axis=0)
            u = cbias.dimshuffle('x', 0)
            utility.append(u)

        # loop over all input nodes
        # x : input variables
        # W, B : weights
        # a : input biases
        for x, xWh, B in zip(visibles, xWh_params, B_params):
            # matrix dot product between input variables and hidden units
            # xw = xW_{ik} : (rows, hiddens)
            # wx_b = xW_{ik} + hbias : (rows, hiddens) --> (rows, hids, 'x')
            if xWh.ndim == 2:
                xw = T.dot(x, xWh)
                wx_b += xw.dimshuffle(0, 1, 'x')
            else:
                xw = T.tensordot(x, xWh, axes=[[1, 2], [0, 1]])
                wx_b += xw.dimshuffle(0, 1, 'x')

            # loop over all output nodes
            # hWy : weights (items, outs, hiddens)
            for i, hWy in enumerate(hWy_params):
                # wx_b = W_{jk} + W_{jk} + hbias : (rows, hiddens, outs)
                hWy = T.sum(hWy, axis=0)
                wx_b += hWy.dimshuffle('x', 1, 0)
                # xB : (rows, items, cats) . (items, cats, items, outs)
                # utility[i] = cbias + Bx : (rows, outs)
                utility[i] += T.tensordot(x,
                                          T.sum(B, axis=-2),
                                          axes=[[1, 2], [0, 1]])

        # sum over hiddens axis
        # sum_k \ln(1+\exp(wx_b)) : (rows, hiddens, outs) -- > (rows, outs)
        entropy = T.sum(T.log(1 + T.exp(wx_b)), axis=1)

        # add entropy to each expected utility term
        # -F(y|x)  (rows, outs)
        energy = []
        for u in utility:
            energy.append(u + entropy)

        return energy
Example #38
 def _inference(self, Y, W):
     """Return the infered class label for a given input"""
     W_normalized = T.switch(T.eq(W,0), 0, W/T.sum(W, axis=0))
     s = T.tensordot(Y, W_normalized, axes=[1,1])
     return s
Example #39
 def tensordot(self, a, b):
     return tt.tensordot(a, b, axes=(0, 0))
Example #40
def gramMatrix(x):
    x = x.flatten(ndim=3)
    return T.tensordot(x, x, axes=([2], [2]))
Example #41
    def __init__(self, model, algo='fisher', c_lambd_inv=1e-3, rate=1.05,
                 over_sampling=1, rescale='momentum'):
        """ Init self.

        Args:
            model,
            algo,
            c_lambd_inv: Start value of \lambda regularizer (used in matrix
                inversion and in F*v computation).
            rate: Change per iteration for \lambda.
            over_sampling: For Fisher-like methods, use multiple random
                vectors per one sample from dataset.
            rescale: Can be either False, True or 'momentum'.

        Implemented algos:
            'gn' - Gauss-Newton matrix,
            'fisher' - Fisher matrix,
            'kr' - Khatri-Rao matrix,
            'kr_diag' - block-diagonal KR matrix.
        """
        self.model = model
        self.algo = algo

        self.x = self.model.x
        self.y = T.ivector('y')
        self.outc = T.matrix('outc')

        # due to theano bugs
        self.x_d = shared_empty(2)
        self.y_d = shared_empty(1, dtype='int32')
        self.outc_d = shared_empty(2)
        self.rand_outc_d = shared_empty(3)
        # ---

        self.rand_outc = T.tensor3('rand_outc')
        self.lambd_inv = T.scalar('lambd_inv')

        self.c_lambd_inv = c_lambd_inv
        self.rate = rate
        self.over_sampling = over_sampling
        self.rescale = rescale

        # -- target def --
        self.f_loss = 0
        self.f_loss_samples = 0
        for i in range(self.over_sampling):
            self.f_loss += get_loss(self.model.a, self.rand_outc[i] + my_consider_constant(self.model.a)) * scalar_floatX(self.model.a.shape[0])
            self.f_loss_samples += get_loss_samples(self.model.a, self.rand_outc[i] + my_consider_constant(self.model.a))

        self.loss = get_loss(self.model.a, self.outc)
        self.err = get_error(get_pred(self.model.a), self.y)

        self.updates = OrderedDict()

        self.grad = sum(([T.grad(self.loss, p)] for p in self.model.params), [])
        self.grad_vec = T.concatenate([g.flatten() for g in self.grad])

        def get_fisher_mat():
            grad2d = []
            for p in self.model.params:
                grad2d += [T.jacobian(self.f_loss_samples, p)]
                if grad2d[-1].ndim == 2:
                    grad2d[-1] = grad2d[-1].dimshuffle(0, 1, 'x')

            grad2d_vec = T.concatenate([g.flatten(2).T for g in grad2d]).T

            # tensor wise: F_p,i,j = sum_k grad2d[p,i,k]*grad2d[p,k,j]
            # just a slow reference implementation of what is below
            # F = T.mean(T.batched_dot(grad2d_vec.dimshuffle(0, 1, 'x'), grad2d_vec.dimshuffle(0, 'x', 1)), 0)/self.over_sampling
            F = T.dot(grad2d_vec.T, grad2d_vec)/T.cast(grad2d_vec.shape[0], theano.config.floatX)/self.over_sampling
            return F

        if self.algo == 'fisher':
            self.grad2d = []
            for p in self.model.params:
                self.grad2d += [T.jacobian(self.f_loss_samples, p)]
                if self.grad2d[-1].ndim == 2:
                    self.grad2d[-1] = self.grad2d[-1].dimshuffle(0, 1, 'x')

            self.grad2d_vec = T.concatenate([g.flatten(2).T for g in self.grad2d]).T

            # tensor wise: F_p,i,j = sum_k grad2d[p,i,k]*grad2d[p,k,j]
            # just a slow reference implementation of what is below
            # F = T.mean(T.batched_dot(grad2d_vec.dimshuffle(0, 1, 'x'), grad2d_vec.dimshuffle(0, 'x', 1)), 0)/self.over_sampling
            self.F = T.dot(self.grad2d_vec.T, self.grad2d_vec)/T.cast(self.grad2d_vec.shape[0], theano.config.floatX)/self.over_sampling
        elif self.algo == 'gn':
            self.grad2d = []
            for p in self.model.params:
                self.grad2d += [T.jacobian(self.model.a.flatten(), p)]
                new_shape = (self.model.a.shape[0], self.model.a.shape[1], -1)
                self.grad2d[-1] = self.grad2d[-1].reshape(new_shape)


            self.grad2d_vec = T.concatenate([g.flatten(3) for g in self.grad2d], 2)

            # just a slow reference implementation of what is below
            # self.F = T.mean(T.batched_dot(self.grad2d_vec.dimshuffle(0, 2, 1),
            #                               self.grad2d_vec.dimshuffle(0, 1, 2)), axis=0)

            self.F = T.tensordot(self.grad2d_vec.dimshuffle(0, 2, 1),
                                 self.grad2d_vec.dimshuffle(0, 1, 2), [(0, 2), (0, 1)])/T.cast(self.grad2d_vec.shape[0], theano.config.floatX)
        elif self.algo.startswith('kr'):
            self.grads = []
            # self.acts = [T.concatenate([self.model.x, T.ones((self.model.x.shape[0], 1))], axis=1)]
            self.acts = [self.model.x]
            for l in self.model.layers:
                cg = T.grad(self.f_loss, l.s)
                self.grads.append(cg)
                # self.acts.append(T.concatenate([l.a, T.ones((l.a.shape[0], 1))], axis=1))
                self.acts.append(l.a)

            self.G = []
            self.A = []
            self.F_block = []
            self.F = []

            cnt = T.cast(self.grads[0].shape[0], theano.config.floatX)
            for i in range(len(self.grads)):
                self.G += [[]]
                self.A += [[]]
                for j in range(len(self.grads)):
                    # self.G[-1] += [T.mean(T.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)]
                    # self.A[-1] += [T.mean(T.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)]

                    # self.G[-1] += [T.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1))]
                    # self.A[-1] += [T.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1))]

                    self.G[-1] += [self.grads[i].T.dot(self.grads[j]).dimshuffle('x', 0, 1)/cnt]
                    self.A[-1] += [self.acts[i].T.dot(self.acts[j]).dimshuffle('x', 0, 1)/cnt]

                    if self.algo.endswith('diag'):
                        self.G[-1][-1] *= float(i==j)
                        self.A[-1][-1] *= float(i==j)


            for i in range(len(self.grads)):
                self.F_block += [[]]
                for j in range(len(self.grads)):
                    # depends on whether you want to compute the real fisher with this or the kr approximation
                    # since numpy-base fast_kron somehow computes 3d tensors faster than theano

                    # cblock = fast_kron(self.A[i][j], self.G[i][j])
                    cblock = native_kron(self.A[i][j], self.G[i][j])

                    cblock = cblock.reshape(cblock.shape[1:], ndim=2)
                    self.F_block[i] += [cblock]
                self.F.append(T.concatenate(self.F_block[-1], axis=1))
            self.F = T.concatenate(self.F, axis=0)
            self.F = (self.F+self.F.T)/2


        self.Fdamp = self.F+T.identity_like(self.F)*self.lambd_inv

        # There're 3+ different ways of computing F^-1*v in theano,
        # and it seems like solve_sym_pos is quite neutral in terms
        # of performance + it throws an exception if the provided matrix
        # is singular.

        # self.new_grad_vec = theano.tensor.slinalg.solve(self.Fdamp, self.grad_vec.dimshuffle(0, 'x'))
        self.new_grad_vec = solve_sym_pos(self.Fdamp, self.grad_vec)
        # self.new_grad_vec = gpu_solve(self.Fdamp, self.grad_vec.dimshuffle(0, 'x'))

        pcount = sum(p.get_value().size for p in self.model.params)
        self.ch_history = theano.shared(np.zeros((pcount,), dtype=theano.config.floatX))

        if self.rescale == 'momentum':
            self.real_fish = get_fisher_mat() + T.identity_like(self.F)*self.lambd_inv

            FT = self.real_fish.dot(self.new_grad_vec)
            FM = self.real_fish.dot(self.ch_history)

            TFT = self.new_grad_vec.T.dot(FT)
            MFT = self.ch_history.T.dot(FT)
            MFM = self.ch_history.T.dot(FM)

            GT = self.grad_vec.T.dot(self.new_grad_vec)
            GM = self.grad_vec.T.dot(self.ch_history)


            tmp1 = T.stack([TFT.reshape(()), MFT.reshape(())], 0).dimshuffle('x', 0)
            tmp2 = T.stack([MFT.reshape(()), MFM.reshape(())], 0).dimshuffle('x', 0)

            A = T.concatenate([tmp1, tmp2], 0)
            A_pinv = T.nlinalg.MatrixPinv()(A)
            b = T.stack([GT.reshape(()), GM.reshape(())], 0).dimshuffle(0, 'x')

            res = A_pinv.dot(b).flatten()

            alpha = res[0]
            beta = res[1]

            self.new_grad_vec = self.new_grad_vec * alpha.reshape(()) + self.ch_history * beta.reshape(())
            self.F = self.real_fish

            self.updates[self.ch_history] = self.new_grad_vec
        elif self.rescale:
            self.real_fish = get_fisher_mat() + T.identity_like(self.F)*self.lambd_inv
            lin_fac = self.grad_vec.T.dot(self.new_grad_vec)
            quad_fac = self.new_grad_vec.T.dot(self.real_fish.dot(self.new_grad_vec))

            alpha = lin_fac/quad_fac
            beta = 0 * alpha

            self.new_grad_vec *= alpha.reshape(())
            self.F = self.real_fish
            # self.Fdamp = self.F+T.identity_like(self.F)*self.lambd_inv

        # alpha = T.as_tensor_variable(1)

        def _apply_gradient_vec(params, new_grad_vec, updates):
            new_grad = []
            offset = 0
            for p in params:
                pval = p.get_value()
                new_grad += [new_grad_vec[offset:offset+pval.size].reshape(pval.shape)]
                offset += pval.size

                updates[p] = p - new_grad[-1]

            return new_grad

        self.new_grad = _apply_gradient_vec(self.model.params, self.new_grad_vec, self.updates)

        self.get_params = theano.function(
            inputs=[],
            outputs=self.model.params,
            on_unused_input='warn'
        )

        self.quad_est_loss = self.new_grad_vec.T.dot(self.F.dot(self.new_grad_vec))/2
        self.est_loss = self.quad_est_loss + self.grad_vec.dot(self.new_grad_vec)

        self.print_pls = {}
        self.print_pls.update({'shape': self.F.shape[0], 'rank': rank(self.F*10000)})
        self.print_pls.update({'grad_mean': T.mean(self.grad_vec**2)**0.5})
        self.print_pls.update({'alpha': alpha, 'beta': beta})
        # self.print_pls += [self.F]
        # self.print_pls += [self.real_fish]

        self.train = theano.function(
            inputs=[self.lambd_inv],
            outputs=[self.est_loss, self.loss, self.err] + list(self.print_pls.values()),
            updates=self.updates,
            givens={
                self.x: self.x_d,
                self.y: self.y_d,
                self.outc: self.outc_d,
                self.rand_outc: self.rand_outc_d
            },
            on_unused_input='warn',
            allow_input_downcast=True,
            # profile=True
        )

        self.eva = theano.function(
            inputs=[],
            outputs=[self.loss, self.err],
            givens={
                self.x: self.x_d,
                self.y: self.y_d,
                self.outc: self.outc_d
            },
            on_unused_input='warn',
            allow_input_downcast=True
        )

    def step(self, X, y, outc):
        """Perform single train iteration.

        Args:
            X: input vectors.
            y: target labels.
            outc: target vectors.

        Returns:
            Dict consisting of 'loss', 'err', 'est_loss', 'rho', 'delta_ll' and
            parameters from self.print_pls.

        """
        self.x_d.set_value(X)
        self.y_d.set_value(y)
        self.outc_d.set_value(outc)
        self.rand_outc_d.set_value(floatX(nprng.randn(self.over_sampling, *outc.shape)))

        old_params = self.get_params()
        while True:
            # reset params to saved
            for op, p in zip(old_params, self.model.params):
                p.set_value(op)

            try:
                t_r = self.train(self.c_lambd_inv)

                print_pls_vals = t_r[-len(self.print_pls):]
                self.print_pls_res = {k: v for k, v in zip(self.print_pls.keys(), print_pls_vals)}
            except np.linalg.LinAlgError:
                t_r = [1e20, 1e10, 10] + [None] * len(self.print_pls)
                self.print_pls_res = {k: None for k in self.print_pls.keys()}

            e_v = self.eva()
            delta_ll = t_r[1] - e_v[0]
            rho = delta_ll/float(t_r[0])

            print()
            print('lambda:', round(self.c_lambd_inv, 7), 'rho:', round(rho, 2), 'old loss:',  t_r[1], 'new loss:', e_v[0])
            if rho < 0:
                self.c_lambd_inv *= self.rate * 2
                continue
            elif rho < 0.5:
                self.c_lambd_inv *= self.rate
                # self.c_lambd_inv = min(self.c_lambd_inv, 0.02)
            elif rho > 0.5:
                self.c_lambd_inv /= self.rate
            else:
                pass
            break

        # self.train.profiler.print_summary()
        res = {'rho': rho, 'est_loss': t_r[0], 'loss': t_r[1], 'err': t_r[2], 'delta_ll': delta_ll}
        res.update(self.print_pls_res)

        return res

    def evaluate(self, X_test, y_test, outc_test):
        """Return loss and error for provided dataset.

        Args:
            X_test: input vectors.
            y_test: target labels.
            outc_test: target vectors.

        Returns:
            Dict consisting of 'test_loss', 'test_err'.
        """
        self.x_d.set_value(X_test)
        self.y_d.set_value(y_test)
        self.outc_d.set_value(outc_test)

        te_v = self.eva()
        test_loss = te_v[0]
        test_err = te_v[1]

        return {'test_loss': test_loss, 'test_err': test_err}

    def _check_gv_matrix_correctness(self):
        v = T.vector('v')
        get_Fv = theano.function(
            inputs=[v],
            outputs=[self.F.dot(v)],
            givens={
                self.x: self.x_d,
                self.outc: self.outc_d
            },
            allow_input_downcast=True
        )

        grad_at = theano.function(
            inputs=[],
            outputs=sum(([T.grad(self.loss, p)] for p in self.model.params), []),
            givens={
                self.x: self.x_d,
                self.outc: self.outc_d
            },
            allow_input_downcast=True
        )
        grads0 = grad_at()

        vec = []

        EPS = 1e-5
        for p in self.model.params:
            vec += [nprng.randn(*p.get_value().shape).astype(theano.config.floatX)]
            p.set_value(p.get_value()+vec[-1]*EPS)
        grads1 = grad_at()

        vec_vec = np.concatenate([p.flatten() for p in vec])
        F_vec = get_Fv(vec_vec)
        F_vec_vec = np.concatenate([f.flatten() for f in F_vec])

        grads0_vec = np.concatenate([p.flatten() for p in grads0])
        grads1_vec = np.concatenate([p.flatten() for p in grads1])

        F_vec_emp = (grads1_vec-grads0_vec)/EPS

        print(np.mean(F_vec_emp**2)**0.5, np.mean(F_vec_vec**2)**0.5)
        print(np.max(np.abs(F_vec_emp-F_vec_vec)))
        exit(0)
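# A minimal standalone NumPy sketch of the block assembly performed above (hedged: the toy
# shapes and identity covariances are illustrative assumptions, and the real code works on
# Theano expressions via native_kron). Each Fisher block is approximated by the Kronecker
# product of an activation covariance A[i][j] and a gradient covariance G[i][j], the blocks
# are concatenated into one symmetric matrix, and the damped system is solved for F^-1 * g.
import numpy as np

A = [[np.eye(2) for _ in range(2)] for _ in range(2)]   # toy activation covariances
G = [[np.eye(3) for _ in range(2)] for _ in range(2)]   # toy gradient covariances

rows = []
for i in range(2):
    # Each block F_ij ~= kron(A_ij, G_ij); here every block is 6x6.
    row = [np.kron(A[i][j], G[i][j]) for j in range(2)]
    rows.append(np.concatenate(row, axis=1))
F = np.concatenate(rows, axis=0)        # (12, 12) block matrix
F = (F + F.T) / 2                       # symmetrize, as in the snippet above
damped = F + np.eye(F.shape[0]) * 1e-2  # damping plays the role of lambd_inv
g = np.random.randn(F.shape[0])
step = np.linalg.solve(damped, g)       # the F^-1 * v solve done with solve_sym_pos above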
def pymc3_simple(indep,
                 dep,
                 img_dir_orig,
                 degree=2,
                 mindep=-1.0,
                 maxdep=0.4,
                 sampling=1000,
                 tune=1000,
                 uniform=True,
                 extratext='',
                 plot=True):
    img_dir = op.join(img_dir_orig, 'deg_%d' % (degree), extratext)
    mkpath(img_dir)
    ndim = len(indep)
    limlist = []
    for indepi in indep:
        per = np.percentile(indepi, [1.0, 99.0])
        limlist.append(per)
    lower, upper = min(mindep, np.amin(dep)), max(
        maxdep, np.amax(dep))  # Limits for dependent variable
    x = np.empty(
        (0, degree +
         1))  # To set up grid on which true dust parameter n will be defined
    for lim in limlist:
        x = np.append(x,
                      np.linspace(lim[0], lim[-1], degree + 1)[None, :],
                      axis=0)
    xx = np.meshgrid(*x)  # N-D grid for polynomial computations
    a_poly_T = get_a_polynd(
        xx
    ).T  # Array related to the grid that will be used in the least-squares computation
    aTinv = np.linalg.inv(a_poly_T)
    rc = -1.0  # rcond parameter set to -1 to keep all entries of the result at machine precision, regardless of rank issues
    # 2-D array that will be multiplied by coefficients to calculate the dust parameter at the observed independent variable values
    term = calc_poly_tt(indep, degree)
    # breakpoint()
    with pm.Model() as model:
        # Priors on the parameters ngrid (n over the grid) and sigma (related to width of relation)
        if uniform:
            ngrid = pm.Uniform("ngrid",
                               lower=lower - 1.0e-5,
                               upper=upper + 1.0e-5,
                               shape=xx[0].size,
                               testval=np.random.uniform(
                                   lower, upper, xx[0].size))
        else:
            ngrid = pm.TruncatedNormal("ngrid",
                                       mu=0.3,
                                       sigma=1.0,
                                       lower=lower - 1.0e-5,
                                       upper=upper + 1.0e-5,
                                       shape=xx[0].size,
                                       testval=np.random.uniform(
                                           lower, upper / 2.0, xx[0].size))
        sigma = pm.HalfNormal("sigma", sigma=1)

        # Compute the expected n at each sample
        coefs = tt.dot(aTinv, ngrid)
        mu = tt.tensordot(coefs, term, axes=1)

        # Likelihood (sampling distribution) of observations
        dep_obs = pm.Normal("dep_obs", mu=mu, sigma=sigma, observed=dep)

        map_estimate = pm.find_MAP()
        print(map_estimate)

        trace = pm.sample(draws=sampling,
                          tune=tune,
                          init='adapt_full',
                          target_accept=0.9,
                          return_inferencedata=True)

        if plot:
            az.plot_trace(trace)
            plt.savefig(op.join(img_dir,
                                "polyND%s_trace_pm_simp.png" % (extratext)),
                        bbox_inches='tight',
                        dpi=300)
        print(az.summary(trace, round_to=2))

    return trace, xx, map_estimate
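# A hypothetical call sketch for pymc3_simple (hedged): the data below are synthetic, and the
# helpers get_a_polynd, calc_poly_tt, mkpath plus the pm/az/np/op imports are assumed to be
# available from the original module.
import numpy as np

# Two independent variables, 200 synthetic observations, roughly linear "dust parameter".
indep = np.random.uniform(-1.0, 1.0, size=(2, 200))
dep = 0.3 * indep[0] - 0.2 * indep[1] + 0.05 * np.random.randn(200)

trace, xx, map_estimate = pymc3_simple(indep, dep, 'img', degree=2,
                                       sampling=500, tune=500, plot=False)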
#
# Define Parameter Updates
#
#########################################################

FA_mean_perturbations = FA.mean(axis=1)

# Create List of Updates
param_updates = []

for i in range(len(params)):
    print('Creating updates for parameter %d...' % i)

    print('Calculating derivative')
    normalization = T.nnet.softplus(sigmas[i]) + sig_min_perturbations
    delta = T.tensordot(FA_mean_perturbations, r_epsilons[i],
                        axes=[[0], [0]]) / normalization / n_perturbations

    # USE ADAM OPTIMIZER
    p_adam = Adam(delta, params[i], 0.9, 0.999, learning_rate, epsilon=10e-6)
    param_updates = param_updates + p_adam.updates

for i in range(len(sigmas)):

    print('Creating updates for std dev of parameter %d...' % i)

    print('Calculating derivative')
    normalization = T.nnet.softplus(sigmas[i]) + sig_min_perturbations
    outer_der = (r_epsilons[i] * r_epsilons[i] - 1.0) / normalization
    inner_der = T.exp(sigmas[i]) / (1.0 + T.exp(sigmas[i]))
    delta_sigma = T.tensordot(FA_mean_perturbations,
                              outer_der * inner_der,
                              axes=[[0], [0]]) / n_perturbations  # axes/normalization assumed by analogy with the delta update above
Example #44
0
    def __init__(self, num_actions):

        # remember parameters
        self.num_actions = num_actions

        # batch size is T_MAX now
        self.batch_size = 1  #BATCH_SIZE

        self.discount_rate = DISCOUNT_RATE

        self.history_length = HISTORY_LENGTH
        self.screen_dim = DIMS
        self.img_height = SCREEN_HEIGHT
        self.img_width = SCREEN_WIDTH

        self.beta = BETA

        self.learning_rate = LEARNING_RATE
        self.rms_decay = RMS_DECAY
        self.rms_epsilon = RMS_EPSILON

        # prepare tensors once and reuse them
        state = T.tensor3('state')
        reward = T.fscalar('reward')
        advantage = T.fscalar('advantage')
        action = T.iscalar('action')

        #beta = T.fscalar('regularization_rate')
        # set learning rate
        #self.shared_beta = theano.shared(np.zeros((1)), dtype=theano.config.floatX ,
        #        broadcastable=(True))
        #self.shared_beta.set_value([BETA])

        # create shared theano variables
        self.state_shared = theano.shared(
            np.zeros((self.history_length, self.img_height, self.img_width),
                     dtype=theano.config.floatX))

        self.reward_shared = theano.shared(
            np.zeros((1), dtype=theano.config.floatX))

        self.advantage_shared = theano.shared(
            np.zeros((1), dtype=theano.config.floatX))

        self.action_shared = theano.shared(np.zeros((1), dtype='int32'))

        # can add multiple nets here
        # Shared network parameters here
        self.shared_net = self.build_shared_network()
        shared_out = lasagne.layers.get_output(self.shared_net, state)

        ####### OPTIMIZATION here --------------
        # Policy network parameters here
        self.policy_network = self.build_policy_network(self.shared_net)
        policy_out = lasagne.layers.get_output(self.policy_network, state)

        # Value network parameters here
        self.value_network = self.build_value_network(self.shared_net)
        value_out = lasagne.layers.get_output(self.value_network, state)

        ## ----------------------- LOSS FUNCTION STARTS HERE ----------------------------------------

        # take log policy loss
        policy_loss = -T.log(policy_out[0][self.action_shared]).dot(
            self.advantage_shared)

        # entropy term for the regularizer (note: the double negation below makes this the negative entropy)
        entropy = -T.tensordot(policy_out, T.log(policy_out)).dot(-1)

        # add regularization
        policy_loss += self.beta * entropy

        policy_loss = T.sum(policy_loss)

        # get the value loss
        value_loss = ((self.reward_shared - value_out)**2) / 2
        value_loss = T.sum(value_loss)

        total_loss = T.sum(policy_loss + (0.5 * value_loss))

        ## ----------------------- LOSS FUNCTION ENDS HERE ----------------------------------------

        shared_params = lasagne.layers.helper.get_all_params(self.shared_net)
        only_policy_params = self.policy_network.get_params()
        only_value_params = self.value_network.get_params()

        policy_params = shared_params + only_policy_params
        value_params = shared_params + only_value_params

        g_time = time.time()
        logger.info("graph compiling")

        # get grads here
        policy_grad = T.grad(total_loss, policy_params)
        value_grad = T.grad(total_loss, value_params)

        # there'll be two kinds of updates
        policy_updates = rmsprop_updates(policy_grad, policy_params,
                                         self.learning_rate, self.rms_decay,
                                         self.rms_epsilon)

        value_updates = rmsprop_updates(value_grad, value_params,
                                        self.learning_rate, self.rms_decay,
                                        self.rms_epsilon)

        givens = {
            state: self.state_shared,
            reward: self.reward_shared,
            action: self.action_shared,
            advantage: self.advantage_shared,
        }

        # theano functions for accumulating the grads
        self._policy_grad = theano.function([], policy_grad, givens=givens)
        self._value_grad = theano.function([], value_grad, givens=givens)

        # train will take input the grads and just apply them

        # NEEDS work here ------------
        self._train_policy = theano.function(policy_grad, [],
                                             updates=policy_updates)
        self._train_value = theano.function(value_grad, [],
                                            updates=value_updates)

        # get output for a state
        self._policy = theano.function([],
                                       policy_out,
                                       givens={state: self.state_shared})
        self._value = theano.function([],
                                      value_out,
                                      givens={state: self.state_shared})

        # need more theano functions for getting policy and value
        logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
rtransform = T.roll(transform, -DISTANCE, axis=2)[:, :, DISTANCE:-DISTANCE]
routput = output[:, :, DISTANCE:-DISTANCE]

if args.continuous:
    # Continuous loss function uses the determinant of the covariance matrix for the signal and residual
    residual = rtransform - routput

    # For covariance, we want to subtract out the means...
    sigmean = rtransform.mean(axis=(0, 2), keepdims=True)
    epsmean = residual.mean(axis=(0, 2), keepdims=True)

    rtdelta = rtransform - sigmean
    epsdelta = residual - epsmean

    # Covariance matrices
    sig_cov = T.tensordot(rtdelta, rtdelta, axes=(
        (0, 2), (0, 2))) / (rtransform.shape[0] * rtransform.shape[2])
    eps_cov = T.tensordot(epsdelta, epsdelta, axes=(
        (0, 2), (0, 2))) / (residual.shape[0] * residual.shape[2])

    det_sig = TNL.Det()(sig_cov) + 1e-48
    det_eps = TNL.Det()(eps_cov) + 1e-48

    entropy = T.log(det_sig)
    info = T.log(det_eps)

    # First two terms give the entropy contrast, but we'd also like the predictions to be correct (as opposed to a constant offset), so we add a third term to encourage the mean residual to be zero.
    loss = info - entropy + 1e-2 * (epsmean**2).mean()
else:
    # Entropy term measures the entropy of the average transformed signal. We want to make this large
    entropy = -1 * (rtransform.mean(axis=(0, 2)) *
                    T.log(rtransform.mean(axis=(0, 2)) + 1e-6)).sum()
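# A hedged NumPy illustration of the covariance computation used in the continuous branch
# above (shapes assumed): for a (batch, channels, time) signal, contracting the batch and
# time axes with tensordot and dividing by batch*time gives a channels x channels covariance.
import numpy as np

sig = np.random.randn(4, 3, 100)                          # (batch, channels, time)
delta = sig - sig.mean(axis=(0, 2), keepdims=True)
cov = np.tensordot(delta, delta, axes=((0, 2), (0, 2))) / (sig.shape[0] * sig.shape[2])
print(cov.shape)                                          # (3, 3)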
Example #46
0
    def _setup_functions(self):

        # Actual parameter lengths.
        #sh_w_n = (self.n_state + self.n_actions + 1, self.n_state + 1, self.n_state)
        #print("sh_w_n", sh_w_n)
        sh_w_n = (self.n_actions + 1, self.n_state + 1, self.n_state)
        print("sh_w_n", sh_w_n)
        sh_w_t = (self.n_tex + 1, self.n_state + 1, self.n_ray)
        print("sh_w_t", sh_w_t)
        sh_l1 = (self.n_ray + self.n_key, self.n_interaction)
        print("sh_l1", sh_l1)
        sh_l2 = (self.n_interaction, 1)
        print("sh_l2", sh_l2)

        # Memory cells.
        sh_mk = (self.n_scene, self.n_key)
        sh_mc = (self.n_scene, 4)
        print("sh_mk", sh_mk)
        print("sh_mc", sh_mc)

        if not hasattr(self, "params"):
            print('generating weights')

            # (A+1)x(S+1)xS
            wn = uniform(sh_w_n, scale=0.2)
            # (P+1)x(S+1)xR
            wt = uniform(sh_w_t, scale=0.2)
            # (R+K)xH
            wl1 = uniform(sh_l1, scale=0.2)
            # H
            wb1 = shared0s((self.n_interaction, ))
            # Hx1
            wl2 = uniform(sh_l2, scale=0.2)
            # MxK
            wmk = uniform(sh_mk, scale=0.2)
            # MxC
            wmc = uniform(sh_mc, scale=0.2)

            self.params = [wn, wt, wl1, wb1, wl2, wmk, wmc]
        else:
            wn, wt, wl1, wb1, wl2, wmk, wmc = self.params

        #TxNxA
        A = T.tensor3()
        #TxNxP
        P = T.tensor3()
        #TxNxC
        y = T.tensor3()

        # Inputs: NxS, NxA
        def state_transform(a_, s_):
            # Nx(S+1)xS
            temp_ = T.tensordot(T.concatenate(
                [a_, T.ones((s_.shape[0], 1))], axis=1),
                                wn,
                                axes=[1, 0])
            # NxS
            return T.sum(
                temp_ * T.concatenate([s_, T.ones(
                    (s_.shape[0], 1))], axis=1).dimshuffle([0, 1, 'x']),
                axis=1)
            #return s_

        # TxNxS
        S, _ = theano.scan(fn=state_transform,
                           outputs_info=[T.zeros([A.shape[1], self.n_state])],
                           sequences=[A])

        # TxNx(S+1)xR
        temp_ = T.tensordot(T.concatenate(
            [P, T.ones([S.shape[0], S.shape[1], 1])], axis=2),
                            wt,
                            axes=[2, 0])

        # TxNxR Ray Elements.
        R = T.sum(temp_ *
                  T.concatenate([S, T.ones((S.shape[0], S.shape[1], 1))],
                                axis=2).dimshuffle([0, 1, 2, 'x']),
                  axis=2)

        # TxNxMx(R+K) Transformation input.
        R_2 = T.concatenate([
            T.tile(R.dimshuffle([0, 1, 'x', 2]), [1, 1, self.n_scene, 1]),
            T.tile(wmk.dimshuffle(['x', 'x', 0, 1]),
                   [R.shape[0], R.shape[1], 1, 1])
        ],
                            axis=3)

        # TxNxMxH
        L1 = sigmoid(
            T.tensordot(R_2, wl1, axes=[3, 0]) +
            wb1.dimshuffle(['x', 'x', 'x', 0]))
        # TxNxM Soft attention weights.
        Att_temp = T.exp(T.tensordot(L1, wl2, axes=[3, 0]).sum(axis=3))
        Att = Att_temp / (T.sum(Att_temp, axis=2, keepdims=True) + 0.01)
        #Att = sigmoid( T.tensordot(L1, wl2, axes=[3,0]).sum( axis=3 ) )

        # TxNxC final colors.
        Col = T.tensordot(Att, wmc, axes=[2, 0])

        rec_cost = T.sum(T.sqr(Col - y))  # / T.cast(X.shape[0], 'float32')
        cost = rec_cost

        print('getting updates')
        #updates = Adam([wt,wn,wmk,wl1,wb1,wl2,wmc], cost)
        updates = Adam(self.params, cost)

        print('compiling')
        self._fit_function = theano.function([A, P, y], cost, updates=updates)
        self._predict = theano.function([A, P], Col)
        self._next_state = theano.function([A], S)
        self._predict_attn = theano.function([A, P], Att)
        # Output just the cost to check with a test set.
        self._cost = theano.function([A, P, y], cost)
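# A hedged NumPy shape check of state_transform above (toy sizes assumed): the action with an
# appended bias term is contracted against the first axis of wn, and the result is summed
# against the state with an appended bias term, yielding the next NxS state.
import numpy as np

N, A_dim, S_dim = 2, 3, 5
wn = np.random.randn(A_dim + 1, S_dim + 1, S_dim)
a = np.random.randn(N, A_dim)
s = np.zeros((N, S_dim))

temp = np.tensordot(np.concatenate([a, np.ones((N, 1))], axis=1), wn, axes=[1, 0])       # (N, S+1, S)
s_next = (temp * np.concatenate([s, np.ones((N, 1))], axis=1)[:, :, None]).sum(axis=1)   # (N, S)
print(s_next.shape)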
Example #47
0
    def __init__(self,
                 vec_dim,
                 output_dim,
                 num_words,
                 mini_batch_size=30,
                 rho=1e-6):
        """
        :param vec_dim: Dimension of a single word vector.
        :param output_dim: Output dimension.
        :param num_words: Number of different words.
        :param mini_batch_size: Size of mini-batch.
        :param rho: L2 penalization coefficient in the cross-entropy error.
        """

        self.vec_dim = vec_dim
        self.output_dim = output_dim
        self.num_words = num_words
        self.mini_batch_size = mini_batch_size
        self.default_vec = lambda: np.zeros(self.vec_dim).astype(floatX)
        self.rho = rho

        # Embedding matrix L.
        # --------------------------
        # Size : (single-word dimension, number of words).
        # L is trained jointly with the comp. models.

        self.L = 0.01 * ran(self.vec_dim, self.num_words).astype(floatX)

        # Neural Tensor Layer weights.
        # --------------------------
        # V is the tensor that defines multiple bilinear forms.
        # W, b are classical-RNN weight and bias matrices.

        self.V = shared(0.01 * ran(self.vec_dim, 2 * self.vec_dim,
                                   2 * self.vec_dim).astype(floatX),
                        name='V',
                        borrow=True)
        self.W = shared(0.01 *
                        ran(self.vec_dim, 2 * self.vec_dim).astype(floatX),
                        name='W',
                        borrow=True)
        self.b = shared(np.zeros(self.vec_dim).astype(floatX),
                        name='b',
                        borrow=True)

        # Softmax weights.
        # --------------------------
        # W_s, b_s are the sentiment classification weight and bias matrices.

        self.W_s = shared(0.01 *
                          ran(self.output_dim, self.vec_dim).astype(floatX),
                          name='W_s',
                          borrow=True)
        self.b_s = shared(np.zeros(self.output_dim).astype(floatX),
                          name='b_s',
                          borrow=True)
        self.params = [self.V, self.W, self.b, self.W_s,
                       self.b_s]  # Only shared variables

        # Gradients.
        # --------------------------

        self.np_dV = np.empty(
            (self.vec_dim, 2 * self.vec_dim, 2 * self.vec_dim))
        self.np_dW = np.empty((self.vec_dim, 2 * self.vec_dim))
        self.np_db = np.empty(self.vec_dim)
        self.np_dW_s = np.empty((self.output_dim, self.vec_dim))
        self.np_db_s = np.empty(self.output_dim)

        self.dV = shared(self.np_dV.astype(floatX), name='dV', borrow=True)
        self.dW = shared(self.np_dW.astype(floatX), name='dW', borrow=True)
        self.db = shared(self.np_db.astype(floatX), name='db', borrow=True)
        self.dW_s = shared(self.np_dW_s.astype(floatX),
                           name='dW_s',
                           borrow=True)
        self.db_s = shared(self.np_db_s.astype(floatX),
                           name='db_s',
                           borrow=True)

        # As L is jointly trained with the above parameters, we need a "gradient" for L.
        # This comes in the form of a dictionary
        self.dL = collections.defaultdict(self.default_vec)

        # Theano variables for the computational graph.
        # --------------------------

        self.p_a = T.vector('Parent activation')
        self.lr = T.vector('Stacked activation')

        self.prob = T.vector('Probabilities')
        self.diff = T.vector('Distribution differences')
        self.node_error = T.vector('Soft-max node error')
        self.label = T.iscalar('Label')
        self.cost = T.scalar('Cost')
        self.rate = T.scalar('Learning rate')
        self.scale = T.scalar('Batch scale')

        prob = T.dot(self.W_s, self.p_a) + self.b_s
        prob -= T.max(prob)
        prob = T.exp(prob)
        prob /= T.sum(prob)

        outer = T.outer(self.node_error, self.lr)

        # Recombination
        # --------------------------
        # Returns parent activation via children activation.

        self.recombination = theano.function(
            [self.lr],
            T.tanh(
                T.dot(self.W, self.lr) + self.b + T.tensordot(
                    self.V, T.outer(self.lr, self.lr), axes=([1, 2], [0, 1]))),
            allow_input_downcast=True)

        # Probabilities
        # --------------------------
        # Returns posterior probabilities given parent activation.

        self.probabilities = theano.function([self.p_a],
                                             prob,
                                             allow_input_downcast=True)

        # Soft-max node error
        # --------------------------
        # Pre-computes softmax node error given distribution difference (target - real).
        # The Hadamard product is added afterwards

        updates_1 = collections.OrderedDict()
        updates_1[self.dW_s] = self.dW_s + T.outer(self.diff, self.p_a)
        updates_1[self.db_s] = self.db_s + self.diff

        self.softmax_node_error = theano.function([self.diff, self.p_a],
                                                  T.dot(self.W_s.T, self.diff),
                                                  updates=updates_1,
                                                  allow_input_downcast=True)

        # Penalization term
        # --------------------------
        # Add penalization term to the cost.

        self.add_penalization_term = theano.function(
            [self.cost],
            self.cost + (self.rho / 2) *
            (T.sum(self.V**2) + T.sum(self.W**2) + T.sum(self.W_s**2)),
            allow_input_downcast=True)

        # Prop error
        # --------------------------
        # Back-propagates error and updates gradients.

        updates_2 = collections.OrderedDict()
        updates_2[self.dV] = self.dV + (T.outer(self.lr, self.lr)[:, :, None] *
                                        self.node_error).T
        updates_2[self.dW] = self.dW + outer
        updates_2[self.db] = self.db + self.node_error

        self.prop_error = theano.function([self.node_error, self.lr],
                                          T.dot(self.W.T, self.node_error) +
                                          T.tensordot(self.V.transpose(
                                              (0, 2, 1)) + self.V,
                                                      outer.T,
                                                      axes=([1, 0], [0, 1])),
                                          updates=updates_2,
                                          allow_input_downcast=True)

        # Update params
        # --------------------------
        # Updates all weights & biases during gradient descent.

        updates_3 = collections.OrderedDict()
        updates_3[self.V] = self.V - self.rate * self.scale * (
            self.dV + self.rho * self.V)
        updates_3[self.W] = self.W - self.rate * self.scale * (
            self.dW + self.rho * self.W)
        updates_3[self.b] = self.b - self.rate * self.scale * self.db
        updates_3[self.W_s] = self.W_s - self.rate * self.scale * (
            self.dW_s + self.rho * self.W_s)
        updates_3[self.b_s] = self.b_s - self.rate * self.scale * self.db_s

        self.update_params = theano.function([self.scale, self.rate],
                                             self.scale,
                                             updates=updates_3,
                                             allow_input_downcast=True)
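# A hedged NumPy sketch of the recombination step above (toy dimensions assumed): the stacked
# children vector lr = [left; right] is passed through the bilinear tensor V plus the usual
# affine term, which is what self.recombination compiles.
import numpy as np

d = 4
V = 0.01 * np.random.randn(d, 2 * d, 2 * d)
W = 0.01 * np.random.randn(d, 2 * d)
b = np.zeros(d)
lr = np.random.randn(2 * d)

parent = np.tanh(W.dot(lr) + b +
                 np.tensordot(V, np.outer(lr, lr), axes=([1, 2], [0, 1])))   # (d,)
print(parent.shape)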
Example #48
0
def gram_matrix(mat):
    mat = mat.flatten(ndim=3)
    g = T.tensordot(mat, mat, axes=([2], [2]))
    return g
def gram_matrix(x):
    x = x.flatten(ndim=3)
    g = T.tensordot(x, x, axes=([2], [2]))
    return g
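# A small hedged usage sketch for gram_matrix (shapes assumed): flattening a 4-D feature map
# to (batch, channels, height*width) and contracting the spatial axis of the tensor with
# itself gives channel-by-channel correlations.
import numpy as np
import theano
import theano.tensor as T

feats = T.tensor4('feats')                 # (batch, channels, height, width)
g = gram_matrix(feats)
f = theano.function([feats], g)
out = f(np.random.rand(1, 3, 4, 4).astype(theano.config.floatX))
print(out.shape)                           # (1, 3, 1, 3)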
def h_softmax(x,
              batch_size,
              n_outputs,
              n_classes,
              n_outputs_per_class,
              W1,
              b1,
              W2,
              b2,
              target=None):
    """ Two-level hierarchical softmax.

    The architecture is composed of two softmax layers: the first predicts the
    class of the input x while the second predicts the output of the input x in
    the predicted class.
    More explanations can be found in the original paper [1]_.

    If target is specified, it will only compute the outputs of the
    corresponding targets. Otherwise, if target is None, it will compute all
    the outputs.

    The outputs are grouped in the same order as they are initially defined.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x: tensor of shape (batch_size, number of features)
        the minibatch input of the two-layer hierarchical softmax.
    batch_size: int
        the size of the minibatch input x.
    n_outputs: int
        the number of outputs.
    n_classes: int
        the number of classes of the two-layer hierarchical softmax. It
        corresponds to the number of outputs of the first softmax. See note at
        the end.
    n_outputs_per_class: int
        the number of outputs per class. See note at the end.
    W1: tensor of shape (number of features of the input x, n_classes)
        the weight matrix of the first softmax, which maps the input x to the
        probabilities of the classes.
    b1: tensor of shape (n_classes,)
        the bias vector of the first softmax layer.
    W2: tensor of shape (n_classes, number of features of the input x, n_outputs_per_class)
        the weight matrix of the second softmax, which maps the input x to
        the probabilities of the outputs.
    b2: tensor of shape (n_classes, n_outputs_per_class)
        the bias vector of the second softmax layer.
    target: tensor of shape either (batch_size,) or (batch_size, 1)
        (optional, default None)
        contains the indices of the targets for the minibatch
        input x. For each input, the function computes the output for its
        corresponding target. If target is None, then all the outputs are
        computed for each input.

    Returns
    -------
    output_probs: tensor of shape (batch_size, n_outputs) or (batch_size, 1)
        Output of the two-layer hierarchical softmax for input x. If target is
        not specified (None), then all the outputs are computed and the
        returned tensor has shape (batch_size, n_outputs). Otherwise, when
        target is specified, only the corresponding outputs are computed and
        the returned tensor has thus shape (batch_size, 1).

    Notes
    -----
    The product of n_outputs_per_class and n_classes has to be greater or equal
    to n_outputs. If it is strictly greater, then the irrelevant outputs will
    be ignored.
    n_outputs_per_class and n_classes have to be the same as the corresponding
    dimensions of the tensors of W1, b1, W2 and b2.
    The most computationally efficient configuration is when n_outputs_per_class
    and n_classes are equal to the square root of n_outputs.

    References
    ----------
    .. [1] J. Goodman, "Classes for Fast Maximum Entropy Training,"
        ICASSP, 2001, http://arxiv.org/abs/cs/0108006.
    """

    # First softmax that computes the probabilities of belonging to each class
    class_probs = theano.tensor.nnet.softmax(T.dot(x, W1) + b1)

    if target is None:  # Computes the probabilities of all the outputs

        # Second softmax that computes the output probabilities
        activations = T.tensordot(x, W2, (1, 1)) + b2
        output_probs = theano.tensor.nnet.softmax(
            activations.reshape((-1, n_outputs_per_class)))
        output_probs = output_probs.reshape((batch_size, n_classes, -1))
        output_probs = class_probs.dimshuffle(0, 1, 'x') * output_probs
        output_probs = output_probs.reshape((batch_size, -1))
        # output_probs.shape[1] is n_classes * n_outputs_per_class, which might
        # be greater than n_outputs, so we ignore the potential irrelevant
        # outputs with the next line:
        output_probs = output_probs[:, :n_outputs]

    else:  # Computes the probabilities of the outputs specified by the targets

        target = target.flatten()

        # Classes to which belong each target
        target_classes = target // n_outputs_per_class

        # Outputs to which belong each target inside a class
        target_outputs_in_class = target % n_outputs_per_class

        # Second softmax that computes the output probabilities
        activations = sparse_block_dot(W2.dimshuffle('x', 0, 1, 2),
                                       x.dimshuffle(0, 'x', 1),
                                       T.zeros((batch_size, 1), dtype='int32'),
                                       b2, target_classes.dimshuffle(0, 'x'))

        output_probs = theano.tensor.nnet.softmax(activations.dimshuffle(0, 2))
        target_class_probs = class_probs[T.arange(batch_size), target_classes]
        output_probs = output_probs[T.arange(batch_size),
                                    target_outputs_in_class]
        output_probs = target_class_probs * output_probs

    return output_probs
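# A hedged usage sketch for the full-output branch of h_softmax (target=None); all sizes and
# the random weights below are illustrative assumptions, not from the original source.
import numpy as np
import theano
import theano.tensor as T

n_features, n_classes, n_outputs_per_class = 20, 8, 8
n_outputs, batch_size = 64, 8
floatX = theano.config.floatX

x = T.matrix('x')
W1 = theano.shared(np.random.randn(n_features, n_classes).astype(floatX))
b1 = theano.shared(np.zeros(n_classes, dtype=floatX))
W2 = theano.shared(np.random.randn(n_classes, n_features, n_outputs_per_class).astype(floatX))
b2 = theano.shared(np.zeros((n_classes, n_outputs_per_class), dtype=floatX))

probs = h_softmax(x, batch_size, n_outputs, n_classes,
                  n_outputs_per_class, W1, b1, W2, b2)           # target=None: all outputs
f = theano.function([x], probs)
out = f(np.random.randn(batch_size, n_features).astype(floatX))  # shape (8, 64), rows sum to 1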
Example #51
0
 def met(q, p):
     return T.tensordot(nu(q),
                        T.tensordot(p, p, [[1], [1]]).diagonal(),
                        [[0], [0]])
    def _step_2(m_, x_, r_, h_, c_, w_, y_, c2_):
        # Concat x_, h_ and w_ to get Nx(X+W+H) matrix
        ip_mat = tensor.concatenate([x_, w_, h_], axis=1 )

        # Compute forget gate values
        # f : NxH matrix
        f = tensor.nnet.sigmoid(
            tensor.tensordot(ip_mat, tparams['weight'][0], axes=[1, 1]) + tparams['bias'][0, :][None, :])
        #f = tensor.nnet.sigmoid(tensor.dot(tparams['weight'][0, :, :], ip_mat) + tparams['bias'][0, :][:, None])

        # Compute input gate values
        # i : NxH matrix
        i = tensor.nnet.sigmoid(tensor.tensordot(ip_mat, tparams['weight'][1], axes=[1,1]) + tparams['bias'][1, :][None, :])
        #i = tensor.nnet.sigmoid(tensor.dot(tparams['weight'][1, :, :], ip_mat) + tparams['bias'][1, :][:, None])

        #c_new : NxH matrix
        c_new = tensor.tanh(tensor.tensordot(ip_mat, tparams['weight'][2], axes=[1,1]) + tparams['bias'][2, :][None, :])
        #c_new = tensor.tanh(tensor.dot(tparams['weight'][2, :, :], ip_mat) + tparams['bias'][2, :][:, None])

        # Compute new memory
        # c : NxH
        c = i * c_new + f * c_
        # Retain based on mask
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        # Compute new hidden state
        # h : NxH
        h = tensor.nnet.sigmoid(
            tensor.tensordot(ip_mat, tparams['weight'][3], axes=[1,1]) + tparams['bias'][3, :][None, :]) * tensor.tanh(c)
        #h = tensor.nnet.sigmoid(
        #    tensor.dot(tparams['weight'][3, :, :], ip_mat) + tparams['bias'][3, :][:, None]) * tensor.tanh(c)
        # Retain based on mask
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        # Predict next vector here.
        # U = OxH.
        # B = O.
        context = tensor.tensordot( h, tparams['U'], axes=[1,1] ) + tparams['b'][None, :]

        y_old = tensor.tensordot( h, tparams['U_context'], axes=[1,1] ) + tparams['b_context'][None, :]
        #y_old = tensor.nnet.softmax(y_old)

        # pred = NxO
        #pred = tensor.nnet.softmax( proj );
        # Nx(M+1)
        #context = tensor.nnet.softmax(context)

        #temp: NxW
        y = tensor.nnet.softmax( ( tensor.sum(context[:, :-1, None ] * memory, axis=1) + context[:, -1][:, None] * y_old ) / options['sample_temperature'] )
        #temp = tensor.sum(temp)

        # ArgMax?
        # pred[ T.arange(pred.shape[0])[:,None], T.arange(pred.shape[1])[None,:], pred.argmax( axis=2 ) ] = 1.;
        # Or Sample from last axis?
        # TxNxO Last dimension one-hot sampled.
        #w = trng2.multinomial( pvals=pred );

        # N
        w_nums = ( tensor.switch( tensor.gt( r_, tensor.extra_ops.cumsum( y, axis=1 ) ), 1, 0 ) ).sum( axis=1 );
        #pred[ tensor.arange(pred.shape[0])[:,None], tensor.arange(pred.shape[1])[None,:], w_nums ] = 1.;
        # NxW
        w = tensor.extra_ops.to_one_hot( w_nums, options['ydim'], dtype=config.floatX)
        return h, c, w.astype(config.floatX), y, context
 def get_output_for(self, inputs, **kwargs):
     x, y = inputs[0], inputs[1]
     xfactor = T.tensordot(x, self.Wf, axes=(2, 1)).dimshuffle(0, 2, 1, 3)
     yfactor = T.tensordot(y, self.Wf, axes=(2, 1)).dimshuffle(0, 2, 1, 3)
     return xfactor * yfactor
 def apply(self, input_):
     W, b = self.parameters
     output = T.tensordot(input_, W, axes=[[1], [0]]) + b
     return output
Example #55
0
    def __init__(self,
                 E,
                 n_users,
                 lrate=0.0001,
                 margin_loss=1,
                 rng=None,
                 init_w2v=False):

        # Generate random seed if not provided
        if rng is None:
            rng = np.random.RandomState(1234)
        #parameters
        if init_w2v == "gauss":
            U = init_w2v_gauss(rng, n_users, E)
        elif init_w2v == "mean":
            U = init_w2v_mean(rng, E, n_users)
        else:
            U = init_weight(rng, (E.shape[0], n_users))
        U = theano.shared(U.astype(theano.config.floatX), borrow=True)
        E = theano.shared(E.astype(theano.config.floatX), borrow=True)

        self.params = [U]
        self.margin_loss = margin_loss
        self.lrate = lrate
        #input
        usr_idx = T.iscalar('usr')
        sent_idx = T.ivector('sent')
        neg_samp_idx = T.imatrix('neg_sample')
        # word_probs   = T.fvector('word_probs')
        #word_probs   = T.fscalar('word_probs')
        curr_lrate = T.fscalar('lrate')
        #embedding lookup
        usr = U[:, usr_idx]
        sent = E[:, sent_idx]
        neg_samples = E[:, neg_samp_idx]
        #loss
        # objectives, _ = theano.scan(fn=self.rank_loss,
        #                             outputs_info=None,
        #                             sequences=[sent_idx,neg_samp_idx],
        #                             non_sequences=[usr,E,U])
        pos_score = T.dot(usr, sent)
        neg_score = T.tensordot(usr, neg_samples, axes=(0, 0))
        loss = T.maximum(0, self.margin_loss - pos_score[:, None] + neg_score)
        # final_loss = loss.sum(axis=None) + word_probs.sum()
        final_loss = loss.sum(axis=None)
        #Gradient wrt to user embeddings
        usr_grad = T.grad(final_loss, usr)
        #Sparse update
        upd_usr = T.set_subtensor(usr, usr - curr_lrate * usr_grad)
        updates = ((U, upd_usr), )
        # self.dbg = theano.function(inputs=[usr_idx, sent_idx, neg_samp_idx],
        #                              outputs=[usr,sent,neg_samples],
        #                              mode="FAST_COMPILE")
        self.dbg = theano.function(inputs=[usr_idx, sent_idx, neg_samp_idx],
                                   outputs=[usr, sent, neg_samples],
                                   allow_input_downcast=True)

        self.train = theano.function(
            inputs=[usr_idx, sent_idx, neg_samp_idx, curr_lrate],
            outputs=final_loss,
            updates=updates,
            mode="FAST_RUN",
            allow_input_downcast=True)
        #\propto P(message|usr)
        # scores_m = T.exp(T.dot(U.T,E[:,sent_idx]))
        scores_m = T.dot(U.T, E[:, sent_idx])
        prob = T.nnet.softmax(scores_m.T).T
        log_prob = T.log(prob).sum(axis=1)
        #sum the scores for all the words
        # scores_m = scores_m.sum(axis=1)
        # user_score = scores_m[usr_idx]
        user_score = log_prob[usr_idx]
        self.predict = theano.function(inputs=[usr_idx, sent_idx],
                                       outputs=[user_score, prob],
                                       allow_input_downcast=True)
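# A hedged NumPy check of the hinge ranking loss above (toy sizes assumed): usr is a user
# embedding, sent stacks word embeddings column-wise, and neg_samples holds negative samples
# per word.
import numpy as np

E_dim, n_words, n_neg = 5, 3, 4
usr = np.random.randn(E_dim)
sent = np.random.randn(E_dim, n_words)
neg_samples = np.random.randn(E_dim, n_words, n_neg)

pos_score = usr.dot(sent)                                   # (n_words,)
neg_score = np.tensordot(usr, neg_samples, axes=(0, 0))     # (n_words, n_neg)
loss = np.maximum(0, 1.0 - pos_score[:, None] + neg_score).sum()
print(loss)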
Example #56
0
 def dot(self, vec, mat):
   if self.depth == 1:
     return T.dot(vec, mat)
   else:
     return T.tensordot(vec, mat, 1)
Example #57
0
    if mean:
        return out.mean()

    else:
        return out

t_C = T.matrix("cov","float32")
DDS_var = T.nlinalg.det(t_C)*T.nlinalg.MatrixInverse()(t_C)
DDS = theano.function([t_C],DDS_var,allow_input_downcast = True)

def np_DDS(C):
    return np.linalg.det(C)*np.linalg.inv(C)

t_x = T.matrix("vec_in","float32")
XDX_var = (t_x * T.tensordot(t_x,T.nlinalg.MatrixInverse()(t_C),axes = [1,0])).sum(axis = 1)
XDX = theano.function([t_C,t_x],XDX_var,allow_input_downcast = True)

def np_XDX(C,x):
    return (x*np.tensordot(x,np.linalg.inv(C),axes = [1,0])).sum(axis = 1)
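# A hedged sanity-check sketch (assumed shapes): the compiled XDX and the NumPy np_XDX should
# agree on the per-row quadratic form x_i^T C^-1 x_i, up to float32 precision.
C = np.cov(np.random.rand(5, 50))                            # 5x5 covariance matrix
x = np.random.rand(10, 5)                                    # 10 vectors of dimension 5
print(np.allclose(XDX(C, x), np_XDX(C, x), rtol=1e-3))       # expect True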

def att_LAM(C,Q,F,x):
    first_term = (x[:,0]*np.tensordot(x[:,0],np.linalg.inv(C),axes = [1,0])).sum(axis = 1)

    Tn = x[:,1:]
    Tnp1 = np.tensordot(x[:,:-1,:],F,axes = [2,1])

    dif = Tn - Tnp1
    
    other_terms = dif*np.tensordot(dif,np.linalg.inv(Q),axes = [2,0])
    other_terms = (other_terms).sum(axis = (1,2))
Example #58
0
sampler = sampling.AudioFileSampler.load(path+"/sampler.p")
#sampler = sampling.AudioFileSampler(["Zece/audio"+str(i+1).zfill(2)+".wav" for i in range(23)], sample_size)
#sampler = sampling.AudioFileFreqSampler("but_one_day.wav", sample_size, 128, 20)
#sampler = sampling.SinusSampler(sample_size)

'''import pickle
with open(path+"/gaussian_process.p", "rb") as f:
	pick = pickle.Unpickler(f)
	sampler = sampling.GaussianProcess(sample_size, pick.load(), 1.0)
'''
######################

x = T.dtensor3('x')
batch_size = x.shape[0]

z = T.tensordot(x, encode, ([1, 2], [0, 1])) + encode_bias
#encoder(x)

xx = generator(T.reshape(z, [-1, 1, generator.gen_dim]))

mean_enc = z.mean()
var_enc = T.sqr(z - mean_enc).mean()


cost_enc = -(xx - x).norm(2, axis=[1, 2]).mean() / (sample_size * data_dim)
#cost_enc = -T.sqr(xx - x).mean(axis=[0, 1, 2])
#cost_enc += -(mean_enc).norm(2)*0.01 - (T.log(var_enc)).norm(2)*0.001
#cost_enc += -0.01 * generator.normL1()

cost_enc *= 100
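# Hedged shape note (sizes assumed): with x of shape (batch, sample_size, data_dim) and an
# `encode` weight of shape (sample_size, data_dim, gen_dim), the tensordot above contracts the
# last two axes of x against the first two of `encode`, so z has shape (batch, gen_dim).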
        def step(target_token_id, hidden_state, conv_out,
                 gru_prediction_to_reset, gru_prediction_to_hidden,
                 gru_prediction_to_update, gru_prev_hidden_to_reset,
                 gru_prev_hidden_to_next, gru_prev_hidden_to_update,
                 gru_hidden_update_bias, gru_update_bias, gru_reset_bias,
                 conv_weights_code_l3, conv_layer3_bias, code_embeddings,
                 all_name_reps, use_prev_stat):

            gated_l2 = conv_out * T.switch(hidden_state > 0, hidden_state,
                                           0.01 * hidden_state).dimshuffle(
                                               0, 1, 'x', 'x')
            gated_l2 = gated_l2 / gated_l2.norm(2)

            code_convolved_l3 = T.nnet.conv2d(
                gated_l2,
                conv_weights_code_l3,
                image_shape=(1, self.hyperparameters["conv_layer2_nfilters"],
                             None, 1),
                filter_shape=self.conv_layer3_code.get_value().shape)[:, 0, :,
                                                                      0]

            l3_out = code_convolved_l3 + conv_layer3_bias
            code_toks_weights = T.nnet.softmax(
                l3_out
            )  # This should be one dimension (the size of the sentence)

            # the first/last tokens are padding
            padding_size = T.constant(
                self.hyperparameters["layer1_window_size"] +
                self.hyperparameters["layer2_window_size"] +
                self.hyperparameters["layer3_window_size"] - 3)

            predicted_embedding = T.tensordot(
                code_toks_weights,
                code_embeddings[padding_size / 2 + 1:-padding_size / 2 + 1],
                [[1], [0]])[0]

            # Get the next hidden!
            if do_dropout:
                # For regularization, we can use the context embeddings *some* of the time
                embedding_used = T.switch(use_prev_stat,
                                          all_name_reps[target_token_id],
                                          predicted_embedding)
            else:
                embedding_used = all_name_reps[target_token_id]

            reset_gate = T.nnet.sigmoid(
                T.dot(embedding_used, gru_prediction_to_reset) +
                T.dot(hidden_state, gru_prev_hidden_to_reset) + gru_reset_bias)
            update_gate = T.nnet.sigmoid(
                T.dot(embedding_used, gru_prediction_to_update) +
                T.dot(hidden_state, gru_prev_hidden_to_update) +
                gru_update_bias)
            hidden_update = T.tanh(
                T.dot(embedding_used, gru_prediction_to_hidden) +
                reset_gate * T.dot(hidden_state, gru_prev_hidden_to_next) +
                gru_hidden_update_bias)

            next_hidden = (
                1. - update_gate) * hidden_state + update_gate * hidden_update
            return next_hidden, predicted_embedding, code_toks_weights
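# A hedged NumPy sketch of the attention pooling step above (toy sizes, padding offsets
# ignored): the softmax weights over code tokens are contracted against the token embeddings
# to produce a single predicted embedding.
import numpy as np

n_tokens, emb_dim = 7, 16
weights = np.random.rand(1, n_tokens)
weights /= weights.sum()                                        # (1, n_tokens) attention weights
code_embeddings = np.random.randn(n_tokens, emb_dim)

predicted_embedding = np.tensordot(weights, code_embeddings, axes=([1], [0]))[0]   # (emb_dim,)
print(predicted_embedding.shape)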
Example #60
0
def main(data):
    # optimizer
    opt = Optimizers()

    # sampler
    theano_rng = RandomStreams(999)

    # import dataset
    n_samples = data.attrs['n_rows']
    lr = 1e-3
    batch_size = 128

    x_data = [
        data['purpose'], data['avg_speed'], data['duration'], data['trip_km'],
        data['n_coord'], data['interval'], data['dow'], data['startdistrict'],
        data['enddistrict']
    ]

    y_data = [data['mode']]

    params = OrderedDict()
    params_shp = OrderedDict()

    output = []
    input = []
    asc_params = []
    asc_params_m = []
    beta_params_f = []
    beta_params_s = []
    beta_params_sf = []
    beta_params = []
    beta_params_m = []

    for var in y_data:
        name = 'asc_' + var.name.strip('/')
        asc_shp = var['data'][:].squeeze().shape[1:]
        print('y', name, asc_shp)

        output.append(init_tensor((), name))

        mask = np.ones(asc_shp, DTYPE_FLOATX)
        mask[-1] = 0.
        asc_value = np.zeros(asc_shp, DTYPE_FLOATX) * mask

        asc_params.append(shared(asc_value, name))
        asc_params_m.append(shared(mask, name + '_mask'))

        params[name] = asc_params[-1]
        params_shp[name] = asc_shp

    for var in x_data:
        name = 'beta_' + var.name.strip('/')
        shp = var['data'].shape[1:] + asc_shp
        print('x', name, shp)

        input.append(init_tensor(var['data'].shape[1:], name))

        mask = np.ones(shp, DTYPE_FLOATX)
        mask[..., -1] = 0.
        mask = mask.flatten()
        beta_value = np.zeros(np.prod(shp), DTYPE_FLOATX) * mask
        sigma_value = np.ones(np.prod(shp), DTYPE_FLOATX) * mask

        beta_params_f.append(shared(beta_value, name))
        beta_params_sf.append(shared(sigma_value, name + '_sigma'))

        beta_params.append(T.reshape(beta_params_f[-1], shp))
        beta_params_s.append(T.reshape(beta_params_sf[-1], shp))
        beta_params_m.append(shared(mask, name + '_mask'))

        params[name] = beta_params_f[-1]
        params[name + '_sigma'] = beta_params_sf[-1]
        params_shp[name] = shp
        params_shp[name + '_sigma'] = shp

    # compute the utility function
    utility = 0.
    h_utility = 0.
    for x, b, s in zip(input, beta_params, beta_params_s):

        normal_sample = b[..., None] + T.sqr(s)[..., None] * theano_rng.normal(
            size=b.eval().shape + (1, ), avg=0., std=1., dtype=DTYPE_FLOATX)

        ax = [np.arange(x.ndim)[1:], np.arange(b.ndim)[:-1]]
        utility += T.tensordot(x, normal_sample, axes=ax)
        if x.ndim > 2:
            h_utility += T.tensordot(x, b + T.sqr(s), axes=[[1, 2], [0, 1]])
        else:
            h_utility += T.tensordot(x, b + T.sqr(s), axes=[[1], [0]])
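
    # Shape sketch (hedged, assumed sizes): for a 2-D input x of shape (N, K) and betas of
    # shape (K, J), normal_sample has shape (K, J, R) and the tensordot above gives a utility
    # of shape (N, J, R): one value per observation, alternative and Monte-Carlo draw.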

    for y, asc in zip(output, asc_params):
        utility += asc[None, ..., None]
        h_utility += asc
        (d1, d2, d3) = utility.shape
        utility = utility.reshape((d1 * d3, d2))
        p_y_given_x = T.nnet.softmax(utility)

        hessian_prob = T.nnet.softmax(h_utility)  #!
        hessian_nll = T.log(hessian_prob)
        hessian_cr = hessian_nll[T.arange(y.shape[0]), y]
        hessian_cost = -T.sum(hessian_cr)

        nll = T.log(p_y_given_x).reshape((d3, d1, d2))
        nll = nll[:, T.arange(y.shape[0]), y]
        cost = -T.sum(T.mean(nll, axis=0))

    gparams = asc_params + beta_params_f + beta_params_sf
    grads = T.grad(cost, gparams)

    # mask gradient updates
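    # note: the sigma parameters reuse the same masks as the betas, hence beta_params_m appears twice below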
    mask = asc_params_m + beta_params_m + beta_params_m
    for j, g in enumerate(grads):
        grads[j] = g * mask[j]

    # create list of updates to iterate over
    updates = opt.sgd_updates(gparams, grads, lr)

    # symbolic equation for the Hessian function
    stderrs = []
    hessian = T.hessian(cost=hessian_cost, wrt=gparams)
    stderr = [T.sqrt(f) for f in [T.diag(2. / h) for h in hessian]]
    stderrs.extend(stderr)

    tensors = input + output
    shared_x = [shared(var['data'][:], borrow=True) for var in x_data]
    shared_y = [T.cast(shared(var['label'][:]), 'int32') for var in y_data]
    shared_variables = shared_x + shared_y

    i = T.lscalar('index')
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size

    print('constructing Theano computational graph...')

    train = theano.function(
        inputs=[i],
        outputs=cost,
        updates=updates,
        givens={
            key: val[start_idx:end_idx]
            for key, val in zip(tensors, shared_variables)
        },
        name='train',
        allow_input_downcast=True,
    )

    std_err = theano.function(
        inputs=[],
        outputs=stderrs,
        givens={key: val[:]
                for key, val in zip(tensors, shared_variables)},
        name='std errors',
        allow_input_downcast=True,
    )

    # train model
    print('training the model...')
    curves = []
    n_batches = n_samples // batch_size
    epochs = 100
    epoch = 0
    t0 = time.time()
    while epoch < epochs:
        epoch += 1
        cost = []
        for i in range(n_batches):
            cost_items = train(i)
            cost.append(cost_items)

        epoch_cost = np.sum(cost)
        curves.append((epoch, epoch_cost))
        minutes, seconds = divmod(time.time() - t0, 60.)
        hours, minutes = divmod(minutes, 60.)
        print(("epoch {0:d} loglikelihood "
               "{1:.3f} time {hh:02d}:{mm:02d}:{ss:05.2f}").format(
                   epoch,
                   epoch_cost,
                   hh=int(hours),
                   mm=int(minutes),
                   ss=seconds))

        if (epoch % 5) == 0:
            print('checkpoint')
            param_values = {}
            for name, param in params.items():
                param_shp = params_shp[name]
                param_values[name] = param.eval().reshape(param_shp)
                np.savetxt('params/{}.csv'.format(name),
                           param_values[name].squeeze(),
                           fmt='%.3f',
                           delimiter=',')

            to_file = param_values, curves
            path = 'params/epoch_{0:d}.params'.format(epoch)
            with open(path, 'wb') as f:
                pickle.dump(to_file, f, protocol=pickle.HIGHEST_PROTOCOL)

    # save parameters and stderrs to .csv
    stderrs = std_err()
    params_list = [p for p in asc_params + beta_params_f + beta_params_sf]
    param_names = [p.name for p in asc_params + beta_params_f + beta_params_sf]
    for se, param, name in zip(stderrs, params_list, param_names):
        v = param.eval().squeeze()
        shp = v.shape
        path = 'params/stderrs_{}.csv'.format(name)
        np.savetxt(path, se.reshape(shp), fmt='%.3f', delimiter=',')
        path = 'params/tstat_{}.csv'.format(name)
        np.savetxt(path, v / se.reshape(shp), fmt='%.3f', delimiter=',')