Example #1
def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
    """
    This is a basic test for GpuSoftmaxWithBias with float32/float64 variables.

    We check that we loop when there are too many blocks.

    TODO: check that we loop when there are too many threads. (THIS IS
    NOT IMPLEMENTED)
    """
    assert dtypeInput in ['float32', 'float64']
    assert dtypeBias in ['float32', 'float64']

    if dtypeInput == 'float32':
        x = T.fmatrix('x')
    elif dtypeInput == 'float64':
        x = T.dmatrix('x')

    # We can't use zeros_like(x[0, ::]) as that doesn't allow testing with
    # a 0 shape
    if dtypeBias == 'float32':
        z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
                                                 dtype='float32')[::2])
    elif dtypeBias == 'float64':
        z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
                                                 dtype='float64')[::2])


    f = theano.function([x], z, mode=mode_without_gpu)
    f_gpu = theano.function([x], z, mode=mode_with_gpu)
    assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax_with_bias
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
                      theano.sandbox.gpuarray.nnet.GpuSoftmaxWithBias)

    def cmp(n, m):
        #print "test_softmax",n,m
        if dtypeInput == 'float32':
            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        elif dtypeInput == 'float64':
            data = numpy.arange(n * m, dtype='float64').reshape(n, m)

        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)

    cmp(2, 5)
    #we need to test n>32*1024 to check that we make the block loop.
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(0, 10)
    cmp(784, 784)
    cmp(4, 1000)
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
    # The GTX285 doesn't have enough shared memory for this case.
    cmp(4, 4074)
    # The GTX580, 680 and kepler don't have enough shared memory.
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
Example #2
def dtw(array1, array2):
    """
    Accepts: two one dimensional arrays
    Returns: (float) DTW distance between them.
    """
    s = np.zeros((array1.size+1, array2.size+1))

    s[:,0] = 1e6
    s[0,:] = 1e6
    s[0,0] = 0.0

    # Set up symbolic variables
    square = T.dmatrix('square')
    vec1 = T.dvector('vec1')
    vec2 = T.dvector('vec2')
    vec1_length = T.dscalar('vec1_length')
    vec2_length = T.dscalar('vec2_length')
    outer_loop = T.arange(vec1_length, dtype='int64')
    inner_loop = T.arange(vec2_length, dtype='int64')

    # Run the outer loop
    path, _ = scan(fn=outer,
                    outputs_info=[dict(initial=square, taps=[-1])],
                    non_sequences=[inner_loop, vec1, vec2],
                    sequences=outer_loop)

    # Compile the function
    theano_square = function([vec1, vec2, square, vec1_length, vec2_length], path, on_unused_input='warn')

    # Call the compiled function and return the actual distance
    return theano_square(array1, array2, s, array1.size, array2.size)[-1][array1.size, array2.size]
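For reference, here is the same dynamic-programming recurrence in plain numpy; the absolute-difference local cost and the min-of-three update are assumptions about the unshown `outer` step, so treat this as a sketch rather than the author's implementation:

import numpy as np

def dtw_numpy(array1, array2):
    # Reference DTW distance with the same boundary setup as the Theano version:
    # a (len1+1, len2+1) cost matrix with large values on the first row/column.
    s = np.zeros((array1.size + 1, array2.size + 1))
    s[:, 0] = 1e6
    s[0, :] = 1e6
    s[0, 0] = 0.0
    for i in range(1, array1.size + 1):
        for j in range(1, array2.size + 1):
            cost = abs(array1[i - 1] - array2[j - 1])
            s[i, j] = cost + min(s[i - 1, j], s[i, j - 1], s[i - 1, j - 1])
    return s[array1.size, array2.size]

print(dtw_numpy(np.array([0., 1., 2.]), np.array([0., 1., 1., 2.])))  # 0.0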
Example #3
    def _compile_bp(self):
        '''
        Compile backpropagation for each of the DQNs.
        '''
        self.bprop_by_goal = {}
        for (goal, dqn) in self.dqn_by_goal.items():
            states = dqn.states
            action_values = dqn.action_values
            params = dqn.params
            targets = T.vector('target')
            shared_values = T.vector('shared_values')
            last_actions = T.lvector('action')

            # loss function.
            mse = layers.MSE(action_values[T.arange(action_values.shape[0]),
                                last_actions], targets) \
                    + T.mean(abs(action_values[T.arange(action_values.shape[0]),
                                    last_actions] - shared_values))
            # l2 penalty.
            l2_penalty = 0.
            for param in params:
                l2_penalty += (param ** 2).sum()

            cost = mse + self.l2_reg * l2_penalty

            # back propagation.
            updates = optimizers.Adam(cost, params, alpha=self.lr)

            td_errors = T.sqrt(mse)
            self.bprop_by_goal[goal] = theano.function(inputs=[states, last_actions, targets, shared_values],
                                        outputs=td_errors, updates=updates)
Example #4
def uniq_with_lengths(seq, time_mask):
  """
  :param seq: (time,batch) -> label
  :param time_mask: (time,batch) -> 0 or 1
  :return: out_seqs, seq_lens.
  out_seqs is (max_seq_len,batch) -> label, where max_seq_len <= time.
  seq_lens is (batch,) -> len.
  """
  num_batches = seq.shape[1]
  diffs = T.ones_like(seq)
  diffs = T.set_subtensor(diffs[1:], seq[1:] - seq[:-1])
  time_range = T.arange(seq.shape[0]).dimshuffle([0] + ['x'] * (seq.ndim - 1))
  idx = T.switch(T.neq(diffs, 0) * time_mask, time_range, -1)  # (time,batch) -> idx or -1
  seq_lens = T.sum(T.ge(idx, 0), axis=0)  # (batch,) -> len
  max_seq_len = T.max(seq_lens)

  # I don't know any better way without scan.
  # http://stackoverflow.com/questions/31379971/uniq-for-2d-theano-tensor
  def step(batch_idx, out_seq_b1):
    #out_seq = seq[T.ge(idx[:, batch_idx], 0).nonzero(), batch_idx][0]
    out_seq = seq[:, batch_idx][T.ge(idx[:, batch_idx], 0).nonzero()]
    return T.concatenate((out_seq, T.zeros((max_seq_len - out_seq.shape[0],), dtype=seq.dtype)))

  out_seqs, _ = theano.scan(
    step,
    sequences=[T.arange(num_batches)],
    outputs_info=[T.zeros((max_seq_len,), dtype=seq.dtype)]
  )
  # out_seqs is (batch,max_seq_len)
  return out_seqs.T, seq_lens
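A plain-numpy sketch of what the graph above computes, assuming "uniq" means collapsing consecutive repeats within the masked region (which is what the diff/switch logic does); shapes and values below are illustrative:

import numpy as np

def uniq_with_lengths_numpy(seq, time_mask):
    # seq, time_mask: (time, batch); collapse consecutive duplicates per column.
    out, lens = [], []
    for b in range(seq.shape[1]):
        labels = seq[:, b][time_mask[:, b] > 0]
        keep = np.ones(labels.shape[0], dtype=bool)
        keep[1:] = labels[1:] != labels[:-1]
        out.append(labels[keep])
        lens.append(int(keep.sum()))
    max_len = max(lens)
    padded = np.zeros((max_len, seq.shape[1]), dtype=seq.dtype)
    for b, col in enumerate(out):
        padded[:len(col), b] = col
    return padded, np.array(lens)

seq = np.array([[1, 2], [1, 2], [3, 2], [3, 4]])   # (time=4, batch=2)
mask = np.ones_like(seq)
print(uniq_with_lengths_numpy(seq, mask))
# -> rows [[1 2], [3 4]] and lengths [2 2]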
Example #5
def filterbank_matrices(center_y, center_x, delta, sigma, N, imgshp):
    """Create a Fy and a Fx

    Parameters
    ----------
    center_y : T.vector (shape: batch_size)
    center_x : T.vector (shape: batch_size)
        Y and X center coordinates for the attention window
    delta : T.vector (shape: batch_size)
    sigma : T.vector (shape: batch_size)

    Returns
    -------
        FY, FX
    """
    tol = 1e-4
    img_height, img_width = imgshp
    muX = center_x.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*(T.arange(N)-N/2-0.5)
    muY = center_y.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*(T.arange(N)-N/2-0.5)

    a = T.arange(img_width)
    b = T.arange(img_height)

    FX = T.exp( -(a-muX.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 )
    FY = T.exp( -(b-muY.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 )
    FX = FX / (FX.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)
    FY = FY / (FY.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)

    return FY, FX
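The same filterbank construction in plain numpy, shown for one of the two axes; N, delta, sigma and the image width below are arbitrary illustrative values:

import numpy as np

batch_size, N, img_width = 2, 3, 8
center_x = np.array([3.5, 4.0])
delta = np.array([1.0, 2.0])
sigma = np.array([0.7, 1.0])

# grid of N filter centres per batch element, shape (batch_size, N)
muX = center_x[:, None] + delta[:, None] * (np.arange(N) - N / 2 - 0.5)
a = np.arange(img_width)                      # pixel coordinates, shape (img_width,)
FX = np.exp(-(a - muX[:, :, None]) ** 2 / (2.0 * sigma[:, None, None] ** 2))
FX = FX / (FX.sum(axis=-1, keepdims=True) + 1e-4)

print(FX.shape)          # (2, 3, 8): one row of filter weights per (batch, filter)
print(FX.sum(axis=-1))   # each row sums to ~1 after normalization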
Example #6
    def link(self, input):
        self.input = input.dimshuffle(0, 1, 3, 2)
        # get the indexes that give the max on every line and sort them
        ind = T.argsort(self.input, axis=3)
        sorted_ind = T.sort(ind[:, :, :, -self.k_max:], axis=3)
        dim0, dim1, dim2, dim3 = sorted_ind.shape

        # prepare indices for selection
        indices_dim0 = T.arange(dim0)\
                        .repeat(dim1 * dim2 * dim3)
        indices_dim1 = T.arange(dim1)\
                        .repeat(dim2 * dim3)\
                        .reshape((dim1 * dim2 * dim3, 1))\
                        .repeat(dim0, axis=1)\
                        .T\
                        .flatten()
        indices_dim2 = T.arange(dim2)\
                        .repeat(dim3)\
                        .reshape((dim2 * dim3, 1))\
                        .repeat(dim0 * dim1, axis=1)\
                        .T\
                        .flatten()

        # output
        self.output = self.input[
            indices_dim0,
            indices_dim1,
            indices_dim2,
            sorted_ind.flatten()
        ].reshape(sorted_ind.shape).dimshuffle(0, 1, 3, 2)
        return self.output
Example #7
    def k_max_pool(self, x, k):
        """
        perform k-max pooling on the input along the rows

        input: theano.tensor.tensor4

        k: theano.tensor.iscalar
            the k parameter

        Returns:
            4D tensor
        """
        x = T.reshape(x, (x.shape[0], x.shape[1], 1, x.shape[2] * x.shape[3]))
        ind = T.argsort(x, axis=3)

        sorted_ind = T.sort(ind[:, :, :, -k:], axis=3)

        dim0, dim1, dim2, dim3 = sorted_ind.shape

        indices_dim0 = T.arange(dim0).repeat(dim1 * dim2 * dim3)
        indices_dim1 = (
            T.arange(dim1).repeat(dim2 * dim3).reshape((dim1 * dim2 * dim3, 1)).repeat(dim0, axis=1).T.flatten()
        )
        indices_dim2 = T.arange(dim2).repeat(dim3).reshape((dim2 * dim3, 1)).repeat(dim0 * dim1, axis=1).T.flatten()

        result = x[indices_dim0, indices_dim1, indices_dim2, sorted_ind.flatten()].reshape(sorted_ind.shape)
        shape = (result.shape[0], result.shape[1], result.shape[2] * result.shape[3], 1)

        result = T.reshape(result, shape)

        return result
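For reference, the same k-max selection on a single 1-D row in numpy (the Theano version applies this along the flattened last two axes of a 4-D tensor):

import numpy as np

def k_max_numpy(row, k):
    # keep the k largest entries of a 1-D array, preserving their original order
    idx = np.sort(np.argsort(row)[-k:])
    return row[idx]

print(k_max_numpy(np.array([0.1, 0.9, 0.3, 0.7, 0.2]), 2))  # [0.9 0.7]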
Example #8
def nin(X, param):
    w1, w2, w3, b1, b2, b3 = param
    X = X.dimshuffle(0, 1, 'x', 2, 3)  # (n,32,1,r,c)
    w1 = w1.dimshuffle(0, 1, 2, 'x', 3, 4)  # (64,32,16,1,3,3)
    w2 = w2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    w3 = w3.dimshuffle(0, 1, 2, 'x', 'x')  # (64,2,32,1,1)
    b1 = b1.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    b2 = b2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,1,1,1)
    b3 = b3.dimshuffle(0, 'x', 1, 'x', 'x')  # (64,1,2,1,1)
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    indexi = T.repeat(indexi, w1.shape[1], axis=0)
    indexj = T.arange(w1.shape[1], dtype='int32')  # (0:32)
    indexj = T.tile(indexj, w1.shape[0])
    results, updates = scan(fn=metaOp1,
                            sequences=[indexi, indexj],
                            outputs_info=None,
                            non_sequences=[X, w1, w2, b1, b2],
                            strict=True)  # (64*32,n,1,r,c)
    metaShape1 = results.shape[-4], results.shape[-2], results.shape[-1]
    reshaped1 = results.reshape((w1.shape[0], w1.shape[1]) + metaShape1)  # (64,32,n,r,c)
    permuted1 = T.transpose(reshaped1, axes=(0, 2, 1, 3, 4))  # (64,n,32,r,c)
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    results, updates = scan(fn=metaOp2,
                            sequences=[indexi],
                            outputs_info=None,
                            non_sequences=[permuted1, w3, b3],
                            strict=True)  # (64,n,2,r,c)
    permuted2 = T.transpose(results, axes=(1, 0, 2, 3, 4))  # (n,64,2,r,c)
    metaShape2 = permuted2.shape[-2], permuted2.shape[-1]
    reshaped2 = permuted2.reshape((permuted2.shape[0], -1) + metaShape2)  # (n,128,r,c)
    return reshaped2
Example #9
def sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol, log_scale=True):
    """
    Based on code from Shawn Tan.
    Credits to Kyle Kastner as well.

    This function computes the CTC log likelihood for a sequence that has
    been augmented with blank labels.


    """
    y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype="int32")
    y_mask_len = tensor.sum(y_mask, axis=0, dtype="int32")

    if log_scale:
        log_probabs = _log_path_probabs(y, T.log(y_hat), y_mask, y_hat_mask, blank_symbol)
        batch_size = log_probabs.shape[1]

        # Add the probabilities of the final time steps to get the total
        # sequence likelihood.
        log_labels_probab = _log_add(
            log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1],
            log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2],
        )
    else:
        probabilities = _path_probabs(y, y_hat, y_mask, y_hat_mask, blank_symbol)
        batch_size = probabilities.shape[1]
        labels_probab = (
            probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1]
            + probabilities[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2]
        )
        log_labels_probab = tensor.log(labels_probab)
    return log_labels_probab
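The final gather above uses advanced indexing to pick, for each batch element b, the entry at time y_hat_mask_len[b]-1 and label position y_mask_len[b]-1 (and its neighbour). A small numpy sketch of that 3-D indexing with made-up shapes:

import numpy as np

probs = np.arange(2 * 3 * 4).reshape(2, 3, 4)   # (time, batch, label_position)
t_len = np.array([2, 1, 2])                     # valid time steps per batch element
l_len = np.array([4, 3, 2])                     # valid label positions per batch element
final = probs[t_len - 1, np.arange(3), l_len - 1]
print(final)  # [probs[1,0,3], probs[0,1,2], probs[1,2,1]] -> [15  6 21]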
Example #10
    def test_optimize_xent_vector2(self):
        verbose = 0
        mode = theano.compile.mode.get_default_mode()
        if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
            mode = 'FAST_RUN'
        rng = numpy.random.RandomState(utt.fetch_seed())
        x_val = rng.randn(5)
        b_val = rng.randn(5)
        y_val = numpy.asarray([2])

        x = T.dvector('x')
        b = T.dvector('b')
        y = T.lvector('y')

        def print_graph(func):
            for i, node in enumerate(func.maker.fgraph.toposort()):
                print(i, node)
            # Last node should be the output
            print(i, printing.pprint(node.outputs[0]))
            print()

        ## Test that a biased softmax is optimized correctly
        bias_expressions = [
                T.sum(-T.log(softmax(x + b)[T.arange(y.shape[0]), y])),
                -T.sum(T.log(softmax(b + x)[T.arange(y.shape[0]), y])),
                -T.sum(T.log(softmax(x + b))[T.arange(y.shape[0]), y]),
                T.sum(-T.log(softmax(b + x))[T.arange(y.shape[0]), y])]

        for expr in bias_expressions:
            f = theano.function([x, b, y], expr, mode=mode)
            if verbose:
                print_graph(f)
            try:
                prev, last = f.maker.fgraph.toposort()[-2:]
                assert len(f.maker.fgraph.toposort()) == 3
                # [big_op, sum, dim_shuffle]
                f(x_val, b_val, y_val)
            except Exception:
                theano.printing.debugprint(f)
                raise

            backup = config.warn.sum_div_dimshuffle_bug
            config.warn.sum_div_dimshuffle_bug = False
            try:
                g = theano.function([x, b, y], T.grad(expr, x), mode=mode)
            finally:
                config.warn.sum_div_dimshuffle_bug = backup

            if verbose:
                print_graph(g)
            try:
                ops = [node.op for node in g.maker.fgraph.toposort()]
                assert len(ops) <= 6
                assert crossentropy_softmax_1hot_with_bias_dx in ops
                assert softmax_with_bias in ops
                assert softmax_grad not in ops
                g(x_val, b_val, y_val)
            except Exception:
                theano.printing.debugprint(g)
                raise
Example #11
    def __init__(self, x, y, l, window, opt, lr, init_emb, dim_emb, dim_hidden, n_vocab, L2_reg, unit,
                 sim='cos', n_layers=1, activation=tanh):
        self.tr_inputs = [x, y, l]
        self.pr_inputs = [x, y, l]

        self.x = x  # 1D: batch_size * l * 2, 2D: window; elem=word_id
        self.y = y  # 1D: batch_size; elem=label
        self.l = l  # scalar: elem=sentence length

        batch_size = y.shape[0]
        n_cands = x.shape[0] / batch_size / l

        self.pad = build_shared_zeros((1, dim_emb))
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab - 1, dim_emb))
        else:
            self.emb = theano.shared(init_emb)
        self.E = T.concatenate([self.pad, self.emb], 0)
        self.W_out = theano.shared(sample_weights(dim_hidden, dim_hidden))
        self.params = [self.emb, self.W_out]

        """ Input Layer """
        e = self.E[x]  # e: 1D: batch_size * l * 2, 2D: window, 3D: dim_emb
        x_in = e.reshape((batch_size * n_cands, l, -1))

        """ Intermediate Layer """
        # h: 1D: n_batch * n_cands, 2D: dim_emb
        h, params = cnn.layers(x_in, window, dim_emb, dim_hidden, n_layers, activation)
        self.params.extend(params)

        """ Output Layer """
        h = h.reshape((batch_size, n_cands, -1))
        h_1 = h[T.arange(batch_size), 0]
        h_2 = h[T.arange(batch_size), 1:]
        if sim == 'cos':
            y_score = cosign_similarity(h_1, h_2)
        else:
            y_score = T.batched_dot(T.dot(h_1, self.W_out), h_2.dimshuffle(0, 2, 1))
        y_score_hat = T.max(y_score, 1)

        """ Objective Function """
        self.nll = max_margin_loss(y_score_hat, y_score[T.arange(batch_size), y])
        self.L2_sqr = regularization(self.params)
        self.cost = self.nll + L2_reg * self.L2_sqr / 2.

        """ Optimization """
        if opt == 'adagrad':
            self.update = ada_grad(cost=self.cost, params=self.params, lr=lr)
        elif opt == 'ada_delta':
            self.update = ada_delta(cost=self.cost, params=self.params)
        elif opt == 'adam':
            self.update = adam(cost=self.cost, params=self.params, lr=lr)
        else:
            self.update = sgd(cost=self.cost, params=self.params, lr=lr)

        """ Predicts """
        y_hat = T.argmax(y_score, 1)

        """ Check Accuracies """
        self.correct = T.eq(y_hat, y)
Example #12
File: cost.py Project: Beronx86/cle
    def sample_mean(self, X):

        mu = X[0]
        sig = X[1]
        coeff = X[2]

        mu = mu.reshape((mu.shape[0],
                         mu.shape[1]/coeff.shape[-1],
                         coeff.shape[-1]))

        sig = sig.reshape((sig.shape[0],
                           sig.shape[1]/coeff.shape[-1],
                           coeff.shape[-1]))

        idx = predict(
            self.theano_rng.multinomial(
                pvals=coeff,
                dtype=coeff.dtype
            ),
            axis=1
        )

        mu = mu[T.arange(mu.shape[0]), :, idx]
        sig = sig[T.arange(sig.shape[0]), :, idx]

        epsilon = self.theano_rng.normal(size=mu.shape,
                                         avg=0., std=1.,
                                         dtype=mu.dtype)

        z = mu + sig * epsilon

        return z, mu
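The mu[T.arange(mu.shape[0]), :, idx] pattern above selects one mixture component per row while keeping the middle (feature) axis. A small numpy illustration with made-up shapes:

import numpy as np

mu = np.arange(2 * 3 * 4).reshape(2, 3, 4)   # (batch, frame_size, n_components)
idx = np.array([3, 0])                       # chosen component for each batch row
print(mu[np.arange(2), :, idx])
# row 0 -> mu[0, :, 3] = [ 3  7 11], row 1 -> mu[1, :, 0] = [12 16 20]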
Example #13
	def step(i,inputs):
		length = inputs.shape[0]
		next_level = T.dot(inputs[T.arange(0,length-i-1)],W1) + T.dot(inputs[T.arange(1,length-i)],W2) + b
		next_level = next_level*(next_level > 0)
		#next_level = inputs[T.arange(0,length-i-1)] + inputs[T.arange(1,length-i)]
		#next_level = theano.printing.Print('inputs')(next_level)
		return T.concatenate([next_level,T.zeros_like(inputs[:length-next_level.shape[0]])])
Example #14
def compute_cov_field(x, t, params):
	t0=T.cast(T.arange(x.shape[0])*0.0+t, 'float32')
	t1=T.reshape(t0,(x.shape[0],1,1))
	t2=T.extra_ops.repeat(t1,x.shape[1],axis=1)
	[centers, spreads, biases, M, b]=params
	diffs=x.dimshuffle(0,1,2,'x')-centers.dimshuffle('x','x',0,1)
	scaled_diffs=(diffs**2)*T.exp(spreads).dimshuffle('x','x',0,1)
	exp_terms=T.sum(scaled_diffs,axis=2)+biases.dimshuffle('x','x',0)*0.0
	h=T.exp(-exp_terms)
	sumact=T.sum(h,axis=2)
	#Normalization
	hnorm=h/sumact.dimshuffle(0,1,'x')
	z=T.dot(hnorm,M)
	z=T.reshape(z,(x.shape[0],x.shape[1],ntgates))+b.dimshuffle('x','x',0) #nt by nb by ntgates by 1
	#z=z+T.reshape(x,(t.shape[0],t.shape[1],1,nx))
	z=T.exp(z)
	
	tpoints=T.cast(T.arange(ntgates),'float32')/T.cast(ntgates-1,'float32')
	tpoints=T.reshape(tpoints, (1,1,ntgates))
	#tgating=T.exp(T.dot(t,muWT)+mubT) #nt by nb by ntgates
	tgating=T.exp(-kT*(tpoints-t2)**2)
	tgating=tgating/T.reshape(T.sum(tgating, axis=2),(t2.shape[0], t2.shape[1], 1))
	tgating=T.reshape(tgating,(t2.shape[0],t2.shape[1],ntgates))
	
	mult=z*tgating
	
	out=T.sum(mult,axis=2)
	
	return T.cast(out,'float32')
Example #15
File: cost.py Project: Beronx86/cle
    def argmax_mean(self, X):

        mu = X[0]
        sig = X[1]
        coeff = X[2]

        mu = mu.reshape((mu.shape[0],
                         mu.shape[1]/coeff.shape[-1],
                         coeff.shape[-1]))

        sig = sig.reshape((sig.shape[0],
                           sig.shape[1]/coeff.shape[-1],
                           coeff.shape[-1]))

        idx = predict(coeff)
        mu = mu[T.arange(mu.shape[0]), :, idx]
        sig = sig[T.arange(sig.shape[0]), :, idx]

        epsilon = self.theano_rng.normal(size=mu.shape,
                                         avg=0., std=1.,
                                         dtype=mu.dtype)

        z = mu + sig * epsilon

        return z, mu
Example #16
    def emit(self, readouts):
        mu, sigma, coeff = self.gmmmlp.apply(readouts)

        frame_size = mu.shape[-1]/coeff.shape[-1]
        k = coeff.shape[-1]
        shape_result = coeff.shape
        shape_result = tensor.set_subtensor(shape_result[-1],frame_size)
        ndim_result = coeff.ndim

        mu = mu.reshape((-1, frame_size, k))
        sigma = sigma.reshape((-1, frame_size,k))
        coeff = coeff.reshape((-1, k))

        sample_coeff = self.theano_rng.multinomial(pvals = coeff, dtype=coeff.dtype)
        idx = predict(sample_coeff, axis = -1)
        #idx = predict(coeff, axis = -1) use this line for using most likely coeff.

        mu = mu[tensor.arange(mu.shape[0]), :, idx]
        sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

        epsilon = self.theano_rng.normal(
            size=mu.shape,avg=0.,
            std=1.,
            dtype=mu.dtype)

        result = mu + sigma*epsilon

        return result.reshape(shape_result, ndim = ndim_result)
Example #17
    def negative_log_likelihood(self, y,penalty=[]):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \frac{1}{|\mathcal{D}|} \mathcal{L}(\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log P(Y=y^{(i)} \mid x^{(i)}, W, b)

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        # y.shape[0] is (symbolically) the number of rows in y, i.e.,
        # number of examples (call it n) in the minibatch
        # T.arange(y.shape[0]) is a symbolic vector which will contain
        # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of
        # Log-Probabilities (call it LP) with one row per example and
        # one column per class LP[T.arange(y.shape[0]),y] is a vector
        # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
        # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is
        # the mean (across minibatch examples) of the elements in v,
        # i.e., the mean log-likelihood across the minibatch.
        if penalty==[]:
            return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
        else:
            return -T.mean(T.log ( (self.p_y_given_x)[T.arange(y.shape[0]), y])*penalty) 
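The indexing spelled out in the comment above is ordinary advanced indexing. A quick numpy illustration of LP[arange(n), y] with made-up probabilities:

import numpy as np

LP = np.log(np.array([[0.7, 0.2, 0.1],
                      [0.1, 0.8, 0.1],
                      [0.3, 0.3, 0.4]]))   # log-probabilities, one row per example
y = np.array([0, 1, 2])                    # correct class of each example
picked = LP[np.arange(y.shape[0]), y]      # [LP[0,0], LP[1,1], LP[2,2]]
print(-picked.mean())                      # mean negative log-likelihood, ~0.499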
Example #18
    def negative_log_likelihood(self, label_sym):
        """
        Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        :type label_sym: theano.tensor.TensorType
        :param label_sym: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        # label_sym.shape[0] is (symbolically) the number of rows in label_sym, i.e.,
        # number of examples (call it n) in the minibatch
        # T.arange(label_sym.shape[0]) is a symbolic vector which will contain
        # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of
        # Log-Probabilities (call it LP) with one row per example and
        # one column per class LP[T.arange(label_sym.shape[0]),label_sym] is a vector
        # v containing [LP[0,label_sym[0]], LP[1,label_sym[1]], LP[2,label_sym[2]], ...,
        # LP[n-1,label_sym[n-1]]] and T.mean(LP[T.arange(label_sym.shape[0]),label_sym]) is
        # the mean (across minibatch examples) of the elements in v,
        # i.e., the mean log-likelihood across the minibatch.

        # loss, matrix \in R[#data,#classes]
        loss = theano.shared(value=numpy.ones((self.n_data,self.n_classes), 
                        dtype=theano.config.floatX),
                name='cost', borrow=True)
        loss = T.set_subtensor(loss[T.arange(label_sym.shape[0]), label_sym], 0)  # set_subtensor returns a new variable; keep the result
        #loss = 0
        # score, matrix \in R[#data,1]
        self.score = T.max(loss + self.compatibility, axis=1)
        margin = T.mean(self.score - self.compatibility[T.arange(label_sym.shape[0]),label_sym])

        return self.l2norm + self.C * margin
Example #19
 def _step(x, k, max_seq_len):
     tmp = x[
         T.arange(x.shape[0])[:, np.newaxis, np.newaxis],
         T.sort(T.argsort(x, axis=1)[:, -k:, :], axis=1),
         T.arange(x.shape[2])[np.newaxis, np.newaxis,:],
     ]
     return T.concatenate([tmp, T.zeros([x.shape[0], max_seq_len-k, x.shape[2]])], axis=1)
Example #20
    def logp(self, x):
        n = self.n
        eta = self.eta

        diag_idxs = self.diag_idxs
        cumsum = tt.cumsum(x ** 2)
        variance = tt.zeros(n)
        variance = tt.inc_subtensor(variance[0], x[0] ** 2)
        variance = tt.inc_subtensor(
            variance[1:],
            cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]])
        sd_vals = tt.sqrt(variance)

        logp_sd = self.sd_dist.logp(sd_vals).sum()
        corr_diag = x[diag_idxs] / sd_vals

        logp_lkj = (2 * eta - 3 + n - tt.arange(n)) * tt.log(corr_diag)
        logp_lkj = tt.sum(logp_lkj)

        # Compute the log det jacobian of the second transformation
        # described in the docstring.
        idx = tt.arange(n)
        det_invjac = tt.log(corr_diag) - idx * tt.log(sd_vals)
        det_invjac = det_invjac.sum()

        norm = _lkj_normalizing_constant(eta, n)

        return norm + logp_lkj + logp_sd + det_invjac
Example #21
 def sample(self, X):
     mu = X[0]
     sig = X[1]
     coeff = X[2]
     n_noise = T.cast(T.floor(coeff.shape[-1] * self.p_noise), 'int32')
     mu = T.concatenate(
         [mu, T.zeros((mu.shape[0],
                       n_noise*sig.shape[1]/coeff.shape[-1]))],
         axis=1
     )
     mu = mu.reshape((mu.shape[0],
                      mu.shape[1]/coeff.shape[-1],
                      coeff.shape[-1]))
     sig = sig.reshape((sig.shape[0],
                        sig.shape[1]/coeff.shape[-1],
                        coeff.shape[-1]))
     idx = predict(
         self.theano_rng.multinomial(
             pvals=coeff,
             dtype=coeff.dtype
         ),
         axis=1
     )
     mu = mu[T.arange(mu.shape[0]), :, idx]
     sig = sig[T.arange(sig.shape[0]), :, idx]
     sample = self.theano_rng.normal(size=mu.shape,
                                     avg=mu, std=sig,
                                     dtype=mu.dtype)
     return sample
Example #22
    def run(self, images, h):#, error_images, h):
        channels = self.channels#images.shape[1]
        if not self.test:
            gx,gy,dx,dy,s2,g = self.get_params(h)
        else:
            gx,gy,dx,dy,s2,g = self.get_params_test(h)

        # how to handle variable sized input images? (mask??)
        I = images.reshape((self.batch_size*self.channels, self.height, self.width))

        muX = gx.dimshuffle([0,'x']) + dx.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5)
        muY = gy.dimshuffle([0,'x']) + dy.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5)

        a = T.arange(self.width).astype(theano.config.floatX)
        b = T.arange(self.height).astype(theano.config.floatX)

        Fx = T.exp(-(a-muX.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2)
        Fy = T.exp(-(b-muY.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2)

        Fx = Fx / (Fx.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4)
        Fy = Fy / (Fy.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4)

        self.Fx = T.repeat(Fx, channels, axis=0)
        self.Fy = T.repeat(Fy, channels, axis=0)

        self.fint = self.batched_dot(self.Fy, I)
#        self.efint = T.dot(self.Fx, error_images)
        self.fim = self.batched_dot(self.fint, self.Fx.transpose([0,2,1])).reshape(
            (self.batch_size, self.channels*self.N*self.N))
#        self.feim = T.dot(self.efint, self.Fy.transpose([0,2,1])).reshape(
#            (self.batch_size, channels,self.N,self.N))
        return g * self.fim, (gx, gy, dx, dy, self.fint)#$T.concatenate([self.fim, self.feim], axis=1)
Example #23
 def geterr(
     self, probs, golds, occlusion
 ):  # cross-entropy; probs: floats of (batsize, seqlen, vocabsize), gold: indexes of (batsize, seqlen)
     r = occlusion[:, 1:] * T.log(
         probs[T.arange(probs.shape[0])[:, None], T.arange(probs.shape[1])[None, :], golds]
     )  # --> result: floats of (batsize, seqlen)
     return -T.sum(r) / occlusion[:, 1:].norm(1)
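The double-arange gather above picks probs[b, t, golds[b, t]] for every (batch, time) position. A tiny numpy equivalent of that indexing; shapes and values here are illustrative only:

import numpy as np

probs = np.random.default_rng(0).dirichlet(np.ones(5), size=(2, 3))  # (batch=2, seq=3, vocab=5)
golds = np.array([[1, 0, 4],
                  [2, 2, 3]])                                        # (batch, seq) gold indices
picked = probs[np.arange(2)[:, None], np.arange(3)[None, :], golds]  # (batch, seq)
print(picked.shape)  # (2, 3)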
Example #24
    def log_ctc(self, ):
        _1000 = tt.eye(self.n)[0]
        prev_mask = 1 - _1000
        prevprev_mask = tt.neq(self.labels[:-2], self.labels[2:]) * \
                        tt.eq(self.labels[1:-1], self.blank)
        prevprev_mask = tt.concatenate(([0, 0], prevprev_mask))
        prev_mask = safe_log(prev_mask)
        prevprev_mask = safe_log(prevprev_mask)
        prev = tt.arange(-1, self.n-1)
        prevprev = tt.arange(-2, self.n-2)
        log_pred_y = tt.log(self.inpt[:, self.labels])

        def step(curr, accum):
            return logmul(curr,
                          logadd(accum,
                                 logmul(prev_mask, accum[prev]),
                                 logmul(prevprev_mask, accum[prevprev])))

        log_probs, _ = theano.scan(
            step,
            sequences=[log_pred_y],
            outputs_info=[safe_log(_1000)]
        )

        # TODO: Add -2 if n > 1 and blank at end
        log_labels_probab = log_probs[-1, -2]
        self.cost = -log_labels_probab
        self.debug = tt.exp(log_probs.T)
Example #25
	def sample_from_joint(self, n_samples, output_2D=False):
		'''Samples from the joint posterior P(s_t-n_history:s_t | observations)
		n_samples: the number of samples to draw
		
		Returns an array with shape (n_history+1, n_samples, state_dims),
		where array[-1] corresponds to the current time.
		'''
		samps=self.theano_rng.multinomial(pvals=T.extra_ops.repeat(self.current_weights.dimshuffle('x',0),n_samples,axis=0))
		idxs=T.cast(T.dot(samps, T.arange(self.n_particles)),'int64')
		samps_t0=self.current_state[idxs]
		
		t0=T.as_tensor_variable(1)
		
		[samples, ts], updates = theano.scan(fn=self.sample_step,
											outputs_info=[samps_t0, t0],
											non_sequences=[n_samples],
											n_steps=self.n_history)
		
		#the variable "samples" that results from the scan is time-flipped
		#in the sense that samples[0] corresponds to the most recent point
		#in time, and higher indices correspond to points in the past.
		#I will stick to the convention that for any collection of points in 
		#time, [-1] will index the most recent time, and [0] will index
		#the point farthest in the past. So, the first axis of "samples" 
		#needs to be flipped.
		flip_idxs=T.cast(-T.arange(self.n_history)+self.n_history-1,'int64')
		samples=T.concatenate([samples[flip_idxs], samps_t0.dimshuffle('x',0,1)], axis=0)
		
		if output_2D:
			samples=T.reshape(samples, ((self.n_history+1)*n_samples, self.state_dims))
		
		return samples, updates
Example #26
def keep_max(input, theta, k, sent_mask):
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    sent_mask = sent_mask.dimshuffle(0, 'x', 1, 'x')
    sig_input = sig_input * sent_mask
    #sig_input = T.dot(input, theta)
    if k == 0:
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:,:,-k:,:]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1*dim2*dim3)
    mapids = T.repeat(T.arange(dim1), dim2*dim3).reshape((1, dim2*dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0*dim1*dim2, axis=0).flatten()
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)
    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
Example #27
 def ans_score(ans, outputs):
     arr = T.arange(ans.shape[0])
     sum1 = T.sum(outputs[arr, ans])
     arr = T.arange(ans.shape[0] - 1)
     st = self.seg.params["A"][self.seg.viterbi_startnode, ans[0]]
     sum2 = T.sum(self.seg.params["A"][ans[arr], ans[arr + 1]]) + st
     return sum1 + sum2
Example #28
def neglog_2d(output, target):
    i = T.arange(target.shape[0]).reshape((target.shape[0], 1))
    i = T.repeat(i, target.shape[1], axis=1).flatten()
    j = T.arange(target.shape[1]).reshape((1, target.shape[1]))
    j = T.repeat(j, target.shape[0], axis=0).flatten()
    k = target.flatten()
    return -T.mean(T.log(output)[i, j, k])
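A numpy sketch of the same index construction for a (2, 3) target and a 4-class output (illustrative values only):

import numpy as np

output = np.full((2, 3, 4), 0.25)          # predicted probabilities (rows, cols, classes)
target = np.array([[0, 1, 2],
                   [3, 0, 1]])             # class index for every (row, col) position
i = np.repeat(np.arange(2), 3)             # row index of each flattened position
j = np.tile(np.arange(3), 2)               # column index of each flattened position
k = target.flatten()
nll = -np.mean(np.log(output)[i, j, k])
print(nll)                                 # log(4) ~ 1.386, since every prob is 0.25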
Example #29
def _sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol,
                            log_scale=True):
    '''
    Based on code from Shawn Tan.
    Credits to Kyle Kastner as well.
    '''
    y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype='int32')
    y_mask_len = tensor.sum(y_mask, axis=0, dtype='int32')

    if log_scale:
        log_probabs = _log_path_probabs(y, T.log(y_hat),
                                        y_mask, y_hat_mask,
                                        blank_symbol)
        batch_size = log_probabs.shape[1]
        log_labels_probab = _log_add(
            log_probabs[y_hat_mask_len - 1,
                        tensor.arange(batch_size),
                        y_mask_len - 1],
            log_probabs[y_hat_mask_len - 1,
                        tensor.arange(batch_size),
                        y_mask_len - 2])
    else:
        probabilities = _path_probabs(y, y_hat,
                                      y_mask, y_hat_mask,
                                      blank_symbol)
        batch_size = probabilities.shape[1]
        labels_probab = (probabilities[y_hat_mask_len - 1,
                                       tensor.arange(batch_size),
                                       y_mask_len - 1] +
                         probabilities[y_hat_mask_len - 1,
                                       tensor.arange(batch_size),
                                       y_mask_len - 2])
        log_labels_probab = tensor.log(labels_probab)
    return log_labels_probab
Example #30
    def filterbank_matrices(self, center_y, center_x, delta, sigma):
        """Create a Fy and a Fx
        
        Parameters
        ----------
        center_y : T.vector (shape: batch_size)
        center_x : T.vector (shape: batch_size)
            Y and X center coordinates for the attention window
        delta : T.vector (shape: batch_size)
        sigma : T.vector (shape: batch_size)
        
        Returns
        -------
            FY : 3D tensor (shape: batch_size x N x img_height)
            FX : 3D tensor (shape: batch_size x N x img_width)
        """
        tol = 1e-4
        N = self.N

        rng = T.arange(N, dtype=floatX)-N/2.+0.5  # e.g. for N=4: [-1.5, -0.5, 0.5, 1.5]

        muX = center_x.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*rng
        muY = center_y.dimshuffle([0, 'x']) + delta.dimshuffle([0, 'x'])*rng

        a = tensor.arange(self.img_width, dtype=floatX)
        b = tensor.arange(self.img_height, dtype=floatX)
        
        FX = tensor.exp( -(a-muX.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 )
        FY = tensor.exp( -(b-muY.dimshuffle([0,1,'x']))**2 / 2. / sigma.dimshuffle([0,'x','x'])**2 )
        FX = FX / (FX.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)
        FY = FY / (FY.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)

        return FY, FX
Example #31
def interpolate_bilinear(im, x, y, out_shape=None, border_mode='nearest'):
    if im.ndim != 4:
        raise TypeError('im should be a 4D Tensor image, got %dD.' % im.ndim)

    out_shape = out_shape if out_shape else T.shape(im)[2:]
    x, y = x.flatten(), y.flatten()
    n, c, h, w = im.shape
    h_out, w_out = out_shape
    height_f = T.cast(h, theano.config.floatX)
    width_f = T.cast(w, theano.config.floatX)

    # scale coordinates from [-1, 1] to [0, width/height - 1]
    x = (x + 1) / 2 * (width_f - 1)
    y = (y + 1) / 2 * (height_f - 1)

    x0_f = T.floor(x)
    y0_f = T.floor(y)
    x1_f = x0_f + 1
    y1_f = y0_f + 1

    if border_mode == 'nearest':
        x0 = T.clip(x0_f, 0, width_f - 1)
        x1 = T.clip(x1_f, 0, width_f - 1)
        y0 = T.clip(y0_f, 0, height_f - 1)
        y1 = T.clip(y1_f, 0, height_f - 1)
    elif border_mode == 'mirror':
        # use separate names so w and h (the image width/height) stay intact
        # for the flat-index arithmetic below
        w2 = 2 * (width_f - 1)
        x0 = T.minimum(x0_f % w2, -x0_f % w2)
        x1 = T.minimum(x1_f % w2, -x1_f % w2)
        h2 = 2 * (height_f - 1)
        y0 = T.minimum(y0_f % h2, -y0_f % h2)
        y1 = T.minimum(y1_f % h2, -y1_f % h2)
    elif border_mode == 'wrap':
        x0 = T.mod(x0_f, width_f)
        x1 = T.mod(x1_f, width_f)
        y0 = T.mod(y0_f, height_f)
        y1 = T.mod(y1_f, height_f)
    else:
        raise ValueError("border_mode must be one of "
                         "'nearest', 'mirror', 'wrap'")
    x0, x1, y0, y1 = (T.cast(v, 'int64') for v in (x0, x1, y0, y1))

    base = T.arange(n) * w * h
    base = T.reshape(base, (-1, 1))
    base = T.tile(base, (1, h_out * w_out))
    base = base.flatten()

    base_y0 = base + y0 * w
    base_y1 = base + y1 * w
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    im_flat = T.reshape(im.dimshuffle((0, 2, 3, 1)), (-1, c))
    pixel_a = im_flat[idx_a]
    pixel_b = im_flat[idx_b]
    pixel_c = im_flat[idx_c]
    pixel_d = im_flat[idx_d]

    wa = ((x1_f - x) * (y1_f - y)).dimshuffle((0, 'x'))
    wb = ((x1_f - x) * (1. - (y1_f - y))).dimshuffle((0, 'x'))
    wc = ((1. - (x1_f - x)) * (y1_f - y)).dimshuffle((0, 'x'))
    wd = ((1. - (x1_f - x)) * (1. - (y1_f - y))).dimshuffle((0, 'x'))

    output = T.sum((wa * pixel_a, wb * pixel_b, wc * pixel_c, wd * pixel_d),
                   axis=0)
    output = T.reshape(output, (n, h_out, w_out, c))
    return output.dimshuffle((0, 3, 1, 2))
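The four weights wa/wb/wc/wd above are the standard bilinear coefficients. A one-point numpy check (a standalone sketch, not tied to the Theano graph):

import numpy as np

x, y = 2.3, 5.8                          # continuous sample location, already in pixel units
x0, y0 = np.floor(x), np.floor(y)
x1, y1 = x0 + 1, y0 + 1
wa = (x1 - x) * (y1 - y)                 # weight of the neighbour at (x0, y0)
wb = (x1 - x) * (1. - (y1 - y))          # (x0, y1)
wc = (1. - (x1 - x)) * (y1 - y)          # (x1, y0)
wd = (1. - (x1 - x)) * (1. - (y1 - y))   # (x1, y1)
print(wa + wb + wc + wd)                 # the weights always sum to 1.0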
Example #32
 def cost(self):
     return -TT.mean(TT.log(self.y)[TT.arange(self.k.shape[0]), self.k])
Example #33
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        #embed.weights_init = IsotropicGaussian(0.01)
        embed.weights_init = Constant(
            init_embedding_table(filename='embeddings/vocab_embeddings.txt'))

        # one directional LSTM encoding
        q_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='q_lstm_in')
        q_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='q_lstm')
        c_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='c_lstm_in')
        c_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='c_lstm')
        bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

        q_tmp = q_lstm_ins.apply(embed.apply(question))
        c_tmp = c_lstm_ins.apply(embed.apply(context))
        q_hidden, _ = q_lstm.apply(q_tmp,
                                   mask=question_mask.astype(
                                       theano.config.floatX))  # lq, bs, dim
        c_hidden, _ = c_lstm.apply(c_tmp,
                                   mask=context_mask.astype(
                                       theano.config.floatX))  # lc, bs, dim

        # Attention mechanism Bilinear question
        attention_question = Linear(input_dim=config.pre_lstm_size,
                                    output_dim=config.pre_lstm_size,
                                    name='att_question')
        bricks += [attention_question]
        att_weights_question = q_hidden[
            None, :, :, :] * attention_question.apply(
                c_hidden.reshape(
                    (c_hidden.shape[0] * c_hidden.shape[1],
                     c_hidden.shape[2]))).reshape(
                         (c_hidden.shape[0], c_hidden.shape[1],
                          c_hidden.shape[2]))[:,
                                              None, :, :]  # --> lc,lq,bs,dim
        att_weights_question = att_weights_question.sum(
            axis=3)  # sum over axis 3 -> dimensions --> lc,lq,bs
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,bs,lq
        att_weights_question = att_weights_question.reshape(
            (att_weights_question.shape[0] * att_weights_question.shape[1],
             att_weights_question.shape[2]))  # --> lc*bs,lq
        att_weights_question = tensor.nnet.softmax(
            att_weights_question
        )  # softmax over axis 1 -> length of question # --> lc*bs,lq
        att_weights_question = att_weights_question.reshape(
            (c_hidden.shape[0], q_hidden.shape[1],
             q_hidden.shape[0]))  # --> lc,bs,lq
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,lq,bs

        attended_question = tensor.sum(
            q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
            axis=1)  # sum over axis 1 -> length of question --> lc,bs,dim
        attended_question.name = 'attended_question'

        # Match LSTM
        cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
        mlstms, mhidden_list = make_bidir_lstm_stack(
            cqembed, 2 * config.pre_lstm_size,
            context_mask.astype(theano.config.floatX), config.match_lstm_size,
            config.match_skip_connections, 'match')
        bricks = bricks + mlstms
        if config.match_skip_connections:
            menc_dim = 2 * sum(config.match_lstm_size)
            menc = tensor.concatenate(mhidden_list, axis=2)
        else:
            menc_dim = 2 * config.match_lstm_size[-1]
            menc = tensor.concatenate(mhidden_list[-2:], axis=2)
        menc.name = 'menc'

        # Attention mechanism MLP start
        attention_mlp_start = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_start')
        attention_clinear_start = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='attm_start')  # Wym
        bricks += [attention_mlp_start, attention_clinear_start]
        layer1_start = Tanh(name='layer1_start')
        layer1_start = layer1_start.apply(
            attention_clinear_start.apply(
                menc.reshape(
                    (menc.shape[0] * menc.shape[1], menc.shape[2]))).reshape(
                        (menc.shape[0], menc.shape[1],
                         config.attention_mlp_hidden[0])))
        att_weights_start = attention_mlp_start.apply(
            layer1_start.reshape(
                (layer1_start.shape[0] * layer1_start.shape[1],
                 layer1_start.shape[2])))
        att_weights_start = att_weights_start.reshape(
            (layer1_start.shape[0], layer1_start.shape[1]))
        att_weights_start = tensor.nnet.softmax(att_weights_start.T).T

        attended = tensor.sum(menc * att_weights_start[:, :, None], axis=0)
        attended.name = 'attended'

        # Attention mechanism MLP end
        attention_mlp_end = MLP(
            dims=config.attention_mlp_hidden + [1],
            activations=config.attention_mlp_activations[1:] + [Identity()],
            name='attention_mlp_end')
        attention_qlinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            name='atts_end')  #Wum
        attention_clinear_end = Linear(
            input_dim=menc_dim,
            output_dim=config.attention_mlp_hidden[0],
            use_bias=False,
            name='attm_end')  # Wym
        bricks += [
            attention_mlp_end, attention_qlinear_end, attention_clinear_end
        ]
        layer1_end = Tanh(name='layer1_end')
        layer1_end = layer1_end.apply(
            attention_clinear_end.apply(
                menc.reshape((menc.shape[0] * menc.shape[1], menc.shape[2]
                              ))).reshape((menc.shape[0], menc.shape[1],
                                           config.attention_mlp_hidden[0])) +
            attention_qlinear_end.apply(attended)[None, :, :])
        att_weights_end = attention_mlp_end.apply(
            layer1_end.reshape((layer1_end.shape[0] * layer1_end.shape[1],
                                layer1_end.shape[2])))
        att_weights_end = att_weights_end.reshape(
            (layer1_end.shape[0], layer1_end.shape[1]))
        att_weights_end = tensor.nnet.softmax(att_weights_end.T).T

        att_weights_start = tensor.dot(
            tensor.le(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_start)
        att_weights_end = tensor.dot(
            tensor.ge(
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[None, :],
                    (context.shape[0], 1)),
                tensor.tile(
                    theano.tensor.arange(context.shape[0])[:, None],
                    (1, context.shape[0]))), att_weights_end)

        # add attention from left and right
        att_weights = att_weights_start * att_weights_end
        #att_weights = tensor.minimum(att_weights_start, att_weights_end)

        att_target = tensor.zeros((ans_indices.shape[1], context.shape[0]),
                                  dtype=theano.config.floatX)
        att_target = tensor.set_subtensor(
            att_target[tensor.arange(ans_indices.shape[1]), ans_indices], 1)
        att_target = att_target.dimshuffle(1, 0)
        #att_target = tensor.eq(tensor.tile(answer[None,:,:], (context.shape[0], 1, 1)),
        #                       tensor.tile(context[:,None,:], (1, answer.shape[0], 1))).sum(axis=1).clip(0,1)

        self.predictions = tensor.gt(att_weights, 0.25) * context

        att_target = att_target / (att_target.sum(axis=0) + 0.00001)
        #att_weights = att_weights / (att_weights.sum(axis=0) + 0.00001)

        cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) *
                context_mask).sum() / context_mask.sum()

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, mhidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_weights_start.name = 'att_weights_start'
        att_weights_end.name = 'att_weights_end'
        att_weights.name = 'att_weights'
        att_target.name = 'att_target'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars = [
            cost, self.predictions, att_weights_start, att_weights_end,
            att_weights, att_target
        ]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Example #34
    def fit(self,
            X,
            Y,
            Xvalid,
            Yvalid,
            learning_rate=1e-2,
            mu=0.9,
            decay=0.9,
            epochs=10,
            batch_sz=100,
            show_fig=False):
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid = Xvalid.astype(np.float32)
        Yvalid = Yvalid.astype(np.int32)

        self.rng = RandomStreams()

        # initialize hidden layers
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1
        W = np.random.randn(M1, K) * np.sqrt(2.0 / M1)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # set up theano functions and variables
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY_train = self.forward_train(thX)

        # this cost is for training
        cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))
        updates = momentum_updates(cost, self.params, learning_rate, mu)

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # for evaluation and prediction
        pY_predict = self.forward_predict(thX)
        cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))
        prediction = self.predict(thX)
        cost_predict_op = theano.function(inputs=[thX, thY],
                                          outputs=[cost_predict, prediction])

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                train_op(Xbatch, Ybatch)

                if j % 50 == 0:
                    c, p = cost_predict_op(Xvalid, Yvalid)
                    costs.append(c)
                    e = error_rate(Yvalid, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                          "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #35
X = T.dmatrix()
y = T.ivector()

prepare_data = lambda x: (theano.shared(x[0].astype('float64')),
                          theano.shared(x[1].astype('int32')))
(training_x, training_y), (test_x, test_y), (validation_x, validation_y) = map(
    prepare_data, [train_set, test_set, valid_set])

W = theano.shared(numpy.zeros([dims, n_classes]))
b = theano.shared(numpy.zeros(n_classes))

y_hat = T.nnet.softmax(T.dot(X, W) + b)
y_pred = T.argmax(y_hat, axis=1)
test_error = T.mean(T.neq(y_pred, y))
training_error = -T.mean(T.log(y_hat)[T.arange(y.shape[0]), y])

learning_rate = 0.2
params = [W, b]
beta = .9
updates = []
for p in params:
    ms = theano.shared(1. + 0. * p.get_value())
    updates += [
        (p, p - learning_rate * T.grad(training_error, p) / T.sqrt(ms)),
        (ms, beta * ms + (1 - beta) * T.sqr(T.grad(training_error, p)))
    ]

idx = T.ivector()
training_function = theano.function(inputs=[idx],
                                    outputs=training_error,
                                    updates=updates,
                                    # assumed completion of the truncated call: take the
                                    # minibatch rows from the shared training data
                                    givens={X: training_x[idx],
                                            y: training_y[idx]})
Example #36
 def negative_log_likelihood(self, y):

      return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
Example #37
    def get_sessions(self,
                     environment,
                     session_length=10,
                     batch_size=None,
                     initial_env_states='zeros',
                     initial_observations='zeros',
                     initial_hidden='zeros',
                     experience_replay=False,
                     unroll_scan=True,
                     return_automatic_updates=False,
                     optimize_experience_replay=None,
                     **kwargs):
        """
        Returns history of agent interaction with environment for given number of turns:
        
        :param environment: an environment to interact with
        :type environment: BaseEnvironment
        :param session_length: how many turns of interaction shall there be for each batch
        :type session_length: int
        :param batch_size: amount of independent sessions [number or symbolic].
            irrelevant if there's at least one input or if you manually set any initial_*.

        :type batch_size: int or theano.tensor.TensorVariable

        :param experience_replay: whether or not to use experience replay if True, assumes environment to have
            a pre-defined sequence of observations and actions (as env.observations etc.)
            The agent will then observe the sequence of observations and will be forced to take the recorded actions
            via get_output(..., {action_layer: recorded_action}).
            Saves some time by directly using environment.observations (list of sequences) instead of calling
            environment.get_action_results via environment.as_layers(...).
            Note that if this parameter is false, agent will be allowed to pick any actions during experience replay

        :type experience_replay: bool

        :param unroll_scan: whether to use theano.scan or lasagne.utils.unroll_scan
        :param return_automatic_updates: whether to append automatic updates to returned tuple (as last element)

        
        :param kwargs: optional flags to be sent to NN when calling get_output (e.g. deterministic = True)
        :type kwargs: several kw flags (flag=value,flag2=value,...)
        
        :param initial_something: layers providing initial values for all variables at 0-th time step
            'zeros' default means filling variables with zeros
            Initial values are NOT included in history sequences

        :param optimize_experience_replay: deprecated, use experience_replay

        :returns: state_seq,observation_seq,hidden_seq,action_seq,policy_seq,
            for environment state, observation, hidden state, chosen actions and agent policy respectively
            each of them having dimensions of [batch_i,seq_i,...]
            time synchronization policy:
            env_states[:,i] was observed as observation[:,i] BASED ON WHICH agent generated his
            policy[:,i], resulting in action[:,i], and also updated his memory from hidden[:,i-1] to hidden[:,i]

        :rtype: tuple of Theano tensors



        """

        if optimize_experience_replay is not None:
            experience_replay = optimize_experience_replay
            warn(
                "optimize_experience_replay is deprecated and will be removed in 1.0.2. Use experience_replay parameter."
            )

        env = environment

        if experience_replay:
            if not hasattr(env, "observations") or not hasattr(env, "actions"):
                raise ValueError(
                    'if optimize_experience_replay is turned on, one must provide an environment with .observations'
                    'and .actions properties containing observations and actions to replay.'
                )
            if initial_env_states != 'zeros' or initial_observations != 'zeros':
                warn(
                    "In experience replay mode, initial env states and initial observations parameters are unused",
                    verbosity_level=2)

            if initial_hidden == 'zeros' and hasattr(
                    env, "preceding_agent_memories"):
                initial_hidden = getattr(env, "preceding_agent_memories")

            # create recurrence
            self.recurrence = self.as_replay_recurrence(
                environment=environment,
                session_length=session_length,
                initial_hidden=initial_hidden,
                unroll_scan=unroll_scan,
                **kwargs)

        else:
            if isinstance(env, SessionPoolEnvironment) or isinstance(
                    env, SessionBatchEnvironment):
                warn(
                    "You are using an experience replay environment as a normal environment. "
                    "This will work, but you can get a free performance boost "
                    "by passing experience_replay=True to .get_sessions",
                    verbosity_level=2)

            # create recurrence in active mode (using environment.get_action_results)
            self.recurrence = self.as_recurrence(
                environment=environment,
                session_length=session_length,
                batch_size=batch_size,
                initial_env_states=initial_env_states,
                initial_observations=initial_observations,
                initial_hidden=initial_hidden,
                unroll_scan=unroll_scan,
                **kwargs)
        state_layers_dict, output_layers = self.recurrence.get_sequence_layers()

        # convert sequence layers into actual theano variables
        theano_expressions = lasagne.layers.get_output(
            list(state_layers_dict.values()) + list(output_layers))

        n_states = len(state_layers_dict)
        states_list, outputs = theano_expressions[:n_states], theano_expressions[n_states:]

        if experience_replay:
            assert len(states_list) == len(self.agent_states)
            agent_states = states_list
            env_states = [T.arange(session_length)]
            observations = env.observations
        else:
            # sort sequences into categories
            agent_states, env_states, observations = \
                unpack_list(states_list, [len(self.agent_states), len(env.state_shapes), len(env.observation_shapes)])

        policy, actions = unpack_list(
            outputs,
            [len(self.policy), len(self.action_layers)])

        agent_states = OrderedDict(
            list(zip(list(self.agent_states.keys()), agent_states)))

        # if user asked for single value and not one-element list, unpack the list
        if type(environment.state_shapes) not in supported_sequences:
            env_states = env_states[0]
        if self.single_observation:
            observations = observations[0]

        if self.single_action:
            actions = actions[0]
        if self.single_policy:
            policy = policy[0]

        ret_tuple = env_states, observations, agent_states, actions, policy

        if return_automatic_updates:
            ret_tuple += (self.get_automatic_updates(), )

        if unroll_scan and return_automatic_updates:
            warn(
                "return_automatic_updates useful when and only when unroll_scan == False",
                verbosity_level=2)

        return ret_tuple
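A minimal usage sketch for the method above (not part of the original source): `agent` and `pool_env` are hypothetical names for an object exposing get_sessions and a SessionPoolEnvironment carrying recorded .observations and .actions; only the keyword arguments come from the docstring.

env_states, observations, agent_states, actions, policy = agent.get_sessions(
    pool_env,
    session_length=10,
    experience_replay=True,   # replay pool_env.observations / pool_env.actions
)
# each returned tensor is indexed as [batch_i, seq_i, ...]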
Example #38
0
 def negative_log_likelihood(self):
     self.prob_of_y_given_x = T.nnet.softmax(self.x)
     return -T.mean(
         T.log(self.prob_of_y_given_x)[T.arange(self.y.shape[0]), self.y])
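The `T.arange(n), y` indexing above is the pattern used throughout these examples to pick each row's log-probability of its true class. A small self-contained sketch with illustrative values, assuming only that Theano and NumPy are installed:

import numpy as np
import theano
import theano.tensor as T

x = T.dmatrix('x')   # unnormalized scores, shape (n, k)
y = T.ivector('y')   # integer class labels, shape (n,)
p = T.nnet.softmax(x)
# T.log(p)[T.arange(n), y] selects log p[i, y[i]] for every row i,
# so the negative mean is the cross-entropy / negative log-likelihood
nll = -T.mean(T.log(p)[T.arange(y.shape[0]), y])
f = theano.function([x, y], nll)

x_val = np.random.randn(4, 3)
y_val = np.array([0, 2, 1, 2], dtype='int32')
print(f(x_val, y_val))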
Example #39
0
    def fit(self,
            X,
            Y,
            learning_rate=1e-2,
            mu=0.99,
            reg=1e-12,
            epochs=400,
            batch_sz=20,
            print_period=1,
            show_fig=False):

        # X = X.astype(np.float32)
        Y = Y.astype(np.int32)

        # initialize hidden layers
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1
        W = init_weight(M1, K)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # for momentum
        dparams = [
            theano.shared(np.zeros(p.get_value().shape)) for p in self.params
        ]

        # for rmsprop
        cache = [
            theano.shared(np.zeros(p.get_value().shape)) for p in self.params
        ]

        # set up theano functions and variables
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY = self.forward(thX)

        rcost = reg * T.sum([(p * p).sum() for p in self.params])
        cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
        prediction = self.predict(thX)
        grads = T.grad(cost, self.params)

        # momentum only
        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates,
        )

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                c, p = train_op(Xbatch, Ybatch)

                if j % print_period == 0:
                    costs.append(c)
                    e = np.mean(Ybatch != p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                          "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #40
0
def make_one_hot(label, dim):
    num = label.shape[0]
    one_hot = T.zeros((num, dim), theano.config.floatX)
    one_hot = T.set_subtensor(one_hot[T.arange(num), label], 1.)
    return one_hot
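A short, hedged usage sketch for make_one_hot above; `label_var` is an illustrative name, and the result is a (3, 4) floatX matrix with a single 1 per row.

import numpy as np
import theano
import theano.tensor as T

label_var = T.ivector('label')
one_hot_fn = theano.function([label_var], make_one_hot(label_var, 4))
# rows are one-hot encodings of labels 0, 2 and 3
print(one_hot_fn(np.array([0, 2, 3], dtype='int32')))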
Example #41
0
    def fit(self,
            X,
            Y,
            Xvalid,
            Yvalid,
            learning_rate=1e-3,
            mu=0.99,
            decay=0.999,
            reg=1e-3,
            eps=1e-8,
            epochs=10,
            batch_sz=100,
            show_fig=False):
        # Deliberately cast all hyperparameters to float32, otherwise Theano raises an error;
        # the other Theano ANN classes don't seem to need this, not sure why.
        learning_rate = np.float32(learning_rate)
        mu = np.float32(mu)
        decay = np.float32(decay)
        reg = np.float32(reg)

        # step1 get the data
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)  # so the data can be used on the GPU
        Y = Y.astype(np.int32)
        Xvalid = Xvalid.astype(np.float32)
        Yvalid = Yvalid.astype(np.int32)

        # initialize each layer and parameters of NN
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []  # this list holds the HiddenLayer objects
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:  # turn the sizes passed to the constructor into actual HiddenLayer objects
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1

        # initialize logistic regression layer
        W, b = init_weight_and_bias(M1, K)  # the final output layer (the last logistic regression layer)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect all the parameters that we are going to update with gradient descent
        self.params = [self.W, self.b]  # start with the final output layer
        for h in self.hidden_layers:
            self.params += h.params  # appended in order: hidden layer 1, hidden layer 2, ...

        # for momentum, we need to create zero matrix for each layer
        dparams = [
            theano.shared(np.zeros_like(p.get_value(), dtype=np.float32))
            for p in self.params
        ]

        # for rmsprop, we need to create a cache
        cache = [
            theano.shared(np.ones_like(p.get_value(), dtype=np.float32))
            for p in self.params
        ]

        # step2. model
        # theano variables
        thX = T.fmatrix('X')  # data input (matrix)
        thY = T.ivector('Y')  # target ( vector)
        pY = self.forward(thX)  # output of forward(); the result is a matrix

        # step3. cost function
        # define theano's computation graph
        rcost = reg * T.sum([(p * p).sum() for p in self.params])
        cost = -T.mean(
            T.log(pY[T.arange(thY.shape[0]), thY])
        ) + rcost  # thY here is a label vector, so we do not need y2indicator (a slightly unusual formulation)

        # step4. solver
        # define theano's operations
        grads = T.grad(cost, self.params)
        updates = [
            (c, decay * c + (np.float32(1.0) - decay) * g * g)
            for c, g in zip(cache, grads)
        ] + [(p, p + mu * dp - learning_rate * g / T.sqrt(c + eps))
             for p, c, dp, g in zip(self.params, cache, dparams, grads)
             ] + [(dp, mu * dp - learning_rate * g / T.sqrt(c + eps))
                  for c, dp, g in zip(cache, dparams, grads)]

        # updates = [
        #     (c, decay*c + (np.float32(1.0)-decay)*T.grad(cost, Dp)*T.grad(cost, p)) for p, c in zip(self.params, cache)
        # ] + [
        #     (p, p + mu*dp - learning_rate*T.grad(cost, p) / T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
        # ] + [
        #     (dp, mu*dp - learning_rate*T.grad(cost, p) / T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
        # ]

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # for evaluation and prediction
        prediction = self.th_prediction(thX)
        cost_prediction_op = theano.function(inputs=[thX, thY],
                                             outputs=[cost, prediction])

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz), ]

                train_op(Xbatch, Ybatch)

                if j % 20 == 0:
                    cost_valid, preds_valid = cost_prediction_op(
                        Xvalid, Yvalid)
                    costs.append(cost_valid)
                    e = error_rate(Yvalid, preds_valid)
                    print("i:", i, " j,", j, " nb:", n_batches, " cost:",
                          cost_valid, " error_rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #42
0
 def cost(self, net):
     "Return the log-likelihood cost."
     return -T.mean(
         T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y])
Example #43
0
def tree_lstm_layer(tparams, inputs, options, prefix='tree_lstm', **kwargs):
    state_below, mask, left_mask, right_mask = inputs

    # state_below: #step x #sample x dim_emb
    # mask: #step x #sample
    # left_mask: #step x #sample x #step
    # right_mask: #step x #sample x #step

    nsteps = state_below.shape[0]
    dim = tparams[_p(prefix, 'U_l')].shape[0]

    n_samples = state_below.shape[1]
    init_state = tensor.alloc(0., n_samples, nsteps, dim)
    init_memory = tensor.alloc(0., n_samples, nsteps, dim)

    # use the slice to calculate all the different gates
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        elif _x.ndim == 2:
            return _x[:, n * dim:(n + 1) * dim]
        return _x[n * dim:(n + 1) * dim]

    # one time step of the lstm
    def _step(m_, x_, left_mask_, right_mask_, counter_, h_, c_):

        # zero out the input unless this is a leaf node
        # flag = tensor.switch(tensor.eq(tensor.sum(left_mask_, axis=1) + tensor.sum(right_mask_, axis=1), 0), 1., 0.)
        # x_ = x_ * flag[:, None]

        preact_l = tensor.dot(tensor.sum(left_mask_[:, :, None] * h_, axis=1),
                              tparams[_p(prefix, 'U_l')])
        preact_r = tensor.dot(tensor.sum(right_mask_[:, :, None] * h_, axis=1),
                              tparams[_p(prefix, 'U_r')])

        x_ = concatenate([
            _slice(x_, 0, dim),
            _slice(x_, 1, dim),
            _slice(x_, 1, dim),
            _slice(x_, 2, dim),
            _slice(x_, 3, dim)
        ],
                         axis=1)
        preact = preact_l + preact_r + x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        fl = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        fr = tensor.nnet.sigmoid(_slice(preact, 2, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 3, dim))
        u = tensor.tanh(_slice(preact, 4, dim))

        c_temp = fl * tensor.sum(left_mask_[:, :, None] * c_, axis=1) \
                    + fr * tensor.sum(right_mask_[:, :, None] * c_, axis=1) \
                    + i * u
        h_temp = o * tensor.tanh(c_temp)

        h = tensor.set_subtensor(h_[:, counter_, :], h_temp)
        c = tensor.set_subtensor(c_[:, counter_, :], c_temp)

        c = m_[:, None, None] * c + (1. - m_)[:, None, None] * c_
        h = m_[:, None, None] * h + (1. - m_)[:, None, None] * h_

        return h, c, i, fl, fr, o

    state_below = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]

    rval, updates = theano.scan(
        fn=_step,
        sequences=[
            mask, state_below, left_mask, right_mask,
            tensor.arange(0, nsteps)
        ],
        outputs_info=[init_state, init_memory, None, None, None, None],
        name=_p(prefix, '_layers'),
        profile=False)
    return rval
Example #44
0
    def fit(self,
            X,
            Y,
            Xvalid,
            Yvalid,
            learning_rate=1e-4,
            mu=0.9,
            decay=0.9,
            epochs=8,
            batch_sz=100,
            show_fig=False):
        '''
        Takes training data and test data (valid) at once, then trains and
        validates along the way. The hyperparameters learning_rate, mu,
        decay, epochs (iterations = N//batch_sz * epochs), batch_sz and whether
        to display a figure are passed as optional arguments.
        '''
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid = Xvalid.astype(np.float32)
        Yvalid = Yvalid.astype(np.int32)

        self.rng = RandomStreams()

        # initialize hidden layers
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D  # first input layer is the number of features in X
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)  # layer ID is just the number
            self.hidden_layers.append(h)
            M1 = M2  # input layer to next layer is this layer.
            count += 1
        # output layer weights (last hidden layer to K output classes)
        W = np.random.randn(M1, K) * np.sqrt(2.0 / M1)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # set up theano functions and variables
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY_train = self.forward_train(thX)  # function to calc prob Y given X

        # this cost is for training
        cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))

        # gradients wrt each param
        grads = T.grad(cost, self.params)

        # for momentum
        '''
        np.zeros_like(array) returns an array (or matrix) of the same shape and
        type as the given array. Very cool, never seen this before.
        '''
        dparams = [
            theano.shared(np.zeros_like(p.get_value())) for p in self.params
        ]

        # for rmsprop, initialize cache as 1
        cache = [
            theano.shared(np.ones_like(p.get_value())) for p in self.params
        ]
        '''
        Noting for myself that I've never seen this way of using zip to loop
        through multiple lists/arrays with the same indices simultaneously.
        Makes a lot of sense now; I should see where I can use this to turn
        index-based loops in my code into element-wise list comprehensions.
        '''
        # these are the functions for updating the variables of
        # dparams (momentum) and cache.
        new_cache = [
            decay * c + (1 - decay) * g * g
            for p, c, g in zip(self.params, cache, grads)
        ]
        new_dparams = [
            mu * dp - learning_rate * g / T.sqrt(new_c + 1e-10)
            for p, new_c, dp, g in zip(self.params, new_cache, dparams, grads)
        ]
        '''
        Using zip to create lists of tuples of the variables themselves, and
        the functions for updating them (cache, momentum params and params),
        where params are weights (W) and biases (b) for each layer.
        '''
        updates = [(c, new_c) for c, new_c in zip(cache, new_cache)] + [
            (dp, new_dp) for dp, new_dp in zip(dparams, new_dparams)
        ] + [(p, p + new_dp) for p, new_dp in zip(self.params, new_dparams)]

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # for evaluation and prediction, more theano graph set-up with tensors
        # still no values yet in any of these. Training loop next!
        pY_predict = self.forward_predict(thX)
        cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))
        prediction = self.predict(thX)
        cost_predict_op = theano.function(inputs=[thX, thY],
                                          outputs=[cost_predict, prediction])

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                # theano function defined above that does all the work.
                # takes the data (like feed_dict in tf). The update calcs were
                # given to it above as a list for all layers.
                train_op(Xbatch, Ybatch)

                if j % 50 == 0:
                    c, p = cost_predict_op(Xvalid, Yvalid)
                    costs.append(c)
                    e = error_rate(Yvalid, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                          "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
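A hedged usage sketch for the fit() signature documented above; `ANN`, its constructor argument and the data arrays are hypothetical stand-ins, and only the keyword arguments come from the docstring.

model = ANN([2000, 1000, 500])   # hidden_layer_sizes (illustrative)
model.fit(Xtrain, Ytrain, Xvalid, Yvalid,
          learning_rate=1e-4, mu=0.9, decay=0.9,
          epochs=8, batch_sz=100, show_fig=True)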
Example #45
0
 def sum_ll(self, y):
     """
 Sum log-lilelihood
 """
     return T.sum(self.log_p_y_given_x[T.arange(y.shape[0]), y])
Example #46
0
def build_model(shared_params, options):
    trng = RandomStreams(1234)
    drop_ratio = options['drop_ratio']
    batch_size = options['batch_size']
    n_dim = options['n_dim']

    w_emb = shared_params['w_emb']

    dropout = theano.shared(numpy.float32(0.))
    image_feat = T.ftensor3('image_feat')
    # T x batch_size
    input_idx = T.imatrix('input_idx')
    input_mask = T.matrix('input_mask')
    # label is the TRUE label
    label = T.ivector('label')

    empty_word = theano.shared(value=np.zeros((1, options['n_emb']),
                                              dtype='float32'),
                               name='empty_word')
    w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']], axis=0)
    input_emb = w_emb_extend[input_idx]

    # get the transformed image feature
    h_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    c_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))

    if options['sent_drop']:
        input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio)

    h_from_lstm, c_encode = lstm_layer(shared_params,
                                       input_emb,
                                       input_mask,
                                       h_0,
                                       c_0,
                                       options,
                                       prefix='sent_lstm')
    # pick the last one as encoder

    Y = fflayer(shared_params,
                image_feat,
                options,
                prefix='image_mlp',
                act_func=options.get('image_mlp_act', 'tanh'))
    r_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    r = wbw_attention_layer(shared_params,
                            Y,
                            h_from_lstm,
                            input_mask,
                            r_0,
                            options,
                            return_final=True)

    h_star = T.tanh(
        T.dot(r, shared_params['W_p_w']) +
        T.dot(h_from_lstm[-1], shared_params['W_x_w']))
    combined_hidden = fflayer(shared_params,
                              h_star,
                              options,
                              prefix='scale_to_softmax',
                              act_func='linear')

    # drop the image output
    prob = T.nnet.softmax(combined_hidden)
    prob_y = prob[T.arange(prob.shape[0]), label]
    pred_label = T.argmax(prob, axis=1)
    # sum or mean?
    cost = -T.mean(T.log(prob_y))
    accu = T.mean(T.eq(pred_label, label))

    return image_feat, input_idx, input_mask, \
        label, dropout, cost, accu
Example #47
0
    def fit(self,
            X,
            Y,
            learning_rate=0.1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=100,
            show_fig=False):
        D = X[0].shape[1]  # X is of size N x T(n) x D
        K = len(set(Y.flatten()))  # K is the total number of classes
        N = len(Y)  # N is the total number of individuals
        M = self.M
        self.f = activation

        # initial weights (3 Weights, 2 biases, 1 hidden state)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)  # initial hidden state
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        # make them theano shared (3 Weights, 2 biases, 1 hidden state)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        # thX is 2-dimensional (T x D); in the parity problem D=1, e.g. thX = [[0],[1],[1],[0]]
        thX = T.fmatrix('X')
        # in the parity problem thY is also a sequence, e.g. thY = [0,1,0,0]
        thY = T.ivector('Y')

        def recurrence(x_t, h_t1):  # do calculation at each recurrent unit
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(  # do calculation at each recurrent unit
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=thX,  # e.g. [[0],[1],[1],[0]]
            n_steps=thX.shape[0],  # for each time
        )

        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        # cost for one sequence
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]),
                                  thY]))  # cross-entropy loss
        grads = T.grad(cost, self.params)  # gradient
        dparams = [theano.shared(p.get_value() * 0)
                   for p in self.params]  # momentum

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        self.predict_op = theano.function(
            inputs=[thX], outputs=prediction)  # forward feeding one time
        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction, y],  # forwardprop to get loss
            updates=updates  # backprop to update params
        )

        # total cost for all sequences
        costs = []
        # main training loop
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            # train one sequence at a time, so we do not specify batch_size (batch_size = 1)
            for j in range(N):
                # each time we train a sequence, we update all params
                c, p, rout = self.train_op(X[j], Y[j])
                # print "p:", p
                cost += c
                # for the parity problem we only need to check the last element of p and Y
                if p[-1] == Y[j, -1]:
                    n_correct += 1
            print("shape y:", rout.shape)
            print("i:", i, "cost:", cost, "accuracy:", (float(n_correct) / N))
            costs.append(cost)
            if n_correct == N:
                break

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #48
0
 def ind_ll(self, y):
     """
 Individual log-lilelihood
 """
     return self.log_p_y_given_x[T.arange(y.shape[0]), y]
Example #49
0
    def fit(self,
            X,
            learning_rate=10e-1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=500,
            show_fig=False):
        N = len(X)
        D = self.D
        M = self.M
        V = self.V
        self.f = activation

        # initial weights
        We = init_weight(V, D)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, V)
        bo = np.zeros(V)

        # make them theano shared
        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [
            self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo
        ]

        thX = T.ivector('X')
        Ei = self.We[thX]  # will be a TxD matrix
        thY = T.ivector('Y')

        # sentence input:
        # [START, w1, w2, ..., wn]
        # sentence target:
        # [w1,    w2, w3, ..., END]

        def recurrence(x_t, h_t1):
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=Ei,
            n_steps=Ei.shape[0],
        )

        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(inputs=[thX, thY],
                                        outputs=[cost, prediction],
                                        updates=updates)

        costs = []
        n_total = sum((len(sentence) + 1) for sentence in X)
        for i in xrange(epochs):
            X = shuffle(X)
            n_correct = 0
            cost = 0
            for j in xrange(N):
                # problem! many words --> END token are overrepresented
                # result: generated lines will be very short
                # we will try to fix in a later iteration
                # BAD! magic numbers 0 and 1...
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]

                # we set 0 to start and 1 to end
                c, p = self.train_op(input_sequence, output_sequence)
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
            print "i:", i, "cost:", cost, "correct rate:", (float(n_correct) /
                                                            n_total)
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #50
0
 def nll(self, y):
     """
 Mean negative log-lilelihood
 """
     return -T.mean(self.log_p_y_given_x[T.arange(y.shape[0]), y])
Example #51
0
def run_cnn(exp_name,
            dataset,
            embedding,
            log_fn,
            perf_fn,
            emb_dm=100,
            batch_size=100,
            filter_hs=[1, 2, 3],
            hidden_units=[200, 100, 11],
            type_hidden_units=[200, 100, 6],
            dropout_rate=0.5,
            shuffle_batch=True,
            n_epochs=300,
            lr_decay=0.95,
            activation=ReLU,
            sqr_norm_lim=9,
            non_static=True,
            print_freq=5,
            sen_reg=False,
            L2=False):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements[(train_x, train_y), 
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each paralle cnn layer
    :dropout_rate: dropout rate for full connected layers
    :n_epochs: the max number of iterations
    
    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)

    input_height = len(dataset[0][0][0][0])
    num_sens = len(dataset[0][0][0])
    print "--input height ", input_height
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    x = T.tensor3("x")
    type_y = T.ivector("y_type")
    pop_y = T.ivector("y_pop")

    words = shared(value=np.asarray(embedding, dtype=theano.config.floatX),
                   name="embedding",
                   borrow=True)

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
                        updates=[(words,
                                  T.set_subtensor(words[0, :],
                                                  zero_vector_tensor))])

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm))

    #########################
    # Construct Sen Vec #####
    #########################
    conv_layers = []
    filter_shape = (num_maps, 1, filter_hs[0], emb_dm)
    pool_size = (input_height - filter_hs[0] + 1, 1)
    conv_layer = nn.ConvPoolLayer(rng,
                                  input=layer0_input,
                                  input_shape=None,
                                  filter_shape=filter_shape,
                                  pool_size=pool_size,
                                  activation=activation)

    # make the sentence vector matrix
    sen_vecs = conv_layer.output.reshape((x.shape[0] * x.shape[1], num_maps))
    conv_layers.append(conv_layer)

    ########################
    ## Task 1: population ###
    ########################
    pop_layer_sizes = zip(hidden_units, hidden_units[1:])
    pop_layer_input = sen_vecs
    pop_drop_input = sen_vecs
    pop_hidden_outs = []
    pop_drop_outs = []
    pop_hidden_layers = []
    pop_drop_layers = []
    droprate = 0.5
    for layer_size in pop_layer_sizes[:-1]:
        U_value = np.random.random(layer_size).astype(theano.config.floatX)
        b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX)

        U = theano.shared(U_value, borrow=True, name="U")
        b = theano.shared(b_value, borrow=True, name="b")

        pop_hidden_layer = nn.HiddenLayer(rng, pop_layer_input, layer_size[0],
                                          layer_size[1], ReLU,
                                          U * (1 - droprate), b)
        pop_drop_hidden_layer = nn.DropoutHiddenLayer(rng, pop_drop_input,
                                                      layer_size[0],
                                                      layer_size[1], ReLU,
                                                      droprate, U, b)

        pop_hidden_layers.append(pop_hidden_layer)
        pop_drop_layers.append(pop_drop_hidden_layer)

        pop_hidden_out = pop_hidden_layer.output
        pop_drop_out = pop_drop_hidden_layer.output

        pop_layer_input = pop_hidden_out
        pop_drop_input = pop_drop_out

        pop_hidden_outs.append(pop_hidden_out)
        pop_drop_outs.append(pop_drop_out)

    # construct pop classifier
    n_in, n_out = pop_layer_sizes[-1]
    W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX)
    b_value = np.zeros((n_out, ), dtype=theano.config.floatX)

    pop_W = theano.shared(W_value, borrow=True, name="pop_W")
    pop_b = theano.shared(b_value, borrow=True, name="pop_b")

    pop_act = T.dot(pop_hidden_outs[-1], pop_W * (1 - droprate)) + pop_b
    pop_drop_act = T.dot(pop_drop_outs[-1], pop_W) + pop_b

    sen_pop_probs = T.nnet.softmax(pop_act)
    sen_drop_pop_probs = T.nnet.softmax(pop_drop_act)

    pop_probs = T.mean(sen_pop_probs.reshape((x.shape[0], x.shape[1], n_out)),
                       axis=1)
    pop_drop_probs = T.mean(sen_drop_pop_probs.reshape(
        (x.shape[0], x.shape[1], n_out)),
                            axis=1)

    pop_y_pred = T.argmax(pop_probs, axis=1)
    pop_drop_y_pred = T.argmax(pop_drop_probs, axis=1)

    pop_neg_loglikelihood = -T.mean(
        T.log(pop_probs)[T.arange(pop_y.shape[0]), pop_y])
    pop_drop_neg_loglikelihood = -T.mean(
        T.log(pop_drop_probs)[T.arange(pop_y.shape[0]), pop_y])

    pop_errors = T.mean(T.neq(pop_y_pred, pop_y))
    pop_errors_detail = T.neq(pop_y_pred, pop_y)

    pop_cost = pop_neg_loglikelihood
    pop_drop_cost = pop_drop_neg_loglikelihood

    ########################
    ## Task 2: event type ###
    ########################
    type_layer_sizes = zip(type_hidden_units, type_hidden_units[1:])
    type_layer_input = sen_vecs
    type_drop_input = sen_vecs
    type_hidden_outs = []
    type_drop_outs = []
    type_hidden_layers = []
    type_drop_layers = []
    droprate = 0.5
    for layer_size in type_layer_sizes[:-1]:
        U_value = np.random.random(layer_size).astype(theano.config.floatX)
        b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX)

        U = theano.shared(U_value, borrow=True, name="U")
        b = theano.shared(b_value, borrow=True, name="b")

        type_hidden_layer = nn.HiddenLayer(rng, type_layer_input,
                                           layer_size[0], layer_size[1], ReLU,
                                           U * (1 - droprate), b)
        type_drop_hidden_layer = nn.DropoutHiddenLayer(rng, type_drop_input,
                                                       layer_size[0],
                                                       layer_size[1], ReLU,
                                                       droprate, U, b)

        type_hidden_layers.append(type_hidden_layer)
        type_drop_layers.append(type_drop_hidden_layer)

        type_hidden_out = type_hidden_layer.output
        type_drop_out = type_drop_hidden_layer.output

        type_layer_input = type_hidden_out
        type_drop_input = type_drop_out

        type_hidden_outs.append(type_hidden_out)
        type_drop_outs.append(type_drop_out)

    # construct type classifier
    n_in, n_out = type_layer_sizes[-1]
    W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX)
    b_value = np.zeros((n_out, ), dtype=theano.config.floatX)

    type_W = theano.shared(W_value, borrow=True, name="type_W")
    type_b = theano.shared(b_value, borrow=True, name="type_b")

    type_act = T.dot(type_hidden_outs[-1], type_W * (1 - droprate)) + type_b
    type_drop_act = T.dot(type_drop_outs[-1], type_W) + type_b

    #type_probs = T.nnet.softmax(type_max_act)
    #type_drop_probs = T.nnet.softmax(type_drop_max_act)

    sen_type_probs = T.nnet.softmax(type_act)
    sen_drop_type_probs = T.nnet.softmax(type_drop_act)

    type_probs = T.mean(sen_type_probs.reshape(
        (x.shape[0], x.shape[1], n_out)),
                        axis=1)
    type_drop_probs = T.mean(sen_drop_type_probs.reshape(
        (x.shape[0], x.shape[1], n_out)),
                             axis=1)

    type_y_pred = T.argmax(type_probs, axis=1)
    type_drop_y_pred = T.argmax(type_drop_probs, axis=1)

    type_neg_loglikelihood = -T.mean(
        T.log(type_probs)[T.arange(type_y.shape[0]), type_y])
    type_drop_neg_loglikelihood = -T.mean(
        T.log(type_drop_probs)[T.arange(type_y.shape[0]), type_y])

    type_errors = T.mean(T.neq(type_y_pred, type_y))
    type_errors_detail = T.neq(type_y_pred, type_y)

    type_cost = type_neg_loglikelihood
    type_drop_cost = type_drop_neg_loglikelihood

    ##################################
    # Collect all the parameters #####
    ##################################
    params = []
    # convolution layer params
    for conv_layer in conv_layers:
        params += conv_layer.params

    # params for population task
    for layer in pop_drop_layers:
        params += layer.params

    params.append(pop_W)
    params.append(pop_b)

    # params for event type task
    for layer in type_drop_layers:
        params += layer.params

    params.append(type_W)
    params.append(type_b)

    if non_static:
        params.append(words)

    total_cost = pop_cost + type_cost
    total_drop_cost = pop_drop_cost + type_drop_cost

    if L2:
        l2_norm = 0.1 * T.sum(pop_W**2) + 0.1 * T.sum(type_W**2)
        for drop_layer in type_drop_layers:
            l2_norm += 0.1 * T.sum(drop_layer.W**2)

        for drop_layer in pop_drop_layers:
            l2_norm += 0.1 * T.sum(drop_layer.W**2)
        total_cost += l2_norm
        total_drop_cost += l2_norm

    total_grad_updates = sgd_updates_adadelta(params, total_drop_cost,
                                              lr_decay, 1e-6, sqr_norm_lim)

    total_preds = [pop_y_pred, type_y_pred]
    total_errors_details = [pop_errors_detail, type_errors_detail]
    total_out = total_preds + total_errors_details

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"
    np.random.seed(1234)

    train_x, train_pop_y, train_type_y = shared_dataset(dataset[0])
    valid_x, valid_pop_y, valid_type_y = shared_dataset(dataset[1])
    test_x, test_pop_y, test_type_y = shared_dataset(dataset[2])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function(
        [index],
        total_drop_cost,
        updates=total_grad_updates,
        givens={
            x: train_x[index * batch_size:(index + 1) * batch_size],
            pop_y: train_pop_y[index * batch_size:(index + 1) * batch_size],
            type_y: train_type_y[index * batch_size:(index + 1) * batch_size]
        })

    valid_train_func = function(
        [index],
        total_drop_cost,
        updates=total_grad_updates,
        givens={
            x: valid_x[index * batch_size:(index + 1) * batch_size],
            pop_y: valid_pop_y[index * batch_size:(index + 1) * batch_size],
            type_y: valid_type_y[index * batch_size:(index + 1) * batch_size]
        })

    test_pred_detail = function(
        [index],
        total_out,
        givens={
            x: test_x[index * batch_size:(index + 1) * batch_size],
            pop_y: test_pop_y[index * batch_size:(index + 1) * batch_size],
            type_y: test_type_y[index * batch_size:(index + 1) * batch_size]
        })

    # apply early stop strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005

    n_valid = len(dataset[1][0])
    n_test = len(dataset[2][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False

    log_file = open(log_fn, 'w')

    print "Start to train the model....."

    total_score = 0.0
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        # do validation
        valid_cost = [
            valid_train_func(i)
            for i in np.random.permutation(xrange(n_valid_batches))
        ]

        if epoch % print_freq == 0:
            # do test
            pop_preds = []
            type_preds = []
            pop_errors = []
            type_errors = []
            pop_sens = []
            type_sens = []

            for i in xrange(n_test_batches):
                test_pop_pred, test_type_pred, test_pop_error, test_type_error = test_pred_detail(
                    i)

                pop_preds.append(test_pop_pred)
                type_preds.append(test_type_pred)
                pop_errors.append(test_pop_error)
                type_errors.append(test_type_error)

            pop_preds = np.concatenate(pop_preds)
            type_preds = np.concatenate(type_preds)
            pop_errors = np.concatenate(pop_errors)
            type_errors = np.concatenate(type_errors)

            pop_perf = 1 - np.mean(pop_errors)
            type_perf = 1 - np.mean(type_errors)

            # dump the predictions and the chosen sentences
            with open(
                    os.path.join(perf_fn,
                                 "%s_%d.pop_pred" % (exp_name, epoch)),
                    'w') as epf:
                for p in pop_preds:
                    epf.write("%d\n" % int(p))

            with open(
                    os.path.join(perf_fn,
                                 "%s_%d.type_pred" % (exp_name, epoch)),
                    'w') as epf:
                for p in type_preds:
                    epf.write("%d\n" % int(p))

            message = "Epoch %d test pop perf %f, type perf %f, training_cost %f" % (
                epoch, pop_perf, type_perf, np.mean(costs))
            print message
            log_file.write(message + "\n")
            log_file.flush()

            if (pop_perf + type_perf) > total_score:
                total_score = pop_perf + type_perf
                # save the model
                model_name = os.path.join(
                    perf_fn, "%s_%d.best_model" % (exp_name, epoch))
                with open(model_name, 'wb') as mn:
                    for param in params:
                        cPickle.dump(param.get_value(), mn)

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % (
            (end_time - start_time) / 60.)

    # output the final model params
    print "Output the final model"
    model_name = os.path.join(perf_fn, "%s_%d.final_model" % (exp_name, epoch))
    with open(model_name, 'wb') as mn:
        for param in params:
            cPickle.dump(param.get_value(), mn)

    log_file.flush()
    log_file.close()
Example #52
0
    def build(self):
        # description string: #words x #samples
        x = tensor.matrix('x', dtype=INT)
        x_mask = tensor.matrix('x_mask', dtype=FLOAT)
        y1 = tensor.matrix('y1', dtype=INT)
        y1_mask = tensor.matrix('y1_mask', dtype=FLOAT)
        y2 = tensor.matrix('y2', dtype=INT)
        y2_mask = tensor.matrix('y2_mask', dtype=FLOAT)

        self.inputs = OrderedDict()
        self.inputs['x'] = x
        self.inputs['x_mask'] = x_mask
        self.inputs['y1'] = y1
        self.inputs['y2'] = y2
        self.inputs['y1_mask'] = y1_mask
        self.inputs['y2_mask'] = y2_mask

        # for the backward rnn, we just need to invert x and x_mask
        xr = x[::-1]
        xr_mask = x_mask[::-1]

        n_timesteps = x.shape[0]
        n_timesteps_trg = y1.shape[0]
        n_timesteps_trgmult = y2.shape[0]
        n_samples = x.shape[1]

        # word embedding for forward rnn (source)
        emb = dropout(self.tparams['Wemb_enc'][x.flatten()], self.trng,
                      self.emb_dropout, self.use_dropout)
        emb = emb.reshape([n_timesteps, n_samples, self.embedding_dim])
        proj = get_new_layer(self.enc_type)[1](self.tparams,
                                               emb,
                                               prefix='encoder',
                                               mask=x_mask,
                                               layernorm=self.lnorm)

        # word embedding for backward rnn (source)
        embr = dropout(self.tparams['Wemb_enc'][xr.flatten()], self.trng,
                       self.emb_dropout, self.use_dropout)
        embr = embr.reshape([n_timesteps, n_samples, self.embedding_dim])
        projr = get_new_layer(self.enc_type)[1](self.tparams,
                                                embr,
                                                prefix='encoder_r',
                                                mask=xr_mask,
                                                layernorm=self.lnorm)

        # context will be the concatenation of forward and backward rnns
        ctx = [
            tensor.concatenate([proj[0], projr[0][::-1]],
                               axis=proj[0].ndim - 1)
        ]

        for i in range(1, self.n_enc_layers):
            ctx = get_new_layer(self.enc_type)[1](self.tparams,
                                                  ctx[0],
                                                  prefix='deepencoder_%d' % i,
                                                  mask=x_mask,
                                                  layernorm=self.lnorm)

        # Apply dropout
        ctx = dropout(ctx[0], self.trng, self.ctx_dropout, self.use_dropout)

        if self.init_cgru == 'text':
            # mean of the context (across time) will be used to initialize decoder rnn
            ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:,
                                                                         None]
            init_state = get_new_layer('ff')[1](self.tparams,
                                                ctx_mean,
                                                prefix='ff_state',
                                                activ='tanh')
        else:
            # Assume zero-initialized decoder
            init_state = tensor.alloc(0., n_samples, self.rnn_dim)

        # word embedding (target), we will shift the target sequence one time step
        # to the right. This is done because of the bi-gram connections in the
        # readout and decoder rnn. The first target will be all zeros and we will
        emb_lem = self.tparams['Wemb_dec_lem'][y1.flatten()]
        emb_lem = emb_lem.reshape(
            [n_timesteps_trg, n_samples, self.embedding_dim])
        emb_lem_shifted = tensor.zeros_like(emb_lem)
        emb_lem_shifted = tensor.set_subtensor(emb_lem_shifted[1:],
                                               emb_lem[:-1])
        emb_lem = emb_lem_shifted

        emb_fact = self.tparams['Wemb_dec_fact'][y2.flatten()]
        emb_fact = emb_fact.reshape(
            [n_timesteps_trgmult, n_samples, self.embedding_dim])
        emb_fact_shifted = tensor.zeros_like(emb_fact)
        emb_fact_shifted = tensor.set_subtensor(emb_fact_shifted[1:],
                                                emb_fact[:-1])
        emb_fact = emb_fact_shifted

        # Concat the 2 embeddings
        emb_prev = tensor.concatenate([emb_lem, emb_fact], axis=2)

        # decoder - pass through the decoder conditional gru with attention
        proj = get_new_layer('gru_cond')[1](self.tparams,
                                            emb_prev,
                                            prefix='decoder',
                                            mask=y1_mask,
                                            context=ctx,
                                            context_mask=x_mask,
                                            one_step=False,
                                            init_state=init_state,
                                            layernorm=False)
        # hidden states of the decoder gru
        proj_h = proj[0]

        # weighted averages of context, generated by attention module
        ctxs = proj[1]

        # weights (alignment matrix)
        self.alphas = proj[2]

        # compute word probabilities
        logit_gru = get_new_layer('ff')[1](self.tparams,
                                           proj_h,
                                           prefix='ff_logit_gru',
                                           activ='linear')
        logit_ctx = get_new_layer('ff')[1](self.tparams,
                                           ctxs,
                                           prefix='ff_logit_ctx',
                                           activ='linear')
        logit_lem = get_new_layer('ff')[1](self.tparams,
                                           emb_lem,
                                           prefix='ff_logit_lem',
                                           activ='linear')
        logit_fact = get_new_layer('ff')[1](self.tparams,
                                            emb_fact,
                                            prefix='ff_logit_fact',
                                            activ='linear')

        logit1 = dropout(tanh(logit_gru + logit_lem + logit_ctx), self.trng,
                         self.out_dropout, self.use_dropout)
        logit2 = dropout(tanh(logit_gru + logit_fact + logit_ctx), self.trng,
                         self.out_dropout, self.use_dropout)

        if self.tied_trg_emb is False:
            logit_trg = get_new_layer('ff')[1](self.tparams,
                                               logit1,
                                               prefix='ff_logit_trg',
                                               activ='linear')
            logit_trgmult = get_new_layer('ff')[1](self.tparams,
                                                   logit2,
                                                   prefix='ff_logit_trgmult',
                                                   activ='linear')

        else:
            logit_trg = tensor.dot(logit1, self.tparams['Wemb_dec_lem'].T)
            logit_trgmult = tensor.dot(logit2, self.tparams['Wemb_dec_fact'].T)

        logit_trg_shp = logit_trg.shape
        logit_trgmult_shp = logit_trgmult.shape

        # Apply logsoftmax (stable version)
        log_trg_probs = -tensor.nnet.logsoftmax(
            logit_trg.reshape(
                [logit_trg_shp[0] * logit_trg_shp[1], logit_trg_shp[2]]))
        log_trgmult_probs = -tensor.nnet.logsoftmax(
            logit_trgmult.reshape([
                logit_trgmult_shp[0] * logit_trgmult_shp[1],
                logit_trgmult_shp[2]
            ]))

        # cost
        y1_flat = y1.flatten()
        y2_flat = y2.flatten()
        y1_flat_idx = tensor.arange(
            y1_flat.shape[0]) * self.n_words_trg1 + y1_flat
        y2_flat_idx = tensor.arange(
            y2_flat.shape[0]) * self.n_words_trg2 + y2_flat

        cost_trg = log_trg_probs.flatten()[y1_flat_idx]
        cost_trg = cost_trg.reshape([n_timesteps_trg, n_samples])
        cost_trg = (cost_trg * y1_mask).sum(0)

        cost_trgmult = log_trgmult_probs.flatten()[y2_flat_idx]
        cost_trgmult = cost_trgmult.reshape([n_timesteps_trgmult, n_samples])
        cost_trgmult = (cost_trgmult * y2_mask).sum(0)

        cost = cost_trg + cost_trgmult
        self.f_log_probs = theano.function(list(self.inputs.values()), cost)

        # For alpha regularization

        return cost
Example #53
0
    def step_backward(T2_lp1, char_lp1):
        char_l = T2_lp1[T.arange(N), T.cast(char_lp1, 'int32')]  # N

        return T.cast(char_l, 'float32')
Example #54
0
    def fit(self, trees, learning_rate=3*1e-3, mu=0.99, reg=1e-4, epochs=15, activation=T.nnet.relu, train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        self.f = activation
        N = len(trees)

        We = init_weight(V, D)
        Wh = np.random.randn(2, D, D) / np.sqrt(2 + D + D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wh, self.bh, self.Wo, self.bo]

        words = T.ivector('words')
        parents = T.ivector('parents')
        relations = T.ivector('relations')
        labels = T.ivector('labels')

        def recurrence(n, hiddens, words, parents, relations):
            w = words[n]
            # any non-word will have index -1
            # if T.ge(w, 0):
            #     hiddens = T.set_subtensor(hiddens[n], self.We[w])
            # else:
            #     hiddens = T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh))
            hiddens = T.switch(
                T.ge(w, 0),
                T.set_subtensor(hiddens[n], self.We[w]),
                T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh))
            )

            r = relations[n] # 0 = is_left, 1 = is_right
            p = parents[n] # parent idx
            # if T.ge(p, 0):
            #     # root will have parent -1
            #     hiddens = T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r]))
            hiddens = T.switch(
                T.ge(p, 0),
                T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r])),
                hiddens
            )
            return hiddens

        hiddens = T.zeros((words.shape[0], D))

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[hiddens],
            n_steps=words.shape[0],
            sequences=T.arange(words.shape[0]),
            non_sequences=[words, parents, relations],
        )

        # shape of h that is returned by scan is TxTxD
        # because hiddens is TxD, and it does the recurrence T times
        # technically this stores T times too much data
        py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        
        rcost = reg*T.mean([(p*p).sum() for p in self.params])
        if train_inner_nodes:
            # won't work for binary classification
            cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost
        else:
            # print "K is:", K
            # premean = T.log(py_x[-1])
            # target = T.zeros(K)
            # target = T.set_subtensor(target[labels[-1]], 1)            
            # cost = -T.mean(target * premean)

            cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost
Example #55
0
 def negative_log_likelihood(self, tensor_x, tensor_y):
     tensor_ypredproba = self.decision_function_tensor(tensor_x)
     return -T.mean(
         T.log(tensor_ypredproba)[T.arange(tensor_y.shape[0]), tensor_y])
Example #56
0
    def step(char_lm1, char_l, trans_probs_l):
        """Probability of going from char_lm1 to char_l using trans_probs_l tensor"""
        char_lm1 = T.cast(char_lm1, 'int32')
        char_l = T.cast(char_l, 'int32')

        return trans_probs_l[T.arange(N), char_lm1, char_l]  # N
Example #57
0
def smoothedSeries(obs_series, init_probs, final_probs, tmat, emission):
    # forward pass: filtered probabilities after each observation
    f_result, updates = theano.scan(fn=forwardIteration, outputs_info=init_probs,
                                    non_sequences=[tmat, emission],
                                    sequences=obs_series, n_steps=obs_series.shape[0])
    f_r = T.concatenate((init_probs.dimshuffle('x', 0), f_result))

    # backward pass over the same observations, run in reverse
    b_result, b_updates = theano.scan(fn=backwardIteration, outputs_info=final_probs,
                                      non_sequences=[tmat, emission],
                                      sequences=obs_series, n_steps=obs_series.shape[0],
                                      go_backwards=True)
    b_r = T.concatenate((final_probs.dimshuffle('x', 0), b_result))

    # combine forward and (index-reversed) backward messages into smoothed probabilities;
    # T.arange(10000) just supplies a step index, and scan stops at the end of the
    # shorter sequence (f_r)
    s_result, s_updates = theano.scan(fn=smoothingIteration, outputs_info=None,
                                      non_sequences=[b_r, b_r.shape[0]],
                                      sequences=[f_r, T.arange(10000)])
    return s_result
Example #58
0
    def negative_log_likelihood_sum(self, y):
        return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
Example #59
0
b_r = T.concatenate((final_probs_tensor.dimshuffle('x', 0), b_result))
b_fn = theano.function(inputs=[final_probs_tensor, tmat_tensor, emission_tensor, obs_vec],
                       outputs=b_r)

#Now define a scan that computes the smoothed probabilities

#To reverse the backward probabilities, we provide a step index and the length
#of the first (time) dimension of the backward prob matrix.

#Is there a better way of reversing a sequence while iterating over 
#another forwards w/o an additional call to scan to reverse the original?
def smoothingIteration(f_val, f_index, b_vals, max_index):
    x = b_vals[max_index - f_index - 1]
    s = f_val * x
    return s / T.sum(s)

s_result, s_updates = theano.scan(fn=smoothingIteration, outputs_info=None,
                                  non_sequences=[b_r, b_r.shape[0]],
                                  sequences=[f_r, T.arange(10000)])

s_fn = theano.function(inputs=[init_probs_tensor, final_probs_tensor, tmat_tensor,
                               emission_tensor, obs_vec],
                       outputs=s_result)
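
On the reversal question in the comments above: the index arithmetic in smoothingIteration does walk b_vals last-to-first while scan advances f_index forwards, as a quick NumPy check with hypothetical values shows:

import numpy as np

b_vals = np.arange(5)                 # stands in for b_r
max_index = b_vals.shape[0]
walked = [b_vals[max_index - f_index - 1] for f_index in range(max_index)]
print(walked)                         # [4, 3, 2, 1, 0] == list(b_vals[::-1])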

#This function is supposed to obtain the smoothed state probabilities
#from a vector passed as obs_series, given the initial probs, final
#probs and emission matrix
def smoothedSeries(obs_series, init_probs, final_probs, tmat, emission):
    f_result, updates = theano.scan(fn=forwardIteration, outputs_info=init_probs,
                                    non_sequences=[tmat, emission],
                                    sequences=obs_series, n_steps=obs_series.shape[0])
    f_r = T.concatenate((init_probs.dimshuffle('x', 0), f_result))

    b_result, b_updates = theano.scan(fn=backwardIteration, outputs_info=final_probs,
                                      non_sequences=[tmat, emission],
                                      sequences=obs_series, n_steps=obs_series.shape[0],
                                      go_backwards=True)
    b_r = T.concatenate((final_probs.dimshuffle('x', 0), b_result))

    s_result, s_updates = theano.scan(fn=smoothingIteration, outputs_info=None,
                                      non_sequences=[b_r, b_r.shape[0]],
                                      sequences=[f_r, T.arange(10000)])
    return s_result
Example #60
0
    def fit(self,
            X,
            Y,
            learning_rate=0.1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=100,
            show_fig=False):
        D = X[0].shape[1]  # X is of size N x T(n) x D
        K = len(set(Y.flatten()))
        N = len(Y)
        M = self.M
        self.f = activation

        # initial weights
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        # make them theano shared
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.fmatrix('X')
        thY = T.ivector('Y')

        def recurrence(x_t, h_t1):
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=thX,
            n_steps=thX.shape[0],
        )

        py_x = y[:, 0, :]
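        # y from scan has shape (T, 1, K) because softmax returns a 1 x K row per step;
        # the slice above drops the middle axis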
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]
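        # the updates above implement gradient descent with momentum: the velocity dp is
        # refreshed to mu * dp - learning_rate * g, and each parameter moves by that new velocity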

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(inputs=[thX, thY],
                                        outputs=[cost, prediction, y],
                                        updates=updates)

        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(N):
                c, p, rout = self.train_op(X[j], Y[j])
                # print "p:", p
                cost += c
                if p[-1] == Y[j, -1]:
                    n_correct += 1
            print("shape y:", rout.shape)
            print("i:", i, "cost:", cost, "classification rate:",
                  (float(n_correct) / N))
            costs.append(cost)
            if n_correct == N:
                break

        if show_fig:
            plt.plot(costs)
            plt.show()
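
Several of the snippets above call an init_weight helper that is not shown here. A plausible definition, mirroring the fan-based scaling used for Wh near the top of this section, would be:

import numpy as np

def init_weight(M1, M2):
    # assumed helper (not from the original snippets): Gaussian weights scaled by
    # the square root of the summed fan sizes
    return np.random.randn(M1, M2) / np.sqrt(M1 + M2)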