Example #1
    def quantized_bprop(self, cost):
        """
        bprop for convolution layer equals:

        (
            self.x.dimshuffle(1, 0, 2, 3)       (*)
            T.grad(cost, wrt=#convoutput).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
        ).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]

        '(*)' stands for convolution.
        Here we quantize (rep of previous layer) and leave the rest as it is.
        """
        # the lower 2**(integer power)
        index_low = T.switch(self.x > 0., T.floor(T.log2(self.x)),
                             T.floor(T.log2(-self.x)))
        index_low = T.clip(index_low, -4, 3)
        sign = T.switch(self.x > 0., 1., -1.)
        #index_up = index_low + 1  # the upper 2**(integer power) though not used explicitly.
        p_up = sign * self.x / 2**(index_low) - 1  # percentage of upper index.
        srng = theano.sandbox.rng_mrg.MRG_RandomStreams(
            self.rng.randint(999999))
        index_random = index_low + srng.binomial(
            n=1, p=p_up, size=T.shape(self.x), dtype=theano.config.floatX)
        quantized_rep = sign * 2**index_random
        error = T.grad(cost=cost, wrt=self.conv_z)

        self.dEdW = T.nnet.conv.conv2d(
            input=quantized_rep.dimshuffle(1, 0, 2, 3),
            filters=error.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]).dimshuffle(
                1, 0, 2, 3)[:, :, ::-1, ::-1]

        self.dEdb = T.grad(cost=cost, wrt=self.b)

        if self.BN == True:
            self.dEda = T.grad(cost=cost, wrt=self.a)
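The stochastic rounding above is easiest to see with concrete numbers. The following is a small NumPy sketch (an illustration, not part of the original layer) of the same power-of-two quantization: each value is rounded either down to 2**floor(log2(|x|)) or up to the next power of two, with probability p_up chosen so that the quantized value equals x in expectation.

import numpy as np

rng = np.random.RandomState(0)
x = np.array([0.3, -0.7, 1.6])                 # made-up activations

sign = np.where(x > 0., 1., -1.)
index_low = np.clip(np.floor(np.log2(np.abs(x))), -4, 3)
p_up = np.abs(x) / 2.0 ** index_low - 1.0      # probability of rounding up
index_random = index_low + (rng.uniform(size=x.shape) < p_up)
quantized = sign * 2.0 ** index_random
print(quantized)
# E[2**index] = (1 - p_up) * 2**index_low + p_up * 2**(index_low + 1) = |x|,
# so averaging many draws recovers x (inside the clipped exponent range [-4, 3]).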
Example #2
 def get_entropy_reg(self):
     epsilon = 1e-7
     p = self.activation_h(self.input)
     p = T.switch(T.eq(p, 0), epsilon, p)
     p = T.switch(T.eq(p, 1), 1-epsilon, p)
     entropy = -p*T.log2(p)-(1-p)*T.log2(1-p)
     return T.mean(entropy)
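The quantity penalized above is the binary entropy of each activation, which is maximal (1 bit) at p = 0.5 and zero at p = 0 or 1, so minimizing it pushes activations toward saturation; the epsilon guards only avoid log2(0). A quick NumPy check (illustration only):

import numpy as np

epsilon = 1e-7
p = np.array([0.0, 0.25, 0.5, 1.0])
p = np.clip(p, epsilon, 1.0 - epsilon)   # same role as the two T.switch guards
entropy = -p * np.log2(p) - (1.0 - p) * np.log2(1.0 - p)
print(entropy)                           # approx. [0.0, 0.811, 1.0, 0.0]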
Example #3
    def quantized_bprop(self, cost):
        """
        bprop equals:
        (active_prime) *elem_multiply* error_signal_in * (rep of previous layer)
        (rep of previous layer) is recoded as self.x during fprop() process.
        Here we quantize (rep of previous layer) and leave the rest as it is.
        """
        # the lower 2**(integer power)
        index_low = T.switch(self.x > 0., T.floor(T.log2(self.x)),
                             T.floor(T.log2(-self.x)))
        sign = T.switch(self.x > 0., 1., -1.)
        # index_up = index_low + 1  # the upper 2**(integer power) though not used explicitly.
        p_up = sign * self.x / 2**(index_low) - 1  # percentage of upper index.
        srng = theano.sandbox.rng_mrg.MRG_RandomStreams(
            self.rng.randint(999999))
        index_random = index_low + srng.binomial(
            n=1, p=p_up, size=T.shape(self.x), dtype=theano.config.floatX)
        quantized_rep = sign * 2**index_random
        # There is something wrong with this self-made backprop:
        # the code uses BN, but this explicit computation does not account for
        # gradients introduced by BN.
        # error = self.activation_prime(self.z) * error_signal_in
        error = T.grad(cost=cost, wrt=self.z)
        self.dEdW = T.dot(quantized_rep.T, error)

        self.dEdb = T.grad(cost=cost, wrt=self.b)

        if self.BN == True:
            self.dEda = T.grad(cost=cost, wrt=self.a)
Example #4
    def get_output(self, input_, label, mask):
        """
        This function overrides the parent's one.
        Computes the loss from the model prediction and the real label.

        Parameters
        ----------
        input_: TensorVariable
            an array of (batch size, prediction).
            for the accuracy task, "input_" is a 2D matrix.
        label: TensorVariable
            an array of (batch size, answer), or (batch size,) if label is a list of class labels.
            for the word perplexity case, currently only the second form is supported.
            label should be an integer type.
        mask: TensorVariable
            an array of (batch size,) containing only 0s and 1s.
            the loss is summed or averaged only over positions where the mask is 1.

        Returns
        -------
        TensorVariable
            a symbolic tensor variable which is scalar.
        """
        # do
        if mask is None:
            return T.pow(
                2, -T.mean(T.log2(input_[T.arange(label.shape[0]), label])))
        else:
            return T.pow(
                2,
                -T.sum(T.log2(input_[T.arange(label.shape[0]), label]) * mask)
                / T.sum(mask))
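Both branches compute the usual perplexity, 2 raised to the negative mean log2-probability of the correct labels (the masked branch just averages over the positions where the mask is 1). A tiny NumPy illustration of the unmasked branch, with made-up values:

import numpy as np

input_ = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.6, 0.3]])     # (batch size, classes), rows sum to 1
label = np.array([0, 1])                 # correct class per example

log2_p = np.log2(input_[np.arange(label.shape[0]), label])
print(2.0 ** (-log2_p.mean()))           # approx. 1.543 = 1 / sqrt(0.7 * 0.6)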
Example #5
    def quantized_bprop(self, cost):
        """
        bprop for convolution layer equals:
        
        (
            self.x.dimshuffle(1, 0, 2, 3)       (*) 
            T.grad(cost, wrt=#convoutput).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
        ).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]

        '(*)' stands for convolution.
        Here we quantize (rep of previous layer) and leave the rest as it is.
        """
        # the lower 2**(integer power)
        index_low = T.switch(self.x > 0., T.floor(T.log2(self.x)), T.floor(T.log2(-self.x)))
        index_low = T.clip(index_low, -4, 3)
        sign = T.switch(self.x > 0., 1., -1.)
        #index_up = index_low + 1  # the upper 2**(integer power) though not used explicitly.
        p_up = sign * self.x / 2**(index_low) - 1  # percentage of upper index.
        srng = theano.sandbox.rng_mrg.MRG_RandomStreams(self.rng.randint(999999))
        index_random = index_low + srng.binomial(n=1, p=p_up, size=T.shape(self.x), dtype=theano.config.floatX)
        quantized_rep = sign * 2**index_random
        error = T.grad(cost=cost, wrt=self.conv_z)

        self.dEdW = T.nnet.conv.conv2d(
            input=quantized_rep.dimshuffle(1, 0, 2, 3),
            filters=error.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]
        ).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]

        self.dEdb = T.grad(cost=cost, wrt=self.b)

        if self.BN == True:
            self.dEda = T.grad(cost=cost, wrt=self.a)
Example #6
    def quantized_bprop(self, cost):
        """
        bprop equals:
        (active_prime) *elem_multiply* error_signal_in * (rep of previous layer)
        (rep of previous layer) is recoded as self.x during fprop() process.
        Here we quantize (rep of previous layer) and leave the rest as it is.
        """
        # the lower 2**(integer power)
        index_low = T.switch(self.x > 0., T.floor(T.log2(self.x)), T.floor(T.log2(-self.x)))
        
        index_low = T.clip(index_low, -4, 3)
        sign = T.switch(self.x > 0., 1., -1.)
        #index_up = index_low + 1  # the upper 2**(integer power) though not used explicitly.
        p_up = sign * self.x / 2**(index_low) - 1  # percentage of upper index.
        srng = theano.sandbox.rng_mrg.MRG_RandomStreams(self.rng.randint(999999))
        index_random = index_low + srng.binomial(n=1, p=p_up, size=T.shape(self.x), dtype=theano.config.floatX)
        quantized_rep = sign * 2**index_random
        # There is something wrong with this self-made backprop:
        # the code uses BN, but this explicit computation does not account for
        # gradients introduced by BN.
        # error = self.activation_prime(self.z) * error_signal_in
        error = T.grad(cost=cost, wrt=self.z)
        self.dEdW = T.dot(quantized_rep.T, error)
        #self.dEdW = T.dot(self.x.T, error)

        self.dEdb = T.grad(cost=cost, wrt=self.b)

        if self.BN == True:
            self.dEda = T.grad(cost=cost, wrt=self.a)
Example #7
def quantize_weights(W, srng=None, bitlimit=None, deterministic=False):
    """
    Exponential quantization
    :param W: Weights
    :param srng: random number generator
    :param bitlimit: limit values to be in power of 2 range, e.g. for values in 2^-22 to 2^9 set it to [-22, 9]
    :param deterministic: deterministic rounding
    :return: quantized weights
    """
    bitlimit = [-22, 9]  #hardcoded for experiments
    if srng is None:
        rng = np.random.RandomState(666)
        srng = theano.sandbox.rng_mrg.MRG_RandomStreams(rng.randint(999999))

    if bitlimit:
        index_low = T.clip(
            T.switch(W > 0., T.floor(T.log2(W)), T.floor(T.log2(-W))),
            bitlimit[0], bitlimit[1])
    else:
        index_low = T.switch(W > 0., T.floor(T.log2(W)), T.floor(T.log2(-W)))
    sign = T.switch(W > 0., 1., -1.)
    p_up = sign * W / 2**(index_low) - 1  # percentage of upper index.
    if deterministic:
        index_deterministic = index_low + T.switch(p_up > 0.5, 1, 0)
        quantized_W = sign * 2**index_deterministic
    else:
        index_random = index_low + srng.binomial(
            n=1, p=p_up, size=T.shape(W), dtype=theano.config.floatX)
        quantized_W = sign * 2**index_random
    return quantized_W
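A hedged usage sketch for the function above (it assumes quantize_weights as defined here plus the imports it already relies on: numpy as np, theano, and theano.tensor as T). With deterministic=True each weight snaps to the closest power of two; the stochastic path rounds up or down at random so the quantized weight is unbiased in expectation.

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.array([0.3, -0.7, 1.6], dtype=theano.config.floatX))
quantize_det = theano.function([], quantize_weights(W, deterministic=True))
quantize_rand = theano.function([], quantize_weights(W))

print(quantize_det())    # e.g. [ 0.25 -0.5   2.  ], the nearest powers of two
print(quantize_rand())   # varies from call to call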
Example #8
    def _compileTheanoFunctions(self):
        """This methods compiles all theano functions."""

        print("Start compiling Theano training function...")
        D = T.tensor4('data')
        updates = self._updateWeightsOnMinibatch(D, self.cd_k)
        self.theano_trainingFct = theano.function([D],
                                                  None,
                                                  updates=updates,
                                                  name='train_CRBM')

        #compute mean free energy
        mfe_ = self._meanFreeEnergy(D)
        #compute number  of motif hits
        [_, H] = self._computeHgivenV(D)

        #H = self.bottomUpProbability(self.bottomUpActivity(D))
        nmh_ = T.mean(H)  # mean over samples (K x 1 x N_h)

        #compute norm of the motif parameters
        twn_ = T.sqrt(T.mean(self.motifs**2))

        #compute information content
        pwm = self._softmax(self.motifs)
        entropy = -pwm * T.log2(pwm)
        entropy = T.sum(entropy, axis=2)  # sum over letters
        ic_= T.log2(self.motifs.shape[2]) - \
            T.mean(entropy)  # log is possible information due to length of sequence
        medic_= T.log2(self.motifs.shape[2]) - \
            T.mean(T.sort(entropy, axis=2)[:, :, entropy.shape[2] // 2])

        self.theano_evaluateData = theano.function([D], [mfe_, nmh_],
                                                   name='evaluationData')

        W = T.tensor4("W")
        self.theano_evaluateParams = theano.function([], [twn_, ic_, medic_],
                                                     givens={W: self.motifs},
                                                     name='evaluationParams')
        fed = self._freeEnergyForData(D)
        self.theano_freeEnergy = theano.function([D],
                                                 fed,
                                                 name='fe_per_datapoint')

        fed = self._freeEnergyPerMotif(D)
        self.theano_fePerMotif = theano.function([D], fed, name='fe_per_motif')

        if self.doublestranded:
            self.theano_getHitProbs = theano.function([D], \
                self._bottomUpProbability(self._bottomUpActivity(D)))
        else:
            self.theano_getHitProbs = theano.function([D], \
                #self.bottomUpProbability( T.maximum(self.bottomUpActivity(D),

                self._bottomUpProbability( self._bottomUpActivity(D) +
                        self._bottomUpActivity(D, True)))
        print("Compilation of Theano training function finished")
Example #9
def kl_divergence(rho, rho_cap):
    """TODO: Docstring for kl_divergence.

    :rho: TODO
    :rho_cap: TODO
    :returns: TODO

    """
    kl = T.sum(rho * T.log2(rho / rho_cap)
            + (1.5 - rho) * T.log2((1.5 - rho) / (1.5 - rho_cap)))

    return kl
Example #10
		def OneStep(alpha, b):
			# minimize alpha
			alpha_new  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
			# minimize b
			tmp_new = T.clip(W/alpha_new, -1., 1.)
			b_new =  T.switch( T.ge(tmp_new, pow(2, -n)), T.pow(2, round3(T.log2(tmp_new)-0.0849625)), 
				T.switch( T.le(tmp_new, -pow(2, -n)), -T.pow(2, round3(T.log2(-tmp_new)-0.0849625)), 0.))		
			b_new = T.switch(T.ge(b_new, pow(2, - (n-1))), b_new, 
				T.switch(T.le(b_new, -pow(2, -(n-1))), b_new, T.sgn(b_new)*pow(2, -(n-1))))
		
			delta = T.abs_(alpha_new-alpha)
			condition = T.lt(delta, 1e-6)
			return [alpha_new, b_new], theano.scan_module.until(condition)
Example #11
    def compile_entropy_fun(self):
        p = self.v_samples

        h = -p*T.log2(p)-(1-p)*T.log2(1-p)
        h = T.switch(T.isnan(h), 0., h)
        
        if self.eval_mask is not None:
            eval_units = np.sum(self.eval_mask)
        else:
            eval_units = np.prod(self.clamp_mask.shape) - np.sum(self.clamp_mask)
            
        entropy = T.sum(h) / (self.n_samples * eval_units) 

        self.entropy_fun = theano.function([], entropy)
Example #12
def hingesig(y_true, y_pred):
    """Computes the hingeles for a sigmoidal output by apply the logit to y_pred.
       Note: this function is intended for THEANO.
    Arguments:
        y_true  -- a theano tensor holding the true labels
        y_pred -- a theano tensor holding the raw pradictions, i.e. the sigmoid output
    
    Returns:
        theano tensor with hingelosss
    """

    transform_y_true = T.switch(T.eq(y_true, 0), -1, y_true)
    compl_y_pred = T.clip(T.sub(1., y_pred), 1e-20, 1)
    y_pred = T.clip(y_pred, 1e-20, 1)
    logit = (T.log2(y_pred) - T.log2(compl_y_pred))
    return T.mean(T.maximum(1. - transform_y_true * logit, 0.), axis=-1)
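In other words, the sigmoid outputs are mapped back to (base-2) logits and a standard hinge loss is applied with the labels recoded from {0, 1} to {-1, 1}. A hedged NumPy re-implementation, for illustration only:

import numpy as np

def hingesig_np(y_true, y_pred):
    y_signed = np.where(y_true == 0, -1.0, y_true)    # {0, 1} -> {-1, 1}
    compl = np.clip(1.0 - y_pred, 1e-20, 1.0)
    y_pred = np.clip(y_pred, 1e-20, 1.0)
    logit = np.log2(y_pred) - np.log2(compl)          # base-2 logit of the sigmoid output
    return np.mean(np.maximum(1.0 - y_signed * logit, 0.0), axis=-1)

print(hingesig_np(np.array([1.0, 0.0]), np.array([0.6, 0.45])))   # made-up labels/predictions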
Example #13
 def __init__(self, sen_vec):
     # sen_vec is a shared variable of shape (vec_dim, sentence_length)
     self.sen_vec = assert_op(sen_vec, sen_vec.shape[0] == 50)
     n_layers = T.log2(sen_vec.shape[1])
     self.n_in = sen_vec.shape[1]
     self.num_layers = assert_op(n_layers, n_layers.get_value() - int(n_layers.get_value()) == 0)
     self.params = []
Example #14
 def accumCost(pred, xW, m, c_sum, ppl_sum):
     pred = tensor.nnet.softmax(pred)
     c_sum += (tensor.log(pred[tensor.arange(n_samples), xW] + 1e-20) *
               m)
     ppl_sum += -(
         tensor.log2(pred[tensor.arange(n_samples), xW] + 1e-10) * m)
     return c_sum, ppl_sum
Example #15
 def __init__(self, sen_vec, n_in):
     self.sen_vec = assert_shape_op(sen_vec, (vec_dims, n_in))
     n_layers = T.log2(sen_vec.shape[1])
     self.num_layers = T.cast(assert_op(n_layers, T.eq(n_layers, T.cast(n_layers, 'int32'))), 'int32')
     self.n_in = T.constant(n_in, name='n_in', dtype='int32')
     self.params = []
     self.output = self.tree_output()
Example #16
    def sym_entropy(self, S, mapping):
        """
        Defines the symbolic calculation of the soft entropy
        """
        if self.distance == 'euclidean':
            distances = euclidean_distance(S, self.C)
        else:
            distances = cosine_distance(S, self.C)
        Q = T.nnet.softmax(-distances / self.m)

        # Calculates the fuzzy membership vector for each histogram S
        # Q, scan_u = theano.map(fn=self.sym_get_similarity, sequences=[S])

        Nk = T.sum(Q, axis=0)

        H = T.dot(mapping.T, Q)
        P = H / Nk

        entropy_per_cluster = P * T.log2(P)
        entropy_per_cluster = T.switch(T.isnan(entropy_per_cluster), 0,
                                       entropy_per_cluster)
        entropy_per_cluster = entropy_per_cluster.sum(axis=0)

        Rk = Nk / Nk.sum()
        E = -(entropy_per_cluster * Rk).sum()
        return T.squeeze(E)
Example #17
def forward_batch_step(x_t, H_mask, H_tm1):
    H = TT.dot(W_rec,H_tm1) + W_in[:,x_t]
    H_t = TT.nnet.sigmoid(H)
    Y_t = TT.nnet.softmax(TT.transpose(TT.dot(W_out, H_t)))
    Y_t = -TT.log2(Y_t)
    Y_t = TT.dot(TT.transpose(Y_t), TT.diag(H_mask))
    return [H_t, Y_t]
Example #18
def get_insilico_knockout_tensor_op(lisa_prediction, precompute, coef, original_median=None):
    """ use theano tensor operation to speed up
    return a theano.function

    lisa_prediction: numpy array
    precompute: numpy array
    coef: pandas DataFrame
    """
    x = T.imatrix('E') # each motif tensor
    precomp = theano.shared(precompute.astype(theano.config.floatX), name='precompute')
    r = theano.shared(lisa_prediction.astype(theano.config.floatX), name='Lisa RP')
    c = theano.shared(coef.iloc[:, 0].values.astype(theano.config.floatX), name='coefficients')
    m = theano.shared(original_median.astype(theano.config.floatX), name='original_rp_median')

    # sample x (gene1_bin1, gene1_bin2...gene2_bin1,gene2_bin2...)
    y = T.extra_ops.repeat(x, precompute.shape[0], axis=0)
    tensor_del = y * precomp # sample x (gene,bin)
    tensor_del = T.reshape(tensor_del, (c.shape[0],r.shape[0],200)) # sample x gene x bin
    tensor_del = T.transpose(T.sum(tensor_del, axis=2), (1,0)) + T.constant(1) # one motif

    ##tensor_del_med = T.mean(tensor_del, axis=0)  # one motif
    ##log_tensor_del = T.log2(tensor_del) - T.log2(tensor_del_med)

    log_tensor_del = T.log2(tensor_del) - m  # the original median is already log2-transformed
    tensor_delta = r - T.dot(log_tensor_del, c)

    mode = theano.Mode(linker='cvm', optimizer='fast_run')
    theano.config.exception_verbosity = 'high'
    # theano.config.openmp = True
    theano_delta_rp = theano.function([x], tensor_delta, mode=mode)
    return theano_delta_rp
Example #19
    def __call__(self, loss):
        attention = self.layer.get_attention() + 0.000001
        entropy = -T.sum(T.log2(attention) * attention, axis = 1)
        entropy = T.mean(entropy)

        loss += self.w * entropy
        return loss
Example #20
    def quantized_bprop(self, cost):
        index_low = T.switch(self.varin > 0.,
            T.floor(T.log2(self.varin)), T.floor(T.log2(-self.varin))
        )
        index_low = T.clip(index_low, -4, 3)
        sign = T.switch(self.varin > 0., 1., -1.)
        # the upper 2**(integer power) though not used explicitly.
        # index_up = index_low + 1
        # percentage of upper index.
        p_up = sign * self.varin / 2**(index_low) - 1
        index_random = index_low + self.srng.binomial(
            n=1, p=p_up, size=T.shape(self.varin), dtype=theano.config.floatX)
        quantized_rep = sign * 2**index_random

        error = T.grad(cost=cost, wrt=self.varfanin)

        self.dEdW = T.dot(quantized_rep.T, error)
Example #21
    def __call__(self, loss):
        attention = self.layer.get_attention() + 0.000001
        entropy = -T.sum(T.log2(attention) * attention, axis=1)
        entropy = T.mean(entropy)

        loss += self.w * entropy
        return loss
Example #22
 def __init__(self, sen_vec):
     # sen_vec is a shared variable of shape (vec_dim, sentence_length)
     self.sen_vec = assert_op(sen_vec, sen_vec.shape[0] == 50)
     n_layers = T.log2(sen_vec.shape[1])
     self.n_in = sen_vec.shape[1]
     self.num_layers = assert_op(
         n_layers,
         n_layers.get_value() - int(n_layers.get_value()) == 0)
     self.params = []
Example #23
 def cost(self, Y, Y_hat):
     zeros = tensor.eq(Y, 0)
     ones = tensor.eq(Y, 1)
     probs = zeros * Y_hat + ones * (1 - Y_hat)
     result, _ = theano.scan(fn=lambda vec: -tensor.sum(
         tensor.log2(vec.nonzero_values())),
         outputs_info=None,
         sequences=probs)
     return result.mean()
Example #24
 def cost(self, Y, Y_hat):
     zeros = tensor.eq(Y, 0)
     ones = tensor.eq(Y, 1)
     probs = zeros * Y_hat + ones * (1 - Y_hat)
     result, _ = theano.scan(
         fn=lambda vec: -tensor.sum(tensor.log2(vec.nonzero_values())),
         outputs_info=None,
         sequences=probs)
     return result.mean()
Example #25
 def __init__(self, sen_vec, n_in):
     self.sen_vec = assert_shape_op(sen_vec, (vec_dims, n_in))
     n_layers = T.log2(sen_vec.shape[1])
     self.num_layers = T.cast(
         assert_op(n_layers, T.eq(n_layers, T.cast(n_layers, 'int32'))),
         'int32')
     self.n_in = T.constant(n_in, name='n_in', dtype='int32')
     self.params = []
     self.output = self.tree_output()
Example #26
        def step(input_, label):
            if self.use_bias:
                result = T.dot(input_, self.W) + self.b
            else:
                result = T.dot(input_, self.W)
            result = T.nnet.softmax(result)
            cross_entropy = T.nnet.categorical_crossentropy(
                T.clip(result, 1e-7, 1.0 - 1e-7), label)  # (batch_size,)
            perplexity = -T.log2(result[T.arange(self.batch_size),
                                        label])  # (batch_size,)

            return cross_entropy, perplexity
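Note that the two returned quantities differ only in the log base: up to the clipping, the -log2 term is just the categorical cross-entropy divided by ln 2. A quick NumPy check with made-up values:

import numpy as np

result = np.array([[0.7, 0.2, 0.1]])     # softmax output for one example
label = np.array([0])
cross_entropy = -np.log(result[np.arange(1), label])
neg_log2 = -np.log2(result[np.arange(1), label])
print(np.allclose(neg_log2, cross_entropy / np.log(2)))   # True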
Example #27
def gpu_searchsorted_scan(P, X):
    N = T.cast(T.floor(T.log2(P.shape[0])) + 1, 'int64')
    (_,
     B), _ = theano.scan(gpu_searchsorted_step,
                         outputs_info=[
                             T.zeros_like(X, dtype='int64'),
                             T.ones_like(X, dtype='int64') * (P.shape[0] - 1)
                         ],
                         non_sequences=[X, P],
                         n_steps=N,
                         allow_gc=True)
    return B[-1]
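The scan above performs a vectorised binary search over the sorted array P, which is why floor(log2(len(P))) + 1 halving steps are enough. gpu_searchsorted_step itself is not shown, so the sketch below is only a NumPy illustration of the idea, checked against np.searchsorted:

import numpy as np

P = np.sort(np.random.RandomState(0).uniform(size=37))   # sorted table
X = np.array([0.1, 0.5, 0.9])                            # queries inside P's range

lo = np.zeros_like(X, dtype=np.int64)
hi = np.ones_like(X, dtype=np.int64) * (P.shape[0] - 1)
for _ in range(int(np.floor(np.log2(P.shape[0]))) + 1):
    mid = (lo + hi) // 2
    go_right = P[mid] < X
    lo = np.where(go_right, mid + 1, lo)
    hi = np.where(go_right, hi, mid)

print(np.array_equal(hi, np.searchsorted(P, X)))          # expected: True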
Example #28
def R2_RNN_block(tparams, inputs, prefix=None, name='r2_rnn', std=True):
    prefix = GetPrefix(prefix, name)
    n_steps = inputs.shape[0]
    n_samples = inputs.shape[1]
    x_size = inputs.shape[2]

    r_steps = T.ceil(T.log2(n_steps)).astype('uint32')
    r_steps = T.arange(r_steps)

    # r_steps=r_steps.reshape([r_steps.shape[0],1]);

    def _step_inner(index, num, inps):
        index = index * 2
        index_ = T.minimum(index + 2, num)

        h = RNN_layer(tparams,
                      inps[index:index_, :, :],
                      prefix=prefix,
                      name=None,
                      std=False)
        return h[-1, :, :]

    def _step(r_step, num, inps, std=True):
        n = num
        steps = T.arange((n + 1) / 2)
        # steps=steps.reshape([steps.shape[0],1]);

        out, updates = theano.scan(
            lambda index, num, inps: _step_inner(index, num, inps),
            sequences=[steps],
            outputs_info=None,
            non_sequences=[num, inps],
            name=_p(prefix, 'inner_scan'),
            n_steps=steps.shape[0],
            profile=False)

        # if std:	out=standardize(out);
        num = out.shape[0]
        h = T.zeros_like(inps)
        h = T.set_subtensor(h[:num], out)
        return num, h
        # return out;

    if std: inputs = standardize(inputs)
    out, updates = theano.reduce(
        lambda r_step, num, inps: _step(r_step, num, inps),
        sequences=r_steps,
        outputs_info=[inputs.shape[0], inputs],
        # non_sequences=inputs,
        name=_p(prefix, 'scan'))
    return out[1][:out[0]]
Example #29
def deep_learn(X, y, layer, _iter_num, _alpha, _decay):
    """TODO: Docstring for deep_learn.

    :X: TODO
    :y: TODO
    :_iter_num: TODO
    :_alpha: TODO
    :_decay: TODO
    :returns: TODO

    """
    init_params = stack_aes(X, y, layer)

    t_X, t_y, t_z = T.dmatrix(), T.dmatrix(), T.dmatrix()
    t_m, t_weight_decay, t_b = T.dscalar(), T.dscalar(), T.dscalar()
    t_params = reduce(add, [[T.dmatrix(), T.dvector()] for i in range(len(init_params)/2)])

    t_z = t_X
    for i in range(0, len(t_params)-2, 2):
        t_z = T.nnet.sigmoid(T.dot(t_z, t_params[i])+t_params[i+1])
    t_z = T.dot(t_z, t_params[-2]) + t_params[-1]
    J = (-1.0 / t_m) \
            * T.sum(T.log2(T.exp(T.sum(t_z * t_y, 1)) / T.sum(T.exp(t_z), 1))) \
            + (t_weight_decay / (2.0 * t_m)) * \
            T.sum(reduce(add, [T.sum(param ** 2.0) for param in t_params]))

    formula = theano.function([t_X, t_y] + t_params + [t_weight_decay, t_m],
                              [J] + [T.grad(J, param) for param in t_params])

    def cost_func_sm(params):
        exec compile('tmp = formula(X, y, '+
                     ''.join(['params[%d], ' % (index)
                              for index in range(len(params))])+
                     '_decay, X.shape[0])',
                     '', 'exec') in {'formula': formula, 'X': X, 'y': y,
                                     '_decay': _decay, 'tmp': None}, locals()
        J, grads = tmp[0], tmp[1:]
        return J, grads

    start_time = time()

    finale = gradient_descent(cost_func_sm, init_params, _iter_num, _alpha)

    print 'Training time of sparse linear decoder: %f minutes.' \
            %((time() - start_time) / 60.0)
    print 'The accuracy of dl: %f %% (threshold used)' \
            % (assess(y, be_onefold(predict_dl(X, finale), 1), 1))
    print 'The accuracy of dl: %f %% (abs used)' \
            % (assess(y, be_onefold(predict_dl(X, finale), 1), 2))

    return finale
Example #30
def softmax_classify(X, y, _iter_num,
                _alpha, _decay, _beta, _rho):
    """TODO: Docstring for softmax_classify.

    :X: TODO
    :y: TODO
    :options: TODO
    :returns: TODO

    """
    input_n, output_n = X.shape[1], y.shape[1]
    in_out_degree = [input_n, output_n]
    init_params = initial_params(in_out_degree)

    t_X, t_y = T.dmatrix(), T.dmatrix()
    t_weight_decay, t_m = T.dscalar(), T.dscalar()
    t_theta, t_b = T.dmatrix(), T.dvector()

    z = T.dot(t_X, t_theta) + t_b
    J = (-1.0 / t_m) \
            * T.sum(T.log2(T.exp(T.sum(z * t_y, 1)) / T.sum(T.exp(z), 1))) \
            + (t_weight_decay / (2.0 * t_m)) * T.sum(t_theta ** 2.0)

    formula = theano.function([t_X, t_y, t_theta, t_b, t_weight_decay, t_m],
                              [J, T.grad(J, t_theta), T.grad(J, t_b),])

    def cost_func_sm(params):
        result = formula(X, y, params[0], params[1], _decay, X.shape[0])
        J, grads = result[0], result[1:]
        return J, grads

    start_time = time()

    finale = gradient_descent(cost_func_sm, init_params, _iter_num, _alpha)

    print 'Training time of sparse linear decoder: %f minutes.' \
            %((time() - start_time) / 60.0)
    print 'The accuracy of sm: %f %% (threshold used)' \
            % (assess(y, be_onefold(predict_sm(X, finale), 1), 1))
    print 'The accuracy of sm: %f %% (abs used)' \
            % (assess(y, be_onefold(predict_sm(X, finale), 1), 2))

    # sio.savemat('./param/sm_weight_bias', {
    #         'weight': params[0], 'bias': params[1]
    #         })

    return finale[:2]
Example #31
 def cost(X, P): # batch_size x time
     eps = 1e-3
     X = X.T                                         # time x batch_size
     char_prob_dist = lang_model(X[:-1])                # time x batch_size x output_size
     char_prob_dist = (1 - 2 * eps) * char_prob_dist + eps
     label_prob = char_prob_dist[
         T.arange(X.shape[0] - 1).dimshuffle(0, 'x'),
         T.arange(X.shape[1]).dimshuffle('x', 0),
         X[1:]
     ]                                                # time x batch_size
     cross_entropy = -T.sum(T.log(label_prob), axis=0)
     display_cost = 2**(-T.mean(T.log2(label_prob), axis=0))
     l2 = sum(T.sum(p**2) for p in P.values())
     cost = cross_entropy
     if l2_coefficient > 0:
         cost += l2_coefficient * l2
     return cost, display_cost
Example #32
def R2_RNN_block(tparams,inputs,prefix=None,name='r2_rnn',std=True):
	prefix=GetPrefix(prefix,name);
	n_steps=inputs.shape[0];
	n_samples=inputs.shape[1];
	x_size=inputs.shape[2];	

	r_steps=T.ceil(T.log2(n_steps)).astype('uint32');
	r_steps=T.arange(r_steps);
	# r_steps=r_steps.reshape([r_steps.shape[0],1]);

	
	def _step_inner(index,num,inps):
		index=index*2;
		index_=T.minimum(index+2,num);

		h=RNN_layer(tparams,inps[index:index_,:,:],prefix=prefix,name=None,std=False);
		return h[-1,:,:];
	
	def _step(r_step,num,inps,std=True):
		n=num;
		steps=T.arange((n+1)/2);
		# steps=steps.reshape([steps.shape[0],1]);

		out,updates=theano.scan(lambda index,num,inps:_step_inner(index,num,inps), 
							sequences=[steps], 
							outputs_info=None,
							non_sequences=[num,inps],
							name=_p(prefix,'inner_scan'),
							n_steps=steps.shape[0],
							profile=False);

		# if std:	out=standardize(out);
		num=out.shape[0];
		h=T.zeros_like(inps);
		h=T.set_subtensor(h[:num],out);
		return num,h;
		# return out;
	
	if std:	inputs=standardize(inputs);
	out,updates=theano.reduce(lambda r_step,num,inps:_step(r_step,num,inps), 
							sequences=r_steps, 
							outputs_info=[inputs.shape[0],inputs],
							# non_sequences=inputs,
							name=_p(prefix,'scan')
							);
	return out[1][:out[0]];
Example #33
    def _sym_entropy(self, S):
        """
        Defines the symbolic calculation of the soft entropy
        """

        distances = symbolic_distance_matrix(S, self.C)
        Q = T.nnet.softmax(-distances / self.m)

        # Calculates the fuzzy membership vector for each histogram S
        Nk = T.sum(Q, axis=0)

        H = T.dot(self.mapping.T, Q)
        P = H / Nk

        entropy_per_cluster = P * T.log2(P)
        entropy_per_cluster = T.switch(T.isnan(entropy_per_cluster), 0, entropy_per_cluster)
        entropy_per_cluster = entropy_per_cluster.sum(axis=0)

        Rk = Nk / Nk.sum()
        E = -(entropy_per_cluster * Rk).sum()
        return T.squeeze(E)
Example #34
def test_lstm():

    # load wiki data
    X_train_np, X_valid_np, X_test_np = gen_data_wiki()
    batchsize = 100
    blocklength = 25000  #450000
    bsize_test = batchsize
    numframe = 100
    numframe_test = 1250  #2500#5000
    X_valid = onehot(X_valid_np).reshape(bsize_test,
                                         X_valid_np.shape[0] / bsize_test, 205)
    X_test = onehot(X_test_np).reshape(bsize_test,
                                       X_test_np.shape[0] / bsize_test, 205)
    nb_classes = 205

    X_train_shared = theano.shared(np.zeros(
        (batchsize, blocklength, nb_classes)).astype('float32'),
                                   name='train_set',
                                   borrow=True)
    X_valid_shared = theano.shared(np.zeros(
        (bsize_test, numframe_test, nb_classes)).astype('float32'),
                                   name='valid_set',
                                   borrow=True)
    X_test_shared = theano.shared(np.zeros(
        (bsize_test, numframe_test, nb_classes)).astype('float32'),
                                  name='test_set',
                                  borrow=True)

    # build the model
    from keras.layers.recurrent import LSTM, SimpleRNN, LSTMgrave
    from layer_icml import LSTM_bu, LSTM_td, RNN_td, RNN_bu, RNN_sh, RNN_dp, LSTM_dp, RNN_shallow
    from layer_icml import RNN_relugate, RNN_ens, RNN_2tanh, RNN_ntanh, RNN_multidp, LSTM_multi, LSTM_u, RNN_utanh, LSTM_uu, LSTM_uugrave
    from keras.layers.core import Dense, Activation, TimeDistributedDense
    from keras.initializations import normal, identity

    x = T.tensor3()
    y = T.matrix()

    name_init = 'uniform'
    n_h = 2450
    L1 = LSTMgrave(output_dim=n_h,
                   init='uniform',
                   batchsize=batchsize,
                   inner_init='uniform',
                   input_shape=(None, nb_classes),
                   return_sequences=True)
    name_model = 'lstm_shallowgrave_' + str(
        n_h) + name_init + '0.01' + '_batchsize' + str(
            batchsize) + '_numframe' + str(numframe)

    # RNN
    name_act = 'tanh'
    name_init = 'uniform'
    #n_h=2048;L1 = RNN_shallow(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_tanh" + str(n_h) + "_"+name_act+ name_init + '0.1'
    #n_h = 2048;L1 = SimpleRNN(output_dim = n_h, init = 'uniform', inner_init = 'uniform', activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_shallow"+str(n_h)+name_act+ name_init + '0.05'
    #n_h = 4096;L1 = RNN_utanh(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_utanh_2_0_0" + str(n_h) + "_"+name_act+ name_init +'0.01'
    n_h = 2048
    in_act = 'tanh'
    L1 = LSTM_uugrave(output_dim=n_h,
                      batchsize=batchsize,
                      init='uniform',
                      inner_init='uniform',
                      input_shape=(None, nb_classes),
                      return_sequences=True)
    name_model = 'lstm_u_grave' + in_act + '_1.0_1.0_1.0_0' + str(
        n_h) + name_init + '0.01' + '_batchsize' + str(
            batchsize) + '_numframe' + str(numframe)
    #n_h = 1200; in_act = 'tanh';L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform', input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_stack2'+in_act+'_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01'
    #n_h = 700; L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L3 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L4 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L5 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= '7005layerlstm_uu_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'

    D1 = TimeDistributedDense(nb_classes)
    D1._input_shape = [None, None, n_h]
    O = Activation('softmax')

    #layers = [L1, L2, L3, L4, L5, D1, O]
    layers = [L1, D1, O]
    #layers = [L1, L2, D1, O]

    load_model = True
    if load_model:
        #f_model = open('/data/lisatmp3/zhangsa/lstm/models/180rnn_td_reluidentityotherinit_identity_sgd0.1_clip10.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune5e-4inorder_withtest.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb')
        f_model = open(
            '/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune1e-5inorder_withtest.pkl',
            'rb')
        layers = pickle.load(f_model)
        f_model.close()
        name_model_load = 'wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest' + 'finetune2e-6'
        #name_perpmat_load = 'wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.npy'
        L1 = layers[0]

    out = x
    params = []
    for l in layers:
        if not load_model:
            l.build()
        l.input = out
        params += l.params
        if l == L1:
            out = l.get_output()[0]
            h0 = l.get_output()[0]
            c0 = l.get_output()[1]
        else:
            out = l.get_output()

    # compute the loss
    loss = -T.mean(T.log(out)[:, :numframe - 1, :] * x[:, 1:, :])
    logperp_valid = T.mean(
        -T.log2(T.sum(out[:, :numframe_test - 1, :] * x[:, 1:, :], axis=2)))
    logperp_train = T.mean(
        -T.log2(T.sum(out[:, :numframe - 1, :] * x[:, 1:, :], axis=2)))

    # set optimizer
    from keras.constraints import identity as ident
    from keras.optimizers import RMSprop, SGD, Adam

    lr_ = 2 * 1e-6
    clipnorm_ = 10000
    rmsprop = RMSprop(lr=lr_, clipnorm=clipnorm_)
    sgd = SGD(lr=lr_, momentum=0.9, clipnorm=clipnorm_)
    adam = Adam(lr=lr_)

    #opt = sgd; name_opt = 'sgd'+str(lr_); clip_flag = False
    #opt = rmsprop; name_opt = 'rmsprop'+str(lr_)
    opt = adam
    name_opt = 'adam' + str(lr_)
    clip_flag = False

    if clip_flag:
        name_opt = name_opt + '_clip' + str(clipnorm_)

    #param update for regular parameters
    constraints = [ident() for p in params]
    updates = opt.get_updates(params, constraints, loss)

    index = T.iscalar()
    f_train = theano.function(
        [index], [loss, h0, c0],
        updates=updates,
        givens={
            x: X_train_shared[:, index * numframe:(index + 1) * numframe, :]
        })

    # perplexity function
    f_perp_valid = theano.function([], [logperp_valid, h0, c0],
                                   givens={x: X_valid_shared})
    f_perp_test = theano.function([], [logperp_valid, h0, c0],
                                  givens={x: X_test_shared})

    #f_perp_valid = theano.function([index], [logperp_valid], givens={x:X_valid_shared[index*bsize_test : (index+1)*bsize_test]})
    #f_perp_test = theano.function([index], [logperp_valid], givens={x:X_test_shared[index*bsize_test : (index+1)*bsize_test]})

    def perp_valid():
        logperp_acc = 0
        n = 0
        L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        for k in xrange(X_valid.shape[1] / numframe_test):
            X_valid_shared.set_value(X_valid[:, k * numframe_test:(k + 1) *
                                             numframe_test, :])
            perp, h0, c0 = f_perp_valid()
            logperp_acc += perp
            L1.H0.set_value(h0[:, -1, :])
            L1.C0.set_value(c0[:, -1, :])
            n += 1
        return (logperp_acc / n)

    def perp_test():
        logperp_acc = 0
        n = 0
        L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        for k in xrange(X_test.shape[1] / numframe_test):
            X_test_shared.set_value(X_test[:, k * numframe_test:(k + 1) *
                                           numframe_test, :])
            perp, h0, c0 = f_perp_test()
            logperp_acc += perp
            L1.H0.set_value(h0[:, -1, :])
            L1.C0.set_value(c0[:, -1, :])
            n += 1
        return (logperp_acc / n)

    #def perp_valid():
    #    logperp_acc = 0
    #    n = 0
    #    for k in xrange(X_valid_np.shape[0]/(bsize_test*numframe_test)):
    #        X_valid_shared.set_value(onehot(X_valid_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205)))
    #        for i in xrange(X_valid_shared.get_value().shape[0]/bsize_test):
    #            logperp_acc += f_perp_valid(i)
    #            n += 1
    #    return (logperp_acc/n)

    #def perp_test():
    #    logperp_acc = 0
    #    n = 0
    #    for k in xrange(X_test_np.shape[0]/(bsize_test*numframe_test)):
    #        X_test_shared.set_value(onehot(X_test_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205)))
    #        for i in xrange(X_test_shared.get_value().shape[0]/bsize_test):
    #            logperp_acc += f_perp_test(i)
    #            n += 1
    #    return (logperp_acc/n)

    ######## testmodel ########
    #test_score = perp_valid()
    #pdb.set_trace()

    epoch_ = 9000
    perpmat = np.zeros((epoch_, 3))
    t_start = time.time()
    name = 'wiki100' + name_model + '_' + name_opt

    if load_model:
        name = name_model_load
        #perpmat = np.load(name_perpmat_load)

    #only_block = False
    #if only_block:
    #    name = name + 'random_only_block'
    #else:
    #    name = name + 'random_per_row_in_block'
    name = name + 'inorder'
    blocksize = batchsize * blocklength
    bestscore = 100000000
    for epoch in xrange(epoch_):
        for k in xrange(X_train_np.shape[0] / blocksize):
            t_s = time.time()
            print "reloading " + str(k) + " th train patch..."

            #if only_block:
            #    pos = np.random.randint(0, X_train_np.shape[0]-blocksize)
            #    X_train_shared.set_value(onehot(X_train_np[pos: pos + blocksize]).reshape(batchsize, blocklength, 205))
            #else:
            #    pos = np.random.randint(0, X_train_np.shape[0]-blocklength, batchsize)
            #    tmp = np.zeros((batchsize, blocklength, 205)).astype('float32')
            #    for j in xrange(batchsize):
            #        tmp[j] = onehot(X_train_np[pos[j]: pos[j] + blocklength])
            #    X_train_shared.set_value(tmp)
            X_train_shared.set_value(
                onehot(X_train_np[k * blocksize:(k + 1) * blocksize]).reshape(
                    batchsize, blocklength, 205))
            print "reloading finished, time cost: " + str(time.time() - t_s)
            L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
            L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
            for i in xrange(blocklength / numframe):
                loss, h0, c0 = f_train(i)
                L1.H0.set_value(h0[:, -1, :])
                L1.C0.set_value(c0[:, -1, :])
                if i % 10 == 0:
                    t_end = time.time()
                    print "Time consumed: " + str(t_end - t_start) + " secs."
                    t_start = time.time()
                    print "Epoch " + str(
                        epoch
                    ) + " " + name + ": The training loss in batch " + str(
                        k * (blocklength / numframe) +
                        i) + " is: " + str(loss) + "."
            if k % 6 == 0:
                #save results
                m = epoch * X_train_np.shape[0] / (blocksize * 6) + k / 6
                perpmat[m][0], perpmat[m][1] = 0, perp_valid()
                perpmat[m][2] = perp_test()
                np.save(
                    '/data/lisatmp4/zhangsa/rnn_trans/results/' + name +
                    '_withtest.npy', perpmat)

                #save model
                if perpmat[m][1] < bestscore:
                    bestscore = perpmat[m][1]
                    f_model = open(
                        '/data/lisatmp4/zhangsa/rnn_trans/models/' + name +
                        '_withtest.pkl', 'wb+')
                    pickle.dump(layers, f_model)
                    f_model.close()

        print "Epoch "+ str(epoch)+ " " + name + ": The training perp is: " + str(perpmat[epoch][0]) \
                      + ", test perp is: " + str(perpmat[epoch][1]) + "."
Example #35
Y = tt.matrix("Y")
sigma = tt.vector("sigma")

pdist2 = lambda A: ((A[:, np.newaxis, :] - A[np.newaxis, :, :])**2).sum(2)

pdist_X = pdist2(X)
pdist_Y = pdist2(Y)

P_c = tt.exp(-pdist_X /
             (2 * sigma**2)) / tt.exp(-pdist_X /
                                      (2 * sigma**2)).sum(1)[:, np.newaxis]
P = (P_c + P_c.T) / (2 * N)
Q = tt.exp(-pdist_Y) / tt.exp(-pdist_Y).sum()
KL = tt.where(abs(P) > 1e-8, P * tt.log(P / Q), 0).sum(1)
C = KL.sum()
LogPerplexity = -tt.where(abs(P_c) > 1e-8, P_c * tt.log2(P_c), 0).sum(1)
PerplexityCost = 0.5 * ((LogPerplexity - np.log2(perplexity_target))**2).sum()

#### Sigma
s0 = np.ones(N, np.float32)
I = tt.iscalar("I")
prp = theano.function([sigma, I], LogPerplexity[I], allow_input_downcast=True)
print("Init PC:", PerplexityCost.eval({sigma: s0}))
for i in range(N):
    f = lambda s: (prp(s * np.ones(N), i) - np.log2(perplexity_target))
    s0[i] = brentq(f, 1e-6, 10, rtol=1e-8)
print("Final PC:", PerplexityCost.eval({sigma: s0}))

#### Y
f_g = theano.function([Y, sigma], [C, tt.grad(C, Y)])
Example #36
    def __init__(self, nh, nw):
        """
        nh :: dimension of the hidden layer
        nw :: vocabulary size
        """
        # parameters of the model
        self.index = theano.shared(name='index',
                                   value=numpy.eye(nw,
                                                   dtype=theano.config.floatX))
        self.wxg = theano.shared(
            name='wxg',
            value=0.02 *
            numpy.random.randn(nw, nh).astype(theano.config.floatX))
        self.whg = theano.shared(
            name='whg',
            value=0.02 *
            numpy.random.randn(nh, nh).astype(theano.config.floatX))
        self.wxi = theano.shared(
            name='wxi',
            value=0.02 *
            numpy.random.randn(nw, nh).astype(theano.config.floatX))
        self.whi = theano.shared(
            name='whi',
            value=0.02 *
            numpy.random.randn(nh, nh).astype(theano.config.floatX))
        self.wxf = theano.shared(
            name='wxf',
            value=0.02 *
            numpy.random.randn(nw, nh).astype(theano.config.floatX))
        self.whf = theano.shared(
            name='whf',
            value=0.02 *
            numpy.random.randn(nh, nh).astype(theano.config.floatX))
        self.wxo = theano.shared(
            name='wxo',
            value=0.02 *
            numpy.random.randn(nw, nh).astype(theano.config.floatX))
        self.who = theano.shared(
            name='who',
            value=0.02 *
            numpy.random.randn(nh, nh).astype(theano.config.floatX))
        self.w = theano.shared(
            name='w',
            value=0.02 *
            numpy.random.randn(nh, nw).astype(theano.config.floatX))
        self.bg = theano.shared(name='bg',
                                value=numpy.zeros(nh,
                                                  dtype=theano.config.floatX))
        self.bi = theano.shared(name='bi',
                                value=numpy.zeros(nh,
                                                  dtype=theano.config.floatX))
        self.bf = theano.shared(name='bf',
                                value=numpy.zeros(nh,
                                                  dtype=theano.config.floatX))
        self.bo = theano.shared(name='bo',
                                value=numpy.zeros(nh,
                                                  dtype=theano.config.floatX))
        self.b = theano.shared(name='b',
                               value=numpy.zeros(nw,
                                                 dtype=theano.config.floatX))
        self.h0 = theano.shared(name='h0',
                                value=numpy.zeros(nh,
                                                  dtype=theano.config.floatX))
        self.c0 = theano.shared(name='c0',
                                value=numpy.zeros(nh,
                                                  dtype=theano.config.floatX))

        #bundle
        self.params = [
            self.wxg, self.whg, self.wxi, self.whi, self.wxf, self.whf,
            self.wxo, self.who, self.w, self.bg, self.bi, self.bf, self.bo,
            self.b, self.h0, self.c0
        ]

        idxs = T.ivector()
        x = self.index[idxs]
        y_sentence = T.ivector('y_sentence')  # labels

        def recurrence(x_t, c_tm1, h_tm1):
            i_t = T.nnet.sigmoid(
                T.dot(x_t, self.wxi) + T.dot(h_tm1, self.whi) + self.bi)
            f_t = T.nnet.sigmoid(
                T.dot(x_t, self.wxf) + T.dot(h_tm1, self.whf) + self.bf)
            o_t = T.nnet.sigmoid(
                T.dot(x_t, self.wxo) + T.dot(h_tm1, self.who) + self.bo)
            g_t = T.tanh(
                T.dot(x_t, self.wxg) + T.dot(h_tm1, self.whg) + self.bg)
            c_t = f_t * c_tm1 + i_t * g_t
            h_t = o_t * T.tanh(c_t)
            s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
            return [c_t, h_t, s_t]

        [c, h, s], _ = theano.scan(fn=recurrence,
                                   sequences=x,
                                   outputs_info=[self.c0, self.h0, None],
                                   n_steps=x.shape[0],
                                   truncate_gradient=-1)

        p_y_given_x_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')

        sentence_nll = -T.mean(
            T.log2(p_y_given_x_sentence)[T.arange(x.shape[0]), y_sentence])

        sentence_gradients = [
            T.grad(sentence_nll, param) for param in self.params
        ]
        sentence_updates = [
            (param, param - lr * g)
            for param, g in zip(self.params, sentence_gradients)
        ]

        # perplexity of a sentence
        sentence_ppl = T.pow(2, sentence_nll)

        # theano functions to compile
        self.classify = theano.function(inputs=[idxs],
                                        outputs=y_pred,
                                        allow_input_downcast=True)
        self.prob_dist = theano.function(inputs=[idxs],
                                         outputs=p_y_given_x_sentence,
                                         allow_input_downcast=True)
        self.ppl = theano.function(inputs=[idxs, y_sentence],
                                   outputs=sentence_ppl,
                                   allow_input_downcast=True)
        self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr],
                                              outputs=sentence_nll,
                                              updates=sentence_updates,
                                              allow_input_downcast=True)
Example #37
def word_cost(probs,Y):
	lbl_probs = probs[T.arange(Y.shape[0]),Y]
	return -T.sum(T.log(lbl_probs)), -T.mean(T.log2(lbl_probs))
Example #38
 def get(self):
     y = T.clip(self.result_tensor, EPSILON, 1.0 - EPSILON)
     y = y.reshape((-1, y.shape[-1]))
     k = self.index_tensor.reshape((-1, ))
     return -T.mean(T.log2(y[T.arange(k.shape[0]), k]))
Example #39
	def negative_log_likelihood(self, y):

		loss = -T.mean(T.log2(self.p_y_given_x)[T.arange(y.shape[0]), y])

		return loss
Example #40
import theano.tensor as T
from theano.tensor import shared_randomstreams
import numpy as np
import numpy.random
from scipy.special import gammaincinv
from numpy.linalg import norm

# tensor stand-in for np.random.RandomState
rngT = shared_randomstreams.RandomStreams()
rng = numpy.random.RandomState()

# {{{ Fastfood Params }}}
n, d = T.dscalars('n', 'd')
# transform dimensions to be a power of 2
d0, n0 = d, n
l = T.ceil(T.log2(d))  # TODO cast to int
d = 2**l
k = T.ceil(n/d)  # TODO cast to int
n = d*k
# generate parameter 'matrices'
B = rng.choice([-1, 1], size=(k, d))
G = rng.normal(size=(k, d), dtype=np.float64)
PI = np.array([rng.permutation(d) for _ in xrange(k)]).T
S = np.empty((k*d, 1), dtype=np.float64)
# generate scaling matrix, S
for i in xrange(k):
    for j in xrange(d):
        p1 = rng.uniform(size=d)
        p2 = d/2
        Tmp = gammaincinv(p2, p1)
        Tmp = T.sqrt(2*Tmp)
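The fragment above pads the input dimension d up to the next power of two and stacks k = ceil(n/d) blocks, so the number of random features actually produced is n = d*k. A small numeric illustration in plain NumPy (the concrete values are made up):

import numpy as np

n0, d0 = 100, 24              # requested feature count and raw input dimension
l = int(np.ceil(np.log2(d0)))
d = 2 ** l                    # 32, next power of two >= d0
k = int(np.ceil(n0 / d))      # 4 stacked (k, d) blocks
n = d * k                     # 128 features actually generated
print(l, d, k, n)             # 5 32 4 128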
Example #41
#train_features = theano.shared(train_features_numpy, name='train_set', borrow=True)
valid_features = theano.shared(valid_features_numpy, name='valid_set', borrow=True)

model = srnnnobias_scan.SRNN(name="aoeu",
                             numvis=framelen*alphabetsize,
                             numhid=512, 
                             numframes=50,
                             cheating_level=1.0, 
                             output_type="softmax",
                             numpy_rng=numpy_rng, 
                             theano_rng=theano_rng)

ppw = 2 ** T.mean(  # first mean NLL over each time step prediction, then mean over the whole batch
    -T.log2(  # apply log_2
        T.sum(  # summing over the 3rd dimention, which has 27 elements
            (model._prediction_for_training * model._input_frames[1:]),
            axis=2
        )
    )
)
#train_perplexity = theano.function([], ppw, givens={model.inputs:train_features})
valid_perplexity = theano.function([], ppw, givens={model.inputs:valid_features})

#model.monitor = model.normalizefilters


# TRAIN MODEL
trainer = graddescent_rewrite.SGD_Trainer(model, train_features_numpy, batchsize=32, learningrate=0.1, loadsize=10000, gradient_clip_threshold=1.0)

print "BEFORE_TRAINING: valid perplexity: %f" % (valid_perplexity())
print 'training...'
for epoch in xrange(100):
Example #42
 def negative_log_likelihood(self, y):
     """
     compute negative log-likelihood of target words y
     explicitly normalize predicted scores
     """
     return -T.mean(T.log2(self.p_w_given_h)[y, T.arange(y.shape[0])])
Example #43
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = np.asarray(x_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype=np.int32)
    x_test = np.asarray(x_test, dtype=np.float32)
    y_test = np.asarray(y_test, dtype=np.int32)

    x_train = x_train.reshape((x_train.shape[0], x_train.shape[1] ** 2))
    x_test = x_test.reshape((x_test.shape[0], x_test.shape[1] ** 2))
    x_train /= 255
    x_test /= 255

    X = T.matrix()
    Y = T.matrix()
    w = theano.shared(np.zeros((28 ** 2, 10), dtype=theano.config.floatX))
    log_reg = T.nnet.sigmoid(T.dot(X, w))
    cost = T.mean(-Y*T.log2(log_reg)-(1-Y)*T.log2(1-log_reg))
    gradient = T.grad(cost=cost, wrt=w)
    updates = [[w, w - gradient * 0.1]]
    train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
    predict = theano.function(inputs=[X], outputs=log_reg)
    calc_accuracy()
    raw_input('Press enter')
    errors = []
    steps = 200
    for i in xrange(0, steps):
        e = train(x_train, to_output(y_train))
        print('{} Error: {}'.format(i, e))
        errors.append(e)
    print('Final mult: {}'.format(w.get_value()))
    plt.plot(np.asarray(range(0, steps)), np.asarray(errors))
    plt.show()
Example #44
def log2(x):
    return T.log2(x)
Example #45
    def __init__(self, nh, nw):
        """
        nh :: dimension of the hidden layer
        nw :: vocabulary size
        """
        # parameters of the model
        self.index = theano.shared(name='index',
                                value=numpy.eye(nw,
                                dtype=theano.config.floatX))
        # parameters of the first LSTM
        self.wxg_1 = theano.shared(name='wxg_1',
                                value=0.02 * numpy.random.randn(nw, nh)
                                .astype(theano.config.floatX))
        self.whg_1 = theano.shared(name='whg_1',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wxi_1 = theano.shared(name='wxi_1',
                                value=0.02 * numpy.random.randn(nw, nh)
                                .astype(theano.config.floatX))
        self.whi_1 = theano.shared(name='whi_1',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wxf_1 = theano.shared(name='wxf_1',
                                value=0.02 * numpy.random.randn(nw, nh)
                                .astype(theano.config.floatX))
        self.whf_1 = theano.shared(name='whf_1',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wxo_1 = theano.shared(name='wxo_1',
                                value=0.02 * numpy.random.randn(nw, nh)
                                .astype(theano.config.floatX))
        self.who_1 = theano.shared(name='who_1',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.bg_1 = theano.shared(name='bg_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bi_1 = theano.shared(name='bi_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bf_1 = theano.shared(name='bf_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bo_1 = theano.shared(name='bo_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.h0_1 = theano.shared(name='h0_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.c0_1 = theano.shared(name='c0_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))

        self.wxg_acc_1 = theano.shared(name='wxg_acc_1',
                                value=numpy.zeros((nw, nh),
                                dtype=theano.config.floatX))
        self.whg_acc_1 = theano.shared(name='whg_acc_1',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wxi_acc_1 = theano.shared(name='wxi_acc_1',
                                value=numpy.zeros((nw, nh),
                                dtype=theano.config.floatX))
        self.whi_acc_1 = theano.shared(name='whi_acc_1',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wxf_acc_1 = theano.shared(name='wxf_acc_1',
                                value=numpy.zeros((nw, nh),
                                dtype=theano.config.floatX))
        self.whf_acc_1 = theano.shared(name='whf_acc_1',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wxo_acc_1 = theano.shared(name='wxo_acc_1',
                                value=numpy.zeros((nw, nh),
                                dtype=theano.config.floatX))
        self.who_acc_1 = theano.shared(name='who_acc_1',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.bg_acc_1 = theano.shared(name='bg_acc_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bi_acc_1 = theano.shared(name='bi_acc_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bf_acc_1 = theano.shared(name='bf_acc_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bo_acc_1 = theano.shared(name='bo_acc_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.h0_acc_1 = theano.shared(name='h0_acc_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.c0_acc_1 = theano.shared(name='c0_acc_1',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))

        # parameters of the second LSTM
        self.wxg_2 = theano.shared(name='wxg_2',
                                value=0.02 * numpy.random.randn(nw, nh)
                                .astype(theano.config.floatX))
        self.whg_2 = theano.shared(name='whg_2',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wh_1g_2 = theano.shared(name='wh_1g_2',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wxi_2 = theano.shared(name='wxi_2',
                                value=0.02 * numpy.random.randn(nw, nh)
                                .astype(theano.config.floatX))
        self.whi_2 = theano.shared(name='whi_2',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wh_1i_2 = theano.shared(name='wh_1i_2',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wxf_2 = theano.shared(name='wxf_2',
                                value=0.02 * numpy.random.randn(nw, nh)
                                .astype(theano.config.floatX))
        self.whf_2 = theano.shared(name='whf_2',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wh_1f_2 = theano.shared(name='wh_1f_2',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wxo_2 = theano.shared(name='wxo_2',
                                value=0.02 * numpy.random.randn(nw, nh)
                                .astype(theano.config.floatX))
        self.who_2 = theano.shared(name='who_2',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.wh_1o_2 = theano.shared(name='wh_1o_2',
                                value=0.02 * numpy.random.randn(nh, nh)
                                .astype(theano.config.floatX))
        self.w_2 = theano.shared(name='w_2',
                                value=0.02 * numpy.random.randn(nh, nw)
                               .astype(theano.config.floatX))
        self.bg_2 = theano.shared(name='bg_2',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bi_2 = theano.shared(name='bi_2',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bf_2 = theano.shared(name='bf_2',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bo_2 = theano.shared(name='bo_2',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.b_2 = theano.shared(name='b_2',
                               value=numpy.zeros(nw,
                               dtype=theano.config.floatX))

        self.wxg_acc_2 = theano.shared(name='wxg_acc_2',
                                value=numpy.zeros((nw, nh),
                                dtype=theano.config.floatX))
        self.whg_acc_2 = theano.shared(name='whg_acc_2',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wh_1g_acc_2 = theano.shared(name='wh_1g_acc_2',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wxi_acc_2 = theano.shared(name='wxi_acc_2',
                                value=numpy.zeros((nw, nh),
                                dtype=theano.config.floatX))
        self.whi_acc_2 = theano.shared(name='whi_acc_2',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wh_1i_acc_2 = theano.shared(name='wh_1i_acc_2',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wxf_acc_2 = theano.shared(name='wxf_acc_2',
                                value=numpy.zeros((nw, nh),
                                dtype=theano.config.floatX))
        self.whf_acc_2 = theano.shared(name='whf_acc_2',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wh_1f_acc_2 = theano.shared(name='wh_1f_acc_2',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wxo_acc_2 = theano.shared(name='wxo_acc_2',
                                value=numpy.zeros((nw, nh),
                                dtype=theano.config.floatX))
        self.who_acc_2 = theano.shared(name='who_acc_2',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.wh_1o_acc_2 = theano.shared(name='wh_1o_acc_2',
                                value=numpy.zeros((nh, nh),
                                dtype=theano.config.floatX))
        self.w_acc_2 = theano.shared(name='w_acc_2',
                                value=numpy.zeros((nh, nw),
                                dtype=theano.config.floatX))
        self.bg_acc_2 = theano.shared(name='bg_acc_2',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bi_acc_2 = theano.shared(name='bi_acc_2',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bf_acc_2 = theano.shared(name='bf_acc_2',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.bo_acc_2 = theano.shared(name='bo_acc_2',
                                value=numpy.zeros(nh,
                                dtype=theano.config.floatX))
        self.b_acc_2 = theano.shared(name='b_acc_2',
                               value=numpy.zeros(nw,
                               dtype=theano.config.floatX))

        #bundle
        self.params = [self.wxg_1, self.whg_1, self.wxi_1, self.whi_1,
                self.wxf_1, self.whf_1, self.wxo_1, self.who_1, self.bg_1,
                self.bi_1, self.bf_1, self.bo_1, self.h0_1, self.c0_1,
                self.wxg_2, self.whg_2, self.wh_1g_2, self.wxi_2, self.whi_2,
                self.wh_1i_2, self.wxf_2,
                self.whf_2, self.wh_1f_2, self.wxo_2, self.who_2, self.wh_1o_2, self.w_2, self.bg_2,
                self.bi_2, self.bf_2, self.bo_2, self.b_2]
        self.params_acc = [self.wxg_acc_1, self.whg_acc_1, self.wxi_acc_1,
                self.whi_acc_1, self.wxf_acc_1, self.whf_acc_1, self.wxo_acc_1,
                self.who_acc_1, self.bg_acc_1, self.bi_acc_1, self.bf_acc_1,
                self.bo_acc_1, self.h0_acc_1, self.c0_acc_1, self.wxg_acc_2, 
                self.whg_acc_2, self.wh_1g_acc_2, self.wxi_acc_2,
                self.whi_acc_2, self.wh_1i_acc_2, self.wxf_acc_2,
                self.whf_acc_2, self.wh_1f_acc_2, self.wxo_acc_2,
                self.who_acc_2, self.wh_1o_acc_2, self.w_acc_2,
                self.bg_acc_2, self.bi_acc_2, self.bf_acc_2, self.bo_acc_2,
                self.b_acc_2]

        idxs = T.ivector()
        x = self.index[idxs]
        idxs_r = T.ivector()
        x_r = self.index[idxs_r]
        idxs_s = T.ivector()
        x_s = self.index[idxs_s]
        y_sentence = T.ivector('y_sentence') # labels

        def recurrence_1(x_t, c_tm1, h_tm1):
            i_t = T.nnet.sigmoid(T.dot(x_t, self.wxi_1) + T.dot(h_tm1,
                self.whi_1) + self.bi_1)
            f_t = T.nnet.sigmoid(T.dot(x_t, self.wxf_1) + T.dot(h_tm1,
                self.whf_1) + self.bf_1)
            o_t = T.nnet.sigmoid(T.dot(x_t, self.wxo_1) + T.dot(h_tm1,
                self.who_1) + self.bo_1)
            g_t = T.tanh(T.dot(x_t, self.wxg_1) + T.dot(h_tm1, self.whg_1) +
                    self.bg_1)
            c_t = f_t * c_tm1 + i_t * g_t
            h_t = o_t * T.tanh(c_t)
            return [c_t, h_t]

        def recurrence_2(x_t, c_tm1, h_tm1, h_1):
            i_t = T.nnet.sigmoid(T.dot(x_t, self.wxi_2) + T.dot(h_tm1,
                self.whi_2) + T.dot(h_1, self.wh_1i_2) + self.bi_2)
            f_t = T.nnet.sigmoid(T.dot(x_t, self.wxf_2) + T.dot(h_tm1,
                self.whf_2) + T.dot(h_1, self.wh_1f_2) + self.bf_2)
            o_t = T.nnet.sigmoid(T.dot(x_t, self.wxo_2) + T.dot(h_tm1,
                self.who_2) + T.dot(h_1, self.wh_1o_2) + self.bo_2)
            g_t = T.tanh(T.dot(x_t, self.wxg_2) + T.dot(h_tm1, self.whg_2) +
                    T.dot(h_1, self.wh_1g_2) + self.bg_2)
            c_t = f_t * c_tm1 + i_t * g_t
            h_t = o_t * T.tanh(c_t)
            s_t = T.nnet.softmax(T.dot(h_t, self.w_2) + self.b_2)
            return [c_t, h_t, s_t]

        [c_1, h_1], _ = theano.scan(fn=recurrence_1,
                                sequences=x_r,
                                outputs_info=[self.c0_1, self.h0_1],
                                n_steps=x_r.shape[0],
                                truncate_gradient=-1)
        c_1_last = c_1[-1]
        h_1_last = h_1[-1]

        [c_2, h_2, s_2], _ = theano.scan(fn=recurrence_2,
                                sequences=x,
                                non_sequences=[h_1_last],
                                outputs_info=[T.zeros_like(c_1_last),
                                    T.zeros_like(h_1_last), None],
                                n_steps=x.shape[0],
                                truncate_gradient=-1)

        [c_3, h_3, s_3], _ = theano.scan(fn=recurrence_2,
                                sequences=x_s,
                                non_sequences=[h_1_last],
                                outputs_info=[T.zeros_like(c_1_last),
                                    T.zeros_like(h_1_last), None],
                                n_steps=x_s.shape[0],
                                truncate_gradient=-1)

        p_y_given_x_sentence = s_2[:, 0, :]
        p_y_given_x_sentence2 = s_3[:, 0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')

        sentence_nll = -T.mean(T.log2(p_y_given_x_sentence)
                               [T.arange(x.shape[0]), y_sentence])

        sentence_gradients = [T.grad(sentence_nll, param) for param in self.params]

        #Adagrad
        sentence_updates = []
        for param_i, grad_i, acc_i in zip(self.params, sentence_gradients, self.params_acc):
            acc = acc_i + T.sqr(grad_i)
            sentence_updates.append((param_i, param_i - lr*grad_i/(T.sqrt(acc)+1e-5)))
            sentence_updates.append((acc_i, acc))

        # SGD
        #sentence_updates = [(param, param - lr*g) for param,g in zip(self.params, sentence_gradients)]

        # theano functions to compile
        #self.classify = theano.function(inputs=[idxs, idxs_r], outputs=y_pred, allow_input_downcast=True)
        #self.prob_dist = theano.function(inputs=[idxs, idxs_r], outputs=p_y_given_x_sentence, allow_input_downcast=True)
        self.prob_dist2 = theano.function(inputs=[idxs_r, idxs_s],
                outputs=p_y_given_x_sentence2, allow_input_downcast=True)
        self.nll = theano.function(inputs=[idxs, idxs_r, y_sentence], outputs=sentence_nll, allow_input_downcast=True)
        self.sentence_train = theano.function(inputs=[idxs, idxs_r, y_sentence, lr],
                                              outputs=sentence_nll,
                                              updates=sentence_updates,
                                              allow_input_downcast=True)
        self.sent_vec = theano.function(inputs=[idxs_r], 
                                        outputs=h_1[-1],
                                        allow_input_downcast=True)
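
Since the sentence cost above is a mean of negative base-2 log probabilities, it can be read as bits per word, and exponentiating with base 2 turns it into a per-word perplexity. A minimal sketch, assuming model is an instance of the class above and idxs, idxs_r and labels are int32 index arrays of matching length:

nll_bits = model.nll(idxs, idxs_r, labels)   # bits per word, because the cost uses T.log2
perplexity = 2.0 ** nll_bits
print('NLL: {} bits/word, perplexity: {}'.format(nll_bits, perplexity))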
Exemple #46
0
 def get(self):
     y = T.clip(self.result_tensor, EPSILON, 1.0 - EPSILON)
     y = y.reshape((-1, y.shape[-1]))
     k = self.index_tensor.reshape((-1,))
     return -T.mean(T.log2(y[T.arange(k.shape[0]), k]))
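
The clip above guards against T.log2(0): a predicted probability of exactly zero for the target class would otherwise give an infinite loss and NaN gradients. A small self-contained sketch of the same pattern, with EPSILON and the test values chosen only for illustration:

import numpy as np
import theano
import theano.tensor as T

EPSILON = 1e-7
y = T.matrix('y')    # predicted class probabilities, one row per sample
k = T.ivector('k')   # integer target classes
y_clipped = T.clip(y, EPSILON, 1.0 - EPSILON)
loss = -T.mean(T.log2(y_clipped[T.arange(k.shape[0]), k]))
f = theano.function([y, k], loss)

probs = np.array([[1.0, 0.0], [0.5, 0.5]], dtype=theano.config.floatX)
print(f(probs, np.array([1, 0], dtype=np.int32)))  # finite (about 12.1) thanks to the clip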
Exemple #47
0
 def unnormalized_neg_log_likelihood(self, y, c=1.0):
     """
     compute unnormalized log-likelihood of target words y
     """
     return -T.mean(T.log2(self.s * T.exp(c))[y, T.arange(y.shape[0])])
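
Note that the T.exp(c) factor inside the base-2 log only shifts the result by a constant, since log2(s * e^c) = log2(s) + c * log2(e). A tiny NumPy check of that identity, independent of the method above:

import numpy as np

s, c = 0.3, 1.0
lhs = np.log2(s * np.exp(c))
rhs = np.log2(s) + c * np.log2(np.e)
print(np.isclose(lhs, rhs))  # True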
    def build_model(self, tparams, options):
        trng = RandomStreams(1234)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')

        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])

        embW_rev = tparams['Wemb'][xW[::-1, :].flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        if options.get('swap_aux', 0):
            xAuxEmb = tensor.dot(xAux,
                                 tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        emb_rev = tensor.set_subtensor(
            embW_rev[mask[::-1, :].argmax(axis=0) - 1,
                     tensor.arange(n_samples), :], embImg[0, :, :])

        # This implements input dropout
        if options['use_dropout']:
            emb = dropout_layer(emb,
                                use_noise,
                                trng,
                                options['drop_prob_encoder'],
                                shp=emb.shape)
            if options.get('en_aux_inp', 0):
                xAuxEmb = dropout_layer(xAuxEmb,
                                        use_noise,
                                        trng,
                                        options['drop_prob_aux'],
                                        shp=xAuxEmb.shape)

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix='lstm',
                                             sched_prob_mask=[])
        #############################################################################################################################
        # This implements core reverse lstm
        rev_rval, rev_updatesLSTM = basic_lstm_layer(
            tparams,
            emb_rev[:n_timesteps, :, :],
            xAuxEmb,
            use_noise,
            options,
            prefix='rev_lstm',
            sched_prob_mask=[])
        #############################################################################################################################

        # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless.
        if options['use_dropout']:
            # XXX : Size given to dropout is missing one dimension. This keeps the dropped units consistent across time!?.
            # ###   Is this a good bug ?
            p = dropout_layer(
                sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
            rev_p = dropout_layer(
                sliceT(rev_rval[0][:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size'])
            rev_p = sliceT(rev_rval[0][:, :, :],
                           options.get('hidden_depth',
                                       1), options['hidden_size'])

        n_out_samps = (n_timesteps - 2) * n_samples
        if options.get('class_out_factoring', 0) == 0:
            pW = (tensor.dot(p[:-1, :, :] + rev_p[::-1, :, :][2:, :, :],
                             tparams['Wd']) + tparams['bd']).reshape(
                                 [n_out_samps, options['output_size']])
            pWSft = tensor.nnet.softmax(pW)
            totProb = pWSft[tensor.arange(n_out_samps), xW[1:-1, :].flatten()]
            out_list = [pWSft, totProb, p]
        else:
            ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            pW = ((tparams['Wd'][:, xC, :].T *
                   (p.reshape([1, n_out_samps, options['hidden_size']]))).sum(
                       axis=-1).T + tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])
            pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
                [n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                      pCSft[tensor.arange(n_out_samps), xC]
            out_list = [pWSft, pCSft, totProb, p]

        # XXX : THIS IS VERY FISHY, CHECK THE MASK INDEXING AGAIN
        probs_valid = tensor.log(totProb + 1e-10) * mask[1:-1, :].flatten()
        tot_cost = -(probs_valid.sum())
        tot_pplx = -(tensor.log2(totProb + 1e-10) *
                     mask[1:-1, :].flatten()).sum()
        cost = [tot_cost / options['batch_size'], tot_pplx]

        inp_list = [xW, mask, xI]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        if options.get('sched_sampling_mode', None) is not None:
            inp_list.append(curr_epoch)

        per_sent_prob = probs_valid.reshape([n_timesteps - 2,
                                             n_samples]).sum(axis=0)
        f_per_sentLogP = theano.function(inp_list,
                                         per_sent_prob,
                                         name='f_pred_logprob',
                                         updates=updatesLSTM)
        f_pred_prob = ['', f_per_sentLogP, '']

        return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
Exemple #49
0
valid_features = theano.shared(valid_features_numpy,
                               name='valid_set',
                               borrow=True)

model = srnnnobias_scan.SRNN(name="aoeu",
                             numvis=framelen * alphabetsize,
                             numhid=512,
                             numframes=50,
                             cheating_level=1.0,
                             output_type="softmax",
                             numpy_rng=numpy_rng,
                             theano_rng=theano_rng)

ppw = 2**T.mean(  # first mean NLL over each time step prediction, then mean over the whole batch
    -T.log2(  # apply log_2
        T.sum(  # summing over the 3rd dimension, which has 27 elements
            (model._prediction_for_training * model._input_frames[1:]),
            axis=2)))
#train_perplexity = theano.function([], ppw, givens={model.inputs:train_features})
valid_perplexity = theano.function([],
                                   ppw,
                                   givens={model.inputs: valid_features})

#model.monitor = model.normalizefilters

# TRAIN MODEL
trainer = graddescent_rewrite.SGD_Trainer(model,
                                          train_features_numpy,
                                          batchsize=32,
                                          learningrate=0.1,
                                          loadsize=10000,
                                          gradient_clip_threshold=1.0)
def test_lstm():
    
    # load wiki data
    X_train_np, X_valid_np, X_test_np = gen_data_wiki()
    batchsize = 100
    blocklength = 25000 #450000
    bsize_test = batchsize 
    numframe = 100
    numframe_test = 1250#2500#5000 
    X_valid = onehot(X_valid_np).reshape(bsize_test, X_valid_np.shape[0]/bsize_test, 205)
    X_test = onehot(X_test_np).reshape(bsize_test, X_test_np.shape[0]/bsize_test, 205)
    nb_classes= 205 

    X_train_shared = theano.shared(np.zeros((batchsize,blocklength, nb_classes)).astype('float32'), name = 'train_set', borrow=True)
    X_valid_shared = theano.shared(np.zeros((bsize_test, numframe_test, nb_classes)).astype('float32'), name = 'valid_set', borrow=True)
    X_test_shared = theano.shared(np.zeros((bsize_test, numframe_test, nb_classes)).astype('float32'), name = 'test_set', borrow=True)

    # build the model
    from keras.layers.recurrent import LSTM, SimpleRNN, LSTMgrave 
    from layer_icml import LSTM_bu, LSTM_td, RNN_td, RNN_bu, RNN_sh, RNN_dp, LSTM_dp, RNN_shallow 
    from layer_icml import RNN_relugate, RNN_ens, RNN_2tanh, RNN_ntanh, RNN_multidp, LSTM_multi, LSTM_u, RNN_utanh, LSTM_uu, LSTM_uugrave 
    from keras.layers.core import Dense, Activation, TimeDistributedDense
    from keras.initializations import normal, identity

    x = T.tensor3()
    y = T.matrix()

    name_init = 'uniform'
    n_h = 2450
    L1 = LSTMgrave(output_dim=n_h, init='uniform', batchsize=batchsize,
                   inner_init='uniform', input_shape=(None, nb_classes),
                   return_sequences=True)
    name_model = ('lstm_shallowgrave_' + str(n_h) + name_init + '0.01'
                  + '_batchsize' + str(batchsize) + '_numframe' + str(numframe))

    # RNN
    name_act = 'tanh'; name_init = 'uniform' 
    #n_h=2048;L1 = RNN_shallow(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_tanh" + str(n_h) + "_"+name_act+ name_init + '0.1'
    #n_h = 2048;L1 = SimpleRNN(output_dim = n_h, init = 'uniform', inner_init = 'uniform', activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_shallow"+str(n_h)+name_act+ name_init + '0.05'
    #n_h = 4096;L1 = RNN_utanh(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_utanh_2_0_0" + str(n_h) + "_"+name_act+ name_init +'0.01' 
    n_h = 2048; in_act = 'tanh'
    L1 = LSTM_uugrave(output_dim=n_h, batchsize=batchsize, init='uniform',
                      inner_init='uniform', input_shape=(None, nb_classes),
                      return_sequences=True)
    name_model = ('lstm_u_grave' + in_act + '_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01'
                  + '_batchsize' + str(batchsize) + '_numframe' + str(numframe))
    #n_h = 1200; in_act = 'tanh';L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform', input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_stack2'+in_act+'_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01'
    #n_h = 700; L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L3 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L4 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L5 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= '7005layerlstm_uu_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'

    D1 = TimeDistributedDense(nb_classes)
    D1._input_shape = [None, None, n_h]
    O = Activation('softmax')

    #layers = [L1, L2, L3, L4, L5, D1, O]
    layers = [L1, D1, O]
    #layers = [L1, L2, D1, O]

    load_model = True 
    if load_model:
        #f_model = open('/data/lisatmp3/zhangsa/lstm/models/180rnn_td_reluidentityotherinit_identity_sgd0.1_clip10.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune5e-4inorder_withtest.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb')
        f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune1e-5inorder_withtest.pkl', 'rb')
        layers = pickle.load(f_model)
        f_model.close()
        name_model_load = 'wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest' + 'finetune2e-6'
        #name_perpmat_load = 'wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.npy'
        L1 = layers[0]

    out  = x
    params = []
    for l in layers: 
        if not load_model:
            l.build()
        l.input = out
        params += l.params
        if l == L1:
            out = l.get_output()[0]
            h0 = l.get_output()[0]
            c0 = l.get_output()[1]
        else:
            out = l.get_output()

    # compute the loss
    loss = -T.mean(T.log(out)[:,:numframe-1,:] *x[:,1:,:])
    logperp_valid = T.mean(-T.log2(T.sum(out[:,:numframe_test-1,:]*x[:,1:,:],axis=2)))
    logperp_train = T.mean(-T.log2(T.sum(out[:,:numframe-1,:]*x[:,1:,:],axis=2)))

    # set optimizer
    from keras.constraints import identity as ident 
    from keras.optimizers import RMSprop, SGD, Adam

    lr_ = 2*1e-6
    clipnorm_ = 10000
    rmsprop = RMSprop(lr=lr_, clipnorm=clipnorm_)
    sgd = SGD(lr=lr_, momentum=0.9, clipnorm=clipnorm_)
    adam = Adam(lr=lr_)

    #opt = sgd; name_opt = 'sgd'+str(lr_); clip_flag = False 
    #opt = rmsprop; name_opt = 'rmsprop'+str(lr_)
    opt = adam; name_opt = 'adam' + str(lr_); clip_flag = False

    if clip_flag: 
        name_opt = name_opt + '_clip'+str(clipnorm_)

    #param update for regular parameters
    constraints = [ident() for p in params]    
    updates = opt.get_updates(params, constraints, loss)

    index = T.iscalar()
    f_train = theano.function([index], [loss, h0, c0], updates = updates,
            givens={x:X_train_shared[:,index*numframe : (index+1)*numframe, :]})

    # perplexity function
    f_perp_valid = theano.function([], [logperp_valid, h0, c0], givens={x:X_valid_shared})
    f_perp_test = theano.function([], [logperp_valid, h0, c0], givens={x:X_test_shared})

    #f_perp_valid = theano.function([index], [logperp_valid], givens={x:X_valid_shared[index*bsize_test : (index+1)*bsize_test]})
    #f_perp_test = theano.function([index], [logperp_valid], givens={x:X_test_shared[index*bsize_test : (index+1)*bsize_test]})


    def perp_valid():
        logperp_acc = 0
        n = 0
        L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        for k in xrange(X_valid.shape[1]/numframe_test):
            X_valid_shared.set_value(X_valid[:, k*numframe_test:(k+1)*numframe_test, :])
            perp, h0, c0 = f_perp_valid()
            logperp_acc += perp
            L1.H0.set_value(h0[:,-1,:])
            L1.C0.set_value(c0[:,-1,:])
            n += 1
        return (logperp_acc/n)

    def perp_test():
        logperp_acc = 0
        n = 0
        L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        for k in xrange(X_test.shape[1]/numframe_test):
            X_test_shared.set_value(X_test[:, k*numframe_test:(k+1)*numframe_test, :])
            perp, h0, c0 = f_perp_test()
            logperp_acc += perp
            L1.H0.set_value(h0[:,-1,:])
            L1.C0.set_value(c0[:,-1,:])
            n += 1
        return (logperp_acc/n)


    #def perp_valid():
    #    logperp_acc = 0
    #    n = 0
    #    for k in xrange(X_valid_np.shape[0]/(bsize_test*numframe_test)):
    #        X_valid_shared.set_value(onehot(X_valid_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205)))
    #        for i in xrange(X_valid_shared.get_value().shape[0]/bsize_test):
    #            logperp_acc += f_perp_valid(i)
    #            n += 1
    #    return (logperp_acc/n)

    #def perp_test():
    #    logperp_acc = 0
    #    n = 0
    #    for k in xrange(X_test_np.shape[0]/(bsize_test*numframe_test)):
    #        X_test_shared.set_value(onehot(X_test_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205)))
    #        for i in xrange(X_test_shared.get_value().shape[0]/bsize_test):
    #            logperp_acc += f_perp_test(i)
    #            n += 1
    #    return (logperp_acc/n)


    ######## testmodel ########
    #test_score = perp_valid()
    #pdb.set_trace()


    epoch_ = 9000 
    perpmat = np.zeros((epoch_, 3))
    t_start = time.time()
    name = 'wiki100'+ name_model + '_' +  name_opt 

    if load_model:
        name = name_model_load 
        #perpmat = np.load(name_perpmat_load)

    #only_block = False
    #if only_block:
    #    name = name + 'random_only_block'
    #else:
    #    name = name + 'random_per_row_in_block'
    name = name+'inorder'
    blocksize = batchsize*blocklength
    bestscore = 100000000
    for epoch in xrange(epoch_):
        for k in xrange(X_train_np.shape[0]/blocksize):
            t_s = time.time()
            print "reloading " + str(k) + " th train patch..."

            #if only_block:
            #    pos = np.random.randint(0, X_train_np.shape[0]-blocksize)
            #    X_train_shared.set_value(onehot(X_train_np[pos: pos + blocksize]).reshape(batchsize, blocklength, 205))
            #else:    
            #    pos = np.random.randint(0, X_train_np.shape[0]-blocklength, batchsize)
            #    tmp = np.zeros((batchsize, blocklength, 205)).astype('float32')
            #    for j in xrange(batchsize):
            #        tmp[j] = onehot(X_train_np[pos[j]: pos[j] + blocklength])
            #    X_train_shared.set_value(tmp)
            X_train_shared.set_value(onehot(X_train_np[k*blocksize: (k+1)*blocksize]).reshape(batchsize, blocklength, 205)) 
            print "reloading finished, time cost: " + str(time.time()-t_s)
            L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
            L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
            for i in xrange(blocklength/numframe):
                loss, h0, c0 = f_train(i)
                L1.H0.set_value(h0[:,-1,:])
                L1.C0.set_value(c0[:,-1,:])
                if i%10 == 0:
                    t_end = time.time()
                    print "Time consumed: " + str(t_end - t_start) + " secs."
                    t_start = time.time()
                    print "Epoch "+ str(epoch)+" " + name + ": The training loss in batch " + str(k*(blocklength/numframe)+i) +" is: " + str(loss) + "."
            if k%6 == 0:
                #save results
                m = epoch*X_train_np.shape[0]/(blocksize*6) +k/6
                perpmat[m][0], perpmat[m][1] = 0, perp_valid()
                perpmat[m][2] = perp_test()
                np.save('/data/lisatmp4/zhangsa/rnn_trans/results/' + name +'_withtest.npy', perpmat)

                #save model
                if perpmat[m][1] < bestscore:
                    bestscore = perpmat[m][1]
                    f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/' + name + '_withtest.pkl', 'wb+')
                    pickle.dump(layers, f_model)
                    f_model.close()
       
        print "Epoch "+ str(epoch)+ " " + name + ": The training perp is: " + str(perpmat[epoch][0]) \
                      + ", test perp is: " + str(perpmat[epoch][1]) + "." 
                         outputs_info=[H0, None]
                        )

#[h_ts_fb, y_predicted_fb], _ = T.scan(forward_step,
#                         sequences=[x],
#                         outputs_info=[h0, None]
#                        )

logprobs = y_predicted[TT.arange(Y.shape[0]), TT.transpose(Y), TT.reshape(TT.arange(n_minibatches),(n_minibatches,1))]
DENOM_th = TT.diag(1/TT.sum(logprobs>0, axis=1).astype('float32'))
cross_entropy = TT.sum(TT.dot(DENOM_th,logprobs)) / n_minibatches

#cross_entropy = -TT.mean(TT.log2(TT.nonzero_values(y_predicted[TT.arange(Y.shape[0]), TT.transpose(Y), TT.reshape(TT.arange(n_minibatches),(n_minibatches,1))])))
#cross_entropy_fb = -TT.mean(TT.log2(y_predicted_fb)[TT.arange(y.shape[0]), y])

cross_entropy_fb = -TT.log2(y_t_fb)[y]

params = [W_in, W_rec, W_out]
theta_updates = {W_in: W_in_theta_update, W_rec: W_rec_theta_update, W_out: W_out_theta_update}

g_params = []
for param in params:
    g_params.append(TT.grad(cross_entropy, param))
    T.pp(TT.grad(cross_entropy, param))

updates = []
for param, grad in zip(params, g_params):
    theta_update = theta_updates[param]
    upd = mom * theta_update - lr * grad
    updates.append((theta_updates[param], upd))
    updates.append((param, param + upd,))
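
The loop above is classical momentum: the stored velocity is decayed by mom, pushed along the negative gradient, and then added to the parameter. A minimal NumPy sketch of the same update rule with made-up values:

import numpy as np

lr, mom = 0.1, 0.9
param = np.array([1.0, -2.0])
velocity = np.zeros_like(param)
grad = np.array([0.5, -0.5])

velocity = mom * velocity - lr * grad   # upd = mom * theta_update - lr * grad
param = param + velocity                # param <- param + upd
print(param)                            # [ 0.95 -1.95]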
    def __init__(self, inp_shape, output_num, training_size, stride=(4, 2), untie_biases=False):
        # setup shared vars
        self.state = theano.shared(np.zeros((1, inp_shape[1], inp_shape[2], inp_shape[3]), dtype=theano.config.floatX))
        self.training_states = theano.shared(np.zeros((training_size, inp_shape[1], inp_shape[2], inp_shape[3]),
                                                      dtype=theano.config.floatX))
        self.training_actions = theano.shared(np.zeros(training_size, dtype=np.int32))
        self.training_rewards = theano.shared(np.zeros(training_size, dtype=theano.config.floatX))

        network_dic = create_A3C(inp_shape, output_num, stride=stride, untie_biases=untie_biases)
        self.l_in = network_dic['l_in']
        self.l_hid1 = network_dic['l_hid1']
        self.l_hid2 = network_dic['l_hid2']
        self.l_hid3 = network_dic['l_hid3']
        self.l_policy = network_dic['l_policy']
        self.l_value = network_dic['l_value']

        # network output vars
        policy_output = lasagne.layers.get_output(self.l_policy, inputs=self.state)
        value_output = lasagne.layers.get_output(self.l_value, inputs=self.state)

        # setup training vars and loss
        training_policy_output = lasagne.layers.get_output(self.l_policy, inputs=self.training_states)
        training_value_output = lasagne.layers.get_output(self.l_value, inputs=self.training_states)

        # log(prediction, action taken) * (R - Value(states))
        # one_hot_true = T.zeros_like(training_policy_output)
        # one_hot_true = T.set_subtensor(one_hot_true[T.arange(self.training_actions.shape[0]), self.training_actions], 1)
        # rewrite categorical crossentropy here because the lasagne/theano function sums the result and I need per step
        # categorical_crossentropy = -T.sum(one_hot_true * T.log(training_policy_output), axis=1)
        entropy = 0.01 * -T.sum(training_policy_output * T.log2(training_policy_output), axis=1)
        value_diff_rewards = (self.training_rewards - training_value_output[:, 0] + entropy)

        # sum is to aggregate over the nsteps
        # index the log-probability of the action actually taken at each step
        policy_loss = T.sum(T.log(training_policy_output[T.arange(self.training_actions.shape[0]),
                                                         self.training_actions]) * value_diff_rewards)
        value_loss = T.sum((self.training_rewards - training_value_output[:, 0])**2)

        # get layer parms
        policy_params = lasagne.layers.get_all_params(self.l_policy)
        value_params = lasagne.layers.get_all_params(self.l_value)
        params = policy_params + self.l_value.get_params()

        # get grads
        policy_grads = T.grad(policy_loss, policy_params)
        value_grads = T.grad(value_loss, value_params)

        # combine grads for the non-output layers
        combine_grads = policy_grads[0:-2]
        for grad_ind in range(len(value_grads)-2):
            combine_grads[grad_ind] += value_grads[grad_ind]

        # add grads for policy and value layers
        grads = combine_grads + policy_grads[-2:] + value_grads[-2:]

        # add loss to return in grads list
        grads.append(policy_loss)
        grads.append(value_loss)

        # updates
        self.w1_update = theano.shared(np.zeros(self.l_hid1.W.eval().shape, dtype=theano.config.floatX))
        self.w2_update = theano.shared(np.zeros(self.l_hid2.W.eval().shape, dtype=theano.config.floatX))
        # the bias shapes from b.eval() already account for untie_biases
        self.b1_update = theano.shared(np.zeros(self.l_hid1.b.eval().shape, dtype=theano.config.floatX))
        self.b2_update = theano.shared(np.zeros(self.l_hid2.b.eval().shape, dtype=theano.config.floatX))
        self.w3_update = theano.shared(np.zeros(self.l_hid3.W.eval().shape, dtype=theano.config.floatX))
        self.b3_update = theano.shared(np.zeros(self.l_hid3.b.eval().shape, dtype=theano.config.floatX))
        self.l_policy_w_update = theano.shared(np.zeros(self.l_policy.W.eval().shape, dtype=theano.config.floatX))
        self.l_policy_b_update = theano.shared(np.zeros(self.l_policy.b.eval().shape, dtype=theano.config.floatX))
        self.l_value_w_update = theano.shared(np.zeros(self.l_value.W.eval().shape, dtype=theano.config.floatX))
        self.l_value_b_update = theano.shared(np.zeros(self.l_value.b.eval().shape, dtype=theano.config.floatX))

        network_updates = [self.w1_update, self.b1_update, self.w2_update, self.b2_update, self.w3_update,
                           self.b3_update, self.l_policy_w_update, self.l_policy_b_update, self.l_value_w_update,
                           self.l_value_b_update]
        theano_updates = lasagne.updates.rmsprop(network_updates, params, 0.0001)

        self._get_policy_output = theano.function([], policy_output)
        self._get_value_output = theano.function([], value_output)
        self._gradient_step = theano.function([], updates=theano_updates)
        self._get_grads = theano.function([], outputs=grads)

        self.accumulated_grads = None
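
The entropy term above (0.01 * -sum(p * log2 p)) is largest for a uniform policy and near zero for a deterministic one, so adding it to the advantage encourages exploration. A quick standalone check of the two extremes, in NumPy rather than Theano:

import numpy as np

def policy_entropy(p):
    # entropy in bits, matching the T.log2 used above
    return -np.sum(p * np.log2(p), axis=-1)

print(policy_entropy(np.array([0.25, 0.25, 0.25, 0.25])))        # 2.0 bits (uniform)
print(policy_entropy(np.array([1.0 - 3e-8, 1e-8, 1e-8, 1e-8])))  # close to 0 (deterministic)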
Exemple #53
0
def norm_entropy(dist):
    return entropy(dist) / T.log2(dist.shape[-1])
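
Dividing by T.log2(dist.shape[-1]) rescales the entropy by its maximum possible value, the entropy of a uniform distribution over the last axis, so the result lies in [0, 1] however many outcomes there are. The entropy helper used above is defined elsewhere; the NumPy lines below only illustrate the scaling:

import numpy as np

p = np.array([0.5, 0.25, 0.125, 0.125])
h = -np.sum(p * np.log2(p))      # 1.75 bits
print(h / np.log2(p.size))       # 0.875, i.e. 87.5% of the 2-bit maximum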
Exemple #54
0
import theano
import theano.tensor as T

"""
A bunch of loss definitions for ease
"""

# zero one loss
zero_one_loss = lambda x, y: T.sum(T.neq(T.argmax(x), y))

# log loss or cross entropy error
cross_entropy = lambda x, y: -T.mean(T.log2(x[T.arange(0, y.shape[0]), y]))

# mean squarred error
mse = lambda x, y: T.mean(T.square(x - y))
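
These lambdas only build symbolic expressions; to evaluate them they still have to be wrapped in theano.function. A minimal usage sketch for the cross-entropy above (measured in bits because of T.log2), assuming the cross_entropy lambda is in scope and with illustrative inputs:

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')    # predicted probabilities, one row per sample
y = T.ivector('y')   # integer class labels
xent = theano.function([x, y], cross_entropy(x, y))

probs = np.array([[0.9, 0.1], [0.2, 0.8]], dtype=theano.config.floatX)
print(xent(probs, np.array([0, 1], dtype=np.int32)))  # about 0.24 bits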
 def negative_log_likelihood(self, y):
     # take the logarithm with base 2
     return -T.mean(T.log2(self.p_w_given_h)[T.arange(y.shape[0]), y])
 def accumCost(pred,xW,m,c_sum,ppl_sum):
     pred = tensor.nnet.softmax(pred)
     c_sum += (tensor.log(pred[tensor.arange(n_samples), xW]+1e-20) * m)
     ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW]+1e-10) * m)
     return c_sum, ppl_sum
Exemple #57
0
def kullback_leibler(dist1, dist2):
    logged = T.log2(dist1 / dist2)
    # let 0 * log(0) -> 0
    #logged = T.set_subtensor(logged[T.eq(dist1, 0).nonzero()], 0)
    return (dist1 * logged).sum(axis=1)
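
A quick sanity check of the function above on a batch of two distribution pairs: the divergence is zero when the distributions match and positive otherwise, in bits because of T.log2. It assumes kullback_leibler is in scope; the names and values are only illustrative:

import numpy as np
import theano
import theano.tensor as T

d1 = T.matrix('d1')
d2 = T.matrix('d2')
kl = theano.function([d1, d2], kullback_leibler(d1, d2))

p = np.array([[0.5, 0.5], [0.9, 0.1]], dtype=theano.config.floatX)
q = np.array([[0.5, 0.5], [0.5, 0.5]], dtype=theano.config.floatX)
print(kl(p, q))  # roughly [0.0, 0.53]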