Exemple #1
0
    def _activation(self, Y, L, M, W):
        r"""Compute the normalized activation s for the input batch Y.

        The activation follows the generative model of hierarchical
        Poisson mixtures:
        I_c =
         \sum_d \log(W_{cd})y_d + \log(M_{lc})        for labeled data
         \sum_d \log(W_{cd})y_d + \log(\sum_k M_{kc}) for unlabeled data
        s_c = softmax(I_c)

        Entries of L equal to -1 are treated as "no label": their row of the
        recurrent term is replaced by the sum of M over its first axis.
        """
        def _exponent_headroom(n_terms):
            # ceil(log(n)) is subtracted from / added to the exponent limits
            # below; presumably so that a sum over n terms stays in range.
            return T.ceil(T.log(n_terms.astype('float32')))

        # Input integration: contract axis 1 of Y with axis 1 of log(W).
        log_drive = T.tensordot(Y, T.log(W), axes=[1, 1])

        # Recurrent term: pick the row of M given by each label, then
        # overwrite the rows that belong to unlabeled data points.
        unlabeled = T.eq(L, -1).nonzero()
        recurrent = T.set_subtensor(M[L][unlabeled], T.sum(M, axis=0))

        headroom = _exponent_headroom(log_drive.shape[1])
        max_exponent = 86. - headroom
        min_exponent = -87. + headroom

        # Overflow protection: shift every row whose maximum exceeds the
        # largest allowed exponent so that the maximum lands on that limit.
        row_max = T.max(log_drive, axis=1, keepdims=True)
        shift = T.switch(T.gt(row_max, max_exponent),
                         row_max - max_exponent, 0.)

        # Underflow protection: values that would fall below the smallest
        # allowed exponent after shifting are pinned to that minimum.
        too_small = T.lt(log_drive - shift, min_exponent)
        log_drive = T.switch(too_small, shift + min_exponent, log_drive)

        # Recurrent softmax: weight by the recurrent term and normalize
        # over axis 1.
        unnormalized = recurrent * T.exp(log_drive - shift)
        return unnormalized / T.sum(unnormalized, axis=1, keepdims=True)
Exemple #2
0
    def _activation(self, Y, L, M, W):
        r"""Return the recurrent-softmax activation for the inputs Y.

        Formula (hierarchical Poisson mixture model):
        I_c =
         \sum_d \log(W_{cd})y_d + \log(M_{lc})        for labeled data
         \sum_d \log(W_{cd})y_d + \log(\sum_k M_{kc}) for unlabeled data
        s_c = softmax(I_c)

        A label value of -1 in L selects the "unlabeled" form.
        """
        # --- feed-forward drive -------------------------------------------
        drive = T.tensordot(Y, T.log(W), axes=[1, 1])

        # --- label-dependent recurrent weights ----------------------------
        label_weights = M[L]
        no_label = T.eq(L, -1).nonzero()
        label_weights = T.set_subtensor(
            label_weights[no_label], T.sum(M, axis=0))

        # --- exponent budget for the exp() calls --------------------------
        # The limits are tightened by ceil(log(#columns of the drive)).
        budget = T.ceil(T.log(drive.shape[1].astype('float32')))
        upper = 86. - budget
        lower = -87. + budget

        # Rows whose peak exceeds `upper` are shifted down (overflow guard).
        peak = T.max(drive, axis=1, keepdims=True)
        offset = T.switch(T.gt(peak, upper), peak - upper, 0.)

        # Entries that end up below `lower` are clamped (underflow guard).
        drive = T.switch(T.lt(drive - offset, lower), offset + lower, drive)

        # --- normalized activation ----------------------------------------
        weighted = label_weights * T.exp(drive - offset)
        s = weighted / T.sum(weighted, axis=1, keepdims=True)
        return s
Exemple #3
0
    def get_output_for(self, inputs, **kwargs):
        """Unfinished ROI max-pooling forward pass (work in progress).

        ``inputs[0]`` is used as the feature map and ``inputs[1]`` as the
        boxes. Per the comment below each box is presumably
        ``[batch_index, x1, y1, x2, y2]`` -- TODO confirm against the caller.

        NOTE(review): as written this method cannot run to completion; the
        inline notes list the misspelled calls and undefined names.
        """
        # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
        input = inputs[0]
        boxes = inputs[1]
        # The four axes of `input` are read as (batch, channels, height, width).
        batch = T.shape(input)[0]
        channels = T.shape(input)[1]
        height = T.shape(input)[2]
        width = T.shape(input)[3]
        num_boxes = T.shape(boxes)[0]
        output = T.zeros((batch * num_boxes, channels, self.num_features))

        # NOTE(review): `bb` comes from range(...), i.e. it is an index, yet it
        # is subscripted below as if it were a row of `boxes`. `num_boxes` is
        # also the result of T.shape, which range() presumably cannot consume
        # if T is theano.tensor -- TODO confirm intended iteration.
        for idbb, bb in enumerate(range(num_boxes)):
            batch_ind = bb[0]

            pool_list = []
            #for pool_dim in self.pool_dims:
            # Box corners scaled by self.sp_scale and clipped to the map size.
            # NOTE(review): `heigth` is not defined in this method (looks like
            # a misspelling of `height`).
            start_w = T.clip(T.floor(bb[1] * self.sp_scale), 0, width)
            start_h = T.clip(T.floor(bb[2] * self.sp_scale), 0, heigth)
            end_w = T.clip(T.ceil(bb[3] * self.sp_scale), 0, width)
            end_h = T.clip(T.ceil(bb[4] * self.sp_scale), 0, height)

            # NOTE(review): the second positional argument here is probably
            # meant as a lower bound of 1 (an element-wise maximum), and
            # `T.amx` looks like a typo -- verify against the T API.
            w = T.max(end_w - start_w + 1, 1)
            h = T.amx(end_h - start_h + 1, 1)

            # NOTE(review): `_meshgrid` and `pool_dims` are not defined in this
            # method; `pool_dims` is possibly `self.pool_dims`.
            start_samples_y, start_sample_x = T.floor(
                _meshgrid(start_h, end_h, pool_dims + 1, start_w, end_w,
                          pool_dims + 1))
            end_samples_y, end_sample_x = T.ceil(
                _meshgrid(start_h, end_h, pool_dims + 1, start_w, end_w,
                          pool_dims + 1))

            # NOTE(review): `py`, `px`, `idy`, `idx`, `samples_y` and
            # `samples_x` are not defined here (they only appear in the
            # commented-out loops below), and the result of this indexing
            # expression is discarded.
            input[batch_ind, :,
                  np.floor(py):np.ceil(samples_y[idy + 1]),
                  np.floor(px):np.ceil(samples_x[idx + 1])]

            #T.max()

            #for idx,px in enumerate(samples_x[:-1]):
            #    for idy,py in enumerate(samples_y[:-1]):

            #       (pool.dnn_pool( input[batch_ind,:,np.floor(py):np.ceil(samples_y[idy+1]),np.floor(px):np.ceil(samples_x[idx+1])],(0,0),(None,None),'max', (0,0) )).flatten(2)

            #sz_w = ( w - 1 ) // pool_dim
            #sz_h = ( h - 1 ) // pool_dim

            #str_h = w // pool_dim
            #str_w = h // pool_dim

            #pool = dnn.dnn_pool( input[bb[0],:,start_h:end_h+1,start_w:end_w+1], (sz_h,sz_w),                 (str_h,str_w), 'max', (0,0) ).flatten(2)
        # NOTE(review): `pool` is never assigned (its only assignment is
        # commented out above), and these two statements sit outside the loop,
        # so at most the last box would be written.
        pool_list.append(pool)
        output[idbb] = T.transpose(T.concatenate(
            pool_list, axis=1))  #not efficient but for the moment is ok!
        #if everything is correct this vector should be ordered as in fast RCNN
        return output
Exemple #4
0
 def process(self, input, tparams, BNparams):
     """Pool `input` once per pyramid level and concatenate the results.

     `input` must have a 4-element shape; its last two entries are the
     spatial extents that get divided by each (h, w) pair in
     ``self.pymamid``. `tparams` and `BNparams` are accepted but not used
     here.

     Returns the per-level pooled tensors, each reshaped to
     ``[batch, -1]``, joined along axis 1.
     """
     batch, _, in_h, in_w = input.shape

     def pool_level(bins_h, bins_w):
         # Window size rounds up, stride rounds down.
         window = (T.ceil(in_h / bins_h).astype('int32'),
                   T.ceil(in_w / bins_w).astype('int32'))
         step = (T.floor(in_h / bins_h).astype('int32'),
                 T.floor(in_w / bins_w).astype('int32'))
         pooled = dnn_pool(img=input, ws=window, mode=self.mode,
                           stride=step, pad=(0, 0))
         return pooled.reshape([batch, -1])

     levels = [pool_level(h, w) for h, w in self.pymamid]
     return T.concatenate(levels, axis=1)
Exemple #5
0
def pool_2d_nxn_regions(inputs, output_size, mode='max'):
    """
    Pool a 4D tensor into a fixed grid of output_size x output_size bins.
    Used by SpatialPyramidPoolingLayer. Refer to appendix A in [1]

    Parameters
    ----------
    inputs : a tensor with 4 dimensions (N x C x H x W)
    output_size: integer
        Number of bins along each of the two spatial axes
    mode : string
        Pooling mode, one of 'max', 'average_inc_pad', 'average_exc_pad'
        Defaults to 'max'. Both average modes reduce with the mean.

    Returns a list with output_size*output_size tensors, one per bin in
       row-major order; each is the reduction of its bin over axes 2 and 3
       with a trailing broadcastable axis appended (N x C x 1).

    Raises ValueError if `mode` is not one of the supported strings.

    References
    ----------
    .. [1] He, Kaiming et al (2015):
           Spatial Pyramid Pooling in Deep Convolutional Networks
           for Visual Recognition.
           http://arxiv.org/pdf/1406.4729.pdf.
    """
    # Validate first so that a bad mode fails before any graph is built.
    if mode not in ('max', 'average_inc_pad', 'average_exc_pad'):
        raise ValueError(
            "Mode must be either 'max', 'average_inc_pad' or "
            "'average_exc_pad'. Got '{0}'".format(mode))
    reduce_region = T.max if mode == 'max' else T.mean

    height, width = inputs.shape[2:]
    bins = float(output_size)

    def bin_bounds(index, extent):
        # Start rounds down and end rounds up, so neighbouring bins may
        # overlap when the extent is not a multiple of the bin count.
        lo = T.floor(index / bins * extent).astype('int32')
        hi = T.ceil((index + 1) / bins * extent).astype('int32')
        return lo, hi

    regions = []
    for row in range(output_size):
        top, bottom = bin_bounds(row, height)
        for col in range(output_size):
            left, right = bin_bounds(col, width)
            window = inputs[:, :, top:bottom, left:right]
            pooled = reduce_region(window, axis=(2, 3))
            regions.append(pooled.dimshuffle(0, 1, 'x'))
    return regions
Exemple #6
0
def pool_2d_nxn_regions(inputs, output_size, mode='max'):
    """
    Reduce every cell of an output_size x output_size grid laid over the
    two trailing axes of `inputs` to a single value.
    Used by SpatialPyramidPoolingLayer. Refer to appendix A in [1]

    Parameters
    ----------
    inputs : a tensor with 4 dimensions (N x C x H x W)
    output_size: integer
        The output size of the pooling operation
    mode : string
        Pooling mode, one of 'max', 'average_inc_pad', 'average_exc_pad'
        Defaults to 'max'.

    Returns a list of tensors, for each output bin.
       The list contains output_size*output_size elements (rows vary
       slowest), where each element is a 3D tensor (N x C x 1)

    References
    ----------
    .. [1] He, Kaiming et al (2015):
           Spatial Pyramid Pooling in Deep Convolutional Networks
           for Visual Recognition.
           http://arxiv.org/pdf/1406.4729.pdf.
    """
    averaging = mode in ['average_inc_pad', 'average_exc_pad']
    if mode != 'max' and not averaging:
        msg = ("Mode must be either 'max', 'average_inc_pad' or "
               "'average_exc_pad'. Got '{0}'")
        raise ValueError(msg.format(mode))
    # Both averaging modes map to a plain mean over the cell.
    pooling_op = T.max if mode == 'max' else T.mean

    h, w = inputs.shape[2:]
    n = float(output_size)

    def cell_edges(extent):
        # (start, end) index pair for every cell along one axis.
        return [(T.floor(i / n * extent).astype('int32'),
                 T.ceil((i + 1) / n * extent).astype('int32'))
                for i in range(output_size)]

    row_edges = cell_edges(h)
    col_edges = cell_edges(w)

    return [
        pooling_op(inputs[:, :, r0:r1, c0:c1],
                   axis=(2, 3)).dimshuffle(0, 1, 'x')
        for (r0, r1) in row_edges
        for (c0, c1) in col_edges
    ]
Exemple #7
0
def blockify(
        inp, block_size = (1, 1), step_size = (1, 1), direction = (1, 1),
        padding = False):
    """Count the blocks that tile the first two axes of `inp` (fragment).

    Only the block-count computation is visible in this excerpt; the body
    of the final loop is missing, and `direction` is not used in the
    visible part.
    """
    input_size = T.shape(inp)
    if padding:
        # Round up: a trailing partial block is counted as well.
        b0 = T.ceil((input_size[0] - block_size[0]) / step_size[0]) + 1
        b1 = T.ceil((input_size[1] - block_size[1]) / step_size[1]) + 1
    else:
        # Round down: only blocks that fit entirely are counted.
        b0 = T.floor((input_size[0] - block_size[0]) / step_size[0]) + 1
        b1 = T.floor((input_size[1] - block_size[1]) / step_size[1]) + 1
    num_blocks = b0 * b1

    # NOTE(review): `num_blocks` is built from T.shape(...) results; if T is
    # theano.tensor it is symbolic and range() presumably cannot iterate
    # over it -- TODO confirm how this loop was meant to run.
    for b in range(num_blocks):
Exemple #8
0
 def __theano_train__(self, n_size):
     """Build ``self.seq_train``, the compiled per-user training function.

     Model, as stated by the original author:
     Pr(l|u, C(l)) = Pr(l|u) * Pr(l|C(l))
     Pr(u, l, t) = Pr(l|u, C(l))     if C(l) exists,
                   Pr(l|u)           otherwise.
     $Theta$ = argmax Pr(u, l, t)

     The compiled function takes one user index, returns the value `upq`
     (negative mean log-probability over the valid steps) and applies the
     gradient updates collected in `seq_updates`.

     `n_size` is not referenced in the body; it only appears in the shape
     comments.
     """
     tra_mask = T.ivector()
     seq_length = T.sum(tra_mask)  # effective (valid) length
     wl = T.concatenate((self.wl, self.wl_m))
     tidx, cidx, bidx, userid = T.ivector(), T.imatrix(), T.itensor3(
     ), T.iscalar()
     pb = self.pb[bidx]  # (seq_length x 4 x depth x n_size)
     lrs = self.lrs[tidx]  # (seq_length x 4 x depth)
     # user preference
     xu = self.xu[userid]
     plu = softmax(T.dot(xu, self.wl.T))
     # geographical influence
     cl = T.sum(wl[cidx], axis=1)  # (seq_length x n_size)
     cl = cl.reshape((cl.shape[0], 1, 1, cl.shape[1]))
     # The ceil(abs(mean)) factor is 0 exactly where the mean of `cl` over
     # axis 3 is 0; presumably this switches off steps without context --
     # TODO confirm.
     br = sigmoid(T.sum(pb[:seq_length] * cl, axis=3) *
                  lrs[:seq_length]) * T.ceil(abs(T.mean(cl, axis=3)))
     path = T.prod(br, axis=2) * self.probs[tidx][:seq_length]
     # paths = T.prod((T.floor(1-path) + path), axis=1)
     paths = T.sum(path, axis=1)
     # For values in [0, 1] this maps 0 to 1 and leaves the rest unchanged,
     # so a zero path probability does not contribute to the log below.
     paths = T.floor(1 - paths) + paths
     # ----------------------------------------------------------------------------
     # cost, gradients, learning rate, l2 regularization
     lr, l2 = self.alpha_lambda[0], self.alpha_lambda[1]
     seq_l2_sq = T.sum([T.sum(par**2) for par in [xu, self.wl]])
     upq = -1 * T.sum(T.log(plu[tidx[:seq_length]] * paths)) / seq_length
     seq_costs = (upq + 0.5 * l2 * seq_l2_sq)
     seq_grads = T.grad(seq_costs, self.params)
     seq_updates = [(par, par - lr * gra)
                    for par, gra in zip(self.params, seq_grads)]
     # Parameters that were only indexed (one user row, the selected branch
     # rows) are updated through set_subtensor on the indexed part.
     pars_subs = [(self.xu, xu), (self.pb, pb)]
     seq_updates.extend([
         (par, T.set_subtensor(sub, sub - lr * T.grad(seq_costs, sub)))
         for par, sub in pars_subs
     ])
     # ----------------------------------------------------------------------------
     uidx = T.iscalar()  # T.iscalar() has type TensorType(int32, )
     # All symbolic inputs above are substituted via `givens` with slices of
     # data stored on `self`, selected by the single user index `uidx`.
     self.seq_train = theano.function(
         inputs=[uidx],
         outputs=upq,
         updates=seq_updates,
         givens={
             userid:
             uidx,
             tidx:
             self.tra_target_masks[uidx],
             cidx:
             self.tra_context_masks[T.arange(self.tra_accum_lens[uidx][0],
                                             self.tra_accum_lens[uidx][1])],
             bidx:
             self.routes[self.tra_target_masks[uidx]],
             tra_mask:
             self.tra_masks[uidx]
             # tra_mask_cot: self.tra_masks_cot[T.arange(self.tra_accum_lens[uidx][0], self.tra_accum_lens[uidx][1])]
         })
Exemple #9
0
 def compute_sub_all_scores(self, start_end):
     """Evaluate the item scores for the users selected by `start_end`.

     Combines the softmax user-preference term `plu` with the summed path
     term `paths` and returns the evaluated (``.eval()``) result reshaped
     to two dimensions, with the first two axes of the product merged.

     `start_end` is used both as an index into several arrays on `self`
     and with ``len()``; presumably a sequence of user indices -- TODO
     confirm against the caller.
     """
     # User preference; the last column of the softmax output is dropped.
     plu = softmax(
         T.dot(self.trained_users[start_end],
               self.trained_items.T))[:, :-1]  # (n_batch, n_item)
     # Longest sequence in the batch according to the test masks.
     length = T.max(T.sum(self.tes_masks[start_end], axis=1))  # 253
     # Context row indices: a 0..length-1 ramp offset by each user's start
     # position in the accumulated-length table.
     cidx = T.arange(length).reshape(
         (1, length)) + self.tra_accum_lens[start_end][:, 0].reshape(
             (len(start_end), 1))
     cl = T.sum(self.trained_items[self.tra_context_masks[cidx]],
                axis=2)  # n_batch x seq_length x n_size
     cl = cl.dimshuffle(1, 2, 0)
     pb = self.trained_branch[
         self.routes]  # (n_item x 4 x tree_depth x n_size)
     shp0, shp1, shp2 = self.lrs.shape
     # Two trailing axes of length 1 so `lrs` broadcasts against `pr_bc`.
     lrs = self.lrs.reshape((shp0, shp1, shp2, 1, 1))
     pr_bc = T.dot(pb, cl)
     # The ceil(abs(.)) factor is 0 exactly where `pr_bc` is 0.
     br = sigmoid(pr_bc * lrs) * T.ceil(
         abs(pr_bc))  # (n_item x 4 x tree_depth x seq_length x n_batch)
     path = T.prod(br, axis=2) * self.probs.reshape((shp0, shp1, 1, 1))
     del cl, pb, br, lrs
     # paths = T.prod((T.floor(1 - path) + path), axis=1)  # (n_item x seq_length x n_batch)
     paths = T.sum(path, axis=1)
     # For values in [0, 1] this maps 0 to 1 and leaves the rest unchanged.
     paths = T.floor(1 - paths) + paths
     # The last entry along the first axis of `paths` is dropped, matching
     # the column dropped from `plu` above.
     p = paths[:-1].T * plu.reshape(
         (plu.shape[0], 1, plu.shape[1]))  # (n_batch x n_item)
     # p = plu.reshape((plu.shape[0], 1, plu.shape[1])) * T.ones((plu.shape[0], length, plu.shape[1]))
     return T.reshape(p, (p.shape[0] * p.shape[1], p.shape[2])).eval()
Exemple #10
0
    def compute_hard_windows(self, image_shape, location, scale):
        """Return integer window corners (a, b) around each patch.

        `a` is the front (top-left) corner and `b` the back (bottom-right)
        corner. Each window is centred on `location`, spans the patch shape
        divided by `scale`, and is widened on both sides by
        ``self.kernel.k_sigma_radius(self.cutoff, scale)``. Corners are
        clipped to the image, then cast to int16 (floor for `a`, ceil for
        `b`).
        """
        # Half of the patch extent after dividing by the scale.
        half_extent = 0.5 * (
            T.cast(self.patch_shape, theano.config.floatX) / scale)
        front = location - half_extent
        back = location + half_extent

        # Widen the window by the kernel radius on each side.
        front = front - self.kernel.k_sigma_radius(self.cutoff, scale)
        back = back + self.kernel.k_sigma_radius(self.cutoff, scale)

        # Keep the window inside the image and at least one pixel wide.
        front = T.clip(front, 0, image_shape - 1)
        back = T.clip(back, front + 1, image_shape)

        if self.batched_window:
            # Use one bounding box for the whole batch: every sample then
            # gets slices of equal length (so scan can be avoided), at the
            # price of typically selecting more of the input.
            front = front.min(axis=0, keepdims=True)
            back = back.max(axis=0, keepdims=True)

        # Round outwards and convert to integers.
        return T.cast(T.floor(front), 'int16'), T.cast(T.ceil(back), 'int16')
Exemple #11
0
    def get_stencil(self, t, r=None, texp=None):
        """Return the sorted contact times covering the span of `t`.

        If `r` or `texp` is None, only ``tt.shape_padright(t)`` is returned.
        Otherwise the result is a tuple ``(ts, flag)``: `ts` holds the four
        contact times repeated for every period between the first and last
        time in `t`, sorted; `flag` is a zero tensor for circular orbits and
        the sum of the third outputs of ``contact_points_op`` otherwise.
        `texp` itself is only checked against None.
        """
        if r is None or texp is None:
            return tt.shape_padright(t)

        zero = tt.zeros_like(self.a)
        radius = tt.as_tensor_variable(r)
        star = self.r_star + zero
        half_period = 0.5 * self.period

        if self.ecc is None:
            # Equation 14 from Winn (2010)
            ratio = radius / self.r_star
            b_sq = tt.square(self.b)
            scale = star / (self.a * self.sin_incl)
            outer = half_period * tt.arcsin(
                scale * tt.sqrt(tt.square(1 + ratio) - b_sq)) / np.pi
            inner = half_period * tt.arcsin(
                scale * tt.sqrt(tt.square(1 - ratio) - b_sq)) / np.pi
            contact_times = [-outer, -inner, inner, outer]
            flag = zero
        else:
            cos_incl = self.cos_incl + zero
            sin_incl = self.sin_incl + zero
            outer = self.contact_points_op(
                self.a, self.ecc, self.cos_omega, self.sin_omega,
                cos_incl, sin_incl, star + radius)
            inner = self.contact_points_op(
                self.a, self.ecc, self.cos_omega, self.sin_omega,
                cos_incl, sin_incl, star - radius)

            flag = outer[2] + inner[2]

            def to_time(mean_anomaly):
                # Convert to a time offset wrapped into [-P/2, P/2).
                offset = (mean_anomaly - self.M0) / self.n + half_period
                return tt.mod(offset, self.period) - half_period

            contact_times = [
                to_time(m) for m in (outer[0], inner[0], inner[1], outer[1])
            ]

        # Whole-period grid spanning the observed times, anchored at t0.
        first = self.period * tt.floor((tt.min(t) - self.t0) / self.period)
        last = self.period * (
            tt.ceil((tt.max(t) - self.t0) / self.period) + 1)
        first = first + self.t0
        last = last + self.t0

        if zero.ndim < 1:
            grids = [
                contact + tt.arange(first, last, self.period)
                for contact in contact_times
            ]
        else:
            # One arange per element, hence the scan.
            grids = [
                theano.scan(
                    fn=lambda t0, s0, e0, p0: t0 + tt.arange(s0, e0, p0),
                    sequences=[contact, first, last, self.period],
                )[0].flatten()
                for contact in contact_times
            ]

        return tt.sort(tt.concatenate(grids)), flag
Exemple #12
0
    def compute_hard_windows(self, image_shape, location, scale):
        """Compute the integer bounding corners of every patch window.

        Returns ``(a, b)`` with `a` the lower (front) and `b` the upper
        (back) corner, both cast to int16. The window is the scaled patch
        around `location`, enlarged by the kernel radius obtained from
        ``self.kernel.k_sigma_radius`` and clipped to `image_shape`.
        """
        patch = T.cast(self.patch_shape, theano.config.floatX)

        # Corners of the scaled patch, pushed outwards by the kernel radius.
        a = (location - 0.5 * (patch / scale)
             - self.kernel.k_sigma_radius(self.cutoff, scale))
        b = (location + 0.5 * (patch / scale)
             + self.kernel.k_sigma_radius(self.cutoff, scale))

        # Clip so the window lies inside the image and is never empty.
        a = T.clip(a, 0, image_shape - 1)
        b = T.clip(b, a + 1, image_shape)

        # With batched windows a single bounding box is shared by all
        # samples, giving equal-length slices (no scan needed) while
        # usually covering more of the input than strictly necessary.
        if self.batched_window:
            a, b = (a.min(axis=0, keepdims=True),
                    b.max(axis=0, keepdims=True))

        # Integer corners: lower corner rounds down, upper corner rounds up.
        a = T.cast(T.floor(a), 'int16')
        b = T.cast(T.ceil(b), 'int16')
        return a, b
Exemple #13
0
	def encode(self, state_below):
		"""
		Encode the whole sequence coming from the layer below.

		The sequence is zero-padded at the front so that its length becomes a
		multiple of self.encoding_length, projected with Wxh/bxh, and then
		scanned with self.encode_step; the last scan output is returned.

		Side effects: sets self.n_encodings and self.n_padding_timesteps.

		:development:
			(1) may need to prepend encoding_length * padding array to the state_below to produce the same length sequence as state_below
			(2) can return an offset encoding by only returning certain indices of the encoding (though this is pretty wasteful)

		:type state_below: 2d tensor
		:param state_below: the entire sequence of states from the layer below the current one

		:type rval: 2d tensor
		:param rval: encoding of state_below with shape (n_encodings, n_hid), to be passed to the layer above
		"""
		float_zero = np.cast[theano.config.floatX](0)

		# number of encodings needed to cover the sequence, and the padding
		# required to fill the last one
		n_steps = T.cast(state_below.shape[0], theano.config.floatX)
		self.n_encodings = T.cast(T.ceil(n_steps / self.encoding_length), 'int32')
		self.n_padding_timesteps = T.cast(
			self.n_encodings * self.encoding_length - n_steps, 'int32')

		# prepend the zero padding
		padding = T.alloc(float_zero, self.n_padding_timesteps, self.n_vis)
		padded = T.concatenate((padding, state_below))

		# NOTE(review): the padded sequence is reshaped directly to
		# (encoding_length, n_encodings, n_vis); verify that this is the
		# intended time ordering for the scan below.
		chunks = padded.reshape((self.encoding_length, self.n_encodings, self.n_vis))
		projected = T.dot(chunks, self.Wxh) + self.bxh

		# initial state: n_encodings rows with n_hid features each
		initial_encoding = T.alloc(float_zero, self.n_encodings, self.n_hid)

		encodings, _ = scan(
			fn=self.encode_step,
			sequences=[projected],
			outputs_info=[initial_encoding],
			non_sequences=[self.Whhe])
		# encodings is 3d (encoding_length, n_encodings, n_hid); keep the
		# final step only
		return encodings[-1]
Exemple #14
0
	def get_pseudo_likelihood_cost(self, updates):
		"""Stochastic approximation to the pseudo-likelihood.

		A new shared counter selects one column (bit) of the rounded input;
		the cost compares the free energy before and after flipping that
		column. The counter's increment (modulo self.n_visible) is stored in
		the `updates` mapping, which is modified in place.
		"""
		# which bit is flipped on this call
		bit_i_idx = theano.shared(value=0, name='bit_i_idx')

		# binarized input and its free energy
		binarized = T.round(self.input)
		energy = self.free_energy(binarized, self.scaling)

		# flip the selected column without touching `binarized` itself:
		# 1 - ceil(x / (x + 1)) turns 0 into 1 and 1 into 0
		column = binarized[:, bit_i_idx]
		flipped_column = 1 - T.ceil(column / (column + 1))
		flipped = T.set_subtensor(column, flipped_column)

		# free energy of the flipped configuration
		energy_flipped = self.free_energy(flipped, self.scaling)

		# log sigmoid of the energy difference, scaled by the number of bits
		log_prob = T.log(T.nnet.sigmoid(energy_flipped - energy))
		cost = T.mean(self.n_visible * log_prob)

		# advance to the next bit, wrapping around
		updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

		return cost
Exemple #15
0
    def k_areas_maxpooling(input,k):
        # Max-pool the last two axes of `input` with a square filter whose
        # side `f` is derived from the area of those axes divided by `k`.
        # NOTE(review): several statements below look like they cannot work
        # on symbolic tensors as written; see the individual notes.

        #dynamic filter size
        # NOTE(review): if `input` is a symbolic tensor, the argument of
        # int() is a symbolic expression -- TODO confirm this conversion.
        f=int(T.ceil(T.sqrt((input.shape[-1]*input.shape[-2])/float(k))))

        #how many zero rows have to be inserted at the end so that the filter size fits exactly into the number of rows.
        # NOTE(review): this is the remainder of the last axis modulo f, not
        # the amount needed to reach the next multiple of f, and it is taken
        # from axis -1 while the insertion below is along axis 2 -- verify.
        rows_to_insert=input.shape[-1]%f

        #how many zero columns have to be inserted at the end so that the filter size fits exactly into the number of columns.
        columns_to_insert=input.shape[-2]%f

        #insert rows
        output=T.insert(input, input.shape[2]*T.ones(1).repeat(rows_to_insert), 0, axis=2)

        #insert columns
        output=T.insert(output, output.shape[3]*T.ones(1).repeat(columns_to_insert), 0, axis=3)

        output_shape=output.shape
        #take max out of every f (filter size) rows
        # NOTE(review): reshape is called with the target dimensions as
        # separate positional arguments rather than one shape tuple -- check
        # against the reshape signature of the tensor type in use.
        output=T.transpose(output, (0,1,3,2)).reshape(output_shape[0],output_shape[1],-1,f).max(3).reshape(output_shape[0],output_shape[1],output_shape[-1],-1).transpose((0,1,3,2))

        #take the max out of every f(filter size) columns
        output=output.reshape(output.shape[0],output.shape[1],-1,f).max(3).reshape(output_shape[0],output_shape[1],output_shape[2]/f,-1)

        return output
Exemple #16
0
 def get_k(self, input_shape):
     """Return k for this layer as an int32 value.

     k is the larger of ``self.ktop`` and
     ``ceil((nroflayers - layernr) / nroflayers * input_shape[3])``.
     """
     remaining = (self.nroflayers - self.layernr) / float(self.nroflayers)
     scaled_k = T.ceil(remaining * input_shape[3])
     largest = T.max([self.ktop, scaled_k])
     return T.cast(largest, 'int32')
    def __init__(self, input_ngram, input_sm, vocab_size, emb_dim, num_section, linear_W_emb=None, fix_emb=False, nonlinear=None, activation=None):
        """Build the embedding layer output from n-gram and sentence inputs.

        The output concatenates, along axis 1:
          * the embeddings of every column of `input_ngram` except the last
            (the "local" context), and
          * for the sentence referenced by the last column of `input_ngram`,
            the per-section mean embeddings of that sentence (the "global"
            context), optionally passed through `addNonlinearLayer` when both
            `nonlinear` and `activation` are given.

        `linear_W_emb` supplies the initial embedding matrix; when it is None
        the matrix is drawn uniformly from [-init_range, init_range] using
        the module-level `rng`. With `fix_emb` set, the embedding matrix is
        left out of `self.params`.
        """
        if linear_W_emb is not None:
            # use the given model parameter, after checking its shape
            given_vocab_size, given_emb_dim = linear_W_emb.shape
            assert(given_vocab_size == vocab_size and given_emb_dim == emb_dim)
        else:
            # random initialization
            linear_W_emb = np.asarray(
                rng.uniform(low=-init_range, high=init_range,
                            size=(vocab_size, emb_dim)),
                dtype=theano.config.floatX)

        # shared variables
        self.W_emb = theano.shared(value=linear_W_emb, name='W_emb')

        ngram = T.cast(input_ngram, 'int32')
        sm = T.cast(input_sm, 'int32')

        # Local context: one row per n-gram, holding the embeddings of all
        # but its last column laid side by side.
        context_ids = ngram[:, :-1].flatten()
        output_local = self.W_emb[context_ids].reshape(
            (ngram.shape[0], emb_dim * (ngram.shape[1] - 1)))

        # Column 0 of the sentence input is skipped; the remaining columns
        # are the word ids of each sentence.
        sentence_matrix = sm[:, 1:]
        sentence_num = sentence_matrix.shape[0]
        global_length = sentence_matrix.shape[1]
        section_length = T.cast(
            T.ceil(global_length / float(num_section)), 'int32')

        def section_mean(start, stop):
            # Mean embedding of the words in columns [start, stop).
            word_ids = sentence_matrix[:, start:stop].flatten()
            vectors = self.W_emb[word_ids].reshape(
                (sentence_num, section_length, emb_dim))
            return T.mean(vectors, axis=1)

        # First section, then append the remaining sections one by one.
        sentence_embeddings = section_mean(None, section_length)
        for i in xrange(1, num_section):
            sentence_embeddings = T.concatenate(
                [sentence_embeddings,
                 section_mean(i * section_length, (i + 1) * section_length)],
                axis=1)

        # Sentence index of each n-gram, made relative to the first row.
        sentence_indeces = ngram[:, -1]
        sentence_indeces = sentence_indeces - sentence_indeces[0]

        # Pick the sentence vector that belongs to every n-gram.
        output_global = sentence_embeddings[sentence_indeces.flatten()].reshape(
            (sentence_indeces.shape[0], emb_dim * num_section))

        # Optional non-linear layer on top of the global part.
        if nonlinear is not None and activation is not None:
            self.non_linear_params, global_part = addNonlinearLayer(
                output_global, emb_dim * num_section, nonlinear, activation)
            extra_params = self.non_linear_params
        else:
            global_part = output_global
            extra_params = []

        self.output = T.concatenate([output_local, global_part], axis=1)
        # The word embedding matrix is trainable unless fix_emb is set.
        self.params = extra_params if fix_emb else [self.W_emb] + extra_params
Exemple #18
0
def spp_max_pool_axis_kwargs(in_shape, out_shape):
    """Return max-pool keyword values (ds, st, padding) for one axis.

    The pool size is ``ceil(in_shape / out_shape)``, the stride equals the
    pool size, and the padding is ``ceil((pool_size * out_shape -
    in_shape) / 2)``. Symbolic shapes are rejected by an assertion.
    """
    is_symbolic = any(treeano.utils.is_variable(shape)
                      for shape in (in_shape, out_shape))
    # maxpool requires static shape
    assert not is_symbolic

    def int_ceil(x):
        # The symbolic variant is only reachable when asserts are disabled.
        if is_symbolic:
            return T.ceil(x).astype("int32")
        return int(np.ceil(x))

    # pool size is the rounded-up ratio of input to output size
    pool_size = int_ceil(in_shape / out_shape)
    # pad as much as possible, since ignore_border=True
    padding = int_ceil((pool_size * out_shape - in_shape) / 2)

    if not is_symbolic:
        assert padding < pool_size

    # stride equals pool_size, since we want non-overlapping regions
    return {"ds": pool_size, "st": pool_size, "padding": padding}
Exemple #19
0
    def get_output_for(self, input, **kwargs):
        """Blend three shifted copies of the input with exponential weights.

        The input is viewed as (batch, self.x_len). Columns are gathered at
        indices derived from the floor and ceil of ``self.p`` (modulo
        x_len); each gathered copy is weighted by ``exp(-k * distance)`` and
        the weighted sum is normalized by the sum of the weights. The result
        is reshaped back to the shape of `input`.
        """
        shift = self.p
        sharpness = self.k
        length = self.x_len
        flat = input.reshape((input.shape[0], length))

        lower = T.floor(shift)
        upper = T.ceil(shift)
        frac = shift - lower

        def weight(distance):
            return T.exp(sharpness * -distance)

        # weights for distances frac, 1 - frac and 1 + frac
        w_here = weight(frac)
        w_next = weight(1 - frac)
        w_prev = weight(1 + frac)
        w_total = w_here + w_next + w_prev

        lower_idx = T.cast(lower, 'int32')
        here = flat[:, lower_idx % length]
        # NOTE(review): this index is ceil(p) + 1, which for non-integer p
        # is floor(p) + 2, while its weight uses the distance 1 - frac;
        # confirm whether floor(p) + 1 was intended.
        following = flat[:, (T.cast(upper, 'int32') + 1) % length]
        preceding = flat[:, (lower_idx - 1) % length]

        blended = (w_here * here + w_next * following
                   + w_prev * preceding) / w_total
        return blended.reshape(input.shape)
Exemple #20
0
    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood.

        One bit (column) of the rounded input is flipped per call; the
        returned cost is the mean of ``n_visible * log(sigmoid(FE(flipped)
        - FE(original)))``. `updates` is modified in place so that the bit
        index advances by one, modulo ``self.n_visible``.
        """
        n_bits = self.n_visible

        # Index of the bit to flip, kept in a shared variable.
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # Round the input to the nearest integer to binarize it.
        rounded = T.round(self.input)
        free_energy = self.free_energy(rounded, self.scaling)

        # Build a copy of `rounded` whose selected column is flipped;
        # 1 - ceil(x / (x + 1)) maps 0 -> 1 and 1 -> 0.
        selected = rounded[:, bit_i_idx]
        toggled = 1 - T.ceil(selected / (selected + 1))
        free_energy_flipped = self.free_energy(
            T.set_subtensor(selected, toggled), self.scaling)

        # Schedule the move to the next bit.
        updates[bit_i_idx] = (bit_i_idx + 1) % n_bits

        log_ratio = T.log(T.nnet.sigmoid(free_energy_flipped - free_energy))
        return T.mean(n_bits * log_ratio)
Exemple #21
0
def spp_max_pool_axis_kwargs(in_shape, out_shape):
    """Derive max-pool arguments for one axis of spatial pyramid pooling.

    Given the input extent and the desired output extent along an axis,
    returns a dict with the pool size (``ds``), the stride (``st``) and
    the padding (``padding``) for that axis.
    """
    is_symbolic = (treeano.utils.is_variable(in_shape)
                   or treeano.utils.is_variable(out_shape))
    # maxpool requires static shape
    assert not is_symbolic

    def ceil_to_int(value):
        # Symbolic values are rounded up inside the graph, plain numbers
        # with numpy.
        if is_symbolic:
            return T.ceil(value).astype("int32")
        return int(np.ceil(value))

    # eg. if input is 5 and output is 2, each pool size should be 3
    window = ceil_to_int(in_shape / out_shape)
    # pad as much as possible, since ignore_border=True
    pad = ceil_to_int((window * out_shape - in_shape) / 2)

    if not is_symbolic:
        assert pad < window

    # The stride equals the pool size so that the regions do not overlap.
    return {
        "ds": window,
        "st": window,
        "padding": pad,
    }
Exemple #22
0
    def get_output_for( self, inputs ,**kwargs ):
        """Unfinished draft of an ROI max-pooling forward pass.

        ``inputs[0]`` is read as the feature map (four shape entries are
        taken from it) and ``inputs[1]`` as the boxes.

        NOTE(review): this method cannot run as written; the individual
        problems are flagged inline below. It looks like work in progress
        and needs to be completed before use.
        """
        # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
        input = inputs[0]
        boxes = inputs[1]
        batch = T.shape (input)[0]
        channels = T.shape (input)[1]
        height = T.shape( input )[2]
        width = T.shape( input )[3]
        num_boxes = T.shape(boxes)[0]
        output = T.zeros((batch * num_boxes , channels, self.num_features))

        # NOTE(review): num_boxes is a symbolic shape entry, so range() will
        # presumably reject it; also bb is an integer coming from range(),
        # not a row of `boxes`, so bb[0] .. bb[4] below cannot work.
        for idbb,bb in enumerate(range(num_boxes)):
            batch_ind = bb[0]

            pool_list = []
            #for pool_dim in self.pool_dims:
            # Scale the box corners and clip them to the feature-map extent.
            # NOTE(review): `heigth` is not defined in this method (the local
            # is spelled `height`).
            start_w = T.clip(T.floor(bb[1] * self.sp_scale),0,width)
            start_h = T.clip(T.floor(bb[2] * self.sp_scale),0,heigth)
            end_w = T.clip(T.ceil(bb[3] * self.sp_scale),0,width)
            end_h = T.clip(T.ceil(bb[4] * self.sp_scale),0,height)

            # NOTE(review): the second positional argument of T.max is
            # presumably an axis, not a lower bound, and `T.amx` looks like a
            # typo - an element-wise maximum was probably intended; confirm.
            w = T.max(end_w - start_w +1,1)
            h = T.amx(end_h - start_h +1,1)

            # NOTE(review): `_meshgrid` and `pool_dims` are not defined in
            # this method; the unpacked names are not used afterwards.
            start_samples_y,start_sample_x = T.floor(_meshgrid(start_h,end_h,pool_dims+1,start_w,end_w,pool_dims+1))
            end_samples_y,end_sample_x = T.ceil(_meshgrid(start_h,end_h,pool_dims+1,start_w,end_w,pool_dims+1))

            # NOTE(review): bare expression whose result is discarded; `py`,
            # `px`, `samples_y`, `samples_x`, `idy` and `idx` are undefined here.
            input[batch_ind,:,np.floor(py):np.ceil(samples_y[idy+1]),np.floor(px):np.ceil(samples_x[idx+1])]

            #T.max()

            #for idx,px in enumerate(samples_x[:-1]):
            #    for idy,py in enumerate(samples_y[:-1]):

             #       (pool.dnn_pool( input[batch_ind,:,np.floor(py):np.ceil(samples_y[idy+1]),np.floor(px):np.ceil(samples_x[idx+1])],(0,0),(None,None),'max', (0,0) )).flatten(2)

                #sz_w = ( w - 1 ) // pool_dim
                #sz_h = ( h - 1 ) // pool_dim

                #str_h = w // pool_dim
                #str_w = h // pool_dim

                #pool = dnn.dnn_pool( input[bb[0],:,start_h:end_h+1,start_w:end_w+1], (sz_h,sz_w),                 (str_h,str_w), 'max', (0,0) ).flatten(2)
        # NOTE(review): these two statements sit outside the loop, `pool` is
        # only assigned in commented-out code, and item assignment on the
        # result of T.zeros is presumably unsupported - confirm.
        pool_list.append( pool )
        output[idbb] = T.transpose(T.concatenate( pool_list, axis=1 )) #not efficient but for the moment is ok!
        #if everything is correct this vector should be ordered as in fast RCNN
        return output
Exemple #23
0
 def get_output_shape_for(self, input_shape):
     """Return the output shape, with axis 1 replaced by the pooling size k.

     k is the larger of ``self.ktop`` and the ceiling of the remaining
     layer fraction times ``self.inputdim``, cast to int32.
     """
     # Fraction of layers still to come, counted from the current layer.
     remaining_fraction = ((self.numLayers - self.currlayer) /
                           float(self.numLayers))
     dynamic_k = T.ceil(remaining_fraction * self.inputdim)
     get_k = K.cast(K.max([self.ktop, dynamic_k]), 'int32')
     first_dim = input_shape[0]
     last_dim = input_shape[2]
     return (first_dim, get_k, last_dim)
Exemple #24
0
    def _build_expression(self, input_expression=None):
        """Build the symbolic caffe-style pooling expression.

        The result is stored in ``self.expression_``; the symbolic input is
        stored in ``self.input_`` (a fresh 4D tensor when no
        ``input_expression`` is given).

        :raises NotImplementedError: if ``self.pool_type`` is neither
            ``'max'`` nor ``'avg'``.
        """
        if self.pool_type not in ['max', 'avg']:
            raise NotImplementedError(
                'Pooling only implemented for max and avg')

        self.input_ = (T.tensor4(dtype=self.input_dtype)
                       if input_expression is None
                       else input_expression)

        # Caffe-style pooling: zero-pad first (unless the padding is zero),
        # then pool with strides.
        if self.padding in [0, (0, 0)]:
            padded_input = self.input_
        else:
            zero_padder = ZeroPad(padding=self.padding)
            zero_padder._build_expression(self.input_)
            padded_input = zero_padder.expression_

        if self.pool_type == 'max':
            pooled = fancy_max_pool(padded_input,
                                    self.pool_shape, self.pool_stride,
                                    ignore_border=False)
        elif self.pool_type == 'avg':
            # Average pooling is expressed as a strided convolution with a
            # constant kernel; self.pool_shape must be a tuple for the
            # concatenation below.
            kernel_shape = (1, 1) + self.pool_shape
            uniform_kernel = (T.ones(kernel_shape, dtype=self.input_.dtype)
                              / np.prod(self.pool_shape))
            avg_kernel = T.cast(uniform_kernel, self.input_.dtype)

            n_imgs, n_channels = self.input_.shape[0], self.input_.shape[1]
            # Fold the channels into the batch axis so each channel is
            # convolved on its own.
            stacked_channels = padded_input.reshape(
                (n_imgs * n_channels, 1,
                 padded_input.shape[2], padded_input.shape[3]))
            conv_output = T.nnet.conv2d(stacked_channels, avg_kernel,
                                        subsample=self.pool_stride)
            pooled = conv_output.reshape(
                (n_imgs, n_channels,
                 conv_output.shape[2], conv_output.shape[3]))

        # A caffe quirk: the output extent is
        #   ceil((size + 2 * pad - kernel) / stride) + 1
        # rather than the floor variant, so an extra row/column may be
        # needed. The pooled result is therefore cropped to the shape
        # computed here; the shape is dynamic and so is built symbolically.
        pad = T.constant(self.padding)
        pool_shape = T.constant(self.pool_shape)
        pool_stride = T.constant(self.pool_stride, dtype='float64')

        spatial_shape = self.input_.shape[2:4]
        float_shape = (spatial_shape + 2 * pad - pool_shape) / pool_stride + 1
        output_shape = T.cast(T.ceil(float_shape), dtype='int64')

        out_rows, out_cols = output_shape[0], output_shape[1]
        self.expression_ = pooled[:, :, 0:out_rows, 0:out_cols]
Exemple #25
0
    def _build_expression(self, input_expression=None):
        """Build the symbolic pooling expression and store it on the instance.

        Sets ``self.input_`` (a new 4D tensor when ``input_expression`` is
        None, otherwise the given expression) and ``self.expression_`` (the
        pooled output cropped to the caffe-style output shape).

        Raises NotImplementedError when ``self.pool_type`` is not ``'max'``
        or ``'avg'``.
        """
        if self.pool_type not in ['max', 'avg']:
            raise NotImplementedError(
                'Pooling only implemented for max and avg')

        if input_expression is None:
            self.input_ = T.tensor4(dtype=self.input_dtype)
        else:
            self.input_ = input_expression

        # Replicating caffe style pooling means zero padding followed by
        # strided pooling. Zero padding is skipped when padding is 0/(0, 0).
        if self.padding in [0, (0, 0)]:
            padded_input = self.input_
        else:
            zero_padder = ZeroPad(padding=self.padding)
            zero_padder._build_expression(self.input_)
            padded_input = zero_padder.expression_
        if self.pool_type == 'max':
            # ignore_border=False here; the result is cropped further below.
            pooled = fancy_max_pool(padded_input,
                                    self.pool_shape, self.pool_stride,
                                    ignore_border=False)
        elif self.pool_type == 'avg':
            # Average pooling is built as a strided convolution with a
            # constant kernel whose entries are 1 / prod(pool_shape).
            # self.pool_shape needs to be a tuple
            avg_kernel = T.cast(T.ones((1, 1) + self.pool_shape,
                                dtype=self.input_.dtype
                                ) / np.prod(self.pool_shape),
                                self.input_.dtype)
            n_imgs = self.input_.shape[0]
            n_channels = self.input_.shape[1]
            # Channels are folded into the first axis for the convolution
            # and unfolded again afterwards.
            conv_output = T.nnet.conv2d(
                padded_input.reshape((n_imgs * n_channels, 1,
                                      padded_input.shape[2],
                                      padded_input.shape[3])),
                avg_kernel, subsample=self.pool_stride)
            pooled = conv_output.reshape((n_imgs, n_channels,
                                         conv_output.shape[2],
                                         conv_output.shape[3]))

        # A caffe quirk: The output shape is (for width, analogous for h:)
        # ceil((w + 2 * pad_w - kernel_w) / stride_w) + 1, instead of floor
        # With floor, ignore_border=True would have yielded the exact result
        # With ceil, sometimes we need an extra column and/or line. So we do
        # ignore_border=False and then crop to the right shape. Since the
        # shape is dynamic we need to first calculate it:

        # padding has to be a tuple too
        pad = T.constant(self.padding)
        # pad = T.constant(zero_padder.padding_)
        # supposing here that self.pool_shape is a tuple. Should check
        pool_shape = T.constant(self.pool_shape)
        # stride hopefully a tuple, too; float64 makes the division below a
        # floating-point division
        pool_stride = T.constant(self.pool_stride, dtype='float64')
        float_shape = (self.input_.shape[2:4] + 2 * pad
                       - pool_shape) / pool_stride + 1
        output_shape = T.cast(T.ceil(float_shape), dtype='int64')
        # Crop the last two axes to the computed output shape.
        self.expression_ = pooled[:, :, 0:output_shape[0],
                                        0:output_shape[1]]
    def compileActivation(self, net, layerNum):
        """Build this layer's symbolic activation and append it to ``net.varArrayA``.

        The layer input is ``net.x`` for layer 0, otherwise the previous
        entry of ``net.varArrayA``. The input is reshaped to 4D, convolved
        with ``net.varWeights[layerNum]['w']`` (optionally masked by
        ``net.dropOutVectors[layerNum]``), biased, optionally max-pooled,
        flattened to 2D, transposed and passed through ``self.activation``.
        Nothing is returned.
        """
        variable = net.x if layerNum == 0 else net.varArrayA[layerNum - 1]

        #Calc shapes for reshape function on-the-fly. Assume we have square images as input.
        sX = T.cast(T.sqrt(T.shape(variable)[0] / self.kernel_shape[1]), 'int16')

        #Converts input from 2 to 4 dimensions
        # NOTE(review): presumably `variable` is (features, samples), since
        # its transpose is reshaped with shape[1] as the leading axis - confirm.
        Xr = T.reshape(variable.T, (T.shape(variable)[1], self.kernel_shape[1], sX, sX))

        if self.optimized:
            # Expected spatial output size; used below to crop the result of
            # the convolution op.
            out_size = T.cast(
                T.ceil((T.shape(Xr)[-1] - T.shape(net.varWeights[layerNum]['w'])[-1] + 1) / np.float32(self.stride)),
                'int32')

            conv_op = FilterActs(stride=self.stride)
            input_shuffled = Xr.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
            filters_shuffled = net.varWeights[layerNum]['w'].dimshuffle(1, 2, 3, 0)  # bc01 to c01b
            filters_flipped = filters_shuffled[:, ::-1, ::-1, :] # flip rows and columns
            contiguous_input = gpu_contiguous(input_shuffled)
            # The dropout mask (if enabled) is broadcast over the first and
            # last filter axes.
            contiguous_filters = gpu_contiguous(filters_flipped *
                                                (net.dropOutVectors[layerNum].dimshuffle('x', 0, 1, 'x') if self.dropout else 1.0))
            a = conv_op(contiguous_input, contiguous_filters)
            a = a[:, :out_size, :out_size, :]
            #Add bias
            a = a + net.varWeights[layerNum]['b'].dimshuffle(0, 'x', 'x', 'x')
        else:
            a = T.nnet.conv2d(Xr, net.varWeights[layerNum]['w'] *
                              (net.dropOutVectors[layerNum].dimshuffle('x', 'x', 0, 1) if self.dropout else 1.0),
                              border_mode='valid',
                              subsample=(self.stride, self.stride))
            #Add bias
            a = a + net.varWeights[layerNum]['b'].dimshuffle('x', 0, 'x', 'x')

        if self.pooling:
            if self.optimized:
                #Pooling
                # ds - side of square pool window
                # stride - Defines the stride size between successive pooling squares.
                # Setting this parameter smaller than sizeX produces overlapping pools.
                # Setting it equal to sizeX gives the usual, non-overlapping pools. Values greater than sizeX are not allowed.
                pool_op = MaxPool(ds=self.pooling_shape, stride=self.pooling_shape)

                contiguous_input = gpu_contiguous(a)
                a = pool_op(contiguous_input)
                a = a.dimshuffle(3, 0, 1, 2)       # c01b to bc01
            else:
                #a = downsample.max_pool_2d(a, (self.pooling_shape, self.pooling_shape), ignore_border=False)
                # NOTE(review): `pool.max_pool2D` is not defined in this
                # block; verify that the imported `pool` module provides it.
                a = pool.max_pool2D(a, (self.pooling_shape, self.pooling_shape), ignore_border=False)
        else:
            if self.optimized:
                a = a.dimshuffle(3, 0, 1, 2)       # c01b to bc01

        # Back to 2D, transposed to match the layout of the layer input.
        a = T.flatten(a, outdim=2).T

        # Apply the layer's activation function (called with self.pool_size).
        a = self.activation(a, self.pool_size)

        net.varArrayA.append(a)
Exemple #27
0
    def _ppf(self, p):
        """
        Evaluate the percentile point function (inverse of the cumulative
        distribution function) of the discrete Weibull distribution at p,
        returning the result cast to int64.
        """
        # Ratio of logarithms: log(1 - p) / log(q).
        log_ratio = tt.log(1 - p) / tt.log(self.q)
        # Raise to the power 1 / beta, round up and shift down by one.
        quantile = tt.ceil(tt.power(log_ratio, 1. / self.beta)) - 1
        return quantile.astype('int64')
Exemple #28
0
def gaussian_kernel_default_radius(sigma, window_radius=None):
    """Return the radius to use for a Gaussian kernel window.

    :param sigma: standard deviation(s) of the kernel; either a plain
        Python number or a symbolic/array value.
    :param window_radius: explicit radius; when given (not None) it is
        returned unchanged.
    :return: ``window_radius`` if given; otherwise ``ceil(3 * sigma)`` as a
        Python ``int`` for a plain number, or as a symbolic int32 scalar
        (the maximum over all entries of ``sigma``) for anything else.
    """
    if window_radius is not None:
        return window_radius

    if isinstance(sigma, (int, float)):
        # A plain number needs no symbolic graph: compute the ceiling
        # directly instead of building and evaluating one.
        import math
        return int(math.ceil(3 * sigma))

    return T.cast(T.max(T.ceil(3 * sigma)), 'int32')
Exemple #29
0
 def get_hidden_values(self, input, batch_size):
     """Recompute the interpolated weights and the layer output.

     For every (fractional) entry of ``self.indices`` the weights at the
     floor and ceil positions of ``self.W`` are linearly interpolated. The
     result is stored in ``self.factors`` and used to compute
     ``self.output``. ``input`` and ``batch_size`` are not read here.
     """
     self.indices_high = T.ceil(self.indices).astype('int8')
     self.indices_low = T.floor(self.indices).astype('int8')
     self.factors_high = self.W[self.indices_high]
     self.factors_low = self.W[self.indices_low]

     # Linear interpolation between the two neighbouring weights; the small
     # constant keeps the denominator non-zero when floor == ceil.
     weight_span = self.factors_high - self.factors_low
     offset_from_low = self.indices - self.indices_low
     index_span = self.indices_high - self.indices_low + 1E-5
     self.factors = weight_span * offset_from_low / index_span + self.factors_low

     # Weighted sum over axis 2, normalised by (length + 1).
     broadcast_factors = T.transpose(self.factors).dimshuffle(0, 'x', 1)
     normaliser = (self.length + 1.0).dimshuffle(0, 'x')
     self.output = T.sum(self.x * broadcast_factors, axis=2) / normaliser
 def get_hidden_values(self, input, batch_size):
     """Recompute the interpolated weights and the layer output.

     For every (fractional) entry of ``self.indices`` the weights at the
     floor and ceil positions of ``self.W`` are linearly interpolated. The
     result is stored in ``self.factors`` and used to compute
     ``self.output``. ``input`` and ``batch_size`` are not read here.
     """
     self.indices_high = T.ceil(self.indices).astype('int8')
     self.indices_low = T.floor(self.indices).astype('int8')
     self.factors_high = self.W[self.indices_high]
     self.factors_low = self.W[self.indices_low]

     # Linear interpolation between the two neighbouring weights; the small
     # constant keeps the denominator non-zero when floor == ceil.
     weight_span = self.factors_high - self.factors_low
     offset_from_low = self.indices - self.indices_low
     index_span = self.indices_high - self.indices_low + 1E-5
     self.factors = weight_span * offset_from_low / index_span + self.factors_low

     # Weighted sum over axis 2, normalised by (length + 1).
     broadcast_factors = T.transpose(self.factors).dimshuffle(0, 'x', 1)
     normaliser = (self.length + 1.0).dimshuffle(0, 'x')
     self.output = T.sum(self.x * broadcast_factors, axis=2) / normaliser
Exemple #31
0
def gaussian_kernel_default_radius(sigma, window_radius=None):
    """Return the radius to use for a Gaussian kernel window.

    :param sigma: standard deviation(s) of the kernel; either a plain
        Python number or a symbolic/array value.
    :param window_radius: explicit radius; when given (not None) it is
        returned unchanged.
    :return: ``window_radius`` if given; otherwise ``ceil(3 * sigma)`` as a
        Python ``int`` for a plain number, or as a symbolic int32 scalar
        (the maximum over all entries of ``sigma``) for anything else.
    """
    if window_radius is not None:
        return window_radius

    if isinstance(sigma, (int, float)):
        # A plain number needs no symbolic graph: compute the ceiling
        # directly instead of building and evaluating one.
        import math
        return int(math.ceil(3 * sigma))

    return T.cast(T.max(T.ceil(3 * sigma)), 'int32')
Exemple #32
0
    def _ppf(self, p):
        r"""
        Evaluate the percentile point function (inverse of the cumulative
        distribution function) of the discrete Weibull distribution at p,
        returning the result cast to int64.
        """
        # Ratio of logarithms: log(1 - p) / log(q).
        log_ratio = tt.log(1 - p) / tt.log(self.q)
        # Raise to the power 1 / beta, round up and shift down by one.
        quantile = tt.ceil(tt.power(log_ratio, 1.0 / self.beta)) - 1
        return quantile.astype("int64")
    def compileActivation(self, net, layerNum):
        """Build this layer's symbolic activation and append it to ``net.varArrayA``.

        The layer input is ``net.x`` for layer 0, otherwise the previous
        entry of ``net.varArrayA``. The input is reshaped to 4D, convolved
        with ``net.varWeights[layerNum]['w']`` (optionally masked by
        ``net.dropOutVectors[layerNum]``), biased, optionally max-pooled,
        flattened to 2D, transposed and passed through ``self.activation``.
        Nothing is returned.
        """
        variable = net.x if layerNum == 0 else net.varArrayA[layerNum - 1]

        #Calc shapes for reshape function on-the-fly. Assume we have square images as input.
        sX = T.cast(T.sqrt(T.shape(variable)[0] / self.kernel_shape[1]), 'int16')

        #Converts input from 2 to 4 dimensions
        # NOTE(review): presumably `variable` is (features, samples), since
        # its transpose is reshaped with shape[1] as the leading axis - confirm.
        Xr = T.reshape(variable.T, (T.shape(variable)[1], self.kernel_shape[1], sX, sX))

        if self.optimized:
            # Expected spatial output size; used below to crop the result of
            # the convolution op.
            out_size = T.cast(
                T.ceil((T.shape(Xr)[-1] - T.shape(net.varWeights[layerNum]['w'])[-1] + 1) / np.float32(self.stride)),
                'int32')

            conv_op = FilterActs(stride=self.stride)
            input_shuffled = Xr.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
            filters_shuffled = net.varWeights[layerNum]['w'].dimshuffle(1, 2, 3, 0)  # bc01 to c01b
            filters_flipped = filters_shuffled[:, ::-1, ::-1, :] # flip rows and columns
            contiguous_input = gpu_contiguous(input_shuffled)
            # The dropout mask (if enabled) is broadcast over the first and
            # last filter axes.
            contiguous_filters = gpu_contiguous(filters_flipped *
                                                (net.dropOutVectors[layerNum].dimshuffle('x', 0, 1, 'x') if self.dropout else 1.0))
            a = conv_op(contiguous_input, contiguous_filters)
            a = a[:, :out_size, :out_size, :]
            #Add bias
            a = a + net.varWeights[layerNum]['b'].dimshuffle(0, 'x', 'x', 'x')
        else:
            a = T.nnet.conv2d(Xr, net.varWeights[layerNum]['w'] *
                              (net.dropOutVectors[layerNum].dimshuffle('x', 'x', 0, 1) if self.dropout else 1.0),
                              border_mode='valid',
                              subsample=(self.stride, self.stride))
            #Add bias
            a = a + net.varWeights[layerNum]['b'].dimshuffle('x', 0, 'x', 'x')

        if self.pooling:
            if self.optimized:
                #Pooling
                # ds - side of square pool window
                # stride - Defines the stride size between successive pooling squares.
                # Setting this parameter smaller than sizeX produces overlapping pools.
                # Setting it equal to sizeX gives the usual, non-overlapping pools. Values greater than sizeX are not allowed.
                pool_op = MaxPool(ds=self.pooling_shape, stride=self.pooling_shape)

                contiguous_input = gpu_contiguous(a)
                a = pool_op(contiguous_input)
                a = a.dimshuffle(3, 0, 1, 2)       # c01b to bc01
            else:
                a = downsample.max_pool_2d(a, (self.pooling_shape, self.pooling_shape), ignore_border=False)
        else:
            if self.optimized:
                a = a.dimshuffle(3, 0, 1, 2)       # c01b to bc01

        # Back to 2D, transposed to match the layout of the layer input.
        a = T.flatten(a, outdim=2).T

        # Apply the layer's activation function (called with self.pool_size).
        a = self.activation(a, self.pool_size)

        net.varArrayA.append(a)
Exemple #34
0
def dynamic_k_max_pooling(input, sent_sizes, k_max_factor, k_max_final):
    """
    Select, per sentence, the k largest entries along the word axis while
    keeping them in their original order.

    input -- 4D symbolic tensor read as (nbatches, nchannels, nwords, ndim)
    sent_sizes -- per-sentence sizes (one entry per batch row)
    k_max_factor -- multiplied by sentence_sizes gives the value of kmax for each sentence
    k_max_final -- lower bound applied to the per-sentence k

    Returns a tensor of shape (nbatches, nchannels, nwords_max, ndim), where
    nwords_max = ceil(nwords * k_max_factor).
  """
    # Unroll input into (batch_size x nchannels x nwords) x ndim
    nbatches, nchannels, nwords, ndim = input.shape[0], input.shape[
        1], input.shape[2], input.shape[3]
    # Move the word axis to the end so sorting/indexing works on axis 3.
    x = input.dimshuffle(0, 1, 3, 2)

    # Per-sentence k: ceil(size * factor), but never below k_max_final.
    sent_sizes = T.cast(T.ceil(sent_sizes * k_max_factor), dtype='int32')
    sent_sizes = T.maximum(sent_sizes, k_max_final)
    # sent_sizes_matrix = T.repeat(sent_sizes, nwords, axis=1)
    sent_sizes_matrix = T.repeat(sent_sizes.dimshuffle(0, 'x'), nwords, axis=1)

    idx = T.arange(nwords).dimshuffle('x', 0)
    idx_matrix = T.repeat(idx, nbatches, axis=0)

    # Mask with the first k positions set, then reversed so that the last k
    # positions of each row are set instead.
    sent_sizes_mask = T.lt(idx_matrix, sent_sizes_matrix)[:, ::-1]

    neighborsArgSorted = T.argsort(x, axis=3)
    # Positions outside the mask become -1 (via the +1 / *mask / -1 trick);
    # the kept positions hold the word indices from the argsort.
    neighborsArgSorted_masked = (
        (neighborsArgSorted + 1) *
        sent_sizes_mask.dimshuffle(0, 'x', 'x', 1)) - 1
    # Sorting the kept indices puts them back in ascending word order.
    neighborsArgSorted_masked_sorted = neighborsArgSorted_masked.sort(axis=3)

    nwords_max = T.cast(T.ceil(nwords * k_max_factor), 'int32')
    # print nwords_max.eval()
    # NOTE(review): rows whose k is smaller than nwords_max presumably keep
    # some -1 entries here, which index the last word below - confirm that
    # this is intended.
    neighborsArgSorted_masked_sorted_clipped = neighborsArgSorted_masked_sorted[:, :, :,
                                                                                -nwords_max:]

    # Flat index vectors for the first three axes, matching the flattened
    # word indices in ax3.
    ax0 = T.repeat(T.arange(nbatches), nchannels * ndim * nwords_max)
    ax1 = T.repeat(T.arange(nchannels), ndim * nwords_max).dimshuffle('x', 0)
    ax1 = T.repeat(ax1, nbatches, axis=0).flatten()
    ax2 = T.repeat(T.arange(ndim), nwords_max, axis=0).dimshuffle('x', 'x', 0)
    ax2 = T.repeat(ax2, nchannels, axis=1)
    ax2 = T.repeat(ax2, nbatches, axis=0).flatten()
    ax3 = neighborsArgSorted_masked_sorted_clipped.flatten()

    # Gather the selected entries and restore the original axis order.
    pooled_out = x[ax0, ax1, ax2, ax3]
    pooled_out = pooled_out.reshape(
        (nbatches, nchannels, ndim, nwords_max)).dimshuffle(0, 1, 3, 2)

    return pooled_out
Exemple #35
0
    def weighted_vector_mse(self, y_true, y_pred):
        """Return a squared-error loss weighted by ``ceil(y_true)``.

        Both arguments are also stored on the instance as ``self.y_true``
        and ``self.y_pred``. The weighted squared error is summed over the
        last axis and then averaged.
        """
        self.y_true, self.y_pred = y_true, y_pred

        # ceil(y_true) acts as the per-element weight of the residual.
        mask = T.ceil(self.y_true)
        residual = self.y_true - self.y_pred
        # Other objectives can be weighted the same way, e.g. by multiplying
        # a binary cross-entropy term with the weights.
        per_element = T.square(mask * residual)
        per_sample = T.sum(per_element, axis=-1)
        return T.mean(per_sample)
Exemple #36
0
 def call(self, x, mask=None):
     """Keep the k largest entries along axis 1 of ``x`` in original order.

     k is the larger of ``self.ktop`` and the ceiling of the remaining
     layer fraction times ``self.inputdim``. ``mask`` is not used.
     """
     remaining_fraction = ((self.numLayers - self.currlayer) /
                           float(self.numLayers))
     dynamic_k = T.ceil(remaining_fraction * self.inputdim)
     get_k = K.cast(K.max([self.ktop, dynamic_k]), 'int32')

     # Index grids for axes 0 and 2, broadcast against the selection on axis 1.
     batch_index = T.arange(x.shape[0]).dimshuffle(0, "x", "x")
     feature_index = T.arange(x.shape[2]).dimshuffle("x", "x", 0)
     # Take the last k argsort positions along axis 1, then sort them so the
     # selected entries keep their original relative order.
     top_k_index = T.sort(T.argsort(x, axis=1)[:, -get_k:, :], axis=1)

     return x[batch_index, top_k_index, feature_index]
Exemple #37
0
    def pool(self, x, mode, pool_size, strides, padding=(0, 0)):
        """Pool ``x`` over two or three trailing axes.

        :param x: input tensor; the pooled axes start at axis 2.
        :param mode: pooling mode; ``'avg'`` is mapped to
            ``'average_exc_pad'``.
        :param pool_size: window size per pooled axis (length 2 or 3).
        :param strides: stride per pooled axis; ``None`` means ``pool_size``.
        :param padding: padding per pooled axis; an entry may be a tuple
            ``(p, p + 1)`` for unequal sides.
        :return: pooled tensor truncated to ``ceil(size / stride)`` along
            every pooled axis.
        """
        if strides is None:
            strides = pool_size
        assert len(strides) == len(pool_size)
        is_2d = len(pool_size) == 2

        if mode == 'avg':
            mode = 'average_exc_pad'

        # theano requires symmetric padding, so when the two sides of an
        # axis differ the larger value is used for both.
        max_padding = []
        for axis_pad in padding:
            if isinstance(axis_pad, tuple):
                assert axis_pad[1] == axis_pad[0] + 1
                max_padding.append(axis_pad[1])
            else:
                max_padding.append(axis_pad)

        if is_2d:
            pool_out = pool.pool_2d(x,
                                    ws=pool_size,
                                    stride=strides,
                                    ignore_border=True,
                                    pad=max_padding,
                                    mode=mode)
        else:
            # First pass: pool over H and W (moved to the last two axes).
            hw_pooled = pool.pool_2d(x.dimshuffle(0, 1, 4, 2, 3),
                                     ws=pool_size[:2],
                                     stride=strides[:2],
                                     ignore_border=True,
                                     pad=max_padding[:2],
                                     mode=mode)

            # Second pass: pool over Z with a window of 1 on the other axis.
            pool_out = pool.pool_2d(hw_pooled.dimshuffle(0, 1, 3, 4, 2),
                                    ws=(1, pool_size[2]),
                                    stride=(1, strides[2]),
                                    ignore_border=True,
                                    pad=(0, max_padding[2]),
                                    mode=mode)

        # Because of the enlarged padding theano may return more than the
        # expected output extent; compute that extent to truncate.
        expected = [
            self.cast(
                T.ceil(self.cast(x.shape[axis + 2], _FLOATX) / strides[axis]),
                'int32')
            for axis in range(len(strides))
        ]

        if is_2d:
            return pool_out[:, :, :expected[0], :expected[1]]
        return pool_out[:, :, :expected[0], :expected[1], :expected[2]]
Exemple #38
0
def set_k_max(layer, k_top, layer_position, nb_layers, sentence_length):
    """
    Assign ``layer.k_max`` from the layer's position in the network.

    The value is the larger of ``k_top`` and
    ``ceil(sentence_length * (nb_layers - layer_position) / nb_layers)``
    (cast to int32).
    http://nal.co/papers/Kalchbrenner_DCNN_ACL14
    """
    # Fraction of layers remaining after this one; "* 1." forces a float.
    remaining_ratio = (nb_layers - layer_position) * 1. / nb_layers
    dynamic_k = T.cast(T.ceil(sentence_length * remaining_ratio), 'int32')
    layer.k_max = T.maximum(k_top, dynamic_k)
Exemple #39
0
    def get_stencil(self, t, r=None, texp=None):
        """Build the sorted list of contact-point times covering ``t``.

        When ``r`` or ``texp`` is None, ``tt.shape_padright(t)`` is returned
        on its own. Otherwise four contact-point offsets are computed,
        replicated for every period spanned by ``t``, concatenated and
        sorted, and the tuple ``(ts, flag)`` is returned.

        NOTE(review): the two return paths have different shapes (a single
        tensor versus a 2-tuple); callers must handle both. ``texp`` is only
        tested against None here, its value is not used.
        """
        if r is None or texp is None:
            return tt.shape_padright(t)

        # z is used to broadcast scalars to the shape of self.a.
        z = tt.zeros_like(self.a)
        r = tt.as_tensor_variable(r)
        R = self.r_star + z
        hp = 0.5 * self.period

        if self.ecc is None:
            # Equation 14 from Winn (2010)
            k = r / self.r_star
            arg1 = tt.square(1 + k) - tt.square(self.b)
            arg2 = tt.square(1 - k) - tt.square(self.b)
            factor = R / (self.a * self.sin_incl)
            # NOTE(review): arg2 can presumably be negative when
            # |b| > |1 - k|, which would make the square root NaN - confirm.
            hdur1 = hp * tt.arcsin(factor * tt.sqrt(arg1)) / np.pi
            hdur2 = hp * tt.arcsin(factor * tt.sqrt(arg2)) / np.pi
            # Offsets of the four contact points, symmetric around zero.
            ts = [-hdur1, -hdur2, hdur2, hdur1]
            flag = z

        else:
            # Contact points for the sums R + r and R - r.
            M_contact1 = self.contact_points_op(
                self.a, self.ecc, self.cos_omega, self.sin_omega,
                self.cos_incl + z, self.sin_incl + z, R + r)
            M_contact2 = self.contact_points_op(
                self.a, self.ecc, self.cos_omega, self.sin_omega,
                self.cos_incl + z, self.sin_incl + z, R - r)

            flag = M_contact1[2] + M_contact2[2]

            # Convert to time offsets and wrap them into [-hp, hp).
            ts = [
                tt.mod((M_contact1[0]-self.M0)/self.n+hp, self.period)-hp,
                tt.mod((M_contact2[0]-self.M0)/self.n+hp, self.period)-hp,
                tt.mod((M_contact2[1]-self.M0)/self.n+hp, self.period)-hp,
                tt.mod((M_contact1[1]-self.M0)/self.n+hp, self.period)-hp
            ]

        # Whole-period grid (anchored at t0) from the period containing
        # min(t) up to one period past the one containing max(t).
        start = self.period * tt.floor((tt.min(t) - self.t0) / self.period)
        end = self.period * (tt.ceil((tt.max(t) - self.t0) / self.period) + 1)
        start += self.t0
        end += self.t0
        tout = []
        for i in range(4):
            if z.ndim < 1:
                # Scalar case: one arange of period starts per contact point.
                tout.append(ts[i] + tt.arange(start, end, self.period))
            else:
                # Non-scalar case: scan over the entries, each with its own
                # start, end and period.
                tout.append(theano.scan(
                    fn=lambda t0, s0, e0, p0: t0 + tt.arange(s0, e0, p0),
                    sequences=[ts[i], start, end, self.period],
                )[0].flatten())

        ts = tt.sort(tt.concatenate(tout))
        return ts, flag
Exemple #40
0
def R2_RNN_block(tparams, inputs, prefix=None, name='r2_rnn', std=True):
    """Reduce a sequence by repeatedly running an RNN over pairs of steps.

    In every round the current sequence is cut into chunks of up to two
    consecutive steps; ``RNN_layer`` is run on each chunk and its last
    hidden state is kept, which halves the sequence length (rounded up).
    ``ceil(log2(n_steps))`` rounds are performed.

    :param tparams: parameters passed through to ``RNN_layer``.
    :param inputs: 3D symbolic tensor whose first axis is the step axis.
    :param prefix: name prefix, combined with ``name`` via ``GetPrefix``.
    :param name: name of this block.
    :param std: when true, ``inputs`` is passed through ``standardize``
        before the reduction.
    :return: the reduced sequence (the valid leading rows of the final
        buffer).
    """
    prefix = GetPrefix(prefix, name)
    n_steps = inputs.shape[0]

    # Number of halving rounds.
    r_steps = T.ceil(T.log2(n_steps)).astype('uint32')
    r_steps = T.arange(r_steps)

    def _step_inner(index, num, inps):
        # Chunk `index` covers steps [2 * index, min(2 * index + 2, num)).
        index = index * 2
        index_ = T.minimum(index + 2, num)

        h = RNN_layer(tparams,
                      inps[index:index_, :, :],
                      prefix=prefix,
                      name=None,
                      std=False)
        return h[-1, :, :]

    def _step(r_step, num, inps, std=True):
        # Integer (floor) division: the chunk count must stay an integer
        # because the chunk indices are used as slice bounds. A plain "/"
        # is a true division under Python 3 and would yield float indices.
        steps = T.arange((num + 1) // 2)

        out, updates = theano.scan(
            _step_inner,
            sequences=[steps],
            outputs_info=None,
            non_sequences=[num, inps],
            name=_p(prefix, 'inner_scan'),
            n_steps=steps.shape[0],
            profile=False)

        # Write the shortened sequence into a zero buffer of the original
        # shape so that the carried value keeps a constant shape.
        num = out.shape[0]
        h = T.zeros_like(inps)
        h = T.set_subtensor(h[:num], out)
        return num, h

    if std:
        inputs = standardize(inputs)
    out, updates = theano.reduce(
        lambda r_step, num, inps: _step(r_step, num, inps),
        sequences=r_steps,
        outputs_info=[inputs.shape[0], inputs],
        name=_p(prefix, 'scan'))
    # out[0] is the final valid length, out[1] the padded buffer.
    return out[1][:out[0]]
Exemple #41
0
def dynamic_k_max_pooling(input, sent_sizes, k_max_factor, k_max_final):
    """
    Select, per sentence, the k largest entries along the word axis while
    keeping them in their original order.

    input -- 4D symbolic tensor read as (nbatches, nchannels, nwords, ndim)
    sent_sizes -- per-sentence sizes (one entry per batch row)
    k_max_factor -- multiplied by sentence_sizes gives the value of kmax for each sentence
    k_max_final -- lower bound applied to the per-sentence k
    """
    n_batch = input.shape[0]
    n_chan = input.shape[1]
    n_words = input.shape[2]
    n_dim = input.shape[3]
    # Put the word axis last so that sorting and indexing act on axis 3.
    x = input.dimshuffle(0, 1, 3, 2)

    # Per-sentence k: ceil(size * factor), bounded below by k_max_final.
    k_per_sentence = T.maximum(
        T.cast(T.ceil(sent_sizes * k_max_factor), dtype='int32'),
        k_max_final)
    k_matrix = T.repeat(k_per_sentence.dimshuffle(0, 'x'), n_words, axis=1)

    word_positions = T.repeat(
        T.arange(n_words).dimshuffle('x', 0), n_batch, axis=0)

    # True for the first k positions, then reversed so the last k are set.
    keep_mask = T.lt(word_positions, k_matrix)[:, ::-1]

    sort_order = T.argsort(x, axis=3)
    # Entries outside the mask become -1; kept entries hold word indices.
    masked_order = (
        (sort_order + 1) * keep_mask.dimshuffle(0, 'x', 'x', 1)) - 1
    # Sorting the kept indices restores ascending word order.
    masked_order_sorted = masked_order.sort(axis=3)

    k_cap = T.cast(T.ceil(n_words * k_max_factor), 'int32')
    kept_words = masked_order_sorted[:, :, :, -k_cap:]

    # Flat index vectors for every axis of x.
    batch_ix = T.repeat(T.arange(n_batch), n_chan * n_dim * k_cap)

    chan_ix = T.repeat(T.arange(n_chan), n_dim * k_cap).dimshuffle('x', 0)
    chan_ix = T.repeat(chan_ix, n_batch, axis=0).flatten()

    dim_ix = T.repeat(T.arange(n_dim), k_cap, axis=0).dimshuffle('x', 'x', 0)
    dim_ix = T.repeat(dim_ix, n_chan, axis=1)
    dim_ix = T.repeat(dim_ix, n_batch, axis=0).flatten()

    word_ix = kept_words.flatten()

    # Gather the selection and move the word axis back to position 2.
    gathered = x[batch_ix, chan_ix, dim_ix, word_ix]
    restored = gathered.reshape((n_batch, n_chan, n_dim, k_cap))
    return restored.dimshuffle(0, 1, 3, 2)
Exemple #42
0
 def _get_valid_cost(self, input_vec, *args, **kwargs):
     """Set up the validation monitors on a leading slice of the input.

     The first ``ceil(n * self.config.sample_percent_for_test)`` rows of
     ``input_vec`` are scored; two accuracy expressions are stored in
     ``self.monitor_valid_vars`` and the second one also in
     ``self.stop_monitor_var``. Nothing is returned.
     """
     sample_fraction = self.config.sample_percent_for_test
     n_eval = tensor.ceil(input_vec.shape[0] * sample_fraction).astype('int32')

     preds = self._get_pred_dist(input_vec[0:n_eval])
     # Class indices ordered from highest to lowest prediction.
     ranks = tensor.argsort(preds, axis=1)[:, ::-1]

     # Hit when the best-ranked class equals the target.
     top1_accuracy = tensor.eq(self.hashtag[0:n_eval], ranks[:, 0]).mean()

     # Hit when the target is among the first self.rank ranked classes.
     hits = tensor.eq(ranks[:, 0:self.rank], self.hashtag[0:n_eval, None])
     top10_accuracy = tensor.sum(hits, axis=1).mean()

     top1_accuracy.name = "top1_accuracy"
     top10_accuracy.name = "top10_accuracy"
     self.monitor_valid_vars = [[top1_accuracy], [top10_accuracy]]
     self.stop_monitor_var = top10_accuracy
Exemple #43
0
    def __init__(self,
                 numpy_rng,
                 theano_rng,
                 input,
                 input_shape,
                 indices,
                 length,
                 max_length=30,
                 n_out=1,
                 batch_size=100,
                 W=None):
        """Set up the layer and build its symbolic output.

        :param numpy_rng: numpy random generator used to initialise ``W``.
        :param theano_rng: symbolic random generator (stored only).
        :param input: 3D tensor, stored as ``self.x``.
        :param input_shape: ``input_shape[1]`` gives ``self.n_in``.
        :param indices: 2D tensor of (fractional) positions into ``W``.
        :param length: 1D tensor used to normalise the output.
        :param max_length: stored as a float.
        :param n_out: stored as ``self.n_out``.
        :param batch_size: not used in this constructor.
        :param W: optional shared weight vector; created when ``None``.
        """
        self.numpy_rng = numpy_rng
        self.theano_rng = theano_rng
        self.x = input  # 3D tensor
        self.indices = indices  # 2D tensor
        self.length = length  # 1D tensor
        self.n_in = input_shape[1]
        self.n_out = n_out
        self.max_length = float(max_length)

        # Reference initial values; only referenced from the commented-out
        # alternative below.
        init_W = [
            0.54457003, 0.72741562, 1.39331913, 1.12367916, 0.79878163,
            0.27706152, 0.3593896, 0.39622781, 0.27895978, 0.23260947,
            0.26763204, 0.27084899, 0.07067534, 0.13463201, 0.07948229,
            0.02779013, 0.12053657, 0.14807181, 0.24277158, 0.36964679,
            0.1601541, 0.37342793, 0.47257897, 0.39729786, 0.56589139,
            0.30535939, 0.10021771, 0.07151619, 0.12510002, 0.3112531,
            0.43562451, 0.05050614, 0.07199406, 0.50659907, 0.42588547,
        ]

        if W is not None:
            self.W = W
        else:
            # low == high, so every weight starts at 0.5.
            # Alternatives: init_W, numpy.linspace(1.0, 0.0, self.n_in)
            drawn = self.numpy_rng.uniform(low=0.5, high=0.5, size=self.n_in)
            W_values = numpy.asarray(drawn, dtype=theano.config.floatX)
            self.W = theano.shared(value=W_values, name='W', borrow=True)

        # Weights at the floor and ceil of every fractional index.
        self.indices_high = T.ceil(self.indices).astype('int8')
        self.indices_low = T.floor(self.indices).astype('int8')
        self.factors_high = self.W[self.indices_high]
        self.factors_low = self.W[self.indices_low]

        # Linear interpolation; the small constant keeps the denominator
        # non-zero when floor == ceil.
        weight_span = self.factors_high - self.factors_low
        offset_from_low = self.indices - self.indices_low
        index_span = self.indices_high - self.indices_low + 1E-5
        self.factors = weight_span * offset_from_low / index_span + self.factors_low

        # Weighted sum over axis 2, normalised by (length + 1).
        broadcast_factors = T.transpose(self.factors).dimshuffle(0, 'x', 1)
        normaliser = (self.length + 1.0).dimshuffle(0, 'x')
        self.output = T.sum(self.x * broadcast_factors, axis=2) / normaliser

        self.params = [self.W]
class dynamicKMaxPoolingLayer(Layer):

    def __init__(self, incoming, kTop, numOfLayers, layerNumber, **kwargs):
        """Configure a dynamic k-max pooling layer.

        :param incoming: forwarded unchanged to the parent ``Layer`` constructor
        :param kTop: lower bound for the pooling size ``k``
        :param numOfLayers: total number of layers used in the ``k`` formula
        :param layerNumber: index of this layer used in the ``k`` formula
        :param kwargs: forwarded unchanged to the parent ``Layer`` constructor

        The attribute assignments must live inside ``__init__``: at class
        level there is no ``self`` and the class body would raise a
        ``NameError`` as soon as the class statement is executed.
        """
        super(dynamicKMaxPoolingLayer, self).__init__(incoming, **kwargs)

        self.kTop = kTop
        self.numOfLayers = numOfLayers
        self.layerNumber = layerNumber
        # As per the definition in Kalchbrenner's paper, the
        # k value for k-max pooling is dynamically given as :
        #   k = max(kTop, ceil((numOfLayers - layerNumber) / numOfLayers * s))
        # where s is taken from self.input_shape[3].
        # NOTE(review): self.input_shape is presumably provided by the parent
        # Layer class -- confirm.
        self.k = T.cast(T.max([self.kTop,
            T.ceil((self.numOfLayers - self.layerNumber)*self.input_shape[3]/float(self.numOfLayers))]), 'int16')

    def get_output_for(self, input)
    def get_hidden_values(self, input):
        """Return tanh(k-max-pool(conv(input)) + bias) as a symbolic expression.

        Side effects: the raw convolution result is stored in
        ``self.conv_out`` and the symbolic shape of the returned expression
        in ``self.shape``.
        """
        # Full-mode convolution of the input feature maps with the filters.
        self.conv_out = conv.conv2d(
            input=input,
            filters=self.W,
            border_mode="full",
            filter_shape=self.kshp,
            image_shape=self.imshp,
        )

        # Pooling size: the larger of k_Top and ceil(factor * s).
        pool_size = T.cast(
            T.max((self.k_Top, T.ceil(self.factor * self.s))), "int32")
        pooled = self.kmaxPool(self.conv_out, self.conv_out.shape, pool_size)

        # The bias is broadcast along every axis except axis 1.
        biased = pooled + self.b.dimshuffle("x", 0, "x", "x")
        output = T.tanh(biased)
        self.shape = output.shape

        return output
def R2_RNN_block(tparams,inputs,prefix=None,name='r2_rnn',std=True):
	"""Shrink a sequence by repeatedly running an RNN over adjacent row pairs.

	The first axis of ``inputs`` is treated as the step axis. The reduction
	runs ``ceil(log2(n_steps))`` rounds; in each round every window of (at
	most) two consecutive rows is fed to ``RNN_layer`` and replaced by the
	last row of that layer's output, so a round turns ``num`` valid rows into
	``(num + 1) // 2`` rows.

	:param tparams: parameter container handed to ``RNN_layer``
	:param inputs: symbolic tensor indexed as ``inputs[step, :, :]``
	:param prefix: parameter-name prefix, combined with ``name`` by ``GetPrefix``
	:param name: block name used to build the prefix
	:param std: when true, ``inputs`` is passed through ``standardize`` first
	:return: the rows that are still valid after the last reduction round

	NOTE(review): the update dictionaries returned by ``theano.scan`` and
	``theano.reduce`` are discarded here -- confirm ``RNN_layer`` produces none.
	"""
	prefix=GetPrefix(prefix,name)
	n_steps=inputs.shape[0]

	# One entry per reduction round.
	r_steps=T.ceil(T.log2(n_steps)).astype('uint32')
	r_steps=T.arange(r_steps)

	def _step_inner(index,num,inps):
		# Window covers rows [2*index, min(2*index+2, num)), i.e. one or two rows.
		index=index*2
		index_=T.minimum(index+2,num)

		h=RNN_layer(tparams,inps[index:index_,:,:],prefix=prefix,name=None,std=False)
		return h[-1,:,:]

	def _step(r_step,num,inps):
		# Floor division keeps the window count an integer on both Python 2
		# and Python 3; "/" on integer tensors is deprecated integer division
		# in the former and true (float) division in the latter.
		steps=T.arange((num+1)//2)

		out,updates=theano.scan(_step_inner,
							sequences=[steps],
							outputs_info=None,
							non_sequences=[num,inps],
							name=_p(prefix,'inner_scan'),
							n_steps=steps.shape[0],
							profile=False)

		# The carried buffer keeps the shape of ``inps``; only its first
		# ``num`` rows hold the results of this round, the rest is zero.
		num=out.shape[0]
		h=T.zeros_like(inps)
		h=T.set_subtensor(h[:num],out)
		return num,h

	if std:
		inputs=standardize(inputs)
	out,updates=theano.reduce(_step,
							sequences=r_steps,
							outputs_info=[inputs.shape[0],inputs],
							name=_p(prefix,'scan')
							)
	# out[0] is the final number of valid rows, out[1] the padded buffer.
	return out[1][:out[0]]
 def attend(self, y_p):
   """Build the attention input for ``y_p`` and the state updates.

   Returns a pair ``(inp, updates)``: ``inp`` is the row of ``self.B``
   selected at ``ceil(self.t)`` projected through ``self.W_att_in``, and
   ``updates`` maps ``self.t`` and ``self.ns`` to their next values.
   """
   smooth = T.constant(self.attrs['smooth'], 'float32')
   max_skip = self.base[0].attrs['max_skip']

   # Expected skip under the softmax over the projected state.
   skip_scores = T.dot(y_p, self.T_W) + self.T_b
   expected_skip = T.dot(T.nnet.softmax(skip_scores),
                         T.arange(max_skip, dtype='float32'))

   # Reference value read from y_t at the current (ns, batch, t) position.
   step_idx = T.cast(self.ns, 'int32')
   batch_idx = T.arange(self.y_t.shape[1])
   reference_skip = self.y_t[step_idx, batch_idx, T.cast(self.t, 'int32')]

   # Blend reference and prediction with the 'smooth' weight.
   skip = smooth * reference_skip + (numpy.float32(1) - smooth) * expected_skip

   pos = T.cast(T.ceil(self.t), 'int32')
   inp = T.dot(self.B[pos, T.arange(pos.shape[0])], self.W_att_in)

   updates = {}
   # The position is decreased by the blended skip and clamped at zero.
   updates[self.t] = T.maximum(self.t - skip, numpy.float32(0))
   updates[self.ns] = self.ns - numpy.float32(1)
   return inp, updates
Exemple #48
0
def pad_to_a_multiple(tensor_, k, pad_with):
    """Pad a tensor to make its first dimension a multiple of a number.

    Parameters
    ----------
    tensor_ : :class:`~theano.Variable`
        The tensor to pad along its first axis.
    k : int
        The number, a multiple of which the length of the tensor is made.
    pad_with : float or int
        The value for padding.

    Returns
    -------
    :class:`~theano.Variable`
        A tensor with the same number of dimensions as `tensor_` whose
        first dimension is the smallest multiple of `k` that is not less
        than the original length. The leading rows are `tensor_`, the
        remaining rows are filled with `pad_with`.

    """
    length = tensor_.shape[0]
    # Integer ceil-division. Going through float32 and ``ceil`` is inexact
    # once the length no longer fits in float32's 24-bit mantissa.
    new_length = ((length + k - 1) // k * k).astype('int64')
    new_shape = tensor.set_subtensor(tensor_.shape[:1], new_length)
    canvas = tensor.alloc(pad_with, tensor.prod(new_shape)).reshape(
        new_shape, ndim=tensor_.ndim)
    return tensor.set_subtensor(canvas[:length], tensor_)
    def Output(self):
        """Return tanh(k-max-pool(conv(self.x)) + bias) as a symbolic expression.

        When ``self.do_fold`` is set, the convolution result goes through
        ``self.Fold`` before pooling.
        """
        # Full-mode convolution of the input with the trained filters.
        conv_out = conv.conv2d(
            input=self.x,
            filters=self.W,
            border_mode='full',
            filter_shape=self.kshp,
            image_shape=self.imshp,
        )

        # Optional folding step; otherwise pool the convolution directly.
        to_pool = self.Fold(conv_out) if self.do_fold else conv_out

        # Pooling size: the larger of k_Top and ceil(factor * s).
        k = T.cast(T.max((self.k_Top, T.ceil(self.factor * self.s))), 'int32')
        pooled_out = self.kmaxPool(to_pool, to_pool.shape, k)

        # The bias is broadcast along every axis except axis 1.
        return T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
    def __init__(self,
                 numpy_rng,
                 theano_rng,
                 input,
                 input_shape,
                 indices,
                 length,
                 max_length=30,
                 n_out=1,
                 batch_size=100,
                 W=None):
        """Build a layer that weights its input with interpolated factors.

        A 1-D weight vector ``W`` of size ``input_shape[1]`` is looked up at
        the (possibly fractional) positions in ``indices``; fractional
        positions are linearly interpolated between the two neighbouring
        entries. The input is multiplied by these factors, summed over its
        last axis and divided by ``length + 1``.

        ``batch_size`` is accepted but not used in this constructor.
        """
        self.n_out = n_out
        self.n_in = input_shape[1]
        self.x = input  # 3D tensor
        self.indices = indices  # 2D tensor
        self.length = length  # 1D tensor
        self.max_length = float(max_length)
        self.numpy_rng = numpy_rng
        self.theano_rng = theano_rng

        # Alternative initial weights; not referenced below.
        init_W = [
            0.54457003, 0.72741562, 1.39331913, 1.12367916, 0.79878163,
            0.27706152, 0.3593896, 0.39622781, 0.27895978, 0.23260947,
            0.26763204, 0.27084899, 0.07067534, 0.13463201, 0.07948229,
            0.02779013, 0.12053657, 0.14807181, 0.24277158, 0.36964679,
            0.1601541, 0.37342793, 0.47257897, 0.39729786, 0.56589139,
            0.30535939, 0.10021771, 0.07151619, 0.12510002, 0.3112531,
            0.43562451, 0.05050614, 0.07199406, 0.50659907, 0.42588547
        ]

        if W is not None:
            self.W = W
        else:
            # low == high, so every weight starts at 0.5.
            drawn = self.numpy_rng.uniform(low=0.5, high=0.5, size=(self.n_in))
            W_values = numpy.asarray(drawn, dtype=theano.config.floatX)
            self.W = theano.shared(value=W_values, name='W', borrow=True)

        # Integer neighbours of every (possibly fractional) index.
        self.indices_high = T.ceil(self.indices).astype('int8')
        self.indices_low = T.floor(self.indices).astype('int8')
        self.factors_high = self.W[self.indices_high]
        self.factors_low = self.W[self.indices_low]

        # Linear interpolation between the two neighbouring weights. The
        # 1E-5 keeps the denominator non-zero when an index is integral
        # (high == low); the numerator is zero in that case as well.
        rise = self.factors_high - self.factors_low
        offset = self.indices - self.indices_low
        span = self.indices_high - self.indices_low + 1E-5
        self.factors = rise * offset / span + self.factors_low

        weighted = self.x * T.transpose(self.factors).dimshuffle(0, 'x', 1)
        normaliser = (self.length + 1.0).dimshuffle(0, 'x')
        self.output = T.sum(weighted, axis=2) / normaliser

        self.params = [self.W]
Exemple #51
0
        def weighted_sentence(sentence, sent_len, W):
            """Summarise a sentence as the concatenation of per-section mean embeddings.

            The sentence is split into ``num_section`` sections of
            ``ceil(sent_len / num_section)`` words each; every section
            contributes the mean of the rows of ``W`` selected by its word
            ids. The last section runs up to ``sent_len`` and falls back to
            the EOS embedding when it would be empty. Sentences shorter than
            ``num_section`` use the embeddings of their first
            ``num_section`` entries instead.

            :param sentence: symbolic word ids, used as row indices into ``W``
            :param sent_len: symbolic sentence length
            :param W: embedding matrix
            :return: symbolic expression selected by ``T.switch`` between the
                short-sentence form and the section-mean form
            """
            sec_length = T.cast(T.ceil(sent_len / float(num_section)), 'int32')

            # Collect the section vectors first and concatenate once; this
            # also works when num_section == 1, where the loop body never
            # runs and there is nothing to concatenate the last section to.
            sec_vectors = []
            # for every section except the last one
            for sec_num in range(num_section - 1):
                sec_start_id = sec_num * sec_length
                sec_end_id = (sec_num + 1) * sec_length
                sec_vectors.append(
                    T.mean(W[sentence[sec_start_id:sec_end_id].flatten()], axis=0))

            # for the last section
            sec_start_id = (num_section - 1) * sec_length
            sec_end_id = sent_len
            # if sec_start_id >= sent_len, it means this section should contain 0 words, so use EOS embedding instead.
            sec_vectors.append(T.switch(
                T.ge(sec_start_id, sent_len),
                W[io_vocab.VocabConstants.EOS_INDEX],
                T.mean(W[sentence[sec_start_id:sec_end_id].flatten()], axis=0)))

            # axis=0 because every section vector is 1-D
            global_vector = T.concatenate(sec_vectors, axis=0)
            global_vector_for_short = W[sentence[:num_section].flatten()].reshape((1, emb_dim * num_section))
            return T.switch(T.gt(num_section, sent_len), global_vector_for_short, global_vector)
    def get_hidden_values(self, input):
        """Return the hidden values of an AE stacked on the pooled convolution.

        Side effects: sets ``self.conv_out``, ``self.shape`` and
        ``self.fully_connected``, and appends the AE parameters to
        ``self.params``.

        NOTE(review): each call builds a new ``AE`` and extends
        ``self.params`` again.
        """
        # Full-mode convolution of the input feature maps with the filters.
        self.conv_out = conv.conv2d(
            input=input,
            filters=self.W,
            border_mode="full",
            filter_shape=self.kshp,
            image_shape=self.imshp,
        )

        # Pooling size: the larger of k_Top and ceil(factor * s).
        pool_size = T.cast(
            T.max((self.k_Top, T.ceil(self.factor * self.s))), "int32")
        pooled = self.kmaxPool(self.conv_out, self.conv_out.shape, pool_size)

        # The bias is broadcast along every axis except axis 1.
        output = T.tanh(pooled + self.b.dimshuffle("x", 0, "x", "x"))
        self.shape = output.shape

        # Flatten to 2-D and feed the result to the fully connected AE.
        hidden_input = output.flatten(2)
        visible_units = self.kshp[0] * 25 * self.k_Top  # nkerns[0] replaced with 8
        autoencoder = AE(
            self.rng, input=hidden_input, n_visible=visible_units, n_hidden=60)
        self.fully_connected = autoencoder
        self.params.extend(autoencoder.params)

        return autoencoder.get_hidden_values(hidden_input)
Exemple #53
0
    def apply(self, image, image_shape, location, scale):
        """Extract a patch from ``image`` and report the fraction of input skipped.

        Returns ``(patch, savings)`` where ``savings`` is one minus the ratio
        of the window volume ``prod(b - a)`` to ``prod(image_shape)``.

        NOTE(review): without a ``cropop`` attribute and with both
        ``batched_window`` and ``scan`` false, ``patch`` is never assigned.
        """
        a, b = self.compute_hard_windows(image_shape, location, scale)

        if not hasattr(self, "cropop"):
            # Round the window outwards to integer bounds.
            a = T.cast(T.floor(a), 'int16')
            b = T.cast(T.ceil(b), 'int16')

            if self.batched_window:
                # Use the bounding box of all windows: every sample then
                # gets a slice of the same length and no scan is needed, at
                # the cost of typically selecting more of the input.
                a = a.min(axis=0, keepdims=True)
                b = b.max(axis=0, keepdims=True)

                patch = self.apply_inner(image, location, scale, a[0], b[0])
            elif self.scan:
                def crop_single(sample, lower, upper, sample_location, sample_scale):
                    # apply_inner expects a batch axis, so add one on the
                    # way in and strip it again on the way out.
                    batched = self.apply_inner(
                        T.shape_padleft(sample),
                        T.shape_padleft(sample_location),
                        T.shape_padleft(sample_scale),
                        lower, upper)
                    return batched[0]

                patch, _ = theano.map(
                    crop_single,
                    sequences=[image, a, b, location, scale])
        else:
            patch = self.cropop(image, a, b, location, scale)

        window_volume = T.cast((b - a).prod(axis=1), floatX)
        savings = (1 - window_volume / image_shape.prod(axis=1))
        return patch, savings
    def take_glimpses(self, attended, preprocessed_attended=None,
                      attended_mask=None, weights=None, step=None, **states):
        """Compute glimpses on a window of ``attended`` chosen by ``self.prior``.

        The window ``[begin, end)`` along the first axis of ``attended`` is
        selected according to ``self.prior['type']``:

        * ``'expanding'`` (default): the bounds start at ``initial_begin`` /
          ``initial_end`` and move by ``min_speed`` / ``max_speed`` per step.
        * ``'window_around_mean'`` / ``'window_around_median'``: a window of
          ``before`` / ``after`` positions around the mean or median position
          of the previous ``weights``; an extra mask restricts each batch
          item to its own window.

        Energies, weights and weighted averages are computed on the cut
        window only and then pasted back into zero-filled tensors shaped like
        ``weights``.

        :param attended: the attended sequence, sliced along its first axis
        :param preprocessed_attended: optional preprocessed version of
            ``attended``, cut with the same window
        :param attended_mask: optional mask, cut with the same window
        :param weights: previous weights; their second axis is cut with the
            same window
        :param step: step counter; ``step[0]`` drives the expanding prior
        :param states: forwarded to ``self.compute_energies``
        :return: ``(weighted_averages, new_weights, new_energies, step + 1)``
        :raises ValueError: for an unknown ``window_around*`` prior type
        :raises Exception: for any other unknown prior type
        """
        # Cut the considered window.
        p = self.prior
        length = attended.shape[0]
        prior_type = p.get('type', 'expanding')
        if prior_type=='expanding':
            begin = p['initial_begin'] + step[0] * p['min_speed']
            end = p['initial_end'] + step[0] * p['max_speed']
            begin = tensor.maximum(0, tensor.minimum(length - 1, begin))
            end = tensor.maximum(0, tensor.minimum(length, end))
            additional_mask = None
        elif prior_type.startswith('window_around'):
            #check whether we want the mean or median!
            if prior_type == 'window_around_mean':
                position_in_attended = tensor.arange(length, dtype=floatX)[None, :]
                expected_last_source_pos = (weights * position_in_attended).sum(axis=1)
            elif prior_type == 'window_around_median':
                # Indicator of "cumulative weight has reached 0.5"; the argmax
                # of its first difference marks where it switches on.
                ali_to_05 = tensor.extra_ops.cumsum(weights, axis=1) - 0.5
                ali_to_05 = (ali_to_05>=0)
                ali_median_pos = ali_to_05[:,1:] - ali_to_05[:,:-1]
                expected_last_source_pos = tensor.argmax(ali_median_pos, axis=1)
                expected_last_source_pos = theano.gradient.disconnected_grad(
                    expected_last_source_pos)
            else:
                raise ValueError("Unknown prior type: %s" % prior_type)
            #the window taken around each element
            begins = tensor.floor(expected_last_source_pos - p['before'])
            ends = tensor.ceil(expected_last_source_pos + p['after'])
            #the global window to optimize computations
            begin = tensor.maximum(0, begins.min()).astype('int64')
            end = tensor.minimum(length, ends.max()).astype('int64')
            #the new mask, already cut to begin:end
            position_in_attended_cut = tensor.arange(
                begin * 1., end * 1., 1., dtype=floatX)[None, :]
            additional_mask = ((position_in_attended_cut > begins[:,None]) *
                               (position_in_attended_cut < ends[:,None]))
        else:
            # Format the message; passing prior_type as a second argument
            # would leave the "%s" placeholder unfilled.
            raise Exception("Unknown prior type: %s" % prior_type)
        begin = tensor.floor(begin).astype('int64')
        end = tensor.ceil(end).astype('int64')
        attended_cut = attended[begin:end]
        # Compare against None explicitly: the truth value of a symbolic
        # variable says nothing about whether the argument was supplied.
        preprocessed_attended_cut = (preprocessed_attended[begin:end]
                                     if preprocessed_attended is not None
                                     else None)
        # Combine whichever masks are available instead of multiplying None.
        if attended_mask is not None:
            attended_mask_cut = attended_mask[begin:end]
            if additional_mask is not None:
                attended_mask_cut = attended_mask_cut * additional_mask.T
        elif additional_mask is not None:
            # NOTE(review): assumes compute_weights accepts the window mask on
            # its own when no attended_mask is given -- confirm.
            attended_mask_cut = additional_mask.T
        else:
            attended_mask_cut = None
        weights_cut = weights[:, begin:end]

        # Call
        energies_cut = self.compute_energies(attended_cut, preprocessed_attended_cut,
                                             weights_cut, states)
        weights_cut = self.compute_weights(energies_cut, attended_mask_cut)
        weighted_averages = self.compute_weighted_averages(weights_cut, attended_cut)

        # Paste
        new_weights = new_energies = tensor.zeros_like(weights.T)
        new_weights = tensor.set_subtensor(new_weights[begin:end],
                                           weights_cut)
        new_energies = tensor.set_subtensor(new_energies[begin:end],
                                            energies_cut)

        return weighted_averages, new_weights.T, new_energies.T, step + 1
  def __init__(self,
               n_out = None,
               n_units = None,
               direction = 1,
               truncation = -1,
               sampling = 1,
               encoder = None,
               unit = 'lstm',
               n_dec = 0,
               attention = "none",
               recurrent_transform = "none",
               recurrent_transform_attribs = "{}",
               attention_template = 128,
               attention_distance = 'l2',
               attention_step = "linear",
               attention_beam = 0,
               attention_norm = "exp",
               attention_momentum = "none",
               attention_sharpening = 1.0,
               attention_nbest = 0,
               attention_store = False,
               attention_smooth = False,
               attention_glimpse = 1,
               attention_filters = 1,
               attention_accumulator = 'sum',
               attention_loss = 0,
               attention_bn = 0,
               attention_lm = 'none',
               attention_ndec = 1,
               attention_memory = 0,
               attention_alnpts = 0,
               attention_epoch  = 1,
               attention_segstep=0.01,
               attention_offset=0.95,
               attention_method="epoch",
               attention_scale=10,
               context=-1,
               base = None,
               aligner = None,
               lm = False,
               force_lm = False,
               droplm = 1.0,
               forward_weights_init=None,
               bias_random_init_forget_shift=0.0,
               copy_weights_from_base=False,
               segment_input=False,
               join_states=False,
               sample_segment=None,
               **kwargs):
    """
    :param n_out: number of cells
    :param n_units: used when initialized via Network.from_hdf_model_topology
    :param direction: process sequence in forward (1) or backward (-1) direction
    :param truncation: gradient truncation
    :param sampling: scan every nth frame only
    :param encoder: list of encoder layers used as initalization for the hidden state
    :param unit: cell type (one of 'lstm', 'vanilla', 'gru', 'sru')
    :param n_dec: absolute number of steps to unfold the network if integer, else relative number of steps from encoder
    :param recurrent_transform: name of recurrent transform
    :param recurrent_transform_attribs: dictionary containing parameters for a recurrent transform
    :param attention_template:
    :param attention_distance:
    :param attention_step:
    :param attention_beam:
    :param attention_norm:
    :param attention_sharpening:
    :param attention_nbest:
    :param attention_store:
    :param attention_align:
    :param attention_glimpse:
    :param attention_lm:
    :param base: list of layers which outputs are considered as based during attention mechanisms
    :param lm: activate RNNLM
    :param force_lm: expect previous labels to be given during testing
    :param droplm: probability to take the expected output as predecessor instead of the real one when LM=true
    :param bias_random_init_forget_shift: initialize forget gate bias of lstm networks with this value
    """
    source_index = None
    if len(kwargs['sources']) == 1 and (kwargs['sources'][0].layer_class.endswith('length') or kwargs['sources'][0].layer_class.startswith('length')):
      kwargs['sources'] = []
      source_index = kwargs['index']
    unit_given = unit
    from Device import is_using_gpu
    if unit == 'lstm':  # auto selection
      if not is_using_gpu():
        unit = 'lstme'
      elif recurrent_transform == 'none' and (not lm or droplm == 0.0):
        unit = 'lstmp'
      else:
        unit = 'lstmc'
    elif unit in ("lstmc", "lstmp") and not is_using_gpu():
      unit = "lstme"
    if segment_input:
      if is_using_gpu():
        unit = "lstmps"
      else:
        unit = "lstms"
    if n_out is None:
      assert encoder
      n_out = sum([enc.attrs['n_out'] for enc in encoder])
    kwargs.setdefault("n_out", n_out)
    if n_units is not None:
      assert n_units == n_out
    self.attention_weight = T.constant(1.,'float32')
    if len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('length'):
      kwargs['sources'] = []
    elif len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('signal'):
      kwargs['sources'] = []
    super(RecurrentUnitLayer, self).__init__(**kwargs)
    self.set_attr('from', ",".join([s.name for s in self.sources]) if self.sources else "null")
    self.set_attr('n_out', n_out)
    self.set_attr('unit', unit_given.encode("utf8"))
    self.set_attr('truncation', truncation)
    self.set_attr('sampling', sampling)
    self.set_attr('direction', direction)
    self.set_attr('lm', lm)
    self.set_attr('force_lm', force_lm)
    self.set_attr('droplm', droplm)
    if bias_random_init_forget_shift:
      self.set_attr("bias_random_init_forget_shift", bias_random_init_forget_shift)
    self.set_attr('attention_beam', attention_beam)
    self.set_attr('recurrent_transform', recurrent_transform.encode("utf8"))
    if isinstance(recurrent_transform_attribs, str):
      recurrent_transform_attribs = json.loads(recurrent_transform_attribs)
    if attention_template is not None:
      self.set_attr('attention_template', attention_template)
    self.set_attr('recurrent_transform_attribs', recurrent_transform_attribs)
    self.set_attr('attention_distance', attention_distance.encode("utf8"))
    self.set_attr('attention_step', attention_step.encode("utf8"))
    self.set_attr('attention_norm', attention_norm.encode("utf8"))
    self.set_attr('attention_sharpening', attention_sharpening)
    self.set_attr('attention_nbest', attention_nbest)
    attention_store = attention_store or attention_smooth or attention_momentum != 'none'
    self.set_attr('attention_store', attention_store)
    self.set_attr('attention_smooth', attention_smooth)
    self.set_attr('attention_momentum', attention_momentum.encode('utf8'))
    self.set_attr('attention_glimpse', attention_glimpse)
    self.set_attr('attention_filters', attention_filters)
    self.set_attr('attention_lm', attention_lm)
    self.set_attr('attention_bn', attention_bn)
    self.set_attr('attention_accumulator', attention_accumulator)
    self.set_attr('attention_ndec', attention_ndec)
    self.set_attr('attention_memory', attention_memory)
    self.set_attr('attention_loss', attention_loss)
    self.set_attr('n_dec', n_dec)
    self.set_attr('segment_input', segment_input)
    self.set_attr('attention_alnpts', attention_alnpts)
    self.set_attr('attention_epoch', attention_epoch)
    self.set_attr('attention_segstep', attention_segstep)
    self.set_attr('attention_offset', attention_offset)
    self.set_attr('attention_method', attention_method)
    self.set_attr('attention_scale', attention_scale)
    if segment_input:
      if not self.eval_flag:
      #if self.eval_flag:
        if isinstance(self.sources[0],RecurrentUnitLayer):
          self.inv_att = self.sources[0].inv_att #NBT
        else:
          if not join_states:
            self.inv_att = self.sources[0].attention #NBT
          else:
            assert hasattr(self.sources[0], "nstates"), "source does not have number of states!"
            ns = self.sources[0].nstates
            self.inv_att = self.sources[0].attention[(ns-1)::ns]
        inv_att = T.roll(self.inv_att.dimshuffle(2, 1, 0),1,axis=0)#TBN
        inv_att = T.set_subtensor(inv_att[0],T.zeros((inv_att.shape[1],inv_att.shape[2])))
        inv_att = T.max(inv_att,axis=-1)
      else:
        inv_att = T.zeros((self.sources[0].output.shape[0],self.sources[0].output.shape[1]))
    if encoder and hasattr(encoder[0],'act'):
      self.set_attr('encoder', ",".join([e.name for e in encoder]))
    if base:
      self.set_attr('base', ",".join([b.name for b in base]))
    else:
      base = encoder
    self.base = base
    self.encoder = encoder
    if aligner:
      self.aligner = aligner
    self.set_attr('n_units', n_out)
    unit = eval(unit.upper())(**self.attrs)
    assert isinstance(unit, Unit)
    self.unit = unit
    kwargs.setdefault("n_out", unit.n_out)
    n_out = unit.n_out
    self.set_attr('n_out', unit.n_out)
    if n_dec < 0:
      source_index = self.index
      n_dec *= -1
    if n_dec != 0:
      self.target_index = self.index
      if isinstance(n_dec,float):
        if not source_index:
          source_index = encoder[0].index if encoder else base[0].index
        lengths = T.cast(T.ceil(T.sum(T.cast(source_index,'float32'),axis=0) * n_dec), 'int32')
        idx, _ = theano.map(lambda l_i, l_m:T.concatenate([T.ones((l_i,),'int8'),T.zeros((l_m-l_i,),'int8')]),
                            [lengths], [T.max(lengths)+1])
        self.index = idx.dimshuffle(1,0)[:-1]
        n_dec = T.cast(T.ceil(T.cast(source_index.shape[0],'float32') * numpy.float32(n_dec)),'int32')
      else:
        if encoder:
          self.index = encoder[0].index
        self.index = T.ones((n_dec,self.index.shape[1]),'int8')
    else:
      n_dec = self.index.shape[0]
    # initialize recurrent weights
    self.W_re = None
    if unit.n_re > 0:
      self.W_re = self.add_param(self.create_recurrent_weights(unit.n_units, unit.n_re, name="W_re_%s" % self.name))
    # initialize forward weights
    bias_init_value = self.create_bias(unit.n_in).get_value()
    if bias_random_init_forget_shift:
      assert unit.n_units * 4 == unit.n_in  # (input gate, forget gate, output gate, net input)
      bias_init_value[unit.n_units:2 * unit.n_units] += bias_random_init_forget_shift
    self.b.set_value(bias_init_value)
    if not forward_weights_init:
      forward_weights_init = "random_uniform(p_add=%i)" % unit.n_re
    else:
      self.set_attr('forward_weights_init', forward_weights_init)
    self.forward_weights_init = forward_weights_init
    self.W_in = []
    sample_mean, gamma = None, None
    if copy_weights_from_base:
      self.params = {}
      #self.W_re = self.add_param(base[0].W_re)
      #self.W_in = [ self.add_param(W) for W in base[0].W_in ]
      #self.b = self.add_param(base[0].b)
      self.W_re = base[0].W_re
      self.W_in = base[0].W_in
      self.b = base[0].b
      if self.attrs.get('batch_norm', False):
        sample_mean = base[0].sample_mean
        gamma = base[0].gamma
      #self.masks = base[0].masks
      #self.mass = base[0].mass
    else:
      for s in self.sources:
        W = self.create_forward_weights(s.attrs['n_out'], unit.n_in, name="W_in_%s_%s" % (s.name, self.name))
        self.W_in.append(self.add_param(W))
    # make input
    z = self.b
    for x_t, m, W in zip(self.sources, self.masks, self.W_in):
      if x_t.attrs['sparse']:
        if x_t.output.ndim == 3: out_dim = x_t.output.shape[2]
        elif x_t.output.ndim == 2: out_dim = 1
        else: assert False, x_t.output.ndim
        if x_t.output.ndim == 3:
          z += W[T.cast(x_t.output[:,:,0], 'int32')]
        elif x_t.output.ndim == 2:
          z += W[T.cast(x_t.output, 'int32')]
        else:
          assert False, x_t.output.ndim
      elif m is None:
        z += T.dot(x_t.output, W)
      else:
        z += self.dot(self.mass * m * x_t.output, W)
    #if self.attrs['batch_norm']:
    #  z = self.batch_norm(z, unit.n_in)
    num_batches = self.index.shape[1]
    self.num_batches = num_batches
    non_sequences = []
    if self.attrs['lm'] or attention_lm != 'none':
      if not 'target' in self.attrs:
        self.attrs['target'] = 'classes'
      if self.attrs['droplm'] > 0.0 or not (self.train_flag or force_lm):
        if copy_weights_from_base:
          self.W_lm_in = base[0].W_lm_in
          self.b_lm_in = base[0].b_lm_in
        else:
          l = sqrt(6.) / sqrt(unit.n_out + self.y_in[self.attrs['target']].n_out)
          values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(unit.n_out, self.y_in[self.attrs['target']].n_out)), dtype=theano.config.floatX)
          self.W_lm_in = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_in_"+self.name))
          self.b_lm_in = self.create_bias(self.y_in[self.attrs['target']].n_out, 'b_lm_in')
      l = sqrt(6.) / sqrt(unit.n_in + self.y_in[self.attrs['target']].n_out)
      values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(self.y_in[self.attrs['target']].n_out, unit.n_in)), dtype=theano.config.floatX)
      if copy_weights_from_base:
        self.W_lm_out = base[0].W_lm_out
      else:
        self.W_lm_out = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_out_"+self.name))
      if self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
        self.lmmask = 1
        #if recurrent_transform != 'none':
        #  recurrent_transform = recurrent_transform[:-3]
      elif self.attrs['droplm'] < 1.0 and (self.train_flag or force_lm):
        from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
        srng = RandomStreams(self.rng.randint(1234) + 1)
        self.lmmask = T.cast(srng.binomial(n=1, p=1.0 - self.attrs['droplm'], size=self.index.shape), theano.config.floatX).dimshuffle(0,1,'x').repeat(unit.n_in,axis=2)
      else:
        self.lmmask = T.zeros_like(self.index, dtype='float32').dimshuffle(0,1,'x').repeat(unit.n_in,axis=2)

    if recurrent_transform == 'input': # attention is just a sequence dependent bias (lstmp compatible)
      src = []
      src_names = []
      n_in = 0
      for e in base:
        #src_base = [ s for s in e.sources if s.name not in src_names ]
        #src_names += [ s.name for s in e.sources ]
        src_base = [ e ]
        src_names += [e.name]
        src += [s.output for s in src_base]
        n_in += sum([s.attrs['n_out'] for s in src_base])
      self.xc = T.concatenate(src, axis=2)
      l = sqrt(6.) / sqrt(self.attrs['n_out'] + n_in)
      values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, 1)), dtype=theano.config.floatX)
      self.W_att_xc = self.add_param(self.shared(value=values, borrow=True, name = "W_att_xc"))
      values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, self.attrs['n_out'] * 4)), dtype=theano.config.floatX)
      self.W_att_in = self.add_param(self.shared(value=values, borrow=True, name = "W_att_in"))
      zz = T.exp(T.tanh(T.dot(self.xc, self.W_att_xc))) # TB1
      self.zc = T.dot(T.sum(self.xc * (zz / T.sum(zz, axis=0, keepdims=True)).repeat(self.xc.shape[2],axis=2), axis=0, keepdims=True), self.W_att_in)
      recurrent_transform = 'none'
    elif recurrent_transform == 'attention_align':
      max_skip = base[0].attrs['max_skip']
      values = numpy.zeros((max_skip,), dtype=theano.config.floatX)
      self.T_b = self.add_param(self.shared(value=values, borrow=True, name="T_b"), name="T_b")
      l = sqrt(6.) / sqrt(self.attrs['n_out'] + max_skip)
      values = numpy.asarray(self.rng.uniform(
        low=-l, high=l, size=(self.attrs['n_out'], max_skip)), dtype=theano.config.floatX)
      self.T_W = self.add_param(self.shared(value=values, borrow=True, name="T_W"), name="T_W")
      y_t = T.dot(self.base[0].attention, T.arange(self.base[0].output.shape[0], dtype='float32'))  # NB
      y_t = T.concatenate([T.zeros_like(y_t[:1]), y_t], axis=0)  # (N+1)B
      y_t = y_t[1:] - y_t[:-1]  # NB
      self.y_t = y_t # T.clip(y_t,numpy.float32(0),numpy.float32(max_skip - 1))

      self.y_t = T.cast(self.base[0].backtrace,'float32')
    elif recurrent_transform == 'attention_segment':
      assert aligner.attention, "Segment-wise attention requires attention points!"

    recurrent_transform_inst = RecurrentTransform.transform_classes[recurrent_transform](layer=self)
    assert isinstance(recurrent_transform_inst, RecurrentTransform.RecurrentTransformBase)
    unit.recurrent_transform = recurrent_transform_inst
    self.recurrent_transform = recurrent_transform_inst
    # scan over sequence
    for s in range(self.attrs['sampling']):
      index = self.index[s::self.attrs['sampling']]

      if context > 0:
        from TheanoUtil import context_batched
        n_batches = z.shape[1]
        time, batch, dim = z.shape[0], z.shape[1], z.shape[2]
        #z = context_batched(z[::direction or 1], window=context)[::direction or 1] # TB(CD)

        from theano.ifelse import ifelse
        def context_window(idx, x_in, i_in):
          x_out = x_in[idx:idx + context]
          x_out = x_out.dimshuffle('x',1,0,2).reshape((1, batch, dim * context))
          i_out = i_in[idx:idx+1].repeat(context, axis=0)
          i_out = ifelse(T.lt(idx,context),T.set_subtensor(i_out[:context - idx],numpy.int8(0)),i_out).reshape((1, batch * context))
          return x_out, i_out

        z = z[::direction or 1]
        i = index[::direction or 1]
        out, _ = theano.map(context_window, sequences = [T.arange(z.shape[0])], non_sequences = [T.concatenate([T.zeros((context - 1,z.shape[1],z.shape[2]),dtype='float32'),z],axis=0), i])
        z = out[0][::direction or 1]
        i = out[1][::direction or 1] # T(BC)
        direction = 1
        z = z.reshape((time * batch, context * dim)) # (TB)(CD)
        z = z.reshape((time * batch, context, dim)).dimshuffle(1,0,2) # C(TB)D
        i = i.reshape((time, context, batch)).dimshuffle(1,0,2).reshape((context, time * batch))
        index = i
        num_batches = time * batch

      sequences = z
      sources = self.sources
      if encoder:
        if recurrent_transform == "attention_segment":
          if hasattr(encoder[0],'act'):
            outputs_info = [T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act)]
          else:
           # outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ]
            outputs_info[0] = self.aligner.output[-1]
        elif hasattr(encoder[0],'act'):
          outputs_info = [ T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act) ]
        else:
          outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ]
        sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))
      else:
        outputs_info = [ T.alloc(numpy.cast[theano.config.floatX](0), num_batches, unit.n_units) for a in range(unit.n_act) ]

      if self.attrs['lm'] and self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
        if self.network.y[self.attrs['target']].ndim == 3:
          sequences += T.dot(self.network.y[self.attrs['target']],self.W_lm_out)
        else:
          y = self.y_in[self.attrs['target']].flatten()
          sequences += self.W_lm_out[y].reshape((index.shape[0],index.shape[1],unit.n_in))

      if sequences == self.b:
        sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))

      if unit.recurrent_transform:
        outputs_info += unit.recurrent_transform.get_sorted_state_vars_initial()

      index_f = T.cast(index, theano.config.floatX)
      unit.set_parent(self)

      if segment_input:
        outputs = unit.scan_seg(x=sources,
                                z=sequences[s::self.attrs['sampling']],
                                att = inv_att,
                                non_sequences=non_sequences,
                                i=index_f,
                                outputs_info=outputs_info,
                                W_re=self.W_re,
                                W_in=self.W_in,
                                b=self.b,
                                go_backwards=direction == -1,
                                truncate_gradient=self.attrs['truncation'])
      else:
        outputs = unit.scan(x=sources,
                            z=sequences[s::self.attrs['sampling']],
                            non_sequences=non_sequences,
                            i=index_f,
                            outputs_info=outputs_info,
                            W_re=self.W_re,
                            W_in=self.W_in,
                            b=self.b,
                            go_backwards=direction == -1,
                            truncate_gradient=self.attrs['truncation'])

      if not isinstance(outputs, list):
        outputs = [outputs]
      if outputs:
        outputs[0].name = "%s.act[0]" % self.name
        if context > 0:
          for i in range(len(outputs)):
            outputs[i] = outputs[i][-1].reshape((outputs[i].shape[1]//n_batches,n_batches,outputs[i].shape[2]))

      if unit.recurrent_transform:
        unit.recurrent_transform_state_var_seqs = outputs[-len(unit.recurrent_transform.state_vars):]

      if self.attrs['sampling'] > 1:
        if s == 0:
          self.act = [ T.alloc(numpy.cast['float32'](0), self.index.shape[0], self.index.shape[1], n_out) for act in outputs ]
        self.act = [ T.set_subtensor(tot[s::self.attrs['sampling']], act) for tot,act in zip(self.act, outputs) ]
      else:
        self.act = outputs[:unit.n_act]
        if len(outputs) > unit.n_act:
          self.aux = outputs[unit.n_act:]
    if self.attrs['attention_store']:
      self.attention = [ self.aux[i].dimshuffle(0,2,1) for i,v in enumerate(sorted(unit.recurrent_transform.state_vars.keys())) if v.startswith('att_') ] # NBT
      for i in range(len(self.attention)):
        vec = T.eye(self.attention[i].shape[2], 1, -direction * (self.attention[i].shape[2] - 1))
        last = vec.dimshuffle(1, 'x', 0).repeat(self.index.shape[1], axis=1)
        self.attention[i] = T.concatenate([self.attention[i][1:],last],axis=0)[::direction]

    self.cost_val = numpy.float32(0)
    if recurrent_transform == 'attention_align':
      back = T.ceil(self.aux[sorted(unit.recurrent_transform.state_vars.keys()).index('t')])
      def make_output(base, yout, trace, length):
        # Step function for theano.map, called once per element of the mapped sequences.
        # Gathers entries of `base` and `yout` along the first `length` entries of
        # `trace` (reversed) and zero-pads both results to self.index.shape[0] + 1 rows.
        length = T.cast(length, 'int32')
        # First `length` trace entries, reversed, used as integer row indices.
        idx = T.cast(trace[:length][::-1],'int32')
        # Rows of base selected by idx, followed by zero rows up to the padded length.
        x_out = T.concatenate([base[idx],T.zeros((self.index.shape[0] + 1 - length, base.shape[1]), 'float32')],axis=0)
        # Element yout[idx[j], j] for each j < length, followed by zeros up to the padded length.
        y_out = T.concatenate([yout[idx,T.arange(length)],T.zeros((self.index.shape[0] + 1 - length, ), 'float32')],axis=0)
        return x_out, y_out

      output, _ = theano.map(make_output,
                             sequences = [base[0].output.dimshuffle(1,0,2),
                                          self.y_t.dimshuffle(1,2,0),
                                          back.dimshuffle(1,0),
                                          T.sum(self.index,axis=0,dtype='float32')])
      self.attrs['n_out'] = base[0].attrs['n_out']
      self.params.update(unit.params)
      self.output = output[0].dimshuffle(1,0,2)[:-1]

      z = T.dot(self.act[0], self.T_W)[:-1] + self.T_b
      z = z.reshape((z.shape[0] * z.shape[1], z.shape[2]))
      idx = (self.index[1:].flatten() > 0).nonzero()
      idy = (self.index[1:][::-1].flatten() > 0).nonzero()
      y_out = T.cast(output[1],'int32').dimshuffle(1, 0)[:-1].flatten()
      nll, _ = T.nnet.crossentropy_softmax_1hot(x=z[idx], y_idx=y_out[idy])
      self.cost_val = T.sum(nll)
      recog = T.argmax(z[idx], axis=1)
      real = y_out[idy]
      self.errors = lambda: T.sum(T.neq(recog, real))

      return

      back += T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
      idx = (self.index[:-1].flatten() > 0).nonzero()
      idx = T.cast(back[::-1].flatten()[idx],'int32')
      x_out = base[0].output
      #x_out = x_out.dimshuffle(1,0,2).reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
      #x_out = x_out.reshape((self.index.shape[1], self.index.shape[0] - 1, x_out.shape[1])).dimshuffle(1,0,2)
      x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
      x_out = x_out.reshape((self.index.shape[0] - 1, self.index.shape[1], x_out.shape[1]))
      self.output = T.concatenate([x_out, base[0].output[1:]],axis=0)
      self.attrs['n_out'] = base[0].attrs['n_out']
      self.params.update(unit.params)
      return


      skips = T.dot(T.nnet.softmax(z), T.arange(z.shape[1], dtype='float32')).reshape(self.index[1:].shape)
      shift = T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
      skips = T.concatenate([T.zeros_like(self.y_t[:1]),self.y_t[:-1]],axis=0)
      idx = shift + T.cumsum(skips, axis=0)
      idx = T.cast(idx[:-1].flatten(),'int32')
      #idx = (idx.flatten() > 0).nonzero()
      #idx = base[0].attention.flatten()
      x_out = base[0].output[::-1]
      x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
      x_out = x_out.reshape((self.index.shape[0], self.index.shape[1], x_out.shape[1]))
      self.output = T.concatenate([base[0].output[-1:], x_out], axis=0)[::-1]
      self.attrs['n_out'] = base[0].attrs['n_out']
      self.params.update(unit.params)
      return

    if recurrent_transform == 'batch_norm':
      self.params['sample_mean_batch_norm'].custom_update = T.dot(T.mean(self.act[0],axis=[0,1]),self.W_re)
      self.params['sample_mean_batch_norm'].custom_update_normalized = True

    self.make_output(self.act[0][::direction or 1], sample_mean=sample_mean, gamma=gamma)
    self.params.update(unit.params)
Exemple #56
0
import theano.tensor as T
from theano.tensor import shared_randomstreams
import numpy as np
import numpy.random
from scipy.special import gammaincinv
from numpy.linalg import norm

# tensor stand-in for np.random.RandomState
rngT = shared_randomstreams.RandomStreams()
# plain numpy generator used for the parameter matrices below
rng = numpy.random.RandomState()

# {{{ Fastfood Params }}}
# n and d are symbolic Theano double scalars, not concrete Python numbers.
n, d = T.dscalars('n', 'd')
# transform dimensions to be a power of 2
d0, n0 = d, n  # keep references to the original (un-padded) dimensions
l = T.ceil(T.log2(d))  # TODO cast to int
d = 2**l  # d rounded up to a power of two
k = T.ceil(n/d)  # TODO cast to int
n = d*k  # n rounded up to a multiple of the padded d
# generate parameter 'matrices'
# NOTE(review): k and d are still symbolic expressions at this point, while the
# numpy calls and xrange below presumably need concrete integers -- confirm how
# this script is meant to be evaluated.
B = rng.choice([-1, 1], size=(k, d))
# NOTE(review): RandomState.normal is documented as normal(loc, scale, size);
# the dtype keyword looks unsupported -- confirm.
G = rng.normal(size=(k, d), dtype=np.float64)
PI = np.array([rng.permutation(d) for _ in xrange(k)]).T
S = np.empty((k*d, 1), dtype=np.float64)
# generate scaling matrix, S
# NOTE(review): within the visible code Tmp is recomputed on every iteration
# and never written into S, so S stays uninitialised -- looks unfinished.
for i in xrange(k):
    for j in xrange(d):
        p1 = rng.uniform(size=d)
        p2 = d/2
        Tmp = gammaincinv(p2, p1)
        Tmp = T.sqrt(2*Tmp)
def experiment(state, outdir_base='./'):
    rng.seed(1)  # seed the numpy random generator
    # Initialize output directory and files
    data.mkdir_p(outdir_base)
    outdir = outdir_base + "/" + state.dataset + "/"
    data.mkdir_p(outdir)
    logfile = outdir + "log.txt"
    with open(logfile, 'w') as f:
        f.write("MODEL 2, {0!s}\n\n".format(state.dataset))
    train_convergence_pre = outdir + "train_convergence_pre.csv"
    train_convergence_post = outdir + "train_convergence_post.csv"
    valid_convergence_pre = outdir + "valid_convergence_pre.csv"
    valid_convergence_post = outdir + "valid_convergence_post.csv"
    test_convergence_pre = outdir + "test_convergence_pre.csv"
    test_convergence_post = outdir + "test_convergence_post.csv"

    print
    print
    "----------MODEL 2, {0!s}--------------".format(state.dataset)
    print

    # load parameters from config file if this is a test
    config_filename = outdir + 'config'
    if state.test_model and 'config' in os.listdir(outdir):
        config_vals = load_from_config(config_filename)
        for CV in config_vals:
            print
            CV
            if CV.startswith('test'):
                print
                'Do not override testing switch'
                continue
            try:
                exec('state.' + CV) in globals(), locals()
            except:
                exec('state.' + CV.split('=')[0] + "='" + CV.split('=')[1] + "'") in globals(), locals()
    else:
        # Save the current configuration
        # Useful for logs/experiments
        print
        'Saving config'
        with open(config_filename, 'w') as f:
            f.write(str(state))

    print
    state
    # Load the data, train = train+valid, and sequence
    artificial = False
    if state.dataset == 'MNIST_1' or state.dataset == 'MNIST_2' or state.dataset == 'MNIST_3':
        (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = data.load_mnist(state.data_path)
        train_X = numpy.concatenate((train_X, valid_X))
        train_Y = numpy.concatenate((train_Y, valid_Y))
        artificial = True
        try:
            dataset = int(state.dataset.split('_')[1])
        except:
            raise AssertionError("artificial dataset number not recognized. Input was " + state.dataset)
    else:
        raise AssertionError("dataset not recognized.")

    train_X = theano.shared(train_X)
    train_Y = theano.shared(train_Y)
    valid_X = theano.shared(valid_X)
    valid_Y = theano.shared(valid_Y)
    test_X = theano.shared(test_X)
    test_Y = theano.shared(test_Y)

    if artificial:
        print
        'Sequencing MNIST data...'
        print
        'train set size:', len(train_Y.eval())
        print
        'valid set size:', len(valid_Y.eval())
        print
        'test set size:', len(test_Y.eval())
        data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y, test_X, test_Y, dataset, rng)
        print
        'train set size:', len(train_Y.eval())
        print
        'valid set size:', len(valid_Y.eval())
        print
        'test set size:', len(test_Y.eval())
        print
        'Sequencing done.'
        print

    N_input = train_X.eval().shape[1]
    root_N_input = numpy.sqrt(N_input)

    # Network and training specifications
    layers = state.layers  # number hidden layers
    walkbacks = state.walkbacks  # number of walkbacks
    layer_sizes = [N_input] + [state.hidden_size] * layers  # layer sizes, from h0 to hK (h0 is the visible layer)
    learning_rate = theano.shared(cast32(state.learning_rate))  # learning rate
    annealing = cast32(state.annealing)  # exponential annealing coefficient
    momentum = theano.shared(cast32(state.momentum))  # momentum term

    # PARAMETERS : weights list and bias list.
    # initialize a list of weights and biases based on layer_sizes
    weights_list = [get_shared_weights(layer_sizes[i], layer_sizes[i + 1], name="W_{0!s}_{1!s}".format(i, i + 1)) for i
                    in range(layers)]  # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out))
    recurrent_weights_list = [
        get_shared_weights(layer_sizes[i + 1], layer_sizes[i], name="V_{0!s}_{1!s}".format(i + 1, i)) for i in
        range(layers)]  # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out))
    bias_list = [get_shared_bias(layer_sizes[i], name='b_' + str(i)) for i in
                 range(layers + 1)]  # initialize each layer to 0's.

    # Theano variables and RNG
    MRG = RNG_MRG.MRG_RandomStreams(1)
    X = T.fmatrix('X')
    Xs = [T.fmatrix(name="X_initial") if i == 0 else T.fmatrix(name="X_" + str(i + 1)) for i in range(walkbacks + 1)]
    hiddens_input = [X] + [T.fmatrix(name="h_" + str(i + 1)) for i in range(layers)]
    hiddens_output = hiddens_input[:1] + hiddens_input[1:]

    # Check variables for bad inputs and stuff
    if state.batch_size > len(Xs):
        warnings.warn(
            "Batch size should not be bigger than walkbacks+1 (len(Xs)) unless you know what you're doing. You need to know the sequence length beforehand.")
    if state.batch_size <= 0:
        raise AssertionError("batch size cannot be <= 0")

    ''' F PROP '''
    if state.hidden_act == 'sigmoid':
        print
        'Using sigmoid activation for hiddens'
        hidden_activation = T.nnet.sigmoid
    elif state.hidden_act == 'rectifier':
        print
        'Using rectifier activation for hiddens'
        hidden_activation = lambda x: T.maximum(cast32(0), x)
    elif state.hidden_act == 'tanh':
        print
        'Using hyperbolic tangent activation for hiddens'
        hidden_activation = lambda x: T.tanh(x)
    else:
        raise AssertionError("Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid".format(
            state.hidden_act))

    if state.visible_act == 'sigmoid':
        print
        'Using sigmoid activation for visible layer'
        visible_activation = T.nnet.sigmoid
    elif state.visible_act == 'softmax':
        print
        'Using softmax activation for visible layer'
        visible_activation = T.nnet.softmax
    else:
        raise AssertionError(
            "Did not recognize visible activation {0!s}, please use sigmoid or softmax".format(state.visible_act))

    def update_layers(hiddens, p_X_chain, Xs, sequence_idx, noisy=True, sampling=True):
        """Run one full update on `hiddens` in place: odd layers first, then even layers.

        hiddens      : list of layer values; modified in place by the layer updates.
        p_X_chain    : list forwarded to the even-layer update, which appends to it.
        Xs           : list of input variables forwarded to the even-layer update.
        sequence_idx : index of the current update step within the chain.
        noisy        : forwarded as the noise switch of both layer updates.
        sampling     : forwarded as the input-sampling switch of the even-layer update.

        Returns `hiddens` when state.batch_size <= len(Xs) and this is step
        state.batch_size - 1; returns None for every other step.
        """
        print('odd layer updates')
        update_odd_layers(hiddens, noisy)
        print('even layer updates')
        update_even_layers(hiddens, p_X_chain, Xs, sequence_idx, noisy, sampling)
        # The completion message used to sit after the return statements, where
        # it could never execute; emit it before returning instead.
        print('done full update.')
        print('')
        # choose the correct output for hidden_outputs based on batch_size and walkbacks
        # (this is due to an issue with batches, see note in run_story2.py)
        if state.batch_size <= len(Xs) and sequence_idx == state.batch_size - 1:
            return hiddens
        return None

    # Odd layer update function
    # just a loop over the odd layers
    def update_odd_layers(hiddens, noisy):
        """Update every odd-indexed layer of `hiddens` in place.

        hiddens : list of layer values; entries 1, 3, 5, ... are recomputed.
        noisy   : forwarded as `add_noise` to simple_update_layer.
        """
        for i in range(1, len(hiddens), 2):
            # Format into one string so the message is printed the same way
            # by the print statement and the print function.
            print('updating layer {0!s}'.format(i))
            simple_update_layer(hiddens, None, None, None, i, add_noise=noisy)

    # Even layer update
    # p_X_chain is given to append the p(X|...) at each full update (one update = odd update + even update)
    def update_even_layers(hiddens, p_X_chain, Xs, sequence_idx, noisy, sampling):
        """Update every even-indexed layer of `hiddens` in place.

        hiddens      : list of layer values; entries 0, 2, 4, ... are recomputed.
        p_X_chain    : list forwarded to simple_update_layer (appended to for layer 0).
        Xs           : list of input variables forwarded to simple_update_layer.
        sequence_idx : index of the current update step, forwarded unchanged.
        noisy        : forwarded as `add_noise`.
        sampling     : forwarded as `input_sampling`.
        """
        for i in range(0, len(hiddens), 2):
            # Format into one string so the message is printed the same way
            # by the print statement and the print function.
            print('updating layer {0!s}'.format(i))
            simple_update_layer(hiddens, p_X_chain, Xs, sequence_idx, i, add_noise=noisy, input_sampling=sampling)

    # The layer update function
    # hiddens   :   list containing the symbolic theano variables [visible, hidden1, hidden2, ...]
    #               layer_update will modify this list inplace
    # p_X_chain :   list containing the successive p(X|...) at each update
    #               update_layer will append to this list
    # add_noise     : pre and post activation gaussian noise

    def simple_update_layer(hiddens, p_X_chain, Xs, sequence_idx, i, add_noise=True, input_sampling=True):
        """Recompute layer `i` of `hiddens` in place from its neighbouring layers.

        hiddens        : list of symbolic layer values [visible, hidden1, ...];
                         entry `i` is overwritten.
        p_X_chain      : list that receives the visible activation when i == 0
                         (not touched for other layers).
        Xs             : list of input variables; only read when i == 0.
        sequence_idx   : position of the current step in Xs; only read when i == 0.
        i              : index of the layer to update.
        add_noise      : add pre-activation gaussian noise (hidden layers only).
        input_sampling : when i == 0, binomially sample the next input before
                         corrupting it; otherwise use it as is.
        """
        # Pre-activation: combine the neighbouring layers through their weights.
        if i == 0:
            # visible layer X: driven by the layer above through the recurrent weights
            print('using {0!s}'.format(recurrent_weights_list[i]))
            hiddens[i] = T.dot(hiddens[i + 1], recurrent_weights_list[i]) + bias_list[i]
        elif i == len(hiddens) - 1:
            # top layer: driven only by the layer below
            print('using {0!s}'.format(weights_list[i - 1]))
            hiddens[i] = T.dot(hiddens[i - 1], weights_list[i - 1]) + bias_list[i]
        else:
            # in-between layer: next layer via recurrent weights, previous layer via W_(i-1)
            print("using {0!s} and {1!s}".format(weights_list[i - 1], recurrent_weights_list[i]))
            hiddens[i] = (T.dot(hiddens[i + 1], recurrent_weights_list[i])
                          + T.dot(hiddens[i - 1], weights_list[i - 1])
                          + bias_list[i])

        # The first hidden layer can be exempted from noise by configuration.
        if i == 1 and state.noiseless_h1:
            print('>>NO noise in first hidden layer')
            add_noise = False

        # pre-activation noise (never applied to the visible layer)
        if i != 0 and add_noise:
            print('Adding pre-activation gaussian noise for layer {0!s}'.format(i))
            hiddens[i] = add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma)

        # Activation. The log lines report the configured activation names; the
        # hidden one previously read `state.act`, while the activation itself is
        # selected from `state.hidden_act`.
        if i == 0:
            print('{0!s} units activation for visible layer X'.format(state.visible_act))
            hiddens[i] = visible_activation(hiddens[i])
        else:
            print('Hidden units {0!s} activation for layer {1!s}'.format(state.hidden_act, i))
            hiddens[i] = hidden_activation(hiddens[i])
        # No post-activation noise: the pre-activation noise already perturbs
        # every hidden layer, adding more would only double it.

        # build the reconstruction chain if updating the visible layer X
        if i == 0:
            # what the predicted next input should be
            p_X_chain.append(hiddens[i])

            if sequence_idx + 1 < len(Xs):
                next_input = Xs[sequence_idx + 1]
                # Sampling must match the input type: binomial for binary MNIST;
                # real-valued inputs would need a gaussian instead.
                if input_sampling:
                    print('Sampling from input')
                    sampled = MRG.binomial(p=next_input, size=next_input.shape, dtype='float32')
                else:
                    print('>>NO input sampling')
                    sampled = next_input
                # corrupt the next input and install it as the new visible layer
                sampled = salt_and_pepper(sampled, state.input_salt_and_pepper)
                hiddens[i] = sampled

    def build_graph(hiddens, Xs, noisy=True, sampling=True):
        """Chain `walkbacks` full layer updates and collect their results.

        hiddens  : list of layer values, updated in place by every step.
        Xs       : list of input variables forwarded to update_layers.
        noisy    : forwarded to update_layers.
        sampling : forwarded to update_layers.

        Returns (predicted_X_chain, H_chain): the visible layers appended by the
        updates, and the per-step return values of update_layers (each either
        None or the hiddens list).
        """
        predicted_X_chain = []  # the visible layer generated at each update_layers run
        H_chain = []  # used later to pick the correct hiddens_output
        print("Building the graph : {0!s} updates".format(walkbacks))
        for i in range(walkbacks):
            print("Forward Prediction {!s}/{!s}".format(i + 1, walkbacks))
            H_chain.append(update_layers(hiddens, predicted_X_chain, Xs, i, noisy, sampling))
        return predicted_X_chain, H_chain

    '''Build the main training graph'''
    # corrupt x
    hiddens_output[0] = salt_and_pepper(hiddens_output[0], state.input_salt_and_pepper)
    # build the computation graph and the generated visible layers and appropriate hidden_output
    predicted_X_chain, H_chain = build_graph(hiddens_output, Xs, noisy=True, sampling=state.input_sampling)
    #     predicted_X_chain, H_chain = build_graph(hiddens_output, Xs, noisy=False, sampling=state.input_sampling) #testing one-hot without noise


    # choose the correct output for hiddens_output (this is due to the issue with batches - see note in run_story2.py)
    # this finds the not-None element of H_chain and uses that for hiddens_output
    h_empty = [True if h is None else False for h in H_chain]
    if False in h_empty:  # if there was a not-None element
        hiddens_output = H_chain[h_empty.index(False)]  # set hiddens_output to the appropriate element from H_chain

    ######################
    # COST AND GRADIENTS #
    ######################
    print
    if state.cost_funct == 'binary_crossentropy':
        print
        'Using binary cross-entropy cost!'
        cost_function = lambda x, y: T.mean(T.nnet.binary_crossentropy(x, y))
    elif state.cost_funct == 'square':
        print
        "Using square error cost!"
        cost_function = lambda x, y: T.mean(T.sqr(x - y))
    else:
        raise AssertionError(
            "Did not recognize cost function {0!s}, please use binary_crossentropy or square".format(state.cost_funct))
    print
    'Cost w.r.t p(X|...) at every step in the graph'

    costs = [cost_function(predicted_X_chain[i], Xs[i + 1]) for i in range(len(predicted_X_chain))]
    # outputs for the functions
    show_COSTs = [costs[0]] + [costs[-1]]

    # cost for the gradient
    # care more about the immediate next predictions rather than the future - use exponential decay
    #     COST = T.sum(costs)
    COST = T.sum([T.exp(-i / T.ceil(walkbacks / 3)) * costs[i] for i in range(len(costs))])

    params = weights_list + recurrent_weights_list + bias_list
    print
    "params:", params

    print
    "creating functions..."
    gradient = T.grad(COST, params)

    gradient_buffer = [theano.shared(numpy.zeros(param.get_value().shape, dtype='float32')) for param in params]

    m_gradient = [momentum * gb + (cast32(1) - momentum) * g for (gb, g) in zip(gradient_buffer, gradient)]
    param_updates = [(param, param - learning_rate * mg) for (param, mg) in zip(params, m_gradient)]
    gradient_buffer_updates = zip(gradient_buffer, m_gradient)

    updates = OrderedDict(param_updates + gradient_buffer_updates)

    # odd layer h's not used from input -> calculated directly from even layers (starting with h_0) since the odd layers are updated first.
    f_cost = theano.function(inputs=hiddens_input + Xs,
                             outputs=hiddens_output + show_COSTs,
                             on_unused_input='warn')

    f_learn = theano.function(inputs=hiddens_input + Xs,
                              updates=updates,
                              outputs=hiddens_output + show_COSTs,
                              on_unused_input='warn')

    print
    "functions done."
    print

    #############
    # Denoise some numbers  :   show number, noisy number, reconstructed number
    #############
    import random as R
    R.seed(1)
    # a function to add salt and pepper noise
    f_noise = theano.function(inputs=[X], outputs=salt_and_pepper(X, state.input_salt_and_pepper))

    # Recompile the graph without noise for reconstruction function - the input x_recon is already going to be noisy, and this is to test on a simulated 'real' input.
    X_recon = T.fvector("X_recon")
    Xs_recon = [T.fvector("Xs_recon")]
    hiddens_R_input = [X_recon] + [T.fvector(name="h_recon_" + str(i + 1)) for i in range(layers)]
    hiddens_R_output = hiddens_R_input[:1] + hiddens_R_input[1:]

    # The layer update scheme
    print
    "Creating graph for noisy reconstruction function at checkpoints during training."
    p_X_chain_R, H_chain_R = build_graph(hiddens_R_output, Xs_recon, noisy=False)

    # choose the correct output from H_chain for hidden_outputs based on batch_size and walkbacks
    # choose the correct output for hiddens_output
    h_empty = [True if h is None else False for h in H_chain_R]
    if False in h_empty:  # if there was a set of hiddens output from the batch_size-1 element of the chain
        hiddens_R_output = H_chain_R[
            h_empty.index(False)]  # extract out the not-None element from the list if it exists
    #     if state.batch_size <= len(Xs_recon):
    #         for i in range(len(hiddens_R_output)):
    #             hiddens_R_output[i] = H_chain_R[state.batch_size - 1][i]

    f_recon = theano.function(inputs=hiddens_R_input + Xs_recon,
                              outputs=hiddens_R_output + [p_X_chain_R[0], p_X_chain_R[-1]],
                              on_unused_input="warn")

    ############
    # Sampling #
    ############

    # the input to the sampling function
    X_sample = T.fmatrix("X_sampling")
    network_state_input = [X_sample] + [T.fmatrix("H_sampling_" + str(i + 1)) for i in range(layers)]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates

    network_state_output = [X_sample] + network_state_input[1:]

    visible_pX_chain = []

    # ONE update
    print
    "Performing one walkback in network state sampling."
    _ = update_layers(network_state_output, visible_pX_chain, [X_sample], 0, noisy=True)

    if layers == 1:
        f_sample_simple = theano.function(inputs=[X_sample], outputs=visible_pX_chain[-1])

    # WHY IS THERE A WARNING????
    # because the first odd layers are not used -> directly computed FROM THE EVEN layers
    # unused input = warn
    f_sample2 = theano.function(inputs=network_state_input, outputs=network_state_output + visible_pX_chain,
                                on_unused_input='warn')

    def sample_some_numbers_single_layer(N=400):
        """Generate a chain of N samples with the single-layer sampling function.

        N : total number of rows to return, including the initial test example
            (default 400, the previously hard-coded chain length).

        Starts from the first test example, then repeatedly applies
        f_sample_simple, binarises the result with a binomial draw and
        re-corrupts it with f_noise. Returns all samples stacked vertically.
        """
        x0 = test_X.get_value()[:1]
        samples = [x0]
        x = f_noise(x0)
        for _ in range(N - 1):
            x = f_sample_simple(x)
            samples.append(x)
            # binarise the output, then corrupt it again for the next step
            x = numpy.random.binomial(n=1, p=x, size=x.shape).astype('float32')
            x = f_noise(x)
        return numpy.vstack(samples)

    def sampling_wrapper(NSI):
        """Run one sampling step and split its outputs.

        NSI : list of network-state inputs, unpacked as the positional
              arguments of f_sample2.

        Returns (NSO, vis_pX_chain): the first len(network_state_output)
        outputs, and the remaining outputs.
        """
        n_state = len(network_state_output)
        results = f_sample2(*NSI)
        return results[:n_state], results[n_state:]

    def sample_some_numbers(N=400):
        """Generate a sampling chain starting from the first test example.

        N : the loop runs N - 1 sampling steps after the initial example.

        Returns (visible_chain, noisy_h0_chain), both stacked vertically: the
        initial example followed by the visible outputs of every step, and the
        noisy visible-layer state after every step.
        """
        # The network's initial state: noisy visible layer, zeroed hidden layers.
        init_vis = test_X.get_value()[:1]
        noisy_init_vis = f_noise(init_vis)
        current_state = [noisy_init_vis] + [numpy.zeros((1, len(b.get_value())), dtype='float32')
                                            for b in bias_list[1:]]

        visible_chain = [init_vis]
        noisy_h0_chain = [noisy_init_vis]

        # Only the latest state feeds the next step, so keep just that one
        # instead of accumulating every intermediate state.
        for _ in range(N - 1):
            current_state, vis_pX_chain = sampling_wrapper(current_state)
            visible_chain += vis_pX_chain
            noisy_h0_chain.append(current_state[0])

        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)

    def plot_samples(epoch_number, iteration):
        """Sample a chain of digits and save it as a tiled PNG in `outdir`.

        epoch_number : used only in the output file name.
        iteration    : used only in the output file name.
        """
        to_sample = time.time()
        if layers == 1:
            # one layer model
            V = sample_some_numbers_single_layer()
        else:
            V, _ = sample_some_numbers()
        # root_N_input comes from numpy.sqrt and is a float; hand the tiler
        # integer image dimensions.
        side = int(round(root_N_input))
        img_samples = PIL.Image.fromarray(tile_raster_images(V, (side, side), (20, 20)))

        fname = outdir + 'samples_iteration_' + str(iteration) + '_epoch_' + str(epoch_number) + '.png'
        img_samples.save(fname)
        print('Took ' + str(time.time() - to_sample) + ' to sample 400 numbers')

    ##############
    # Inpainting #
    ##############
    def inpainting(digit):
        """Run a 49-step sampling chain with part of the visible layer clamped to `digit`.

        digit : array whose first row supplies the clamped pixel values.

        The visible layer starts as uniform noise; after every sampling step the
        pixels selected by `fixed_idx` are reset to the values of `digit`.
        Returns (visible_chain, noisy_h0_chain), both stacked vertically.
        """
        # NOISE INIT
        init_vis = cast32(numpy.random.uniform(size=digit.shape))

        # Pixels whose column index exceeds half the image width stay fixed.
        # NOTE(review): the column exactly at root_N_input / 2 is not selected
        # by the strict comparison and is left to the sampler -- confirm
        # whether `>=` was intended.
        fixed_idx = (numpy.arange(N_input) % root_N_input > (root_N_input / 2))

        # FUNCTION TO RESET HALF VISIBLE TO DIGIT (modifies V in place)
        def reset_vis(V):
            V[0][fixed_idx] = digit[0][fixed_idx]
            return V

        # INIT DIGIT : NOISE and RESET HALF TO DIGIT
        init_vis = reset_vis(init_vis)

        current_state = [init_vis] + [numpy.zeros((1, len(b.get_value())), dtype='float32')
                                      for b in bias_list[1:]]
        visible_chain = [init_vis]
        noisy_h0_chain = [init_vis]

        # Only the latest state feeds the next step, so keep just that one
        # instead of accumulating every intermediate state.
        for _ in range(49):
            current_state, vis_pX_chain = sampling_wrapper(current_state)

            # reset half the digit
            current_state[0] = reset_vis(current_state[0])
            vis_pX_chain[0] = reset_vis(vis_pX_chain[0])

            visible_chain += vis_pX_chain
            noisy_h0_chain.append(current_state[0])

        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)

    def save_params_to_file(name, n, params, iteration):
        """Pickle `params` to a file under `outdir`.

        name      : prefix of the file name.
        n         : epoch number, used in the file name.
        params    : object to pickle.
        iteration : iteration number, used in the file name.
        """
        print('saving parameters...')
        save_path = outdir + name + '_params_iteration_' + str(iteration) + '_epoch_' + str(n) + '.pkl'
        # the context manager closes the file even if pickling fails
        with open(save_path, 'wb') as f:
            cPickle.dump(params, f, protocol=cPickle.HIGHEST_PROTOCOL)

            ################

    # GSN TRAINING #
    ################
    def train_recurrent_GSN(iteration, train_X, train_Y, valid_X, valid_Y, test_X, test_Y):
        """Train the recurrent GSN for one outer iteration, then save generated samples.

        Each epoch reshuffles the data through ``data.sequence_mnist_data``, runs
        ``f_learn`` over the training inputs and ``f_cost`` over the validation
        and test inputs, reports the mean costs (console, ``logfile`` and the
        per-split convergence files), applies early stopping on the *training*
        cost and anneals the learning rate.  Every ``state.save_frequency``
        epochs the reconstructions of the first 100 test rows are logged.  When
        ``state.test_model`` is set the training loop is skipped.  In every case
        the function ends by drawing samples with ``sample_some_numbers(N=10000)``
        and saving them to ``outdir + 'samples.npy'``.

        Args:
            iteration: Index of the outer iteration; the learning rate is reset
                to ``state.learning_rate`` only when this is 0.
            train_X, valid_X, test_X: Input containers read through
                ``get_value()`` and ``.shape.eval()`` (presumably Theano shared
                variables -- confirm against the caller).
            train_Y, valid_Y, test_Y: Only forwarded to
                ``data.sequence_mnist_data``.
        """
        import sys  # function-scope import: needed for newline-free console output

        def _echo(*parts, **kwargs):
            """Write ``parts`` space-separated to stdout, followed by ``end`` (default newline)."""
            end = kwargs.get('end', '\n')
            sys.stdout.write(' '.join(str(part) for part in parts) + end)

        def _append_to(path, text):
            """Append ``text`` to the file at ``path``."""
            with open(path, 'a') as handle:
                handle.write(text)

        def _leading_hiddens(outs, n_hiddens):
            """Return the first ``n_hiddens`` outputs (how the training step reads its hiddens)."""
            return outs[:n_hiddens]

        def _all_but_costs(outs, n_hiddens):
            """Return every output except the trailing two costs (how the cost step reads its hiddens)."""
            return outs[:-2]

        def _run_pass(shared_X, step_fn, split_hiddens):
            """Run ``step_fn`` over all full minibatches of ``shared_X``; return the two mean costs."""
            hiddens = [T.zeros((batch_size, layer_size)).eval() for layer_size in layer_sizes]
            costs = []
            costs_post = []
            # Fetch the data once per pass instead of once per minibatch slice.
            values = shared_X.get_value(borrow=True)
            # Floor division keeps the batch count an int under true division too.
            for batch in range(len(values) // batch_size):
                start = batch * batch_size
                # One window per sequence position, each shifted by one row.
                xs = [values[start + offset: start + batch_size + offset]
                      for offset in range(len(Xs))]
                xs, hiddens = fix_input_size(xs, hiddens)
                hiddens[0] = xs[0]
                outs = step_fn(*(hiddens + xs))
                hiddens = split_hiddens(outs, len(hiddens))
                costs.append(outs[-2])
                costs_post.append(outs[-1])
            return numpy.mean(costs), numpy.mean(costs_post)

        def _report(label, cost, cost_post, pre_path, post_path):
            """Report one split's mean costs to the console, the log and its convergence files."""
            _echo('{0:<5} : '.format(label), trunc(cost), trunc(cost_post), '\t', end=' ')
            _append_to(logfile, "{0!s} : {1!s} {2!s}\t".format(label, trunc(cost), trunc(cost_post)))
            _append_to(pre_path, "{0!s},".format(cost))
            _append_to(post_path, "{0!s},".format(cost_post))

        _echo('----------------------------------------')
        _echo('TRAINING GSN FOR ITERATION', iteration)
        _append_to(logfile, "--------------------------\nTRAINING GSN FOR ITERATION {0!s}\n".format(iteration))

        # TRAINING
        n_epoch = state.n_epoch
        batch_size = state.batch_size
        stop = False
        counter = 0
        if iteration == 0:
            learning_rate.set_value(cast32(state.learning_rate))  # learning rate
        times = []
        best_cost = float('inf')
        patience = 0

        _echo('learning rate:', learning_rate.get_value())

        _echo('train X size:', str(train_X.shape.eval()))
        _echo('valid X size:', str(valid_X.shape.eval()))
        _echo('test X size:', str(test_X.shape.eval()))

        if state.vis_init:
            # numpy.clip takes (a, a_min, a_max).  The original passed
            # (0.9, 0.001, mean), which yields minimum(0.9, mean) and hands 0 to
            # logit for always-off inputs.  Clip the mean into [0.001, 0.9].
            mean_visible = train_X.get_value().mean(axis=0)
            bias_list[0].set_value(logit(numpy.clip(mean_visible, 0.001, 0.9)))

        if state.test_model:
            # If testing, do not train and go directly to generating samples.
            _echo('Testing : skip training')
            stop = True

        while not stop:
            counter += 1
            t = time.time()
            _echo(counter, '\t', end=' ')
            _append_to(logfile, "{0!s}\t".format(counter))
            # shuffle the data
            data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y, test_X, test_Y, dataset, rng)

            # train
            train_cost, train_cost_post = _run_pass(train_X, f_learn, _leading_hiddens)
            _report('Train', train_cost, train_cost_post, train_convergence_pre, train_convergence_post)

            # valid
            valid_cost, valid_cost_post = _run_pass(valid_X, f_cost, _all_but_costs)
            _report('Valid', valid_cost, valid_cost_post, valid_convergence_pre, valid_convergence_post)

            # test
            test_cost, test_cost_post = _run_pass(test_X, f_cost, _all_but_costs)
            _report('Test', test_cost, test_cost_post, test_convergence_pre, test_convergence_post)

            # check for early stopping (driven by the training cost)
            if train_cost < best_cost * state.early_stop_threshold:
                patience = 0
                best_cost = train_cost
            else:
                patience += 1

            if counter >= n_epoch or patience >= state.early_stop_length:
                stop = True
                save_params_to_file('gsn', counter, params, iteration)

            timing = time.time() - t
            times.append(timing)

            mean_abs_w = [trunc(abs(w.get_value(borrow=True)).mean()) for w in weights_list]

            _echo('time : ', trunc(timing), end=' ')
            _echo('remaining: ', trunc((n_epoch - counter) * numpy.mean(times) / 60 / 60), 'hrs', end=' ')
            _echo('B : ', [trunc(abs(b.get_value(borrow=True)).mean()) for b in bias_list], end=' ')
            _echo('W : ', mean_abs_w, end=' ')
            _echo('V : ', [trunc(abs(v.get_value(borrow=True)).mean()) for v in recurrent_weights_list])

            _append_to(logfile,
                       "MeanVisB : {0!s}\t".format(trunc(bias_list[0].get_value().mean()))
                       + "W : {0!s}\t".format(str(mean_abs_w))
                       + "Time : {0!s} seconds\n".format(trunc(timing)))

            if (counter % state.save_frequency) == 0:
                # Checking reconstruction on the first 100 test rows
                first_hundred = list(range(100))
                test_values = test_X.get_value()
                nums = test_values[first_hundred]
                noisy_nums = f_noise(test_values[first_hundred])
                reconstructed_prediction = []
                # init reconstruction hiddens
                hiddens = [T.zeros(layer_size).eval() for layer_size in layer_sizes]
                for num in noisy_nums:
                    hiddens[0] = num
                    # Squeeze hiddens that came back as a single-row matrix.
                    hiddens = [h[0] if len(h.shape) == 2 and h.shape[0] == 1 else h
                               for h in hiddens]
                    outs = f_recon(*(hiddens + [num]))
                    hiddens = outs[:len(hiddens)]
                    # Exactly two reconstructions must follow the hiddens; only the
                    # first one is reported.
                    reconstructed_1, _reconstructed_n = outs[len(hiddens):]
                    reconstructed_prediction.append(reconstructed_1)

                _append_to(logfile, "\n")
                for i in range(len(nums)):
                    recon = reconstructed_prediction[i]
                    if len(recon.shape) == 2 and recon.shape[0] == 1:
                        recon = recon[0]
                    _echo(nums[i].tolist(), "->", recon.tolist())
                    _append_to(logfile, "{0!s} -> {1!s}\n".format(
                        nums[i].tolist(),
                        [trunc(n) if n > 0.0001 else trunc(0.00000000000000000) for n in recon.tolist()]))
                _append_to(logfile, "\n")
                # NOTE: image tiling of the reconstructions, sample plotting and a
                # periodic parameter save used to follow here but were disabled.

            # ANNEAL!
            new_lr = learning_rate.get_value() * annealing
            learning_rate.set_value(new_lr)

        # 10k samples
        _echo('Generating 10,000 samples')
        samples, _ = sample_some_numbers(N=10000)
        f_samples = outdir + 'samples.npy'
        numpy.save(f_samples, samples)
        _echo('saved digits')

    #####################
    # STORY 2 ALGORITHM #
    #####################
    # Run the GSN training routine once per outer iteration.  The loop index is
    # forwarded as the iteration number; train_recurrent_GSN resets the learning
    # rate only when that number is 0.
    # NOTE(review): `iter` shadows the builtin of the same name in this scope.
    for iter in range(state.max_iterations):
        train_recurrent_GSN(iter, train_X, train_Y, valid_X, valid_Y, test_X, test_Y)