Example #1
def advanced_indexing(volume, *indices_list, **kwargs):
    """ Performs advanced indexing on `volume`.

    This function exists because in Theano<=0.9 advanced indexing is
    only supported along the first dimension.

    Notes
    -----
    Assuming `volume` is C contiguous.
    """
    strides = kwargs.get("strides")
    if strides is None:
        shapes = T.cast(volume.shape[:len(indices_list)], dtype=theano.config.floatX)
        strides = T.concatenate([T.ones((1,)), T.cumprod(shapes[::-1])[:-1]], axis=0)[::-1]

    shapes = T.cast(volume.shape, dtype=theano.config.floatX)

    indices = T.maximum(0, T.minimum(indices_list[-1], shapes[len(indices_list)-1]-1))
    for i in range(len(indices_list)-1):
        clipped_idx = T.maximum(0, T.minimum(indices_list[i], shapes[i]-1))
        indices += clipped_idx * strides[i]

    # indices = T.sum(T.stack(indices_list, axis=1)*strides[:len(indices_list)], axis=1)
    indices = T.cast(indices, dtype="int32")
    return volume.reshape((-1, volume.shape[-1]))[indices]
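A quick numpy sanity check (with made-up shapes and indices) of the flattened-index arithmetic this helper relies on for a C-contiguous volume:

import numpy as np

volume = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)
i0, i1, i2 = np.array([0, 1]), np.array([2, 0]), np.array([3, 1])

# strides over the leading dimensions, in units of rows of the last dimension
shapes = np.array(volume.shape[:3], dtype=float)
strides = np.concatenate([[1.0], np.cumprod(shapes[::-1])[:-1]])[::-1]

flat_idx = (i0 * strides[0] + i1 * strides[1] + i2 * strides[2]).astype('int32')
rows = volume.reshape(-1, volume.shape[-1])[flat_idx]
assert np.array_equal(rows, volume[i0, i1, i2])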
Example #2
 def get(self, y_p, i, g):
   W_att_re = self.item("W_att_re", i)
   b_att_re = self.item("b_att_re", i)
   B = self.item("B", i)
   C = self.item("C", i)
   I = self.item("I", i)
   beam_size = T.minimum(numpy.int32(abs(self.attrs['beam'])), C.shape[0])
   loc = T.cast(T.maximum(T.minimum(T.sum(I,axis=0) * self.n / self.bound - beam_size / 2, T.sum(I,axis=0) - beam_size), 0),'int32')
   if self.attrs['beam'] > 0:
     beam_idx = (self.custom_vars[('P_%d' % i)][loc].dimshuffle(1,0).flatten() > 0).nonzero()
     I = I.reshape((I.shape[0]*I.shape[1],))[beam_idx].reshape((beam_size,I.shape[1]))
     C = C.reshape((C.shape[0]*C.shape[1],C.shape[2]))[beam_idx].reshape((beam_size,C.shape[1],C.shape[2]))
     B = B.reshape((B.shape[0]*B.shape[1],B.shape[2]))[beam_idx].reshape((beam_size,B.shape[1],B.shape[2]))
   if self.attrs['template'] != self.layer.unit.n_out:
     z_p = T.dot(y_p, W_att_re) + b_att_re
   else:
     z_p = y_p
   if self.attrs['momentum'] == 'conv1d':
     from theano.tensor.nnet import conv
     att = self.item('att', i)
     F = self.item("F", i)
     v = T.dot(T.sum(conv.conv2d(border_mode='full',
       input=att.dimshuffle(1, 'x', 0, 'x'),
       filters=F).dimshuffle(2,3,0,1),axis=1)[F.shape[2]/2:-F.shape[2]/2+1],self.item("U",i))
     v = I * v / v.sum(axis=0,keepdims=True)
     z_p += T.sum(C * v,axis=0)
   if g > 0:
     z_p += self.glimpses[i][-1]
   h_p = T.tanh(z_p)
   return B, C, I, h_p, self.item("W_att_in", i), self.item("b_att_in", i)
Example #3
 def call(self, X):
     if type(X) is not list or len(X) != 2:
         raise Exception("SquareAttention must be called on a list of two tensors. Got: " + str(X))
         
     frame, position  = X[0], X[1]
     
     # Reshaping the input to exclude the time dimension
     frameShape = K.shape(frame)
     positionShape = K.shape(position)
     (chans, height, width) = frameShape[-3:]
     targetDim = positionShape[-1]
     frame = K.reshape(frame, (-1, chans, height, width))
     position = K.reshape(position, (-1, ) + (targetDim, ))
     
     # Applying the attention
     hw = THT.abs_(position[:, 2] - position[:, 0]) * self.scale / 2.0
     hh = THT.abs_(position[:, 3] - position[:, 1]) * self.scale / 2.0
     position = THT.maximum(THT.set_subtensor(position[:, 0], position[:, 0] - hw), -1.0)
     position = THT.minimum(THT.set_subtensor(position[:, 2], position[:, 2] + hw), 1.0)
     position = THT.maximum(THT.set_subtensor(position[:, 1], position[:, 1] - hh), -1.0)
     position = THT.minimum(THT.set_subtensor(position[:, 3], position[:, 3] + hh), 1.0)
     rX = Data.linspace(-1.0, 1.0, width)
     rY = Data.linspace(-1.0, 1.0, height)
     FX = THT.gt(rX, position[:,0].dimshuffle(0,'x')) * THT.le(rX, position[:,2].dimshuffle(0,'x'))
     FY = THT.gt(rY, position[:,1].dimshuffle(0,'x')) * THT.le(rY, position[:,3].dimshuffle(0,'x'))
     m = FY.dimshuffle(0, 1, 'x') * FX.dimshuffle(0, 'x', 1)
     m = m + self.alpha - THT.gt(m, 0.) * self.alpha
     frame = frame * m.dimshuffle(0, 'x', 1, 2)
     
     # Reshaping the frame to include time dimension
     output = K.reshape(frame, frameShape)
     
     return output
Example #4
    def lp_norm(self, n, k, r, c, z):
        '''
        Lp = ( 1/n * sum(|x_i|^p, 1..n))^(1/p) where p = 1 + ln(1+e^P)
        :param n:
        :param k:
        :param r:
        :param c:
        :param z:
        :return:
        '''
        ds0, ds1 = self.pool_size
        st0, st1 = self.stride
        pad_h = self.pad[0]
        pad_w = self.pad[1]

        row_st = r * st0
        row_end = T.minimum(row_st + ds0, self.img_rows)
        row_st = T.maximum(row_st, self.pad[0])
        row_end = T.minimum(row_end, self.x_m2d + pad_h)

        col_st = c * st1
        col_end = T.minimum(col_st + ds1, self.img_cols)
        col_st = T.maximum(col_st, self.pad[1])
        col_end = T.minimum(col_end, self.x_m1d + pad_w)

        Lp = T.pow(
                T.mean(T.pow(
                        T.abs_(T.flatten(self.y[n, k, row_st:row_end, col_st:col_end], 1)),
                        1 + T.log(1 + T.exp(self.P))
                )),
                1 / (1 + T.log(1 + T.exp(self.P)))
        )

        return T.set_subtensor(z[n, k, r, c], Lp)
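A small numpy illustration of the pooling formula in the docstring above (toy values, not the class itself). Since p = 1 + log(1 + exp(P)) is always greater than 1, the result lies between the mean and the maximum of |x|:

import numpy as np

P = 0.5
p = 1.0 + np.log1p(np.exp(P))              # ~1.97
window = np.array([0.2, -1.5, 3.0, 0.7])
lp = np.mean(np.abs(window) ** p) ** (1.0 / p)
print(p, lp)                               # lp is between mean(|window|) and max(|window|)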
Example #5
 def _output(self, input,  *args, **kwargs):
     k = (self.alpha - 1).reshape(self.filter_shape)
     if self.affected_channels == self.n_channel:
         return input + T.minimum(0, input) * k
     else:
         affected = input[:, :self.affected_channels]
         unaffected = input[:, self.affected_channels:]
         affected = affected + T.minimum(0, affected) * k
         return T.concatenate([affected, unaffected], axis=1)
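The elementwise rule above, x + min(0, x) * k with k = alpha - 1, is just a parametric ReLU; a tiny numpy check (alpha value made up):

import numpy as np

alpha = 0.25
k = alpha - 1
x = np.array([-2.0, -0.5, 0.0, 3.0])
print(x + np.minimum(0, x) * k)        # [-0.5, -0.125, 0., 3.]
print(np.where(x > 0, x, alpha * x))   # identical: positive inputs pass, negative ones are scaled by alpha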
Example #6
def _interpolate(im, x, y, out_height, out_width):
    # *_f are floats
    num_batch, height, width, channels = im.shape
    height_f = T.cast(height, theano.config.floatX)
    width_f = T.cast(width, theano.config.floatX)

    # clip coordinates to [-1, 1]
    x = T.clip(x, -1, 1)
    y = T.clip(y, -1, 1)

    # scale coordinates from [-1, 1] to [0, width/height - 1]
    x = (x + 1) / 2 * (width_f - 1)
    y = (y + 1) / 2 * (height_f - 1)

    # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
    # we need those in floatX for interpolation and in int64 for indexing. for
    # indexing, we need to take care they do not extend past the image.
    x0_f = T.floor(x)
    y0_f = T.floor(y)
    x1_f = x0_f + 1
    y1_f = y0_f + 1
    x0 = T.cast(x0_f, 'int64')
    y0 = T.cast(y0_f, 'int64')
    x1 = T.cast(T.minimum(x1_f, width_f - 1), 'int64')
    y1 = T.cast(T.minimum(y1_f, height_f - 1), 'int64')

    # The input is [num_batch, height, width, channels]. We do the lookup in
    # the flattened input, i.e [num_batch*height*width, channels]. We need
    # to offset all indices to match the flat version
    dim2 = width
    dim1 = width*height
    base = T.repeat(
        T.arange(num_batch, dtype='int64')*dim1, out_height*out_width)
    base_y0 = base + y0*dim2
    base_y1 = base + y1*dim2
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[idx_a]
    Ib = im_flat[idx_b]
    Ic = im_flat[idx_c]
    Id = im_flat[idx_d]

    # calculate interpolated values
    wa = ((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x')
    wb = ((x1_f-x) * (y-y0_f)).dimshuffle(0, 'x')
    wc = ((x-x0_f) * (y1_f-y)).dimshuffle(0, 'x')
    wd = ((x-x0_f) * (y-y0_f)).dimshuffle(0, 'x')
    output = T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)

    assert str(output.dtype) == theano.config.floatX, str(output.dtype)
    return output
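For intuition, the four interpolation weights above are the areas of the opposite sub-rectangles around the sample point and always sum to one; a single-point numpy sketch (coordinates made up):

import numpy as np

x, y = 2.3, 4.7                  # continuous pixel coordinates
x0, y0 = np.floor(x), np.floor(y)
x1, y1 = x0 + 1, y0 + 1
wa = (x1 - x) * (y1 - y)         # weight of corner (x0, y0)
wb = (x1 - x) * (y - y0)         # weight of corner (x0, y1)
wc = (x - x0) * (y1 - y)         # weight of corner (x1, y0)
wd = (x - x0) * (y - y0)         # weight of corner (x1, y1)
assert np.isclose(wa + wb + wc + wd, 1.0)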
Example #7
def _interpolate(im, x, y, out_height, out_width, num_b):
    _, height, width, channels = im.shape
    # *_f are floats
    height_f = T.cast(height, theano.config.floatX)
    width_f = T.cast(width, theano.config.floatX)

    # clip coordinates to [-1, 1]
    x = T.clip(x, -1, 1)
    y = T.clip(y, -1, 1)

    # scale coordinates from [-1, 1] to [0, width/height - 1]
    x = (x + 1) / 2 * (width_f - 1)
    y = (y + 1) / 2 * (height_f - 1)

    # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
    # we need those in floatX for interpolation and in int64 for indexing. for
    # indexing, we need to take care they do not extend past the image.
    x0_f = T.floor(x)
    y0_f = T.floor(y)
    x1_f = x0_f + 1
    y1_f = y0_f + 1

    # KMYI: we cast only at the end to maximize GPU usage
    x0 = T.floor(x0_f)
    y0 = T.floor(y0_f)
    x1 = T.floor(T.minimum(x1_f, width_f - 1))
    y1 = T.floor(T.minimum(y1_f, height_f - 1))

    dim2 = width_f
    dim1 = width_f * height_f
    base = T.repeat(
        T.arange(num_b,
                 dtype=theano.config.floatX) * dim1,
        out_height * out_width)
    base_y0 = base + y0 * dim2
    base_y1 = base + y1 * dim2
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[T.cast(idx_a, 'int64')]
    Ib = im_flat[T.cast(idx_b, 'int64')]
    Ic = im_flat[T.cast(idx_c, 'int64')]
    Id = im_flat[T.cast(idx_d, 'int64')]

    # calculate interpolated values
    wa = ((x1_f - x) * (y1_f - y)).dimshuffle(0, 'x')
    wb = ((x1_f - x) * (y - y0_f)).dimshuffle(0, 'x')
    wc = ((x - x0_f) * (y1_f - y)).dimshuffle(0, 'x')
    wd = ((x - x0_f) * (y - y0_f)).dimshuffle(0, 'x')
    output = T.sum([wa * Ia, wb * Ib, wc * Ic, wd * Id], axis=0)
    return output
Example #8
def _interpolate(im, x, y, out_height, out_width, dtype = 'float32'):
  # *_f are floats
  num_batch, height, width, channels = im.shape
  height_f = T.cast(height, dtype = dtype)
  width_f = T.cast(width, dtype = dtype)

  # scale coordinates from [-1, 1] to [0, width/height - 1]
  idx = ((x >= 0) & (x <= 1) & (y >= 0) & (y <= 1)).nonzero()[0]
  # x = (x + 1) / 2 * (width_f - 1)
  # y = (y + 1) / 2 * (height_f - 1)
  x = x * (width_f - 1)
  y = y * (height_f - 1)
  # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
  # we need those in floatX for interpolation and in int64 for indexing. for
  # indexing, we need to take care they do not extend past the image.
  x0_f = T.floor(x)
  y0_f = T.floor(y)
  x1_f = x0_f + 1
  y1_f = y0_f + 1
  x0 = T.cast(x0_f, 'int64')
  y0 = T.cast(y0_f, 'int64')
  x1 = T.cast(T.minimum(x1_f, width_f - 1), 'int64')
  y1 = T.cast(T.minimum(y1_f, height_f - 1), 'int64')

  # The input is [num_batch, height, width, channels]. We do the lookup in
  # the flattened input, i.e [num_batch*height*width, channels]. We need
  # to offset all indices to match the flat version
  dim2 = width
  dim1 = width*height
  base = T.repeat(
      T.arange(num_batch, dtype='int64')*dim1, out_height*out_width)
  base_y0 = base + y0*dim2
  base_y1 = base + y1*dim2
  idx_a = base_y0 + x0
  idx_b = base_y1 + x0
  idx_c = base_y0 + x1
  idx_d = base_y1 + x1

  # use indices to lookup pixels for all samples
  im_flat = im.reshape((-1, channels))
  Ia = im_flat[idx_a[idx]]
  Ib = im_flat[idx_b[idx]]
  Ic = im_flat[idx_c[idx]]
  Id = im_flat[idx_d[idx]]

  # calculate interpolated values
  wa = ((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x')[idx, :]
  wb = ((x1_f-x) * (y-y0_f)).dimshuffle(0, 'x')[idx, :]
  wc = ((x-x0_f) * (y1_f-y)).dimshuffle(0, 'x')[idx, :]
  wd = ((x-x0_f) * (y-y0_f)).dimshuffle(0, 'x')[idx, :]
  output = T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)

  # out = T.zeros_like(((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x'))
  out = T.zeros_like(im_flat)
  return T.set_subtensor(out[idx, :], output)
Example #9
def past_weight_grad_step(xs, es, kp_x, kd_x, kp_e, kd_e, shape, dws=None):
    """
    Do an efficient update of the weights given the two spike updates.

    (This still runs FING SLOWLY!)

    :param xs: An (n_in) vector
    :param es: An (n_out) vector
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shape: (n_in, n_out)
    :return:
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_in, n_out = shape
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)

    tx_last = create_shared_variable(np.zeros(n_in)+1)
    te_last = create_shared_variable(np.zeros(n_out)+1)
    x_last = create_shared_variable(np.zeros(n_in))
    e_last = create_shared_variable(np.zeros(n_out))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    x_spike_ixs, = tt.nonzero(x_spikes)
    e_spike_ixs, = tt.nonzero(e_spikes)

    if dws is None:
        dws = tt.zeros(shape)

    t_last = tt.minimum(tx_last[x_spike_ixs, None], te_last)  # (n_x_spikes, n_out)
    dws = tt.inc_subtensor(dws[x_spike_ixs, :], x_last[x_spike_ixs, None]*e_last
        * rx**(tx_last[x_spike_ixs, None]-t_last)
        * re**(te_last[None, :]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    new_x_last = tt.set_subtensor(x_last[x_spike_ixs], x_last[x_spike_ixs]*rx**tx_last[x_spike_ixs]+ xs[x_spike_ixs]/as_floatx(kd_x))
    new_tx_last = tt.switch(x_spikes, 0, tx_last)

    t_last = tt.minimum(new_tx_last[:, None], te_last[e_spike_ixs])  # (n_in, n_e_spikes)
    dws = tt.inc_subtensor(dws[:, e_spike_ixs], new_x_last[:, None]*e_last[e_spike_ixs]
        * rx**(new_tx_last[:, None]-t_last)
        * re**(te_last[None, e_spike_ixs]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    add_update(x_last, new_x_last)
    add_update(e_last, tt.set_subtensor(e_last[e_spike_ixs], e_last[e_spike_ixs]*re**te_last[e_spike_ixs]+ es[e_spike_ixs]/as_floatx(kd_e)))
    add_update(tx_last, new_tx_last+1)
    add_update(te_last, tt.switch(e_spikes, 1, te_last+1))
    return dws
Example #10
def _log_add_3(log_a, log_b, log_c):
    """Theano expression for log(a+b+c) given log(a), log(b), log(c)."""
    smaller = T.minimum(log_a, log_b)
    larger = T.maximum(log_a, log_b)
    largest = T.maximum(larger, log_c)
    larger = T.minimum(larger, log_c)

    return largest + T.log1p(
            T.exp(smaller - largest) + 
            T.exp(larger - largest)
            )
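A minimal numerical check of this helper, assuming it is in scope and Theano is available; the result should match log(a + b + c) computed directly:

import numpy as np
import theano
import theano.tensor as T

la, lb, lc = T.dscalars('la', 'lb', 'lc')
f = theano.function([la, lb, lc], _log_add_3(la, lb, lc))
a, b, c = 1e-3, 2.5, 7.0
print(f(np.log(a), np.log(b), np.log(c)), np.log(a + b + c))  # should agree closely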
Example #11
def clip_boxes(boxes, im_shape):
    """
    Clip boxes to image boundaries.
    """
    # x1 >= 0
    boxes = T.set_subtensor(boxes[:, 0::4], T.maximum(T.minimum(boxes[:, 0::4], im_shape[1] - 1), 0))
    # y1 >= 0
    boxes = T.set_subtensor(boxes[:, 1::4], T.maximum(T.minimum(boxes[:, 1::4], im_shape[0] - 1), 0))
    # x2 < im_shape[1]
    boxes = T.set_subtensor(boxes[:, 2::4], T.maximum(T.minimum(boxes[:, 2::4], im_shape[1] - 1), 0))
    # y2 < im_shape[0]
    boxes = T.set_subtensor(boxes[:, 3::4], T.maximum(T.minimum(boxes[:, 3::4], im_shape[0] - 1), 0))
    return boxes
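The same clipping written in numpy, assuming boxes are rows of (x1, y1, x2, y2) and im_shape is (height, width):

import numpy as np

def clip_boxes_np(boxes, im_shape):
    boxes = boxes.copy()
    boxes[:, 0::4] = np.clip(boxes[:, 0::4], 0, im_shape[1] - 1)  # x1
    boxes[:, 1::4] = np.clip(boxes[:, 1::4], 0, im_shape[0] - 1)  # y1
    boxes[:, 2::4] = np.clip(boxes[:, 2::4], 0, im_shape[1] - 1)  # x2
    boxes[:, 3::4] = np.clip(boxes[:, 3::4], 0, im_shape[0] - 1)  # y2
    return boxes

print(clip_boxes_np(np.array([[-5., 3., 700., 400.]]), (480, 640)))  # -> [[0. 3. 639. 400.]]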
Example #12
    def perform(self, x):

        Pmax = self.params[0]
        Pmin = self.params[1]

        if x.ndim==3:
            Pmin = Pmin.dimshuffle('x', 'x', 0)
            Pmax = Pmax.dimshuffle('x', 'x', 0)
            return T.minimum(T.maximum(Pmin, x), Pmax)
        else:
            Pmin = Pmin.dimshuffle('x', 0)
            Pmax = Pmax.dimshuffle('x', 0)
            return T.minimum(T.maximum(Pmin, x), Pmax)
Example #13
    def perform(self, x):

        EPSI = 1e-6
        Pmax = self.params[0]
        Pmin = self.params[1]

        if x.ndim==3:
            Pmin = Pmin.dimshuffle('x', 'x', 0)
            Pmax = Pmax.dimshuffle('x', 'x', 0)
            return (T.minimum(T.maximum(Pmin, x), Pmax)-Pmin)/(Pmax-Pmin+EPSI)
        else:
            Pmin = Pmin.dimshuffle('x', 0)
            Pmax = Pmax.dimshuffle('x', 0)
            return (T.minimum(T.maximum(Pmin, x), Pmax)-Pmin)/(Pmax-Pmin+EPSI)
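The expression in Example #13 is a per-feature min-max normalization of the clipped input into roughly [0, 1]; a numpy sketch with made-up bounds:

import numpy as np

EPSI = 1e-6
Pmin, Pmax = np.array([0.0, -1.0]), np.array([10.0, 1.0])
x = np.array([[12.0, 0.5],
              [-3.0, -2.0]])
clipped = np.minimum(np.maximum(Pmin, x), Pmax)
print((clipped - Pmin) / (Pmax - Pmin + EPSI))  # every entry ends up in [0, 1]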
Example #14
def create_activation(activation):
    '''Given an activation description, return a callable that implements it.

    Parameters
    ----------
    activation : string
        A string description of an activation function to use.

    Returns
    -------
    activation : callable(float) -> float
        A callable activation function.
    '''
    def compose(a, b):
        c = lambda z: b(a(z))
        c.__theanets_name__ = '%s(%s)' % (b.__theanets_name__, a.__theanets_name__)
        return c
    if '+' in activation:
        return functools.reduce(
            compose, (create_activation(a) for a in activation.split('+')))
    options = {
        'tanh': TT.tanh,
        'linear': lambda z: z,
        'logistic': TT.nnet.sigmoid,
        'sigmoid': TT.nnet.sigmoid,
        'softplus': TT.nnet.softplus,
        'softmax': softmax,

        # rectification
        'relu': lambda z: TT.maximum(0, z),
        'trel': lambda z: TT.maximum(0, TT.minimum(1, z)),
        'trec': lambda z: TT.maximum(1, z),
        'tlin': lambda z: z * (abs(z) > 1),

        # modifiers
        'rect:max': lambda z: TT.minimum(1, z),
        'rect:min': lambda z: TT.maximum(0, z),

        # normalization
        'norm:dc': lambda z: z - z.mean(axis=-1, keepdims=True),
        'norm:max': lambda z: z / TT.maximum(TT.cast(1e-7, FLOAT), abs(z).max(axis=-1, keepdims=True)),
        'norm:std': lambda z: z / TT.maximum(TT.cast(1e-7, FLOAT), TT.std(z, axis=-1, keepdims=True)),
        'norm:z': lambda z: (z - z.mean(axis=-1, keepdims=True)) / TT.maximum(TT.cast(1e-7, FLOAT), z.std(axis=-1, keepdims=True)),
        }
    for k, v in options.items():
        v.__theanets_name__ = k
    try:
        return options[activation.lower()]
    except KeyError:
        raise KeyError('unknown activation {}'.format(activation))
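A hypothetical usage sketch of the factory above, assuming the surrounding module (which also defines softmax and FLOAT) is importable; composed names apply left to right, so 'relu+norm:dc' rectifies and then removes the mean:

import numpy as np
import theano
import theano.tensor as TT

act = create_activation('relu+norm:dc')
x = TT.dmatrix('x')
f = theano.function([x], act(x))
print(act.__theanets_name__)            # norm:dc(relu)
print(f(np.array([[-1.0, 0.5, 2.0]])))  # rectified values with zero mean along the last axis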
Example #15
def __step(img, prev_bbox, state, timestep):
    conv1 = conv2d(img, conv1_filters, subsample=(conv1_stride, conv1_stride), border_mode='half')
    act1 = NN.relu(conv1)
    flat1 = TT.reshape(act1, (-1, conv1_output_dim))
    gru_in = TT.concatenate([flat1, prev_bbox], axis=1)
    gru_z = NN.sigmoid(TT.dot(gru_in, Wz) + TT.dot(state, Uz) + bz)
    gru_r = NN.sigmoid(TT.dot(gru_in, Wr) + TT.dot(state, Ur) + br)
    gru_h_ = TT.tanh(TT.dot(gru_in, Wg) + TT.dot(gru_r * state, Ug) + bg)
    gru_h = (1 - gru_z) * state + gru_z * gru_h_
    bbox = TT.tanh(TT.dot(gru_h, W_fc2) + b_fc2)

    bbox_cx = ((bbox[:, 2] + bbox[:, 0]) / 2 + 1) / 2 * img_row
    bbox_cy = ((bbox[:, 3] + bbox[:, 1]) / 2 + 1) / 2 * img_col
    bbox_w = TT.abs_(bbox[:, 2] - bbox[:, 0]) / 2 * img_row
    bbox_h = TT.abs_(bbox[:, 3] - bbox[:, 1]) / 2 * img_col
    x = TT.arange(img_row, dtype=T.config.floatX)
    y = TT.arange(img_col, dtype=T.config.floatX)
    mx = TT.maximum(TT.minimum(-TT.abs_(x.dimshuffle('x', 0) - bbox_cx.dimshuffle(0, 'x')) + bbox_w.dimshuffle(0, 'x') / 2., 1), 1e-4)
    my = TT.maximum(TT.minimum(-TT.abs_(y.dimshuffle('x', 0) - bbox_cy.dimshuffle(0, 'x')) + bbox_h.dimshuffle(0, 'x') / 2., 1), 1e-4)
    bbox_mask = mx.dimshuffle(0, 1, 'x') * my.dimshuffle(0, 'x', 1)

    new_cls1_f = cls_f
    new_cls1_b = cls_b

    mask = act1 * bbox_mask.dimshuffle(0, 'x', 1, 2)

    new_featmaps = TG.disconnected_grad(TT.set_subtensor(featmaps[:, timestep], mask))
    new_featmaps.name = 'new_featmaps'
    new_probmaps = TG.disconnected_grad(TT.set_subtensor(probmaps[:, timestep], bbox_mask))
    new_probmaps.name = 'new_probmaps'

    train_featmaps = TG.disconnected_grad(new_featmaps[:, :timestep+1].reshape(((timestep + 1) * batch_size, conv1_nr_filters, img_row, img_col)))
    train_featmaps.name = 'train_featmaps'
    train_probmaps = TG.disconnected_grad(new_probmaps[:, :timestep+1])
    train_probmaps.name = 'train_probmaps'

    for _ in range(0, 5):
        train_convmaps = conv2d(train_featmaps, new_cls1_f, subsample=(cls1_stride, cls1_stride), border_mode='half').reshape((batch_size, timestep + 1, batch_size, img_row, img_col))
        train_convmaps.name = 'train_convmaps'
        train_convmaps_selected = train_convmaps[TT.arange(batch_size).repeat(timestep+1), TT.tile(TT.arange(timestep+1), batch_size), TT.arange(batch_size).repeat(timestep+1)].reshape((batch_size, timestep+1, img_row, img_col))
        train_convmaps_selected.name = 'train_convmaps_selected'
        train_predmaps = NN.sigmoid(train_convmaps_selected + new_cls1_b.dimshuffle(0, 'x', 'x', 'x'))
        train_loss = NN.binary_crossentropy(train_predmaps, train_probmaps).mean()
        train_grad_cls1_f, train_grad_cls1_b = T.grad(train_loss, [new_cls1_f, new_cls1_b])
        new_cls1_f -= train_grad_cls1_f * 0.1
        new_cls1_b -= train_grad_cls1_b * 0.1

    return (bbox, gru_h, timestep + 1, mask, bbox_mask), {cls_f: TG.disconnected_grad(new_cls1_f), cls_b: TG.disconnected_grad(new_cls1_b), featmaps: TG.disconnected_grad(new_featmaps), probmaps: TG.disconnected_grad(new_probmaps)}
Example #16
    def get_constraint_updates(self):
        constraint_updates = OrderedDict() 

        if self.flags['wv_norm'] == 'unit':
            constraint_updates[self.Wv] = self.Wv / self.norm_wv
        elif self.flags['wv_norm'] == 'max_unit':
            constraint_updates[self.Wv] = self.Wv / self.norm_wv * T.minimum(self.norm_wv, 1.0)

        if self.flags['scalar_lambd']:
            constraint_updates[self.lambd] = T.mean(self.lambd) * T.ones_like(self.lambd)

        ## Enforce sparsity pattern on g if required ##
        if self.sparse_gmask:
            constraint_updates[self.Wg] = self.Wg * self.sparse_gmask.mask.T

        ## clip parameters to maximum values (if applicable)
        for (k,v) in self.clip_max.iteritems():
            assert k in [param.name for param in self.params()]
            param = constraint_updates.get(k, getattr(self, k))
            constraint_updates[param] = T.clip(param, param, v)

        ## clip parameters to minimum values (if applicable)
        for (k,v) in self.clip_min.iteritems():
            assert k in [param.name for param in self.params()]
            param = constraint_updates.get(k, getattr(self, k))
            constraint_updates[param] = T.clip(constraint_updates.get(param, param), v, param)

        return constraint_updates
Example #17
    def learning_rate_updates(self):
        """
        Compute a dictionary of shared variable updates related to annealing
        the learning rate.

        Returns
        -------
        updates : dict
            A dictionary with the shared variables representing SGD metadata
            as keys and a symbolic expression of how they are to be updated as
            values.
        """
        ups = {}

        # Annealing coefficient. Here we're using a formula of
        # min(base_lr, anneal_start / (iteration + 1))
        if self.anneal_start is None:
            annealed = sharedX(self.base_lr)
        else:
            frac = self.anneal_start / (self.iteration + 1.)
            annealed = tensor.minimum(
                    as_floatX(frac),
                    self.base_lr  # maximum learning rate
                    )

        # Update the shared variable for the annealed learning rate.
        ups[self.annealed] = annealed
        ups[self.iteration] = self.iteration + 1

        # Calculate the learning rates for each parameter, in the order
        # they appear in self.params
        learn_rates = [annealed * self.learning_rates[p] for p in self.params]
        return ups, learn_rates
Example #18
	def _step(self,
			  xsum_t, xmax_t, xmin_t, xsubt_t, xmul_t, xres_t, xone_t, xi_t, xf_t, xo_t, xc_t, mask_tm1, ### add op's input x
			  h_tm1, c_tm1,
			  u_sum, u_max, u_min, u_subt, u_mul, u_res, u_one, u_i, u_f, u_o, u_c): ### add gate weight u_ s
		h_mask_tm1 = mask_tm1 * h_tm1
		c_mask_tm1 = mask_tm1 * c_tm1
		c_tilda = self.activation(xc_t + T.dot(h_mask_tm1, u_c))

		a0_i = self.inner_activation(xi_t + T.dot(h_mask_tm1, u_i)) ### gate activations
		a1_f = self.inner_activation(xf_t + T.dot(h_mask_tm1, u_f))
		a2_sum = self.inner_activation(xsum_t + T.dot(h_mask_tm1, u_sum))
		a3_max = self.inner_activation(xmax_t + T.dot(h_mask_tm1, u_max))
		a4_min = self.inner_activation(xmin_t + T.dot(h_mask_tm1, u_min))
		a5_subt = self.inner_activation(xsubt_t + T.dot(h_mask_tm1, u_subt))
		a6_mul = self.inner_activation(xmul_t + T.dot(h_mask_tm1, u_mul))
		a7_res = self.inner_activation(xres_t + T.dot(h_mask_tm1, u_res))
		a8_one = self.inner_activation(xone_t + T.dot(h_mask_tm1, u_one))

		g0_forget = c_mask_tm1
		g1_input = c_tilda
		g2_sum = (c_mask_tm1 + c_tilda)
		g3_max = T.maximum(c_mask_tm1, c_tilda)
		g4_min = T.minimum(c_mask_tm1, c_tilda)
		g5_sub = c_mask_tm1 - c_tilda
		g6_mul = c_mask_tm1 * c_tilda
		g7_res = 0 * c_tilda
		g8_one = 0 * c_tilda + 1

		c_t = a0_i * g0_forget + a1_f * g1_input  + a2_sum * g2_sum + a3_max * g3_max + a4_min * g4_min + a5_subt * g5_sub + a6_mul * g6_mul + a7_res * g7_res + a8_one * g8_one     ### update cell

		o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1, u_o))
		h_t = o_t * self.activation(c_t)
		return h_t, c_t
Example #19
    def get_output_for(self, input, only_at_anchor=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        # ## calculate attention anchor position based on atw, atb and input x
        at_anchor = nonlinearities.rectify(T.dot(input, self.atw) + self.atb[0])
        at_anchor = T.minimum(at_anchor, 1)
        at_anchor *= self.num_units

        self.at_anchor = at_anchor  # for printing
        # print_op = printing.Print('attention')
        # at_anchor = print_op(at_anchor)
        if only_at_anchor:
            return at_anchor

        # ## normal dense layer activation output
        activation = T.dot(input, self.W)

        if self.b is not None:
            activation = activation + self.b.dimshuffle('x', 0)

        out = self.nonlinearity(activation)

        ### multiply activation with attention weight
        attention = T.exp(
            self.at_decay * (
                T.arange(0, self.num_units).dimshuffle('x', 0) -
                at_anchor.dimshuffle(0, 'x')
            ) ** 2)

        out *= attention
        return out
Example #20
def smorms3(cost, params, lrate=1e-3, eps=1e-16, gather=False):
    updates = []
    optim_params = []
    grads = T.grad(cost, params)

    for p, grad in zip(params, grads):
        mem = sharedX(p.get_value() * 0. + 1.)
        g = sharedX(p.get_value() * 0.)
        g2 = sharedX(p.get_value() * 0.)
        if gather:
            optim_params.append(mem)
            optim_params.append(g)
            optim_params.append(g2)

        r_t = 1. / (mem + 1)
        g_t = (1 - r_t) * g + r_t * grad
        g2_t = (1 - r_t) * g2 + r_t * grad**2
        p_t = p - grad * T.minimum(lrate, g_t * g_t / (g2_t + eps)) / \
              (T.sqrt(g2_t + eps) + eps)
        mem_t = 1 + mem * (1 - g_t * g_t / (g2_t + eps))

        updates.append((g, g_t))
        updates.append((g2, g2_t))
        updates.append((p, p_t))
        updates.append((mem, mem_t))
    return updates
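A toy usage sketch of the optimizer above on a quadratic, assuming smorms3 and the sharedX helper it calls are importable from the same module:

import numpy as np
import theano

w = theano.shared(np.array([3.0, -2.0]), name='w')
cost = ((w - 1.0) ** 2).sum()
train = theano.function([], cost, updates=smorms3(cost, [w], lrate=0.05))
for _ in range(500):
    train()
print(w.get_value())  # ends up close to the minimizer [1, 1]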
Example #21
def update_log_p(skip_idxs,zeros,active,log_p_curr,log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ),
        log_p_curr.shape[0]
    ), 'int32')

    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(
        _p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor

    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next
Example #22
    def __init__(self, inputs, input_size, output_size, is_backward=False, parameters=None):

        if parameters is None:
            self.W_if = U.create_shared(U.initial_weights(input_size, output_size), name='W_if')
            self.W_ff = U.create_shared(U.initial_weights(output_size, output_size), name='W_ff')
            self.b = U.create_shared(U.initial_weights(output_size), name='b')
        else:
            self.W_if = theano.shared(parameters['W_if'], name='W_if')
            self.W_ff = theano.shared(parameters['W_ff'], name='W_ff')
            self.b = theano.shared(parameters['b'], name='b')

        initial = T.zeros((output_size,))
        self.is_backward = is_backward
        self.activation_fn = lambda x: T.cast(T.minimum(x * (x > 0), 20), dtype='float32')#dtype=theano.config.floatX)
        
        nonrecurrent = T.dot(inputs, self.W_if) + self.b

        self.output, _ = theano.scan(
            lambda in_t, out_tminus1, weights: self.activation_fn(in_t + T.dot(out_tminus1, weights)),
            sequences=[nonrecurrent],
            outputs_info=[initial],
            non_sequences=[self.W_ff],
            go_backwards=self.is_backward
        )

        self.params = [self.W_if, self.W_ff, self.b]
Example #23
    def _differentiate(self, params=None):
        '''Return a sequence of gradients for our parameters.

        If this optimizer has been configured with a gradient norm limit, or
        with elementwise gradient clipping, this method applies the appropriate
        rescaling and clipping operations before returning the gradient.

        Parameters
        ----------
        params : list of Theano variables, optional
            Return the gradient with respect to these parameters. Defaults to
            all parameters that the optimizer knows about.

        Yields
        ------
        pairs : (param, grad) tuples
            Generates a sequence of tuples representing each of the parameters
            requested and the corresponding Theano gradient expressions.
        '''
        if params is None:
            params = self._params
        for param, grad in zip(params, TT.grad(self._loss, params)):
            if self.max_gradient_elem > 0:
                limit = util.as_float(self.max_gradient_elem)
                yield param, TT.clip(grad, -limit, limit)
            elif self.max_gradient_norm > 0:
                norm = TT.sqrt((grad * grad).sum())
                limit = util.as_float(self.max_gradient_norm)
                yield param, grad * TT.minimum(1, limit / norm)
            else:
                yield param, grad
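A standalone sketch of the norm-rescaling branch above (not theanets itself): the gradient is scaled by min(1, limit / ||grad||), so its L2 norm never exceeds the limit.

import numpy as np
import theano
import theano.tensor as TT

g = TT.dvector('g')
limit = 1.0
clipped = g * TT.minimum(1, limit / TT.sqrt((g * g).sum()))
f = theano.function([g], clipped)
print(np.linalg.norm(f(np.array([3.0, 4.0]))))  # 1.0 (the original norm was 5.0)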
Example #24
    def get_constraint_updates(self):
        constraint_updates = OrderedDict() 
        if self.flags['scalar_lambd']:
            constraint_updates[self.lambd] = T.mean(self.lambd) * T.ones_like(self.lambd)

        # constraint filters to have unit norm
        if self.flags['wv_norm'] in ('unit', 'max_unit'):
            wv = constraint_updates.get(self.Wv, self.Wv)
            wv_norm = T.sqrt(T.sum(wv**2, axis=0))
            if self.flags['wv_norm'] == 'unit':
                constraint_updates[self.Wv] = wv / wv_norm
            elif self.flags['wv_norm'] == 'max_unit':
                constraint_updates[self.Wv] = wv / wv_norm * T.minimum(wv_norm, 1.0)

        constraint_updates[self.scalar_norms] = T.maximum(1.0, self.scalar_norms)
        ## clip parameters to maximum values (if applicable)
        for (k,v) in self.clip_max.iteritems():
            assert k in [param.name for param in self.params()]
            param = constraint_updates.get(k, getattr(self, k))
            constraint_updates[param] = T.clip(param, param, v)

        ## clip parameters to minimum values (if applicable)
        for (k,v) in self.clip_min.iteritems():
            assert k in [param.name for param in self.params()]
            param = constraint_updates.get(k, getattr(self, k))
            constraint_updates[param] = T.clip(constraint_updates.get(param, param), v, param)

        return constraint_updates
Example #25
def irprop_minus_updates(params, grads):

    # IRPROP- parameters
    updates = []
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 1.
    minStep = math.exp(-6)

    for param, gparam in zip(params, grads):
        # per-parameter state kept in shared variables: current step size and previous gradient
        delta = theano.shared(0.1 * numpy.ones_like(param.get_value()))
        last_gparam = theano.shared(numpy.zeros_like(param.get_value()))

        # sign of the change: > 0 means the gradient kept its sign, < 0 means it flipped
        change = T.sgn(gparam * last_gparam)
        new_delta = T.switch(T.gt(change, 0),
                             T.minimum(delta * positiveStep, maxStep),
                             T.switch(T.lt(change, 0),
                                      T.maximum(delta * negativeStep, minStep),
                                      delta))
        # after a sign flip, IRPROP- forgets the previous gradient
        new_last_gparam = T.switch(T.lt(change, 0), 0. * gparam, gparam)

        # update the weights and store the new state
        updates.append((param, param - T.sgn(gparam) * new_delta))
        updates.append((delta, new_delta))
        updates.append((last_gparam, new_last_gparam))

    return updates
Example #26
 def softmax(self, D, I):
   D = D * T.constant(self.attrs['sharpening'], 'float32')
   if self.attrs['norm'] == 'exp':
     E = T.exp(-D) * I
     E = E / T.maximum(T.sum(E,axis=0,keepdims=True),T.constant(1e-20,'float32'))
   elif self.attrs['norm'] == 'sigmoid':
     E = (numpy.float32(1) - T.tanh(D)**2) * I
   elif self.attrs['norm'] == 'lstm':
     n_out = self.attrs['template']
     def lstm(z, i_t, s_p, h_p):
       z += T.dot(h_p, self.N_re)
       i = T.outer(i_t, T.alloc(numpy.cast['int8'](1), n_out))
       ingate = T.nnet.sigmoid(z[:,n_out: 2 * n_out])
       forgetgate = T.nnet.sigmoid(z[:,2 * n_out:3 * n_out])
       outgate = T.nnet.sigmoid(z[:,3 * n_out:])
       input = T.tanh(z[:,:n_out])
       s_t = input * ingate + s_p * forgetgate
       h_t = T.tanh(s_t) * outgate
       return theano.gradient.grad_clip(s_t * i, -50, 50), h_t * i
     E, _ = theano.scan(lstm, sequences=[D,I], outputs_info=[T.zeros((n_out,), 'float32'), T.zeros((n_out,), 'int32')])
     E = T.nnet.sigmoid(T.dot(E,self.N_out))
   else:
     raise NotImplementedError()
   if self.attrs['nbest'] > 1:
     opt = T.minimum(self.attrs['nbest'], E.shape[0])
     score = (T.sort(E, axis=0)[-opt]).dimshuffle('x',0).repeat(E.shape[0],axis=0)
     E = T.switch(T.lt(E,score), T.zeros_like(E), E)
   return E
Example #27
def prepare():

    X = T.fmatrix('X')
    y = T.ivector('y')

    assert not ("regression" in args and "logistic" in args)

    if "regression" in args:
        output_layer = squared_error_net_adaptive()
    else:
        output_layer = logistic()

    all_params = lasagne.layers.get_all_params(output_layer)

    if "regression" in args:
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = squared_error(prob_vector, y).mean()
        pred = T.maximum(0, T.minimum( T.round(prob_vector), args["num_classes"]-1 ) )
        accuracy = T.mean( T.eq( pred, y ) )
    else:
        a = args["a"]
        b = args["b"]
        loss_fn = get_hybrid_loss(a,b)
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = loss_fn(prob_vector, y).mean()
        pred = T.argmax( prob_vector, axis=1 )
        accuracy = T.mean( T.eq(pred,y) )

    return Container(
        { "X": X, "y": y, "output_layer": output_layer, "all_params": all_params,
        "loss": loss, "pred": pred, "accuracy": accuracy,
        "prob_vector": prob_vector
        }
    )
Example #28
 def attend(self, y_p):
   updates = self.default_updates()
   for g in range(self.attrs['glimpse']):
     for i in range(len(self.base)-1,-1,-1):
       factor = T.constant(self.base[i].attrs['factor'][0], 'int32') if i > 0 else 1
       B, C, I, h_p, _ = self.get(y_p, i, g)
       if i == len(self.base) - 1:
         z_i = self.distance(C, h_p)
       else:
         length = T.cast(T.max(T.sum(I,axis=0))+1,'int32')
         ext = T.cast(T.minimum(ext/factor,T.min(length)),'int32')
         def pick(i_t, ext):
           pad = T.minimum(i_t+ext, B.shape[0]) - ext
           return T.concatenate([T.zeros((pad,), 'int8'), T.ones((ext,), 'int8'), T.zeros((B.shape[0]-pad-ext+1,), 'int8')], axis=0)
         idx, _ = theano.map(pick, sequences = [pos/factor], non_sequences = [ext])
         idx = (idx.dimshuffle(1,0)[:-1].flatten() > 0).nonzero()
         C = C.reshape((C.shape[0]*C.shape[1],C.shape[2]))[idx].reshape((ext,C.shape[1],C.shape[2]))
         z_i = self.distance(C, h_p)
         I = I.reshape((I.shape[0]*I.shape[1],))[idx].reshape((ext,I.shape[1]))
       if i > 0:
         pos = T.argmax(self.softmax(z_i,I),axis=0) * factor
         ext = factor
       else:
         w_i = self.softmax(z_i,I)
     B = B.reshape((B.shape[0]*B.shape[1],B.shape[2]))[idx].reshape((ext,B.shape[1],B.shape[2]))
     proto = T.sum(B * w_i.dimshuffle(0,1,'x').repeat(B.shape[2],axis=2),axis=0)
     for i in range(len(self.base)):
       self.glimpses[i].append(proto)
   return T.dot(proto, self.custom_vars['W_att_in_0']), updates
Example #29
    def _build_activation(self, act=None):
        '''Given an activation description, return a callable that implements it.
        '''
        def compose(a, b):
            c = lambda z: b(a(z))
            c.__theanets_name__ = '%s(%s)' % (b.__theanets_name__, a.__theanets_name__)
            return c
        act = act or self.args.activation.lower()
        if '+' in act:
            return reduce(compose, (self._build_activation(a) for a in act.split('+')))
        options = {
            'tanh': TT.tanh,
            'linear': lambda z: z,
            'logistic': TT.nnet.sigmoid,
            'softplus': TT.nnet.softplus,

            # shorthands
            'relu': lambda z: TT.maximum(0, z),

            # modifiers
            'rect:max': lambda z: TT.minimum(1, z),
            'rect:min': lambda z: TT.maximum(0, z),

            # normalization
            'norm:dc': lambda z: (z.T - z.mean(axis=1)).T,
            'norm:max': lambda z: (z.T / TT.maximum(1e-10, abs(z).max(axis=1))).T,
            'norm:std': lambda z: (z.T / TT.maximum(1e-10, TT.std(z, axis=1))).T,
            }
        for k, v in options.iteritems():
            v.__theanets_name__ = k
        try:
            return options[act]
        except:
            raise KeyError('unknown --activation %s' % act)
Example #30
    def ready(self):
        # input (where first dimension is time)
        self.x = T.matrix()
        # target (where first dimension is time)
        if self.output_type == 'real':
            self.y = T.matrix(name='y', dtype=theano.config.floatX)
        else:
            raise NotImplementedError
        # initial hidden state of the RNN
        self.h0 = T.vector()
        # learning rate
        self.lr = T.scalar()

        if self.activation == 'tanh':
            activation = T.tanh
        elif self.activation == 'sigmoid':
            activation = T.nnet.sigmoid
        elif self.activation == 'relu':
            activation = lambda x: x * (x > 0)
        elif self.activation == 'cappedrelu':
            activation = lambda x: T.minimum(x * (x > 0), 6)
        else:
            raise NotImplementedError

        self.rnn = RNN(input=self.x, n_in=self.n_in,
                       n_hidden=self.n_hidden, n_out=self.n_out,
                       activation=activation, output_type=self.output_type,
                       use_symbolic_softmax=self.use_symbolic_softmax)

        if self.output_type == 'real':
            self.predict = theano.function(inputs=[self.x, ],
                                           outputs=self.rnn.y_pred,
                                           mode=mode)
        else:
            raise NotImplementedError
Example #31
 def infer_shape(self, node, shapes):
     in_shape, = shapes
     dim1 = in_shape[self.axis1]
     dim2 = in_shape[self.axis2]
     out_shape = [
         d for i, d in enumerate(in_shape)
         if i not in (self.axis1, self.axis2)
     ]
     # The following logic is inspired by C code of PyArray_Diagonal().
     offset = self.offset
     if offset > 0:
         diag_size = T.clip(dim2 - offset, 0, dim1)
     elif offset < 0:
         diag_size = T.clip(dim1 + offset, 0, dim2)
     else:
         diag_size = T.minimum(dim1, dim2)
     out_shape.append(diag_size)
     return [tuple(out_shape)]
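A quick numpy check that the diag_size logic above matches the length numpy computes for a diagonal, for a few (dim1, dim2, offset) combinations:

import numpy as np

def diag_size(dim1, dim2, offset):
    if offset > 0:
        return int(np.clip(dim2 - offset, 0, dim1))
    elif offset < 0:
        return int(np.clip(dim1 + offset, 0, dim2))
    return min(dim1, dim2)

for d1, d2, off in [(3, 5, 0), (3, 5, 2), (5, 3, -2), (3, 5, 7)]:
    assert diag_size(d1, d2, off) == len(np.diagonal(np.zeros((d1, d2)), offset=off))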
Example #32
 def learning_updates(self):
     for param, grad in zip(self.params, self.clipped_gradients()):
         grad_tm1 = self.shared_like(param, 'grad')
         step_tm1 = self.shared_like(param, 'step',
                                     self.learning_rate.value)
         test = grad * grad_tm1
         same = TT.gt(test, 0)
         diff = TT.lt(test, 0)
         step = TT.minimum(
             self.max_step,
             TT.maximum(
                 self.min_step,
                 step_tm1 * (TT.eq(test, 0) + same * self.step_increase +
                             diff * self.step_decrease)))
         grad = grad - diff * grad
         yield param, param - TT.sgn(grad) * step
         yield grad_tm1, grad
         yield step_tm1, step
Example #33
    def compute_steps(self, previous_steps):
        # if not hasattr(self, 'threshold'):
        #    return previous_steps

        adapt_steps_up = self.adapt_steps + 1.0

        # This will quickly converge the estimate for the mean
        cut_rho_mean = tensor.minimum(self.decay,
                                      self.adapt_steps / adapt_steps_up)
        if self.quick_variance_convergence:
            cut_rho_mean2 = cut_rho_mean
        else:
            cut_rho_mean2 = self.decay

        gnorm = l2_norm(previous_steps.values())
        gnorm_log = tensor.log(l2_norm(previous_steps.values()))

        # here we quickly converge the mean
        gnorm_log_ave_up = (cut_rho_mean * self.gnorm_log_ave +
                            (1. - cut_rho_mean) * gnorm_log)

        # this can wait as it starts from 0 anyways!
        gnorm_log2_ave_up = (cut_rho_mean2 * self.gnorm_log2_ave +
                             (1. - cut_rho_mean2) * (gnorm_log**2))

        clip_threshold_up = tensor.exp(gnorm_log_ave_up + tensor.sqrt(
            tensor.maximum(0.0, gnorm_log2_ave_up - gnorm_log_ave_up**2)) *
                                       self.stdevs)

        if self.clip_to_mean:
            clip_level_up = tensor.exp(gnorm_log_ave_up)
        else:
            clip_level_up = clip_threshold_up

        multiplier = tensor.switch(gnorm < clip_threshold_up, 1,
                                   clip_level_up / gnorm)
        steps = OrderedDict((parameter, step * multiplier)
                            for parameter, step in previous_steps.items())

        return steps, [(self.adapt_steps, adapt_steps_up),
                       (self.gnorm_log_ave, gnorm_log_ave_up),
                       (self.gnorm_log2_ave, gnorm_log2_ave_up),
                       (self.clip_threshold, clip_threshold_up),
                       (self.clip_level, clip_level_up)]
Example #34
    def queue_transform(feature_strengths,
                        feature_vects,
                        return_strengths=False):
        """
        Process features according to a "fragmented queue", where each timestep
        gets a size-1 window onto a feature queue. Effectively,
            feature_strengths gives how much to push onto queue
            feature_vects gives what to push on
            pop weights are tied to feature_strengths
            output is a size-1 peek (without popping)

        Parameters:
            - feature_strengths: float32 tensor of shape (batch, push_timestep) in [0,1]
            - feature_vects: float32 tensor of shape (batch, push_timestep, feature_dim)

        Returns:
            - peek_vects: float32 tensor of shape (batch, timestep, feature_dim)
        """
        n_batch, n_time, n_feature = feature_vects.shape

        cum_sum_str = T.extra_ops.cumsum(feature_strengths, 1)

        # We will be working in (batch, timestep, push_timestep)
        # For each timestep, if we subtract out the sum of pushes before that timestep
        # and then cap to 0-1 we get the cumsums for just the features active in that
        # timestep
        timestep_adjustments = T.shape_padright(cum_sum_str -
                                                feature_strengths)
        push_time_cumsum = T.shape_padaxis(cum_sum_str, 1)
        relative_cumsum = push_time_cumsum - timestep_adjustments
        capped_cumsum = T.minimum(T.maximum(relative_cumsum, 0), 1)

        # Now we can recover the peek strengths by taking a diff
        shifted = T.concatenate(
            [T.zeros((n_batch, n_time, 1)), capped_cumsum[:, :, :-1]], 2)
        peek_strengths = capped_cumsum - shifted
        # Peek strengths is now (batch, timestep, push_timestep)

        result = T.batched_dot(peek_strengths, feature_vects)

        if return_strengths:
            return peek_strengths, result
        else:
            return result
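A numpy walk-through of the cumsum/clip/diff trick above for a single batch element, with made-up push strengths:

import numpy as np

strengths = np.array([0.6, 0.8, 0.5])          # how much each timestep pushes
cum = np.cumsum(strengths)
adjust = (cum - strengths)[:, None]            # sum of pushes before each timestep
capped = np.clip(cum[None, :] - adjust, 0, 1)  # capped cumsums per (timestep, push_timestep)
peek = np.diff(np.concatenate([np.zeros((3, 1)), capped], axis=1), axis=1)
print(peek)          # row t holds the peek weights over push timesteps at time t
print(peek.sum(1))   # each row sums to at most 1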
Example #35
def parse_transfer_function(string_identifier, slope_parameter=None):
    """ This function returns the appropriate activation function, as selected by the string argument.
    
    string_identifier: 
        possible values are tanh, ReLU/relu, sigmoid/sig, abs, maxout <number>, linear/lin
    
    RETURNS: 
        transfer_function(python/theano function), string_identifier (normalized), dict (for special cases)
            
    """
    cross_channel_pooling_groups = None

    if string_identifier == 'tanh':
        Activation_f = T.tanh
    elif string_identifier in ['ReLU', 'relu']:  #rectified linear unit
        string_identifier = "relu"
        Activation_f = lambda x: x * (x > 0)
    elif string_identifier in ['sigmoid', 'sig']:
        string_identifier = "sigmoid"
        Activation_f = T.nnet.sigmoid
    elif string_identifier in ['abs', 'Abs', 'absolute']:
        string_identifier = 'abs'
        Activation_f = T.abs_
    elif string_identifier in ['plu', 'PLu', 'PLU',
                               'piecewise']:  #piece-wise linear function
        string_identifier = "PLU"
        print(
            "parse_transfer_function::Remember to optimize the 'slope_parameter'"
        )
        assert slope_parameter is not None, "...and better pass it to this function, as well! (type: Theano.Tensor, shape: same as activation, unif. random values [-1,1] should be fine)"
        Activation_f = lambda x: T.maximum(0, x) + T.minimum(
            0, x) * slope_parameter
    elif "maxout" in string_identifier:
        r = int(string_identifier.split(" ")[1])
        assert r >= 2
        cross_channel_pooling_groups = r
    elif string_identifier in ['linear', "lin"]:
        string_identifier = "linear"
        Activation_f = lambda x: x
    else:
        raise NotImplementedError()
    return Activation_f, string_identifier, {
        "cross_channel_pooling_groups": cross_channel_pooling_groups
    }
Example #36
    def init_fbcorr(self, x, x_shp, n_filters,
            filter_shape,
            min_out=fbcorr_.DEFAULT_MIN_OUT,
            max_out=fbcorr_.DEFAULT_MAX_OUT,
            stride=fbcorr_.DEFAULT_STRIDE,
            mode=fbcorr_.DEFAULT_MODE,
            generate=None):
        # Reference implementation:
        # ../pythor3/pythor3/operation/fbcorr_/plugins/scipy_naive/scipy_naive.py
        if stride != fbcorr_.DEFAULT_STRIDE:
            raise NotImplementedError('stride is not used in reference impl.')
        fake_x = np.empty((x_shp[2], x_shp[3], x_shp[1]),
                x.dtype)
        kerns = self.SLMP._get_filterbank(fake_x,
                dict(n_filters=n_filters,
                    filter_shape=filter_shape,
                    generate=generate))
        kerns = kerns.transpose(0, 3, 1, 2).copy()[:,:,::-1,::-1]
        x = conv.conv2d(
                x,
                kerns,
                image_shape=x_shp,
                filter_shape=kerns.shape,
                border_mode=mode)
        if mode == 'valid':
            x_shp = (x_shp[0], n_filters,
                    x_shp[2] - filter_shape[0] + 1,
                    x_shp[3] - filter_shape[1] + 1)
        elif mode == 'full':
            x_shp = (x_shp[0], n_filters,
                    x_shp[2] + filter_shape[0] - 1,
                    x_shp[3] + filter_shape[1] - 1)
        else:
            raise NotImplementedError('fbcorr mode', mode)

        if min_out is None and max_out is None:
            return x, x_shp
        elif min_out is None:
            return tensor.minimum(x, max_out), x_shp
        elif max_out is None:
            return tensor.maximum(x, min_out), x_shp
        else:
            return tensor.clip(x, min_out, max_out), x_shp
Example #37
    def post_modify_updates(self, updates, model):
        if hasattr(model, 'W'):
            W = model.W
        else:
            if not hasattr(model, 'transformer'):
                raise TypeError("model has neither 'W' nor 'transformer'.")
            transformer = model.transformer
            params = transformer.get_params()
            if len(params) != 1:
                raise TypeError("self.transformer does not have exactly one "
                                "parameter tensor.")
            W, = params

        if W in updates:
            updated_W = updates[W]
            col_norms = T.sqrt(T.square(updated_W).sum(axis=0))
            desired_norms = T.minimum(col_norms, self.limit)
            scale = desired_norms / T.maximum(1e-7, col_norms)
            updates[W] = updated_W * scale
Example #38
    def __init__(self, inputs, input_size, output_size, is_backward=False):
        W_if = U.create_shared(U.initial_weights(input_size, output_size))
        W_ff = U.create_shared(U.initial_weights(output_size, output_size))
        b = U.create_shared(U.initial_weights(output_size))
        initial = U.create_shared(U.initial_weights(output_size))

        self.activation_fn = lambda x: T.minimum(x * (x > 0), 20)

        self.output, _ = theano.scan(
            lambda in_t: theano.scan(
                lambda index, out_tminus1: self.activation_fn(
                    T.dot(out_tminus1, W_ff) + T.dot(in_t[index], W_if) + b),
                sequences=[T.arange(inputs.shape[1])],
                outputs_info=[initial],
                go_backwards=is_backward),
            sequences=[inputs]  # for each sample at time "t"
        )

        self.params = [W_if, W_ff, b]
Example #39
def get_activation(act=None):
    def compose(a, b):
        c = lambda z: b(a(z))
        c.__theanets_name__ = '%s(%s)' % (b.__theanets_name__,
                                          a.__theanets_name__)
        return c

    if '+' in act:
        return functools.reduce(compose,
                                (get_activation(a) for a in act.split('+')))
    options = {
        'tanh': T.tanh,
        'linear': lambda z: z,
        'logistic': T.nnet.sigmoid,
        'sigmoid': T.nnet.sigmoid,
        'hard_sigmoid': T.nnet.hard_sigmoid,
        'softplus': T.nnet.softplus,
        'softmax': softmax,
        'theano_softmax': T.nnet.softmax,

        # shorthands
        'relu': lambda z: T.nnet.relu(z),
        'leaky_relu': lambda z: T.nnet.relu(z, 0.01),
        'trel': lambda z: z * (z > 0) * (z < 1),
        'trec': lambda z: z * (z > 1),
        'tlin': lambda z: z * (abs(z) > 1),

        # modifiers
        'rect:max': lambda z: T.minimum(1, z),
        'rect:min': lambda z: T.maximum(0, z),

        # normalization
        'norm:dc': lambda z: (z.T - z.mean(axis=1)).T,
        'norm:max': lambda z: (z.T / T.maximum(1e-10,
                                               abs(z).max(axis=1))).T,
        'norm:std': lambda z: (z.T / T.maximum(1e-10, T.std(z, axis=1))).T,
    }
    for k, v in options.items():
        v.__theanets_name__ = k
    try:
        return options[act]
    except KeyError:
        raise KeyError('unknown activation %r' % act)
Example #40
def sgd(loss, params, learning_rate, clip_at=5.0, scale_norm=0.0):

    updates = OrderedDict()
    grads = T.grad(cost=loss, wrt=params)
    epsilon = 1e-8

    for p, g in zip(params, grads):
        # if clip_at > 0.0:
        #     grad = clip(g, clip_at)
        # else:
        #     grad = g
        #
        # if scale_norm > 0.0:
        #     grad = scale(grad, scale_norm)
        grad_norm = g.norm(L=2)
        grad = (T.minimum(clip_at, grad_norm) / (grad_norm + epsilon)) * g

        updates[p] = p - learning_rate * grad
    return updates, grads
Example #41
    def __init__(self,
                 n_in=5,
                 hidden_stride=[50],
                 n_out=5,
                 learning_rate=0.01,
                 L1_reg=0.00,
                 L2_reg=0.00,
                 learning_rate_decay=1,
                 activation='tanh',
                 final_momentum=0.9,
                 initial_momentum=0.5,
                 momentum_switchover=5):
        self.n_in = int(n_in)
        self.n_out = int(n_out)
        self.learning_rate = float(learning_rate)
        self.learning_rate_decay = float(learning_rate_decay)
        self.activation = activation
        self.initial_momentum = float(initial_momentum)
        self.final_momentum = float(final_momentum)
        self.momentum_switchover = int(momentum_switchover)

        if self.activation == 'tanh':
            activation = T.tanh
        elif self.activation == 'sigmoid':
            activation = T.nnet.sigmoid
        elif self.activation == 'relu':
            activation = lambda x: x * (x > 0)
        elif self.activation == 'cappedrelu':
            activation = lambda x: T.minimum(x * (x > 0), 6)
        else:
            raise NotImplementedError

        ######################
        # BUILD ACTUAL MODEL #
        ######################
        logger.info('... building the model')
        self.rnn = RNN(n_in,
                       hidden_stride,
                       n_out,
                       activation=activation,
                       L1_reg=L1_reg,
                       L2_reg=L2_reg)
        self.stride_cnt = len(hidden_stride)
Example #42
    def sample_session_batch(self, max_n_samples, replace=False, selector_dtype='int32'):
        """
        returns SessionBatchEnvironment with sessions(observations,actions,rewards)
        that will be sampled uniformly from this session pool.
        if replace=False, the amount of samples is min(max_n_sample, current pool)
        Otherwise it equals max_n_samples
        
        The chosen session ids will be sampled at random using self.rng on each iteration
        p.s. no need to propagate rng updates! It does so by itself. 
        Unless you are calling it inside theano.scan, ofc, but i'd recomment that you didn't.
        unroll_scan works ~probably~ perfectly fine btw
        """
        if replace:
            n_samples = max_n_samples
        else:
            n_samples = T.minimum(max_n_samples, self.pool_size)

        sample_ids = self.rng.choice(size=(n_samples,), a=self.pool_size, dtype=selector_dtype, replace=replace)
        return self.select_session_batch(sample_ids)
Example #43
    def init_opt(self):
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1,
            dtype=theano.config.floatX
        )

        mean_var = ext.new_tensor(
            'mean',
            ndim=2,
            dtype=theano.config.floatX
        )

        log_std_var = ext.new_tensor(
            'log_std',
            ndim=2,
            dtype=theano.config.floatX
        )

        old_dist_info_vars = dict(mean=mean_var, log_std=log_std_var)
        dist_info_vars = self.policy.dist_info_sym(obs_var)
        lr = self.policy.distribution.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)

        surr_loss_vector = TT.minimum(lr * advantage_var,
                                      TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) * advantage_var)
        surr_loss = -TT.mean(surr_loss_vector)

        input_list = [obs_var, action_var, advantage_var, mean_var, log_std_var]

        self.optimizer.update_opt(
            loss=surr_loss,
            target=self.policy,
            inputs=input_list
        )
        return dict()
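The per-sample behaviour of the clipped surrogate term built above, written out in numpy (the epsilon value here is made up; the class reads it from self.epsilon):

import numpy as np

def clipped_surrogate(lr, adv, epsilon=0.2):
    return np.minimum(lr * adv, np.clip(lr, 1 - epsilon, 1 + epsilon) * adv)

print(clipped_surrogate(1.5, +1.0))   # 1.2: a large ratio earns no extra credit for positive advantage
print(clipped_surrogate(0.5, -1.0))   # -0.8: the clipped branch still penalizes shrinking the ratio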
Example #44
    def learning_rate_updates(self, gradients):
        """
        Compute a dictionary of shared variable updates related to annealing
        the learning rate.

        Returns
        -------
        updates : dict
            A dictionary with the shared variables representing SGD metadata
            as keys and a symbolic expression of how they are to be updated as
            values.
        """
        ups = {}

        if self.use_adagrad:
            learn_rates = []
            for param, gp in zip(self.params, gradients):
                acc = self.accumulators[param]
                ups[acc] = acc + (gp**2).sum()
                learn_rates.append(self.e0s[param] / (ups[acc]**.5))
        else:
            # Annealing coefficient. Here we're using a formula of
            # min(base_lr, anneal_start / (iteration + 1))
            if self.anneal_start is None:
                annealed = sharedX(self.base_lr)
            else:
                frac = self.anneal_start / (self.iteration + 1.)
                annealed = tensor.minimum(
                    as_floatX(frac),
                    self.base_lr  # maximum learning rate
                )

            # Update the shared variable for the annealed learning rate.
            ups[self.annealed] = annealed
            ups[self.iteration] = self.iteration + 1

            # Calculate the learning rates for each parameter, in the order
            # they appear in self.params
            learn_rates = [
                annealed * self.learning_rates[p] for p in self.params
            ]
        return ups, learn_rates
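A small NumPy sketch (base_lr and anneal_start are invented for illustration) of the annealing rule in the non-adagrad branch above: the effective rate is min(base_lr, anneal_start / (iteration + 1)), i.e. constant at base_lr until roughly iteration anneal_start / base_lr, then decaying like 1/t.

import numpy as np

base_lr = 0.1         # assumed maximum learning rate
anneal_start = 100.0  # assumed annealing constant

iterations = np.arange(0, 5000, 500)
annealed = np.minimum(base_lr, anneal_start / (iterations + 1.0))
print(annealed)  # flat at 0.1 up to ~iteration 1000, then shrinking like 1/t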
Example #45
0
 def _new_update_deltas(self, network, parameter_vws, grads):
     learning_rate = network.find_hyperparameter(["learning_rate"], 0.001)
     epsilon = network.find_hyperparameter(["epsilon"], 1e-16)
     update_deltas = treeano.UpdateDeltas()
     for parameter_vw, grad in zip(parameter_vws, grads):
         mem_vw = network.create_vw(
             "smorms3_mem(%s)" % parameter_vw.name,
             shape=parameter_vw.shape,
             is_shared=True,
             tags={"state"},
             default_inits=[treeano.inits.ConstantInit(1)],
         )
         g_vw = network.create_vw(
             "smorms3_g(%s)" % parameter_vw.name,
             shape=parameter_vw.shape,
             is_shared=True,
             tags={"state"},
             default_inits=[],
         )
         g2_vw = network.create_vw(
             "smorms3_g2(%s)" % parameter_vw.name,
             shape=parameter_vw.shape,
             is_shared=True,
             tags={"state"},
             default_inits=[],
         )
         parameter = parameter_vw.variable
         mem = mem_vw.variable
         g = g_vw.variable
         g2 = g2_vw.variable
         r = 1 / (mem + 1)
         new_g = (1 - r) * g + r * grad
         new_g2 = (1 - r) * g2 + r * grad**2
         term1 = (new_g**2) / (new_g2 + epsilon)
         term2 = T.sqrt(new_g2) + epsilon
         parameter_delta = -grad * T.minimum(learning_rate, term1) / term2
         new_mem = 1 + mem * (1 - term1)
         update_deltas[parameter] = parameter_delta
         update_deltas[mem] = new_mem - mem
         update_deltas[g] = new_g - g
         update_deltas[g2] = new_g2 - g2
     return update_deltas
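A minimal NumPy sketch (all numbers are hypothetical) of a single step of the SMORMS3-style rule computed above: g and g2 track leaky averages of the gradient and its square with memory 1 / (mem + 1), and the per-element step size is capped at the learning rate via the minimum.

import numpy as np

learning_rate, eps = 0.001, 1e-16
param = np.array([0.5, -0.3])
grad = np.array([0.2, -0.1])   # hypothetical gradient
mem = np.ones_like(param)      # initialized to 1, as in the node above
g = np.zeros_like(param)
g2 = np.zeros_like(param)

r = 1.0 / (mem + 1.0)
new_g = (1 - r) * g + r * grad
new_g2 = (1 - r) * g2 + r * grad ** 2
term1 = new_g ** 2 / (new_g2 + eps)
param = param - grad * np.minimum(learning_rate, term1) / (np.sqrt(new_g2) + eps)
mem = 1 + mem * (1 - term1)    # memory shrinks where the gradient signal is consistent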
Example #46
0
    def ready(self):
        """
        This routine is called from "fit" because the image size (assumed
        square) and the output labels are determined from the training data.

        """
        #input
        self.x = T.matrix('x')
        #output (a label)
        self.y = T.ivector('y')

        if self.activation == 'tanh':
            activation = T.tanh
        elif self.activation == 'sigmoid':
            activation = T.nnet.sigmoid
        elif self.activation == 'relu':
            activation = lambda x: x * (x > 0)
        elif self.activation == 'cappedrelu':
            activation = lambda x: T.minimum(x * (x > 0), 6)
        else:
            raise NotImplementedError

        self.cnn = CNN(input=self.x,
                       n_in=self.n_in,
                       n_out=self.n_out,
                       activation=activation,
                       nkerns=self.nkerns,
                       filters=self.filters,
                       n_hidden=self.n_hidden,
                       poolsize=self.poolsize,
                       output_type=self.output_type,
                       batch_size=self.batch_size,
                       use_symbolic_softmax=self.use_symbolic_softmax)

        #self.cnn.predict expects batch_size number of inputs.
        #we wrap those functions and pad as necessary in 'def predict' and 'def predict_proba'
        self.predict_wrap = theano.function(inputs=[self.x],
                                            outputs=self.cnn.y_pred,
                                            mode=mode)
        self.predict_proba_wrap = theano.function(inputs=[self.x],
                                                  outputs=self.cnn.p_y_given_x,
                                                  mode=mode)
Example #47
0
def minimum(var1, var2, name=None):
    """Compute elementwise min among tensors

    Parameters
    ----------
    var1, var2: Tensor
        Tensors to compare. At least one of them has to be a
        :class:`luchador.nn.theano.wrapper.Tensor` instance.

    name : str
        Name of new Tensor

    Returns
    -------
    Tensor
        The resulting Tensor
    """
    # TODO: Add Broadcasting
    _tensor = T.minimum(var1.unwrap(), var2.unwrap())
    return Tensor(tensor=_tensor, shape=var1.shape, name=name)
Example #48
0
 def optimizer(self):
     if not hasattr(self, '_optimizer'):
         df = self.fvector('A') - self.fvector('B')
         phi = df / (1 + tn.relu(df.norm(2) - 1))
         y = tt.dot(self.samples, phi)
         p = tt.sum(tt.switch(y < 0, 1., 0.))
         q = tt.sum(tt.switch(y > 0, 1., 0.))
         if not hasattr(self, 'avg_case'):
             obj = tt.minimum(tt.sum(1. - tt.exp(-tn.relu(y))),
                              tt.sum(1. - tt.exp(-tn.relu(-y))))
         else:
             obj = p * tt.sum(1. - tt.exp(-tn.relu(y))) + q * tt.sum(
                 1. - tt.exp(-tn.relu(-y)))
         variables = [self.x0]
         for robot in self.robots:
             variables += [robot.x[0]] + robot.u
         for human in self.human.values():
             variables += human.u
         self._optimizer = Maximizer(obj, variables)
     return self._optimizer
Example #49
0
def cat_entropy(arr):
    """Return the entropy of categorical distributions described by the rows
    in ``arr``.

    Parameters
    ----------

    arr : Theano variable
        Array of shape ``(n, d)`` describing ``n`` different categorical
        variables. Rows need to sum up to ``1`` and be non-negative.

    Returns
    -------

    res : theano variable
        Has shape ``(n,)``.
    """
    # TODO check if this is also valid for multinomial.
    arr = T.minimum(1, arr + 1e-8)
    return -(arr * T.log(arr)).sum(axis=1)
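A quick NumPy check (rows invented for the example) of the entropy above: a uniform row over d outcomes gives log(d), a one-hot row gives approximately 0, and the 1e-8 offset with the minimum guard only protects against log(0).

import numpy as np

arr = np.array([[0.25, 0.25, 0.25, 0.25],   # uniform over 4 -> log(4) ~ 1.386
                [1.0, 0.0, 0.0, 0.0]])      # one-hot -> ~0
arr = np.minimum(1, arr + 1e-8)             # same guard against log(0)
print(-(arr * np.log(arr)).sum(axis=1))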
Example #50
0
def relevance_pool(out_relevances, inputs, pool_size, pool_stride):
    # channels x channels x pool_0 x pool_1
    pool_ones_shape = [out_relevances.shape[1], out_relevances.shape[1],
        pool_size[0], pool_size[1]]
    # modification: make inputs positive
    #inputs = T.abs_(inputs)
    # other variant: make inputs positive by offset
    offset = T.minimum(0, T.min(inputs, axis=(1,2,3), keepdims=True))
    inputs = inputs - offset
    pool_ones = T.ones(pool_ones_shape, dtype=np.float32)
    # only within a channel spread values of that channel...
    # therefore set all values of indices like
    # filt_i, channel_j with j!=i to zero!
    pool_ones = pool_ones * T.eye(out_relevances.shape[1],
                              out_relevances.shape[1]).dimshuffle(
                                 0,1,'x','x')
    norms_for_relevances = conv2d(inputs,
        pool_ones, subsample=pool_stride, border_mode='valid')
    # Prevent division by 0: relevance that had norm zero will not be
    # redistributed anyway, so it does not matter which normalization factor
    # is chosen here; the only goal is to avoid NaNs.
    # However, this means the heatmapping is no longer completely
    # relevance-preserving.
    norms_for_relevances += T.eq(norms_for_relevances, 0) * 1
    normed_relevances = out_relevances / norms_for_relevances
    # stride has to be taken into account, see 
    # http://stackoverflow.com/a/28752057/1469195
    upsampled_relevances = T.zeros((normed_relevances.shape[0],
        normed_relevances.shape[1],
        normed_relevances.shape[2] * pool_stride[0] - pool_stride[0] + 1, 
        normed_relevances.shape[3] * pool_stride[1] - pool_stride[1] + 1, 
        ), dtype=np.float32)
    upsampled_relevances = T.set_subtensor(
        upsampled_relevances[:, :, ::pool_stride[0], ::pool_stride[1]], 
        normed_relevances)
    in_relevances = conv2d(upsampled_relevances,
                           pool_ones, subsample=(1,1),
                           border_mode='full')
    in_relevances = in_relevances * inputs
    return in_relevances
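A small NumPy sketch (shapes are invented) of the stride-aware upsampling used above: the normalized relevances are zero-stuffed back onto the input grid by writing them at every pool_stride-th position before the final full convolution.

import numpy as np

pool_stride = (2, 2)
normed = np.arange(4, dtype=np.float32).reshape(1, 1, 2, 2)   # (batch, chan, h, w)
out_h = normed.shape[2] * pool_stride[0] - pool_stride[0] + 1
out_w = normed.shape[3] * pool_stride[1] - pool_stride[1] + 1
upsampled = np.zeros((1, 1, out_h, out_w), dtype=np.float32)
upsampled[:, :, ::pool_stride[0], ::pool_stride[1]] = normed  # zero-stuffing
print(upsampled[0, 0])
# [[0. 0. 1.]
#  [0. 0. 0.]
#  [2. 0. 3.]]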
Example #51
0
def relevance_conv(out_relevances, inputs, weights, rule, bias=None, min_in=None,
        max_in=None, a=None, b=None):
    assert rule in ['w_sqr', 'z', 'z_plus', 'z_b', 'adapt_z_b', 'sign_stable',
        'a_b', 'a_b_sign_switch', 'a_b_abs', 'a_b_in_plus']
    if rule == 'w_sqr':
        return relevance_conv_w_sqr(out_relevances, weights, bias=bias)
    elif rule == 'z_plus':
        return relevance_conv_z_plus(out_relevances, inputs, weights, bias=bias)
    elif rule == 'z_b':
        assert min_in is not None
        assert max_in is not None
        assert min_in <= 0
        assert max_in >= 0
        return relevance_conv_z_b(out_relevances, inputs, weights,
            min_in, max_in, bias=bias)
    elif rule == 'adapt_z_b':
        # clip to zero both min and max to prevent mistakes...
        min_in = T.min(inputs)
        min_in = T.minimum(0, min_in)
        max_in = T.max(inputs)
        max_in = T.maximum(0, max_in)
        return relevance_conv_z_b(out_relevances, inputs, weights,
            min_in, max_in, bias=bias)
    elif rule == 'sign_stable':
        return relevance_conv_stable_sign(inputs, weights, out_relevances,
            bias=bias)
    elif rule == 'a_b':
        return relevance_conv_a_b(inputs, weights, out_relevances, 
            a=a,b=b, bias=bias)
    elif rule == 'z':
        return relevance_conv_z(out_relevances, inputs, weights, 
            bias=bias)
    elif rule == 'a_b_sign_switch':
        return relevance_conv_a_b_sign_switch(inputs, weights, out_relevances, 
            a=a,b=b, bias=bias)
    elif rule == 'a_b_abs':
        return relevance_conv_a_b_abs(inputs, weights, out_relevances, 
            a=a,b=b, bias=bias)
    elif rule == 'a_b_in_plus':
        return relevance_conv_a_b_in_plus(inputs, weights, out_relevances,
            a, b, bias)
Example #52
0
    def __init__(self,
                 rng,
                 input,
                 batch_size,
                 in_size,
                 label_size,
                 latent_size,
                 label_fn,
                 W_y=None,
                 b_y=None,
                 W_a=None,
                 W_b=None):
        self.label_fn = label_fn

        # init parent class
        super(StickBreaking_Encoder_w_Labels,
              self).__init__(rng=rng,
                             input=input,
                             batch_size=batch_size,
                             in_size=in_size,
                             latent_size=latent_size,
                             W_a=W_a,
                             W_b=W_b)

        # setup label prediction params
        if W_y is None:
            W_values = np.asarray(
                0.01 * rng.standard_normal(size=(in_size, label_size)),
                dtype=theano.config.floatX)
            W_y = theano.shared(value=W_values, name='W_y')
        if b_y is None:
            b_values = np.zeros((label_size, ), dtype=theano.config.floatX)
            b_y = theano.shared(value=b_values, name='b_y')
        self.W_y = W_y
        self.b_y = b_y

        # compute the label probabilities
        self.y_probs = self.label_fn(T.dot(self.input, self.W_y) + self.b_y)
        self.y_probs = T.maximum(T.minimum(self.y_probs, 1 - 1e-4),
                                 1e-4)  # Force 0 < output < 1
        self.params += [self.W_y, self.b_y]
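A tiny NumPy sketch (values invented) of the probability clipping above: predicted label probabilities are forced into [1e-4, 1 - 1e-4] so downstream log-likelihood terms never see exactly 0 or 1.

import numpy as np

y_probs = np.array([0.0, 0.3, 1.0])
y_probs = np.maximum(np.minimum(y_probs, 1 - 1e-4), 1e-4)
print(y_probs)   # [0.0001, 0.3, 0.9999]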
Example #53
0
    def build(self):
        """The PyMC model that incorporates Bayesian Statistics in order to store what the likelihood of the model is for a given point."""
        M = pm.Model()

        with M:
            kfwd, endo, activeEndo, kRec, kDeg, sortF = commonTraf()
            nullRates = T.ones(
                6, dtype=np.float64)  # associated with IL2 and IL15
            Tone = T.ones(1, dtype=np.float64)
            k27rev = pm.Lognormal("k27rev", mu=np.log(0.1), sigma=1,
                                  shape=1)  # associated with IL7
            k33rev = pm.Lognormal("k33rev", mu=np.log(0.1), sigma=1,
                                  shape=1)  # associated with IL4

            # constant according to measured number per cell. gc, blank, IL7R, blank, IL4R
            Rexpr = (np.array([0.0, 0.0, 328.0, 0.0, 2591.0, 0.0, 254.0, 0.0])
                     * endo) / (1.0 + ((kRec * (1.0 - sortF)) /
                                       (kDeg * sortF)))

            # indexing same as in model.hpp
            unkVec = T.concatenate(
                (kfwd, nullRates, k27rev, Tone, k33rev, Tone, endo, activeEndo,
                 sortF, kRec, kDeg, Rexpr))

            self.act.calc(
                unkVec, M
            )  # fitting the data based on act.calc for the given parameters

            if self.pretreat is True:
                Y_cross = self.cross.calc(
                    unkVec)  # fitting the data based on cross.calc
                pm.Deterministic("Y_cross", T.sum(T.square(Y_cross)))
                sd_cross = T.minimum(T.std(Y_cross), 0.1)
                pm.Normal(
                    "fitD_cross", sigma=sd_cross,
                    observed=Y_cross)  # the stderr is definitely less than 0.2

            # Save likelihood
            pm.Deterministic("logp", M.logpt)

        return M
Example #54
0
    def _step(self, xg_t, xo_t, xc_t, mask_tm1, h_tm1, c_tm1, u_g, u_o, u_c):

        h_mask_tm1 = mask_tm1 * h_tm1
        c_mask_tm1 = mask_tm1 * c_tm1
        act = T.tensordot(xg_t + h_mask_tm1, u_g, [[1], [2]])
        gate = T.nnet.softmax(act.reshape(
            (-1, act.shape[-1]))).reshape(act.shape)

        c_tilda = self.activation(xc_t + T.dot(h_mask_tm1, u_c))
        ops = [
            c_mask_tm1, c_tilda, (c_mask_tm1 + c_tilda),
            T.maximum(c_mask_tm1, c_tilda),
            T.minimum(c_mask_tm1, c_tilda), c_mask_tm1 - c_tilda,
            c_mask_tm1 * c_tilda, 0 * c_tilda, 0 * c_tilda + 1
        ]
        yshuff = T.as_tensor_variable(ops, name='yshuff').dimshuffle(1, 2, 0)
        c_t = (gate.reshape((-1, gate.shape[-1])) * yshuff.reshape(
            (-1, yshuff.shape[-1]))).sum(axis=1).reshape(gate.shape[:2])
        o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1, u_o))
        h_t = o_t * self.activation(c_t)
        return h_t, c_t
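A small NumPy sketch (sizes and gate values invented) of the gated op pool in the step above: the new cell state is a softmax-weighted mixture over candidate update operations (keep, replace, add, max, min, subtract, multiply, zero, one).

import numpy as np

c_prev = np.array([0.5, -1.0])
c_tilda = np.array([0.2, 0.8])
ops = np.stack([c_prev, c_tilda, c_prev + c_tilda,
                np.maximum(c_prev, c_tilda), np.minimum(c_prev, c_tilda),
                c_prev - c_tilda, c_prev * c_tilda,
                np.zeros_like(c_tilda), np.ones_like(c_tilda)], axis=-1)  # (units, 9)
gate = np.full((2, 9), 1.0 / 9.0)   # a uniform gate, just for illustration
c_t = (gate * ops).sum(axis=-1)
print(c_t)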
Example #55
0
    def ready(self):
        # input (where first dimension is time)
        self.x = T.matrix()
        # target (where first dimension is time)
        if self.output_type == 'real':
            self.y = T.matrix(name='y', dtype=theano.config.floatX)
        else:
            raise NotImplementedError
        # initial hidden state of the RNN
        self.h0 = T.vector()
        # learning rate
        self.lr = T.scalar()

        if self.activation == 'tanh':
            activation = T.tanh
        elif self.activation == 'sigmoid':
            activation = T.nnet.sigmoid
        elif self.activation == 'relu':
            activation = lambda x: x * (x > 0)
        elif self.activation == 'cappedrelu':
            activation = lambda x: T.minimum(x * (x > 0), 6)
        else:
            raise NotImplementedError

        self.rnn = RNN(input=self.x,
                       n_in=self.n_in,
                       n_hidden=self.n_hidden,
                       n_out=self.n_out,
                       activation=activation,
                       output_type=self.output_type,
                       use_symbolic_softmax=self.use_symbolic_softmax)

        if self.output_type == 'real':
            self.predict = theano.function(inputs=[
                self.x,
            ],
                                           outputs=self.rnn.y_pred,
                                           mode=mode)
        else:
            raise NotImplementedError
Example #56
0
    def ready(self):
        # input (where first dimension is time)
        self.x = T.tensor3(name='x')
        # target (where first dimension is time)
        self.y = T.tensor3(name='y', dtype=theano.config.floatX)

        # learning rate
        self.lr = T.scalar()

        if self.activation == 'tanh':
            activation = T.tanh
        elif self.activation == 'sigmoid':
            activation = T.nnet.sigmoid
        elif self.activation == 'relu':
            activation = lambda x: x * (x > 0)
        elif self.activation == 'cappedrelu':
            activation = lambda x: T.minimum(x * (x > 0), 6)
        else:
            raise NotImplementedError

        # generate numpy rng
        numpy_rng = np.random.RandomState(self.numpy_rng_seed)

        self.estimator = RNN(input=self.x, n_in=self.n_in,
                             n_hidden=self.n_hidden, n_out=self.n_out,
                             truncated_num=self.truncated_num,
                             activation=activation, numpy_rng=numpy_rng)

        self.predict = theano.function(inputs=[self.x, ],
                                       outputs=self.estimator.y_pred,
                                       mode=mode)
        
        # get time stamp
        date_obj = datetime.datetime.now()
        date_str = date_obj.strftime('%Y%m%d-%H%M%S')
        self.timestamp = date_str
        
        # initialize errorlog
        self.errorlog = []
Example #57
0
    def input_row_from_variables(ori_ip, dest_ip, ori_lat, ori_long, dest_lat,
                                 dest_long, ori_type, dest_type, dist):
        '''Create an input row for the MLP from the inputs'''

        input_row = tensor.zeros([input_size])

        offset = 0

        ips = [ori_ip, dest_ip]
        for ip in ips:
            for _ in range(4):
                input_row = add_one_shot(input_row, offset,
                                         tensor.mod(ip, 256))
                ip = tensor.int_div(ip, 256)
                offset += 256

        for lat_, long_ in [(ori_lat, ori_long), (dest_lat, dest_long)]:
            translated_lat = tensor.iround(
                (coordinate_size - 1) * (lat_ / 180 + 0.5))
            input_row = add_thermo(input_row, offset, translated_lat)
            offset += coordinate_size

            translated_long = tensor.iround(
                (coordinate_size - 1) * (long_ / 360 + 0.5))
            input_row = add_thermo(input_row, offset, translated_long)
            offset += coordinate_size

        for type_ in [ori_type, dest_type]:
            input_row = add_one_shot(input_row, offset, type_ + 1)
            offset += type_size

        translated_dist = tensor.iround(
            (dest_size - 1) * (tensor.minimum(1, dist / max_earth_distance)))
        input_row = add_thermo(input_row, offset, translated_dist)

        #could be useful if we want to add something
        offset += dest_size

        return input_row
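A minimal NumPy sketch of thermometer encoding as used above (the helper name thermometer_encode is hypothetical; the original relies on an add_thermo helper that is not shown): a scalar is scaled into a fixed-size slot and written as a run of leading ones.

import numpy as np

def thermometer_encode(value, low, high, size):
    """Encode a scalar in [low, high] as `size` bits with the first k+1 set."""
    k = int(round((size - 1) * (value - low) / (high - low)))
    row = np.zeros(size, dtype=np.float32)
    row[:k + 1] = 1.0
    return row

print(thermometer_encode(0.0, -180.0, 180.0, 8))   # [1. 1. 1. 1. 1. 0. 0. 0.]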
Example #58
0
def prepare():

    X = T.fmatrix('X')
    y = T.ivector('y')

    assert not ("regression" in args and "logistic" in args)

    if "regression" in args:
        output_layer = squared_error_net_adaptive()
    else:
        output_layer = logistic()

    all_params = lasagne.layers.get_all_params(output_layer)

    if "regression" in args:
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = squared_error(prob_vector, y).mean()
        pred = T.maximum(
            0, T.minimum(T.round(prob_vector), args["num_classes"] - 1))
        accuracy = T.mean(T.eq(pred, y))
    else:
        a = args["a"]
        b = args["b"]
        loss_fn = get_hybrid_loss(a, b)
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = loss_fn(prob_vector, y).mean()
        pred = T.argmax(prob_vector, axis=1)
        accuracy = T.mean(T.eq(pred, y))

    return Container({
        "X": X,
        "y": y,
        "output_layer": output_layer,
        "all_params": all_params,
        "loss": loss,
        "pred": pred,
        "accuracy": accuracy,
        "prob_vector": prob_vector
    })
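A tiny NumPy sketch (values invented) of the regression-style prediction above: the network output is rounded and clamped into the valid label range [0, num_classes - 1].

import numpy as np

num_classes = 5
prob_vector = np.array([-0.6, 2.6, 7.2])
pred = np.maximum(0, np.minimum(np.round(prob_vector), num_classes - 1))
print(pred)   # [0. 3. 4.]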
Example #59
0
def relu_activation(x, leak_slope=0., clip_threshold=None, **kwargs):
    # Reference:
    # Nair, Vinod, and Geoffrey E. Hinton. "Rectified linear units improve restricted boltzmann machines."
    # In Proceedings of the 27th International Conference on Machine Learning (ICML-10), pp. 807-814. 2010.
    #
    # softplus can in turn be approximated by a simple max operation, max(0, x + N(0, sigmoid(x))). The Gaussian noise
    # component is added because softplus behaves like a noisy, integer-valued version of a smoothed rectified linear unit.
    # The variance of this noise is sigmoid(x) and does not become large for large x. This can be simplified further by
    # using max(0, x) instead. This function is known as a Rectified Linear unit (ReL). It has some advantages:
    #   - No vanishing gradient at +inf, like softplus
    #   - Induces sparsity in activations
    #   - Empirical results indicate deep networks can be trained effectively with ReL units (ReLU)
    #   - Can be used by RBMs to model real/integer valued inputs
    assert hasattr(T.nnet, 'relu'), ('It looks like your version of '
                                     'Theano is out of date. '
                                     'Install the latest version with:\n'
                                     'pip install git+git://github.com/Theano/Theano.git --upgrade --no-deps')
    assert leak_slope is not None, "Leak slope cannot be None"
    x = T.nnet.relu(x, leak_slope)
    if clip_threshold is not None:
        x = T.minimum(x, clip_threshold)
    return x
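A quick NumPy sketch (inputs invented) of the capped leaky ReLU computed above: negative inputs are scaled by leak_slope and positive inputs are clipped at clip_threshold by the final minimum.

import numpy as np

def capped_leaky_relu(x, leak_slope=0.1, clip_threshold=6.0):
    x = np.where(x > 0, x, leak_slope * x)   # leaky rectification
    return np.minimum(x, clip_threshold)     # upper clip

print(capped_leaky_relu(np.array([-2.0, 0.5, 10.0])))   # [-0.2  0.5  6. ]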
Example #60
0
def FeedforwardBatchNormalization(x,
                                  gamma,
                                  mask,
                                  estimated_mean=0.0,
                                  estimated_var=1.0):
    assert x.ndim == 3
    if mask:
        assert mask.ndim == 2
        mask = mask.dimshuffle(0, 1, 'x')

        mask_nonzeros = T.sum(T.sum(mask, axis=0), axis=0)
        mask_nonzeros_weight = T.cast(
            T.minimum(1.0, T.sum(mask, axis=0)) / mask.shape[1], 'float32')

        x_masked = x * mask

        x_mean = (T.sum(T.sum(x_masked, axis=0), axis=0) /
                  mask_nonzeros).dimshuffle('x', 'x', 0)
        ## why do we need mask_nonzeros_weight
        x_mean_adjusted = mask_nonzeros_weight * x_mean + (
            1.0 - mask_nonzeros_weight) * estimated_mean
        x_zero_mean = x - x_mean_adjusted

        x_var = (T.sum(T.sum(x_zero_mean**2, axis=0), axis=0) /
                 mask_nonzeros).dimshuffle('x', 'x', 0)
        x_var_adjusted = mask_nonzeros_weight * x_var + (
            1.0 - mask_nonzeros_weight) * estimated_var

    else:
        x_mean = estimated_mean.dimshuffle('x', 'x', 0)
        x_mean_adjusted = x_mean

        x_zero_mean = x - x_mean

        x_var = estimated_var.dimshuffle('x', 'x', 0)
        x_var_adjusted = x_var

    return gamma * (x_zero_mean / T.sqrt(x_var_adjusted + 1e-7)
                    ), x_mean_adjusted[0, 0], x_var_adjusted[0, 0]
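A small NumPy check (shapes and values invented) of the masked statistics above: with a (time, batch, features) input and a (time, batch) mask, the mean is taken only over unmasked positions.

import numpy as np

x = np.arange(12, dtype=np.float32).reshape(3, 2, 2)         # (time, batch, feat)
mask = np.array([[1, 1], [1, 0], [0, 0]], dtype=np.float32)  # (time, batch)
mask3 = mask[:, :, None]

mask_nonzeros = mask3.sum(axis=0).sum(axis=0)                # count of unmasked steps
x_mean = (x * mask3).sum(axis=0).sum(axis=0) / mask_nonzeros
print(x_mean)   # [2. 3.], the mean over the three unmasked (time, batch) positions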