Example #1
    def fprop(self,
              state_before,
              mem_before,
              cell_before,
              forget_below,
              input_below,
              output_below,
              state_below):

        state_fork_outs = self.state_before_fork_layer.fprop(state_before)
        mem_fork_outs = self.mem_before_fork_layer.fprop(mem_before)

        # Each gate sums its input projection (`*_below`) with the forked
        # projections of the previous cell (`mem_before`) and hidden state.
        inp = Sigmoid(input_below + mem_fork_outs[self.mbf_names[1]] +
                state_fork_outs[self.sbf_names[1]])

        output = Sigmoid(output_below + mem_fork_outs[self.mbf_names[2]] +
                state_fork_outs[self.sbf_names[2]])

        forget = Sigmoid(forget_below + mem_fork_outs[self.mbf_names[0]] +
                state_fork_outs[self.sbf_names[0]])

        cell = Tanh(state_below + mem_fork_outs[self.mbf_names[3]] +
                state_fork_outs[self.sbf_names[3]])

        c_t = inp * cell + forget * cell_before  # new cell state
        h_t = output * self.activ(c_t)           # new hidden state

        return h_t, c_t
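
For reference, here is the same gate arithmetic as a minimal NumPy sketch. The fork layers are replaced by plain recurrent weight matrices U_i, U_f, U_o, U_c (illustrative assumptions, not names from the class above), and the peephole-style terms from mem_before are omitted:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(inp_b, forget_b, out_b, cell_b, h_prev, c_prev, U_i, U_f, U_o, U_c):
    # `*_b` play the role of the `*_below` input projections above;
    # `U_*` stand in for the fork layers applied to `state_before`.
    i = sigmoid(inp_b + h_prev @ U_i)        # input gate
    f = sigmoid(forget_b + h_prev @ U_f)     # forget gate
    o = sigmoid(out_b + h_prev @ U_o)        # output gate
    c_tilde = np.tanh(cell_b + h_prev @ U_c) # candidate cell
    c_t = i * c_tilde + f * c_prev           # new cell state
    h_t = o * np.tanh(c_t)                   # new hidden state
    return h_t, c_t
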
Example #2
    def _step(mask, sbelow, sbefore, cell_before):
        preact = dot(sbefore, param('U'))
        preact += sbelow
        preact += tparams[prfx(prefix, 'b')]

        f = Sigmoid(_slice(preact, 0, dim))   # forget gate
        o = Sigmoid(_slice(preact, 1, dim))   # output gate
        c = Tanh(_slice(preact, 2, dim))      # candidate cell

        # Coupled gates: the input gate is tied to the forget gate as (1 - f).
        c = f * cell_before + (1 - f) * c
        # Where mask == 0 (padding), carry the previous cell/state forward.
        c = mask * c + (1. - mask) * cell_before
        h = o * tensor.tanh(c)
        h = mask * h + (1. - mask) * sbefore

        return h, c
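
The `_slice` helper used in Examples #2-#6 is not shown above; a typical definition (sketched here as an assumption) simply picks the n-th dim-wide block of the concatenated gate pre-activations:

def _slice(_x, n, dim):
    # Select the n-th chunk of width `dim` along the last axis.
    if _x.ndim == 3:
        return _x[:, :, n * dim:(n + 1) * dim]
    return _x[:, n * dim:(n + 1) * dim]
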
Example #3
    def _step(mask, sbelow, sbefore, cell_before, *args):
        preact = dot(sbefore, param('U'))
        preact += sbelow
        preact += param('b')

        i = Sigmoid(_slice(preact, 0, dim))   # input gate
        f = Sigmoid(_slice(preact, 1, dim))   # forget gate
        o = Sigmoid(_slice(preact, 2, dim))   # output gate
        c = Tanh(_slice(preact, 3, dim))      # candidate cell

        c = f * cell_before + i * c
        c = mask * c + (1. - mask) * cell_before
        h = o * tensor.tanh(c)
        h = mask * h + (1. - mask) * sbefore

        return h, c
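
A toy NumPy illustration of the masking lines used in Examples #2 and #3: wherever mask is zero (padding), the previous state is carried forward unchanged.

import numpy as np

mask = np.array([[1.], [0.]])     # batch of two, second sample is padding
h_new = np.array([[0.3], [0.9]])
h_prev = np.array([[0.1], [0.5]])
h = mask * h_new + (1. - mask) * h_prev
print(h)                          # [[0.3], [0.5]] -> padded sample keeps its old state
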
Example #4
    def _step_slice(mask, sbelow, sbelowx, sbefore, U, Ux):
        preact = dot(sbefore, U)
        preact += sbelow

        r = Sigmoid(_slice(preact, 0, dim))   # reset gate
        u = Sigmoid(_slice(preact, 1, dim))   # update gate

        preactx = dot(r * sbefore, Ux)
        preactx = preactx + sbelowx

        h = Tanh(preactx)

        h = u * sbefore + (1. - u) * h
        h = mask[:, None] * h + (1. - mask)[:, None] * sbefore

        return h
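
The same GRU arithmetic as a minimal NumPy sketch (U and Ux as in the step above; x_gates and x_cand stand in for sbelow and sbelowx, and mask handling is omitted):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x_gates, x_cand, h_prev, U, Ux, dim):
    preact = h_prev @ U + x_gates
    r = sigmoid(preact[:, :dim])             # reset gate
    u = sigmoid(preact[:, dim:2 * dim])      # update gate
    h_tilde = np.tanh((r * h_prev) @ Ux + x_cand)
    return u * h_prev + (1. - u) * h_tilde   # mix of old and candidate state
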
Example #5
    def fprop(self,
              state_before,
              mem_before,
              reset_below,
              gater_below,
              state_below,
              context=None,
              use_noise=None):

        state_reset = self.state_reset_before_proj.fprop(state_before)
        state_gater = self.state_gater_before_proj.fprop(state_before)
        membf_state = self.state_mem_before_proj.fprop(mem_before)

        if self.use_layer_norm:
            state_reset = self.reset_layer_norm_bf.fprop(state_reset)
            state_gater = self.update_layer_norm_bf.fprop(state_gater)
            membf_state = self.mem_layer_norm_bf.fprop(membf_state)

            reset_below = self.reset_layer_norm_inp.fprop(reset_below)
            gater_below = self.update_layer_norm_inp.fprop(gater_below)
            state_below = self.ht_layer_norm_inp.fprop(state_below)

        reset = Sigmoid(reset_below + state_reset)   # reset gate
        state_state = self.state_str_before_proj.fprop(reset * state_before)

        if self.use_layer_norm:
            state_state = self.ht_layer_norm_bf.fprop(state_state)

        gater = Sigmoid(gater_below + state_gater)   # update gate

        if context is not None:
            h = self.activ(state_state + membf_state + state_below + context)
        else:
            h = self.activ(state_state + membf_state + state_below)

        if self.dropout:
            h = self.dropout(h,
                             use_noise=use_noise)

        h_t = (1. - gater) * state_before + gater * h  # mix old and new state

        return h_t
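
The *_layer_norm_* sub-layers above are not shown; a minimal sketch of the normalization they apply (the per-feature gain g and bias b are learned parameters, and the names here are assumptions):

import numpy as np

def layer_norm(x, g, b, eps=1e-5):
    # Normalize each row to zero mean / unit variance, then rescale and shift.
    mean = x.mean(axis=-1, keepdims=True)
    std = x.std(axis=-1, keepdims=True)
    return g * (x - mean) / (std + eps) + b
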
Example #6
    def _step_slice(mask,
                    sbelow,
                    sbelowx,
                    xc_, sbefore,
                    ctx_, alpha_,
                    pctx_, cc_,
                    U, Wc,
                    Wd_att, U_att,
                    c_tt, Ux, Wcx):
        # attention
        pstate_ = dot(sbefore, Wd_att)
        pctx__ = pctx_ + pstate_[None, :, :]
        pctx__ += xc_
        pctx__ = Tanh(pctx__)
        alpha = dot(pctx__, U_att) + c_tt
        alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
        alpha = tensor.exp(alpha)
        if context_mask is not None:
            alpha = alpha * context_mask

        alpha = alpha / alpha.sum(0, keepdims=True)
        ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current (expected) context

        preact = dot(sbefore, U)
        preact += sbelow
        preact += dot(ctx_, Wc)
        preact = Sigmoid(preact)

        r = _slice(preact, 0, dim)   # reset gate
        u = _slice(preact, 1, dim)   # update gate

        preactx = dot(sbefore, Ux)
        preactx *= r
        preactx += sbelowx
        preactx += dot(ctx_, Wcx)

        h = Tanh(preactx)

        h = u * sbefore + (1. - u) * h
        h = mask[:, None] * h + (1. - mask)[:, None] * sbefore

        return h, ctx_, alpha.T
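
The attention weights above are an exp-and-normalize over the source (time) axis; a standalone NumPy sketch of that masked softmax, with a max-subtraction added for numerical stability that the original omits:

import numpy as np

def masked_softmax_over_time(scores, context_mask=None):
    # scores: (src_len, batch); normalize over the source axis (axis 0).
    e = np.exp(scores - scores.max(axis=0, keepdims=True))
    if context_mask is not None:
        e = e * context_mask
    return e / e.sum(axis=0, keepdims=True)
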
Example #7
def NReLU(x, rng=None, use_noise=False):
    # Noisy ReLU: add Gaussian noise whose std is Sigmoid(x), then rectify.
    assert rng is not None
    if use_noise:
        stds = Sigmoid(x)
        x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
    return Trect(x)
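
A NumPy sketch of the same noisy rectifier; treating Trect as a plain ReLU is an assumption here:

import numpy as np

def nrelu(x, rng, use_noise=False):
    if use_noise:
        stds = 1.0 / (1.0 + np.exp(-x))                      # Sigmoid(x)
        x = x + rng.normal(loc=0.0, scale=stds, size=x.shape)
    return np.maximum(x, 0.0)                                 # rectifier

# Example: nrelu(np.array([-1.0, 2.0]), np.random.default_rng(0), use_noise=True)
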
Example #8
    def fprop(self,
              state_below,
              memory,
              w_t_before,
              w_t_pre_before=None,
              time_idxs=None):

        if time_idxs is None:
            logger.info("Time indices were not provided; using the default time_idxs.")
            time_idxs = self.time_idxs

        fork_outs = self.state_fork_layer.fprop(state_below)
        idx = 0
        # First things first, content based addressing:
        if not self.use_local_att:
            beta_pre = fork_outs[self.names[0]]
            beta = TT.nnet.softplus(beta_pre).reshape((beta_pre.shape[0],))

            if (state_below.ndim != beta.ndim and beta.ndim == 2
                    and state_below.ndim == 3):
                beta = beta.reshape((state_below.shape[0], state_below.shape[1]))
            elif (state_below.ndim != beta.ndim and beta.ndim == 1
                    and state_below.ndim == 2):
                beta = beta.reshape((state_below.shape[0],))
            else:
                raise ValueError("Unknown shape for beta!")
            beta = TT.shape_padright(beta)
            idx = 1

        key_pre = fork_outs[self.names[idx]]
        idx += 1
        key_t = key_pre
        sim_vals = self.mem_similarity(key_t, memory)

        weights = sim_vals
        new_pre_weights = None

        if self.smoothed_diff_weights:
            dw_scaler = fork_outs[self.names[idx]]
            dw_scaler = TT.addbroadcast(dw_scaler, 1)
            weights = sim_vals - Sigmoid(dw_scaler) * w_t_pre_before
            new_pre_weights = self.mem_weight_decay * sim_vals + \
                    (1 - self.mem_weight_decay) * w_t_pre_before
            idx += 1
        std = 5

        """
        if self.use_local_att:
            mean = as_floatX(self.mem_nel) * Sigmoid(weights*self.mean_pred.fprop(state_below))
            exp_ws = -(time_idxs - mean)**2 / (2.0 * std)
            weights = exp_ws * weights
        """

        if self.use_local_att:
            w_tc = softmax3(weights) if weights.ndim == 3 else TT.nnet.softmax(weights)
        else:
            if weights.ndim == 3 and beta.ndim == 2:
                beta = beta.dimshuffle('x', 0, 1)
                w_tc = softmax3(weights * beta)
            else:
                # Content based weights:
                w_tc = TT.nnet.softmax(weights * beta)

        if self.use_local_att:
            first_loc_layer = Tanh(self.state_below_local.fprop(state_below) +\
                    self.weights_below_local.fprop(weights))
            mean = as_floatX(self.mem_nel) * Sigmoid(self.mean_pred.fprop(first_loc_layer))
            mean = TT.addbroadcast(mean, 1)
            exp_ws = TT.exp(-((time_idxs - mean)**2) / (2.0 * std))
            w_tc = exp_ws * w_tc
            w_tc = w_tc / w_tc.sum(axis=1, keepdims=True)

        if self.use_loc_based_addressing:
            # Location based addressing:
            g_t_pre = fork_outs[self.names[idx]]
            g_t = Sigmoid(g_t_pre).reshape((g_t_pre.shape[0],))

            if (state_below.ndim != g_t.ndim and g_t.ndim == 2
                    and state_below.ndim == 3):
                g_t = g_t.reshape((state_below.shape[0], state_below.shape[1]))
            elif (state_below.ndim != g_t.ndim and g_t.ndim == 1
                    and state_below.ndim == 2):
                g_t = g_t.reshape((state_below.shape[0],))
            else:
                raise ValueError("Unknown shape for g_t!")

            g_t = TT.shape_padright(g_t)
            w_tg = g_t * w_tc + (1 - g_t) * w_t_before
            shifts_pre = fork_outs[self.names[idx + 1]]

            if shifts_pre.ndim == 2:
                if self.use_multiscale_shifts:

                    if self.use_scale_layer:
                        scales = TT.exp(self.scale_layer.fprop(state_below))
                        scales = scales.dimshuffle(0, 'x', 1)
                    else:
                        scales = TT.exp(TT.arange(self.scale_size).dimshuffle('x', 'x', 0))

                    shifts_pre = shifts_pre.reshape((state_below.shape[0],
                                                     -1,
                                                     self.scale_size))

                    shifts_pre = (shifts_pre * scales).sum(-1)

                    if self.shift_width >= 0:
                        shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))

                elif self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, self.shift_width, 1))
                else:
                    shifts_pre = shifts_pre.reshape(
                        (state_below.shape[0], self.mem_nel))

                if state_below.ndim == 3:
                    shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
                    shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True).dimshuffle(0, 'x', 'x')
                else:
                    shifts_pre = shifts_pre.dimshuffle(0, 1)
                    shifts_pre = shifts_pre - shifts_pre.max(1, keepdims=True)
                    shifts_pre = shifts_pre.dimshuffle(0, 1, 'x')
            elif shifts_pre.ndim == 1:
                if self.use_multiscale_shifts:
                    if self.use_scale_layer:
                        scales = TT.exp(self.scale_layer.fprop(state_below))
                    else:
                        scales = TT.exp(TT.arange(self.scale_size))

                    shifts_pre = shifts_pre.reshape((-1, self.scale_size))
                    shifts_pre = (shifts_pre * scales).sum(-1)
                    if self.shift_width >= 0:
                        shifts_pre = shifts_pre.reshape((-1, 1))
                elif self.shift_width >= 0:
                    shifts_pre = shifts_pre.reshape((-1, 1))
                else:
                    shifts_pre = shifts_pre.reshape((self.mem_nel,))

                if state_below.ndim == 2:
                    shifts_pre = TT.shape_padright(shifts_pre)
                    shifts_pre = shifts_pre - shifts_pre.max(0, keepdims=True)

            shifts = TT.exp(shifts_pre)
            if shifts.ndim == 2:
                shifts = shifts / shifts.sum(axis=0, keepdims=True)
            elif shifts.ndim == 3:
                shifts = shifts / shifts.sum(axis=1, keepdims=True)

            CC = CircularConvolveAdvIndexing if self.use_adv_indexing else\
                    CircularConvolve

            w_t_hat = CC()(weights=w_tg, shifts=shifts,
                           mem_size=self.mem_nel,
                           shift_width=self.shift_width)

            if self.use_reinforce:
                if w_t_hat.ndim == 2:
                    w_t = TT.nnet.softmax(w_t_hat)
                elif w_t_hat.ndim == 3:
                    w_t = softmax3(w_t_hat)
            else:
                gamma_pre = fork_outs[self.names[4]]
                assert w_t_hat.ndim == gamma_pre.ndim, ("The number of dimensions of "
                                                        "w_t_hat and gamma_pre should "
                                                        "be the same.")

                if gamma_pre.ndim != 1:
                    gamma_pre = gamma_pre.reshape((gamma_pre.shape[0],))

                gamma_pre = TT.shape_padright(gamma_pre)
                gamma = TT.nnet.softplus(gamma_pre) + const(1)

                w_t = (abs(w_t_hat + const(1e-16))**gamma) + const(1e-42)
                if (state_below.ndim != w_t.ndim and w_t.ndim == 2
                        and state_below.ndim == 3):
                    w_t = w_t.reshape((state_below.shape[0], state_below.shape[1]))
                    w_t = w_t.dimshuffle(0, 1, 'x')
                elif (state_below.ndim != w_t.ndim and w_t.ndim == 1
                        and state_below.ndim == 2):
                    w_t = w_t.reshape((state_below.shape[0],))
                    w_t = w_t.dimshuffle(0, 'x')

                if w_t.ndim == 2 or w_t.ndim == 3:
                    w_t = w_t / (w_t.sum(axis=-1, keepdims=True) + const(1e-6))
        else:
            w_t = w_tc

        return [w_t], [new_pre_weights]
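
As a rough NumPy sketch of the content-based addressing core that this fprop builds on (cosine similarity against each memory row, sharpened by beta and normalized with a softmax), ignoring the smoothing and location-based machinery:

import numpy as np

def content_addressing(key, memory, beta):
    # key: (dim,), memory: (n_cells, dim), beta: scalar sharpening factor.
    sims = memory @ key / (
        np.linalg.norm(memory, axis=1) * np.linalg.norm(key) + 1e-8)
    e = np.exp(beta * sims - (beta * sims).max())
    return e / e.sum()   # attention weights over memory cells
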
Example #9
    def fprop(self, inps=None, leak_rate=0.05, use_noise=False, mdl_name=None):

        self.build_model(use_noise=use_noise, mdl_name=mdl_name)
        self.ntm.evaluation_mode = use_noise
        if not inps:
            inps = self.inps

        # First two are X and targets
        # assert (2 + sum([use_mask, use_cmask])) + 1 >= len(inps), \
        #    "inputs have illegal shape."
        cmask = None
        mask = None
        if isinstance(inps, list):
            X = inps[0]
            y = inps[1]

            if self.use_mask:
                mask = inps[2]
                if self.use_cost_mask:
                    cmask = inps[3]
        else:
            X = inps['X']
            y = inps['y']
            if self.use_mask:
                mask = inps['mask']
                if self.use_cost_mask:
                    cmask = inps['cmask']

        if self.use_cost_mask:
            if cmask is not None:
                if self.use_bow_cost_mask:
                    if mask.ndim == cmask.ndim:
                        m = (mask * TT.eq(cmask, 0)).reshape(
                            (cmask.shape[0] * cmask.shape[1], -1))
                    else:
                        m = (mask.dimshuffle(0, 1, 'x') *
                             TT.eq(cmask, 0))[:, :, 0].reshape(
                                 (cmask.shape[0] * cmask.shape[1], -1))
                else:
                    m = mask
            else:
                raise ValueError("Mask for the answers should not be empty.")

        if X.ndim == 2 and y.ndim == 1:
            # For sequential MNIST.
            if self.permute_order:
                X = X.dimshuffle(1, 0)
                idxs = self.rnd_indxs
                X = X[idxs]
            inp_shp = (X.shape[0], X.shape[1], -1)
        else:
            inp_shp = (X.shape[1], X.shape[2], -1)

        self.ntm_in = None
        if self.use_bow_input and not self.use_gru_inp_rep and not self.use_simple_rnn_inp_rep:

            bow_out = self.bow_layer.fprop(X,
                                           amask=m,
                                           deterministic=not use_noise)
            bow_out = bow_out.reshape((X.shape[1], X.shape[2], -1))
            self.ntm_in = bow_out

        elif self.use_gru_inp_rep:
            m0 = as_floatX(TT.gt(X, 0))
            if self.use_mask and self.use_cost_mask:
                if cmask is not None:
                    m1 = mask * TT.eq(cmask, 0)
                else:
                    raise ValueError(
                        "Mask for the answers should not be empty.")

            low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
            Xr = X.reshape(low_inp_shp)
            grufact_inps = self.gru_fact_layer_inps.fprop(Xr)
            # dict .values() is not indexable in Python 3, so materialize it once.
            grufact_vals = list(grufact_inps.values())
            low_reset_below = grufact_vals[0].reshape(low_inp_shp)
            low_gater_below = grufact_vals[1].reshape(low_inp_shp)
            low_state_below = grufact_vals[2].reshape(low_inp_shp)
            linps = [low_reset_below, low_gater_below, low_state_below]

            m0_part = TT.cast(
                m0.sum(0).reshape(
                    (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'), 'float32')
            m0_part = TT.switch(TT.eq(m0_part, as_floatX(0)), as_floatX(1),
                                m0_part)

            h0 = self.gru_fact_layer.fprop(inps=linps,
                                           mask=m0,
                                           batch_size=self.batch_size)

            self.ntm_in = m1.dimshuffle(0, 1, 'x') * \
                ((m0.dimshuffle(0, 1, 2, 'x') *
                  h0.reshape((X.shape[0], X.shape[1], X.shape[2], -1))).sum(0) /
                 m0_part).reshape(inp_shp)
        elif self.use_simple_rnn_inp_rep:
            m0 = as_floatX(TT.gt(X, 0))
            if cmask is not None:
                m1 = mask * TT.eq(cmask, 0)
            else:
                raise ValueError("Mask for the answers should not be empty.")

            low_inp_shp = (X.shape[0], X.shape[1] * X.shape[2], -1)
            Xr = X.reshape(low_inp_shp)
            rnnfact_inps = self.rnn_fact_layer_inps.fprop(Xr).reshape(
                low_inp_shp)
            m0 = m0.reshape(low_inp_shp)

            h0 = self.rnn_fact_layer.fprop(inps=rnnfact_inps,
                                           mask=m0,
                                           batch_size=self.batch_size)

            m0_part = TT.cast(
                m0.sum(0).reshape(
                    (X.shape[1], X.shape[2])).dimshuffle(0, 1, 'x'), 'float32')
            # Use TT.eq for the elementwise comparison; the Python `==` operator
            # does not build a symbolic comparison on Theano tensors.
            m0_part = TT.switch(TT.eq(m0_part, as_floatX(0)), as_floatX(1), m0_part)
            self.ntm_in = m1.dimshuffle(0, 1, 'x') * \
                (h0.reshape((X.shape[0], X.shape[1], X.shape[2], -1)).sum(0) /
                 m0_part).reshape(inp_shp)

        else:
            X_proj = self.inp_proj_layer.fprop(X)
            if not self.learn_embeds:
                X_proj = block_gradient(X_proj)

            if self.use_batch_norm:
                X_proj = self.batch_norm_layer.fprop(X_proj,
                                                     inference=not use_noise)
            self.ntm_in = X_proj

        context = None
        if self.use_context:
            if self.use_qmask:
                context = (self.qmask.dimshuffle(0, 1, 'x') *
                           self.ntm_in).sum(0)
            else:
                m1_part = m1.sum(0).dimshuffle(0, 'x')
                context = self.ntm_in.sum(0) / m1_part

        self.ntm_outs = self.ntm.fprop(self.ntm_in,
                                       mask=mask,
                                       cmask=cmask,
                                       context=context,
                                       batch_size=self.batch_size,
                                       use_mask=self.use_mask,
                                       use_noise=not use_noise)

        h, m_read = self.ntm_outs[0], self.ntm_outs[2]

        if self.use_reinforce:
            self.w_samples, self.r_samples = self.ntm_outs[-2], self.ntm_outs[-1]

            if self.smoothed_diff_weights:
                idx = -6
            else:
                idx = -4

            self.write_weights, self.read_weights = self.ntm_outs[idx], \
                    self.ntm_outs[idx+1]
        else:
            self.write_weights, self.read_weights = self.ntm_outs[3], self.ntm_outs[4]

        if self.anticorrelation:
            acorr = AntiCorrelationConstraint(level=self.anticorrelation)
            rw1 = self.read_weights[:, 0]
            rw2 = self.read_weights[:, 1]
            self.reg += acorr(rw1, rw2, mask=mask)

        if self.correlation_ws:
            logger.info("Applying the correlation constraint.")
            corr_cons = CorrelationConstraint(level=self.correlation_ws)
            self.reg += corr_cons(self.read_weights, self.write_weights, mask,
                                  self.qmask)

        if self.use_last_hidden_state:
            h = h.reshape(inp_shp)
            h = h[-1]

        if self.use_deepout:
            merged_out = self.merge_layer.fprop([h, m_read])
            out_layer = Leaky_Rect(merged_out, leak_rate)

            if self.dropout:
                dropOp = Dropout(dropout_prob=self.dropout)
                out_layer = dropOp(out_layer, deterministic=not use_noise)

            out_layer = self.out_layer.fprop(out_layer,
                                             deterministic=not use_noise)
        else:
            if self.use_out_mem:
                if self.dropout:
                    dropOp = Dropout(dropout_prob=self.dropout)
                    m_read = dropOp(m_read, deterministic=not use_noise)

                mem_out = self.out_mem.fprop(m_read,
                                             deterministic=not use_noise)
                mem_scaler = self.out_scaler.fprop(
                    h, deterministic=not use_noise).reshape(
                        (mem_out.shape[0], )).dimshuffle(0, 'x')

                h_out = self.out_layer.fprop(h, deterministic=not use_noise)
                out_layer = h_out + mem_out * Sigmoid(mem_scaler)
            else:
                if self.dropout:
                    dropOp = Dropout(dropout_prob=self.dropout)
                    h = dropOp(h, deterministic=not use_noise)
                out_layer = self.out_layer.fprop(h,
                                                 deterministic=not use_noise)

        if self.predict_bow_out and self.bow_out_layer:
            logger.info("Using the bow output prediction.")
            self.bow_pred_out = Sigmoid(
                self.bow_out_layer.fprop(h, deterministic=not use_noise))

        if self.softmax:
            self.probs = Softmax(out_layer)
        else:
            self.probs = Sigmoid(out_layer)

        if self.ntm.updates:
            self.updates.update(self.ntm.updates)

        self.str_params(logger)

        self.h = h
        return self.probs, self.ntm_outs
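
A toy NumPy illustration of the cost-mask combination used near the top of this fprop: the token mask is multiplied by the complement of the answer mask, so only valid, non-answer timesteps survive (shapes simplified to 1-D):

import numpy as np

mask = np.array([1., 1., 1., 0.])    # valid timesteps
cmask = np.array([0., 0., 1., 0.])   # timesteps belonging to the answer
m = mask * (cmask == 0)              # valid timesteps that are not answers
print(m)                             # [1. 1. 0. 0.]
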