Ejemplo n.º 1
0
    def _predict_k_star(self, k_star, x_star):
        """
        Predict one test sample using algorithm (3.4) from GPML
        and assuming shared covariance matrix among all latent functions.

        Parameters
        ----------
        k_star : ndarray
            Kernel evaluations between the training inputs and the test
            input, k(X, x*).
        x_star : ndarray
            The test input itself; only used to compute k(x*, x*).

        Returns
        -------
        ndarray of shape (C,)
            Monte-Carlo estimate of the class probabilities: latent values
            f* are sampled from N(mu, Sigma) and softmax(f*) is averaged.
        """
        # shortcut: number of latent functions / output classes
        C = self._n_outputs

        # Predictive mean of the latent values (the (y - pi)^T k* term of
        # GPML algorithm 3.4).
        mu = (self._y - self.pi_).T.dot(k_star)
        Sigma = []
        k_star_star = self._kernel(x_star, x_star)
        for c_ in range(C):  # fixed: `xrange` is Python-2-only
            b = self._e[c_] * k_star
            # _t = solve_triangular(self._M, b)
            # _t2 = solve_triangular(self._M, _t, trans='T')
            # Diagonal approximation of the triangular solves above; the
            # 1e-8 floor guards against division by (near-)zero.
            _t2 = b / np.maximum(sum(self._e), 1e-8 * np.ones_like(self._e[0]))
            c = self._e[c_] * _t2
            sigma_row = [c.dot(k_star)] * C
            # Diagonal entry gets the extra k(x*,x*) - b^T k* variance term.
            sigma_row[c_] += (k_star_star - b.dot(k_star))
            Sigma.append(sigma_row)
        Sigma = np.asarray(Sigma)
        # NOTE(review): `self.n_samples` (no underscore) is referenced here
        # while the rest of the class uses `self._n_samples` — confirm this
        # is a deliberate, separate Monte-Carlo sample-count attribute and
        # not a typo.
        f_star = self._rng.multivariate_normal(mu, Sigma, size=self.n_samples)
        pi_star = softmax(f_star)
        return np.mean(pi_star, axis=0)
Ejemplo n.º 2
0
def rnn_cell_forward(Xt,h_prev,parameters):
    '''
    Run a single forward step of a vanilla RNN cell.

    Input:
        - Xt: (N,D) N=2000 D=28
        - h_prev: (N,H) #of neurons in the hidden state. "prev" is actually for timestep "t-1"
        - parameters:
            : Wx: Weight matrix multiplying the input Xt, (D, H)
            : Wh: Weight matrix multiplying the hidden state (H,H)
            : Wy: Weight matrix relating to the hidden-state. Shape is (H,M) # M = 10
            : bh: Bias, (1, H)
            : by: Bias, (1, M)
    Returns:
    - h_next: next hidden state (N, H)
    - yt_pred: prediction at timestep t, (N, M)
    - cache : tuple of values needed for the back-propagation part, has shape (h_next, h_prev, Xt, parameters)
    '''
    # Unpack the weight matrices and biases in one go.
    Wx, Wh, Wy = parameters["Wx"], parameters["Wh"], parameters["Wy"]
    bh, by = parameters["bh"], parameters["by"]

    # Hidden-state update: tanh of the affine combination of the current
    # input and the previous hidden state.
    h_next = tanh(np.dot(Xt, Wx) + np.dot(h_prev, Wh) + bh)
    # Class-probability prediction for this timestep.
    yt_pred = softmax(np.dot(h_next, Wy) + by)
    # Keep everything the backward pass will need.
    cache = (h_next, h_prev, Xt, parameters)

    return h_next, yt_pred, cache
Ejemplo n.º 3
0
    def __init__(self):
        """Initialise an empty model: no layers, empty loss history, no
        cost function, plus name->object registries for activations,
        cost functions and layer types."""
        self.layers = []
        self.history = {"loss": []}
        self.cost = None

        # Config-string name -> instantiated activation function.
        self.activation_funcs = dict(
            relu=activation.relu(),
            softmax=activation.softmax(),
            sigmoid=activation.sigmoid(),
            linear=activation.identity(),
            tanh=activation.tanh(),
            swish=activation.swish(),
            lrelu=activation.lrelu(),
        )

        # Config-string name -> cost/error object (keys contain spaces,
        # so a literal dict is required here).
        self.cost_funcs = {
            "squared loss": error.SquaredError(),
            "cross entropy": error.CrossEntropy(),
        }

        # Config-string name -> layer class (instantiated later, not here).
        self.layer_types = {"dense": layers.Dense}
Ejemplo n.º 4
0
    def build_decoder(self, query_tokens, query_token_embed,
                      query_token_embed_mask, mask):
        """
        Compile the two Theano functions used at decoding (inference) time.

        Side effects:
          - ``self.decoder_func_init``: encodes the query tokens once,
            returning [query_embed, query_token_embed_mask].
          - ``self.decoder_func_next_step``: performs a single decoder LSTM
            step, returning [decoder_next_state, decoder_next_cell,
            rule_prob, gen_action_prob, vocab_prob, copy_prob].

        All arguments are symbolic Theano variables; ``mask`` restricts the
        vocabulary softmax so that masked-out tokens receive (near-)zero
        probability.
        """
        logging.info('building decoder ...')

        # mask = ndim_itensor(2, 'mask')

        # (batch_size, decoder_state_dim)
        decoder_prev_state = ndim_tensor(2, name='decoder_prev_state')

        # (batch_size, decoder_state_dim)
        decoder_prev_cell = ndim_tensor(2, name='decoder_prev_cell')

        # (batch_size, n_timestep, decoder_state_dim)
        hist_h = ndim_tensor(3, name='hist_h')

        # (batch_size, decoder_state_dim)
        prev_action_embed = ndim_tensor(2, name='prev_action_embed')

        # (batch_size)
        node_id = T.ivector(name='node_id')

        # (batch_size, node_embed_dim)
        node_embed = self.node_embedding[node_id]

        # (batch_size)
        par_rule_id = T.ivector(name='par_rule_id')

        # (batch_size, decoder_state_dim)
        # A negative par_rule_id means "no parent rule": the switch selects
        # a zero vector instead of a rule embedding.
        par_rule_embed = T.switch(par_rule_id[:, None] < 0,
                                  T.alloc(0., 1, config.rule_embed_dim),
                                  self.rule_embedding_W[par_rule_id])

        # ([time_step])
        time_steps = T.ivector(name='time_steps')

        # (batch_size)
        parent_t = T.ivector(name='parent_t')

        # (batch_size, 1)
        parent_t_reshaped = T.shape_padright(parent_t)

        # mask = ndim_itensor(2, 'mask')

        query_embed = self.query_encoder_lstm(query_token_embed,
                                              mask=query_token_embed_mask,
                                              dropout=config.dropout,
                                              train=False)

        # (batch_size, 1, decoder_state_dim)
        prev_action_embed_reshaped = prev_action_embed.dimshuffle((0, 'x', 1))

        # (batch_size, 1, node_embed_dim)
        node_embed_reshaped = node_embed.dimshuffle((0, 'x', 1))

        # (batch_size, 1, node_embed_dim)
        par_rule_embed_reshaped = par_rule_embed.dimshuffle((0, 'x', 1))

        # Ablation switches: zero out feeds that are disabled in config.
        if not config.frontier_node_type_feed:
            node_embed_reshaped *= 0.

        if not config.parent_action_feed:
            par_rule_embed_reshaped *= 0.

        decoder_input = T.concatenate([
            prev_action_embed_reshaped, node_embed_reshaped,
            par_rule_embed_reshaped
        ],
                                      axis=-1)

        # (batch_size, 1, decoder_state_dim)
        # (batch_size, 1, decoder_state_dim)
        # (batch_size, 1, field_token_encode_dim)
        decoder_next_state_dim3, decoder_next_cell_dim3, ctx_vectors = self.decoder_lstm(
            decoder_input,
            init_state=decoder_prev_state,
            init_cell=decoder_prev_cell,
            hist_h=hist_h,
            context=query_embed,
            context_mask=query_token_embed_mask,
            parent_t_seq=parent_t_reshaped,
            dropout=config.dropout,
            train=False,
            time_steps=time_steps)

        # Drop the singleton time axis: (batch_size, decoder_state_dim).
        decoder_next_state = decoder_next_state_dim3.flatten(2)
        # decoder_output = decoder_next_state * (1 - DECODER_DROPOUT)

        decoder_next_cell = decoder_next_cell_dim3.flatten(2)

        decoder_next_state_trans_rule = self.decoder_hidden_state_W_rule(
            decoder_next_state)
        decoder_next_state_trans_token = self.decoder_hidden_state_W_token(
            T.concatenate([decoder_next_state,
                           ctx_vectors.flatten(2)],
                          axis=-1))

        # Distribution over grammar rules.
        rule_prob = softmax(
            T.dot(decoder_next_state_trans_rule,
                  T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

        # Two-way choice: generate a vocabulary token vs. copy from source.
        gen_action_prob = self.terminal_gen_softmax(decoder_next_state)

        # vocab_prob = softmax(T.dot(decoder_next_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)
        logits = T.dot(decoder_next_state_trans_token,
                       T.transpose(
                           self.vocab_embedding_W)) + self.vocab_embedding_b
        # vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)
        # Masked softmax over the vocabulary: positions with mask == 0 are
        # replaced by (row minimum - 1), so after softmax they get
        # negligible probability compared to the unmasked positions.
        test = T.dot((T.min(logits, axis=1, keepdims=True) - 1),
                     (1 - mask).reshape((1, mask.shape[1])))
        vocab_prob = softmax(logits * mask + test)
        # vocab_prob = softmax(
        #     logits.transpose(1, 0, 2) * mask + (T.min(logits.transpose(1, 0, 2), axis=1, keepdims=True) - 1) * (
        #     1 - mask)).transpose(1, 0, 2)

        ptr_net_decoder_state = T.concatenate(
            [decoder_next_state_dim3, ctx_vectors], axis=-1)

        # Pointer-network distribution over source-query positions.
        copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask,
                                     ptr_net_decoder_state)

        copy_prob = copy_prob.flatten(2)

        # One-off encoder function: query tokens -> encoded query + mask.
        inputs = [query_tokens]
        outputs = [query_embed, query_token_embed_mask]

        self.decoder_func_init = theano.function(inputs, outputs)

        # Per-step decoder function: previous decoder state/cell, history,
        # frontier-node info and the encoded query -> next state and all
        # action-probability distributions.
        inputs = [
            time_steps, decoder_prev_state, decoder_prev_cell, hist_h,
            prev_action_embed, node_id, par_rule_id, parent_t, query_embed,
            query_token_embed_mask, mask
        ]

        outputs = [
            decoder_next_state, decoder_next_cell, rule_prob, gen_action_prob,
            vocab_prob, copy_prob
        ]

        self.decoder_func_next_step = theano.function(inputs, outputs)
Ejemplo n.º 5
0
    def build(self):
        """
        Build the training computation graph and compile ``self.train_func``.

        Constructs symbolic inputs for the target action sequence (rule /
        vocab-token / copy actions), runs the query encoder and the
        conditional decoder LSTM, computes per-action probabilities, and
        minimises the negative masked log-likelihood. Finally delegates to
        ``self.build_decoder`` to compile the inference-time functions.
        """
        # (batch_size, max_example_action_num, action_type)
        tgt_action_seq = ndim_itensor(3, 'tgt_action_seq')

        # (batch_size, max_example_action_num, action_type)
        tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type')

        # (batch_size, max_example_action_num)
        tgt_node_seq = ndim_itensor(2, 'tgt_node_seq')

        # (batch_size, max_example_action_num)
        tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq')

        # (batch_size, max_example_action_num)
        tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq')

        # (batch_size, max_example_action_num, symbol_embed_dim)
        # tgt_node_embed = self.node_embedding(tgt_node_seq, mask_zero=False)
        tgt_node_embed = self.node_embedding[tgt_node_seq]

        # (batch_size, max_query_length)
        query_tokens = ndim_itensor(2, 'query_tokens')

        # Vocabulary mask, broadcastable over its first axis (a single row
        # shared by the whole batch) — presumably shape (1, vocab_size);
        # used below to restrict the vocabulary softmax.
        mask = T.TensorType(dtype='int32',
                            name='mask',
                            broadcastable=(True, False))()

        # (batch_size, max_query_length, query_token_embed_dim)
        # (batch_size, max_query_length)
        query_token_embed, query_token_embed_mask = self.query_embedding(
            query_tokens, mask_zero=True)

        # if WORD_DROPOUT > 0:
        #     logging.info('used word dropout for source, p = %f', WORD_DROPOUT)
        #     query_token_embed, query_token_embed_intact = WordDropout(WORD_DROPOUT, self.srng)(query_token_embed, False)

        batch_size = tgt_action_seq.shape[0]
        max_example_action_num = tgt_action_seq.shape[1]

        # previous action embeddings
        # (batch_size, max_example_action_num, action_embed_dim)
        # Column 0 > 0 means the action was a rule application (use the rule
        # embedding); otherwise use the vocab-token embedding from column 1.
        tgt_action_seq_embed = T.switch(
            T.shape_padright(tgt_action_seq[:, :, 0] > 0),
            self.rule_embedding_W[tgt_action_seq[:, :, 0]],
            self.vocab_embedding_W[tgt_action_seq[:, :, 1]])

        # Shift right by one step so the decoder sees the *previous* action.
        tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed)

        # parent rule application embeddings
        # Negative ids mean "no parent rule" and map to a zero vector.
        tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0,
                                      T.alloc(0., 1, config.rule_embed_dim),
                                      self.rule_embedding_W[tgt_par_rule_seq])

        # Ablation switches: zero out feeds that are disabled in config.
        if not config.frontier_node_type_feed:
            tgt_node_embed *= 0.

        if not config.parent_action_feed:
            tgt_par_rule_embed *= 0.

        # (batch_size, max_example_action_num, action_embed_dim + symbol_embed_dim + action_embed_dim)
        decoder_input = T.concatenate(
            [tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed],
            axis=-1)

        # (batch_size, max_query_length, query_embed_dim)
        query_embed = self.query_encoder_lstm(query_token_embed,
                                              mask=query_token_embed_mask,
                                              dropout=config.dropout,
                                              srng=self.srng)

        # (batch_size, max_example_action_num)
        # A timestep is real (unpadded) iff any action-type flag is set.
        tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1)

        # decoder_hidden_states: (batch_size, max_example_action_num, lstm_hidden_state)
        # ctx_vectors: (batch_size, max_example_action_num, encoder_hidden_dim)
        decoder_hidden_states, _, ctx_vectors = self.decoder_lstm(
            decoder_input,
            context=query_embed,
            context_mask=query_token_embed_mask,
            mask=tgt_action_seq_mask,
            parent_t_seq=tgt_par_t_seq,
            dropout=config.dropout,
            srng=self.srng)

        # if DECODER_DROPOUT > 0:
        #     logging.info('used dropout for decoder output, p = %f', DECODER_DROPOUT)
        #     decoder_hidden_states = Dropout(DECODER_DROPOUT, self.srng)(decoder_hidden_states)

        # ====================================================
        # apply additional non-linearity transformation before
        # predicting actions
        # ====================================================

        decoder_hidden_state_trans_rule = self.decoder_hidden_state_W_rule(
            decoder_hidden_states)
        decoder_hidden_state_trans_token = self.decoder_hidden_state_W_token(
            T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1))

        # (batch_size, max_example_action_num, rule_num)
        rule_predict = softmax(
            T.dot(decoder_hidden_state_trans_rule,
                  T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

        # (batch_size, max_example_action_num, 2)
        terminal_gen_action_prob = self.terminal_gen_softmax(
            decoder_hidden_states)

        # (batch_size, max_example_action_num, target_vocab_size)
        logits = T.dot(decoder_hidden_state_trans_token,
                       T.transpose(
                           self.vocab_embedding_W)) + self.vocab_embedding_b
        # vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)
        # Masked softmax over the vocabulary: masked-out entries are pushed
        # below the minimum logit (min - 1) so they get negligible
        # probability; the transposes route broadcasting of the row mask.
        vocab_predict = softmax(
            logits.transpose(1, 0, 2) * mask +
            (T.min(logits.transpose(1, 0, 2), axis=1, keepdims=True) - 1) *
            (1 - mask)).transpose(1, 0, 2)
        # (batch_size, max_example_action_num, lstm_hidden_state + encoder_hidden_dim)
        ptr_net_decoder_state = T.concatenate(
            [decoder_hidden_states, ctx_vectors], axis=-1)

        # (batch_size, max_example_action_num, max_query_length)
        copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask,
                                     ptr_net_decoder_state)

        # (batch_size, max_example_action_num)
        # Probability assigned to the gold rule at each timestep (fancy
        # indexing over batch x timestep x rule-id).
        rule_tgt_prob = rule_predict[
            T.shape_padright(T.arange(batch_size)),
            T.shape_padleft(T.arange(max_example_action_num)),
            tgt_action_seq[:, :, 0]]

        # (batch_size, max_example_action_num)
        vocab_tgt_prob = vocab_predict[
            T.shape_padright(T.arange(batch_size)),
            T.shape_padleft(T.arange(max_example_action_num)),
            tgt_action_seq[:, :, 1]]

        # (batch_size, max_example_action_num)
        copy_tgt_prob = copy_prob[
            T.shape_padright(T.arange(batch_size)),
            T.shape_padleft(T.arange(max_example_action_num)),
            tgt_action_seq[:, :, 2]]

        # (batch_size, max_example_action_num)
        # Mix the three action channels according to the gold action type;
        # gen/copy channels are additionally weighted by the gen-vs-copy
        # probability.
        tgt_prob = tgt_action_seq_type[:, :, 0] * rule_tgt_prob + \
                   tgt_action_seq_type[:, :, 1] * terminal_gen_action_prob[:, :, 0] * vocab_tgt_prob + \
                   tgt_action_seq_type[:, :, 2] * terminal_gen_action_prob[:, :, 1] * copy_tgt_prob

        # The 1e-7 term keeps log() finite on padded (masked-out) steps.
        likelihood = T.log(tgt_prob + 1.e-7 * (1 - tgt_action_seq_mask))
        loss = -(likelihood * tgt_action_seq_mask).sum(
            axis=-1)  # / tgt_action_seq_mask.sum(axis=-1)
        loss = T.mean(loss)

        # let's build the function!
        train_inputs = [
            query_tokens, tgt_action_seq, tgt_action_seq_type, tgt_node_seq,
            tgt_par_rule_seq, tgt_par_t_seq, mask
        ]
        optimizer = optimizers.get(config.optimizer)
        optimizer.clip_grad = config.clip_grad
        updates, grads = optimizer.get_updates(self.params, loss)
        self.train_func = theano.function(
            train_inputs,
            [loss],
            # [loss, tgt_action_seq_type, tgt_action_seq,
            #  rule_tgt_prob, vocab_tgt_prob, copy_tgt_prob,
            #  copy_prob, terminal_gen_action_prob],
            updates=updates)

        # if WORD_DROPOUT > 0:
        #     self.build_decoder(query_tokens, query_token_embed_intact, query_token_embed_mask)
        # else:
        #     self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)

        self.build_decoder(query_tokens, query_token_embed,
                           query_token_embed_mask, mask)
Ejemplo n.º 6
0
    def build_decoder(self, query_tokens, query_token_embed,
                      query_token_embed_mask, query_tokens_phrase,
                      query_tokens_pos, query_tokens_canon_id):
        """
        Compile the two Theano functions used at decoding (inference) time.

        This variant augments the query-token embeddings with extra
        per-token features (phrase, POS and canonical-id inputs) before
        running the query encoder.

        Side effects:
          - ``self.decoder_func_init``: encodes the (augmented) query,
            returning [query_embed, query_token_embed_mask].
          - ``self.decoder_func_next_step``: one decoder LSTM step,
            returning [decoder_next_state, decoder_next_cell, rule_prob,
            gen_action_prob, vocab_prob, copy_prob].
        """
        logging.info('building decoder ...')

        # (batch_size, decoder_state_dim)
        decoder_prev_state = ndim_tensor(2, name='decoder_prev_state')

        # (batch_size, decoder_state_dim)
        decoder_prev_cell = ndim_tensor(2, name='decoder_prev_cell')

        # (batch_size, n_timestep, decoder_state_dim)
        hist_h = ndim_tensor(3, name='hist_h')

        # (batch_size, decoder_state_dim)
        prev_action_embed = ndim_tensor(2, name='prev_action_embed')

        # (batch_size)
        node_id = T.ivector(name='node_id')

        # (batch_size, node_embed_dim)
        node_embed = self.node_embedding[node_id]

        # (batch_size)
        par_rule_id = T.ivector(name='par_rule_id')

        # (batch_size, decoder_state_dim)
        # A negative par_rule_id means "no parent rule": the switch selects
        # a zero vector instead of a rule embedding.
        par_rule_embed = T.switch(par_rule_id[:, None] < 0,
                                  T.alloc(0., 1, config.rule_embed_dim),
                                  self.rule_embedding_W[par_rule_id])

        # ([time_step])
        time_steps = T.ivector(name='time_steps')

        # (batch_size)
        parent_t = T.ivector(name='parent_t')

        # (batch_size, 1)
        parent_t_reshaped = T.shape_padright(parent_t)

        # concatenate query_token_embed with query_tokens_phrase and query_tokens_pos
        # (batch_size, max_query_length, query_embed_dim + 2)
        new_query_token_embed = self.concatenate(query_token_embed,
                                                 query_tokens_phrase,
                                                 query_tokens_pos,
                                                 query_tokens_canon_id)

        query_embed = self.query_encoder_lstm(new_query_token_embed,
                                              mask=query_token_embed_mask,
                                              dropout=config.dropout,
                                              train=False)

        # (batch_size, 1, decoder_state_dim)
        prev_action_embed_reshaped = prev_action_embed.dimshuffle((0, 'x', 1))

        # (batch_size, 1, node_embed_dim)
        node_embed_reshaped = node_embed.dimshuffle((0, 'x', 1))

        # (batch_size, 1, node_embed_dim)
        par_rule_embed_reshaped = par_rule_embed.dimshuffle((0, 'x', 1))

        # Ablation switches: zero out feeds that are disabled in config.
        if not config.frontier_node_type_feed:
            node_embed_reshaped *= 0.

        if not config.parent_action_feed:
            par_rule_embed_reshaped *= 0.

        decoder_input = T.concatenate([
            prev_action_embed_reshaped, node_embed_reshaped,
            par_rule_embed_reshaped
        ],
                                      axis=-1)

        # (batch_size, 1, decoder_state_dim)
        # (batch_size, 1, decoder_state_dim)
        # (batch_size, 1, field_token_encode_dim)
        decoder_next_state_dim3, decoder_next_cell_dim3, ctx_vectors = self.decoder_lstm(
            decoder_input,
            init_state=decoder_prev_state,
            init_cell=decoder_prev_cell,
            hist_h=hist_h,
            context=query_embed,
            context_mask=query_token_embed_mask,
            parent_t_seq=parent_t_reshaped,
            dropout=config.dropout,
            train=False,
            time_steps=time_steps)

        # Drop the singleton time axis: (batch_size, decoder_state_dim).
        decoder_next_state = decoder_next_state_dim3.flatten(2)
        # decoder_output = decoder_next_state * (1 - DECODER_DROPOUT)

        decoder_next_cell = decoder_next_cell_dim3.flatten(2)

        decoder_next_state_trans_rule = self.decoder_hidden_state_W_rule(
            decoder_next_state)
        decoder_next_state_trans_token = self.decoder_hidden_state_W_token(
            T.concatenate([decoder_next_state,
                           ctx_vectors.flatten(2)],
                          axis=-1))

        # Distribution over grammar rules.
        rule_prob = softmax(
            T.dot(decoder_next_state_trans_rule,
                  T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

        # Two-way choice: generate a vocabulary token vs. copy from source.
        gen_action_prob = self.terminal_gen_softmax(decoder_next_state)

        # Unmasked softmax over the target vocabulary.
        vocab_prob = softmax(
            T.dot(decoder_next_state_trans_token,
                  T.transpose(self.vocab_embedding_W)) +
            self.vocab_embedding_b)

        ptr_net_decoder_state = T.concatenate(
            [decoder_next_state_dim3, ctx_vectors], axis=-1)

        # Pointer-network distribution over source-query positions.
        copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask,
                                     ptr_net_decoder_state)

        copy_prob = copy_prob.flatten(2)

        # One-off encoder function: tokens + extra features -> encoded query.
        inputs = [
            query_tokens, query_tokens_phrase, query_tokens_pos,
            query_tokens_canon_id
        ]
        outputs = [query_embed, query_token_embed_mask]

        self.decoder_func_init = theano.function(inputs,
                                                 outputs,
                                                 allow_input_downcast=True,
                                                 on_unused_input='ignore')

        # Per-step decoder function: previous decoder state/cell, history,
        # frontier-node info and the encoded query -> next state and all
        # action-probability distributions.
        inputs = [
            time_steps, decoder_prev_state, decoder_prev_cell, hist_h,
            prev_action_embed, node_id, par_rule_id, parent_t, query_embed,
            query_token_embed_mask
        ]

        outputs = [
            decoder_next_state, decoder_next_cell, rule_prob, gen_action_prob,
            vocab_prob, copy_prob
        ]

        self.decoder_func_next_step = theano.function(inputs, outputs)
Ejemplo n.º 7
0
    def build(self):
        """
        Build the training computation graph and compile ``self.train_func``.

        Plain variant without a vocabulary mask: constructs symbolic inputs
        for the target action sequence (rule / vocab-token / copy actions),
        runs the query encoder and the conditional decoder LSTM, computes
        per-action probabilities, and minimises the negative masked
        log-likelihood. Finally delegates to ``self.build_decoder`` to
        compile the inference-time functions.
        """
        # (batch_size, max_example_action_num, action_type)
        tgt_action_seq = ndim_itensor(3, 'tgt_action_seq')

        # (batch_size, max_example_action_num, action_type)
        tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type')

        # (batch_size, max_example_action_num)
        tgt_node_seq = ndim_itensor(2, 'tgt_node_seq')

        # (batch_size, max_example_action_num)
        tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq')

        # (batch_size, max_example_action_num)
        tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq')

        # (batch_size, max_example_action_num, symbol_embed_dim)
        # tgt_node_embed = self.node_embedding(tgt_node_seq, mask_zero=False)
        tgt_node_embed = self.node_embedding[tgt_node_seq]

        # (batch_size, max_query_length)
        query_tokens = ndim_itensor(2, 'query_tokens')

        # (batch_size, max_query_length, query_token_embed_dim)
        # (batch_size, max_query_length)
        query_token_embed, query_token_embed_mask = self.query_embedding(query_tokens, mask_zero=True)

        # if WORD_DROPOUT > 0:
        #     logging.info('used word dropout for source, p = %f', WORD_DROPOUT)
        #     query_token_embed, query_token_embed_intact = WordDropout(WORD_DROPOUT, self.srng)(query_token_embed, False)

        batch_size = tgt_action_seq.shape[0]
        max_example_action_num = tgt_action_seq.shape[1]

        # previous action embeddings
        # (batch_size, max_example_action_num, action_embed_dim)
        # Column 0 > 0 means the action was a rule application (use the rule
        # embedding); otherwise use the vocab-token embedding from column 1.
        tgt_action_seq_embed = T.switch(T.shape_padright(tgt_action_seq[:, :, 0] > 0),
                                        self.rule_embedding_W[tgt_action_seq[:, :, 0]],
                                        self.vocab_embedding_W[tgt_action_seq[:, :, 1]])

        # Shift right by one step so the decoder sees the *previous* action.
        tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed)

        # parent rule application embeddings
        # Negative ids mean "no parent rule" and map to a zero vector.
        tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0,
                                      T.alloc(0., 1, config.rule_embed_dim),
                                      self.rule_embedding_W[tgt_par_rule_seq])

        # Ablation switches: zero out feeds that are disabled in config.
        if not config.frontier_node_type_feed:
            tgt_node_embed *= 0.

        if not config.parent_action_feed:
            tgt_par_rule_embed *= 0.

        # (batch_size, max_example_action_num, action_embed_dim + symbol_embed_dim + action_embed_dim)
        decoder_input = T.concatenate([tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed], axis=-1)

        # (batch_size, max_query_length, query_embed_dim)
        query_embed = self.query_encoder_lstm(query_token_embed, mask=query_token_embed_mask,
                                              dropout=config.dropout, srng=self.srng)

        # (batch_size, max_example_action_num)
        # A timestep is real (unpadded) iff any action-type flag is set.
        tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1)

        # decoder_hidden_states: (batch_size, max_example_action_num, lstm_hidden_state)
        # ctx_vectors: (batch_size, max_example_action_num, encoder_hidden_dim)
        decoder_hidden_states, _, ctx_vectors = self.decoder_lstm(decoder_input,
                                                                  context=query_embed,
                                                                  context_mask=query_token_embed_mask,
                                                                  mask=tgt_action_seq_mask,
                                                                  parent_t_seq=tgt_par_t_seq,
                                                                  dropout=config.dropout,
                                                                  srng=self.srng)

        # if DECODER_DROPOUT > 0:
        #     logging.info('used dropout for decoder output, p = %f', DECODER_DROPOUT)
        #     decoder_hidden_states = Dropout(DECODER_DROPOUT, self.srng)(decoder_hidden_states)

        # ====================================================
        # apply additional non-linearity transformation before
        # predicting actions
        # ====================================================

        decoder_hidden_state_trans_rule = self.decoder_hidden_state_W_rule(decoder_hidden_states)
        decoder_hidden_state_trans_token = self.decoder_hidden_state_W_token(T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1))

        # (batch_size, max_example_action_num, rule_num)
        rule_predict = softmax(T.dot(decoder_hidden_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

        # (batch_size, max_example_action_num, 2)
        terminal_gen_action_prob = self.terminal_gen_softmax(decoder_hidden_states)

        # (batch_size, max_example_action_num, target_vocab_size)
        vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)

        # (batch_size, max_example_action_num, lstm_hidden_state + encoder_hidden_dim)
        ptr_net_decoder_state = T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1)

        # (batch_size, max_example_action_num, max_query_length)
        copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state)

        # (batch_size, max_example_action_num)
        # Probability assigned to the gold rule at each timestep (fancy
        # indexing over batch x timestep x rule-id).
        rule_tgt_prob = rule_predict[T.shape_padright(T.arange(batch_size)),
                                     T.shape_padleft(T.arange(max_example_action_num)),
                                     tgt_action_seq[:, :, 0]]

        # (batch_size, max_example_action_num)
        vocab_tgt_prob = vocab_predict[T.shape_padright(T.arange(batch_size)),
                                       T.shape_padleft(T.arange(max_example_action_num)),
                                       tgt_action_seq[:, :, 1]]

        # (batch_size, max_example_action_num)
        copy_tgt_prob = copy_prob[T.shape_padright(T.arange(batch_size)),
                                  T.shape_padleft(T.arange(max_example_action_num)),
                                  tgt_action_seq[:, :, 2]]


        # (batch_size, max_example_action_num)
        # Mix the three action channels according to the gold action type;
        # gen/copy channels are additionally weighted by the gen-vs-copy
        # probability.
        tgt_prob = tgt_action_seq_type[:, :, 0] * rule_tgt_prob + \
                   tgt_action_seq_type[:, :, 1] * terminal_gen_action_prob[:, :, 0] * vocab_tgt_prob + \
                   tgt_action_seq_type[:, :, 2] * terminal_gen_action_prob[:, :, 1] * copy_tgt_prob

        # The 1e-7 term keeps log() finite on padded (masked-out) steps.
        likelihood = T.log(tgt_prob + 1.e-7 * (1 - tgt_action_seq_mask))
        loss = - (likelihood * tgt_action_seq_mask).sum(axis=-1) # / tgt_action_seq_mask.sum(axis=-1)
        loss = T.mean(loss)

        # let's build the function!
        train_inputs = [query_tokens, tgt_action_seq, tgt_action_seq_type,
                        tgt_node_seq, tgt_par_rule_seq, tgt_par_t_seq]
        optimizer = optimizers.get(config.optimizer)
        optimizer.clip_grad = config.clip_grad
        updates, grads = optimizer.get_updates(self.params, loss)
        self.train_func = theano.function(train_inputs, [loss],
                                          # [loss, tgt_action_seq_type, tgt_action_seq,
                                          #  rule_tgt_prob, vocab_tgt_prob, copy_tgt_prob,
                                          #  copy_prob, terminal_gen_action_prob],
                                          updates=updates)

        # if WORD_DROPOUT > 0:
        #     self.build_decoder(query_tokens, query_token_embed_intact, query_token_embed_mask)
        # else:
        #     self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)

        self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)
Ejemplo n.º 8
0
    def build_decoder(self, query_tokens, query_token_embed, query_token_embed_mask):
        logging.info('building decoder ...')

        # (batch_size, decoder_state_dim)
        decoder_prev_state = ndim_tensor(2, name='decoder_prev_state')

        # (batch_size, decoder_state_dim)
        decoder_prev_cell = ndim_tensor(2, name='decoder_prev_cell')

        # (batch_size, n_timestep, decoder_state_dim)
        hist_h = ndim_tensor(3, name='hist_h')

        # (batch_size, decoder_state_dim)
        prev_action_embed = ndim_tensor(2, name='prev_action_embed')

        # (batch_size)
        node_id = T.ivector(name='node_id')

        # (batch_size, node_embed_dim)
        node_embed = self.node_embedding[node_id]

        # (batch_size)
        par_rule_id = T.ivector(name='par_rule_id')

        # (batch_size, decoder_state_dim)
        par_rule_embed = T.switch(par_rule_id[:, None] < 0,
                                  T.alloc(0., 1, config.rule_embed_dim),
                                  self.rule_embedding_W[par_rule_id])

        # ([time_step])
        time_steps = T.ivector(name='time_steps')

        # (batch_size)
        parent_t = T.ivector(name='parent_t')

        # (batch_size, 1)
        parent_t_reshaped = T.shape_padright(parent_t)

        query_embed = self.query_encoder_lstm(query_token_embed, mask=query_token_embed_mask,
                                              dropout=config.dropout, train=False)

        # (batch_size, 1, decoder_state_dim)
        prev_action_embed_reshaped = prev_action_embed.dimshuffle((0, 'x', 1))

        # (batch_size, 1, node_embed_dim)
        node_embed_reshaped = node_embed.dimshuffle((0, 'x', 1))

        # (batch_size, 1, node_embed_dim)
        par_rule_embed_reshaped = par_rule_embed.dimshuffle((0, 'x', 1))

        if not config.frontier_node_type_feed:
            node_embed_reshaped *= 0.

        if not config.parent_action_feed:
            par_rule_embed_reshaped *= 0.

        decoder_input = T.concatenate([prev_action_embed_reshaped, node_embed_reshaped, par_rule_embed_reshaped], axis=-1)

        # (batch_size, 1, decoder_state_dim)
        # (batch_size, 1, decoder_state_dim)
        # (batch_size, 1, field_token_encode_dim)
        decoder_next_state_dim3, decoder_next_cell_dim3, ctx_vectors = self.decoder_lstm(decoder_input,
                                                                                         init_state=decoder_prev_state,
                                                                                         init_cell=decoder_prev_cell,
                                                                                         hist_h=hist_h,
                                                                                         context=query_embed,
                                                                                         context_mask=query_token_embed_mask,
                                                                                         parent_t_seq=parent_t_reshaped,
                                                                                         dropout=config.dropout,
                                                                                         train=False,
                                                                                         time_steps=time_steps)

        decoder_next_state = decoder_next_state_dim3.flatten(2)
        # decoder_output = decoder_next_state * (1 - DECODER_DROPOUT)

        decoder_next_cell = decoder_next_cell_dim3.flatten(2)

        decoder_next_state_trans_rule = self.decoder_hidden_state_W_rule(decoder_next_state)
        decoder_next_state_trans_token = self.decoder_hidden_state_W_token(T.concatenate([decoder_next_state, ctx_vectors.flatten(2)], axis=-1))

        rule_prob = softmax(T.dot(decoder_next_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

        gen_action_prob = self.terminal_gen_softmax(decoder_next_state)

        vocab_prob = softmax(T.dot(decoder_next_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)

        ptr_net_decoder_state = T.concatenate([decoder_next_state_dim3, ctx_vectors], axis=-1)

        copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state)

        copy_prob = copy_prob.flatten(2)

        inputs = [query_tokens]
        outputs = [query_embed, query_token_embed_mask]

        self.decoder_func_init = theano.function(inputs, outputs)

        inputs = [time_steps, decoder_prev_state, decoder_prev_cell, hist_h, prev_action_embed,
                  node_id, par_rule_id, parent_t,
                  query_embed, query_token_embed_mask]

        outputs = [decoder_next_state, decoder_next_cell,
                   rule_prob, gen_action_prob, vocab_prob, copy_prob]

        self.decoder_func_next_step = theano.function(inputs, outputs)
Ejemplo n.º 9
0
    def _fit(self, X, y):
        """
        Compute the mode of the Laplace approximation of the posterior by
        Newton iteration, following algorithm (3.3) from GPML (Rasmussen &
        Williams, 2006), with a single covariance matrix shared by all C
        latent functions.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training inputs.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            Targets; a 1-D (or single-column) label vector is converted to a
            one-hot encoding.

        Side effects
        ------------
        Sets ``self.K_`` (regularized kernel matrix), ``self.f_`` (latent
        values at the mode), ``self.pi_`` (softmax class probabilities),
        ``self._e`` (per-class E_c vectors, reused at prediction time) and
        ``self.lml_`` (approximate log marginal likelihood at convergence).
        Returns nothing; prints a warning and returns early if Newton
        iteration does not converge within ``self.max_iter`` steps (in that
        case ``self.lml_`` is NOT set).
        """
        if len(y.shape) == 1 or y.shape[1] == 1:
            y = one_hot(y)
        self._check_X_y(X, y)
        y = y.astype(np.float32)
        self._kernel = get_kernel(self.kernel, **self.kernel_params)
        # shortcuts
        C = self._n_outputs
        n = self._n_samples
        # construct covariance matrix [if needed]
        # if self.K_ is None:
        self.K_ = self._kernel(X, X)
        # diagonal noise term keeps K_ well-conditioned for the Cholesky below
        self.K_ += self.sigma_n**2 * np.eye(n)
        self.K_ = self.K_.astype(np.float32)

        # init latent function values
        self.f_ = np.zeros_like(y)

        lmls = []
        # NOTE(review): `iter` shadows the builtin; harmless here but worth
        # renaming if this code is touched again.
        iter = 0
        while True:
            iter += 1
            if iter > self.max_iter:
                print 'convergence is not reached'
                return
            # class membership probabilities at the current latent values
            self.pi_ = softmax(self.f_)
            z = []
            self._e = []
            for c_ in xrange(C):
                # compute E_c = sqrt(D_c) (I + sqrt(D_c) K sqrt(D_c))^-1 sqrt(D_c)
                # applied to the all-ones direction (stored as a vector e_c)
                sqrt_d_c = np.sqrt(self.pi_[:, c_])
                _T = np.eye(
                    self._n_samples) + (sqrt_d_c * self.K_.T).T * sqrt_d_c
                if self.algorithm == 'exact':
                    # exact solve via Cholesky factorization of _T
                    L = cholesky(_T, lower=True, overwrite_a=True)
                    _T2 = solve_triangular(L, sqrt_d_c)
                    e_c = sqrt_d_c * solve_triangular(L, _T2, trans='T')
                elif self.algorithm == 'cg':
                    # iterative conjugate-gradient solve (no factorization)
                    _t, _ = cg(_T,
                               sqrt_d_c,
                               tol=self.cg_tol,
                               maxiter=self.cg_max_iter)
                    _t = _t.astype(np.float32)
                    e_c = sqrt_d_c * _t
                self._e.append(e_c)
                # compute z_c = sum(log diag(L)) = 0.5 * log det(_T);
                # only available in the exact branch where L exists
                if self.algorithm == 'exact':
                    z_c = sum(np.log(L.diagonal()))
                    z.append(z_c)
            # compute b = (D - Pi Pi^T) f + y - pi (Newton-step right-hand side)
            # b = (D - Pi.dot(Pi.T)).dot(self.f_.T.reshape((C * n,)))
            # b = b.reshape((n, C))
            b = (1. - self.pi_) * self.pi_ * self.f_
            b = b + y - self.pi_
            # compute c: per-class E_c K b_c columns stacked side by side.
            # NOTE(review): passing a generator to np.hstack is deprecated
            # (an error in NumPy >= 1.24) — wrap in a list if NumPy is upgraded.
            c = np.hstack((self._e[c_] * self.K_.dot(b[:, c_]))[:, np.newaxis]
                          for c_ in xrange(C))
            # compute a
            # self._M = cholesky(np.diag(sum(self._e)), lower=True, overwrite_a=True)
            # _t = np.sum(c, axis=1)
            # _t2 = solve_triangular(self._M, _t)
            # _t3 = solve_triangular(self._M, _t2, trans='T')
            # diagonal solve replacing the commented-out Cholesky route;
            # the max() guard avoids division by (near-)zero entries
            _t3 = np.sum(c, axis=1) / np.maximum(
                sum(self._e), 1e-8 * np.ones_like(self._e[0]))
            _t4 = np.hstack(
                (self._e[c_] * _t3)[:, np.newaxis] for c_ in xrange(C))
            a = b - c + _t4
            a = a.astype(np.float32)
            # compute f: new latent values f = K a
            self.f_ = self.K_.dot(a)
            # compute approx. LML (eq. 3.44-style objective, per class)
            lml = -0.5 * sum(a[:, _c].dot(self.f_[:, _c])
                             for _c in xrange(C))  # -0.5a^Tf
            lml += sum(y[:, _c].dot(self.f_[:, _c])
                       for _c in xrange(C))  # y^Tf
            lml -= sum(log_sum_exp(f) for f in self.f_)
            lml -= sum(z)
            lmls.append(lml)
            # stop when the LML change falls below a tolerance scaled by K's magnitude
            if len(lmls) >= 2 and np.abs(lmls[-1] -
                                         lmls[-2]) < self.tol * self.K_.max():
                break
        self.lml_ = lmls[-1]
Ejemplo n.º 10
0
def lstm_cell_forward(Xt, h_prev, c_prev, parameters):
    """
    Run a single forward step of an LSTM cell.

    Parameters
    ----------
    Xt : ndarray, shape (N, D)
        Input data at timestep "t" (N samples, D features).
    h_prev : ndarray, shape (N, H)
        Hidden state from timestep "t-1" (H hidden units).
    c_prev : ndarray, shape (N, H)
        Memory (cell) state from timestep "t-1".
    parameters : dict
        Weight matrices Wf, Wi, Wo, Wc of shape (H+D, H) for the forget,
        update, output and candidate transforms; Wy of shape (H, M) mapping
        the hidden state to M output classes; biases bf, bi, bo, bc of shape
        (1, H) and by of shape (1, M).

    Returns
    -------
    h_next : ndarray, shape (N, H)
        Next hidden state.
    c_next : ndarray, shape (N, H)
        Next memory state.
    yt_pred : ndarray, shape (N, M)
        Softmax prediction at timestep "t".
    cache : tuple
        (h_next, c_next, h_prev, c_prev, ft, it, cct, ot, Xt, parameters),
        everything the backward pass needs.
    """
    # Unpack gate weights and biases from the parameter dict.
    Wf, Wi, Wo, Wc, Wy = (parameters[k] for k in ("Wf", "Wi", "Wo", "Wc", "Wy"))
    bf, bi, bo, bc, by = (parameters[k] for k in ("bf", "bi", "bo", "bc", "by"))

    # Dimensions: N samples, D input features, H hidden units.
    N, D = Xt.shape
    H = Wy.shape[0]

    # Stack [h_prev, Xt] into one (N, H+D) matrix so each gate is a single matmul.
    stacked = np.zeros((N, H + D))
    stacked[:, :H] = h_prev
    stacked[:, H:] = Xt

    # Gate activations: forget (ft), update (it), output (ot), candidate (cct).
    ft = sigmoid(stacked.dot(Wf) + bf)
    it = sigmoid(stacked.dot(Wi) + bi)
    ot = sigmoid(stacked.dot(Wo) + bo)
    cct = np.tanh(stacked.dot(Wc) + bc)

    # Blend retained memory with the new candidate, then gate the exposed state.
    c_next = ft * c_prev + it * cct
    h_next = ot * np.tanh(c_next)

    # Class probabilities from the new hidden state.
    yt_pred = softmax(h_next.dot(Wy) + by)

    # Save everything the backward pass will need.
    cache = (h_next, c_next, h_prev, c_prev, ft, it, cct, ot, Xt, parameters)

    return h_next, c_next, yt_pred, cache