Example #1
    def __init__(self,
                 num_nodes,
                 num_edges,
                 edges,
                 node_feats=None,
                 edge_feats=None):
        super(BatchGraphWrapper, self).__init__()

        node_shift, edge_lod = self.__build_meta_data(num_nodes, num_edges)
        self.__build_edges(edges, node_shift, edge_lod, edge_feats)

        # assign node features
        if node_feats is not None:
            for key, value in node_feats.items():
                self.node_feat_tensor_dict[key] = value

        # other meta-data
        self._edge_uniq_dst, _, uniq_count = L.unique_with_counts(
            self._edges_dst, dtype="int32")
        self._edge_uniq_dst.stop_gradient = True
        last = L.reduce_sum(uniq_count, keep_dim=True)
        uniq_count = L.cumsum(uniq_count, exclusive=True)
        self._edge_uniq_dst_count = L.concat([uniq_count, last])
        self._edge_uniq_dst_count.stop_gradient = True
        self._indegree = get_degree(self._edges_dst, self._num_nodes)
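
The unique_with_counts / exclusive cumsum / concat sequence at the end builds a CSR-style offset table: for every unique destination node it records where that node's incoming edges start. A minimal NumPy sketch of the same bookkeeping, with a made-up edge list (the array names are illustrative, not part of the PGL API):

import numpy as np

# hypothetical destination ids of 7 edges, grouped by destination
edges_dst = np.array([0, 0, 1, 1, 1, 3, 3], dtype="int32")

# np.unique stands in for L.unique_with_counts here
uniq_dst, uniq_count = np.unique(edges_dst, return_counts=True)

# exclusive cumsum = start offset of each destination's segment;
# appending the total edge count closes the last segment
start = np.cumsum(uniq_count) - uniq_count
uniq_dst_count = np.concatenate([start, [uniq_count.sum()]])

print(uniq_dst)        # [0 1 3]
print(uniq_dst_count)  # [0 2 5 7]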
Example #2
    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        bs, h, w = mask.shape

        mask = mask.numpy()
        not_mask = ~mask
        not_mask = dg.to_variable(not_mask).astype('float32')
        y_embed = L.cumsum(not_mask, axis=1)  # [batch_size, h, w]
        x_embed = L.cumsum(not_mask, axis=2)  # [batch_size, h, w]
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = (np.arange(0, self.num_pos_feats, 1,
                           dtype="float32"))  # [num_pos_feats]
        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_pos_feats
                                   )  # [num_pos_feats]
        dim_t = dg.to_variable(dim_t)

        x_embed = L.unsqueeze(x_embed, 3)  # [batch_size, h, w, 1]
        y_embed = L.unsqueeze(y_embed, 3)  # [batch_size, h, w, 1]
        pos_x = x_embed / dim_t  # [batch_size, h, w, num_pos_feats]
        pos_y = y_embed / dim_t  # [batch_size, h, w, num_pos_feats]
        pos_x_1 = L.sin(pos_x[:, :, :, 0::2])  # [batch_size, h, w, num_pos_feats / 2]
        pos_x_2 = L.cos(pos_x[:, :, :, 1::2])  # [batch_size, h, w, num_pos_feats / 2]
        pos_y_1 = L.sin(pos_y[:, :, :, 0::2])  # [batch_size, h, w, num_pos_feats / 2]
        pos_y_2 = L.cos(pos_y[:, :, :, 1::2])  # [batch_size, h, w, num_pos_feats / 2]
        pos_x = L.reshape(L.stack([pos_x_1, pos_x_2], axis=4),
                          (bs, h, w, -1))  # [batch_size, h, w, num_pos_feats]
        pos_y = L.reshape(L.stack([pos_y_1, pos_y_2], axis=4),
                          (bs, h, w, -1))  # [batch_size, h, w, num_pos_feats]

        pos = L.concat((pos_y, pos_x),
                       axis=3)  # [batch_size, h, w, num_pos_feats * 2]
        pos = L.transpose(pos,
                          perm=(0, 3, 1,
                                2))  # [batch_size, num_pos_feats * 2, h, w]
        return pos
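
The frequency term dim_t follows the Transformer/DETR sine-cosine convention: channel pairs share one frequency temperature ** (2 * (i // 2) / num_pos_feats), even channels take the sine and odd channels the cosine of coordinate / frequency. A short NumPy sketch of that per-channel scaling (the num_pos_feats, temperature and coordinate values are arbitrary examples):

import numpy as np

num_pos_feats, temperature = 8, 10000.0

i = np.arange(num_pos_feats, dtype="float32")
dim_t = temperature ** (2 * (i // 2) / num_pos_feats)   # one frequency per channel pair

coord = 3.0                             # a single cumsum-derived x or y coordinate
angles = coord / dim_t
embedding = np.empty(num_pos_feats, dtype="float32")
embedding[0::2] = np.sin(angles[0::2])  # even channels: sine
embedding[1::2] = np.cos(angles[1::2])  # odd channels: cosine
print(embedding.round(3))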
Example #3
def fluid_get_offset(seq_len):
    """
    args:
        seq_len: (-1)
    return:
        offset: the same shape as seq_len,
            cumsum(seq_len) - seq_len 
    """
    assert len(seq_len.shape) == 1
    csum = layers.cumsum(layers.cast(seq_len, 'float32'), exclusive=True)
    return layers.cast(csum, 'int64')
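
The exclusive cumsum is exactly cumsum(seq_len) - seq_len, i.e. the start offset of each sequence inside the flattened batch. A NumPy sketch with a hypothetical batch of three sequence lengths:

import numpy as np

seq_len = np.array([4, 3, 5], dtype="int64")

offset = np.cumsum(seq_len) - seq_len   # exclusive cumsum
print(offset)                           # [0 4 7]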
Example #4
def fluid_get_offset2(seq_len):
    """
    args:
        seq_len: (-1)
    return:
        offset: the same shape as seq_len,
            cumsum(seq_len) - seq_len 
    """
    assert len(seq_len.shape) == 1
    csum = layers.cumsum(seq_len)
    offset = csum - seq_len
    return offset
Example #5
 def flat_words(self, words):
     pad_index = self.args.pad_index
     lens = nn.reduce_sum(words != pad_index, dim=-1)
     position = layers.cumsum(lens + layers.cast((lens == 0), "int32"),
                              axis=1) - 1
     flat_words = nn.masked_select(words, words != pad_index)
     flat_words = nn.pad_sequence_paddle(
         layers.split(flat_words,
                      layers.reduce_sum(lens, -1).numpy().tolist(),
                      pad_index))
     max_len = flat_words.shape[1]
     position = nn.mask_fill(position, position >= max_len, max_len - 1)
     return flat_words, position
Example #6
    def __build_meta_data(self, num_nodes, num_edges):
        """ Merge information for nodes and edges.
        """
        num_nodes = L.reshape(num_nodes, [-1])
        num_edges = L.reshape(num_edges, [-1])
        num_nodes = paddle_helper.ensure_dtype(num_nodes, dtype="int32")
        num_edges = paddle_helper.ensure_dtype(num_edges, dtype="int32")

        num_graph = L.shape(num_nodes)[0]
        sum_num_nodes = L.reduce_sum(num_nodes)
        sum_num_edges = L.reduce_sum(num_edges)
        edge_lod = L.concat(
            [L.cumsum(num_edges, exclusive=True), sum_num_edges])
        edge_lod = paddle_helper.lod_remove(edge_lod)

        node_shift = L.cumsum(num_nodes, exclusive=True)
        graph_lod = L.concat([node_shift, sum_num_nodes])
        graph_lod = paddle_helper.lod_remove(graph_lod)
        self._num_nodes = sum_num_nodes
        self._num_edges = sum_num_edges
        self._num_graph = num_graph
        self._graph_lod = graph_lod
        return node_shift, edge_lod
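
node_shift is the exclusive cumsum of the per-graph node counts, so appending the total node count yields LoD offsets that delimit each small graph inside the merged batch graph. A NumPy sketch of the same bookkeeping for a hypothetical batch of three graphs:

import numpy as np

num_nodes = np.array([3, 5, 2], dtype="int32")    # nodes per graph in the batch

node_shift = np.cumsum(num_nodes) - num_nodes     # exclusive cumsum
graph_lod = np.concatenate([node_shift, [num_nodes.sum()]])

print(node_shift)  # [0 3 8]    -> offset added to each graph's local node ids
print(graph_lod)   # [0 3 8 10] -> graph i owns global nodes graph_lod[i]:graph_lod[i+1]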
Example #7
    def __init__(self, graph_wrapper, dropout, keep_self_loop=True):
        super(DropEdgeWrapper, self).__init__()

        # Copy Node's information
        for key, value in graph_wrapper.node_feat.items():
            self.node_feat_tensor_dict[key] = value

        self._num_nodes = graph_wrapper.num_nodes
        self._graph_lod = graph_wrapper.graph_lod
        self._num_graph = graph_wrapper.num_graph

        # Dropout Edges
        src, dst = graph_wrapper.edges
        u = L.uniform_random(shape=L.cast(L.shape(src), 'int64'),
                             min=0.,
                             max=1.)

        # Avoid Empty Edges
        keeped = L.cast(u > dropout, dtype="float32")
        self._num_edges = L.reduce_sum(L.cast(keeped, "int32"))
        keeped = keeped + L.cast(self._num_edges == 0, dtype="float32")

        if keep_self_loop:
            self_loop = L.cast(src == dst, dtype="float32")
            keeped = keeped + self_loop

        keeped = (keeped > 0.5)
        src = paddle_helper.masked_select(src, keeped)
        dst = paddle_helper.masked_select(dst, keeped)
        src.stop_gradient = True
        dst.stop_gradient = True
        self._edges_src = src
        self._edges_dst = dst

        for key, value in graph_wrapper.edge_feat.items():
            self.edge_feat_tensor_dict[key] = paddle_helper.masked_select(
                value, keeped)

        self._edge_uniq_dst, _, uniq_count = L.unique_with_counts(
            dst, dtype="int32")
        self._edge_uniq_dst.stop_gradient = True
        last = L.reduce_sum(uniq_count, keep_dim=True)
        uniq_count = L.cumsum(uniq_count, exclusive=True)
        self._edge_uniq_dst_count = L.concat([uniq_count, last])
        self._edge_uniq_dst_count.stop_gradient = True
        self._indegree = get_degree(self._edges_dst, self._num_nodes)
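
The keep mask comes from uniform noise (u > dropout keeps an edge); the two correction terms guard against dropping every edge and, optionally, force self-loops to survive. A NumPy sketch of the mask construction with a made-up edge list and dropout rate:

import numpy as np

rng = np.random.default_rng(0)
src = np.array([0, 1, 2, 2, 3], dtype="int64")
dst = np.array([1, 1, 2, 0, 3], dtype="int64")
dropout = 0.5

u = rng.uniform(0.0, 1.0, size=src.shape)
keeped = (u > dropout).astype("float32")          # randomly kept edges
keeped = keeped + float(keeped.sum() == 0)        # if all edges were dropped, keep every edge
keeped = keeped + (src == dst).astype("float32")  # keep_self_loop: self-loops always survive

mask = keeped > 0.5
print(src[mask], dst[mask])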
Example #8
def fluid_sequence_get_pos(lodtensor):
    """
    args:
        lodtensor: lod = [[0,4,7]]
    return:
        pos: lod = [[0,4,7]]
             data = [0,1,2,3,0,1,2]
             shape = [-1, 1]
    """
    lodtensor = layers.reduce_sum(lodtensor, dim=1, keep_dim=True) 
    assert lodtensor.shape == (-1, 1), lodtensor.shape
    ones = layers.cast(lodtensor * 0 + 1, 'float32')        # (batch*seq_len, 1)
    ones_padded = fluid_sequence_pad(ones, 0)               # (batch, max_seq_len, 1)
    ones_padded = layers.squeeze(ones_padded, [2])          # (batch, max_seq_len)
    seq_len = layers.cast(layers.reduce_sum(ones_padded, 1, keep_dim=True), 'int64')    # (batch, 1)
    pos = layers.cast(layers.cumsum(ones_padded, 1, exclusive=True), 'int64')
    pos = layers.sequence_unpad(pos, seq_len)               # (batch*seq_len, 1)
    pos.stop_gradient = True
    return pos
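
The trick is to pad a tensor of ones to (batch, max_seq_len), take the exclusive cumsum along the time axis, and unpad again, which yields 0, 1, 2, ... restarting at every sequence boundary. A NumPy sketch for the lod [[0, 4, 7]] case from the docstring:

import numpy as np

lod = [0, 4, 7]                        # two sequences, of length 4 and 3
seq_len = np.diff(lod)
max_len = seq_len.max()

ones_padded = np.zeros((len(seq_len), max_len), dtype="int64")
for i, n in enumerate(seq_len):
    ones_padded[i, :n] = 1
pos_padded = np.cumsum(ones_padded, axis=1) - ones_padded   # exclusive cumsum
pos = np.concatenate([pos_padded[i, :n] for i, n in enumerate(seq_len)])

print(pos)   # [0 1 2 3 0 1 2]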
Example #9
 def topp_sampling(self, probs):
     sorted_probs, sorted_idx = layers.argsort(probs, descending=True)
     cum_sorted_probs = layers.cumsum(sorted_probs, axis=1, exclusive=True)
     lt_cond = paddle.cast(
         paddle.less_than(
             cum_sorted_probs,
             layers.fill_constant_batch_size_like(cum_sorted_probs,
                                                  cum_sorted_probs.shape,
                                                  cum_sorted_probs.dtype,
                                                  self.topp)), "float32")
     old_probs = probs
     candidate_probs = sorted_probs * lt_cond
     probs = candidate_probs / paddle.sum(
         candidate_probs, axis=-1, keepdim=True)
     sampling_ids = layers.sampling_id(probs, dtype="int")
     sampling_ids = paddle.index_sample(sorted_idx,
                                        paddle.unsqueeze(sampling_ids, [1]))
     sampling_ids = paddle.squeeze(sampling_ids, [1])
     probs = old_probs
     return probs, sampling_ids
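
Top-p (nucleus) sampling keeps the smallest prefix of the probability-sorted vocabulary whose exclusive cumulative mass stays below topp, renormalizes it, and samples from that truncated distribution. A NumPy sketch of the same masking for a single row (the toy distribution and topp value are arbitrary):

import numpy as np

rng = np.random.default_rng(0)
probs = np.array([0.05, 0.40, 0.10, 0.30, 0.15])
topp = 0.8

order = np.argsort(-probs)                     # descending sort, like layers.argsort
sorted_probs = probs[order]
cum = np.cumsum(sorted_probs) - sorted_probs   # exclusive cumsum
lt_cond = (cum < topp).astype("float64")       # tokens inside the nucleus

candidate = sorted_probs * lt_cond
candidate /= candidate.sum()                   # renormalize over the nucleus
sampled = rng.choice(len(candidate), p=candidate)
token_id = order[sampled]                      # map back to the original vocab index
print(token_id)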
Example #10
 def __init__(self, input_mask):
     super(BigBirdWrapper, self).__init__()
     max_seqlen = L.shape(input_mask)[1]
     input_mask = L.reshape(input_mask, [-1])
     num_nodes = L.shape(input_mask)[0]
     src, dst = build_edges(num_nodes, input_mask, max_seqlen)
     self._edges_src = src
     self._edges_dst = dst
     self._edges_src.stop_gradient = True
     self._edges_dst.stop_gradient = True
     self._num_nodes = num_nodes
     self._num_edges = L.shape(self._edges_src)[0]
     self._node_ids = L.range(0, self._num_nodes, step=1, dtype="int32")
     self._edge_uniq_dst, _, uniq_count = L.unique_with_counts(
         self._edges_dst, dtype="int32")
     self._edge_uniq_dst.stop_gradient = True
     last = L.reduce_sum(uniq_count, keep_dim=True)
     uniq_count = L.cumsum(uniq_count, exclusive=True)
     self._edge_uniq_dst_count = L.concat([uniq_count, last])
     self._edge_uniq_dst_count.stop_gradient = True
Example #11
def fluid_sequence_get_pos(lodtensor):
    """
    args:
        lodtensor: lod = [[0,4,7]]
    return:
        pos: lod = [[0,4,7]]
             data = [0,1,2,3,0,1,2]
             shape = [-1, 1]
    """
    lodtensor_slice = layers.slice(lodtensor, axes=[1], starts=[0], ends=[1])
    assert lodtensor_slice.shape == (-1, 1), lodtensor_slice.shape
    ones = layers.cast(lodtensor_slice * 0 + 1,
                       'float32')  # (batch*seq_len, 1)
    ones = layers.lod_reset(ones, lodtensor)
    ones_padded = fluid_sequence_pad(ones, 0)  # (batch, max_seq_len, 1)
    ones_padded = layers.squeeze(ones_padded, [2])  # (batch, max_seq_len)
    seq_len = layers.cast(layers.reduce_sum(ones_padded, 1, keep_dim=True),
                          'int64')  # (batch, 1)
    pos = layers.cast(layers.cumsum(ones_padded, 1, exclusive=True), 'int64')
    pos = layers.sequence_unpad(pos, seq_len)  # (batch*seq_len, 1)
    return pos
Example #12
    def __init__(self, graph_wrapper, part_id, num_parts):
        super(PartitionWrapper, self).__init__()

        # Copy Node's information
        for key, value in graph_wrapper.node_feat.items():
            self.node_feat_tensor_dict[key] = value

        self._num_nodes = graph_wrapper.num_nodes
        self._graph_lod = graph_wrapper.graph_lod
        self._num_graph = graph_wrapper.num_graph

        # Partition edges by destination node
        src, dst = graph_wrapper.edges
        keeped = L.cast((dst % num_parts) == part_id, dtype="float32")

        keeped = (keeped > 0.5)
        self.keeped = keeped
        self._num_edges = L.reduce_sum(L.cast(keeped, "int32"))
        #L.Print(self._num_edges, message="Part-%s num edges" % part_id)
        src = paddle_helper.masked_select(src, keeped)
        dst = paddle_helper.masked_select(dst, keeped)
        src.stop_gradient = True
        dst.stop_gradient = True
        self._edges_src = src
        self._edges_dst = dst

        for key, value in graph_wrapper.edge_feat.items():
            self.edge_feat_tensor_dict[key] = paddle_helper.masked_select(
                value, keeped)

        self._edge_uniq_dst, _, uniq_count = L.unique_with_counts(
            dst, dtype="int32")
        self._edge_uniq_dst.stop_gradient = True
        last = L.reduce_sum(uniq_count, keep_dim=True)
        uniq_count = L.cumsum(uniq_count, exclusive=True)
        self._edge_uniq_dst_count = L.concat([uniq_count, last])
        self._edge_uniq_dst_count.stop_gradient = True
        self._indegree = get_degree(self._edges_dst, self._num_nodes)
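
Each partition keeps only the edges whose destination node id falls into its bucket (dst % num_parts == part_id), so the union of all parts reproduces the full edge list. A NumPy sketch of the edge filter with a toy edge list:

import numpy as np

src = np.array([0, 1, 2, 3, 4, 5], dtype="int64")
dst = np.array([1, 2, 3, 4, 5, 6], dtype="int64")
num_parts = 2

for part_id in range(num_parts):
    keeped = (dst % num_parts) == part_id
    print(part_id, src[keeped], dst[keeped])   # part 0: even destinations, part 1: odd ones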
Example #13
    def inference(self, model, inputs, outputs):
        """
        Run inference.

        Args:
            model(object): A generation model. It needs to implement `_generation_network` and `_calc_logits`.
            inputs(dict): Its key is an input name (str) and its value is a Variable.

        Returns:
            dict(str:Variable): Its key is an output name (str) and its value is a Variable.
        """
        # prepare while loop
        max_len = layers.fill_constant(shape=[1],
                                       dtype="int64",
                                       value=self.max_dec_len,
                                       force_cpu=True)
        min_len = layers.fill_constant(shape=[1],
                                       dtype="int64",
                                       value=self.min_dec_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype="int64",
                                        value=0,
                                        force_cpu=True)

        ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)),
                                 step_idx)
        pos_biases = layers.array_write(
            layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
        scores = layers.array_write(inputs["init_score"], step_idx)
        tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"],
                                                 step_idx)
        parent_idx = inputs["parent_idx"]

        if self.decoding_strategy == "beam_search":
            beam_size = self.beam_size
        else:
            beam_size = 1

        eos_penalty = np.zeros(self.vocab_size, dtype="float32")
        eos_penalty[self.eos_id] = -1e9
        eos_penalty = layers.assign(eos_penalty)

        token_penalty = np.zeros(self.vocab_size, dtype="float32")
        token_penalty[self.unk_id] = -1e9
        if self.mask_id >= 0:
            token_penalty[self.mask_id] = -1e9
        token_penalty = layers.assign(token_penalty)

        # start while loop
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask,
                                                        i=step_idx)
            dtype = tmp_tgt_generation_mask.dtype

            append_mask = layers.fill_constant_batch_size_like(
                input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype)
            tmp_tgt_generation_mask = layers.concat(
                [tmp_tgt_generation_mask, append_mask], axis=2)
            pre_mask = tmp_tgt_generation_mask = layers.gather(
                input=tmp_tgt_generation_mask, index=parent_idx)

            pre_sent = layers.fill_constant_batch_size_like(
                input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype)

            if self.continuous_position:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0) + pos_bias
            else:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0)

            dec_out, _ = model._generation_network(
                token_ids=pre_ids,
                type_ids=pre_sent,
                pos_ids=pre_pos,
                generation_mask=tmp_tgt_generation_mask,
                gather_idx=parent_idx)
            logits = model._calc_logits(dec_out)

            # ignore unk and mask token
            if self.ignore_unk:
                logits = layers.elementwise_add(logits, token_penalty, axis=1)

            # min dec length
            min_len_cond = layers.less_than(x=step_idx, y=min_len)

            def min_len_penalty():
                """Plus minimum length penalty."""
                return layers.elementwise_add(logits, eos_penalty, axis=1)

            def no_penalty():
                """No penalty."""
                return logits

            logits = layers.case([(min_len_cond, min_len_penalty)],
                                 default=no_penalty)

            # get probs
            probs = layers.softmax(logits / self.temperature)

            if self.decoding_strategy == "beam_search":
                topk_scores, topk_indices = layers.topk(input=probs,
                                                        k=beam_size)
            else:
                if self.decoding_strategy.startswith("sampling"):
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                elif self.decoding_strategy.startswith("topk_sampling"):
                    topk_probs, _ = layers.topk(input=probs, k=self.topk)
                    ge_cond = layers.cast(
                        layers.greater_equal(
                            probs, layers.unsqueeze(topk_probs[:, -1], [1])),
                        "float32")
                    old_probs = probs
                    probs = probs * ge_cond / layers.reduce_sum(
                        topk_probs, dim=-1, keep_dim=True)
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                    probs = old_probs
                elif self.decoding_strategy.startswith("topp_sampling"):
                    sorted_probs, sorted_idx = layers.argsort(probs,
                                                              descending=True)
                    cum_sorted_probs = layers.cumsum(sorted_probs,
                                                     axis=1,
                                                     exclusive=True)
                    lt_cond = layers.cast(
                        layers.less_than(
                            cum_sorted_probs,
                            layers.fill_constant_batch_size_like(
                                cum_sorted_probs, cum_sorted_probs.shape,
                                cum_sorted_probs.dtype, self.topp)), "float32")
                    old_probs = probs
                    candidate_probs = sorted_probs * lt_cond
                    probs = candidate_probs / layers.reduce_sum(
                        candidate_probs, dim=-1, keep_dim=True)
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                    sampling_ids = layers.index_sample(
                        sorted_idx, layers.unsqueeze(sampling_ids, [1]))
                    sampling_ids = layers.squeeze(sampling_ids, [1])
                    probs = old_probs
                else:
                    raise ValueError(self.decoding_strategy)

                sampling_scores = layers.one_hot(
                    layers.unsqueeze(sampling_ids, [1]), probs.shape[1])
                sampling_scores = sampling_scores * probs - (
                    1 - sampling_scores) * 1e3
                topk_scores, topk_indices = layers.topk(input=sampling_scores,
                                                        k=1)

            pre_len = layers.cast(step_idx, "float32")
            layers.increment(x=step_idx, value=1.0, in_place=True)
            cur_len = layers.cast(step_idx, "float32")

            # update scores
            if self.length_average:
                accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                     y=pre_scores * pre_len,
                                                     axis=0) / cur_len
            elif self.length_penalty > 0:
                pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
                cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
                accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                     y=pre_scores * pre_lp,
                                                     axis=0) / cur_lp
            else:
                accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                     y=pre_scores,
                                                     axis=0)
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)

            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            layers.assign(gather_idx, parent_idx)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=self.eos_id)

        predictions = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "token_ids": inputs["token_ids"],
            "data_id": inputs["data_id"]
        }
        return predictions
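
Inside the loop the accumulated log-probability is rescaled either by averaging over the current length or by the GNMT-style penalty ((5 + len) / 6) ** alpha, so beams of different lengths stay comparable. A small NumPy sketch of the two score updates (alpha and the scores are made-up values):

import numpy as np

pre_score, topk_score = -1.2, 0.35     # accumulated log-prob so far, current token prob
pre_len, cur_len, alpha = 4.0, 5.0, 0.6

# length averaging
accu_avg = (np.log(topk_score) + pre_score * pre_len) / cur_len

# GNMT length penalty
pre_lp = ((5 + pre_len) / 6) ** alpha
cur_lp = ((5 + cur_len) / 6) ** alpha
accu_lp = (np.log(topk_score) + pre_score * pre_lp) / cur_lp

print(accu_avg, accu_lp)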
Example #14
 def build_position_ids(term_ids):
     input_mask = L.cast(term_ids > 0, "int64")
     position_ids = L.cumsum(input_mask, axis=1) - 1
     return position_ids
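
Non-padding tokens (term_ids > 0) are numbered 0, 1, 2, ... per row by taking the cumulative sum of the mask and subtracting one; padding slots simply repeat the last valid position. A NumPy sketch with a hypothetical padded batch:

import numpy as np

term_ids = np.array([[11, 12, 13, 0, 0],
                     [21, 22, 0, 0, 0]], dtype="int64")

input_mask = (term_ids > 0).astype("int64")
position_ids = np.cumsum(input_mask, axis=1) - 1
print(position_ids)
# [[0 1 2 2 2]
#  [0 1 1 1 1]]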