def __init__(self, num_nodes, num_edges, edges, node_feats=None, edge_feats=None):
    super(BatchGraphWrapper, self).__init__()

    node_shift, edge_lod = self.__build_meta_data(num_nodes, num_edges)
    self.__build_edges(edges, node_shift, edge_lod, edge_feats)

    # assign node features
    if node_feats is not None:
        for key, value in node_feats.items():
            self.node_feat_tensor_dict[key] = value

    # other meta-data
    self._edge_uniq_dst, _, uniq_count = L.unique_with_counts(
        self._edges_dst, dtype="int32")
    self._edge_uniq_dst.stop_gradient = True
    last = L.reduce_sum(uniq_count, keep_dim=True)
    uniq_count = L.cumsum(uniq_count, exclusive=True)
    self._edge_uniq_dst_count = L.concat([uniq_count, last])
    self._edge_uniq_dst_count.stop_gradient = True
    self._indegree = get_degree(self._edges_dst, self._num_nodes)
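The `unique_with_counts` plus exclusive `cumsum` pattern above recurs in several wrappers below. A minimal NumPy sketch (made-up data, assuming destination-sorted edges) of the CSR-style segment boundaries that `_edge_uniq_dst_count` holds:

import numpy as np

edges_dst = np.array([0, 0, 2, 2, 2, 3])          # sorted by destination
uniq_dst, uniq_count = np.unique(edges_dst, return_counts=True)
last = uniq_count.sum(keepdims=True)              # total edge count
starts = np.cumsum(uniq_count) - uniq_count       # exclusive cumsum: segment starts
uniq_dst_count = np.concatenate([starts, last])   # [0 2 5 6]: boundaries per unique dst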
def forward(self, tensor_list: NestedTensor):
    x = tensor_list.tensors
    mask = tensor_list.mask
    assert mask is not None
    bs, h, w = mask.shape

    mask = mask.numpy()
    not_mask = ~mask
    not_mask = dg.to_variable(not_mask).astype('float32')
    y_embed = L.cumsum(not_mask, axis=1)  # [batch_size, h, w]
    x_embed = L.cumsum(not_mask, axis=2)  # [batch_size, h, w]
    if self.normalize:
        eps = 1e-6
        y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
        x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

    dim_t = np.arange(0, self.num_pos_feats, 1, dtype="float32")  # [num_pos_feats]
    dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)  # [num_pos_feats]
    dim_t = dg.to_variable(dim_t)

    x_embed = L.unsqueeze(x_embed, 3)  # [batch_size, h, w, 1]
    y_embed = L.unsqueeze(y_embed, 3)  # [batch_size, h, w, 1]
    pos_x = x_embed / dim_t  # [batch_size, h, w, num_pos_feats]
    pos_y = y_embed / dim_t  # [batch_size, h, w, num_pos_feats]
    pos_x_1 = L.sin(pos_x[:, :, :, 0::2])  # [batch_size, h, w, num_pos_feats / 2]
    pos_x_2 = L.cos(pos_x[:, :, :, 1::2])  # [batch_size, h, w, num_pos_feats / 2]
    pos_y_1 = L.sin(pos_y[:, :, :, 0::2])  # [batch_size, h, w, num_pos_feats / 2]
    pos_y_2 = L.cos(pos_y[:, :, :, 1::2])  # [batch_size, h, w, num_pos_feats / 2]
    pos_x = L.reshape(L.stack([pos_x_1, pos_x_2], axis=4),
                      (bs, h, w, -1))  # [batch_size, h, w, num_pos_feats]
    pos_y = L.reshape(L.stack([pos_y_1, pos_y_2], axis=4),
                      (bs, h, w, -1))  # [batch_size, h, w, num_pos_feats]

    pos = L.concat((pos_y, pos_x), axis=3)  # [batch_size, h, w, num_pos_feats * 2]
    pos = L.transpose(pos, perm=(0, 3, 1, 2))  # [batch_size, num_pos_feats * 2, h, w]
    return pos
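For intuition, a small NumPy check of the frequency term `dim_t` (illustrative values; `self.num_pos_feats` and `self.temperature` are assumed):

import numpy as np

num_pos_feats, temperature = 8, 10000.0
dim_t = np.arange(0, num_pos_feats, 1, dtype="float32")
dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
# Adjacent channels share one frequency: dim_t[0] == dim_t[1], dim_t[2] == dim_t[3], ...
# so the 0::2 / 1::2 slices above form matched sin/cos pairs per frequency,
# as in the standard sinusoidal position encoding.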
def fluid_get_offset(seq_len):
    """
    args:
        seq_len: (-1)
    return:
        offset: the same shape as seq_len,
            cumsum(seq_len) - seq_len
    """
    assert len(seq_len.shape) == 1
    csum = layers.cumsum(layers.cast(seq_len, 'float32'), exclusive=True)
    return layers.cast(csum, 'int64')
def fluid_get_offset2(seq_len):
    """
    args:
        seq_len: (-1)
    return:
        offset: the same shape as seq_len,
            cumsum(seq_len) - seq_len
    """
    assert len(seq_len.shape) == 1
    csum = layers.cumsum(seq_len)
    offset = csum - seq_len
    return offset
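Both variants compute the same quantity: an inclusive cumsum minus the element equals the exclusive cumsum. A minimal NumPy sketch with made-up lengths:

import numpy as np

seq_len = np.array([4, 3, 5])
offset = np.cumsum(seq_len) - seq_len  # == exclusive cumsum
print(offset)                          # [0 4 7] -- start index of each sequence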
def flat_words(self, words):
    pad_index = self.args.pad_index
    lens = nn.reduce_sum(words != pad_index, dim=-1)
    position = layers.cumsum(
        lens + layers.cast((lens == 0), "int32"), axis=1) - 1
    flat_words = nn.masked_select(words, words != pad_index)
    flat_words = nn.pad_sequence_paddle(
        layers.split(flat_words,
                     layers.reduce_sum(lens, -1).numpy().tolist(),
                     pad_index))
    max_len = flat_words.shape[1]
    position = nn.mask_fill(position, position >= max_len, max_len - 1)
    return flat_words, position
def __build_meta_data(self, num_nodes, num_edges):
    """ Merge information for nodes and edges.
    """
    num_nodes = L.reshape(num_nodes, [-1])
    num_edges = L.reshape(num_edges, [-1])
    num_nodes = paddle_helper.ensure_dtype(num_nodes, dtype="int32")
    num_edges = paddle_helper.ensure_dtype(num_edges, dtype="int32")

    num_graph = L.shape(num_nodes)[0]
    sum_num_nodes = L.reduce_sum(num_nodes)
    sum_num_edges = L.reduce_sum(num_edges)
    edge_lod = L.concat(
        [L.cumsum(num_edges, exclusive=True), sum_num_edges])
    edge_lod = paddle_helper.lod_remove(edge_lod)

    node_shift = L.cumsum(num_nodes, exclusive=True)
    graph_lod = L.concat([node_shift, sum_num_nodes])
    graph_lod = paddle_helper.lod_remove(graph_lod)
    self._num_nodes = sum_num_nodes
    self._num_edges = sum_num_edges
    self._num_graph = num_graph
    self._graph_lod = graph_lod
    return node_shift, edge_lod
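A NumPy sketch (made-up batch sizes) of the LoD arrays this builds; `node_shift` is the exclusive cumsum later used to shift per-graph edge indices into the batched node space:

import numpy as np

num_nodes = np.array([3, 2, 4], dtype="int32")
num_edges = np.array([4, 1, 6], dtype="int32")

node_shift = np.cumsum(num_nodes) - num_nodes        # [0 3 5]
graph_lod = np.append(node_shift, num_nodes.sum())   # [0 3 5 9]
edge_lod = np.append(np.cumsum(num_edges) - num_edges,
                     num_edges.sum())                # [0 4 5 11]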
def __init__(self, graph_wrapper, dropout, keep_self_loop=True):
    super(DropEdgeWrapper, self).__init__()

    # Copy Node's information
    for key, value in graph_wrapper.node_feat.items():
        self.node_feat_tensor_dict[key] = value

    self._num_nodes = graph_wrapper.num_nodes
    self._graph_lod = graph_wrapper.graph_lod
    self._num_graph = graph_wrapper.num_graph

    # Dropout Edges
    src, dst = graph_wrapper.edges
    u = L.uniform_random(
        shape=L.cast(L.shape(src), 'int64'), min=0., max=1.)

    # Avoid Empty Edges
    keeped = L.cast(u > dropout, dtype="float32")
    self._num_edges = L.reduce_sum(L.cast(keeped, "int32"))
    keeped = keeped + L.cast(self._num_edges == 0, dtype="float32")

    if keep_self_loop:
        self_loop = L.cast(src == dst, dtype="float32")
        keeped = keeped + self_loop

    keeped = (keeped > 0.5)
    src = paddle_helper.masked_select(src, keeped)
    dst = paddle_helper.masked_select(dst, keeped)
    src.stop_gradient = True
    dst.stop_gradient = True
    self._edges_src = src
    self._edges_dst = dst

    for key, value in graph_wrapper.edge_feat.items():
        self.edge_feat_tensor_dict[key] = paddle_helper.masked_select(
            value, keeped)

    self._edge_uniq_dst, _, uniq_count = L.unique_with_counts(
        dst, dtype="int32")
    self._edge_uniq_dst.stop_gradient = True
    last = L.reduce_sum(uniq_count, keep_dim=True)
    uniq_count = L.cumsum(uniq_count, exclusive=True)
    self._edge_uniq_dst_count = L.concat([uniq_count, last])
    self._edge_uniq_dst_count.stop_gradient = True
    self._indegree = get_degree(self._edges_dst, self._num_nodes)
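A NumPy sketch of the keep-mask logic above (illustrative edges): each edge is dropped with probability `dropout`, the mask is forced non-empty, and self-loops are optionally re-added:

import numpy as np

rng = np.random.default_rng(0)
src = np.array([0, 1, 2, 2])
dst = np.array([1, 2, 2, 0])
dropout = 0.5

keeped = (rng.uniform(size=src.shape) > dropout).astype("float32")
keeped = keeped + float(keeped.sum() == 0)  # if every edge was dropped, keep them all
keeped = keeped + (src == dst)              # keep_self_loop=True: self-loops survive
keep = keeped > 0.5
src, dst = src[keep], dst[keep]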
def fluid_sequence_get_pos(lodtensor):
    """
    args:
        lodtensor: lod = [[0,4,7]]
    return:
        pos: lod = [[0,4,7]]
             data = [0,1,2,3,0,1,2]
             shape = [-1, 1]
    """
    lodtensor = layers.reduce_sum(lodtensor, dim=1, keep_dim=True)
    assert lodtensor.shape == (-1, 1), (lodtensor.shape)
    ones = layers.cast(lodtensor * 0 + 1, 'float32')    # (batch*seq_len, 1)
    ones_padded = fluid_sequence_pad(ones, 0)           # (batch, max_seq_len, 1)
    ones_padded = layers.squeeze(ones_padded, [2])      # (batch, max_seq_len)
    seq_len = layers.cast(
        layers.reduce_sum(ones_padded, 1, keep_dim=True), 'int64')  # (batch, 1)
    pos = layers.cast(
        layers.cumsum(ones_padded, 1, exclusive=True), 'int64')
    pos = layers.sequence_unpad(pos, seq_len)           # (batch*seq_len, 1)
    pos.stop_gradient = True
    return pos
def topp_sampling(self, probs):
    sorted_probs, sorted_idx = layers.argsort(probs, descending=True)
    cum_sorted_probs = layers.cumsum(sorted_probs, axis=1, exclusive=True)
    # keep tokens whose exclusive cumulative probability is still below topp
    lt_cond = paddle.cast(
        paddle.less_than(
            cum_sorted_probs,
            layers.fill_constant_batch_size_like(cum_sorted_probs,
                                                 cum_sorted_probs.shape,
                                                 cum_sorted_probs.dtype,
                                                 self.topp)), "float32")
    old_probs = probs
    candidate_probs = sorted_probs * lt_cond
    probs = candidate_probs / paddle.sum(
        candidate_probs, axis=-1, keepdim=True)
    sampling_ids = layers.sampling_id(probs, dtype="int")
    # map sampled positions in the sorted order back to vocabulary ids
    sampling_ids = paddle.index_sample(sorted_idx,
                                       paddle.unsqueeze(sampling_ids, [1]))
    sampling_ids = paddle.squeeze(sampling_ids, [1])
    probs = old_probs
    return probs, sampling_ids
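The same nucleus (top-p) filtering, restated as a hedged single-row NumPy sketch; `topp` mirrors `self.topp` above, and the function name is chosen for illustration:

import numpy as np

def topp_filter_sample(probs, topp=0.9, rng=np.random.default_rng(0)):
    order = np.argsort(-probs)                    # vocabulary ids, descending prob
    sorted_probs = probs[order]
    cum = np.cumsum(sorted_probs) - sorted_probs  # exclusive cumsum
    candidate = sorted_probs * (cum < topp)       # the top token is always kept
    candidate = candidate / candidate.sum()       # renormalize the nucleus
    return order[rng.choice(len(probs), p=candidate)]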
def __init__(self, input_mask):
    super(BigBirdWrapper, self).__init__()
    max_seqlen = L.shape(input_mask)[1]
    input_mask = L.reshape(input_mask, [-1])
    num_nodes = L.shape(input_mask)[0]
    src, dst = build_edges(num_nodes, input_mask, max_seqlen)
    self._edges_src = src
    self._edges_dst = dst
    self._edges_src.stop_gradient = True
    self._edges_dst.stop_gradient = True
    self._num_nodes = num_nodes
    self._num_edges = L.shape(self._edges_src)[0]
    self._node_ids = L.range(0, self._num_nodes, step=1, dtype="int32")
    self._edge_uniq_dst, _, uniq_count = L.unique_with_counts(
        self._edges_dst, dtype="int32")
    self._edge_uniq_dst.stop_gradient = True
    last = L.reduce_sum(uniq_count, keep_dim=True)
    uniq_count = L.cumsum(uniq_count, exclusive=True)
    self._edge_uniq_dst_count = L.concat([uniq_count, last])
    self._edge_uniq_dst_count.stop_gradient = True
def fluid_sequence_get_pos(lodtensor):
    """
    args:
        lodtensor: lod = [[0,4,7]]
    return:
        pos: lod = [[0,4,7]]
             data = [0,1,2,3,0,1,2]
             shape = [-1, 1]
    """
    lodtensor_slice = layers.slice(lodtensor, axes=[1], starts=[0], ends=[1])
    assert lodtensor_slice.shape == (-1, 1), (lodtensor_slice.shape)
    ones = layers.cast(lodtensor_slice * 0 + 1, 'float32')  # (batch*seq_len, 1)
    ones = layers.lod_reset(ones, lodtensor)
    ones_padded = fluid_sequence_pad(ones, 0)           # (batch, max_seq_len, 1)
    ones_padded = layers.squeeze(ones_padded, [2])      # (batch, max_seq_len)
    seq_len = layers.cast(
        layers.reduce_sum(ones_padded, 1, keep_dim=True), 'int64')  # (batch, 1)
    pos = layers.cast(
        layers.cumsum(ones_padded, 1, exclusive=True), 'int64')
    pos = layers.sequence_unpad(pos, seq_len)           # (batch*seq_len, 1)
    return pos
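A NumPy sketch of the per-sequence positions both `fluid_sequence_get_pos` variants produce for lod = [[0, 4, 7]]:

import numpy as np

lod = [0, 4, 7]
pos = np.concatenate([np.arange(end - start)
                      for start, end in zip(lod[:-1], lod[1:])])
print(pos)  # [0 1 2 3 0 1 2] -- the counter restarts for each sequence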
def __init__(self, graph_wrapper, part_id, num_parts):
    super(PartitionWrapper, self).__init__()

    # Copy Node's information
    for key, value in graph_wrapper.node_feat.items():
        self.node_feat_tensor_dict[key] = value

    self._num_nodes = graph_wrapper.num_nodes
    self._graph_lod = graph_wrapper.graph_lod
    self._num_graph = graph_wrapper.num_graph

    # Partition edges by destination node
    src, dst = graph_wrapper.edges
    keeped = L.cast((dst % num_parts) == part_id, dtype="float32")
    keeped = (keeped > 0.5)
    self.keeped = keeped
    self._num_edges = L.reduce_sum(L.cast(keeped, "int32"))
    #L.Print(self._num_edges, message="Part-%s num edges" % part_id)

    src = paddle_helper.masked_select(src, keeped)
    dst = paddle_helper.masked_select(dst, keeped)
    src.stop_gradient = True
    dst.stop_gradient = True
    self._edges_src = src
    self._edges_dst = dst

    for key, value in graph_wrapper.edge_feat.items():
        self.edge_feat_tensor_dict[key] = paddle_helper.masked_select(
            value, keeped)

    self._edge_uniq_dst, _, uniq_count = L.unique_with_counts(
        dst, dtype="int32")
    self._edge_uniq_dst.stop_gradient = True
    last = L.reduce_sum(uniq_count, keep_dim=True)
    uniq_count = L.cumsum(uniq_count, exclusive=True)
    self._edge_uniq_dst_count = L.concat([uniq_count, last])
    self._edge_uniq_dst_count.stop_gradient = True
    self._indegree = get_degree(self._edges_dst, self._num_nodes)
def inference(self, model, inputs, outputs):
    """
    Run inference.

    Args:
        inputs(dict): Its key is input name(str) and its value is a Variable.
        model(object): A generate model. Need to implement `_generation_network`
            and `_calc_logits`.

    Returns:
        dict(str:Variable): Its key is output name(str) and its value is a Variable.
    """
    # prepare while loop
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
    min_len = layers.fill_constant(
        shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)

    ids = layers.array_write(
        layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
    pos_biases = layers.array_write(
        layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
    scores = layers.array_write(inputs["init_score"], step_idx)
    tgt_generation_mask = layers.array_write(
        inputs["tgt_generation_mask"], step_idx)
    parent_idx = inputs["parent_idx"]

    if self.decoding_strategy == "beam_search":
        beam_size = self.beam_size
    else:
        beam_size = 1

    eos_penalty = np.zeros(self.vocab_size, dtype="float32")
    eos_penalty[self.eos_id] = -1e9
    eos_penalty = layers.assign(eos_penalty)

    token_penalty = np.zeros(self.vocab_size, dtype="float32")
    token_penalty[self.unk_id] = -1e9
    if self.mask_id >= 0:
        token_penalty[self.mask_id] = -1e9
    token_penalty = layers.assign(token_penalty)

    # start while loop
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        pos_bias = layers.array_read(array=pos_biases, i=step_idx)
        pos_bias = layers.gather(input=pos_bias, index=parent_idx)

        tmp_tgt_generation_mask = layers.array_read(
            tgt_generation_mask, i=step_idx)
        dtype = tmp_tgt_generation_mask.dtype

        append_mask = layers.fill_constant_batch_size_like(
            input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype)
        tmp_tgt_generation_mask = layers.concat(
            [tmp_tgt_generation_mask, append_mask], axis=2)
        pre_mask = tmp_tgt_generation_mask = layers.gather(
            input=tmp_tgt_generation_mask, index=parent_idx)

        pre_sent = layers.fill_constant_batch_size_like(
            input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype)

        if self.continuous_position:
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0) + pos_bias
        else:
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0)

        dec_out, _ = model._generation_network(
            token_ids=pre_ids,
            type_ids=pre_sent,
            pos_ids=pre_pos,
            generation_mask=tmp_tgt_generation_mask,
            gather_idx=parent_idx)
        logits = model._calc_logits(dec_out)

        # ignore unk and mask token
        if self.ignore_unk:
            logits = layers.elementwise_add(logits, token_penalty, axis=1)

        # min dec length
        min_len_cond = layers.less_than(x=step_idx, y=min_len)

        def min_len_penalty():
            """Plus minimum length penalty."""
            return layers.elementwise_add(logits, eos_penalty, axis=1)

        def no_penalty():
            """No penalty."""
            return logits

        logits = layers.case(
            [(min_len_cond, min_len_penalty)], default=no_penalty)

        # get probs
        probs = layers.softmax(logits / self.temperature)

        if self.decoding_strategy == "beam_search":
            topk_scores, topk_indices = layers.topk(input=probs, k=beam_size)
        else:
            if self.decoding_strategy.startswith("sampling"):
                sampling_ids = layers.sampling_id(probs, dtype="int")
            elif self.decoding_strategy.startswith("topk_sampling"):
                topk_probs, _ = layers.topk(input=probs, k=self.topk)
                ge_cond = layers.cast(
                    layers.greater_equal(
                        probs, layers.unsqueeze(topk_probs[:, -1], [1])),
                    "float32")
                old_probs = probs
                probs = probs * ge_cond / layers.reduce_sum(
                    topk_probs, dim=-1, keep_dim=True)
                sampling_ids = layers.sampling_id(probs, dtype="int")
                probs = old_probs
            elif self.decoding_strategy.startswith("topp_sampling"):
                sorted_probs, sorted_idx = layers.argsort(
                    probs, descending=True)
                cum_sorted_probs = layers.cumsum(
                    sorted_probs, axis=1, exclusive=True)
                lt_cond = layers.cast(
                    layers.less_than(
                        cum_sorted_probs,
                        layers.fill_constant_batch_size_like(
                            cum_sorted_probs, cum_sorted_probs.shape,
                            cum_sorted_probs.dtype, self.topp)), "float32")
                old_probs = probs
                candidate_probs = sorted_probs * lt_cond
                probs = candidate_probs / layers.reduce_sum(
                    candidate_probs, dim=-1, keep_dim=True)
                sampling_ids = layers.sampling_id(probs, dtype="int")
                sampling_ids = layers.index_sample(
                    sorted_idx, layers.unsqueeze(sampling_ids, [1]))
                sampling_ids = layers.squeeze(sampling_ids, [1])
                probs = old_probs
            else:
                raise ValueError(self.decoding_strategy)

            sampling_scores = layers.one_hot(
                layers.unsqueeze(sampling_ids, [1]), probs.shape[1])
            sampling_scores = sampling_scores * probs - (
                1 - sampling_scores) * 1e3
            topk_scores, topk_indices = layers.topk(
                input=sampling_scores, k=1)

        pre_len = layers.cast(step_idx, "float32")
        layers.increment(x=step_idx, value=1.0, in_place=True)
        cur_len = layers.cast(step_idx, "float32")

        # update scores
        if self.length_average:
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_len,
                axis=0) / cur_len
        elif self.length_penalty > 0:
            pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
            cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores * pre_lp,
                axis=0) / cur_lp
        else:
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores), y=pre_scores, axis=0)

        topk_indices = layers.lod_reset(topk_indices, pre_ids)
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=self.eos_id,
            return_parent_idx=True)

        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
        layers.array_write(pos_bias, i=step_idx, array=pos_biases)
        layers.assign(gather_idx, parent_idx)

        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)

    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=self.eos_id)

    predictions = {
        "finished_ids": finished_ids,
        "finished_scores": finished_scores,
        "token_ids": inputs["token_ids"],
        "data_id": inputs["data_id"]
    }
    return predictions
def build_position_ids(term_ids):
    input_mask = L.cast(term_ids > 0, "int64")
    position_ids = L.cumsum(input_mask, axis=1) - 1
    return position_ids
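A NumPy illustration with a made-up row: pad id 0 is masked out, so real tokens get positions 0..n-1 and trailing pads repeat the last valid position:

import numpy as np

term_ids = np.array([[5, 7, 9, 0, 0]])
input_mask = (term_ids > 0).astype("int64")
position_ids = np.cumsum(input_mask, axis=1) - 1
print(position_ids)  # [[0 1 2 2 2]]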