def train():
    """Fine-tune ERNIE-Gen on the Poetry dataset, with periodic eval/saving.

    Reads every hyper-parameter from the module-level ``args`` namespace.
    On multi-GPU runs, only rank 0 logs, evaluates and writes checkpoints
    (saved under ``args.output_dir/model_<step>``).
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    # Select the tokenizer class by substring-matching the model name.
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)
    train_dataset, dev_dataset = Poetry.get_datasets(['train', 'dev'])
    # Use the dedicated [ATTN] token if present in the vocab, else fall
    # back to [MASK] as the attention-query placeholder.
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1
    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len,
                                 noise_prob=args.noise_prob,
                                 use_random_noice=args.use_random_noice)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_dataset,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)
    label_num = model.word_emb.weight.shape[0]
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)
    max_steps = (len(train_data_loader) * args.num_epochs)
    # Linear warmup for the first warmup_proportion of steps, then linear
    # decay to zero over the remaining steps.
    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, num_warmup_steps=max_steps * args.
        warmup_proportion, num_training_steps=max_steps: float(
            current_step) / float(max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        # Exclude bias and norm parameters from weight decay.
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])
    rouge1 = Rouge1()
    rouge2 = Rouge2()
    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
             attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels, _) = batch
            # Pass 1: encode the source and cache its K/V states.
            _, __, info = model(src_ids,
                                sent_ids=src_sids,
                                pos_ids=src_pids,
                                attn_bias=mask_src_2_src,
                                encode_only=True)
            cached_k, cached_v = info['caches']
            # Pass 2: encode the target conditioned on the source cache.
            _, __, info = model(tgt_ids,
                                sent_ids=tgt_sids,
                                pos_ids=tgt_pids,
                                attn_bias=mask_tgt_2_srctgt,
                                past_cache=(cached_k, cached_v),
                                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            # Concatenate source and target caches along the sequence axis.
            past_cache_k = [
                paddle.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
            ]
            past_cache_v = [
                paddle.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
            ]
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            # Pass 3: predict target tokens at the [ATTN] placeholder
            # positions and compute the loss.
            loss, _, __ = model(attn_ids,
                                sent_ids=tgt_sids,
                                pos_ids=tgt_pids,
                                attn_bias=mask_attn_2_srctgtattn,
                                past_cache=(past_cache_k, past_cache_v),
                                tgt_labels=tgt_labels,
                                tgt_pos=paddle.nonzero(attn_ids == attn_id))
            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train),
                           lr_scheduler.get_lr()))
                    tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0 and (
                (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0):
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                # Unwrap DataParallel before saving so the checkpoint is
                # loadable on a single device.
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
def forward(self,
            input_ids=None,
            token_type_ids=None,
            position_ids=None,
            attention_mask=None,
            query_input_ids=None,
            query_token_type_ids=None,
            query_position_ids=None,
            query_attention_mask=None,
            title_input_ids=None,
            title_token_type_ids=None,
            title_position_ids=None,
            title_attention_mask=None,
            seq_lengths=None,
            labels=None):
    """Multi-task forward pass dispatching on ``self.task``.

    Tasks handled: ``'seq-cls'`` (sequence classification), ``'token-cls'``
    (token classification, F1 via chunk metric), ``'text-matching'``
    (query/title mean-pooled similarity), and a fallback that returns the
    raw backbone outputs.

    Returns:
        ``probs`` only, or ``(probs, loss, metric_dict)`` when ``labels``
        is provided; for the fallback branch,
        ``(sequence_output, pooled_output)``.
    """
    if self.task != 'text-matching':
        result = self.model(input_ids, token_type_ids, position_ids,
                            attention_mask)
    else:
        # Text matching encodes query and title separately with the same
        # shared backbone.
        query_result = self.model(query_input_ids, query_token_type_ids,
                                  query_position_ids, query_attention_mask)
        title_result = self.model(title_input_ids, title_token_type_ids,
                                  title_position_ids, title_attention_mask)
    if self.task == 'seq-cls':
        logits = result
        probs = F.softmax(logits, axis=1)
        if labels is not None:
            loss = self.criterion(logits, labels)
            correct = self.metric.compute(probs, labels)
            acc = self.metric.update(correct)
            return probs, loss, {'acc': acc}
        return probs
    elif self.task == 'token-cls':
        logits = result
        token_level_probs = F.softmax(logits, axis=-1)
        preds = token_level_probs.argmax(axis=-1)
        if labels is not None:
            loss = self.criterion(logits, labels.unsqueeze(-1))
            num_infer_chunks, num_label_chunks, num_correct_chunks = \
                self.metric.compute(None, seq_lengths, preds, labels)
            self.metric.update(num_infer_chunks.numpy(),
                               num_label_chunks.numpy(),
                               num_correct_chunks.numpy())
            _, _, f1_score = map(float, self.metric.accumulate())
            return token_level_probs, loss, {'f1_score': f1_score}
        return token_level_probs
    elif self.task == 'text-matching':
        query_token_embedding = query_result
        query_token_embedding = self.dropout(query_token_embedding)
        # Zero out padded positions so they do not contribute to the mean.
        query_attention_mask = paddle.unsqueeze(
            (query_input_ids != self.model.pad_token_id).astype(
                query_token_embedding.dtype),
            axis=2)
        query_token_embedding = query_token_embedding * query_attention_mask
        query_sum_embedding = paddle.sum(query_token_embedding, axis=1)
        query_sum_mask = paddle.sum(query_attention_mask, axis=1)
        # Mean pooling over non-pad tokens.
        query_mean = query_sum_embedding / query_sum_mask
        title_token_embedding = title_result
        title_token_embedding = self.dropout(title_token_embedding)
        title_attention_mask = paddle.unsqueeze(
            (title_input_ids != self.model.pad_token_id).astype(
                title_token_embedding.dtype),
            axis=2)
        title_token_embedding = title_token_embedding * title_attention_mask
        title_sum_embedding = paddle.sum(title_token_embedding, axis=1)
        title_sum_mask = paddle.sum(title_attention_mask, axis=1)
        title_mean = title_sum_embedding / title_sum_mask
        # Classify on [query_mean; title_mean; |query_mean - title_mean|].
        sub = paddle.abs(paddle.subtract(query_mean, title_mean))
        projection = paddle.concat([query_mean, title_mean, sub], axis=-1)
        logits = self.classifier(projection)
        probs = F.softmax(logits)
        if labels is not None:
            loss = self.criterion(logits, labels)
            correct = self.metric.compute(probs, labels)
            acc = self.metric.update(correct)
            return probs, loss, {'acc': acc}
        return probs
    else:
        sequence_output, pooled_output = result
        return sequence_output, pooled_output
def test_quant_concat(self):
    """The quantizable concat layer must match paddle.concat exactly."""
    expected = paddle.concat([self.x, self.y], axis=0)
    quant_layer = paddle.nn.quant.concat()
    actual = quant_layer([self.x, self.y], 0)
    self.check(expected, actual)
    self.assertTrue(expected.shape == actual.shape)
def sample(self,
           input_ids,
           logits_processors,
           max_length,
           pad_token_id,
           eos_token_id,
           top_k=None,
           top_p=None,
           temperature=None,
           min_tokens_to_keep=1,
           **model_kwargs):
    """Auto-regressive multinomial sampling with optional top-k / top-p
    filtering and temperature scaling.

    Generates until ``max_length`` or until every sequence has emitted
    ``eos_token_id``. Finished sequences keep emitting ``pad_token_id``.

    Returns:
        tuple: ``(generated_ids, scores)`` where ``generated_ids`` excludes
        the prompt (``input_ids[:, origin_len:]``) and ``scores`` are the
        accumulated log-probabilities of the sampled tokens.
    """

    def TopKProcess(probs, top_k, min_tokens_to_keep):
        # Clamp k into [min_tokens_to_keep, vocab_size].
        top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1])
        # Remove all tokens with a probability less than the last token of
        # the top-k (zeroing them; probs are re-normalized by multinomial).
        topk_probs, _ = paddle.topk(probs, k=top_k)
        probs = paddle.where(probs >= topk_probs[:, -1:], probs,
                             paddle.full_like(probs, 0.0))
        return probs

    def TopPProcess(probs, top_p, min_tokens_to_keep):
        sorted_probs = paddle.sort(probs, descending=True)
        sorted_indices = paddle.argsort(probs, descending=True)
        cumulative_probs = paddle.cumsum(sorted_probs, axis=-1)
        # Remove tokens with cumulative probs above top_p, but keep at
        # least min_tokens_to_keep tokens.
        sorted_indices_to_remove = cumulative_probs > top_p
        if min_tokens_to_keep > 1:
            # Set 'min_tokens_to_keep - 1' because the first token is kept.
            sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0
        # Shift right by one so the first token above the threshold is kept.
        sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove,
                                               dtype='int64')
        sorted_indices_to_remove[:, 1:] = (
            sorted_indices_to_remove[:, :-1].clone())
        sorted_indices_to_remove[:, 0] = 0
        # Scatter the removal flags back to the original (unsorted) index
        # order, flattening batch and vocab into one axis.
        sorted_indices = sorted_indices + paddle.arange(probs.shape[
            0]).unsqueeze(-1) * probs.shape[-1]
        condition = paddle.scatter(sorted_indices_to_remove.flatten(),
                                   sorted_indices.flatten(),
                                   sorted_indices_to_remove.flatten())
        condition = paddle.cast(condition, 'bool').reshape(probs.shape)
        probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs)
        return probs

    batch_size, cur_len = input_ids.shape
    origin_len = cur_len
    unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool')
    scores = paddle.full(
        [batch_size, 1], 0.0, dtype=paddle.get_default_dtype())
    while cur_len < max_length:
        # Prepare model inputs & get model output.
        model_inputs = self.prepare_inputs_for_generation(input_ids,
                                                          **model_kwargs)
        outputs = self(**model_inputs)
        logits = outputs[0] if isinstance(outputs, tuple) else outputs
        # [batch_size, vocab_size]: keep only the last position's logits.
        logits = logits[:, -1, :]
        # Pre-process the distribution.
        logits = self.adjust_logits_during_generation(logits)
        logits = logits_processors(input_ids, logits)
        # Scores use the distribution BEFORE temperature/top-k/top-p.
        origin_probs = F.softmax(logits)
        origin_probs = paddle.log(origin_probs)
        if temperature is not None and temperature != 1.0:
            logits = logits / temperature
        probs = F.softmax(logits)
        if top_k is not None and top_k != 0:
            probs = TopKProcess(probs, top_k, min_tokens_to_keep)
        if top_p is not None and top_p < 1.0:
            probs = TopPProcess(probs, top_p, min_tokens_to_keep)
        next_tokens = paddle.multinomial(probs)
        next_scores = paddle.index_sample(origin_probs, next_tokens)
        if eos_token_id is not None:
            # Finished sequences keep emitting pad tokens.
            next_tokens = paddle.where(
                unfinished_flag, next_tokens,
                paddle.full_like(next_tokens, pad_token_id))
        scores = self.update_scores_for_generation(
            scores, next_scores, cur_len - origin_len, unfinished_flag)
        cur_len += 1
        input_ids = paddle.concat([input_ids, next_tokens], axis=1)
        if eos_token_id is not None:
            unfinished_flag = paddle.logical_and(
                unfinished_flag, next_tokens != eos_token_id)
        # Stop when there is a </s> in all sentences.
        if not paddle.any(unfinished_flag):
            break
        model_kwargs = self.update_model_kwargs_for_generation(outputs,
                                                               model_kwargs)
    return input_ids[:, origin_len:], scores
def __call__(self, seg_preds, seg_masks, cate_labels, cate_scores,
             sum_masks=None):
    """Matrix NMS for instance segmentation (SOLOv2-style).

    Decays the score of each mask by its IoU overlap with higher-scoring
    masks of the same class, using either a gaussian or linear kernel,
    instead of hard-suppressing overlaps.

    Args:
        seg_preds: per-instance mask predictions.
        seg_masks: binarized masks used for IoU computation.
        cate_labels: predicted class label per instance.
        cate_scores: confidence score per instance.
        sum_masks: per-instance mask areas (sum over pixels).
            NOTE(review): despite the ``=None`` default, this is gathered
            unconditionally, so ``None`` would fail — confirm callers
            always pass it.

    Returns:
        tuple: (seg_preds, cate_scores, cate_labels) after score decay,
        thresholding and top-k selection.
    """
    # Sort by score and keep the top nms_pre candidates.
    sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n)
    seg_masks = paddle.gather(seg_masks, index=sort_inds)
    seg_preds = paddle.gather(seg_preds, index=sort_inds)
    sum_masks = paddle.gather(sum_masks, index=sort_inds)
    cate_scores = paddle.gather(cate_scores, index=sort_inds)
    cate_labels = paddle.gather(cate_labels, index=sort_inds)
    seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1)
    # Pairwise intersection (masks are flattened to vectors).
    inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0]))
    n_samples = paddle.shape(cate_labels)
    # Union via broadcasted per-instance areas.
    sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples])
    # Pairwise IoU; triu keeps only pairs where row index < col index,
    # i.e. each mask against higher-scoring masks.
    iou_matrix = (inter_matrix / (
        sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix))
    iou_matrix = paddle.triu(iou_matrix, diagonal=1)
    # Same-class indicator matrix (decay only applies within a class).
    cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples])
    label_matrix = paddle.cast(
        (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])), 'float32')
    label_matrix = paddle.triu(label_matrix, diagonal=1)
    # IoU compensation: the largest same-class IoU already suppressing
    # each higher-scoring mask.
    compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)
    compensate_iou = paddle.expand(compensate_iou,
                                   shape=[n_samples, n_samples])
    compensate_iou = paddle.transpose(compensate_iou, [1, 0])
    # IoU decay.
    decay_iou = iou_matrix * label_matrix
    # Matrix NMS kernel.
    if self.kernel == 'gaussian':
        decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2))
        compensate_matrix = paddle.exp(-1 * self.sigma * (compensate_iou**2))
        decay_coefficient = paddle.min(decay_matrix / compensate_matrix,
                                       axis=0)
    elif self.kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = paddle.min(decay_matrix, axis=0)
    else:
        raise NotImplementedError
    # Update the scores with the decay coefficients.
    cate_scores = cate_scores * decay_coefficient
    y = paddle.zeros(shape=paddle.shape(cate_scores), dtype='float32')
    keep = paddle.where(cate_scores >= self.update_threshold, cate_scores, y)
    keep = paddle.nonzero(keep)
    keep = paddle.squeeze(keep, axis=[1])
    # Prevent an empty keep set by always appending the last index
    # (fake data so downstream gathers never see zero rows).
    keep = paddle.concat(
        [keep, paddle.cast(paddle.shape(cate_scores)[0] - 1, 'int64')])
    seg_preds = paddle.gather(seg_preds, index=keep)
    cate_scores = paddle.gather(cate_scores, index=keep)
    cate_labels = paddle.gather(cate_labels, index=keep)
    # Final sort; keep the post-NMS top-k.
    sort_inds = self._sort_score(cate_scores, self.post_nms_top_n)
    seg_preds = paddle.gather(seg_preds, index=sort_inds)
    cate_scores = paddle.gather(cate_scores, index=sort_inds)
    cate_labels = paddle.gather(cate_labels, index=sort_inds)
    return seg_preds, cate_scores, cate_labels
def forward(self, pixels, regions):
    """Augment pixel features with region context and project back down."""
    attended = self.attention_block(pixels, regions)
    merged = paddle.concat([attended, pixels], axis=1)
    return self.conv1x1(merged)
def concat_unsqueeze2(inputs):
    """Apply a distinct (offset, scale) affine map to each of the three
    channels on axis 2, then restack the results along that axis."""
    offset_scale_pairs = ((2, 7), (3, 11), (5, 13))
    channels = [
        paddle.unsqueeze((inputs[:, :, idx] + offset) * scale, axis=2)
        for idx, (offset, scale) in enumerate(offset_scale_pairs)
    ]
    return paddle.concat(channels, axis=2)
def to_tensor(self):
    """Pack the quaternion and translation into one tensor on the last axis."""
    components = [self.quaternion, self.translation]
    return paddle.concat(components, axis=-1)
def forward(self, x, fusions):
    """Gated fusion: blend a tanh candidate with the input via a sigmoid gate."""
    combined = paddle.concat([x, fusions], axis=2)
    candidate = F.tanh(self.linear_r(combined))
    gate = F.sigmoid(self.linear_g(combined))
    return gate * candidate + (1 - gate) * x
def forward(self, indices, segments, positions, input_mask):
    r'''
    The BertModel forward method, overrides the `__call__()` special method.

    Builds the static-graph transformer stack (attention + feed-forward per
    layer) with per-layer IPU shard/name scopes, `__available_memory`
    attributes on the matmuls, and cached attention masks for pretraining.

    Args:
        indices (Tensor):
            Indices of input sequence tokens in the vocabulary. They are
            numerical representations of tokens that build the input sequence.
            Its data type should be `int32` and it has a shape of
            [batch_size * sequence_length].
        segments (Tensor):
            Segment token indices to indicate different portions of the inputs.
            Selected in the range ``[0, type_vocab_size - 1]``.
            Its data type should be `int32` and it has a shape of
            [batch_size * sequence_length].
        positions(Tensor):
            Indices of positions of each input sequence tokens in the position
            embeddings. Selected in the range ``[0, max_position_embeddings - 1]``.
            Shape as `[batch_size * sequence_length]` and dtype as int32.
        input_mask (Tensor, optional):
            Mask used in multi-head attention to avoid performing attention on
            to some unwanted positions, usually the paddings or the subsequent
            positions.
            If the task is PRETRAINING:
                input_mask[0] is the index that masking starts in the mask_tokens
                input_mask[1] is the index that masking starts in the rest of the sequence
            Otherwise
                input_mask is the mask tensor that has -1000 in positions to be
                masked and 0 otherwise.

    Returns:
        tuple: Returns tuple (`sequence_output`, `word_embeddings_weights`).

        With the fields:

        - `sequence_output` (Tensor):
            Sequence of hidden-states at the last layer of the model.
            It's data type should be float32 and its shape is
            [batch_size, sequence_length, hidden_size].

        - `word_embeddings_weights` (Tensor):
            The word-embedding weight matrix returned by the embedding layer
            (used elsewhere for weight tying).
    '''
    with self.config.embeddings_scope:
        sequence_output, word_embeddings_weights = self.embedding(
            indices, segments, positions)
    if self.config.task == "PRETRAINING":
        # Detach the mask indices so no gradient flows through them.
        with paddle.static.ipu_shard_guard(index=0, stage=0):
            input_mask[0] = self.custom_ops.detach(input_mask[0])
            input_mask[1] = self.custom_ops.detach(input_mask[1])
    for i in range(self.config.num_hidden_layers):
        # ---- Attention ----
        attn_scope = self.config.attn_scopes[i]
        with attn_scope:
            with paddle.static.name_scope(f"Layer{i}/Attention"):
                layer_input = sequence_output
                # Fused QKV projection: three [H, H] parameters concatenated
                # into one [H, 3H] matmul.
                q = self.create_parameter(shape=[
                    self.config.hidden_size, self.config.hidden_size
                ],
                                          dtype="float32")
                k = self.create_parameter(shape=[
                    self.config.hidden_size, self.config.hidden_size
                ],
                                          dtype="float32")
                v = self.create_parameter(shape=[
                    self.config.hidden_size, self.config.hidden_size
                ],
                                          dtype="float32")
                qkv = paddle.concat([q, k, v], axis=1)
                qkv = paddle.matmul(sequence_output, qkv)
                # Limit on-chip memory used by the matmul (IPU tuning knob).
                qkv.block.ops[-1]._set_attr(
                    '__available_memory',
                    self.config.available_mem_proportion)
                q, k, v = paddle.split(qkv,
                                       num_or_sections=[
                                           self.config.hidden_size,
                                           self.config.hidden_size,
                                           self.config.hidden_size
                                       ],
                                       axis=1)
                # Reshape to multi-head layout; k is transposed for QK^T.
                q = paddle.reshape(q, self.qkv_shape)
                q = paddle.transpose(q, [0, 2, 1, 3])
                k = paddle.reshape(k, self.qkv_shape)
                k = paddle.transpose(k, [0, 2, 3, 1])
                v = paddle.reshape(v, self.qkv_shape)
                v = paddle.transpose(v, [0, 2, 1, 3])
                # Attention calculation.
                with paddle.static.name_scope(f"Z"):
                    if self.config.task == "PRETRAINING":
                        # Build the pretraining attention mask once per
                        # shard index, then reuse the cached copy.
                        if attn_scope.index in self.masks:
                            final_mask = self.masks[attn_scope.index]
                        else:
                            with paddle.static.name_scope("Mask"):
                                base_value = np.arange(
                                    self.config.seq_len).astype('int32')
                                base = paddle.fluid.layers.assign(base_value)
                                mmask = paddle.less_than(base, input_mask[0])
                                mask_value = np.greater_equal(
                                    base_value,
                                    self.config.max_predictions_per_seq)
                                mask = paddle.fluid.layers.assign(mask_value)
                                mmask = paddle.logical_or(mmask, mask)
                                smask = paddle.less_than(base, input_mask[1])
                                final_mask = paddle.logical_and(mmask, smask)
                                final_mask = paddle.cast(
                                    final_mask, "float16")
                                sub_attrs = {
                                    'name': 'constant_sub',
                                    'shape': [1],
                                    'dtype': 'float32',
                                    'value': 1,
                                }
                                mul_attrs = {
                                    'name': 'constant_mul',
                                    'shape': [1],
                                    'dtype': 'float32',
                                    'value': 1000,
                                }
                                # Map {0, 1} -> {-1000, 0} additive bias.
                                final_mask = paddle.fluid.layers.elementwise_sub(
                                    final_mask,
                                    paddle.fluid.layers.fill_constant(
                                        **sub_attrs))
                                final_mask = paddle.fluid.layers.elementwise_mul(
                                    final_mask,
                                    paddle.fluid.layers.fill_constant(
                                        **mul_attrs))
                                final_mask = paddle.reshape(
                                    final_mask,
                                    [-1, 1, 1, self.config.seq_len])
                                final_mask = self.custom_ops.detach(
                                    final_mask)
                                self.masks[attn_scope.index] = final_mask
                    qk = paddle.matmul(q, k)
                    qk.block.ops[-1]._set_attr(
                        '__available_memory',
                        self.config.available_mem_proportion)
                    qk_scale = paddle.fluid.layers.fill_constant(
                        **self.qk_scale_attrs)
                    qk = paddle.fluid.layers.elementwise_mul(qk, qk_scale)
                    if self.config.task == "PRETRAINING":
                        qk = paddle.fluid.layers.elementwise_add(
                            qk, final_mask)
                    else:
                        # For the SQUAD task, input_mask is calculated in
                        # data preprocessing.
                        qk = paddle.fluid.layers.elementwise_add(
                            qk, input_mask)
                    qk = paddle.fluid.layers.softmax(qk)
                    if self.config.task == "SQUAD":
                        qk = paddle.fluid.layers.dropout(
                            qk,
                            self.config.attention_probs_dropout_prob,
                            dropout_implementation='upscale_in_train')
                    qkv = paddle.matmul(qk, v)
                    qkv.block.ops[-1]._set_attr(
                        '__available_memory',
                        self.config.available_mem_proportion)
                    qkv = paddle.transpose(qkv, [0, 2, 1, 3])
                    qkv = paddle.reshape(qkv, [-1, self.config.hidden_size])
                # Output projection + dropout + residual + LayerNorm.
                qkv_linear = nn.Linear(self.config.hidden_size,
                                       self.config.hidden_size,
                                       bias_attr=False)
                qkv = qkv_linear(qkv)
                qkv.block.ops[-1]._set_attr(
                    '__available_memory',
                    self.config.available_mem_proportion)
                qkv = paddle.fluid.layers.dropout(
                    qkv,
                    self.config.attention_probs_dropout_prob,
                    dropout_implementation='upscale_in_train')
                attention = paddle.add(layer_input, qkv)
                layer_norm1 = nn.LayerNorm(self.config.hidden_size,
                                           epsilon=0.001)
                attention = layer_norm1(attention)
        # ---- FF ----
        with self.config.ff_scopes[i]:
            with paddle.static.name_scope(f"Layer{i}/FF"):
                ff_linear1 = nn.Linear(self.config.hidden_size,
                                       4 * self.config.hidden_size)
                ff_linear2 = nn.Linear(4 * self.config.hidden_size,
                                       self.config.hidden_size)
                with paddle.static.name_scope(f"1"):
                    ff = ff_linear1(attention)
                    ff.block.ops[-2]._set_attr(
                        '__available_memory',
                        self.config.available_mem_proportion)
                    ff = paddle.fluid.layers.gelu(ff, approximate=True)
                with paddle.static.name_scope(f"2"):
                    ff = ff_linear2(ff)
                    ff.block.ops[-2]._set_attr(
                        '__available_memory',
                        self.config.available_mem_proportion)
                ff = paddle.fluid.layers.dropout(
                    ff,
                    self.config.attention_probs_dropout_prob,
                    dropout_implementation='upscale_in_train')
                ff = paddle.add(attention, ff)
                layer_norm2 = nn.LayerNorm(self.config.hidden_size,
                                           epsilon=0.001)
                sequence_output = layer_norm2(ff)
        if self.should_checkpoint(i):
            # Insert a recomputation checkpoint after this layer.
            with paddle.static.name_scope(f"Layer{i}"):
                logging.info(f'add checkpointoutput for ff_{i}')
                sequence_output = self.custom_ops.checkpointoutput(
                    sequence_output)
    return sequence_output, word_embeddings_weights
def _encoder_preprocessor(self, position_sequence, n_node, global_context,
                          particle_types):
    """Build the input ``pgl.Graph`` (node/edge features) for the encoder.

    Node features are the concatenation of the normalized velocity history,
    clipped distances to the domain boundaries, and (when there is more
    than one particle type) a learned particle-type embedding. Edge
    features are the normalized relative displacement and its norm between
    connected particles.
    """
    # Extract important features from the position_sequence.
    most_recent_position = position_sequence[:, -1]
    velocity_sequence = time_diff(position_sequence)  # Finite-difference.
    # Get connectivity of the graph.
    (senders, receivers,
     n_edge) = connectivity_utils_paddle.compute_connectivity_for_batch_pyfunc(
         most_recent_position, n_node, self._connectivity_radius)
    # Collect node features. The node features consist of:
    # 1. flat_velocity_sequence
    # 2. distance_to_lower_boundary
    # 3. distance_to_upper_boundary
    # 4. particle_type_embeddings
    node_features = []
    # Normalized velocity sequence, merging spatial and time axes.
    velocity_stats = self._normalization_stats["velocity"]
    normalized_velocity_sequence = (
        velocity_sequence - velocity_stats.mean) / velocity_stats.std
    # NOTE(review): the flattened shape (1444, 10) hard-codes the node
    # count and history length for one dataset — confirm before reuse.
    flat_velocity_sequence = paddle.reshape(normalized_velocity_sequence,
                                            (1444, 10))
    node_features.append(flat_velocity_sequence)
    # Normalized clipped distances to lower and upper boundaries.
    # boundaries are an array of shape [num_dimensions, 2], where the second
    # axis provides the lower/upper boundaries.
    boundaries = np.array(self._boundaries)
    # boundaries[:, 0] is the lower bound per dimension, [:, 1] the upper.
    distance_to_lower_boundary = most_recent_position - boundaries[:, 0]
    distance_to_upper_boundary = boundaries[:, 1] - most_recent_position
    distance_to_boundaries = paddle.concat(
        [distance_to_lower_boundary, distance_to_upper_boundary], axis=1)
    # Clip the normalized distances to the [-1, 1] range.
    normalized_clipped_distance_to_boundaries = paddle.clip(
        distance_to_boundaries / self._connectivity_radius, min=-1, max=1)
    node_features.append(normalized_clipped_distance_to_boundaries)
    # Particle type: embedding lookup indexed by particle_types.
    if self._num_particle_types > 1:
        particle_type_embeddings = paddle.gather(
            self._particle_type_embedding, particle_types)
        node_features.append(particle_type_embeddings)
    # Collect edge features:
    # 1. normalized_relative_displacements = (sender - receiver) / radius
    # 2. normalized_relative_distances = norm of the displacements
    edge_features = []
    # Relative displacement and distances normalized to radius.
    normalized_relative_displacements = (
        paddle.gather(most_recent_position, senders) - paddle.gather(
            most_recent_position, receivers)) / self._connectivity_radius
    edge_features.append(normalized_relative_displacements)
    normalized_relative_distances = paddle.norm(
        normalized_relative_displacements, axis=-1, keepdim=True)
    edge_features.append(normalized_relative_distances)
    # Normalize the global context.
    if global_context is not None:
        context_stats = self._normalization_stats["context"]
        # Context in some datasets are all zero, so add an epsilon for
        # numerical stability.
        global_context = (global_context -
                          context_stats.mean) / paddle.maximum(
                              context_stats.std, STD_EPSILON)
    return pgl.Graph(
        edges=n_edge,
        num_nodes=n_node,
        node_feat=paddle.concat(node_features, axis=-1),
        edge_feat=paddle.concat(edge_features, axis=-1), )
def forward(self,
            query_matrix,
            key_matrix,
            value_matrix,
            d_head,
            attn_mask=None,
            rand_mask_idx=None,
            query_mask=None,
            key_mask=None,
            dropout=None):
    '''
    BigBird-style sparse attention combining Global, Random and Window
    (band) attention.

    query_matrix: [B, H, T, D]
    key_matrix: [B, H, T, D]
    value_matrix: [B, H, T, D]
    query_mask: [B, 1, T, 1] bool mask
    key_mask: [B, 1, 1, T] bool mask
    rand_mask_idx: [H, T//bs, bs]
    Global Attention
    Random Attention
    Window Attention

    Returns the attended values, shape [B, H, T, D], with padded query
    positions zeroed by ``query_mask``.
    '''
    B = query_matrix.shape[0]  # batch_size
    H = self.num_heads
    T = query_matrix.shape[2]  # sequence_length
    D = query_matrix.shape[3]  # size per head
    G = self.num_global_blocks
    GB = self.num_global_blocks_back
    GF = self.num_global_blocks_front
    R = self.num_rand_blocks
    W = self.window_size
    bs = self.block_size
    L = T // bs  # blocked length
    # Partition the sequence axis into L blocks of bs tokens each.
    blocked_query_matrix = paddle.reshape(query_matrix, [B, H, L, bs, -1])
    blocked_key_matrix = paddle.reshape(key_matrix, [B, H, L, bs, -1])
    blocked_value_matrix = paddle.reshape(value_matrix, [B, H, L, bs, -1])
    blocked_query_mask = paddle.reshape(query_mask, [B, L, bs])
    blocked_key_mask = paddle.reshape(key_mask, [B, L, bs])
    # 1. global_front_product: front global blocks attend to everything.
    global_front_out = self._get_global_out(
        query_matrix, key_matrix, value_matrix, key_mask, d_head, dropout)
    # 2. global_back_product: back global blocks attend to everything.
    global_back_out = self._get_global_out(query_matrix, key_matrix,
                                           value_matrix, key_mask, d_head,
                                           dropout, False)
    # 3. second_product: the middle (non-global) blocks attend to the
    # band (global+window) keys plus randomly selected keys.
    # create second matrix
    # [B, 1, L-G, bs, (G+W)*bs]
    band_mask = self._get_band_mask(blocked_query_mask, blocked_key_mask, B,
                                    T)
    # [B, H, L-G, bs, R*bs]
    rand_mask = self._get_rand_mask(blocked_query_mask, blocked_key_mask,
                                    rand_mask_idx, B, T)
    # [B, H, L-G, bs, (G+W+R)*bs]
    second_mask = paddle.concat([band_mask, rand_mask], axis=4)
    # [B, H, L-G, R * bs, -1]
    random_keys = self._gather_random_key_value(blocked_key_matrix,
                                                rand_mask_idx, B, T)
    random_values = self._gather_random_key_value(blocked_value_matrix,
                                                  rand_mask_idx, B, T)
    band_keys_matrix = self._get_band_matrix(blocked_key_matrix, B, T)
    band_value_matrix = self._get_band_matrix(blocked_value_matrix, B, T)
    # [B, H, L - G, bs, -1]
    second_query_matrix = blocked_query_matrix[:, :, GF:-GB]
    # [B, H, L - G, (G+W+R)*bs, -1]
    second_key_matrix = paddle.concat(
        [band_keys_matrix, random_keys], axis=3)
    # [B, H, L - G, (G+W+R)*bs, -1]
    second_value_matrix = paddle.concat(
        [band_value_matrix, random_values], axis=3)
    second_top_value_matrix, second_middle_value_matrix, second_bottom_value_matrix = \
        self._get_splited_matrix(second_value_matrix)
    second_product = paddle.matmul(
        second_query_matrix, second_key_matrix, transpose_y=True)
    # Scaled dot-product; masked positions get a large negative bias.
    second_product = second_product * (d_head**-0.5)
    second_product += (1 - second_mask) * -1e6
    second_weights = F.softmax(second_product)
    second_top_weights, second_middle_weights, second_bottom_weights = \
        self._get_splited_matrix(second_weights)
    second_top_out = paddle.matmul(second_top_weights,
                                   second_top_value_matrix)
    second_middle_out = paddle.matmul(
        second_middle_weights[:, :, :, :, GF * bs:-(GB + R) * bs],
        second_middle_value_matrix[:, :, :, GF * bs:-(GB + R) * bs])
    # add global block attention
    second_middle_out += paddle.matmul(
        second_middle_weights[:, :, :, :, :GF * bs],
        blocked_value_matrix[:, :, 0:GF])
    second_middle_out += paddle.matmul(
        second_middle_weights[:, :, :, :, -(GB + R) * bs:-R * bs],
        blocked_value_matrix[:, :, -GB:])
    # add random block attention
    second_middle_out += paddle.matmul(
        second_middle_weights[:, :, :, :, -R * bs:],
        random_values[:, :, GF:-GB])
    second_bottom_out = paddle.matmul(second_bottom_weights,
                                      second_bottom_value_matrix)
    second_out = paddle.concat(
        [second_top_out, second_middle_out, second_bottom_out], axis=2)
    second_out = paddle.reshape(second_out, [B, H, (L - G) * bs, -1])
    # [B, H, T, D]: stitch global-front, middle and global-back outputs.
    out = paddle.concat(
        [global_front_out, second_out, global_back_out], axis=2)
    out = out * query_mask
    return out
def _get_band_matrix(self, blocked_matrix, B, T):
    '''
    Return the global + window (band) key/value matrix:
    [B, H, L-G, (G+W) * bs, -1]

    For each non-global query block, gathers the G global blocks plus the
    W window blocks it may attend to. Blocks near the front/back edges
    wrap their window around, which is handled by the first/last loops.
    '''
    # blocked_matrix: [B, H, L, bs, -1]
    GB = self.num_global_blocks_back
    GF = self.num_global_blocks_front
    G = self.num_global_blocks
    R = self.num_rand_blocks
    W = self.window_size
    bs = self.block_size
    L = T // bs  # blocked length
    H = self.num_heads
    # Get the roll matrix: query blocks whose window would run past the
    # front edge take the wrap-around slice from the back instead.
    blocked_list = []
    for query_block_id in range(GF, GF + W // 2):
        left_block_id = query_block_id - W // 2
        right_block_id = query_block_id + W // 2
        temp_blocked_matrix_list = [
            blocked_matrix[:, :, 0:(right_block_id + 1)],
            blocked_matrix[:, :, -(G + W - right_block_id - 1):]
        ]
        temp_blocked_matrix = paddle.concat(
            temp_blocked_matrix_list, axis=2)
        temp_blocked_matrix = paddle.unsqueeze(temp_blocked_matrix, axis=2)
        blocked_list.append(temp_blocked_matrix)
    # Get the window matrix for the fully interior query blocks: stack W
    # shifted views so each block sees its own window.
    band_length = L - G - W // 2 * 2
    band_matrix_list = []
    for query_block_id in range(GF + W // 2, GF + W // 2 + W):
        left_block_id = query_block_id - W // 2
        right_block_id = query_block_id + W // 2
        band_matrix_list.append(
            paddle.unsqueeze(
                blocked_matrix[:, :, left_block_id:left_block_id +
                               band_length],
                axis=3))
    band_matrix = paddle.concat(band_matrix_list, axis=3)
    # Prepend/append the broadcasted global blocks to every window.
    global_blocked_front_matrix = paddle.unsqueeze(
        blocked_matrix[:, :, :GF], axis=2)
    global_blocked_front_matrix = paddle.expand(
        global_blocked_front_matrix, [B, H, band_length, GF, bs, -1])
    global_blocked_back_matrix = paddle.unsqueeze(
        blocked_matrix[:, :, -GB:], axis=2)
    global_blocked_back_matrix = paddle.expand(
        global_blocked_back_matrix, [B, H, band_length, GB, bs, -1])
    band_matrix = paddle.concat(
        [
            global_blocked_front_matrix, band_matrix,
            global_blocked_back_matrix
        ],
        axis=3)
    blocked_list.append(band_matrix)
    # Query blocks near the back edge wrap their window to the front.
    for query_block_id in range(L - GB - W // 2, L - GB):
        left_block_id = query_block_id - W // 2
        right_block_id = query_block_id + W // 2
        temp_blocked_matrix_list = [
            blocked_matrix[:, :, 0:G + W - (L - left_block_id)],
            blocked_matrix[:, :, left_block_id:]
        ]
        temp_blocked_matrix = paddle.concat(
            temp_blocked_matrix_list, axis=2)
        temp_blocked_matrix = paddle.unsqueeze(temp_blocked_matrix, axis=2)
        blocked_list.append(temp_blocked_matrix)
    band_matrix = paddle.concat(blocked_list, axis=2)
    band_matrix = paddle.reshape(band_matrix,
                                 [B, H, L - G, (G + W) * bs, -1])
    return band_matrix
def _get_band_mask(self, blocked_query_mask, blocked_key_mask, batch_size,
                   sequence_length):
    '''
    Build the attention mask for the band (global + sliding window) part of
    the sparse attention pattern.

    Args:
        blocked_query_mask: query padding mask split into blocks;
            assumed shape [B, L, bs] — TODO confirm with caller.
        blocked_key_mask: key padding mask split into blocks, same layout.
        batch_size: B.
        sequence_length: T (must be divisible by the block size).

    Returns:
        Mask tensor expanded over heads; final shape [B, H, L-G, bs, (G+W)*bs]
        (the docstring's "[B, 1, L-G, bs, G+W]" describes the pre-expansion
        logical layout).
    '''
    GB = self.num_global_blocks_back
    GF = self.num_global_blocks_front
    G = self.num_global_blocks
    R = self.num_rand_blocks  # NOTE: unused here; random-block mask built elsewhere
    W = self.window_size
    bs = self.block_size
    T = sequence_length
    L = T // bs  # blocked length
    B = batch_size
    H = self.num_heads

    # G+W+R
    # query_mask: [B, L, bs]
    # key_mask: [B, L, bs]
    # Outer product of query-block mask and global key mask:
    # [B, L-G, bs, 1] * [B, L-G, 1, G*bs] -> [B, L-G, bs, G*bs]
    temp_query_mask = paddle.reshape(blocked_query_mask[:, GF:-GB],
                                     [B, L - G, bs, 1])
    temp_key_mask_front = paddle.reshape(blocked_key_mask[:, :GF],
                                         [B, 1, 1, GF * bs])
    global_block_mask_front = paddle.matmul(temp_query_mask,
                                            temp_key_mask_front)
    temp_key_mask_back = paddle.reshape(blocked_key_mask[:, -GB:],
                                        [B, 1, 1, GB * bs])
    global_block_mask_back = paddle.matmul(temp_query_mask,
                                           temp_key_mask_back)

    # create window block mask
    # Front query blocks whose window is clipped by the global region:
    # zero-pad the missing left part of the window.
    key_mask_list = []
    for query_block_id in range(GF, GF + W // 2):
        left_block_id = query_block_id - W // 2
        right_block_id = query_block_id + W // 2
        zero_key_mask = paddle.zeros_like(blocked_key_mask[:, -(W - (
            right_block_id + 1 - G)):-GB])
        temp_key_mask = paddle.concat(
            [blocked_key_mask[:, GF:(right_block_id + 1)], zero_key_mask],
            axis=1)
        temp_key_mask = paddle.unsqueeze(temp_key_mask, 1)
        key_mask_list.append(temp_key_mask)
    roll_key_mask1 = paddle.concat(key_mask_list, axis=1)
    # 0 in a reshape target keeps the corresponding input dimension.
    roll_key_mask1 = paddle.reshape(roll_key_mask1, [0, 0, W * bs])

    # Middle query blocks: W shifted views of the key mask form the window.
    key_mask_list = []
    band_length = L - G - W // 2 * 2
    for query_block_id in range(GF + W // 2, GF + W // 2 + W):
        left_block_id = query_block_id - W // 2
        right_block_id = query_block_id + W // 2
        key_mask_list.append(blocked_key_mask[:, left_block_id:left_block_id +
                                              band_length])
    window_key_mask = paddle.concat(key_mask_list, axis=2)
    window_key_mask = paddle.reshape(window_key_mask, [0, 0, W * bs])

    # Back query blocks whose window is clipped by the tail global region:
    # zero-pad the missing right part of the window.
    key_mask_list = []
    for query_block_id in range((L - GB) - W // 2, L - GB):
        left_block_id = query_block_id - W // 2
        right_block_id = query_block_id + W // 2
        zero_key_mask = paddle.zeros_like(blocked_key_mask[:, GF:GF + W - (
            L - left_block_id - GB)])
        temp_key_mask = paddle.concat(
            [zero_key_mask, blocked_key_mask[:, left_block_id:-GB]], axis=1)
        temp_key_mask = paddle.unsqueeze(temp_key_mask, 1)
        key_mask_list.append(temp_key_mask)
    roll_key_mask2 = paddle.concat(key_mask_list, axis=1)
    roll_key_mask2 = paddle.reshape(roll_key_mask2, [0, 0, W * bs])

    # Assemble per-query-block window key masks and combine with the query
    # mask by outer product:
    window_key_mask = paddle.concat(
        [roll_key_mask1, window_key_mask, roll_key_mask2], axis=1)
    window_key_mask = paddle.unsqueeze(window_key_mask, axis=2)
    # [B, L-G, bs, 1] * [B, L-G, 1, W*bs] -> [B, L-G, bs, W*bs]
    window_block_mask = paddle.matmul(temp_query_mask, window_key_mask)
    band_mask = paddle.concat(
        [
            global_block_mask_front, window_block_mask,
            global_block_mask_back
        ],
        axis=3)
    band_mask = paddle.unsqueeze(band_mask, 1)  # for head
    band_mask = paddle.expand(band_mask, [B, H, L - G, bs, -1])
    return band_mask
def forward(self, inputs, targets=None):
    """Attention head for table structure recognition.

    Runs an attention cell step-by-step over the flattened backbone feature
    map. In training mode the ground-truth structure tokens drive the steps
    (teacher forcing); otherwise the head decodes greedily, feeding back the
    argmax of each step.

    Args:
        inputs: list of backbone feature maps; only the last one is used.
        targets: in training, targets[0] holds the structure token ids;
            None (or eval mode) switches to greedy decoding.

    Returns:
        dict with 'structure_probs' (softmaxed only in the decode branch)
        and 'loc_preds' (sigmoid box/location predictions).
    """
    # if and else branch are both needed when you want to assign a variable
    # if you modify the var in just one branch, then the modification will not work.
    fea = inputs[-1]
    if len(fea.shape) == 3:
        # Already (N, T, C); nothing to do.
        pass
    else:
        # Flatten spatial dims: (N, C, H, W) -> (N, C, H*W) -> (N, H*W, C).
        last_shape = int(np.prod(fea.shape[2:]))  # gry added
        fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], last_shape])
        fea = fea.transpose([0, 2, 1])  # (NTC)(batch, width, channels)
    batch_size = fea.shape[0]
    hidden = paddle.zeros((batch_size, self.hidden_size))
    output_hiddens = []
    if self.training and targets is not None:
        # Teacher forcing: feed ground-truth structure tokens at each step.
        structure = targets[0]
        for i in range(self.max_elem_length + 1):
            elem_onehots = self._char_to_onehot(
                structure[:, i], onehot_dim=self.elem_num)
            (outputs, hidden), alpha = self.structure_attention_cell(
                hidden, fea, elem_onehots)
            output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
        output = paddle.concat(output_hiddens, axis=1)
        # Raw logits here; loss presumably applies softmax — TODO confirm.
        structure_probs = self.structure_generator(output)
        if self.loc_type == 1:
            loc_preds = self.loc_generator(output)
            loc_preds = F.sigmoid(loc_preds)
        else:
            # Mix the visual features back in before predicting locations.
            loc_fea = fea.transpose([0, 2, 1])
            loc_fea = self.loc_fea_trans(loc_fea)
            loc_fea = loc_fea.transpose([0, 2, 1])
            loc_concat = paddle.concat([output, loc_fea], axis=2)
            loc_preds = self.loc_generator(loc_concat)
            loc_preds = F.sigmoid(loc_preds)
    else:
        # Greedy decoding: start from token 0 and feed back the argmax.
        temp_elem = paddle.zeros(shape=[batch_size], dtype="int32")
        structure_probs = None
        loc_preds = None
        elem_onehots = None
        outputs = None
        alpha = None
        max_elem_length = paddle.to_tensor(self.max_elem_length)
        i = 0
        while i < max_elem_length + 1:
            elem_onehots = self._char_to_onehot(
                temp_elem, onehot_dim=self.elem_num)
            (outputs, hidden), alpha = self.structure_attention_cell(
                hidden, fea, elem_onehots)
            output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
            structure_probs_step = self.structure_generator(outputs)
            temp_elem = structure_probs_step.argmax(axis=1, dtype="int32")
            i += 1
        output = paddle.concat(output_hiddens, axis=1)
        structure_probs = self.structure_generator(output)
        # Unlike the training branch, inference returns probabilities.
        structure_probs = F.softmax(structure_probs)
        if self.loc_type == 1:
            loc_preds = self.loc_generator(output)
            loc_preds = F.sigmoid(loc_preds)
        else:
            loc_fea = fea.transpose([0, 2, 1])
            loc_fea = self.loc_fea_trans(loc_fea)
            loc_fea = loc_fea.transpose([0, 2, 1])
            loc_concat = paddle.concat([output, loc_fea], axis=2)
            loc_preds = self.loc_generator(loc_concat)
            loc_preds = F.sigmoid(loc_preds)
    return {'structure_probs': structure_probs, 'loc_preds': loc_preds}
def get_loss(self, head_outputs, targets):
    """Calculate classification and regression loss for a batch of images.

    Anchors are assigned to ground-truth boxes per image (no sampling); all
    assigned positive and negative samples are gathered across the batch and
    the losses are computed on the gathered tensors.

    Args:
        head_outputs: (cls_logits_list, bboxes_reg_list) — per-FPN-level
            prediction maps in NCHW layout.
        targets: dict with per-image 'gt_bbox' and 'gt_class' lists.

    Returns:
        dict with 'loss_cls', 'loss_reg' and their sum 'loss'.
    """
    cls_logits_list, bboxes_reg_list = head_outputs
    anchors = self.anchor_generator(cls_logits_list)
    anchors = paddle.concat(anchors)

    # matches: contain gt_inds
    # match_labels: -1(ignore), 0(neg) or 1(pos)
    matches_list, match_labels_list = [], []
    # assign anchors to gts, no sampling is involved
    for gt_bbox in targets['gt_bbox']:
        matches, match_labels = self.bbox_assigner(anchors, gt_bbox)
        matches_list.append(matches)
        match_labels_list.append(match_labels)

    # reshape network outputs: NCHW -> N(HW)C per level, then concat levels.
    cls_logits = [
        _.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes])
        for _ in cls_logits_list
    ]
    bboxes_reg = [
        _.transpose([0, 2, 3, 1]).reshape([0, -1, 4])
        for _ in bboxes_reg_list
    ]
    cls_logits = paddle.concat(cls_logits, axis=1)
    bboxes_reg = paddle.concat(bboxes_reg, axis=1)

    cls_pred_list, cls_tar_list = [], []
    reg_pred_list, reg_tar_list = [], []
    # find and gather preds and targets in each image
    for matches, match_labels, cls_logit, bbox_reg, gt_bbox, gt_class in \
        zip(matches_list, match_labels_list, cls_logits, bboxes_reg,
            targets['gt_bbox'], targets['gt_class']):
        pos_mask = (match_labels == 1)
        neg_mask = (match_labels == 0)
        chosen_mask = paddle.logical_or(pos_mask, neg_mask)
        gt_class = gt_class.reshape([-1])
        bg_class = paddle.to_tensor(
            [self.num_classes], dtype=gt_class.dtype)
        # a trick to assign num_classes to negative targets: append the
        # background class and point all negatives at that last index.
        gt_class = paddle.concat([gt_class, bg_class], axis=-1)
        matches = paddle.where(neg_mask,
                               paddle.full_like(matches, gt_class.size - 1),
                               matches)
        cls_pred = cls_logit[chosen_mask]
        cls_tar = gt_class[matches[chosen_mask]]
        # Regression is trained on positives only, encoded as deltas.
        reg_pred = bbox_reg[pos_mask].reshape([-1, 4])
        reg_tar = gt_bbox[matches[pos_mask]].reshape([-1, 4])
        reg_tar = bbox2delta(anchors[pos_mask], reg_tar, self.weights)
        cls_pred_list.append(cls_pred)
        cls_tar_list.append(cls_tar)
        reg_pred_list.append(reg_pred)
        reg_tar_list.append(reg_tar)
    cls_pred = paddle.concat(cls_pred_list)
    cls_tar = paddle.concat(cls_tar_list)
    reg_pred = paddle.concat(reg_pred_list)
    reg_tar = paddle.concat(reg_tar_list)

    # Normalize both losses by the positive count (at least 1).
    avg_factor = max(1.0, reg_pred.shape[0])
    cls_loss = self.loss_class(
        cls_pred, cls_tar, reduction='sum') / avg_factor

    if reg_pred.shape[0] == 0:
        # No positives: emit a zero loss that still participates in backward.
        reg_loss = paddle.zeros([1])
        reg_loss.stop_gradient = False
    else:
        reg_loss = self.loss_bbox(
            reg_pred, reg_tar, reduction='sum') / avg_factor

    loss = cls_loss + reg_loss
    out_dict = {
        'loss_cls': cls_loss,
        'loss_reg': reg_loss,
        'loss': loss,
    }
    return out_dict
def forward(self, inputs):
    """Ghost-module forward: run the primary convolution, derive cheap
    features from its output, and return both stacked along channels."""
    primary_feat = self.primary_conv(inputs)
    cheap_feat = self.cheap_operation(primary_feat)
    return paddle.concat([primary_feat, cheap_feat], axis=1)
def forward(self, x: paddle.Tensor) -> paddle.Tensor:
    """Feed the stem output through two parallel branches and fuse their
    channel-wise concatenation."""
    stem = self.conv(x)
    branch_outs = [self.left(stem), self.right(stem)]
    return self.fuse(paddle.concat(branch_outs, axis=1))
def concat_unsqueeze1(inputs):
    """Select the first two entries along dim 1, restore a singleton axis on
    each, and join them back along axis 1."""
    first = inputs[:, 0].unsqueeze(1)
    second = inputs[:, 1].unsqueeze(1)
    return paddle.concat([first, second], axis=1)
def ops(self, inputs):
    """ operation: concatenate the input tensors along the configured axis """
    return paddle.concat(x=inputs, axis=self.axis)
def get_loss(self, head_outs, gt_meta):
    """Compute VFL classification, GIoU-style box, and DFL losses for a
    GFL-family detection head.

    Args:
        head_outs: (cls_scores, bbox_preds) — per-FPN-level prediction maps
            in NCHW layout.
        gt_meta: dict with 'im_id', per-image 'gt_bbox' and 'gt_class'.

    Returns:
        dict with 'loss_vfl', 'loss_bbox', 'loss_dfl'.
    """
    cls_scores, bbox_preds = head_outs
    num_level_anchors = [
        featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores
    ]
    num_imgs = gt_meta['im_id'].shape[0]
    featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]]
                     for featmap in cls_scores]

    # Decode per-level box predictions back into image coordinates.
    decode_bbox_preds = []
    center_and_strides = []
    for featmap_size, stride, bbox_pred in zip(featmap_sizes,
                                               self.fpn_stride, bbox_preds):
        # center in origin image
        yy, xx = self.get_single_level_center_point(featmap_size, stride,
                                                    self.cell_offset)
        strides = paddle.full((len(xx), ), stride)
        center_and_stride = paddle.stack([xx, yy, strides, strides],
                                         -1).tile([num_imgs, 1, 1])
        center_and_strides.append(center_and_stride)
        # Drop the two stride columns; divide to get feature-map coords.
        center_in_feature = center_and_stride.reshape(
            [-1, 4])[:, :-2] / stride
        bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(
            [num_imgs, -1, 4 * (self.reg_max + 1)])
        # Project the discrete distance distribution to expected distances.
        pred_distances = self.distribution_project(bbox_pred)
        decode_bbox_pred_wo_stride = distance2bbox(
            center_in_feature, pred_distances).reshape([num_imgs, -1, 4])
        decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride)

    # Flatten all levels per image for target assignment.
    flatten_cls_preds = [
        cls_pred.transpose([0, 2, 3, 1]).reshape(
            [num_imgs, -1, self.cls_out_channels])
        for cls_pred in cls_scores
    ]
    flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1)
    flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1)
    flatten_center_and_strides = paddle.concat(center_and_strides, axis=1)

    # Assign targets image-by-image (detached: assignment is not trained).
    gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class']
    pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], []
    for flatten_cls_pred, flatten_center_and_stride, flatten_bbox,gt_box,gt_label \
        in zip(flatten_cls_preds.detach(), flatten_center_and_strides.detach(), \
               flatten_bboxes.detach(),gt_boxes,gt_labels):
        pos_num, label, label_weight, bbox_target = self._get_target_single(
            flatten_cls_pred, flatten_center_and_stride, flatten_bbox,
            gt_box, gt_label)
        pos_num_l.append(pos_num)
        label_l.append(label)
        label_weight_l.append(label_weight)
        bbox_target_l.append(bbox_target)

    # Targets come back as numpy — presumably from _get_target_single; verify.
    labels = paddle.to_tensor(np.stack(label_l, axis=0))
    label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0))
    bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0))

    # Re-split flattened targets back into per-FPN-level tensors.
    center_and_strides_list = self._images_to_levels(
        flatten_center_and_strides, num_level_anchors)
    labels_list = self._images_to_levels(labels, num_level_anchors)
    label_weights_list = self._images_to_levels(label_weights,
                                                num_level_anchors)
    bbox_targets_list = self._images_to_levels(bbox_targets,
                                               num_level_anchors)
    num_total_pos = sum(pos_num_l)
    # NOTE(review): bare `except:` silently falls back when not running
    # distributed (all_reduce raises); consider catching a narrower exception.
    try:
        num_total_pos = paddle.distributed.all_reduce(num_total_pos.clone(
        )) / paddle.distributed.get_world_size()
    except:
        num_total_pos = max(num_total_pos, 1)

    # Per-level loss computation.
    loss_bbox_list, loss_dfl_list, loss_vfl_list, avg_factor = [], [], [], []
    for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip(
            cls_scores, bbox_preds, center_and_strides_list, labels_list,
            label_weights_list, bbox_targets_list, self.fpn_stride):
        center_and_strides = center_and_strides.reshape([-1, 4])
        cls_score = cls_score.transpose([0, 2, 3, 1]).reshape(
            [-1, self.cls_out_channels])
        bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(
            [-1, 4 * (self.reg_max + 1)])
        bbox_targets = bbox_targets.reshape([-1, 4])
        labels = labels.reshape([-1])

        # Positives are anchors labelled with a real (non-background) class.
        bg_class_ind = self.num_classes
        pos_inds = paddle.nonzero(
            paddle.logical_and((labels >= 0), (labels < bg_class_ind)),
            as_tuple=False).squeeze(1)
        # vfl: quality target built in numpy, IoU written at positive slots.
        vfl_score = np.zeros(cls_score.shape)

        if len(pos_inds) > 0:
            pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0)
            pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0)
            pos_centers = paddle.gather(
                center_and_strides[:, :-2], pos_inds, axis=0) / stride

            # Weight each positive by its predicted max class score.
            weight_targets = F.sigmoid(cls_score.detach())
            weight_targets = paddle.gather(
                weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)
            pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)
            pos_decode_bbox_pred = distance2bbox(pos_centers,
                                                 pos_bbox_pred_corners)
            pos_decode_bbox_targets = pos_bbox_targets / stride
            bbox_iou = bbox_overlaps(
                pos_decode_bbox_pred.detach().numpy(),
                pos_decode_bbox_targets.detach().numpy(),
                is_aligned=True)

            # vfl: IoU-aware classification target.
            pos_labels = paddle.gather(labels, pos_inds, axis=0)
            vfl_score[pos_inds.numpy(), pos_labels] = bbox_iou

            pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1])
            target_corners = bbox2distance(pos_centers,
                                           pos_decode_bbox_targets,
                                           self.reg_max).reshape([-1])
            # regression loss
            loss_bbox = paddle.sum(
                self.loss_bbox(pos_decode_bbox_pred,
                               pos_decode_bbox_targets) * weight_targets)

            # dfl loss
            loss_dfl = self.loss_dfl(
                pred_corners,
                target_corners,
                weight=weight_targets.expand([-1, 4]).reshape([-1]),
                avg_factor=4.0)
        else:
            # No positives at this level: zero losses that keep the graph.
            loss_bbox = bbox_pred.sum() * 0
            loss_dfl = bbox_pred.sum() * 0
            weight_targets = paddle.to_tensor([0], dtype='float32')

        # vfl loss
        num_pos_avg_per_gpu = num_total_pos
        vfl_score = paddle.to_tensor(vfl_score)
        loss_vfl = self.loss_vfl(
            cls_score, vfl_score, avg_factor=num_pos_avg_per_gpu)

        loss_bbox_list.append(loss_bbox)
        loss_dfl_list.append(loss_dfl)
        loss_vfl_list.append(loss_vfl)
        avg_factor.append(weight_targets.sum())

    # Normalize bbox/dfl losses by the summed (all-reduced) weights.
    avg_factor = sum(avg_factor)
    # NOTE(review): bare `except:` again — same non-distributed fallback.
    try:
        avg_factor = paddle.distributed.all_reduce(avg_factor.clone())
        avg_factor = paddle.clip(
            avg_factor / paddle.distributed.get_world_size(), min=1)
    except:
        avg_factor = max(avg_factor.item(), 1)

    if avg_factor <= 0:
        loss_vfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
        loss_bbox = paddle.to_tensor(
            0, dtype='float32', stop_gradient=False)
        loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
    else:
        losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list))
        losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list))
        loss_vfl = sum(loss_vfl_list)
        loss_bbox = sum(losses_bbox)
        loss_dfl = sum(losses_dfl)

    loss_states = dict(
        loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)

    return loss_states
def _concat(x, out):
    """Join two feature maps along the channel (second) axis."""
    joined = paddle.concat((x, out), 1)
    return joined
def beam_search(self, input_ids, beam_scorer, logits_processors, max_length,
                pad_token_id, eos_token_id, **model_kwargs):
    """Generate sequences with beam search.

    Args:
        input_ids: prompt token ids, shape [batch_size * num_beams, cur_len].
        beam_scorer: scorer object holding beam hypotheses and selection logic.
        logits_processors: callable applying constraints to next-token logits.
        max_length: hard cap on total sequence length.
        pad_token_id / eos_token_id: special token ids forwarded to the scorer.
        **model_kwargs: extra model inputs, including an optional "cache".

    Returns:
        (pred_ids, scores): generated continuations (prompt stripped) and
        their final beam scores.
    """
    batch_size = len(beam_scorer._beam_hyps)
    num_beams = beam_scorer.num_beams

    batch_beam_size, cur_len = input_ids.shape
    origin_len = cur_len

    assert (
        num_beams * batch_size == batch_beam_size
    ), "Batch dimension of `input_ids` should be {}, but received {}.".format(
        num_beams * batch_size, batch_beam_size)

    # Start with only beam 0 live per batch item so identical beams don't
    # all survive the first top-k.
    beam_scores = paddle.zeros(
        (batch_size, num_beams), dtype=paddle.get_default_dtype())
    beam_scores[:, 1:] = -1e9
    beam_scores = paddle.reshape(beam_scores, [-1])

    while cur_len < max_length:
        # prepare model inputs & get model output
        model_inputs = self.prepare_inputs_for_generation(input_ids,
                                                          **model_kwargs)
        outputs = self(**model_inputs)
        logits = outputs[0] if isinstance(outputs, tuple) else outputs
        # [batch_size, vocab_size]
        logits = logits[:, -1, :]

        # pre-process distribution
        logits = self.adjust_logits_during_generation(logits)
        logits = logits_processors(input_ids, logits)

        # beam search
        # [batch_size * num_beams, vocab_size]
        # log-probabilities, accumulated with running beam scores.
        next_scores = F.softmax(logits)
        next_scores = paddle.log(next_scores)
        next_scores = next_scores + beam_scores.unsqueeze(-1)

        # reshape for beam search
        vocab_size = next_scores.shape[-1]
        next_scores = next_scores.reshape(
            [batch_size, num_beams * vocab_size])

        # Keep 2*num_beams candidates so finished (EOS) beams can be set
        # aside while still filling num_beams live beams.
        next_scores, next_tokens = paddle.topk(
            next_scores, 2 * num_beams, axis=1)

        # Decompose flat indices into (beam index, token id).
        next_indices = next_tokens // vocab_size
        next_tokens = next_tokens % vocab_size

        # stateless
        beam_outputs = beam_scorer.process(
            input_ids,
            next_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id, )
        beam_scores = beam_outputs["next_beam_scores"]
        beam_next_tokens = beam_outputs["next_beam_tokens"]
        beam_idx = beam_outputs["next_beam_indices"]

        cur_len += 1
        # Reorder rows to the surviving beams, then append the new tokens.
        input_ids = paddle.concat(
            [
                paddle.index_select(input_ids, beam_idx),
                beam_next_tokens.unsqueeze(-1)
            ],
            axis=-1)

        if beam_scorer.is_done:
            break
        model_kwargs = self.update_model_kwargs_for_generation(outputs,
                                                               model_kwargs)
        if model_kwargs["cache"] is not None:
            # reorder the cache to match the surviving beams
            model_kwargs["cache"] = map_structure(
                lambda x: paddle.index_select(x, beam_idx),
                model_kwargs["cache"])

    pred_ids, scores = beam_scorer.finalize(
        input_ids,
        beam_scores,
        next_tokens,
        next_indices,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id)
    # Strip the prompt; return only the generated continuation.
    return pred_ids[:, origin_len:], scores
def get_loss(self, scores, deltas, targets, rois, bbox_weight):
    """Compute classification and box-regression losses for a two-stage
    bbox head.

    Args:
        scores (Tensor): classification scores from bbox head outputs.
        deltas (Tensor): box deltas from bbox head outputs; 4 columns when
            class-agnostic, 4*num_classes otherwise.
        targets (list[List[Tensor]]): bbox targets containing tgt_labels,
            tgt_bboxes and tgt_gt_inds.
        rois (List[Tensor]): RoIs generated in each batch.
        bbox_weight: delta-encoding weights passed to bbox2delta.

    Returns:
        dict with 'loss_bbox_cls' and 'loss_bbox_reg'.
    """
    cls_name = 'loss_bbox_cls'
    reg_name = 'loss_bbox_reg'
    loss_bbox = {}

    # TODO: better pass args
    tgt_labels, tgt_bboxes, tgt_gt_inds = targets

    # bbox cls
    tgt_labels = paddle.concat(
        tgt_labels) if len(tgt_labels) > 1 else tgt_labels[0]
    valid_inds = paddle.nonzero(tgt_labels >= 0).flatten()
    if valid_inds.shape[0] == 0:
        # No valid samples: zero classification loss.
        loss_bbox[cls_name] = paddle.zeros([1], dtype='float32')
    else:
        tgt_labels = tgt_labels.cast('int64')
        tgt_labels.stop_gradient = True
        loss_bbox_cls = F.cross_entropy(
            input=scores, label=tgt_labels, reduction='mean')
        loss_bbox[cls_name] = loss_bbox_cls

    # bbox reg
    # 4 delta columns means one box per RoI regardless of class.
    cls_agnostic_bbox_reg = deltas.shape[1] == 4

    # Foreground = labelled with a real (non-background) class.
    fg_inds = paddle.nonzero(
        paddle.logical_and(tgt_labels >= 0,
                           tgt_labels < self.num_classes)).flatten()

    if fg_inds.numel() == 0:
        loss_bbox[reg_name] = paddle.zeros([1], dtype='float32')
        return loss_bbox

    if cls_agnostic_bbox_reg:
        reg_delta = paddle.gather(deltas, fg_inds)
    else:
        # Per-class regression: pick, for each foreground RoI, the 4 delta
        # columns belonging to its ground-truth class via (row, col) pairs.
        fg_gt_classes = paddle.gather(tgt_labels, fg_inds)

        reg_row_inds = paddle.arange(fg_gt_classes.shape[0]).unsqueeze(1)
        reg_row_inds = paddle.tile(reg_row_inds, [1, 4]).reshape([-1, 1])

        reg_col_inds = 4 * fg_gt_classes.unsqueeze(1) + paddle.arange(4)

        reg_col_inds = reg_col_inds.reshape([-1, 1])
        reg_inds = paddle.concat([reg_row_inds, reg_col_inds], axis=1)

        reg_delta = paddle.gather(deltas, fg_inds)
        reg_delta = paddle.gather_nd(reg_delta, reg_inds).reshape([-1, 4])

    rois = paddle.concat(rois) if len(rois) > 1 else rois[0]
    tgt_bboxes = paddle.concat(
        tgt_bboxes) if len(tgt_bboxes) > 1 else tgt_bboxes[0]

    # Encode regression targets as deltas relative to the RoIs.
    reg_target = bbox2delta(rois, tgt_bboxes, bbox_weight)
    reg_target = paddle.gather(reg_target, fg_inds)
    reg_target.stop_gradient = True

    if self.bbox_loss is not None:
        # Optional transform (e.g. delta -> box) before the custom loss;
        # normalized over ALL samples, then scaled by num_classes.
        reg_delta = self.bbox_transform(reg_delta)
        reg_target = self.bbox_transform(reg_target)
        loss_bbox_reg = self.bbox_loss(
            reg_delta, reg_target).sum() / tgt_labels.shape[0]
        loss_bbox_reg *= self.num_classes
    else:
        # Default: smooth-free L1 over foreground deltas.
        loss_bbox_reg = paddle.abs(
            reg_delta - reg_target).sum() / tgt_labels.shape[0]

    loss_bbox[reg_name] = loss_bbox_reg

    return loss_bbox
def forward(self, inputs):
    """Modeling forward stage of the encoder.

    Encodes the input sequence with a pretrained LM, gathers hidden states
    for question/table/column/value positions, runs per-example relational
    updates, and packs the results into EncoderState objects.

    Args:
        inputs: dict with token ids ('src_ids', 'sent_ids'), gather indexes
            for each item type, and 'orig_inputs' (per-example metadata with
            .columns/.tables/.values/.relations).

    Returns:
        list[EncoderState], one per example in the batch.
    """
    seq_hidden, cls_hidden = self.base_encoder(inputs['src_ids'],
                                               inputs['sent_ids'])
    # Non-ERNIE/BERT encoders return the two outputs in the other order.
    if self.pretrain_model_type != 'ERNIE' and self.pretrain_model_type != 'BERT':
        cls_hidden, seq_hidden = seq_hidden, cls_hidden

    question_tokens_index = inputs["question_tokens_index"]
    table_indexes = inputs["table_indexes"]
    column_indexes = inputs["column_indexes"]
    value_indexes = inputs["value_indexes"]
    # Pull out the hidden states at each item's token positions.
    question_encs = nn_utils.batch_gather_2d(seq_hidden,
                                             question_tokens_index)
    table_encs = nn_utils.batch_gather_2d(seq_hidden, table_indexes)
    column_encs = nn_utils.batch_gather_2d(seq_hidden, column_indexes)
    value_encs = nn_utils.batch_gather_2d(seq_hidden, value_indexes)
    if self.enc_value_with_col:
        # Each value is represented by two gathered vectors — sum the pair.
        value_num = value_encs.shape[1] // 2
        value_encs = value_encs.reshape(
            [value_encs.shape[0], value_num, 2, -1]).sum(axis=2)

    orig_inputs = inputs['orig_inputs']
    # Identity pointer maps: each schema item points to itself.
    column_pointer_maps = [{
        i: [i]
        for i in range(len(orig_input.columns))
    } for orig_input in orig_inputs]
    table_pointer_maps = [{i: [i]
                           for i in range(len(orig_input.tables))}
                          for orig_input in orig_inputs]
    value_pointer_maps = [{i: [i]
                           for i in range(len(orig_input.values))}
                          for orig_input in orig_inputs]

    enc_results = []
    # calculate relation encoding one-by-one
    for batch_idx, orig_input in enumerate(orig_inputs):
        # Question length inferred from the first column position;
        # presumably accounts for two special tokens — verify against the
        # input construction.
        q_len = orig_input.column_indexes[0] - 2
        col_size = len(orig_input.columns)
        tab_size = len(orig_input.tables)
        val_size = len(orig_input.values)

        # Trim padding from the gathered encodings.
        q_enc = question_encs[batch_idx][:q_len]
        tab_enc = table_encs[batch_idx][:tab_size]
        col_enc = column_encs[batch_idx][:col_size]
        val_enc = value_encs[batch_idx][:val_size]

        c_boundary = list(range(col_size + 1))
        t_boundary = list(range(tab_size + 1))

        # Values participate in the relational update only when the
        # relation set includes them.
        v_e_input = val_enc.unsqueeze(0) if self.rel_has_value else None
        (q_enc_new, c_enc_new, t_enc_new,
         v_enc_new), align_mat = self.encs_update.forward_unbatched(
             q_enc.unsqueeze(0),
             col_enc.unsqueeze(0),
             tab_enc.unsqueeze(0), c_boundary, t_boundary,
             orig_input.relations, v_e_input)

        # Assemble the decoder memory from the configured item types.
        memory = []
        if 'question' in self.include_in_memory:
            memory.append(q_enc_new)
        if 'table' in self.include_in_memory:
            memory.append(t_enc_new)
        if 'column' in self.include_in_memory:
            memory.append(c_enc_new)
        if 'value' in self.include_in_memory and self.rel_has_value:
            memory.append(v_enc_new)
        memory = paddle.concat(memory, axis=1)
        if not self.rel_has_value:
            # Values were not relation-updated: use raw encodings and
            # compute the memory-to-value alignment separately.
            v_enc_new = val_enc.unsqueeze(0)
            m2v_align_mat = self.value_align(memory, v_enc_new,
                                             relations=None)
            align_mat[2] = m2v_align_mat

        schema_memory = (c_enc_new, t_enc_new)
        if self.rel_has_value:
            schema_memory += (v_enc_new, )

        enc_results.append(
            EncoderState(
                state=None,
                cls_hidden=cls_hidden[batch_idx],
                memory=memory,
                question_memory=q_enc_new,
                schema_memory=paddle.concat(schema_memory, axis=1),
                words=orig_input.question_tokens,
                pointer_memories={
                    'table': t_enc_new,
                    'column': c_enc_new,
                    'value': v_enc_new,
                },
                pointer_maps={
                    'column': column_pointer_maps[batch_idx],
                    'table': table_pointer_maps[batch_idx],
                    'value': value_pointer_maps[batch_idx],
                },
                m2c_align_mat=align_mat[0],
                m2t_align_mat=align_mat[1],
                m2v_align_mat=align_mat[2], ))

    return enc_results
def forward(self, x):
    """Apply batch norm to the leading channels and instance norm to the
    remainder, then rejoin the two halves along the channel axis."""
    split_at = self.bnorm_channels
    batch_normed = self.bnorm(x[:, :split_at, :, :])
    instance_normed = self.inorm(x[:, split_at:, :, :])
    return paddle.concat((batch_normed, instance_normed), 1)
def predictEnsembleThree(model,
                         model_1,
                         model_crop,
                         model_path,
                         model_path_1,
                         model_path_crop,
                         transforms,
                         transforms_crop,
                         image_list,
                         image_dir=None,
                         save_dir='output',
                         aug_pred=False,
                         scales=1.0,
                         flip_horizontal=True,
                         flip_vertical=False,
                         is_slide=False,
                         stride=None,
                         crop_size=None):
    """
    predict and visualize the image_list.

    Ensembles a full-image two-model prediction with a three-crop prediction
    over the central band of the image, then saves blended and pseudo-color
    visualizations.

    Args:
        model (nn.Layer): Used to predict for input image.
        model_1 (nn.Layer): Second full-image model for the ensemble.
        model_crop (nn.Layer): Model loaded for the crop branch (NOTE(review):
            the crop inference below calls `model`, not `model_crop` — confirm
            whether that is intended).
        model_path (str): The path of pretrained model.
        model_path_1 (str): The path of the second pretrained model.
        model_path_crop (str): The path of the pretrained crop model.
        transforms (transform.Compose): Preprocess for input image.
        transforms_crop (transform.Compose): Preprocess for the crop branch.
        image_list (list): A list of image path to be predicted.
        image_dir (str, optional): The root directory of the images predicted. Default: None.
        save_dir (str, optional): The directory to save the visualized results. Default: 'output'.
        aug_pred (bool, optional): Whether to use mulit-scales and flip augment for predition. Default: False.
        scales (list|float, optional): Scales for augment. It is valid when `aug_pred` is True. Default: 1.0.
        flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_pred` is True. Default: True.
        flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_pred` is True. Default: False.
        is_slide (bool, optional): Whether to predict by sliding window. Default: False.
        stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height.
            It should be provided when `is_slide` is True.
        crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height.
            It should be provided when `is_slide` is True.
    """
    utils.utils.load_entire_model(model, model_path)
    model.eval()
    utils.utils.load_entire_model(model_1, model_path_1)
    model_1.eval()
    utils.utils.load_entire_model(model_crop, model_path_crop)
    model_crop.eval()
    nranks = paddle.distributed.get_world_size()
    local_rank = paddle.distributed.get_rank()
    if nranks > 1:
        # Shard the image list so each rank predicts a disjoint subset.
        img_lists = partition_list(image_list, nranks)
    else:
        img_lists = [image_list]

    added_saved_dir = os.path.join(save_dir, 'added_prediction')
    pred_saved_dir = os.path.join(save_dir, 'pseudo_color_prediction')

    logger.info("Start to predict...")
    progbar_pred = progbar.Progbar(target=len(img_lists[0]), verbose=1)
    with paddle.no_grad():
        for i, im_path in enumerate(img_lists[local_rank]):
            im_origin = cv2.imread(im_path)
            ori_shape = im_origin.shape[:2]
            im, _ = transforms(im_origin)
            im = im[np.newaxis, ...]
            im = paddle.to_tensor(im)

            # Three horizontally overlapping crops from the central band
            # (each 1280 wide, stepping by 640 — fixed for the expected
            # input resolution; presumably 1920x1080 sources, verify).
            ims, _ = transforms_crop(im_origin)
            im1 = ims[:, 540:540 + 720, 320:320 + 1280]
            im2 = ims[:, 540:540 + 720, 960:960 + 1280]
            im3 = ims[:, 540:540 + 720, 1600:1600 + 1280]
            im1 = im1[np.newaxis, ...]
            im1 = paddle.to_tensor(im1)
            im2 = im2[np.newaxis, ...]
            im2 = paddle.to_tensor(im2)
            im3 = im3[np.newaxis, ...]
            im3 = paddle.to_tensor(im3)
            ims_ = [im1, im2, im3]

            # Full-image prediction from the two-model ensemble.
            if aug_pred:
                pred = infer_ensemble.aug_inference(
                    model,
                    model_1,
                    im,
                    ori_shape=ori_shape,
                    transforms=transforms.transforms,
                    scales=scales,
                    flip_horizontal=flip_horizontal,
                    flip_vertical=flip_vertical,
                    is_slide=is_slide,
                    stride=stride,
                    crop_size=crop_size)
            else:
                pred = infer_ensemble.inference(
                    model,
                    model_1,
                    im,
                    ori_shape=ori_shape,
                    transforms=transforms.transforms,
                    is_slide=is_slide,
                    stride=stride,
                    crop_size=crop_size)

            # Per-crop predictions (uses `model`, not `model_crop` — see
            # the NOTE(review) in the docstring).
            preds = []
            for ii in range(3):
                im_ = ims_[ii]
                if aug_pred:
                    pred_crop = infer_crop.aug_inference(
                        model,
                        im_,
                        ori_shape=ori_shape,
                        transforms=transforms.transforms,
                        scales=scales,
                        flip_horizontal=flip_horizontal,
                        flip_vertical=flip_vertical,
                        is_slide=is_slide,
                        stride=stride,
                        crop_size=crop_size)
                else:
                    pred_crop = infer_crop.inference(
                        model,
                        im_,
                        ori_shape=ori_shape,
                        transforms=transforms.transforms,
                        is_slide=is_slide,
                        stride=stride,
                        crop_size=crop_size)
                preds.append(pred_crop)

            # Average the overlapping halves of adjacent crops, then stitch
            # left edge + two blended seams + right edge back together.
            left_ensem = (
                preds[0][:, :, :, 640:1280] + preds[1][:, :, :, 0:640]) / 2
            right_ensem = (
                preds[1][:, :, :, 640:1280] + preds[2][:, :, :, 0:640]) / 2
            pred_ensem = paddle.concat(
                [
                    preds[0][:, :, :, 0:640], left_ensem, right_ensem,
                    preds[2][:, :, :, 640:1280]
                ],
                axis=3)
            logit = F.interpolate(pred_ensem, (432, 768), mode='bilinear')

            # Paste the crop ensemble into the central band of the
            # full-image logits and sum the two predictions.
            pred_logit = pred.clone()
            pred_logit[:, :, 324:756, 576:1344] = logit
            pred = pred + pred_logit
            pred = F.interpolate(pred, ori_shape, mode='bilinear')
            pred = paddle.argmax(pred, axis=1, keepdim=True, dtype='int32')
            pred = paddle.squeeze(pred)
            pred = pred.numpy().astype('uint8')

            # get the saved name
            if image_dir is not None:
                im_file = im_path.replace(image_dir, '')
            else:
                im_file = os.path.basename(im_path)
            if im_file[0] == '/':
                im_file = im_file[1:]

            # save added image
            added_image = utils.visualize.visualize(
                im_path, pred, weight=0.6)
            added_image_path = os.path.join(added_saved_dir, im_file)
            mkdir(added_image_path)
            cv2.imwrite(added_image_path, added_image)

            # save pseudo color prediction
            pred_mask = utils.visualize.get_pseudo_color_map(pred)
            pred_saved_path = os.path.join(pred_saved_dir,
                                           im_file.rsplit(".")[0] + ".png")
            mkdir(pred_saved_path)
            pred_mask.save(pred_saved_path)

            # pred_im = utils.visualize(im_path, pred, weight=0.0)
            # pred_saved_path = os.path.join(pred_saved_dir, im_file)
            # mkdir(pred_saved_path)
            # cv2.imwrite(pred_saved_path, pred_im)

            progbar_pred.update(i + 1)
def main(args):
    """Dygraph training entry point for the FM recommender model.

    Reads settings from the YAML config given on the command line, builds the
    model, data loader and Adam optimizer, trains for the configured number
    of epochs while tracking AUC, and saves a checkpoint after each epoch.
    """
    # Fixed seed for reproducible runs.
    paddle.seed(12345)

    # load config
    config = load_yaml(args.config_yaml)
    use_gpu = config.get("dygraph.use_gpu", True)
    train_data_dir = config.get("dygraph.train_data_dir", None)
    epochs = config.get("dygraph.epochs", None)
    print_interval = config.get("dygraph.print_interval", None)
    model_save_path = config.get("dygraph.model_save_path", "model_output")
    dense_input_dim = config.get('hyper_parameters.dense_input_dim', None)

    print("***********************************")
    logger.info(
        "use_gpu: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}"
        .format(use_gpu, train_data_dir, epochs, print_interval,
                model_save_path))
    print("***********************************")

    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    fm_model = create_model(config)
    model_init_path = config.get("dygraph.model_init_path", None)
    if model_init_path is not None:
        # Warm-start from a previously saved checkpoint.
        load_model(model_init_path, fm_model)

    # to do : add optimizer function
    optimizer = paddle.optimizer.Adam(parameters=fm_model.parameters())

    file_list = [
        os.path.join(train_data_dir, x) for x in os.listdir(train_data_dir)
    ]
    print("read data")
    dataset = CriteoDataset(file_list)
    train_dataloader = create_data_loader(dataset, place=place, config=config)

    # Resume epoch numbering when restarting a run.
    last_epoch_id = config.get("last_epoch", -1)

    for epoch_id in range(last_epoch_id + 1, epochs):
        # set train mode
        fm_model.train()
        auc_metric = paddle.metric.Auc("ROC")
        epoch_begin = time.time()
        interval_begin = time.time()
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()

        for batch_id, batch in enumerate(train_dataloader()):
            train_reader_cost += time.time() - reader_start
            optimizer.clear_grad()
            train_start = time.time()
            batch_size = len(batch[0])

            label, sparse_tensor, dense_tensor = create_feeds(
                batch, dense_input_dim)

            pred = fm_model(sparse_tensor, dense_tensor)

            loss = create_loss(pred, label)

            loss.backward()
            optimizer.step()
            train_run_cost += time.time() - train_start
            total_samples += batch_size

            label_int = paddle.cast(label, 'int64')

            # for auc: the metric needs two-column [P(neg), P(pos)] scores.
            predict_2d = paddle.concat(x=[1 - pred, pred], axis=1)
            auc_metric.update(
                preds=predict_2d.numpy(), labels=label_int.numpy())

            if batch_id % print_interval == 0:
                logger.info(
                    "epoch: {}, batch_id: {}, auc: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec"
                    .format(epoch_id, batch_id,
                            auc_metric.accumulate(), train_reader_cost /
                            print_interval, (train_reader_cost +
                                             train_run_cost) /
                            print_interval, total_samples / print_interval,
                            total_samples / (train_reader_cost +
                                             train_run_cost)))
                # Reset windowed timing stats after each report.
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()

        logger.info(
            "epoch: {} done, auc: {:.6f}, epoch time:{:.2f} s".format(
                epoch_id, auc_metric.accumulate(),
                time.time() - epoch_begin))

        save_model(
            fm_model, optimizer, model_save_path, epoch_id, prefix='rec')
def get_loss(self, cate_preds, kernel_preds, ins_pred, ins_labels,
             cate_labels, grid_order_list, fg_num):
    """
    Get loss of network of SOLOv2: build per-level instance masks from the
    predicted dynamic kernels, then delegate to ``self.solov2_loss``.

    Args:
        cate_preds (list): Tensor list of category branch output,
            one tensor per FPN level.
        kernel_preds (list): Tensor list of kernel branch output,
            one tensor per FPN level.
        ins_pred (Tensor): Instance (mask feature) branch output.
        ins_labels (list): List of instance labels per batch.
        cate_labels (list): List of category labels per batch.
        grid_order_list (list): List of positive grid-cell indices
            per level.
        fg_num (Tensor): Number of positive samples in a mini-batch.
    Returns:
        dict: ``{'loss_ins': Tensor, 'loss_cate': Tensor}`` — the instance
        and category loss of the SOLOv2 network.
    """
    # Batch size is taken from the first level's grid-order tensor
    # (leading dim), via paddle.shape so dynamic shapes are supported.
    batch_size = paddle.shape(grid_order_list[0])[0]
    ins_pred_list = []
    for kernel_preds_level, grid_orders_level in zip(kernel_preds,
                                                     grid_order_list):
        # A level with no positive grid cells contributes no instance
        # predictions; keep a None placeholder so levels stay aligned.
        if grid_orders_level.shape[1] == 0:
            ins_pred_list.append(None)
            continue
        grid_orders_level = paddle.reshape(grid_orders_level, [-1])
        # Flatten the kernel map so one row corresponds to one grid cell:
        # assumes layout (N, C, H, W) -> (N, C, H*W) -> (N*H*W, C)
        # — TODO confirm against the kernel branch output layout.
        reshape_pred = paddle.reshape(
            kernel_preds_level,
            shape=(paddle.shape(kernel_preds_level)[0],
                   paddle.shape(kernel_preds_level)[1], -1))
        reshape_pred = paddle.transpose(reshape_pred, [0, 2, 1])
        reshape_pred = paddle.reshape(
            reshape_pred, shape=(-1, paddle.shape(reshape_pred)[2]))
        # Select only the kernels at positive grid cells, then regroup
        # them per batch image: (batch_size, n_pos_per_img, C).
        gathered_pred = paddle.gather(reshape_pred,
                                      index=grid_orders_level)
        gathered_pred = paddle.reshape(
            gathered_pred,
            shape=[batch_size, -1, paddle.shape(gathered_pred)[1]])
        cur_ins_pred = ins_pred
        # Flatten mask features spatially so the dynamic convolution can
        # be expressed as a batched matmul.
        cur_ins_pred = paddle.reshape(
            cur_ins_pred,
            shape=(paddle.shape(cur_ins_pred)[0],
                   paddle.shape(cur_ins_pred)[1], -1))
        # Dynamic conv: gathered kernels x mask features -> instance masks.
        ins_pred_conv = paddle.matmul(gathered_pred, cur_ins_pred)
        # Restore the spatial dims taken from the original ins_pred.
        cur_ins_pred = paddle.reshape(
            ins_pred_conv,
            shape=(-1, paddle.shape(ins_pred)[-2],
                   paddle.shape(ins_pred)[-1]))
        ins_pred_list.append(cur_ins_pred)
    num_ins = paddle.sum(fg_num)
    # Flatten every level's category map to rows of cate_out_channels
    # scores, then concatenate across levels.
    cate_preds = [
        paddle.reshape(
            paddle.transpose(cate_pred, [0, 2, 3, 1]),
            shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds
    ]
    flatten_cate_preds = paddle.concat(cate_preds)
    # Flatten the per-batch category labels to match flatten_cate_preds.
    new_cate_labels = []
    for cate_label in cate_labels:
        new_cate_labels.append(paddle.reshape(cate_label, shape=[-1]))
    cate_labels = paddle.concat(new_cate_labels)
    loss_ins, loss_cate = self.solov2_loss(
        ins_pred_list, ins_labels, flatten_cate_preds, cate_labels, num_ins)
    return {'loss_ins': loss_ins, 'loss_cate': loss_cate}
def forward(self, inputs_tensor, is_infer=0):
    """Run the DMR network over one batch.

    Args:
        inputs_tensor (list): ``[sparse, dense]``. Column layout of the
            sparse tensor as consumed below: 0-49 behavior-tag history,
            50-99 category history, 100-149 brand history, 150-199 mask,
            200-249 match mask, then single-value profile/ad features in
            columns 250-263 and 265 (column 264 is not read here — price
            comes from the dense tensor instead; presumably intentional,
            verify against the data pipeline), and column 266 holds the
            label when training.
        is_infer (int): 0 for training (loss is computed), non-zero for
            inference (a dummy loss of ones is returned).

    Returns:
        tuple: ``(y_hat, loss)`` when training, ``(y_hat, ones[1])`` at
        inference time.

    Fix vs. previous version: the final branch compared ``is_infer ==
    False`` (PEP 8 E712) while the top of the method used ``is_infer ==
    0``; both now use ``== 0``. Behavior is unchanged (``0 == False``).
    """
    # input
    inputs = inputs_tensor[0]  # sparse_tensor
    dense_tensor = inputs_tensor[1]
    # Behavior-history slices (each 50 columns wide) and their masks.
    self.btag_his = inputs[:, 0:50]
    self.cate_his = inputs[:, 50:100]
    self.brand_his = inputs[:, 100:150]
    self.mask = inputs[:, 150:200]
    self.match_mask = inputs[:, 200:250]
    # Single-column user-profile and ad features.
    self.uid = inputs[:, 250]
    self.cms_segid = inputs[:, 251]
    self.cms_group_id = inputs[:, 252]
    self.final_gender_code = inputs[:, 253]
    self.age_level = inputs[:, 254]
    self.pvalue_level = inputs[:, 255]
    self.shopping_level = inputs[:, 256]
    self.occupation = inputs[:, 257]
    self.new_user_class_level = inputs[:, 258]
    self.mid = inputs[:, 259]
    self.cate_id = inputs[:, 260]
    self.campaign_id = inputs[:, 261]
    self.customer = inputs[:, 262]
    self.brand = inputs[:, 263]
    self.price = dense_tensor.astype('float32')
    self.pid = inputs[:, 265]
    if is_infer == 0:
        # Labels are only present in training batches.
        self.labels = inputs[:, 266]

    # embedding layer
    self.uid_batch_embedded = self.uid_embeddings_var(self.uid)
    self.mid_batch_embedded = self.mid_embeddings_var(self.mid)
    self.cat_batch_embedded = self.cat_embeddings_var(self.cate_id)
    self.cat_his_batch_embedded = self.cat_embeddings_var(self.cate_his)
    self.brand_batch_embedded = self.brand_embeddings_var(self.brand)
    self.brand_his_batch_embedded = self.brand_embeddings_var(
        self.brand_his)
    self.btag_his_batch_embedded = self.btag_embeddings_var(self.btag_his)
    self.dm_btag_his_batch_embedded = self.dm_btag_embeddings_var(
        self.btag_his)
    self.campaign_id_batch_embedded = self.campaign_id_embeddings_var(
        self.campaign_id)
    self.customer_batch_embedded = self.customer_embeddings_var(
        self.customer)
    self.cms_segid_batch_embedded = self.cms_segid_embeddings_var(
        self.cms_segid)
    self.cms_group_id_batch_embedded = self.cms_group_id_embeddings_var(
        self.cms_group_id)
    self.final_gender_code_batch_embedded = self.final_gender_code_embeddings_var(
        self.final_gender_code)
    self.age_level_batch_embedded = self.age_level_embeddings_var(
        self.age_level)
    self.pvalue_level_batch_embedded = self.pvalue_level_embeddings_var(
        self.pvalue_level)
    self.shopping_level_batch_embedded = self.shopping_level_embeddings_var(
        self.shopping_level)
    self.occupation_batch_embedded = self.occupation_embeddings_var(
        self.occupation)
    self.new_user_class_level_batch_embedded = self.new_user_class_level_embeddings_var(
        self.new_user_class_level)
    self.pid_batch_embedded = self.pid_embeddings_var(self.pid)

    # Concatenate the per-group embeddings into feature vectors.
    self.user_feat = paddle.concat([
        self.uid_batch_embedded, self.cms_segid_batch_embedded,
        self.cms_group_id_batch_embedded,
        self.final_gender_code_batch_embedded, self.age_level_batch_embedded,
        self.pvalue_level_batch_embedded, self.shopping_level_batch_embedded,
        self.occupation_batch_embedded,
        self.new_user_class_level_batch_embedded
    ], -1)
    self.item_his_eb = paddle.concat(
        [self.cat_his_batch_embedded, self.brand_his_batch_embedded], -1)
    # Sum-pool the behavior history over the time axis.
    self.item_his_eb_sum = paddle.sum(self.item_his_eb, 1)
    self.item_feat = paddle.concat([
        self.mid_batch_embedded, self.cat_batch_embedded,
        self.brand_batch_embedded, self.campaign_id_batch_embedded,
        self.customer_batch_embedded, self.price
    ], -1)
    self.item_eb = paddle.concat(
        [self.cat_batch_embedded, self.brand_batch_embedded], -1)
    self.context_feat = self.pid_batch_embedded

    # Position embeddings, tiled across the batch then reshaped.
    # NOTE(review): self.position_his / self.dm_position_his are not set
    # in this method — presumably created in __init__; confirm.
    self.position_his_eb = self.position_embeddings_var(
        self.position_his)  # T, E
    self.position_his_eb = paddle.tile(
        self.position_his_eb, [paddle.shape(self.mid)[0], 1])  # B*T, E
    self.position_his_eb = paddle.reshape(self.position_his_eb, [
        paddle.shape(self.mid)[0], -1,
        paddle.shape(self.position_his_eb)[1]
    ])  # B, T, E
    self.dm_position_his_eb = self.dm_position_embeddings_var(
        self.dm_position_his)  # T, E
    self.dm_position_his_eb = paddle.tile(
        self.dm_position_his_eb, [paddle.shape(self.mid)[0], 1])  # B*T, E
    self.dm_position_his_eb = paddle.reshape(self.dm_position_his_eb, [
        paddle.shape(self.mid)[0], -1,
        paddle.shape(self.dm_position_his_eb)[1]
    ])  # B, T, E
    # Append the behavior-tag embeddings to the position embeddings.
    self.position_his_eb = paddle.concat(
        [self.position_his_eb, self.btag_his_batch_embedded], -1)
    self.dm_position_his_eb = paddle.concat(
        [self.dm_position_his_eb, self.dm_btag_his_batch_embedded], -1)

    # User-to-Item Network
    # Auxiliary Match Network
    self.match_mask = paddle.cast(self.match_mask, 'float32')
    self.aux_loss, self.dm_user_vector, scores = self._deep_match(
        self.item_his_eb, self.dm_position_his_eb, self.mask,
        self.match_mask, self.cate_his, self.dm_item_vectors_var.weight,
        self.dm_item_biases, self.cate_size)
    self.aux_loss *= 0.1  # down-weight the auxiliary match loss
    self.dm_item_vec = self.dm_item_vectors_var(self.cate_id)
    # User-to-item relevance: dot product of user and item vectors.
    rel_u2i = paddle.sum(self.dm_user_vector * self.dm_item_vec,
                         -1,
                         keepdim=True)  # B,1
    self.rel_u2i = rel_u2i

    # Item-to-Item Network
    att_outputs, alphas, scores_unnorm = self._dmr_fcn_attention(
        self.item_eb, self.item_his_eb, self.position_his_eb, self.mask)
    # Item-to-item relevance: sum of unnormalized attention scores.
    rel_i2i = paddle.unsqueeze(paddle.sum(scores_unnorm, [1, 2]), -1)
    self.rel_i2i = rel_i2i
    self.scores = paddle.sum(alphas, 1)

    inp = paddle.concat([
        self.user_feat, self.item_feat, self.context_feat,
        self.item_his_eb_sum, self.item_eb * self.item_his_eb_sum, rel_u2i,
        rel_i2i, att_outputs
    ], -1)

    # build fcn net
    inp = self.inp_layer(inp)
    dnn0 = self.dnn0_layer(inp)
    dnn0 = self.dnn0_prelu(dnn0)
    dnn1 = self.dnn1_layer(dnn0)
    dnn1 = self.dnn1_prelu(dnn1)
    dnn2 = self.dnn2_layer(dnn1)
    dnn2 = self.dnn2_prelu(dnn2)
    dnn3 = self.dnn3_layer(dnn2)
    dnn3 = self.dnn3_prelu(dnn3)

    # prediction
    self.y_hat = F.sigmoid(dnn3)

    if is_infer == 0:  # was `is_infer == False`; 0 == False, now consistent
        # Cross-entropy loss and optimizer initialization
        x = paddle.sum(dnn3, 1)
        BCE = paddle.nn.BCEWithLogitsLoss()
        ctr_loss = paddle.mean(BCE(x, label=self.labels.astype('float32')))
        self.ctr_loss = ctr_loss
        self.loss = self.ctr_loss + self.aux_loss
        return self.y_hat, self.loss
    else:
        return self.y_hat, paddle.ones(shape=[1])