def forward(self, inputs, targets):
    """
    Args:
        inputs (oneflow.Tensor): feature matrix with shape (batch_size, feat_dim).
        targets (oneflow.LongTensor): ground truth labels with shape (batch_size,).
    """
    n = inputs.size(0)
    # Compute pairwise distance; replace with the official op when merged
    dist = flow.pow(inputs, 2).sum(dim=1).expand(n, n)
    dist = dist + flow.transpose(dist, dim0=1, dim1=0)
    temp1 = -2 * flow.matmul(inputs, flow.transpose(inputs, dim0=1, dim1=0))
    dist = flow.add(dist, temp1)
    dist = flow.sqrt(flow.clamp(dist, min=1e-12))
    # For each anchor, find the hardest positive and negative
    mask = targets.expand(n, n).eq(
        flow.transpose(targets.expand(n, n), dim0=1, dim1=0))
    dist_ap, dist_an = [], []
    # y1 fills masked-out negatives with 0; y2 fills masked-out positives with a huge constant
    y1 = flow.zeros((1, n), dtype=flow.float32).to("cuda")
    y2 = flow.Tensor(np.exp(100 * np.ones((1, n)))).to("cuda")
    for i in range(n):
        temp_dist = flow.slice(dist, [(i, i + 1, 1)])
        temp_mask = flow.slice(mask, [(i, i + 1, 1)])
        temp_mask_rev = flow.slice(1 - mask, [(i, i + 1, 1)])
        dist_ap.append(temp_mask.where(temp_dist, y1).max().unsqueeze(0))
        dist_an.append(temp_mask_rev.where(temp_dist, y2).min().unsqueeze(0))
    dist_ap = flow.cat(dist_ap)
    dist_an = flow.cat(dist_an)
    # Compute ranking hinge loss
    y = flow.ones_like(dist_an)
    return self.ranking_loss(dist_an, dist_ap, y)
def convert(box_xywh):
    box_xy = flow.slice(box_xywh,
                        begin=[None, None, None, None, None, 0],
                        size=[None, None, None, None, None, 2])
    box_wh = flow.slice(box_xywh,
                        begin=[None, None, None, None, None, 2],
                        size=[None, None, None, None, None, 2])
    box_lt = box_xy - box_wh * 0.5
    box_rb = box_xy + box_wh * 0.5
    # Order the two corners elementwise. Compute min and max from the same
    # operands: updating box_lt first and then taking the max against the
    # updated value would always return the original box_rb.
    box_lt, box_rb = (flow.math.minimum(box_lt, box_rb),
                      flow.math.maximum(box_lt, box_rb))
    return box_lt, box_rb
def SQuAD(
    input_ids_blob,
    input_mask_blob,
    token_type_ids_blob,
    vocab_size,
    seq_length=512,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=16,
    initializer_range=0.02,
):
    backbone = bert_util.BertBackbone(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range,
    )
    with flow.scope.namespace("cls-squad"):
        final_hidden = backbone.sequence_output()
        final_hidden_matrix = flow.reshape(final_hidden, [-1, hidden_size])
        logits = bert_util._FullyConnected(
            final_hidden_matrix,
            hidden_size,
            units=2,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name='output')
        logits = flow.reshape(logits, [-1, seq_length, 2])
        start_logits = flow.slice(logits, [None, None, 0], [None, None, 1])
        end_logits = flow.slice(logits, [None, None, 1], [None, None, 1])
    return start_logits, end_logits
def yolo_train_job():
    images, ground_truth, gt_valid_num = yolo_train_decoder(
        args.batch_size, args.image_height, args.image_width, args.classes,
        args.num_boxes, args.hue, args.jitter, args.saturation, args.exposure,
        args.dataset_dir, "yolo")
    gt_boxes = flow.slice(ground_truth, [None, 0, 0], [None, -1, 4],
                          name='gt_box')
    gt_labels = flow.cast(flow.slice(ground_truth, [None, 0, 4], [None, -1, 1],
                                     name='gt_label'),
                          dtype=flow.int32)
    yolo_loss_result, statistics_info_result = YoloTrainNet(
        images, gt_boxes, gt_labels, gt_valid_num, True)
    flow.losses.add_loss(yolo_loss_result[0])
    flow.losses.add_loss(yolo_loss_result[1])
    flow.losses.add_loss(yolo_loss_result[2])
    return yolo_loss_result, statistics_info_result
def __call__(self, x, training, mask):
    # Sequence length
    seq_len = x.shape[1]
    # Embedding
    with flow.scope.namespace("Encoder_Embedding"):
        x = EmbeddingLayer(x,
                           vocab_size=self.vocab_size,
                           embedding_size=self.d_model)
        d_model_constant = flow.constant_scalar(value=self.d_model,
                                                dtype=flow.float32,
                                                name="d_model_constant")
        x *= flow.math.sqrt(d_model_constant)
    # Position encoding
    with flow.scope.namespace("Encoder_Position_encoding"):
        # equal to self.pos_encoding[:, :seq_len, :]
        pos_encoding = flow.slice(self.pos_encoding,
                                  begin=[None, 0, None],
                                  size=[None, seq_len, None])
        x += pos_encoding
        if training:
            x = flow.nn.dropout(x, rate=self.rate)
    # Encoding
    with flow.scope.namespace("Encoder_Multi_encoder"):
        for i in range(self.num_layers):
            with flow.scope.namespace('encoder_{}'.format(i)):
                x = self.enc_layers[i](x, training, mask)
    return x
def slice(x, begin, size):
    ndim = len(x.shape)
    if not isinstance(begin, (list, tuple)) or len(begin) != ndim:
        raise ValueError(
            "begin must be a list/tuple with the same length as input tensor's number of dimensions"
        )
    if not all(isinstance(b, int) or b is None for b in begin):
        raise ValueError("element of begin must be an int or None")
    if not isinstance(size, (list, tuple)) or len(size) != ndim:
        raise ValueError(
            "size must be a list/tuple with the same length as input tensor's number of dimensions"
        )
    if not all(isinstance(s, int) or s is None for s in size):
        raise ValueError("element of size must be an int or None")
    slice_tup_list = []
    for b, s, dim_size in zip(begin, size, x.shape):
        start, stop, step = None, None, 1
        if b is not None:
            if b < -dim_size or b >= dim_size:
                raise ValueError("element of begin is out of range")
            start = b
        if s is not None:
            if s == -1:
                stop = dim_size
            else:
                if s <= 0 or s > dim_size:
                    raise ValueError("element of size is invalid")
                # treat a None begin as 0 so that begin + size is well defined
                if (b or 0) + s < dim_size:
                    stop = (b or 0) + s
        slice_tup_list.append((start, stop, step))
    return flow.slice(x, slice_tup_list)
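# A minimal usage sketch of the begin/size wrapper above; not from the original
# source. Assumes eager OneFlow and that `slice` refers to the function defined
# above; the result is checked against the equivalent NumPy basic slice.
import numpy as np
import oneflow as flow

x = flow.tensor(np.arange(24).reshape(2, 3, 4).astype(np.float32))
# None keeps a dimension whole; begin=1 with size=2 on a dim of length 3 means rows 1:3
y = slice(x, begin=[None, 1, 0], size=[None, 2, 2])
assert np.array_equal(y.numpy(), x.numpy()[:, 1:3, 0:2])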
def _test_slice_4_dim(test_case, device):
    np_arr = np.random.randn(5, 3, 6, 9).astype(np.float32)
    x = flow.tensor(np_arr, device=flow.device(device))
    tup_list = [[0, 5, 2], [None, None, None], [0, 5, 2], [0, 6, 3]]
    y = flow.slice(x, slice_tup_list=tup_list)
    tmp = np_arr[0:5, 0:3, 0:5, 0:6]
    np_out = tmp[::2, ::1, ::2, ::3]
    test_case.assertTrue(np.array_equal(y.numpy(), np_out))
def transformer_train_job(input: tp.Numpy.Placeholder(
        shape=(params.batch_size, params.max_length), dtype=flow.int64),
                          target: tp.Numpy.Placeholder(
        shape=(params.batch_size, params.max_length), dtype=flow.int64)) -> tp.Numpy:
    """
    The transformer training Job
    :param input: The input sequence; we fix the shape to (batch_size, max_length)
    :param target: The target sequence; we fix the shape to (batch_size, max_length)
    :return: The loss value.
    """
    sample_transformer = Transformer(num_layers=6,
                                     d_model=512,
                                     num_heads=8,
                                     dff=2048,
                                     input_vocab_size=params.TARGET_VOCAB_SIZE,
                                     target_vocab_size=params.TARGET_VOCAB_SIZE,
                                     pe_input=params.TARGET_VOCAB_SIZE,
                                     pe_target=params.TARGET_VOCAB_SIZE)
    # Teacher forcing: the decoder consumes target[:, :-1] and is trained to
    # predict target[:, 1:].
    tar_inp = flow.slice(target, begin=[None, 0],
                         size=[None, params.max_length - 1])  # (batch, seq_len - 1)
    tar_real = flow.slice(target, begin=[None, 1],
                          size=[None, params.max_length - 1])  # (batch, seq_len - 1)
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(input, tar_inp)
    prediction, _ = sample_transformer(input,
                                       tar_inp,
                                       training=False,
                                       enc_padding_mask=enc_padding_mask,
                                       look_ahead_mask=combined_mask,
                                       dec_padding_mask=dec_padding_mask)
    loss = loss_function(tar_real, prediction)
    lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.01])
    flow.optimizer.Adam(lr_scheduler).minimize(loss)
    return loss
def _test_slice_empty(test_case, placement, sbp):
    dims = [random(1, 2) * 8 for _ in range(2)]
    input = random_tensor(2, *dims)
    x = input.to_global(placement=placement, sbp=sbp)
    slice_tup_list = [[3, 3, 1], [None, None, None]]
    of_out = flow.slice(x.oneflow, slice_tup_list=slice_tup_list)
    torch_out = x.pytorch[3:3:1, :]
    _check_forward_and_backward(test_case, input, of_out, torch_out)
def _test_slice_backward(test_case, device):
    np_arr = np.random.randn(3, 6, 9).astype(np.float32)
    x = flow.tensor(np_arr, device=flow.device(device), requires_grad=True)
    tup_list = [[None, None, None], [0, 5, 2], [0, 6, 3]]
    y = flow.slice(x, slice_tup_list=tup_list)
    z = y.sum()
    z.backward()
    np_grad = np.zeros((3, 6, 9))
    np_grad[0:3, 0:5, 0:6][::1, ::2, ::3] = 1
    test_case.assertTrue(np.array_equal(x.grad.numpy(), np_grad))
def bbox_giou(self, boxes1, boxes2):
    '''
    :param boxes1: [N, H, W, 3, 4] (x, y, w, h)
    :param boxes2: [N, H, W, 3, 4] (x, y, w, h)
    :return: [N, H, W, 3, 1]
    '''
    def convert(box_xywh):
        box_xy = flow.slice(box_xywh,
                            begin=[None, None, None, None, 0],
                            size=[None, None, None, None, 2])
        box_wh = flow.slice(box_xywh,
                            begin=[None, None, None, None, 2],
                            size=[None, None, None, None, 2])
        box_lt = box_xy - box_wh * 0.5
        box_rb = box_xy + box_wh * 0.5
        # order the corners elementwise, computing both from the same operands
        box_lt, box_rb = (flow.math.minimum(box_lt, box_rb),
                          flow.math.maximum(box_lt, box_rb))
        return box_lt, box_rb

    boxes1_lt, boxes1_rb = convert(boxes1)
    boxes1_wh = boxes1_rb - boxes1_lt
    # boxes1_wh = flow.math.clip_by_value(boxes1_rb - boxes1_lt, min_value=0)
    boxes1_area = flow.slice(boxes1_wh, begin=[None, None, None, None, 0],
                             size=[None, None, None, None, 1]) * \
                  flow.slice(boxes1_wh, begin=[None, None, None, None, 1],
                             size=[None, None, None, None, 1])
    boxes2_lt, boxes2_rb = convert(boxes2)
    boxes2_wh = boxes2_rb - boxes2_lt
    # boxes2_wh = flow.math.clip_by_value(boxes2_rb - boxes2_lt, min_value=0)
    boxes2_area = flow.slice(boxes2_wh, begin=[None, None, None, None, 0],
                             size=[None, None, None, None, 1]) * \
                  flow.slice(boxes2_wh, begin=[None, None, None, None, 1],
                             size=[None, None, None, None, 1])
    left_up = flow.math.maximum(boxes1_lt, boxes2_lt)
    right_down = flow.math.minimum(boxes1_rb, boxes2_rb)
    inter_section_wh = flow.math.clip_by_value(right_down - left_up, min_value=0.0)
    inter_area = flow.slice(inter_section_wh, begin=[None, None, None, None, 0],
                            size=[None, None, None, None, 1]) * \
                 flow.slice(inter_section_wh, begin=[None, None, None, None, 1],
                            size=[None, None, None, None, 1])
    union_area = boxes1_area + boxes2_area - inter_area
    # 1e-6 in the denominator avoids inf, which may cause a NaN loss
    iou = inter_area / (union_area + 1e-6)
    enclose_left_up = flow.math.minimum(boxes1_lt, boxes2_lt)
    enclose_right_down = flow.math.maximum(boxes1_rb, boxes2_rb)
    enclose_wh = flow.math.clip_by_value(enclose_right_down - enclose_left_up,
                                         min_value=0.0)
    enclose_area = flow.slice(enclose_wh, begin=[None, None, None, None, 0],
                              size=[None, None, None, None, 1]) * \
                   flow.slice(enclose_wh, begin=[None, None, None, None, 1],
                              size=[None, None, None, None, 1])
    giou = iou - 1.0 * (enclose_area - union_area) / (enclose_area + 1e-6)
    return giou
def nonzero_op(input, as_tuple=False):
    if as_tuple and not input.ndim:
        input = input.unsqueeze(0)
    res, size = flow._C.argwhere(input)
    slice_tup_list = [[0, int(size.numpy()), 1]]
    res = flow.slice(res, slice_tup_list=slice_tup_list)
    if as_tuple:
        return tuple(flow._C.transpose(res, [1, 0])[x]
                     for x in range(res.shape[1]))
    else:
        return res
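# A behavior sketch for nonzero_op above; not from the original source. Assumes
# the op is exposed as flow.nonzero, mirroring torch.nonzero's two modes.
import numpy as np
import oneflow as flow

x = flow.tensor(np.array([[0, 1], [2, 0]]))
print(flow.nonzero(x))                 # [[0, 1], [1, 0]]: one row per non-zero element
print(flow.nonzero(x, as_tuple=True))  # (tensor([0, 1]), tensor([1, 0])): one tensor per dimension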
def split(cls, x, axis, split_num):
    split_len = x.shape[axis] // split_num
    result_list = []
    slice_begin = [0] * len(x.shape)
    slice_size = [-1] * len(x.shape)
    slice_size[axis] = split_len
    for i in range(split_num):
        slice_begin[axis] = i * split_len
        result = flow.slice(x, slice_begin, slice_size)
        result_list.append(result)
    return result_list
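# A usage sketch for the split helper above; not from the original source.
# Assumes a OneFlow version where flow.slice accepts the begin/size form (as
# the wrapper defined earlier in this section does) and a hypothetical host
# class named Splitter that carries the classmethod.
import numpy as np
import oneflow as flow

x = flow.tensor(np.arange(12).reshape(2, 6).astype(np.float32))
chunks = Splitter.split(x, axis=1, split_num=3)  # three chunks, each (2, 2)
assert len(chunks) == 3 and tuple(chunks[0].shape) == (2, 2)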
def YoloTrainLayer(in_blob, gt_bbox_blob, gt_label_blob, gt_valid_num_blob, i):
    global layer_number
    layer_name = 'yolo-layer' + str(layer_number)
    # placeholder for a reshape from (n, h, w, 255) -> (n, h, w*3, 85)
    blob = flow.transpose(in_blob, name=layer_name + '-yolo_transpose',
                          perm=[0, 2, 3, 1])
    reshape_blob = flow.reshape(blob, shape=(blob.shape[0], -1, 85),
                                name=layer_name + '-yolo_reshape')
    position = flow.slice(reshape_blob, [None, 0, 0], [None, -1, 4],
                          name=layer_name + '-yolo_slice_pos')
    xy = flow.slice(position, [None, 0, 0], [None, -1, 2],
                    name=layer_name + '-yolo_slice_xy')
    wh = flow.slice(position, [None, 0, 2], [None, -1, 2],
                    name=layer_name + '-yolo_slice_wh')
    xy = logistic(xy, name=layer_name + '-yolo_logistic_xy')
    # xy = flow.math.sigmoid(xy, name=layer_name + '-yolo_logistic_xy')
    position = flow.concat([xy, wh], axis=2, name=layer_name + '-yolo_concat')
    confidence = flow.slice(reshape_blob, [None, 0, 4], [None, -1, 81],
                            name=layer_name + '-yolo_slice_prob')
    confidence = logistic(confidence, name=layer_name + '-yolo_logistic_prob')
    # confidence = flow.math.sigmoid(confidence, name=layer_name + '-yolo_logistic_prob')
    objness = flow.slice(confidence, [None, 0, 0], [None, -1, 1],
                         name=layer_name + '-yolo_slice_objness')
    clsprob = flow.slice(confidence, [None, 0, 1], [None, -1, 80],
                         name=layer_name + '-yolo_slice_clsprob')
    bbox_loc_diff, pos_inds, pos_cls_label, neg_inds, valid_num, statistics_info = yolo_box_diff(
        position, gt_bbox_blob, gt_label_blob, gt_valid_num_blob,
        image_height=yolo_box_diff_conf[i]['image_height'],
        image_width=yolo_box_diff_conf[i]['image_width'],
        layer_height=yolo_box_diff_conf[i]['layer_height'],
        layer_width=yolo_box_diff_conf[i]['layer_width'],
        ignore_thresh=yolo_box_diff_conf[i]['ignore_thresh'],
        truth_thresh=yolo_box_diff_conf[i]['truth_thresh'],
        box_mask=yolo_box_diff_conf[i]['box_mask'],
        anchor_boxes_size=yolo_box_diff_conf[i]['anchor_boxes_size'],
        name=layer_name + '-yolo_box_loss')  # placeholder for yolobox layer
    bbox_objness_out, bbox_clsprob_out = yolo_prob_loss(
        objness, clsprob, pos_inds, pos_cls_label, neg_inds, valid_num,
        num_classes=80, name=layer_name + '-yolo_prob_loss')
    bbox_loss = flow.concat([bbox_loc_diff, bbox_objness_out, bbox_clsprob_out],
                            axis=2, name=layer_name + '-loss_concat')
    bbox_loss_reduce_sum = flow.math.reduce_sum(
        bbox_loss, axis=[1, 2], name=layer_name + '-bbox_loss_reduce_sum')
    return bbox_loss_reduce_sum, statistics_info
def call(self, y_pred, target, target_weight):
    batch_size = y_pred.shape[0]
    num_of_joints = y_pred.shape[-1]
    pred = flow.reshape(x=y_pred, shape=(batch_size, -1, num_of_joints))
    heatmap_pred_list = []
    for i in range(num_of_joints):
        tensor = flow.slice(pred, begin=[None, None, i], size=[None, None, 1])
        heatmap_pred_list.append(tensor)
    gt = flow.reshape(x=target, shape=(batch_size, -1, num_of_joints))
    heatmap_gt_list = []
    for i in range(num_of_joints):
        tensor = flow.slice(gt, begin=[None, None, i], size=[None, None, 1])
        heatmap_gt_list.append(tensor)
    loss = 0.0
    for i in range(num_of_joints):
        heatmap_pred = flow.squeeze(heatmap_pred_list[i])
        heatmap_gt = flow.squeeze(heatmap_gt_list[i])
        joint_weight = flow.reshape(
            flow.slice(target_weight, begin=[None, i, None],
                       size=[None, 1, None]),
            [batch_size, 1])
        y_true = heatmap_gt * joint_weight
        y_pred = heatmap_pred * joint_weight
        loss += 0.5 * flow.nn.MSELoss(y_true, y_pred, reduction="mean")
    return loss / num_of_joints
def total_variance_loss(self, images, weight):
    # total variation (TV) loss over the generated HR images
    assert images.shape == (self.batch_size, 3, self.hr_size, self.hr_size), \
        "The shape of generated images is {}.".format(images.shape)

    def size_num(inputs):
        return inputs.shape[1] * inputs.shape[2] * inputs.shape[3]

    count_h = size_num(
        flow.slice(images, [None, 0, 1, 0], [None, 3, self.hr_size, self.hr_size]))
    count_w = size_num(
        flow.slice(images, [None, 0, 0, 1], [None, 3, self.hr_size, self.hr_size]))
    # squared differences between vertically / horizontally adjacent pixels
    h_tv = flow.math.reduce_sum(
        flow.math.squared_difference(
            flow.slice(images, [None, 0, 1, 0], [None, 3, self.hr_size, self.hr_size]),
            flow.slice(images, [None, 0, 0, 0], [None, 3, self.hr_size - 1, self.hr_size])))
    w_tv = flow.math.reduce_sum(
        flow.math.squared_difference(
            flow.slice(images, [None, 0, 0, 1], [None, 3, self.hr_size, self.hr_size]),
            flow.slice(images, [None, 0, 0, 0], [None, 3, self.hr_size, self.hr_size - 1])))
    return weight * 2 * (h_tv / count_h + w_tv / count_w) / images.shape[0]
def PooledOutput(sequence_output, hidden_size, initializer_range):
    with flow.scope.namespace("bert-pooler"):
        first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
        first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
        pooled_output = bert_util._FullyConnected(
            first_token_tensor,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="dense",
        )
        pooled_output = flow.math.tanh(pooled_output)
    return pooled_output
def YoloPredictLayer(in_blob, origin_image_info, i, trainable):
    global layer_number
    layer_name = 'yolo-layer' + str(layer_number)
    # placeholder for a reshape from (n, h, w, 255) -> (n, h, w*3, 85)
    blob = flow.transpose(in_blob, name=layer_name + '-yolo_transpose',
                          perm=[0, 2, 3, 1])
    reshape_blob = flow.reshape(blob, shape=(blob.shape[0], -1, 85),
                                name=layer_name + '-yolo_reshape')
    position = flow.slice(reshape_blob, [None, 0, 0], [None, -1, 4],
                          name=layer_name + '-yolo_slice_pos')
    xy = flow.slice(position, [None, 0, 0], [None, -1, 2],
                    name=layer_name + '-yolo_slice_xy')
    wh = flow.slice(position, [None, 0, 2], [None, -1, 2],
                    name=layer_name + '-yolo_slice_wh')
    xy = flow.math.sigmoid(xy, name=layer_name + '-yolo_logistic_xy')
    position = flow.concat([xy, wh], axis=2, name=layer_name + '-yolo_concat')
    confidence = flow.slice(reshape_blob, [None, 0, 4], [None, -1, 81],
                            name=layer_name + '-yolo_slice_prob')
    confidence = flow.math.sigmoid(confidence, name=layer_name + '-yolo_logistic_prob')
    # [out_bbox, out_probs, valid_num] = flow.detection.yolo_detect(bbox=position, probs=confidence, origin_image_info=origin_image_info, image_height=608, image_width=608, layer_height=yolo_conf[i]['layer_height'], layer_width=yolo_conf[i]['layer_width'], prob_thresh=0.5, num_classes=80, max_out_boxes=max_out_boxes, anchor_boxes=yolo_conf[i]['anchor_boxes_size'])
    [out_bbox, out_probs, valid_num] = flow.yolo_detect(
        bbox=position,
        probs=confidence,
        origin_image_info=origin_image_info,
        image_height=608,
        image_width=608,
        layer_height=yolo_conf[i]['layer_height'],
        layer_width=yolo_conf[i]['layer_width'],
        prob_thresh=0.5,
        num_classes=80,
        max_out_boxes=max_out_boxes,
        anchor_boxes=yolo_conf[i]['anchor_boxes_size'],
        name=str(layer_name) + "yolo_detect")
    # print("out_bbox.shape", out_bbox.shape)
    return out_bbox, out_probs, valid_num
def __call__(self, x, enc_output, training, look_ahead_mask, padding_mask):
    """
    Forward
    :param x: The input X
    :param enc_output: The encoder output
    :param training: Whether training
    :param look_ahead_mask: The look-ahead mask
    :param padding_mask: The padding mask
    :return:
    """
    # Sequence length
    seq_len = x.shape[1]
    attention_weights = {}
    # Embedding
    with flow.scope.namespace("Decoder_Embedding"):
        x = EmbeddingLayer(x,
                           vocab_size=self.target_vocab_size,
                           embedding_size=self.d_model)
        d_model_constant = flow.constant(self.d_model,
                                         dtype=flow.float32,
                                         shape=(1,))
        x *= flow.math.sqrt(d_model_constant)
    # Position encoding
    with flow.scope.namespace("Decoder_Position_encoding"):
        # equal to self.pos_encoding[:, :seq_len, :]
        pos_encoding = flow.slice(self.pos_encoding,
                                  begin=[None, 0, None],
                                  size=[None, seq_len, None])
        x += pos_encoding
        if training:
            x = flow.nn.dropout(x, rate=self.rate)
    # Decoding
    with flow.scope.namespace("Decoder_Multi_decoder"):
        for i in range(self.num_layers):
            with flow.scope.namespace('decoder_{}'.format(i)):
                x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                       look_ahead_mask, padding_mask)
                attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
                attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2
    return x, attention_weights
def _EmbeddingPostprocessor(
    input_blob,
    seq_length,
    embedding_size,
    use_token_type=False,
    token_type_ids_blob=None,
    token_type_vocab_size=16,
    token_type_embedding_name="token_type_embeddings",
    use_position_embeddings=True,
    position_embedding_name="position_embeddings",
    initializer_range=0.02,
    max_position_embeddings=512,
    dropout_prob=0.1,
):
    output = input_blob
    if use_token_type:
        assert token_type_ids_blob is not None
        token_type_table = flow.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range),
        )
        token_type_embeddings = flow.gather(
            params=token_type_table, indices=token_type_ids_blob, axis=0
        )
        output = output + token_type_embeddings
    if use_position_embeddings:
        position_table = flow.get_variable(
            name=position_embedding_name,
            shape=[1, max_position_embeddings, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range),
        )
        assert seq_length <= max_position_embeddings
        if seq_length != max_position_embeddings:
            position_table = flow.slice(
                position_table, begin=[None, 0, 0], size=[None, seq_length, -1]
            )
        output = output + position_table
    output = _LayerNorm(output, embedding_size)
    output = _Dropout(output, dropout_prob)
    return output
def self_attn_qk_v_fw_bw(
    h: flow.typing.Numpy.Placeholder(
        shape=(seq_len, batch_size, hidden_size), dtype=flow.float32)
) -> typing.Tuple[flow.typing.Numpy, flow.typing.Numpy]:
    var = flow.get_variable(
        "var",
        shape=(1,),
        dtype=flow.float32,
        initializer=flow.constant_initializer(1.0, dtype=flow.float32),
        trainable=True,
    )
    h = h * var
    # save grad
    if fused:
        flow.watch_diff(h, test_global_storage.Setter("h_grad_fused"))
    else:
        flow.watch_diff(h, test_global_storage.Setter("h_grad"))
    if fp16:
        h = flow.amp_white_identity(h)
    alpha = get_alpha(head_size)
    if fused:
        qmk, v = flow.nn.fused_self_attention_query_mul_key_and_value(
            h, head_size=head_size, alpha=alpha)
    else:
        # (s, b, H) -> (s, b, n, 3 * h) -> (s, b, n, h) -> (b, n, s, h)
        h = flow.reshape(h, (seq_len, batch_size, -1, 3 * head_size))
        q, k, v = (flow.transpose(
            flow.slice(
                h,
                begin=[None, None, None, head_size * i],
                size=[None, None, None, head_size],
            ),
            perm=[1, 2, 0, 3],
        ) for i in range(3))
        qmk = flow.matmul(q, k, transpose_b=True, alpha=alpha)
    # calc loss for grad
    h = flow.matmul(qmk, v)
    loss = flow.math.reduce_sum(h)
    flow.optimizer.SGD(get_lr_scheduler(), momentum=0).minimize(loss)
    return qmk, v
def slice_wrapper(tensor, slice_tuple: Tuple[int, int, int]):
    with flow.no_grad():
        ndim = tensor.ndim
        slice_tuple_list = [slice_tuple] + [[None, None, None]] * (ndim - 1)
        # TODO(): a kind of 'slice op' that supports both local and consistent tensors
        if tensor.is_consistent:
            # input is s0, output is p
            # input is b,  output is b
            # input is p,  output is p
            # so 'to b' is not needed here
            tensor = flow.logical_slice(tensor, slice_tuple_list)
        else:
            tensor = flow.slice(tensor, slice_tuple_list)
        # TODO(): flow.squeeze fails in some consistent tensor cases
        if tensor.shape[0] == 1 and ndim > 1:
            tensor = tensor.reshape(list(tensor.shape[1:]))
    return tensor
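# A local-tensor sketch of slice_wrapper above; not from the original source.
# It picks rows 1:2 of a 2-D tensor; the leading length-1 dim is reshaped away.
import numpy as np
import oneflow as flow

t = flow.tensor(np.arange(6).reshape(2, 3).astype(np.float32))
row = slice_wrapper(t, (1, 2, 1))  # slice dim 0 as 1:2:1, keep every other dim whole
assert tuple(row.shape) == (3,)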
def GPT(idx, config, target=None):
    b, t = idx.shape
    assert t <= config.block_size, "Cannot forward, model block size is exhausted."
    # forward the GPT model
    # token embeddings
    word_embedding = flow.get_variable(
        'word_emb',
        initializer=flow.random_normal_initializer(),
        shape=(config.vocab_size, config.n_embd))
    token_embeddings = flow.gather(word_embedding, idx)
    # position embeddings
    pos_emb = flow.get_variable(name='pos_emb',
                                shape=(1, config.block_size, config.n_embd),
                                dtype=flow.float32,
                                initializer=flow.zeros_initializer())
    # equal to pos_emb[:, :t, :]; each position maps to a (learnable) vector
    position_embeddings = flow.slice(pos_emb, [None, 0, None], [None, t, None])
    x = flow.nn.dropout((token_embeddings + position_embeddings),
                        config.embd_pdrop)
    # blocks
    for block_id in range(config.n_layer):
        with flow.scope.namespace('Block' + str(block_id)):
            x = Block(x, config)
    x = flow.layers.layer_norm(x, name='output_layernorm')
    # the head is a plain linear projection (no activation, no bias)
    logits = flow.layers.dense(x,
                               config.vocab_size,
                               use_bias=False,
                               name='output_logits')
    loss = None
    if target is not None:
        logits = flow.reshape(logits, [-1, config.vocab_size])
        target = flow.reshape(target, [-1])
        target = flow.one_hot(target, depth=config.vocab_size, dtype=flow.float32)
        loss = flow.nn.softmax_cross_entropy_with_logits(logits, target)
    return logits, loss
def argwhere_op(input, dtype: Optional[flow.dtype] = flow.int32):
    """This operator finds the indices of input Tensor `input` elements that
    are non-zero. It returns a list in which each element is a coordinate that
    points to a non-zero element in the condition.

    Args:
        input (oneflow.Tensor): The input Tensor.
        dtype (Optional[flow.dtype], optional): The data type of output. Defaults to flow.int32.

    Returns:
        oneflow.Tensor: The result Tensor.

    For example:

    .. code-block:: python

        >>> import numpy as np
        >>> import oneflow as flow
        >>> x = np.array([[0, 1, 0],
        ...               [2, 0, 2]]).astype(np.float32)
        >>> input = flow.Tensor(x)
        >>> output = flow.argwhere(input)
        >>> output
        tensor([[0, 1],
                [1, 0],
                [1, 2]], dtype=oneflow.int32)

    """
    if input.is_consistent:
        raise ValueError(
            "A consistent tensor cannot be passed to argwhere; use `tensor.to_local()` to convert it to a local tensor first."
        )
    (res, size) = flow._C.argwhere(input, dtype=dtype)
    if input.is_lazy:
        raise NotImplementedError
        # return flow._C.sync_dynamic_resize(res, size, dim=0)
    else:
        slice_tup_list = [(0, size.numpy().item(), 1)]
        return flow.slice(res, slice_tup_list=slice_tup_list)
def bbox_iou(self, boxes1, boxes2):
    '''
    :param boxes1: [N, H, W, 3, 1, 4] (x, y, w, h)
    :param boxes2: [N, 1, 1, 1, V, 4] (x, y, w, h)
    :return: [N, H, W, 3, V, 1]
    '''
    def convert(box_xywh):
        box_xy = flow.slice(box_xywh,
                            begin=[None, None, None, None, None, 0],
                            size=[None, None, None, None, None, 2])
        box_wh = flow.slice(box_xywh,
                            begin=[None, None, None, None, None, 2],
                            size=[None, None, None, None, None, 2])
        box_lt = box_xy - box_wh * 0.5
        box_rb = box_xy + box_wh * 0.5
        # order the corners elementwise, computing both from the same operands
        box_lt, box_rb = (flow.math.minimum(box_lt, box_rb),
                          flow.math.maximum(box_lt, box_rb))
        return box_lt, box_rb

    boxes1_lt, boxes1_rb = convert(boxes1)
    boxes1_wh = boxes1_rb - boxes1_lt
    boxes1_area = flow.slice(boxes1_wh, begin=[None, None, None, None, None, 0],
                             size=[None, None, None, None, None, 1]) * \
                  flow.slice(boxes1_wh, begin=[None, None, None, None, None, 1],
                             size=[None, None, None, None, None, 1])
    boxes2_lt, boxes2_rb = convert(boxes2)
    boxes2_wh = boxes2_rb - boxes2_lt
    boxes2_area = flow.slice(boxes2_wh, begin=[None, None, None, None, None, 0],
                             size=[None, None, None, None, None, 1]) * \
                  flow.slice(boxes2_wh, begin=[None, None, None, None, None, 1],
                             size=[None, None, None, None, None, 1])
    left_up = flow.math.maximum(boxes1_lt, boxes2_lt)
    right_down = flow.math.minimum(boxes1_rb, boxes2_rb)
    inter_section_wh = flow.math.clip_by_value(right_down - left_up, min_value=0.0)
    inter_area = flow.slice(inter_section_wh, begin=[None, None, None, None, None, 0],
                            size=[None, None, None, None, None, 1]) * \
                 flow.slice(inter_section_wh, begin=[None, None, None, None, None, 1],
                            size=[None, None, None, None, None, 1])
    union_area = boxes1_area + boxes2_area - inter_area
    # 1e-6 in the denominator avoids division by zero
    iou = 1.0 * inter_area / (union_area + 1e-6)
    return iou
def build_network(self, inputs):
    b, c, t, h, w = inputs.shape
    N = self.time_dim
    templist = []
    for i in range(N):
        tempname = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
        if i != N // 2:
            out = flow.range(t, dtype=flow.int64)
            one = flow.constant_like(out, i, dtype=flow.int64)
            out = flow.math.add(out, one)
            out = flow.expand_dims(out, axis=0)
            templist.append(out)
    neighbor_time_index = flow.concat(templist, axis=0)
    neighbor_time_index = flow.transpose(neighbor_time_index, [1, 0])
    neighbor_time_index = flow.flatten(neighbor_time_index, start_dim=0, end_dim=-1)
    # feature map registration
    tempname = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
    init = flow.kaiming_initializer(shape=inputs.shape, mode="fan_out",
                                    nonlinearity="relu")
    semantic = conv3d_layer("conv_semantic_" + tempname, inputs,
                            self.out_channels, kernel_size=1, use_bias=False,
                            padding="VALID", trainable=self.trainable,
                            weight_initializer=init)
    inputs_norm = flow.math.l2_normalize(semantic, axis=1)
    inputs_norm_padding = flow.pad(
        inputs_norm,
        paddings=[(0, 0), (0, 0),
                  ((self.time_dim - 1) // 2, (self.time_dim - 1) // 2),
                  (0, 0), (0, 0)])
    inputs_norm_expand = flow.expand_dims(inputs_norm, axis=3)
    temp_inputs_norm_expand = inputs_norm_expand
    for i in range(N - 2):
        inputs_norm_expand = flow.concat(
            inputs=[inputs_norm_expand, temp_inputs_norm_expand], axis=3)
    inputs_norm_expand = flow.transpose(inputs_norm_expand,
                                        perm=[0, 2, 3, 4, 5, 1])
    inputs_norm_expand = flow.reshape(inputs_norm_expand,
                                      shape=[-1, h * w, c // 16])
    slice_list = []
    for index in neighbor_time_index:
        temp = flow.slice(inputs_norm_padding,
                          begin=[None, None, int(index), None, None],
                          size=[None, None, 1, None, None])
        slice_list.append(temp)
    neighbor_norm = flow.concat(slice_list, axis=2)
    neighbor_norm = flow.transpose(neighbor_norm, perm=[0, 2, 1, 3, 4])
    neighbor_norm = flow.reshape(neighbor_norm, shape=[-1, c // 16, h * w])
    similarity = flow.matmul(inputs_norm_expand, neighbor_norm) * self.temperature
    similarity = nn.softmax(similarity, axis=-1)
    inputs_padding = flow.pad(
        inputs,
        paddings=[(0, 0), (0, 0),
                  ((self.time_dim - 1) // 2, (self.time_dim - 1) // 2),
                  (0, 0), (0, 0)])
    slice_list = []
    for index in neighbor_time_index:
        temp = flow.slice(inputs_padding,
                          begin=[None, None, int(index), None, None],
                          size=[None, None, 1, None, None])
        slice_list.append(temp)
    neighbor = flow.concat(slice_list, axis=2)
    neighbor = flow.transpose(neighbor, perm=[0, 2, 3, 4, 1])
    neighbor = flow.reshape(neighbor, shape=[-1, h * w, c])
    neighbor_new = flow.matmul(similarity, neighbor)
    neighbor_new = flow.reshape(neighbor_new, shape=[b, t * (N - 1), h, w, c])
    neighbor_new = flow.transpose(neighbor_new, perm=[0, 4, 1, 2, 3])
    # contrastive attention
    if self.contrastive_att:
        temp_input = flow.expand_dims(inputs, axis=3)
        temp_temp_input = temp_input
        for i in range(N - 2):
            temp_input = flow.concat(inputs=[temp_input, temp_temp_input], axis=3)
        temp_input = flow.reshape(temp_input, shape=[b, c, (N - 1) * t, h, w])
        input_att = conv3d_layer(
            "conv3d_inputmapping_" + tempname, temp_input, self.out_channels,
            kernel_size=1, use_bias=False, trainable=False,
            weight_initializer=flow.kaiming_initializer(
                shape=temp_input.shape, mode="fan_out", nonlinearity="relu"))
        n_att = conv3d_layer(
            "conv3d_nmapping_" + tempname, neighbor_new, self.out_channels,
            kernel_size=1, use_bias=False, trainable=False,
            weight_initializer=flow.kaiming_initializer(
                shape=neighbor_new.shape, mode="fan_out", nonlinearity="relu"))
        temp_input = input_att * n_att
        contrastive_att_net = conv3d_layer(
            "conv3d_att_net_" + tempname, temp_input, 1,
            kernel_size=1, use_bias=False, trainable=self.trainable,
            weight_initializer=flow.kaiming_initializer(
                shape=temp_input.shape, mode="fan_out", nonlinearity="relu"))
        contrastive_att_net = flow.math.sigmoid(contrastive_att_net)
        neighbor_new = flow.math.multiply(neighbor_new, contrastive_att_net)
    # integrating feature maps
    init = flow.zeros_initializer()
    tempname = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
    input_offset = flow.get_variable("input_offset_" + tempname,
                                     shape=(b, c, N * t, h, w),
                                     initializer=init,
                                     dtype=inputs.dtype,
                                     trainable=self.trainable)
    with flow.scope.placement("cpu", "0:0"):
        input_index = np.array([i for i in range(t * N) if i % N == N // 2])
        neighbor_index = np.array([i for i in range(t * N) if i % N != N // 2])
    input_offset_list = []
    inputs_list = []
    neighbor_new_list = []
    for index in range(input_offset.shape[2]):
        temp = flow.slice(input_offset,
                          begin=[None, None, int(index), None, None],
                          size=[None, None, 1, None, None])
        input_offset_list.append(temp)
    for index in range(inputs.shape[2]):
        temp = flow.slice(inputs,
                          begin=[None, None, int(index), None, None],
                          size=[None, None, 1, None, None])
        inputs_list.append(temp)
    for index in range(neighbor_new.shape[2]):
        temp = flow.slice(neighbor_new,
                          begin=[None, None, int(index), None, None],
                          size=[None, None, 1, None, None])
        neighbor_new_list.append(temp)
    temp_index = 0
    for index in input_index:
        input_offset_list[index] += inputs_list[temp_index]
        temp_index += 1
    temp_index = 0
    for index in neighbor_index:
        input_offset_list[index] += neighbor_new_list[temp_index]
        temp_index += 1
    input_offset = flow.concat(input_offset_list, axis=2)
    return input_offset
def forward(self, x):
    tup_list = [[None, None, None], [0, 5, 2], [0, 6, 3]]
    out = flow.slice(x, slice_tup_list=tup_list)
    return out
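# A usage sketch for the forward above; not from the original source.
# SliceModule is a hypothetical flow.nn.Module whose forward() is the method above.
import numpy as np
import oneflow as flow

m = SliceModule()
x = flow.tensor(np.random.randn(3, 6, 9).astype(np.float32))
y = m(x)  # dim 1 -> indices 0, 2, 4; dim 2 -> indices 0, 3
assert tuple(y.shape) == (3, 3, 2)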
def lstm(input, units, return_sequence=False, initial_state=None,
         direction='forward', layer_index=0, is_train=True):
    '''
    input: sequence input tensor with shape [batch_size, sequence_length, embedding_size]
    units: number of hidden units
    '''
    batch_size = input.shape[0]
    seq_len = input.shape[1]
    input_size = input.shape[2]
    dtype = flow.float32
    with flow.scope.namespace('layer' + str(layer_index)):
        with flow.scope.namespace(direction):
            weight_blob_i = flow.get_variable(
                name='input' + '-weight',
                shape=[input_size, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())
            weight_blob_ih = flow.get_variable(
                name='input' + '-h-weight',
                shape=[units, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())
            bias_blob_i = flow.get_variable(
                name='input' + '-bias',
                shape=[units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.constant_initializer(0.0))
            weight_blob_f = flow.get_variable(
                name='forget' + '-weight',
                shape=[input_size, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())
            weight_blob_fh = flow.get_variable(
                name='forget' + '-h-weight',
                shape=[units, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())
            bias_blob_f = flow.get_variable(
                name='forget' + '-bias',
                shape=[units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.constant_initializer(0.0))
            weight_blob_c = flow.get_variable(
                name='cell' + '-weight',
                shape=[input_size, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())
            weight_blob_ch = flow.get_variable(
                name='cell' + '-h-weight',
                shape=[units, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())
            bias_blob_c = flow.get_variable(
                name='cell' + '-bias',
                shape=[units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.constant_initializer(0.0))
            weight_blob_o = flow.get_variable(
                name='output' + '-weight',
                shape=[input_size, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())
            weight_blob_oh = flow.get_variable(
                name='output' + '-h-weight',
                shape=[units, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())
            bias_blob_o = flow.get_variable(
                name='output' + '-bias',
                shape=[units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.constant_initializer(0.0))
    flow.watch(weight_blob_i, test_global_storage.Setter("weight_blob_i"))
    flow.watch(weight_blob_f, test_global_storage.Setter("weight_blob_f"))
    flow.watch(weight_blob_c, test_global_storage.Setter("weight_blob_c"))
    flow.watch(weight_blob_o, test_global_storage.Setter("weight_blob_o"))
    flow.watch(weight_blob_ih, test_global_storage.Setter("weight_blob_ih"))
    flow.watch(weight_blob_fh, test_global_storage.Setter("weight_blob_fh"))
    flow.watch(weight_blob_ch, test_global_storage.Setter("weight_blob_ch"))
    flow.watch(weight_blob_oh, test_global_storage.Setter("weight_blob_oh"))
    flow.watch(bias_blob_i, test_global_storage.Setter("bias_blob_i"))
    flow.watch(bias_blob_f, test_global_storage.Setter("bias_blob_f"))
    flow.watch(bias_blob_c, test_global_storage.Setter("bias_blob_c"))
    flow.watch(bias_blob_o, test_global_storage.Setter("bias_blob_o"))

    def step_function(input, states):
        hx = states[0]
        cx = states[1]
        x_i = _FullyConnected(input, weight_blob_i, bias_blob_i)  # input gate
        mark_int = x_i
        x_f = _FullyConnected(input, weight_blob_f, bias_blob_f)  # forget gate
        x_c = _FullyConnected(input, weight_blob_c, bias_blob_c)  # cell state
        x_o = _FullyConnected(input, weight_blob_o, bias_blob_o)  # output gate
        h_i = _FullyConnected(hx, weight_blob_ih, None)
        h_f = _FullyConnected(hx, weight_blob_fh, None)
        h_c = _FullyConnected(hx, weight_blob_ch, None)
        h_o = _FullyConnected(hx, weight_blob_oh, None)
        x_i = x_i + h_i
        x_f = x_f + h_f
        x_c = x_c + h_c
        x_o = x_o + h_o
        x_i = flow.math.sigmoid(x_i)
        x_f = flow.math.sigmoid(x_f)
        cellgate = flow.math.tanh(x_c)
        x_o = flow.math.sigmoid(x_o)
        cy = x_f * cx + x_i * cellgate
        hy = x_o * flow.math.tanh(cy)
        return hy, (hy, cy)

    if initial_state:
        states = initial_state
    else:
        states = [
            flow.constant(0, dtype=flow.float32, shape=[batch_size, units]),
            flow.constant(0, dtype=flow.float32, shape=[batch_size, units])
        ]
    successive_outputs = []
    successive_states = []
    for index in range(seq_len):
        # take one time step: (batch, 1, input_size) -> (batch, input_size)
        inp = flow.slice(input, [None, index, 0], [None, 1, input_size])
        inp = flow.reshape(inp, [-1, input_size])
        output, states = step_function(inp, states)
        output = flow.reshape(output, [-1, 1, units])
        successive_outputs.append(output)
        successive_states.append(states)
    last_output = successive_outputs[-1]
    new_states = successive_states[-1]
    outputs = flow.concat(successive_outputs, axis=1)
    if return_sequence:
        return outputs
    else:
        return flow.reshape(last_output, [-1, units])
def build_network(self, inputs):
    b, c, t, h, w = inputs.shape
    N = self.time_dim
    templist = [np.arange(0, t) + i for i in range(N) if i != N // 2]
    templist = np.expand_dims(templist, axis=0)
    neighbor_time_index = np.concatenate(templist, axis=0)
    # neighbor_time_index = flow.concat(templist, axis=0)
    neighbor_time_index = np.transpose(neighbor_time_index)
    neighbor_time_index = np.ndarray.flatten(neighbor_time_index)
    # substitute for tensor.long() (convert the tensor to a LongTensor);
    # torch's long is a 64-bit integer
    neighbor_time_index = np.int64(neighbor_time_index)
    semantic = conv3d_layer("conv_semantic_", inputs, self.out_channels,
                            kernel_size=1, use_bias=False, padding="SAME")
    inputs_norm = flow.math.l2_normalize(semantic, axis=1)
    inputs_norm_padding = flow.pad(
        inputs_norm,
        paddings=[(0, 0), (0, 0),
                  ((self.time_dim - 1) // 2, (self.time_dim - 1) // 2),
                  (0, 0), (0, 0)])
    inputs_norm_expand = flow.expand_dims(inputs_norm, axis=3)
    temp_inputs_norm_expand = inputs_norm_expand
    for i in range(N - 2):
        inputs_norm_expand = flow.concat(
            inputs=[inputs_norm_expand, temp_inputs_norm_expand], axis=3)
    # inputs_norm_expand = flow.transpose(inputs_norm_expand, perm=[0, 2, 3, 4, 5, 1])
    print("inputs_norm_expand", inputs_norm_expand.shape)
    inputs_norm_expand = flow.reshape(
        inputs_norm_expand,
        (inputs_norm_expand.shape[0], inputs_norm_expand.shape[2],
         inputs_norm_expand.shape[3], inputs_norm_expand.shape[4],
         inputs_norm_expand.shape[5], inputs_norm_expand.shape[1]))
    inputs_norm_expand = flow.reshape(inputs_norm_expand,
                                      shape=[-1, h * w, c // 16])
    slice_list = []
    for index in neighbor_time_index:
        temp = flow.slice(
            inputs_norm_padding,
            begin=[None, None, int(index), None, None],
            # size=[None, slice_shape[1], 1, slice_shape[3], slice_shape[4]]
            size=[None, None, 1, None, None])
        slice_list.append(temp)
    neighbor_norm = flow.concat(slice_list, axis=2)
    neighbor_norm = flow.transpose(neighbor_norm, perm=[0, 2, 1, 3, 4])
    neighbor_norm = flow.reshape(neighbor_norm, shape=[-1, c // 16, h * w])
    similarity = flow.matmul(inputs_norm_expand, neighbor_norm) * self.temperature
    similarity = nn.softmax(similarity, axis=-1)
    inputs_padding = flow.pad(
        inputs,
        paddings=[(0, 0), (0, 0),
                  ((self.time_dim - 1) // 2, (self.time_dim - 1) // 2),
                  (0, 0), (0, 0)])
    # neighbor = inputs_padding[:, :, neighbor_time_index, :, :]
    slice_list = []
    for index in neighbor_time_index:
        temp = flow.slice(inputs_padding,
                          begin=[None, None, int(index), None, None],
                          size=[None, None, 1, None, None])
        slice_list.append(temp)
    neighbor = flow.concat(slice_list, axis=2)
    neighbor = flow.transpose(neighbor, perm=[0, 2, 3, 4, 1])
    neighbor = flow.reshape(neighbor, shape=[-1, h * w, c])
    neighbor_new = flow.matmul(similarity, neighbor)
    neighbor_new = flow.reshape(neighbor_new, shape=[b, t * (N - 1), h, w, c])
    neighbor_new = flow.transpose(neighbor_new, perm=[0, 4, 1, 2, 3])
    if self.contrastive_att:
        temp_input = flow.expand_dims(inputs, axis=3)
        temp_temp_input = temp_input
        temp_input = flow.concat(inputs=[temp_input, temp_temp_input], axis=3)
        temp_input = flow.reshape(temp_input, shape=[b, c, (N - 1) * t, h, w])
        input_att = conv3d_layer("conv3d_inputmapping", temp_input,
                                 self.out_channels, kernel_size=1,
                                 use_bias=False, trainable=False)
        n_att = conv3d_layer("conv3d_nmapping", neighbor_new,
                             self.out_channels, kernel_size=1,
                             use_bias=False, trainable=False)
        contrastive_att_net = conv3d_layer("conv3d_att_net", input_att * n_att,
                                           self.out_channels, kernel_size=1,
                                           use_bias=False)
        # weight the neighbors by the sigmoid attention map
        contrastive_att = flow.math.sigmoid(contrastive_att_net)
        neighbor_new = neighbor_new * contrastive_att
    # device placement is left unspecified for now
    init = flow.zeros_initializer()
    input_offset = flow.get_variable("input_offset",
                                     shape=(b, c, N * t, h, w),
                                     initializer=init,
                                     dtype=inputs.dtype,
                                     trainable=True)
    input_index = np.array([i for i in range(t * N) if i % N == N // 2])
    neighbor_index = np.array([i for i in range(t * N) if i % N != N // 2])
    # print("inputs: ", inputs.shape)
    # print("input_index:", input_index)
    # print("input_index_len:", len(input_index))
    print("input_offset:", input_offset.shape)
    input_offset_list = []
    inputs_list = []
    neighbor_new_list = []
    for index in range(input_offset.shape[2]):
        temp = flow.slice(input_offset,
                          begin=[None, None, int(index), None, None],
                          size=[None, None, 1, None, None])
        input_offset_list.append(temp)
    for index in range(inputs.shape[2]):
        temp = flow.slice(inputs,
                          begin=[None, None, int(index), None, None],
                          size=[None, None, 1, None, None])
        inputs_list.append(temp)
    for index in range(neighbor_new.shape[2]):
        temp = flow.slice(neighbor_new,
                          begin=[None, None, int(index), None, None],
                          size=[None, None, 1, None, None])
        neighbor_new_list.append(temp)
    temp_index = 0
    for index in input_index:
        input_offset_list[index] += inputs_list[temp_index]
        temp_index += 1
    # print("neighbor_new:", neighbor_new.shape)
    # print("neighbor_index:", neighbor_index.shape)
    temp_index = 0
    for index in neighbor_index:
        input_offset_list[index] += neighbor_new_list[temp_index]
        temp_index += 1
    # print("before", input_offset.shape)
    input_offset = flow.concat(input_offset_list, axis=2)
    print("after", input_offset.shape)
    return input_offset
def _test_fused_self_attention(test_case, batch_size, seq_len, num_heads, head_size):
    hidden_size = num_heads * 3 * head_size
    x = np.random.randn(seq_len, batch_size, hidden_size)
    fused_input = flow.Tensor(x).to("cuda")
    fused_input.requires_grad = True
    (fused_qmk, fused_v) = flow._C.fused_self_attention(
        fused_input, head_size=head_size, alpha=1.0,
    )
    fused_atten = flow.matmul(fused_qmk, fused_v)
    fused_atten_sum = fused_atten.sum()
    origin_input = flow.Tensor(x).to("cuda")
    origin_input.requires_grad = True
    reshape_input = flow.reshape(origin_input,
                                 (seq_len, batch_size, -1, 3 * head_size))
    # take the q / k / v chunks of the last dimension and reorder to (b, n, s, h)
    origin_q = flow.slice(
        reshape_input,
        slice_tup_list=[
            [None, None, None],
            [None, None, None],
            [None, None, None],
            [0, head_size, 1],
        ],
    ).permute(1, 2, 0, 3)
    origin_k = flow.slice(
        reshape_input,
        slice_tup_list=[
            [None, None, None],
            [None, None, None],
            [None, None, None],
            [head_size, 2 * head_size, 1],
        ],
    ).permute(1, 2, 0, 3)
    origin_v = flow.slice(
        reshape_input,
        slice_tup_list=[
            [None, None, None],
            [None, None, None],
            [None, None, None],
            [2 * head_size, 3 * head_size, 1],
        ],
    ).permute(1, 2, 0, 3)
    origin_k = origin_k.transpose(2, 3)
    origin_qmk = flow.matmul(origin_q, origin_k)
    origin_atten = flow.matmul(origin_qmk, origin_v)
    origin_atten_sum = origin_atten.sum()
    total_sum = fused_atten_sum + origin_atten_sum
    total_sum.backward()
    test_case.assertTrue(
        np.allclose(fused_atten.numpy(), origin_atten.numpy(), atol=1e-4, rtol=1e-4))
    test_case.assertTrue(
        np.allclose(fused_input.grad.numpy(), origin_input.grad.numpy(),
                    atol=1e-4, rtol=1e-4))