def build(self, inputs, targets):
    """
    Args:
        inputs (flow.Tensor): feature matrix with shape (batch_size, feat_dim).
        targets (flow.Tensor): ground truth labels with shape (batch_size).
    """
    n = inputs.shape[0]

    # Pairwise Euclidean distance: dist[i, j] = ||x_i||^2 + ||x_j||^2 - 2 * x_i . x_j
    dist = math.reduce_sum(
        math.pow(inputs, flow.constant_like(inputs, 2, dtype=flow.float32)),
        axis=1)
    shape_tensor = flow.constant(value=0.0, dtype=flow.float32, shape=(n, n))
    dist = flow.broadcast_like(dist, like=shape_tensor, broadcast_axes=[1])
    dist = math.add(
        dist, flow.transpose(dist, perm=(1, 0), batch_axis_non_change=True))
    temp1 = math.multiply(
        -2,
        flow.matmul(
            inputs,
            flow.transpose(inputs, perm=(1, 0), batch_axis_non_change=True)))
    dist = math.add(dist, temp1)
    dist = math.sqrt(flow.clamp(dist, min_value=1e-12))

    # Masks for positive (same label) and negative (different label) pairs
    mask = math.equal(
        flow.broadcast_like(targets, like=shape_tensor, broadcast_axes=[1]),
        flow.transpose(flow.broadcast_like(targets,
                                           like=shape_tensor,
                                           broadcast_axes=[1]),
                       perm=(1, 0),
                       batch_axis_non_change=True))
    mask_rev = math.not_equal(
        flow.broadcast_like(targets, like=shape_tensor, broadcast_axes=[1]),
        flow.transpose(flow.broadcast_like(targets,
                                           like=shape_tensor,
                                           broadcast_axes=[1]),
                       perm=(1, 0),
                       batch_axis_non_change=True))

    # Batch-hard mining: hardest positive (max dist) and hardest negative (min dist) per anchor
    dist_ap, dist_an = [], []
    for i in range(n):
        temp_dist = flow.slice_v2(dist, [(i, i + 1, 1)])
        temp_mask = flow.slice_v2(mask, [(i, i + 1, 1)])
        temp_mask_rev = flow.slice_v2(mask_rev, [(i, i + 1, 1)])
        dist_ap.append(
            math.reduce_max(
                flow.gather_nd(temp_dist, flow.where(temp_mask))))
        dist_an.append(
            math.reduce_min(
                flow.gather_nd(temp_dist, flow.where(temp_mask_rev))))
    dist_ap = flow.concat(dist_ap, 0)
    dist_an = flow.concat(dist_an, 0)

    y = flow.ones_like(dist_an)
    # return dist_an, dist_ap, y
    return self._MarginRankingLoss(dist_an, dist_ap, y)
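# The loop above performs batch-hard triplet mining: for every anchor it keeps
# the farthest same-label sample (hardest positive) and the closest
# different-label sample (hardest negative). A minimal NumPy reference of that
# selection, assuming `dist` is a precomputed (n, n) pairwise-distance matrix
# and `targets` holds the labels (helper names here are illustrative):
import numpy as np

def hard_example_mining_np(dist, targets):
    """Return per-anchor hardest-positive and hardest-negative distances."""
    same = targets[:, None] == targets[None, :]   # positive-pair mask
    diff = ~same                                  # negative-pair mask
    dist_ap = np.array([dist[i][same[i]].max() for i in range(len(targets))])
    dist_an = np.array([dist[i][diff[i]].min() for i in range(len(targets))])
    return dist_ap, dist_an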
def test_job(
        x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
        addend: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
):
    v = flow.get_variable(
        name="v",
        shape=(1, ),
        dtype=flow.float32,
        initializer=flow.zeros_initializer(),
    )
    x = x + v
    addend = addend + v

    x1 = flow.identity(x)
    x2 = flow.identity(x)
    addend1 = flow.identity(addend)
    addend2 = flow.identity(addend)

    flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
    flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
    flow.watch_diff(addend1, test_global_storage.Setter("addend1_diff"))
    flow.watch_diff(addend2, test_global_storage.Setter("addend2_diff"))

    x1 = flow.cast(x1, data_type)
    x2 = flow.cast(x2, data_type)
    addend1 = flow.cast(addend1, data_type)
    addend2 = flow.cast(addend2, data_type)

    # Fused op vs. the equivalent unfused composition: BN -> add -> relu
    y1 = flow.layers.batch_normalization_add_relu(x1,
                                                  addend=addend1,
                                                  axis=axis,
                                                  name="BN1")
    y2 = flow.math.relu(
        flow.layers.batch_normalization(x2, axis=axis, name="BN2") + addend2)
    y1 = flow.cast(y1, flow.float32)
    y2 = flow.cast(y2, flow.float32)

    flow.watch(y1, test_global_storage.Setter("y1"))
    flow.watch(y2, test_global_storage.Setter("y2"))

    y1 = flow.where(flow.math.greater(y2, v), y1, v)
    y2 = flow.where(flow.math.greater(y1, v), y2, v)
    loss = y1 + y2
    flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler([], [0.001]),
                       momentum=0).minimize(flow.math.reduce_sum(loss))
    return loss
def do_where(condition, x, y):
    with flow.scope.placement(device_type, "0:0"):
        x_var = flow.get_variable(
            "x",
            shape=x.shape,
            dtype=flow.float,
            initializer=flow.constant_initializer(0),
        )
        x_var = flow.cast_to_current_logical_view(x_var)
        x_var = x_var + x
        y_var = flow.get_variable(
            "y",
            shape=y.shape,
            dtype=flow.float,
            initializer=flow.constant_initializer(0),
        )
        y_var = flow.cast_to_current_logical_view(y_var)
        y_var = y_var + y

    z = flow.where(condition, x_var, y_var)

    with flow.scope.placement(device_type, "0:0"):
        flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
            [], [1e-3]),
                           momentum=0).minimize(z)

    flow.watch_diff(x_var, dz_dx_watcher)
    flow.watch_diff(y_var, dz_dy_watcher)
    return z
def do_where(condition, x, y):
    with flow.scope.placement(device_type, "0:0"):
        x_var = flow.get_variable(
            "x",
            shape=x.shape,
            dtype=flow.float,
            initializer=flow.constant_initializer(0),
        )
        x_var = flow.cast_to_current_logical_view(x_var)
        x_var = x_var + x
        y_var = flow.get_variable(
            "y",
            shape=y.shape,
            dtype=flow.float,
            initializer=flow.constant_initializer(0),
        )
        y_var = flow.cast_to_current_logical_view(y_var)
        y_var = y_var + y

    z = flow.where(condition, x_var, y_var)

    with flow.scope.placement(device_type, "0:0"):
        flow.losses.add_loss(z)

    flow.watch_diff(x_var, dz_dx_watcher)
    flow.watch_diff(y_var, dz_dy_watcher)
    return z
def _test_where_scalar(test_case, device):
    x = 0.5
    y = 2.0
    condition = flow.tensor(np.array([1]), dtype=flow.int32)
    of_out = flow.where(condition, x, y)
    np_out = np.array([0.5])
    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
def _test_where_backward(test_case, device):
    x = flow.tensor(
        np.array([[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
        dtype=flow.float32,
        device=flow.device(device),
        requires_grad=True,
    )
    y = flow.tensor(
        np.ones(shape=(3, 2)),
        dtype=flow.float32,
        device=flow.device(device),
        requires_grad=True,
    )
    condition = flow.tensor(np.array([[0, 1], [1, 0], [1, 0]]),
                            dtype=flow.int32,
                            device=flow.device(device))
    of_out = flow.where(condition, x, y)
    of_out = of_out.sum()
    of_out.backward()
    # The gradient flows to x where the condition is true and to y elsewhere.
    test_case.assertTrue(
        np.allclose(x.grad.numpy(), condition.numpy() == 1, 1e-05, 1e-05))
    test_case.assertTrue(
        np.allclose(y.grad.numpy(), condition.numpy() == 0, 1e-05, 1e-05))
def forward(self, cosine: flow.Tensor, label):
    index = flow.where(label != -1)[0]
    m_hot = flow.zeros(index.size()[0],
                       cosine.size()[1],
                       device=cosine.device)
    m_hot.scatter_(1, label[index, None], self.m)
    cosine.acos_()
    cosine[index] += m_hot
    cosine.cos_().mul_(self.s)
    return cosine
def forward(self, cosine, label):
    index = flow.where(label != -1)[0]
    m_hot = flow.zeros(index.size()[0],
                       cosine.size()[1],
                       device=cosine.device)
    m_hot = flow.scatter(m_hot, 1, label[index, None], self.m)
    cosine = cosine[index] - m_hot
    ret = cosine * self.s
    return ret
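# The forward above applies a CosFace-style additive margin: for every sample
# with a valid label, the cosine logit of its ground-truth class is reduced by
# m before scaling by s. A small NumPy sketch of that margin step (the helper
# name and default m/s values are illustrative, not part of the original code):
import numpy as np

def cosface_margin_np(cosine_np, labels_np, m=0.4, s=64.0):
    out = cosine_np.copy()
    rows = np.nonzero(labels_np != -1)[0]
    out[rows, labels_np[rows]] -= m   # subtract the margin at each ground-truth class
    return out * s                    # then apply the scale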
def _test_where_x_y_none(test_case, device):
    condition = flow.tensor(
        np.array([[[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]]),
        dtype=flow.float32,
        device=flow.device(device),
        requires_grad=True,
    )
    # With x and y omitted, flow.where should match flow.nonzero(..., as_tuple=True).
    of_out = flow.where(condition)
    of_nonzero = flow.nonzero(condition, as_tuple=True)
    for i in range(len(of_out)):
        test_case.assertTrue(
            np.allclose(of_out[i].numpy(), of_nonzero[i].numpy(), 1e-05,
                        1e-05))
def masked_fill(
    x: remote_blob_util.BlobDef,
    mask: remote_blob_util.BlobDef,
    value: Union[float, int],
    name: Optional[str] = None,
) -> remote_blob_util.BlobDef:
    r"""Fill a blob with a given value according to the given mask.

    Args:
        x (remote_blob_util.BlobDef): Input Blob.
        mask (remote_blob_util.BlobDef): Composed of 0s and 1s; the input Blob `x`
            will be filled with the given value where the mask is 1.
        value (Union[float, int]): The value to use for filling the input Blob.
        name (Optional[str], optional): The name for the operation. Defaults to None.

    Attention:
        x and mask must be broadcastable to each other.
        mask must be of an integer type (int8/int32/int64).

    Returns:
        remote_blob_util.BlobDef: The value-filled Blob

    For example:

    .. code-block:: python

        import oneflow as flow
        import numpy as np
        import oneflow.typing as tp

        @flow.global_function()
        def masked_fill_Job(x: tp.Numpy.Placeholder((4, )),
                            mask: tp.Numpy.Placeholder((4, ), dtype=flow.int8)) -> tp.Numpy:
            return flow.masked_fill(x, mask, value=5)

        x = np.array([1, 2, 3, 4], dtype=np.float32)
        mask = np.array([1, 0, 0, 1], dtype=np.int8)

        out = masked_fill_Job(x, mask)

        # output [5 2 3 5]

    """
    if name is None:
        name = id_util.UniqueStr("MaskedFill_")
    value_like_x = flow.constant_like(like=x,
                                      value=value,
                                      name=name + "_ConstantLike")
    return flow.where(condition=mask,
                      x=value_like_x,
                      y=x,
                      name=name + "_Where")
def _test_where(test_case, device):
    x = flow.tensor(
        np.array([[-0.462, 0.3139], [0.3898, -0.7197], [0.0478, -0.1657]]),
        dtype=flow.float32,
        device=flow.device(device),
    )
    y = flow.tensor(np.ones(shape=(3, 2)),
                    dtype=flow.float32,
                    device=flow.device(device))
    condition = flow.tensor(np.array([[0, 1], [1, 0], [1, 0]]),
                            dtype=flow.int32,
                            device=flow.device(device))
    of_out = flow.where(condition, x, y)
    np_out = np.array([[1.0, 0.3139], [0.3898, 1.0], [0.0478, 1.0]])
    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
def att_distill(args, student_atts, teacher_atts):
    att_loss = 0.
    teacher_layer_num = len(teacher_atts)
    student_layer_num = len(student_atts)
    assert teacher_layer_num % student_layer_num == 0
    layers_per_block = int(teacher_layer_num / student_layer_num)
    # Match each student layer with the last teacher layer of its block.
    new_teacher_atts = [
        teacher_atts[i * layers_per_block + layers_per_block - 1]
        for i in range(student_layer_num)
    ]
    for student_att, teacher_att in zip(student_atts, new_teacher_atts):
        # Zero out masked attention logits (large negative values) before the MSE.
        student_att = flow.where(
            student_att <= flow.constant(-1e2, dtype=flow.float),
            flow.zeros_like(student_att), student_att)
        teacher_att = flow.where(
            teacher_att <= flow.constant(-1e2, dtype=flow.float),
            flow.zeros_like(teacher_att), teacher_att)
        tmp_loss = mseloss(student_att, teacher_att)
        att_loss += tmp_loss
    return att_loss
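# The distillation loss above first maps every student layer to the last
# teacher layer of its block, then clamps masked attention scores (large
# negative logits) to zero so they do not dominate the MSE. A NumPy sketch of
# that pairing and masking, assuming lists of per-layer attention arrays
# (helper names are illustrative):
import numpy as np

def att_distill_np(student_atts, teacher_atts):
    block = len(teacher_atts) // len(student_atts)
    picked = [teacher_atts[(i + 1) * block - 1] for i in range(len(student_atts))]
    loss = 0.0
    for s, t in zip(student_atts, picked):
        s = np.where(s <= -1e2, 0.0, s)      # drop masked positions
        t = np.where(t <= -1e2, 0.0, t)
        loss += np.mean((s - t) ** 2)        # MSE between matched layers
    return loss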
def _attn(self, query, key, value):
    attn_weights = flow.matmul(query, key.transpose(-2, -1))

    if self.scale_attn_weights:
        attn_weights = attn_weights / (float(value.size(-1))**0.5)

    query_length, key_length = query.size(-2), key.size(-2)
    causal_mask = self.bias[:, :, key_length - query_length:key_length, :key_length]
    # Positions outside the causal mask are filled with a large negative bias.
    attn_weights = flow.where(causal_mask, attn_weights,
                              self.masked_bias.to(attn_weights.dtype))

    attn_weights = nn.Softmax(dim=-1)(attn_weights)
    attn_weights = self.attn_dropout(attn_weights)
    attn_output = flow.matmul(attn_weights, value)

    return attn_output, attn_weights
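# In the attention block above, flow.where swaps in a large negative bias at
# the positions the causal mask forbids, so the subsequent softmax assigns
# them near-zero probability. A NumPy sketch of that masking for one head
# (the sequence length and the -1e4 bias are illustrative):
import numpy as np

seq = 4
scores = np.random.randn(seq, seq).astype(np.float32)
causal = np.tril(np.ones((seq, seq), dtype=bool))      # allow only positions j <= i
masked = np.where(causal, scores, np.float32(-1e4))    # bias out future positions
probs = np.exp(masked - masked.max(-1, keepdims=True))
probs /= probs.sum(-1, keepdims=True)                  # masked entries end up ~0 after softmax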
def _prob_in_top_k(
    self, clean_values, noisy_values, noise_stddev, noisy_top_values
):
    """Helper function to NoisyTopKGating.

    Computes the probability that value is in top k, given different random noise.

    This gives us a way of backpropagating from a loss that balances the number
    of times each expert is in the top k experts per example.

    In the case of no noise, pass in None for noise_stddev, and the result will
    not be differentiable.

    Args:
        clean_values: a `Tensor` of shape [batch, n].
        noisy_values: a `Tensor` of shape [batch, n]. Equal to clean values plus
            normally distributed noise with standard deviation noise_stddev.
        noise_stddev: a `Tensor` of shape [batch, n], or None
        noisy_top_values: a `Tensor` of shape [batch, m].
            "values" Output of tf.top_k(noisy_top_values, m). m >= k+1

    Returns:
        a `Tensor` of shape [batch, n].
    """
    batch = clean_values.size(0)
    m = noisy_top_values.size(1)
    top_values_flat = noisy_top_values.flatten()
    threshold_positions_if_in = (
        flow.arange(batch, device=noisy_values.device) * m + self.k
    )
    threshold_if_in = flow.unsqueeze(
        flow.gather(top_values_flat, 0, threshold_positions_if_in), 1
    )
    is_in = flow.gt(noisy_values, threshold_if_in)
    threshold_positions_if_out = threshold_positions_if_in - 1
    threshold_if_out = flow.unsqueeze(
        flow.gather(top_values_flat, 0, threshold_positions_if_out), 1
    )
    # is each value currently in the top k.
    prob_if_in = cdf((clean_values - threshold_if_in) / noise_stddev)
    prob_if_out = cdf((clean_values - threshold_if_out) / noise_stddev)
    prob = flow.where(is_in, prob_if_in, prob_if_out)
    return prob
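# The helper above relies on a standard-normal CDF (`cdf` comes from the
# surrounding module, e.g. a Normal(0, 1) distribution). The key identity is
# P(clean + noise > threshold) = Phi((clean - threshold) / stddev) for
# noise ~ N(0, stddev^2). A minimal NumPy check of that identity with an
# erf-based CDF (all names and values here are illustrative):
import math
import numpy as np

def normal_cdf(x):
    return 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))

clean, threshold, stddev = 0.3, 0.5, 1.0
prob = normal_cdf((clean - threshold) / stddev)          # chance the noisy value clears the threshold
samples = clean + stddev * np.random.randn(200000)
assert abs(prob - np.mean(samples > threshold)) < 0.01   # Monte-Carlo agreement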
def __call__(self, x, padding=None):
    # Retrieve dynamically known shapes
    batch_size = x.shape[0]
    length = x.shape[1]

    if padding is not None:
        with flow.scope.namespace("remove_padding"):
            # Flatten padding to [batch_size*length]
            pad_mask = flow.reshape(padding, [-1])
            nonpad_ids = flow.cast(flow.where(pad_mask < 1e-9), dtype=flow.int32)
            # nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9))

            # Reshape x to [batch_size*length, hidden_size] to remove padding
            x = flow.reshape(x, [-1, self.hidden_size])
            x = flow.gather_nd(x, indices=nonpad_ids)

            # Reshape x from 2 dimensions to 3 dimensions.
            # TODO: Maybe has a batch axis error in there
            x = flow.expand_dims(x, axis=0)

    output = self._build_dense(x, self.filter_size, name="filter_layer")

    if self.train:
        # TensorFlow's dropout parameter is `keep_prob` (hence `1 - dropout`);
        # OneFlow's dropout takes the drop rate directly, so pass it as-is.
        output = flow.nn.dropout(output, self.relu_dropout)

    if padding is not None:
        with flow.scope.namespace("re_add_padding"):
            output = flow.squeeze(output, axis=[0, ])
            output = flow.scatter_nd(
                indices=nonpad_ids,
                updates=output,
                shape=[batch_size * length, self.hidden_size]
            )
            output = flow.reshape(output, [batch_size, length, self.hidden_size])
    return output
def _where(self, x=None, y=None):
    return flow.where(self, x, y)
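# `_where` simply forwards to the functional flow.where with the tensor itself
# as the condition; it is presumably bound as a Tensor method elsewhere in the
# module. A short usage sketch of the equivalent functional call (values are
# illustrative):
import numpy as np
import oneflow as flow

cond = flow.tensor(np.array([[0, 1], [1, 0]]), dtype=flow.int32)
a = flow.ones(2, 2)
b = flow.zeros(2, 2)
out = flow.where(cond, a, b)   # picks from `a` where cond != 0, else from `b`
print(out.numpy())             # [[0. 1.], [1. 0.]]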
def ctc_loss(
    log_probs: oneflow_api.BlobDesc,
    targets: oneflow_api.BlobDesc,
    input_lengths: oneflow_api.BlobDesc,
    target_lengths: oneflow_api.BlobDesc,
    blank: int = 0,
    reduction: str = "mean",
    zero_infinity: bool = False,
    name: Optional[str] = None,
) -> oneflow_api.BlobDesc:
    r"""Computes the CTC (Connectionist Temporal Classification) loss.
    This operator implements the CTC loss as presented in (Graves et al., 2006).

    Args:
        log_probs (oneflow_api.BlobDesc): A Blob of shape [input_length, batch_size, num_labels].
            The logarithmized probabilities of the outputs (e.g. obtained with flow.nn.logsoftmax()).
        targets (oneflow_api.BlobDesc): A Blob of shape [batch_size, max_target_length].
            It represents the target sequences. Each element in the target sequence is a class index,
            and the target index cannot be blank (default=0).
        input_lengths (oneflow_api.BlobDesc): A Blob of shape [batch_size].
            It represents the lengths of the inputs. Lengths are specified for each sequence to achieve
            masking under the assumption that sequences are padded to equal lengths.
        target_lengths (oneflow_api.BlobDesc): A Blob of shape [batch_size].
            It represents the lengths of the targets. Lengths are specified for each sequence to achieve
            masking under the assumption that sequences are padded to equal lengths.
        blank (int, optional): Blank label. Defaults to 0.
        reduction (str, optional): The reduce type, one of "none", "mean", "sum".
            "none": no reduction will be applied;
            "mean": the output losses will be divided by the target lengths and then the mean over the batch is taken;
            "sum": the output will be summed. Defaults to "mean".
        zero_infinity (bool, optional): Whether to zero infinite losses and the associated gradients.
            Infinite losses mainly occur when the inputs are too short to be aligned to the targets.
            Defaults to False.
        name (Optional[str], optional): The name for the operation. Defaults to None.

    Returns:
        oneflow_api.BlobDesc: The result Blob.

    For example:

    .. code-block:: python

        import oneflow as flow
        import oneflow.typing as tp
        import numpy as np


        @flow.global_function()
        def ctc_loss_job(
            log_probs: tp.Numpy.Placeholder(shape=(5, 2, 3)),
            targets: tp.Numpy.Placeholder(shape=(2, 3), dtype=flow.int32),
            input_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int32),
            target_lengths: tp.Numpy.Placeholder(shape=(2,), dtype=flow.int32),
        ) -> tp.Numpy:
            loss = flow.ctc_loss(
                log_probs, targets, input_lengths, target_lengths, blank=0, reduction="none"
            )
            return loss


        log_probs = np.array(
            [
                [[-1.1031, -0.7998, -1.5200], [-0.9808, -1.1363, -1.1908]],
                [[-1.2258, -1.0665, -1.0153], [-1.1135, -1.2331, -0.9671]],
                [[-1.3348, -0.6611, -1.5118], [-0.9823, -1.2355, -1.0941]],
                [[-1.3850, -1.3273, -0.7247], [-0.8235, -1.4783, -1.0994]],
                [[-0.9049, -0.8867, -1.6962], [-1.4938, -1.3630, -0.6547]],
            ]
        ).astype(np.float32)
        targets = np.array([[1, 2, 2], [1, 2, 2]]).astype("int32")
        input_lengths = np.array([5, 5]).astype("int32")
        target_lengths = np.array([3, 3]).astype("int32")
        loss = ctc_loss_job(log_probs, targets, input_lengths, target_lengths)

        # loss [3.918017 2.907672]

    """
    name = name if name is not None else id_util.UniqueStr("CTCLoss_")
    loss, _ = (
        flow.user_op_builder(name)
        .Op("ctc_loss")
        .Input("log_probs", [log_probs])
        .Input("targets", [targets])
        .Input("input_lengths", [input_lengths])
        .Input("target_lengths", [target_lengths])
        .Output("loss")
        .Output("alpha")
        .Attr("blank", int(blank))
        .Attr("zero_infinity", zero_infinity)
        .Build()
        .InferAndTryRun()
        .RemoteBlobList()
    )
    if zero_infinity:
        cond = flow.math.equal(
            loss,
            flow.constant(
                float("inf"),
                dtype=loss.dtype,
                shape=loss.shape,
                name=name + "_constant",
            ),
            name=name + "_equal",
        )
        loss = flow.where(
            cond,
            flow.zeros(dtype=loss.dtype, shape=loss.shape, name=name + "_zeros"),
            loss,
            name=name + "_where",
        )
    if reduction == "mean":
        return flow.math.reduce_mean(
            flow.math.xdivy(
                loss,
                flow.cast(
                    flow.math.clip_by_value(
                        target_lengths, min_value=1, name=name + "_clip_by_value"
                    ),
                    dtype=log_probs.dtype,
                    name=name + "_cast",
                ),
                name=name + "_xdivy",
            ),
            name=name + "_reduce_mean",
        )
    elif reduction == "sum":
        return flow.math.reduce_sum(loss, name=name + "_reduce_sum")
    else:
        return loss
def _TransformerModel(input_blob,
                      attention_mask_blob,
                      seq_length,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=_Gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      replace_prob=0.0,
                      compress_ratio=1):
    # print('| transformer num hidden layers: ', num_hidden_layers)
    assert hidden_size % num_attention_heads == 0
    attention_head_size = int(hidden_size / num_attention_heads)
    input_width = hidden_size
    prev_output_blob = flow.reshape(input_blob, (-1, input_width))
    # all_layer_output_blobs = []

    per_add_teacher_layers = compress_ratio
    per_add_student_layers = 1
    teacher_layer_idx = student_layer_idx = 0

    def add_teacher_layer(base_teacher_layer_idx, sub_teacher_output_blob):
        for add_teacher_layer_idx in range(per_add_teacher_layers):
            sub_teacher_output_blob = addOnelayer(
                layer_idx=base_teacher_layer_idx + add_teacher_layer_idx,
                prev_output_blob=sub_teacher_output_blob,
                attention_mask_blob=attention_mask_blob,
                num_attention_heads=num_attention_heads,
                attention_head_size=attention_head_size,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                seq_length=seq_length,
                hidden_size=hidden_size,
                hidden_dropout_prob=hidden_dropout_prob,
                intermediate_act_fn=intermediate_act_fn,
                intermediate_size=intermediate_size,
                namescope_prefix='',
                is_train=False)
        return sub_teacher_output_blob

    def add_student_layer(base_student_layer_idx, sub_student_output_blob):
        # with flow.scope.namespace("student"):
        sub_student_output_blob = addOnelayer(
            base_student_layer_idx, sub_student_output_blob,
            attention_mask_blob, num_attention_heads, attention_head_size,
            attention_probs_dropout_prob, initializer_range, seq_length,
            hidden_size, hidden_dropout_prob, intermediate_act_fn,
            intermediate_size, namescope_prefix='student-', is_train=True)
        return sub_student_output_blob

    while teacher_layer_idx < num_hidden_layers:
        with flow.scope.placement("cpu", "0:0"):
            sample = flow.random.coin_flip(
                name='layer{}_replacing_prob'.format(teacher_layer_idx),
                probability=replace_prob)
            sample = sample.with_distribute(flow.distribute.broadcast())
        # Randomly replace a block of teacher layers with a student layer.
        prev_output_blob = flow.where(
            sample,
            x=add_student_layer(student_layer_idx, prev_output_blob),
            y=add_teacher_layer(teacher_layer_idx, prev_output_blob),
            name='where_layer{}'.format(teacher_layer_idx))
        teacher_layer_idx += per_add_teacher_layers
        student_layer_idx += per_add_student_layers
        # print('| current teacher_layer: ', teacher_layer_idx)
        # print('| current student_layer: ', student_layer_idx)

    # print('| num_hidden_layers: ', num_hidden_layers)
    input_shape = (-1, seq_length, hidden_size)
    final_output_blob = flow.reshape(prev_output_blob, input_shape)
    return [final_output_blob]
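# The loop above implements BERT-of-Theseus style training: for each teacher
# block, a coin flip decides whether the student layer's output or the teacher
# block's output is propagated (via flow.where on the broadcast sample, since
# both branches live in the same static graph). A plain-Python sketch of the
# intended control flow (names are illustrative):
import random

def theseus_forward(x, teacher_blocks, student_layers, replace_prob):
    # One student layer substitutes for one compressed block of teacher layers.
    for teacher_block, student_layer in zip(teacher_blocks, student_layers):
        x = student_layer(x) if random.random() < replace_prob else teacher_block(x)
    return x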
def where_fn(input_def: oft.ListNumpy.Placeholder(input_shape, dtype=flow.float)):
    return flow.where(input_def)
def forward(self, x):
    # Softplus with beta and threshold: fall back to the identity when
    # beta * x exceeds the threshold, to avoid overflow in exp.
    return flow.where(
        x * self.beta > self.threshold,
        x,
        1 / self.beta * flow.log(1.0 + flow.exp(self.beta * x)),
    )
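# NumPy reference of the same piecewise Softplus, useful as a quick sanity
# check (the beta/threshold defaults of 1.0 / 20.0 are assumptions here, not
# taken from the original module):
import numpy as np

def softplus_np(x, beta=1.0, threshold=20.0):
    return np.where(x * beta > threshold,
                    x,
                    np.log1p(np.exp(beta * x)) / beta)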
def insightface_train_job():
    if args.use_synthetic_data:
        (labels, images) = ofrecord_util.load_synthetic(args)
    else:
        labels, images = ofrecord_util.load_train_dataset(args)
    print("train batch data: ", images.shape)
    embedding = insightface(images)

    def _get_initializer():
        return flow.random_normal_initializer(mean=0.0, stddev=0.01)

    trainable = True
    if args.loss_type == "arc_loss":
        s = args.margin_s
        m = args.margin
        fc1 = flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        fc1 = flow.math.multiply(fc1, s)
        fc7 = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, fc1.shape[1]),
            dtype=fc1.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
        )
        fc7 = flow.math.l2_normalize(input=fc7, axis=1, epsilon=1e-10)
        matmul = flow.matmul(a=fc1, b=fc7, transpose_b=True)
        labels_expand = flow.reshape(labels, (labels.shape[0], 1))
        zy = flow.gather(matmul, labels_expand, batch_dims=1)
        cos_t = flow.math.multiply(zy, 1 / s)
        cos_m = math.cos(m)
        sin_m = math.sin(m)
        mm = math.sin(math.pi - m) * m
        threshold = math.cos(math.pi - m)
        if args.easy_margin:
            cond = flow.math.relu(cos_t)
        else:
            cond_v = cos_t - threshold
            cond = flow.math.relu(cond_v)
        body = flow.math.square(cos_t)
        body = flow.math.multiply(body, -1.0)
        body = flow.math.add(1, body)
        sin_t = flow.math.sqrt(body)
        new_zy = flow.math.multiply(cos_t, cos_m)
        b = flow.math.multiply(sin_t, sin_m)
        b = flow.math.multiply(b, -1.0)
        new_zy = flow.math.add(new_zy, b)
        new_zy = flow.math.multiply(new_zy, s)
        if args.easy_margin:
            zy_keep = zy
        else:
            zy_keep = flow.math.add(zy, -s * mm)
        cond = flow.cast(cond, dtype=flow.int32)
        new_zy = flow.where(cond, new_zy, zy_keep)
        zy = flow.math.multiply(zy, -1.0)
        diff = flow.math.add(new_zy, zy)
        gt_one_hot = flow.one_hot(
            labels, depth=args.class_num, dtype=flow.float
        )
        body = flow.math.multiply(gt_one_hot, diff)
        fc7 = flow.math.add(matmul, body)
    elif args.loss_type == "margin_softmax":
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
        )
        s = args.margin_s
        fc7_weight = flow.math.l2_normalize(
            input=fc7_weight, axis=1, epsilon=1e-10
        )
        fc1 = (
            flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10) * s
        )
        fc7 = flow.matmul(a=fc1, b=fc7_weight, transpose_b=True)
        if args.loss_m1 != 1.0 or args.loss_m2 != 0.0 or args.loss_m3 != 0.0:
            if args.loss_m1 == 1.0 and args.loss_m2 == 0.0:
                s_m = s * args.loss_m3
                gt_one_hot = flow.one_hot(
                    labels,
                    depth=args.class_num,
                    on_value=s_m,
                    off_value=0.0,
                    dtype=flow.float,
                )
                fc7 = fc7 - gt_one_hot
            else:
                labels_expand = flow.reshape(labels, (labels.shape[0], 1))
                zy = flow.gather(fc7, labels_expand, batch_dims=1)
                cos_t = zy * (1 / s)
                t = flow.math.acos(cos_t)
                if args.loss_m1 != 1.0:
                    t = t * args.loss_m1
                if args.loss_m2 > 0.0:
                    t = t + args.loss_m2
                body = flow.math.cos(t)
                if args.loss_m3 > 0.0:
                    body = body - args.loss_m3
                new_zy = body * s
                diff = new_zy - zy
                gt_one_hot = flow.one_hot(
                    labels,
                    depth=args.class_num,
                    on_value=1.0,
                    off_value=0.0,
                    dtype=flow.float,
                )
                body = gt_one_hot * diff
                fc7 = fc7 + body
    elif args.loss_type == "softmax":
        if args.model_parallel:
            labels = labels.with_distribute(flow.distribute.broadcast())
            fc1_distribute = flow.distribute.broadcast()
            fc7_data_distribute = flow.distribute.split(1)
            fc7_model_distribute = flow.distribute.split(0)
        else:
            fc1_distribute = flow.distribute.split(0)
            fc7_data_distribute = flow.distribute.split(0)
            fc7_model_distribute = flow.distribute.broadcast()
        print("loss 0")
        fc7 = flow.layers.dense(
            inputs=embedding.with_distribute(fc1_distribute),
            units=args.class_num,
            activation=None,
            use_bias=False,
            kernel_initializer=_get_initializer(),
            bias_initializer=None,
            trainable=trainable,
            name=args.models_name,
            model_distribute=fc7_model_distribute,
        )
        fc7 = fc7.with_distribute(fc7_data_distribute)
    elif args.loss_type == "arc_loss_ms":
        labels = labels.with_distribute(flow.distribute.broadcast())
        fc7_model_distribute = flow.distribute.split(0)
        fc7_data_distribute = flow.distribute.split(1)
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
            distribute=fc7_model_distribute,
        )
        s = args.margin_s
        fc7_weight = flow.math.l2_normalize(
            input=fc7_weight, axis=1, epsilon=1e-10
        )
        fc1 = flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        fc1 = flow.parallel_cast(fc1, distribute=flow.distribute.broadcast())
        fc7 = flow.matmul(a=fc1, b=fc7_weight, transpose_b=True)  # s1
        fc7 = flow.arc_loss(fc7, labels, margin=args.loss_m2) * 60
        fc7 = fc7.with_distribute(fc7_data_distribute)
    else:
        raise NotImplementedError

    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        labels, fc7, name="softmax_loss"
    )

    lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
        args.base_lr, [100000, 140000, 160000], [0.1, 0.01, 0.001])
    flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
    return loss
def forward(self, inputs, targets):
    n = inputs.shape[0]

    # Compute pairwise distance, replace by the official when merged
    tempname = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
    shape_tensor = flow.constant(value=0.0, dtype=flow.float32, shape=(n, n))
    if self.distance == 'euclidean':
        blob_2 = flow.get_variable(
            "blob_2_" + tempname,
            shape=inputs.shape,
            initializer=flow.constant_initializer(2),
            dtype=inputs.dtype)
        dist = flow.math.pow(inputs, blob_2)
        dist = flow.math.reduce_sum(dist, axis=1, keepdims=True)
        dist = flow.broadcast_like(dist, shape_tensor)
        tempdist = flow.transpose(dist)
        dist = dist + tempdist
        inputs_t = flow.transpose(inputs)
        dist = addmm(dist, inputs, inputs_t, beta=1, alpha=-2)
        dist = flow.clamp(dist, min_value=1e-12)
        dist = flow.math.sqrt(dist)
    elif self.distance == 'cosine':
        # fnorm = flow.math.l2_normalize(inputs, axis=1)
        fnorm = flow.math.reduce_mean(
            flow.math.divide(inputs, flow.math.l2_normalize(inputs, axis=1)),
            axis=1,
            keepdims=True)
        expand_fnorm = flow.broadcast_like(fnorm,
                                           like=inputs,
                                           broadcast_axes=[1])
        l2norm = flow.math.divide(inputs, expand_fnorm)
        l2norm_t = flow.transpose(l2norm, perm=(1, 0))
        dist = flow.math.negative(flow.matmul(l2norm, l2norm_t))

    # For each anchor, find the hardest positive and negative
    mask = math.equal(
        flow.broadcast_like(targets, like=shape_tensor, broadcast_axes=[1]),
        flow.transpose(flow.broadcast_like(targets,
                                           like=shape_tensor,
                                           broadcast_axes=[1]),
                       perm=(1, 0),
                       batch_axis_non_change=True))
    mask_rev = math.not_equal(
        flow.broadcast_like(targets, like=shape_tensor, broadcast_axes=[1]),
        flow.transpose(flow.broadcast_like(targets,
                                           like=shape_tensor,
                                           broadcast_axes=[1]),
                       perm=(1, 0),
                       batch_axis_non_change=True))

    dist_ap, dist_an = [], []
    for i in range(n):
        temp_dist = flow.slice_v2(dist, [(i, i + 1, 1)])
        temp_mask = flow.slice_v2(mask, [(i, i + 1, 1)])
        temp_mask_rev = flow.slice_v2(mask_rev, [(i, i + 1, 1)])
        temp_dist_ap = flow.expand_dims(
            math.reduce_max(
                flow.gather_nd(temp_dist, flow.where(temp_mask))), 0)
        temp_dist_an = flow.expand_dims(
            math.reduce_min(
                flow.gather_nd(temp_dist, flow.where(temp_mask_rev))), 0)
        dist_ap.append(temp_dist_ap)
        dist_an.append(temp_dist_an)
    dist_ap = flow.concat(dist_ap, 0)
    dist_an = flow.concat(dist_an, 0)

    y = flow.ones_like(dist_an)
    return self._MarginRankingLoss(dist_an, dist_ap, y)
def loss_layer(self, feature_map, pred, label, bboxes, stride, prefix='loss_layer'):
    '''
    :param feature_map: [N, H, W, 3*(5+class_num)]
    :param pred: [N, H, W, 3, 4+1+class_num]
    :param label: [N, H, W, 3, 4+1+class_num]
    :param bboxes: [N, V, 4]
    :param stride:
    :param anchor_per_scale:
    :return:
        giou_loss:
        conf_loss:
        prob_loss:
    '''
    feature_map = flow.reshape(
        feature_map,
        shape=(feature_map.shape[0], feature_map.shape[1],
               feature_map.shape[2], self.anchor_per_scale, -1))
    # shape: [N, H, W, 3, 1]
    raw_conf = flow.slice(feature_map,
                          begin=[None, None, None, None, 4],
                          size=[None, None, None, None, 1])
    # shape: [N, H, W, 3, class_num]
    raw_prob = flow.slice(
        feature_map,
        begin=[None, None, None, None, 5],
        size=[None, None, None, None, feature_map.shape[-1] - 5])

    # [N, H, W, 3, 4]
    pred_xywh = flow.slice(pred,
                           begin=[None, None, None, None, 0],
                           size=[None, None, None, None, 4])
    pred_conf = flow.slice(pred,
                           begin=[None, None, None, None, 4],
                           size=[None, None, None, None, 1])

    label_xywh = flow.slice(label,
                            begin=[None, None, None, None, 0],
                            size=[None, None, None, None, 4])
    respond_bbox = flow.slice(label,
                              begin=[None, None, None, None, 4],
                              size=[None, None, None, None, 1])
    label_prob = flow.slice(
        label,
        begin=[None, None, None, None, 5],
        size=[None, None, None, None, label.shape[-1] - 5])

    # [N, H, W, 3, 1]
    giou = self.bbox_giou(pred_xywh, label_xywh)
    # label_w = flow.slice(label, begin=[None, None, None, None, 2], size=[None, None, None, None, 1])
    # label_h = flow.slice(label, begin=[None, None, None, None, 3], size=[None, None, None, None, 1])
    # bbox_loss_scale = 2.0 - 1.0 * label_w * label_h / ((stride * feature_map.shape[1]) ** 2)
    # giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)
    giou_loss = respond_bbox * (1 - giou)

    # [N, 1, 1, 1, V, 4]
    bboxes_ = flow.expand_dims(bboxes, axis=1)
    bboxes_ = flow.expand_dims(bboxes_, axis=1)
    bboxes_ = flow.expand_dims(bboxes_, axis=1)
    # [N, H, W, 3, V]
    iou = self.bbox_iou(flow.expand_dims(pred_xywh, axis=-2), bboxes_)
    iou = flow.squeeze(iou, axis=[-1, ])
    # [N, H, W, 3, 1]
    max_iou = flow.math.reduce_max(iou, axis=-1, keepdims=True)
    # respond_bgd = (1.0 - respond_bbox) * (max_iou < self.iou_loss_thresh)
    tmp = flow.math.less(
        max_iou,
        flow.constant_like(like=max_iou,
                           value=self.iou_loss_thresh,
                           dtype=flow.float32))
    # Background cells: not responsible for any object and below the IoU threshold against all GT boxes.
    respond_bgd = flow.where(
        tmp, 1.0 - respond_bbox,
        flow.zeros_like(respond_bbox, dtype=flow.float32))

    # [N, H, W, 3, 1]
    # ce = flow.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=raw_conf)
    # alpha_t = respond_bbox*self.focus_loss_alpha+(1.0-respond_bbox)*(1.0-self.focus_loss_alpha)
    # conf_loss = alpha_t*flow.math.pow(1.0-flow.math.exp(flow.math.negative(ce)), self.focus_loss_gamma)*ce
    # conf_loss = (respond_bbox+respond_bgd)*conf_loss
    conf_focal = self.focal(respond_bbox, pred_conf)
    conf_loss = conf_focal * (
        respond_bbox * flow.nn.sigmoid_cross_entropy_with_logits(
            labels=respond_bbox, logits=raw_conf) +
        respond_bgd * flow.nn.sigmoid_cross_entropy_with_logits(
            labels=respond_bbox, logits=raw_conf))

    # [N, H, W, 3, 1]
    prob_loss = respond_bbox * flow.nn.sigmoid_cross_entropy_with_logits(
        labels=label_prob, logits=raw_prob)

    # label_w = flow.slice(label, begin=[None, None, None, None, 2], size=[None, None, None, None, 1])
    # label_h = flow.slice(label, begin=[None, None, None, None, 3], size=[None, None, None, None, 1])
    # bbox_loss_scale = 2.0 - 1.0 * label_w * label_h / ((stride * feature_map.shape[1]) * (stride * feature_map.shape[2]))
    # giou_loss = respond_bbox * bbox_loss_scale * flow.smooth_l1_loss(prediction=pred_xywh, label=label_xywh)
    giou_loss = flow.math.reduce_mean(
        flow.math.reduce_sum(giou_loss, axis=[1, 2, 3, 4]))
    conf_loss = flow.math.reduce_mean(
        flow.math.reduce_sum(conf_loss, axis=[1, 2, 3, 4]))
    prob_loss = flow.math.reduce_mean(
        flow.math.reduce_sum(prob_loss, axis=[1, 2, 3, 4]))

    return giou_loss, conf_loss, prob_loss
def do_where(condition, x, y):
    return flow.where(condition, x, y)