def test_fn(
    a: flow.typing.Numpy.Placeholder(a_shape),
    b: flow.typing.Numpy.Placeholder(b_shape),
    c: flow.typing.Numpy.Placeholder(c_shape),
) -> flow.typing.Numpy:
    """Build a small train graph that exercises several SBP boxings.

    Scales ``a`` by a split(1) variable, matmuls with ``b``, adds bias ``c``,
    and minimizes the result with plain SGD.  The interesting part is the
    sequence of ``parallel_cast`` ops: S0 -> S1 on the input, P -> B on the
    matmul output, and S0 -> B on the bias.
    """
    scale = flow.get_variable(
        name="var_a",
        shape=a_shape,
        dtype=flow.float32,
        initializer=flow.ones_initializer(),
        distribute=flow.distribute.split(1),
    )
    # Re-box the input from split(0) to split(1) so it lines up with the
    # split(1) variable for the elementwise multiply.
    a_s1 = flow.parallel_cast(a, distribute=flow.distribute.split(1))
    result = flow.matmul(scale * a_s1, b)
    # P -> B: box the (partial) matmul output to broadcast, for both the
    # forward value and its gradient.
    result = flow.parallel_cast(
        result,
        distribute=flow.distribute.broadcast(),
        gradient_distribute=flow.distribute.broadcast(),
    )
    # S0 -> B: the bias must be broadcast before bias_add.
    bias = flow.parallel_cast(c, distribute=flow.distribute.broadcast())
    result = flow.nn.bias_add(result, bias)
    scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
    flow.optimizer.SGD(scheduler, momentum=0).minimize(result)
    return result
def _model(dense_fields, wide_sparse_fields, deep_sparse_fields):
    """Build the wide & deep scoring network.

    The wide part is a model-parallel 1-dim embedding lookup reduced to a
    single score per sample; the deep part is a model-parallel embedding
    lookup followed by an MLP over the concatenation of the embedding and
    the dense features.  Returns the summed wide + deep score blob.

    Args:
        dense_fields: dense numeric features, concatenated on axis 1 with
            the deep embeddings.
        wide_sparse_fields: sparse ids indexing the wide embedding table.
        deep_sparse_fields: sparse ids indexing the deep embedding table.
    """
    # Broadcast the ids so every device can gather from its shard of the
    # row-split (split(0)) wide embedding table.
    wide_sparse_fields = flow.parallel_cast(
        wide_sparse_fields, distribute=flow.distribute.broadcast())
    wide_embedding_table = flow.get_variable(
        name='wide_embedding',
        shape=(FLAGS.wide_vocab_size, 1),
        initializer=flow.random_uniform_initializer(minval=-0.05, maxval=0.05),
        distribute=flow.distribute.split(0),
    )
    wide_embedding = flow.gather(
        params=wide_embedding_table, indices=wide_sparse_fields)
    # Flatten the last two axes (ids-per-sample x vec_size) into one.
    wide_embedding = flow.reshape(
        wide_embedding,
        shape=(-1, wide_embedding.shape[-1] * wide_embedding.shape[-2]))
    wide_scores = flow.math.reduce_sum(wide_embedding, axis=[1], keepdims=True)
    # Back to data-parallel (split(0)) for the final add; gradients flow
    # back as broadcast.
    wide_scores = flow.parallel_cast(
        wide_scores,
        distribute=flow.distribute.split(0),
        gradient_distribute=flow.distribute.broadcast())
    # Same broadcast-then-gather scheme for the deep table, which is
    # column-split (split(1)) over the embedding dimension.
    deep_sparse_fields = flow.parallel_cast(
        deep_sparse_fields, distribute=flow.distribute.broadcast())
    deep_embedding_table = flow.get_variable(
        name='deep_embedding',
        shape=(FLAGS.deep_vocab_size, FLAGS.deep_embedding_vec_size),
        initializer=flow.random_uniform_initializer(minval=-0.05, maxval=0.05),
        distribute=flow.distribute.split(1),
    )
    deep_embedding = flow.gather(
        params=deep_embedding_table, indices=deep_sparse_fields)
    # Switch the gathered embeddings to data-parallel; the backward pass
    # re-splits gradients along axis 2 to match the split(1) table.
    deep_embedding = flow.parallel_cast(
        deep_embedding,
        distribute=flow.distribute.split(0),
        gradient_distribute=flow.distribute.split(2))
    deep_embedding = flow.reshape(
        deep_embedding,
        shape=(-1, deep_embedding.shape[-1] * deep_embedding.shape[-2]))
    deep_features = flow.concat([deep_embedding, dense_fields], axis=1)
    # MLP tower: one dense + dropout per configured hidden layer.
    for idx, units in enumerate(DEEP_HIDDEN_UNITS):
        deep_features = flow.layers.dense(
            deep_features,
            units=units,
            kernel_initializer=flow.glorot_uniform_initializer(),
            bias_initializer=flow.constant_initializer(0.0),
            activation=flow.math.relu,
            name='fc' + str(idx + 1))
        deep_features = flow.nn.dropout(
            deep_features, rate=FLAGS.deep_dropout_rate)
    # Final linear projection to one score per sample (no activation).
    deep_scores = flow.layers.dense(
        deep_features,
        units=1,
        kernel_initializer=flow.glorot_uniform_initializer(),
        bias_initializer=flow.constant_initializer(0.0),
        name='fc' + str(len(DEEP_HIDDEN_UNITS) + 1))
    scores = wide_scores + deep_scores
    return scores
def gpt2_func(x: flow.typing.Numpy.Placeholder(
        (args.batch_size, args.seq_len), dtype=flow.int64)):
    """Build the GPT-2 training graph for one batch of token ids.

    Runs the forward pass, computes the language-model loss, attaches the
    optimizer, and returns the loss.

    Args:
        x: int64 token-id batch of shape (batch_size, seq_len).

    Returns:
        dict with a single key ``"loss"`` mapping to the loss blob.
    """
    # Broadcast the ids to all devices when the input arrives split along
    # the batch axis.
    if x.split_axis == 0:
        x = flow.parallel_cast(x, distribute=flow.distribute.broadcast())
    gpt2 = GPT2(args, name="model")
    # Fix: the original pre-initialized ``outputs = {}`` only to overwrite
    # it on the next line, and stored ``outputs["loss"]`` into a local that
    # was never read again (the return builds a fresh dict) — both dead
    # stores removed.
    outputs = gpt2.forward(x)
    loss = gpt2.loss(x, outputs["logits"], parallel_loss=args.parallel_loss)
    optimizer = util.make_optimizer(args)
    optimizer.minimize(loss)
    return {"loss": loss}
def with_gradient_distribute(self, distribute):
    """Return this blob re-boxed so its *gradient* uses ``distribute``.

    Thin wrapper over ``oneflow.parallel_cast`` that sets only the
    backward (gradient) distribution, leaving the forward distribution
    unchanged.
    """
    cast_blob = oneflow.parallel_cast(self, gradient_distribute=distribute)
    return cast_blob
def insightface_train_job():
    """Build the InsightFace training graph and return the training loss.

    Loads a batch (synthetic or OFRecord), runs the backbone to get face
    embeddings, builds the fc7 logits according to ``args.loss_type``
    (``arc_loss``, ``margin_softmax``, ``softmax``, or ``arc_loss_ms``),
    then applies sparse softmax cross-entropy and SGD with a piecewise
    learning-rate schedule.
    """
    if args.use_synthetic_data:
        (labels, images) = ofrecord_util.load_synthetic(args)
    else:
        labels, images = ofrecord_util.load_train_dataset(args)
    print("train batch data: ", images.shape)
    embedding = insightface(images)

    def _get_initializer():
        # Shared initializer for every fc7 weight variant below.
        return flow.random_normal_initializer(mean=0.0, stddev=0.01)

    trainable = True
    if args.loss_type == "arc_loss":
        # ArcFace-style additive angular margin, built op by op:
        # logits = s * cos(theta + m) on the target class.
        s = args.margin_s
        m = args.margin
        fc1 = flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        fc1 = flow.math.multiply(fc1, s)
        fc7 = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, fc1.shape[1]),
            dtype=fc1.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
        )
        fc7 = flow.math.l2_normalize(input=fc7, axis=1, epsilon=1e-10)
        # With both sides normalized, matmul == s * cos(theta) per class.
        matmul = flow.matmul(a=fc1, b=fc7, transpose_b=True)
        labels_expand = flow.reshape(labels, (labels.shape[0], 1))
        # zy: the target-class logit for each sample.
        zy = flow.gather(matmul, labels_expand, batch_dims=1)
        cos_t = flow.math.multiply(zy, 1 / s)
        cos_m = math.cos(m)
        sin_m = math.sin(m)
        mm = math.sin(math.pi - m) * m
        threshold = math.cos(math.pi - m)
        # cond > 0 selects samples where the margin can be applied safely
        # (easy_margin: cos(t) > 0; otherwise cos(t) > cos(pi - m)).
        if args.easy_margin:
            cond = flow.math.relu(cos_t)
        else:
            cond_v = cos_t - threshold
            cond = flow.math.relu(cond_v)
        # sin(t) = sqrt(1 - cos(t)^2)
        body = flow.math.square(cos_t)
        body = flow.math.multiply(body, -1.0)
        body = flow.math.add(1, body)
        sin_t = flow.math.sqrt(body)
        # cos(t + m) = cos(t)cos(m) - sin(t)sin(m), scaled back by s.
        new_zy = flow.math.multiply(cos_t, cos_m)
        b = flow.math.multiply(sin_t, sin_m)
        b = flow.math.multiply(b, -1.0)
        new_zy = flow.math.add(new_zy, b)
        new_zy = flow.math.multiply(new_zy, s)
        # Fallback logit when the margin is not applicable.
        if args.easy_margin:
            zy_keep = zy
        else:
            zy_keep = flow.math.add(zy, -s * mm)
        cond = flow.cast(cond, dtype=flow.int32)
        new_zy = flow.where(cond, new_zy, zy_keep)
        # Replace only the target-class logit: add (new_zy - zy) via the
        # one-hot mask.
        zy = flow.math.multiply(zy, -1.0)
        diff = flow.math.add(new_zy, zy)
        gt_one_hot = flow.one_hot(
            labels, depth=args.class_num, dtype=flow.float
        )
        body = flow.math.multiply(gt_one_hot, diff)
        fc7 = flow.math.add(matmul, body)
    elif args.loss_type == "margin_softmax":
        # Combined-margin softmax: logits = s * (cos(m1*t + m2) - m3) on
        # the target class (SphereFace/ArcFace/CosFace unified form).
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
        )
        s = args.margin_s
        fc7_weight = flow.math.l2_normalize(
            input=fc7_weight, axis=1, epsilon=1e-10
        )
        fc1 = (
            flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10) * s
        )
        fc7 = flow.matmul(a=fc1, b=fc7_weight, transpose_b=True)
        if args.loss_m1 != 1.0 or args.loss_m2 != 0.0 or args.loss_m3 != 0.0:
            if args.loss_m1 == 1.0 and args.loss_m2 == 0.0:
                # Pure CosFace case (m1 == 1, m2 == 0): subtract s*m3 from
                # the target logit directly via one-hot.
                s_m = s * args.loss_m3
                gt_one_hot = flow.one_hot(
                    labels,
                    depth=args.class_num,
                    on_value=s_m,
                    off_value=0.0,
                    dtype=flow.float,
                )
                fc7 = fc7 - gt_one_hot
            else:
                # General case: recover the angle and apply m1/m2/m3.
                labels_expand = flow.reshape(labels, (labels.shape[0], 1))
                zy = flow.gather(fc7, labels_expand, batch_dims=1)
                cos_t = zy * (1 / s)
                t = flow.math.acos(cos_t)
                if args.loss_m1 != 1.0:
                    t = t * args.loss_m1
                if args.loss_m2 > 0.0:
                    t = t + args.loss_m2
                body = flow.math.cos(t)
                if args.loss_m3 > 0.0:
                    body = body - args.loss_m3
                new_zy = body * s
                diff = new_zy - zy
                gt_one_hot = flow.one_hot(
                    labels,
                    depth=args.class_num,
                    on_value=1.0,
                    off_value=0.0,
                    dtype=flow.float,
                )
                body = gt_one_hot * diff
                fc7 = fc7 + body
    elif args.loss_type == "softmax":
        # Plain softmax head; optionally model-parallel over classes.
        if args.model_parallel:
            labels = labels.with_distribute(flow.distribute.broadcast())
            fc1_distribute = flow.distribute.broadcast()
            fc7_data_distribute = flow.distribute.split(1)
            fc7_model_distribute = flow.distribute.split(0)
        else:
            fc1_distribute = flow.distribute.split(0)
            fc7_data_distribute = flow.distribute.split(0)
            fc7_model_distribute = flow.distribute.broadcast()
        print("loss 0")
        fc7 = flow.layers.dense(
            inputs=embedding.with_distribute(fc1_distribute),
            units=args.class_num,
            activation=None,
            use_bias=False,
            kernel_initializer=_get_initializer(),
            bias_initializer=None,
            trainable=trainable,
            name=args.models_name,
            model_distribute=fc7_model_distribute,
        )
        fc7 = fc7.with_distribute(fc7_data_distribute)
    elif args.loss_type == "arc_loss_ms":
        # Model-parallel arc loss: fc7 weight is row-split over classes,
        # and the margin is applied by the fused flow.arc_loss op.
        labels = labels.with_distribute(flow.distribute.broadcast())
        fc7_model_distribute = flow.distribute.split(0)
        fc7_data_distribute = flow.distribute.split(1)
        fc7_weight = flow.get_variable(
            name="fc7-weight",
            shape=(args.class_num, embedding.shape[1]),
            dtype=embedding.dtype,
            initializer=_get_initializer(),
            trainable=trainable,
            model_name="weight",
            distribute=fc7_model_distribute,
        )
        s = args.margin_s
        fc7_weight = flow.math.l2_normalize(
            input=fc7_weight, axis=1, epsilon=1e-10
        )
        fc1 = (
            flow.math.l2_normalize(input=embedding, axis=1, epsilon=1e-10)
        )
        fc1 = flow.parallel_cast(fc1, distribute=flow.distribute.broadcast())
        fc7 = flow.matmul(a=fc1, b=fc7_weight, transpose_b=True)  # s1
        # NOTE(review): scale 60 is hard-coded here instead of using
        # args.margin_s (``s`` above is computed but unused) — confirm
        # whether this is intentional.
        fc7 = flow.arc_loss(fc7, labels, margin=args.loss_m2)*60
        fc7 = fc7.with_distribute(fc7_data_distribute)
    else:
        raise NotImplementedError
    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        labels, fc7, name="softmax_loss"
    )
    # Step-wise LR decay at 100k/140k/160k iterations.
    lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
        args.base_lr, [100000, 140000, 160000], [0.1, 0.01, 0.001])
    flow.optimizer.SGD(lr_scheduler, momentum=0.9).minimize(loss)
    return loss