def test_job(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32)):
    v = flow.get_variable(
        name="v",
        shape=(1,),
        dtype=flow.float32,
        initializer=flow.zeros_initializer(),
    )
    x = x + v
    x1 = flow.identity(x)
    x2 = flow.identity(x)
    flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
    flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
    x1 = flow.cast(x1, data_type)
    x2 = flow.cast(x2, data_type)
    # Fused batch_normalization_relu vs. the unfused batch_normalization + relu reference.
    y1 = flow.layers.batch_normalization_relu(x1, axis=axis, name="BN1")
    y2 = flow.math.relu(flow.layers.batch_normalization(x2, axis=axis, name="BN2"))
    y1 = flow.cast(y1, flow.float32)
    y2 = flow.cast(y2, flow.float32)
    flow.watch(y1, test_global_storage.Setter("y1"))
    flow.watch(y2, test_global_storage.Setter("y2"))
    y1 = flow.where(flow.math.greater(y2, v), y1, v)
    y2 = flow.where(flow.math.greater(y1, v), y2, v)
    loss = y1 + y2
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
    ).minimize(flow.math.reduce_sum(loss))
    return loss
def broadcast_to_compatible_with_fn(
    x_def: oft.ListNumpy.Placeholder(x_shape, dtype=flow.float),
    a_def: oft.ListNumpy.Placeholder(a_shape, dtype=flow.float),
    b_def: oft.ListNumpy.Placeholder(b_shape, dtype=flow.float),
):
    return flow.broadcast_to_compatible_with(
        x_def, [flow.identity(a_def), flow.identity(b_def)]
    )
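# A minimal driver sketch for broadcast_to_compatible_with_fn above. The
# concrete shapes, the @flow.global_function wrapping implied by the job, and
# the .get()/.numpy_list() retrieval convention are assumptions, not part of
# the original snippet.
def _run_broadcast_to_compatible_with():
    x = np.random.rand(1, 4).astype(np.float32)  # hypothetical x_shape = (1, 4)
    a = np.random.rand(3, 1).astype(np.float32)  # hypothetical a_shape = (3, 1)
    b = np.random.rand(3, 4).astype(np.float32)  # hypothetical b_shape = (3, 4)
    # ListNumpy placeholders take one ndarray per rank, hence the lists.
    out = broadcast_to_compatible_with_fn([x], [a], [b]).get().numpy_list()[0]
    # The op follows numpy broadcasting rules: x is expanded to the shape
    # compatible with both a and b, here (3, 4).
    assert np.array_equal(out, np.broadcast_to(x, (3, 4)))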
def split_to_broadcast_job(input_blob: oft.Numpy.Placeholder((96, 96))):
    with flow.scope.placement("gpu", "0:0"):
        src = flow.identity(input_blob.with_distribute(flow.distribute.split(0)))
    with flow.scope.placement("gpu", ["0:0", "1:0"]):
        dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
    return dst
def FlowJob(
    value: oft.Numpy.Placeholder(x_shape), bias: oft.Numpy.Placeholder(bias_shape)
):
    with flow.scope.placement(device_type, "0:0"):
        value += flow.get_variable(
            name="v1",
            shape=(1,),
            dtype=flow.float,
            initializer=flow.zeros_initializer(),
        )
        bias += flow.get_variable(
            name="v2",
            shape=(1,),
            dtype=flow.float,
            initializer=flow.zeros_initializer(),
        )
        x1 = flow.identity(value)
        x2 = flow.identity(value)
        bias1 = flow.identity(bias)
        bias2 = flow.identity(bias)
        flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
        flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
        flow.watch_diff(bias1, test_global_storage.Setter("bias1_diff"))
        flow.watch_diff(bias2, test_global_storage.Setter("bias2_diff"))
        if data_type == "float16":
            # Compare the fused float16 kernel against separate bias_add + gelu.
            y1 = flow.cast(
                flow.math.gelu(
                    flow.nn.bias_add(
                        flow.cast(x1, dtype=flow.float16),
                        flow.cast(bias1, dtype=flow.float16),
                        data_format=data_format,
                    )
                ),
                dtype=flow.float,
            )
            y2 = flow.cast(
                flow.nn.fused_bias_add_gelu(
                    flow.cast(x2, dtype=flow.float16),
                    flow.cast(bias2, dtype=flow.float16),
                    data_format=data_format,
                ),
                dtype=flow.float,
            )
        else:
            y1 = flow.math.gelu(flow.nn.bias_add(x1, bias1, data_format=data_format))
            y2 = flow.nn.fused_bias_add_gelu(x2, bias2, data_format=data_format)
        flow.watch(y1, test_global_storage.Setter("y1"))
        flow.watch(y2, test_global_storage.Setter("y2"))
        flow.watch_diff(y1, test_global_storage.Setter("y1_diff"))
        flow.watch_diff(y2, test_global_storage.Setter("y2_diff"))
        loss = y1 + y2
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(flow.math.reduce_sum(loss))
        return loss
def build_b2b(input_blob, src_device_num, dst_device_num):
    with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
        src = flow.identity(input_blob.with_distribute(flow.distribute.broadcast()))
    with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
        dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
    return dst
def build_s2s_all2all(input_blob, src_axis, dst_axis):
    with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
        src = flow.identity(input_blob.with_distribute(flow.distribute.split(src_axis)))
    with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
        dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
    return dst
def build_p2b(input_blob, src_device_num, dst_device_num):
    with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
        src = flow.identity(input_blob.with_distribute(flow.distribute.split(0)))
        src = flow.math.reduce_sum(src, axis=0)
    with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
        dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
    return dst
def _dense_layer(
    inputs,
    units,
    activation=None,
    use_bias=True,
    kernel_initializer=None,
    bias_initializer=None,
    trainable=True,
    name=None,
):
    # NOTE: `args` (providing num_piece_in_batch) is a free variable here,
    # not a parameter of this function.
    in_shape = inputs.shape
    in_num_axes = len(in_shape)
    assert in_num_axes >= 2
    name_prefix = name if name is not None else id_util.UniqueStr("Dense_")
    # Flatten leading axes so matmul sees a rank-2 blob.
    inputs = flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs
    weight = flow.get_variable(
        name="{}-weight".format(name_prefix),
        shape=(units, inputs.shape[1]),
        dtype=inputs.dtype,
        initializer=kernel_initializer
        if kernel_initializer is not None
        else flow.constant_initializer(0),
        trainable=trainable,
        model_name="weight",
    )
    weight = flow.identity(weight)
    weight = flow.repeat(weight, args.num_piece_in_batch)
    out = flow.matmul(
        a=inputs, b=weight, transpose_b=True, name="{}_matmul".format(name_prefix)
    )
    if use_bias:
        bias = flow.get_variable(
            name="{}-bias".format(name_prefix),
            shape=(units,),
            dtype=inputs.dtype,
            initializer=bias_initializer
            if bias_initializer is not None
            else flow.constant_initializer(0),
            trainable=trainable,
            model_name="bias",
        )
        bias = flow.identity(bias)
        bias = flow.repeat(bias, args.num_piece_in_batch)
        out = flow.nn.bias_add(out, bias, name="{}_bias_add".format(name_prefix))
    if activation is not None:
        out = activation(out, name="{}_activation".format(name_prefix))
    # Restore the original leading axes.
    out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
    return out
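# Usage sketch for _dense_layer: a small MLP head. The layer sizes and names
# here are made up for illustration; they are not from the original code.
def _mlp_head(x, hidden_units=1024, num_classes=10):
    h = _dense_layer(
        x,
        hidden_units,
        activation=flow.math.relu,
        kernel_initializer=flow.random_uniform_initializer(),
        name="fc1",
    )
    # Output shape: x.shape[:-1] + (num_classes,)
    return _dense_layer(h, num_classes, name="fc2")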
def multi_lbi_job(x: oft.Numpy.Placeholder((96, 96, 96))):
    # Boxes three blobs at once: S(0)->S(1), S(1)->B, and B->S(1).
    with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
        src_s0 = flow.identity(x.with_distribute(flow.distribute.split(0)))
        src_s1 = flow.identity(x.with_distribute(flow.distribute.split(1)))
        src_b = flow.identity(x.with_distribute(flow.distribute.broadcast()))
        (t0_0, t0_1, t0_2) = flow.identity_n((src_s0, src_s1, src_b))
    with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
        t0_0 = t0_0.with_distribute(flow.distribute.split(1))
        t0_1 = t0_1.with_distribute(flow.distribute.broadcast())
        t0_2 = t0_2.with_distribute(flow.distribute.split(1))
        (t1_0, t1_1, t1_2) = flow.identity_n((t0_0, t0_1, t0_2))
    return (t1_0, t1_1, t1_2)
def _conv2d_layer(
    args,
    name,
    input,
    filters,
    kernel_size=3,
    strides=1,
    padding="SAME",
    data_format="NCHW",
    dilation_rate=1,
    activation=op_conf_util.kRelu,
    use_bias=False,
    weight_initializer=flow.random_uniform_initializer(),
    bias_initializer=flow.random_uniform_initializer(),
):
    weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
    weight = flow.get_variable(
        name + "-weight",
        shape=weight_shape,
        dtype=input.dtype,
        initializer=weight_initializer,
    )
    weight = flow.identity(weight)
    weight = flow.repeat(weight, args.num_piece_in_batch)
    output = flow.nn.conv2d(
        input, weight, strides, padding, None, data_format, dilation_rate, name=name
    )
    if use_bias:
        bias = flow.get_variable(
            name + "-bias",
            shape=(filters,),
            dtype=input.dtype,
            initializer=bias_initializer,
        )
        bias = flow.identity(bias)
        bias = flow.repeat(bias, args.num_piece_in_batch)
        output = flow.nn.bias_add(output, bias, data_format)
    if activation is not None:
        if activation == op_conf_util.kRelu:
            output = flow.math.relu(output)
        else:
            raise NotImplementedError
    return output
def SparseSoftmaxCrossEntropyWithLogitsJob(
    labels: oft.Numpy.Placeholder(
        (batch_size,), dtype=type_name_to_flow_type[label_type]
    )
):
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x",
            shape=(batch_size, num_classes),
            dtype=type_name_to_flow_type[data_type],
            initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
            trainable=True,
        )
    with flow.scope.placement(device_type, "0:0-3"):
        labels = flow.parallel_cast(labels, distribute=flow.distribute.broadcast())
        # Model-parallel logits: split along the class dimension.
        logits = flow.parallel_cast(
            x, distribute=flow.distribute.split(len(x.shape) - 1)
        )
        loss = flow.nn.distributed_sparse_softmax_cross_entropy_with_logits(
            labels, logits
        )
        loss = flow.math.square(loss)
    with flow.scope.placement(device_type, "0:0"):
        loss = flow.identity(loss)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(loss)
    flow.watch(x, test_global_storage.Setter("x"))
    flow.watch_diff(x, test_global_storage.Setter("x_diff"))
    flow.watch(loss, test_global_storage.Setter("loss"))
    flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
    return loss
def nvtx_range_job(x: oft.Numpy.Placeholder((4, 4, 1024, 1024))):
    x += flow.get_variable(
        name="v1",
        shape=(1,),
        dtype=flow.float,
        initializer=flow.zeros_initializer(),
    )
    x = flow.math.relu(x)
    # Wrap five softmax ops in one NVTX range for profiling.
    x = flow.profiler.nvtx_start(x, mark_prefix="softmax")
    for _ in range(5):
        x = flow.nn.softmax(x)
    x = flow.profiler.nvtx_end(x, mark_prefix="softmax")
    x = flow.math.relu(x)
    # Wrap six gelu ops in a second NVTX range.
    x = flow.profiler.nvtx_start(x, mark_prefix="gelu")
    for _ in range(6):
        x = flow.math.gelu(x)
    x = flow.profiler.nvtx_end(x, mark_prefix="gelu")
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
    ).minimize(x)
    return flow.identity(x)
def PartialFcJob(
    labels: oft.Numpy.Placeholder(
        (batch_size,), dtype=type_name_to_flow_type[label_type]
    )
):
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x-weight",
            shape=(num_classes, 128),
            dtype=flow.float,
            initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
            trainable=True,
        )
    with flow.scope.placement(device_type, "0:0-3"):
        labels_distribute = flow.distribute.broadcast()
        weight_distribute = flow.distribute.split(0)
        (
            mapped_label,
            sampled_label,
            sampled_weight,
        ) = flow.distributed_partial_fc_sample(
            weight=x.with_distribute(weight_distribute),
            label=labels.with_distribute(labels_distribute),
            num_sample=num_sample,
        )
    with flow.scope.placement(device_type, "0:0"):
        sampled_weight = flow.identity(sampled_weight)
        loss = flow.math.square(sampled_weight)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(loss)
    flow.watch(x, test_global_storage.Setter("x"))
    flow.watch_diff(x, test_global_storage.Setter("x_diff"))
    flow.watch_diff(sampled_weight, test_global_storage.Setter("sampled_weight_diff"))
    return (x, mapped_label, sampled_label, sampled_weight)
def SoftmaxJob():
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x",
            shape=x_shape,
            dtype=dtype,
            initializer=flow.random_uniform_initializer(minval=-1.0, maxval=1.0),
            trainable=True,
        )
        x1 = x
        x = flow.identity(x)
        if data_type == "float16":
            loss = flow.cast(
                flow.nn.softmax(flow.cast(x, dtype=flow.float16), axis=axis),
                dtype=flow.float,
            )
        else:
            loss = flow.nn.softmax(x, axis=axis)
        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(loss, test_global_storage.Setter("loss"))
        flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
        total_loss = loss * x1
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(total_loss)
        return loss
def __call__(self, hidden_states):
    """
    hidden_states shape: (batch_size, seq_length, hidden_size)
    data parallel sbp: S(0)
    2d sbp: [S(0), B]
    """
    assert len(hidden_states.shape) == 3
    assert hidden_states.shape[-1] == self.hidden_size
    assert np.prod(hidden_states.shape[:-1]) == self.batch_size * self.seq_length
    h = hidden_states
    with flow.scope.namespace(self.name):
        if self.enable_profiling:
            h = flow.profiler.nvtx_start(h, mark_prefix=f"transformer-{self.name}")
        h = flow.identity(h)
        with flow.experimental.scope.config(
            checkpointing=self.checkpoint_activations
        ):
            # input layernorm
            norm1 = layernorm("layernorm_1", h)
            # attention
            h = h + self.attn(norm1)
            # output layernorm
            norm2 = layernorm("layernorm_2", h)
            # mlp
            h = h + self.mlp(norm2)
        if self.enable_profiling:
            h = flow.profiler.nvtx_end(h, mark_prefix=f"transformer-{self.name}")
    return h
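# The transformer block above calls a free function `layernorm(name, x)` that
# is not shown. A plausible sketch, assuming the classic flow.layers.layer_norm
# API; the epsilon value and axis choice are assumptions, not the original helper:
def layernorm(name, x, epsilon=1e-5):
    # Normalize over the last (hidden) dimension.
    return flow.layers.layer_norm(x, begin_norm_axis=-1, epsilon=epsilon, name=name)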
def dynamic_concat_job(
    input_0_def: oft.ListNumpy.Placeholder(shape=input_static_shape, dtype=flow.float),
    input_1_def: oft.ListNumpy.Placeholder(shape=input_static_shape, dtype=flow.float),
):
    var_0 = flow.get_variable(
        "Var0",
        shape=(1,),
        dtype=flow.float,
        initializer=flow.constant_initializer(value=1, dtype=flow.float),
        trainable=True,
    )
    var_1 = flow.get_variable(
        "Var1",
        shape=(1,),
        dtype=flow.float,
        initializer=flow.constant_initializer(value=1, dtype=flow.float),
        trainable=True,
    )
    # Variables are consistent blobs; cast everything into the current view
    # so they can be mixed with the ListNumpy (mirrored) inputs.
    var_0 = flow.cast_to_current_logical_view(var_0)
    var_1 = flow.cast_to_current_logical_view(var_1)
    input_0_def = flow.cast_to_current_logical_view(input_0_def)
    input_1_def = flow.cast_to_current_logical_view(input_1_def)
    if callable(watch_cb):
        flow.watch(var_0, watch_cb)
        flow.watch(var_1, watch_cb)
        flow.watch(flow.identity(input_0_def), watch_cb)
        flow.watch(flow.identity(input_1_def), watch_cb)
    var_0 = var_0 * input_0_def
    var_1 = var_1 * input_1_def
    if callable(watch_cb):
        flow.watch(var_0, watch_cb)
        flow.watch(var_1, watch_cb)
    result = flow.concat(
        [var_0, var_1], axis=axis, max_dim_size=input_static_shape[axis]
    )
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
    ).minimize(result)
    flow.watch_diff(var_0, make_watch_diff_cb(0))
    flow.watch_diff(var_1, make_watch_diff_cb(1))
    return result
def cast_to_current_logical_view(
    x: oneflow._oneflow_internal.BlobDesc,
) -> oneflow._oneflow_internal.BlobDesc:
    # Insert an identity op when the blob's view (consistent vs. mirrored)
    # disagrees with the view enabled in the current scope.
    if (
        isinstance(x, oneflow._oneflow_internal.ConsistentBlob)
        and flow.scope.mirrored_view_enabled()
    ) or (
        isinstance(x, oneflow._oneflow_internal.MirroredBlob)
        and flow.scope.consistent_view_enabled()
    ):
        x = flow.identity(x)
    return x
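# Usage sketch: cast_to_current_logical_view matters when a consistent blob
# (e.g. a variable) flows into a mirrored-view job, as in dynamic_concat_job
# above. This fragment is hypothetical, for illustration only.
def _scale_mirrored_input(mirrored_input):
    v = flow.get_variable(
        "w", shape=(1,), dtype=flow.float, initializer=flow.zeros_initializer()
    )
    v = cast_to_current_logical_view(v)  # ConsistentBlob -> MirroredBlob here
    return v * mirrored_input            # both operands now share one view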
def broadcast_to_broadcast_job(x: oft.Numpy.Placeholder((96, 96, 96))):
    with flow.scope.placement(
        src_device_type,
        ["0:0-" + str(src_device_num - 1), "1:0-" + str(src_device_num - 1)],
    ):
        src = flow.identity(x.with_distribute(flow.distribute.broadcast()))
    with flow.scope.placement(
        dst_device_type,
        ["0:0-" + str(dst_device_num - 1), "1:0-" + str(dst_device_num - 1)],
    ):
        dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
    return dst
def split_to_split_job(x: oft.Numpy.Placeholder((32, 16, 64, 48))):
    with flow.scope.placement(
        src_device_type,
        ["0:0-" + str(src_device_num - 1), "1:0-" + str(src_device_num - 1)],
    ):
        src = flow.identity(x.with_distribute(flow.distribute.split(src_axis)))
    with flow.scope.placement(
        dst_device_type,
        ["0:0-" + str(dst_device_num - 1), "1:0-" + str(dst_device_num - 1)],
    ):
        dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
    return dst
def partial_sum_to_broadcast_job(x: oft.Numpy.Placeholder((96, 96, 96))):
    with flow.scope.placement(
        src_device_type,
        ["0:0-" + str(src_device_num - 1), "1:0-" + str(src_device_num - 1)],
    ):
        src = flow.identity(x.with_distribute(flow.distribute.split(0)))
        src = flow.math.reduce_sum(src, axis=0)
    with flow.scope.placement(
        dst_device_type,
        ["0:0-" + str(dst_device_num - 1), "1:0-" + str(dst_device_num - 1)],
    ):
        dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
    return dst
def test_job(
    x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
    labels: oft.Numpy.Placeholder(label_shape, dtype=flow.int32),
):
    with flow.scope.placement(device_type, "0:0"):
        v = flow.get_variable(
            name="v",
            shape=(1,),
            dtype=flow.float32,
            initializer=flow.zeros_initializer(),
        )
        x = x + v
        x1 = flow.identity(x)
        x2 = flow.identity(x)
        flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
        flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
        x1 = flow.cast(x1, data_type)
        x2 = flow.cast(x2, data_type)
    with flow.scope.placement(device_type, "0:0-3"):
        # Built-in combined_margin_loss vs. the margin_loss reference helper.
        y1 = (
            flow.combined_margin_loss(
                x1.with_distribute(flow.distribute.split(1)),
                labels.with_distribute(flow.distribute.broadcast()),
                m1,
                m2,
                m3,
            )
            * s
        )
        y2 = margin_loss(m1, m2, m3, s, x2, labels)
    with flow.scope.placement(device_type, "0:0"):
        y1 = flow.cast(y1, flow.float)
        y2 = flow.cast(y2, flow.float)
        flow.watch(y1, test_global_storage.Setter("y1"))
        flow.watch(y2, test_global_storage.Setter("y2"))
        loss = y1 + y2
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(flow.math.reduce_sum(loss))
    return loss
def ReduceMinJob(x: oft.Numpy.Placeholder(input_shape, dtype=flow.float)):
    with flow.scope.placement(device_type, "0:0"):
        x += flow.get_variable(
            name="v1",
            shape=input_shape,
            dtype=flow.float,
            initializer=flow.zeros_initializer(),
        )
        loss = flow.math.reduce_min(x, axis=axis, keepdims=keepdims)
        loss = flow.identity(loss)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(loss)
        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(loss, test_global_storage.Setter("loss"))
        flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
        return loss
def ReduceMeanJob():
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x",
            shape=input_shape,
            dtype=flow.float,
            initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
            trainable=True,
        )
        loss = flow.math.reduce_mean(x, axis=axis, keepdims=keepdims)
        loss = flow.identity(loss)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(loss)
        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(loss, test_global_storage.Setter("loss"))
        flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
        return loss
def two_stage_reduce_job(x: oft.Numpy.Placeholder((4, 20, 20, 20))):
    with flow.scope.placement(device_type, "0:0"):
        x += flow.get_variable(
            name="v1",
            shape=(1,),
            dtype=flow.float,
            initializer=flow.zeros_initializer(),
        )
    with flow.scope.placement(device_type, "0:0-3"):
        loss = flow_func(
            x.with_distribute(flow.distribute.split(split_axis)),
            axis=axis,
            keepdims=True,
        )
        loss = flow.identity(loss)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(loss)
    flow.watch(x, test_global_storage.Setter("x"))
    flow.watch_diff(x, test_global_storage.Setter("x_diff"))
    return loss
def test_fused_scale_tril_softmax_dropout_fw_bw_job():
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x",
            shape=x_shape,
            dtype=dtype,
            initializer=flow.random_uniform_initializer(minval=-1.0, maxval=1.0),
            trainable=True,
        )
        flow.watch(x, test_global_storage.Setter("x"))
        x1 = flow.identity(x)
        x2 = flow.identity(x)
        flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
        flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
        if data_type == "float16":
            # Reference path: scale_tril -> softmax -> dropout, in float16.
            y1 = flow.cast(
                flow.nn.dropout(
                    flow.nn.softmax(
                        flow.math.fused_scale_tril(
                            flow.cast(x1, dtype=flow.float16),
                            diagonal=diagonal,
                            fill_value=fill_value,
                            scale=scale,
                        )
                    ),
                    rate=rate,
                    name="dropout",
                ),
                dtype=flow.float,
            )
            # Fused kernel under test.
            y2 = flow.cast(
                flow.nn.fused_scale_tril_softmax_dropout(
                    flow.cast(x2, dtype=flow.float16),
                    diagonal=diagonal,
                    fill_value=fill_value,
                    scale=scale,
                    rate=rate,
                ),
                dtype=flow.float,
            )
        else:
            y1 = flow.nn.dropout(
                flow.nn.softmax(
                    flow.math.fused_scale_tril(
                        x1, diagonal=diagonal, fill_value=fill_value, scale=scale
                    )
                ),
                rate=rate,
                name="dropout",
            )
            y2 = flow.nn.fused_scale_tril_softmax_dropout(
                x2,
                diagonal=diagonal,
                fill_value=fill_value,
                scale=scale,
                rate=rate,
            )
        flow.watch(y1, test_global_storage.Setter("y1"))
        flow.watch(y2, test_global_storage.Setter("y2"))
        flow.watch_diff(y1, test_global_storage.Setter("y1_diff"))
        flow.watch_diff(y2, test_global_storage.Setter("y2_diff"))
        loss = y1 + y2
        total_loss = loss * x
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(flow.math.reduce_sum(total_loss))
        return loss
def trt_identity_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
    return flow.identity(x)
def identity_fn():
    with flow.scope.placement(dst_device_tag, "0:0-{}".format(device_num - 1)):
        var = get_var()
        return flow.identity(var)
def split_to_split_job(x: oft.Numpy.Placeholder((32, 16, 64, 48))):
    with flow.scope.placement("gpu", "0:0-1"):
        src = flow.identity(x.with_distribute(flow.distribute.split(src_axis)))
        dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
    return dst
def split_to_broadcast_job(x: oft.Numpy.Placeholder((96, 96))):
    with flow.scope.placement("gpu", "0:0-1"):
        src = flow.identity(x.with_distribute(flow.distribute.split(src_axis)))
        dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
    return dst
def partial_sum_to_split_job(x: oft.Numpy.Placeholder((96, 96, 96))):
    with flow.scope.placement("gpu", "0:0-1"):
        src = flow.identity(x.with_distribute(flow.distribute.split(0)))
        # reduce_sum over the split axis yields a partial-sum distributed blob.
        src = flow.math.reduce_sum(src, axis=0)
        dst = flow.identity(src.with_distribute(flow.distribute.split(dst_axis)))
    return dst
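# Common driver sketch for the boxing jobs above. The device count, the
# @flow.global_function wrapping implied by the jobs, and this helper itself
# are assumptions, not part of the original tests. Pure re-distribution jobs
# (split->split, split->broadcast, broadcast->broadcast) must leave values
# untouched; only the partial_sum jobs change them via the reduce_sum stage.
def _check_boxing(job_fn, shape):
    x = np.random.rand(*shape).astype(np.float32)
    y = job_fn(x).get().numpy()
    assert np.array_equal(x, y)  # boxing only re-distributes values

# e.g. flow.config.gpu_device_num(2)
#      _check_boxing(split_to_broadcast_job, (96, 96))
#      _check_boxing(split_to_split_job, (32, 16, 64, 48))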