def _batch_norm(
    inputs,
    epsilon,
    center=True,
    scale=True,
    trainable=True,
    is_training=True,
    data_format="NCHW",
    name=None,
):
    return flow.layers.batch_normalization(
        inputs=inputs,
        # Normalize the channel axis: the last axis for 4-D NHWC inputs, else axis 1.
        # (The original compared the shape tuple itself to 4, which is always False.)
        axis=3 if data_format == "NHWC" and len(inputs.shape) == 4 else 1,
        momentum=0.9,
        epsilon=epsilon,
        center=center,
        scale=scale,
        beta_initializer=flow.zeros_initializer(),
        gamma_initializer=flow.ones_initializer(),
        beta_regularizer=_get_regularizer("beta"),
        gamma_regularizer=_get_regularizer("gamma"),
        moving_mean_initializer=flow.zeros_initializer(),
        moving_variance_initializer=flow.ones_initializer(),
        trainable=trainable,
        training=is_training,
        name=name,
    )
def _batch_norm(
    inputs,
    epsilon,
    center=True,
    scale=True,
    trainable=True,
    is_training=True,
    name=None,
):
    return flow.layers.batch_normalization(
        inputs=inputs,
        axis=1,
        momentum=0.9,
        epsilon=epsilon,
        center=center,
        scale=scale,
        beta_initializer=flow.zeros_initializer(),
        gamma_initializer=flow.ones_initializer(),
        beta_regularizer=_get_regularizer(),
        gamma_regularizer=_get_regularizer(),
        moving_mean_initializer=flow.zeros_initializer(),
        moving_variance_initializer=flow.ones_initializer(),
        trainable=trainable,
        training=is_training,
        name=name,
    )
def _batch_norm(
    inputs, axis, momentum, epsilon, center=True, scale=True, trainable=True, name=None
):
    # Training mode follows trainability here.
    training = trainable
    return flow.layers.batch_normalization(
        inputs=inputs,
        axis=axis,
        momentum=momentum,
        epsilon=epsilon,
        center=center,
        scale=scale,
        beta_initializer=flow.zeros_initializer(),
        gamma_initializer=flow.ones_initializer(),
        moving_mean_initializer=flow.zeros_initializer(),
        moving_variance_initializer=flow.ones_initializer(),
        trainable=trainable,
        training=training,
        name=name,
    )
def _batch_norm(inputs, name, trainable=True, training=True):
    params_shape = [inputs.shape[1]]
    # Float32 required to avoid precision loss when using fp16 input/output.
    params_dtype = flow.float32 if inputs.dtype == flow.float16 else inputs.dtype
    if not flow.current_global_function_desc().IsTrainable() or not trainable:
        training = False
    with flow.scope.namespace(name):
        beta = flow.get_variable(
            name="beta",
            shape=params_shape,
            dtype=params_dtype,
            initializer=flow.zeros_initializer(),
            trainable=trainable,
            distribute=distribute_util.broadcast(),
        )
        gamma = flow.get_variable(
            name="gamma",
            shape=params_shape,
            dtype=params_dtype,
            initializer=flow.ones_initializer(),
            trainable=trainable,
            distribute=distribute_util.broadcast(),
        )
        moving_mean = flow.get_variable(
            name="moving_mean",
            shape=params_shape,
            dtype=params_dtype,
            initializer=flow.zeros_initializer(),
            trainable=False,
            distribute=distribute_util.broadcast(),
        )
        moving_variance = flow.get_variable(
            name="moving_variance",
            shape=params_shape,
            dtype=params_dtype,
            initializer=flow.ones_initializer(),
            trainable=False,
            distribute=distribute_util.broadcast(),
        )
    builder = (
        flow.user_op_builder(id_util.UniqueStr(name))
        .Op("normalization")
        .Input("x", [inputs])
        .Input("moving_mean", [moving_mean])
        .Input("moving_variance", [moving_variance])
        .Input("gamma", [gamma])
        .Input("beta", [beta])
        .Output("y")
        .Attr("axis", 1)
        .Attr("epsilon", 1.001e-5)
        .Attr("training", training)
        .Attr("momentum", 0.997)
    )
    # The training path additionally exposes the batch statistics.
    if trainable and training:
        builder = builder.Output("mean").Output("inv_variance")
    return builder.Build().InferAndTryRun().RemoteBlobList()[0]
def flow_net(var_name, random_mask):
    with flow.scope.placement(device_type, "0:0-0"):
        x = flow.get_variable(
            name=var_name,
            shape=x_shape,
            dtype=flow.float32,
            initializer=flow.ones_initializer(),
            trainable=True,
        )
        constant_val = flow.constant(3.0, dtype=flow.float32, shape=(1,))
        x = x * constant_val
        x = x * 2.0
        if device_type == "gpu":
            # Exercise the fp16 path on GPU before the activation.
            x = flow.cast(x, flow.float16)
        x = flow.math.relu(x)
        x = flow.cast(x, flow.float)
        loss = flow.math.reduce_mean(x * random_mask)
        flow.optimizer.Adam(
            flow.optimizer.PiecewiseConstantScheduler([], [learning_rate]),
            beta1=beta1,
            beta2=beta2,
            epsilon=epsilon,
            do_bias_correction=True,
        ).minimize(loss)
        return x
def _batch_norm_relu(self, inputs, name=None, last=False):
    if self.fuse_bn_relu:
        # Zero-initialize gamma for the last BN of a block, ones otherwise.
        initializer = flow.zeros_initializer() if last else flow.ones_initializer()
        axis = 1
        if self.data_format == "NHWC":
            axis = 3
        return flow.layers.batch_normalization_relu(
            inputs=inputs,
            axis=axis,
            momentum=0.9,
            epsilon=1e-5,
            center=True,
            scale=True,
            trainable=self.trainable,
            training=self.training,
            gamma_initializer=initializer,
            moving_variance_initializer=initializer,
            gamma_regularizer=self.weight_regularizer,
            beta_regularizer=self.weight_regularizer,
            name=name + "_bn_relu",
        )
    else:
        return flow.nn.relu(self._batch_norm(inputs, name + "_bn", last=last))
def test_fn(
    a: flow.typing.Numpy.Placeholder(a_shape),
    b: flow.typing.Numpy.Placeholder(b_shape),
    c: flow.typing.Numpy.Placeholder(c_shape),
) -> flow.typing.Numpy:
    # print(f"a.split_axis: {a.split_axis}")
    # print(f"b.split_axis: {b.split_axis}")
    # print(f"c.split_axis: {c.split_axis}")
    var_a = flow.get_variable(
        name="var_a",
        shape=a_shape,
        dtype=flow.float32,
        initializer=flow.ones_initializer(),
        distribute=flow.distribute.split(1),
    )
    # S0 -> S1
    a = flow.parallel_cast(a, distribute=flow.distribute.split(1))
    a = var_a * a
    out = flow.matmul(a, b)
    # P -> B
    out = flow.parallel_cast(
        out,
        distribute=flow.distribute.broadcast(),
        gradient_distribute=flow.distribute.broadcast(),
    )
    # S0 -> B
    c = flow.parallel_cast(c, distribute=flow.distribute.broadcast())
    out = flow.nn.bias_add(out, c)
    lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
    flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(out)
    return out
def test_Embedding(
    x: tp.Numpy.Placeholder(shape=(64, 62), dtype=flow.int32)
) -> tp.Numpy:
    out = EmbeddingLayer(x, 8500, 512)
    x = flow.get_variable(
        name="x",
        shape=(64, 62, 1),
        dtype=flow.float32,
        initializer=flow.ones_initializer(),
    )
    return x * out
def trt_batch_norm_job(x=flow.FixedTensorDef(input_shape, dtype=dtype)):
    out = flow.layers.batch_normalization(x, axis=axis)
    c = flow.get_variable(
        "c",
        shape=out.shape,
        dtype=flow.float,
        initializer=flow.ones_initializer(),
        trainable=True,
    )
    out = flow.math.add_n([out, c])
    return out
def foo_job(input_def: oft.Numpy.Placeholder(shape=(2, 5))):
    var = flow.get_variable(
        name="var",
        shape=(2, 5),
        dtype=flow.float,
        initializer=flow.ones_initializer(),
    )
    input_def = flow.cast_to_current_logical_view(input_def)
    var = flow.cast_to_current_logical_view(var)
    output = var + input_def
    return output
def _get_initializer(model_name):
    if model_name == "weight":
        return flow.variance_scaling_initializer(
            2.0, mode="fan_out", distribution="random_normal", data_format="NCHW"
        )
    elif model_name == "bias":
        return flow.zeros_initializer()
    elif model_name == "gamma":
        return flow.ones_initializer()
    elif model_name == "beta":
        return flow.zeros_initializer()
    elif model_name == "dense_weight":
        return flow.random_normal_initializer(0, 0.01)
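# A minimal usage sketch for the helper above (hypothetical shapes and
# variable names, not part of the original snippet): keying the initializer
# off the parameter kind keeps variable creation uniform.
gamma = flow.get_variable(
    name="example_gamma",
    shape=(64,),
    dtype=flow.float32,
    initializer=_get_initializer("gamma"),  # ones for BN gamma
)
dense_w = flow.get_variable(
    name="example_dense_weight",
    shape=(1024, 1000),
    dtype=flow.float32,
    initializer=_get_initializer("dense_weight"),  # N(0, 0.01)
)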
def ExpandDimsJob():
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "var",
            shape=x_shape,
            dtype=flow.float,
            initializer=flow.ones_initializer(),
            trainable=True,
        )
        flow.watch_diff(x, check_grad)
        loss = flow.expand_dims(x, axis)
        flow.losses.add_loss(loss)
        return loss
def DropoutJob() -> flow.typing.Numpy:
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x",
            shape=x_shape,
            dtype=dtype,
            initializer=flow.ones_initializer(),
            trainable=True,
        )
        of_out = flow.nn.dropout(x, rate=rate, seed=seed, name="dropout")
        loss = flow.math.square(of_out)
        flow.losses.add_loss(loss)
        return of_out
def trt_matmul_job(
    a=flow.FixedTensorDef(a_shape, dtype=dtype),
    b=flow.FixedTensorDef(b_shape, dtype=dtype),
):
    out = flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)
    c = flow.get_variable(
        "c",
        shape=out.shape,
        dtype=flow.float,
        initializer=flow.ones_initializer(),
        trainable=True,
    )
    out = flow.math.add_n([out, c])
    return out
def ExpandDimsJob():
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "var",
            shape=x_shape,
            dtype=flow.float,
            initializer=flow.ones_initializer(),
            trainable=True,
        )
        flow.watch_diff(x, check_grad)
        loss = flow.expand_dims(x, axis)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
        ).minimize(loss)
        return loss
def instance_norm(input, name_prefix, trainable=True):
    # Per-sample, per-channel statistics over the spatial axes (H, W).
    (mean, variance) = flow.nn.moments(input, [2, 3], keepdims=True)
    gamma = flow.get_variable(
        name_prefix + "_gamma",
        shape=(1, input.shape[1], 1, 1),
        dtype=input.dtype,
        initializer=flow.ones_initializer(),
        trainable=trainable,
    )
    beta = flow.get_variable(
        name_prefix + "_beta",
        shape=(1, input.shape[1], 1, 1),
        dtype=input.dtype,
        initializer=flow.zeros_initializer(),
        trainable=trainable,
    )
    epsilon = 1e-3
    normalized = (input - mean) / flow.math.sqrt(variance + epsilon)
    return gamma * normalized + beta
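# For reference, the same arithmetic in plain NumPy (a hypothetical
# sanity-check helper, not part of the snippet above): instance norm
# normalizes each (sample, channel) slice by its own spatial statistics.
import numpy as np

def instance_norm_np(x, gamma, beta, epsilon=1e-3):
    mean = x.mean(axis=(2, 3), keepdims=True)     # shape (N, C, 1, 1)
    variance = x.var(axis=(2, 3), keepdims=True)  # shape (N, C, 1, 1)
    normalized = (x - mean) / np.sqrt(variance + epsilon)
    return gamma * normalized + beta

x = np.random.randn(2, 3, 4, 4).astype(np.float32)
gamma = np.ones((1, 3, 1, 1), dtype=np.float32)   # matches ones_initializer
beta = np.zeros((1, 3, 1, 1), dtype=np.float32)   # matches zeros_initializer
out = instance_norm_np(x, gamma, beta)            # same shape as x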
def DropoutJob() -> flow.typing.Numpy:
    with flow.scope.placement(device_type, "0:0"):
        x = flow.get_variable(
            "x",
            shape=x_shape,
            dtype=dtype,
            initializer=flow.ones_initializer(),
            trainable=True,
        )
        of_out = flow.nn.dropout(x, rate=rate, seed=seed, name="dropout")
        loss = flow.math.square(of_out)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
        ).minimize(loss)
        return of_out
def create_look_ahead_mask(size):
    """
    Return a mask like
        [[0., 1., 1.],
         [0., 0., 1.],
         [0., 0., 0.]]
    :param size: The matrix size
    :return: look ahead mask
    """
    ones_blob = flow.get_variable(
        name="ones_blob",
        shape=[size, size],
        dtype=flow.float32,
        initializer=flow.ones_initializer(),
        trainable=False,
    )
    # Invert the lower triangle (incl. diagonal) of ones: the remaining ones
    # strictly above the diagonal mark future positions to be masked out.
    mask = 1 - flow.math.tril(ones_blob, 0)
    return mask
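# The same mask in plain NumPy (a hypothetical check, not part of the
# snippet above): 1 - tril(ones) keeps ones strictly above the diagonal.
import numpy as np

def look_ahead_mask_np(size):
    return 1.0 - np.tril(np.ones((size, size), dtype=np.float32))

# look_ahead_mask_np(3) ->
# [[0. 1. 1.]
#  [0. 0. 1.]
#  [0. 0. 0.]]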
def _batch_norm(inputs, momentum, epsilon, name, training=True):
    return flow.layers.batch_normalization(
        inputs=inputs,
        axis=-1,
        momentum=momentum,
        epsilon=epsilon,
        center=True,
        scale=True,
        # beta_initializer=flow.zeros_initializer(),
        # gamma_initializer=flow.ones_initializer(),
        # beta_regularizer=flow.zeros_initializer(),
        # gamma_regularizer=flow.ones_initializer(),
        moving_mean_initializer=flow.zeros_initializer(),
        moving_variance_initializer=flow.ones_initializer(),
        trainable=True,
        training=training,
        name=name,
    )
def test_float_initializer(test_case):
    initializers = [
        flow.random_normal_initializer(mean=3, stddev=4),
        flow.random_uniform_initializer(minval=-6, maxval=18),
        flow.truncated_normal_initializer(mean=-5, stddev=8),
        flow.xavier_uniform_initializer(data_format="NCHW"),
        flow.xavier_uniform_initializer(data_format="NHWC"),
        flow.xavier_normal_initializer(data_format="NCHW"),
        flow.xavier_normal_initializer(data_format="NHWC"),
        flow.constant_initializer(value=4),
        flow.ones_initializer(),
        flow.zeros_initializer(),
    ]
    kaiming_args = GenArgDict(
        OrderedDict(
            shape=[SHAPE],
            mode=["fan_in", "fan_out", "fan_avg"],
            distribution=["random_normal", "random_uniform"],
            data_format=["NCHW", "NHWC"],
            negative_slope=[0.5],
        )
    )
    vs_args = GenArgDict(
        OrderedDict(
            scale=[3.4],
            mode=["fan_in", "fan_out", "fan_avg"],
            distribution=["truncated_normal", "random_normal", "random_uniform"],
            data_format=["NCHW", "NHWC"],
        )
    )
    for args in kaiming_args:
        initializers.append(flow.kaiming_initializer(**args))
    for args in vs_args:
        initializers.append(flow.variance_scaling_initializer(**args))
    for initializer in initializers:
        CompareTwoDistribution(test_case, flow.float32, initializer)
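# For intuition, a NumPy analogue of variance scaling under its usual
# definition (std = sqrt(scale / fan)). The helper and its fan computation
# assume NCHW (out, in, kh, kw) weight shapes and are purely illustrative,
# not the library's implementation.
import numpy as np

def variance_scaling_np(shape, scale=3.4, mode="fan_in", rng=None):
    rng = rng or np.random.default_rng(0)
    receptive = int(np.prod(shape[2:])) if len(shape) > 2 else 1
    fan_in = shape[1] * receptive   # inputs feeding each unit
    fan_out = shape[0] * receptive  # outputs fed by each unit
    fan = {"fan_in": fan_in, "fan_out": fan_out, "fan_avg": (fan_in + fan_out) / 2}[mode]
    return rng.normal(0.0, np.sqrt(scale / fan), size=shape).astype(np.float32)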
def oneflow_mseloss(
    of_input: tp.Numpy.Placeholder(shape=input.shape),
    of_target: tp.Numpy.Placeholder(shape=target.shape),
) -> Dict[str, tp.Numpy]:
    with flow.scope.placement(device_type, "0:0"):
        v = flow.get_variable(
            shape=input.shape,
            dtype=flow.float32,
            initializer=flow.ones_initializer(),
            name="x_var",
        )
        x_var = of_input + v
    flow.watch_diff(x_var, assert_prediction_grad)
    mseloss = flow.nn.MSELoss(x_var, of_target, reduction="none", name="of_mseloss")
    mseloss_mean = flow.nn.MSELoss(
        x_var, of_target, reduction="mean", name="of_mseloss_reduce_mean"
    )
    mseloss_sum = flow.nn.MSELoss(
        x_var, of_target, reduction="sum", name="of_mseloss_reduce_sum"
    )
    with flow.scope.placement(device_type, "0:0"):
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0
        ).minimize(mseloss_mean)
    return {
        "of_mse_loss": mseloss,
        "of_mse_loss_mean": mseloss_mean,
        "of_mse_loss_sum": mseloss_sum,
    }
def test_fn(
    a: flow.typing.Numpy.Placeholder(a_shape),
    b: flow.typing.Numpy.Placeholder(b_shape),
    c: flow.typing.Numpy.Placeholder(c_shape),
) -> flow.typing.Numpy:
    var_a = flow.get_variable(
        name="var_a",
        shape=a_shape,
        dtype=flow.float32,
        initializer=flow.ones_initializer(),
        distribute=flow.distribute.split(1),
    )
    # S0 -> S1
    a = flow.hierarchical_parallel_cast(a, parallel_distribution=["S(1)"])
    a = var_a * a
    out = flow.matmul(a, b)
    # P -> B
    out = flow.hierarchical_parallel_cast(out, parallel_distribution=["B"])
    # S0 -> B
    c = flow.hierarchical_parallel_cast(c, parallel_distribution=["B"])
    out = flow.nn.bias_add(out, c)
    lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
    flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(out)
    return out
def fake_flow_ones(shape):
    tensor = flow.Tensor(*shape)
    tensor.set_data_initializer(flow.ones_initializer())
    return tensor
def batch_normalization(
    inputs: remote_blob_util.BlobDef,
    axis: int = -1,
    momentum: float = 0.99,
    epsilon: float = 0.001,
    center: bool = True,
    scale: bool = True,
    beta_initializer: Optional[op_conf_util.InitializerConf] = None,
    gamma_initializer: Optional[op_conf_util.InitializerConf] = None,
    beta_regularizer: Optional[op_conf_util.RegularizerConf] = None,
    gamma_regularizer: Optional[op_conf_util.RegularizerConf] = None,
    moving_mean_initializer: Optional[op_conf_util.InitializerConf] = None,
    moving_variance_initializer: Optional[op_conf_util.InitializerConf] = None,
    trainable: bool = True,
    training: bool = True,
    name: str = "BatchNorm",
) -> remote_blob_util.BlobDef:
    r"""Analogous to `tf.keras.layers.BatchNormalization <https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization>`_

    Args:
        inputs (remote_blob_util.BlobDef): Input `Blob`.
        axis (int, optional): An int specifying the axis that should be normalized. Defaults to -1, which normalizes the last axis.
        momentum (float, optional): A float specifying the momentum for the moving average. Defaults to 0.99.
        epsilon (float, optional): A small float added to the variance to avoid division by zero. Defaults to 0.001.
        center (bool, optional): A boolean specifying whether to add an offset to the normalized `Blob`. Defaults to True.
        scale (bool, optional): A boolean specifying whether to multiply the normalized `Blob` by gamma. Defaults to True.
        beta_initializer (Optional[op_conf_util.InitializerConf], optional): Initializer for beta. Defaults to None.
        gamma_initializer (Optional[op_conf_util.InitializerConf], optional): Initializer for gamma. Defaults to None.
        beta_regularizer (Optional[op_conf_util.RegularizerConf], optional): Regularizer for beta. Defaults to None.
        gamma_regularizer (Optional[op_conf_util.RegularizerConf], optional): Regularizer for gamma. Defaults to None.
        moving_mean_initializer (Optional[op_conf_util.InitializerConf], optional): Initializer for the moving mean. Defaults to None.
        moving_variance_initializer (Optional[op_conf_util.InitializerConf], optional): Initializer for the moving variance. Defaults to None.
        trainable (bool, optional): A boolean specifying whether the variables are trainable. Defaults to True.
        training (bool, optional): A boolean specifying whether the model is currently being trained. Defaults to True.
        name (str, optional): This layer's name. Defaults to "BatchNorm".

    Returns:
        remote_blob_util.BlobDef: A `Blob` with the same shape as the input.

    Raises:
        ValueError: If axis is out of the dimension of the input.
    """
    if axis < 0:
        axis += len(inputs.shape)
    assert axis >= 0 and axis < len(inputs.shape)

    params_shape = [inputs.shape[axis]]
    # Float32 required to avoid precision loss when using fp16 input/output.
    params_dtype = flow.float32 if inputs.dtype == flow.float16 else inputs.dtype

    if not flow.current_global_function_desc().IsTrainable() or not trainable:
        training = False

    with flow.scope.namespace(name):
        if center:
            beta = flow.get_variable(
                name="beta",
                shape=params_shape,
                dtype=params_dtype,
                initializer=beta_initializer or flow.zeros_initializer(),
                regularizer=beta_regularizer,
                trainable=trainable,
                distribute=distribute_util.broadcast(),
                reuse=False,
            )
        else:
            beta = flow.constant(0, dtype=params_dtype, shape=params_shape, name="beta")
        if scale:
            gamma = flow.get_variable(
                name="gamma",
                shape=params_shape,
                dtype=params_dtype,
                initializer=gamma_initializer or flow.ones_initializer(),
                regularizer=gamma_regularizer,
                trainable=trainable,
                distribute=distribute_util.broadcast(),
                reuse=False,
            )
        else:
            gamma = flow.constant(1, dtype=params_dtype, shape=params_shape, name="gamma")
        moving_mean = flow.get_variable(
            name="moving_mean",
            shape=params_shape,
            dtype=params_dtype,
            initializer=moving_mean_initializer or flow.zeros_initializer(),
            trainable=False,
            distribute=distribute_util.broadcast(),
            reuse=False,
        )
        moving_variance = flow.get_variable(
            name="moving_variance",
            shape=params_shape,
            dtype=params_dtype,
            initializer=moving_variance_initializer or flow.ones_initializer(),
            trainable=False,
            distribute=distribute_util.broadcast(),
            reuse=False,
        )

    if flow.current_scope().device_parallel_desc_symbol.device_tag == "cpu":
        if training:
            # Compute batch statistics over every axis except the normalized one.
            reduce_axis = []
            for dim in range(len(inputs.shape)):
                if dim != axis:
                    reduce_axis.append(dim)
            mean, variance = flow.nn.moments(inputs, reduce_axis, keepdims=False)

            def update_moving(moving, this_batch):
                moving_identity = flow.identity(moving)
                flow.assign(
                    moving, momentum * moving_identity + (1 - momentum) * this_batch
                )

            update_moving(moving_mean, mean)
            update_moving(moving_variance, variance)

            return flow.nn.batch_normalization(
                x=inputs,
                mean=mean,
                variance=variance,
                offset=beta,
                scale=gamma,
                variance_epsilon=epsilon,
                axis=axis,
                name=name,
            )
        else:
            mean = moving_mean
            variance = moving_variance
            return flow.nn.batch_normalization(
                x=inputs,
                mean=mean,
                variance=variance,
                offset=beta,
                scale=gamma,
                variance_epsilon=epsilon,
                axis=axis,
                name=name,
            )
    else:
        builder = (
            flow.user_op_builder(name)
            .Op("normalization")
            .Input("x", [inputs])
            .Input("moving_mean", [moving_mean])
            .Input("moving_variance", [moving_variance])
            .Input("gamma", [gamma])
            .Input("beta", [beta])
            .Output("y")
            .Attr("axis", axis)
            .Attr("epsilon", epsilon)
            .Attr("training", training)
            .Attr("momentum", momentum)
        )
        # The training path additionally exposes batch mean and inverse variance.
        if trainable and training:
            builder = builder.Output("mean").Output("inv_variance")
        return builder.Build().InferAndTryRun().RemoteBlobList()[0]