def dense(input, units, name, use_bias=False, trainable=True, reuse=False, const_init=False):
    name_ = name if not reuse else name + "_reuse"
    in_shape = input.shape
    in_num_axes = len(in_shape)
    assert in_num_axes >= 2
    # Flatten all leading axes so the matmul sees a 2-D (batch, features) blob.
    inputs = flow.reshape(input, (-1, in_shape[-1])) if in_num_axes > 2 else input
    weight = flow.get_variable(
        name="{}-weight".format(name),
        shape=(units, inputs.shape[1]),
        dtype=inputs.dtype,
        initializer=flow.random_normal_initializer(stddev=0.02)
        if not const_init
        else get_const_initializer(),
        trainable=trainable,
        reuse=reuse,
        model_name="weight",
    )
    out = flow.matmul(a=inputs, b=weight, transpose_b=True, name=name_ + "matmul")
    if use_bias:
        bias = flow.get_variable(
            name="{}-bias".format(name),
            shape=(units,),
            dtype=inputs.dtype,
            initializer=flow.random_normal_initializer()
            if not const_init
            else get_const_initializer(),
            trainable=trainable,
            reuse=reuse,
            model_name="bias",
        )
        out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add")
    # Restore the original leading axes, with the last axis now `units` wide.
    out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out
    return out
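# Hedged usage sketch for `dense` above, in the single-client style used
# elsewhere in this file. The imports, job name, and shapes are illustrative,
# not part of the original module.
import numpy as np
import oneflow.compatible.single_client as flow
import oneflow.compatible.single_client.typing as tp


@flow.global_function()
def dense_job(x: tp.Numpy.Placeholder((8, 32), dtype=flow.float32)) -> tp.Numpy:
    # Projects 32 input features to 16 units; weight is (16, 32), bias (16,).
    return dense(x, units=16, name="fc1", use_bias=True)


checkpoint = flow.train.CheckPoint()
checkpoint.init()
out = dense_job(np.random.randn(8, 32).astype(np.float32))  # out.shape == (8, 16)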
def __init__(self, batch_size, seq_length, hidden_size, vocab_size):
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    args = get_args()
    self.embedding_dropout_rate = args.hidden_dropout
    self.use_fp16 = args.fp16
    self.wpe_initializer = flow.random_normal_initializer(stddev=args.init_method_std)
    self.wte_initializer = flow.random_normal_initializer(stddev=args.init_method_std)
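# Hedged sketch of how the two initializers above are typically consumed.
# `_build_embeddings` is a hypothetical method name; `wpe` (position) and
# `wte` (token) follow the usual GPT naming convention. Shapes are
# illustrative.
def _build_embeddings(self):
    wpe = flow.get_variable(
        "wpe",
        shape=(self.seq_length, self.hidden_size),
        dtype=flow.float32,
        initializer=self.wpe_initializer,
    )
    wte = flow.get_variable(
        "wte",
        shape=(self.vocab_size, self.hidden_size),
        dtype=flow.float32,
        initializer=self.wte_initializer,
    )
    return wpe, wte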
def get_linear_params(
    name,
    input_size,
    output_size,
    dtype,
    weight_initializer=flow.random_normal_initializer(stddev=0.02),
    bias_initializer=flow.constant_initializer(0.0),
    weight_parallel_dist=None,
    bias_parallel_dist=None,
):
    with flow.scope.namespace(name):
        weight = flow.get_variable(
            name="weight",
            shape=(input_size, output_size),
            dtype=dtype,
            initializer=weight_initializer,
            nd_sbp=weight_parallel_dist,
        )
        bias = flow.get_variable(
            name="bias",
            shape=(output_size,),
            dtype=dtype,
            initializer=bias_initializer,
            nd_sbp=bias_parallel_dist,
        )
    return weight, bias
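# Hedged usage sketch for `get_linear_params`: a single-device call with the
# default initializers (both *_parallel_dist left as None), followed by the
# matmul + bias_add that typically consumes the returned pair. Assumes the
# imports from the `dense` sketch above; names and sizes are illustrative.
@flow.global_function()
def linear_job(x: tp.Numpy.Placeholder((8, 128), dtype=flow.float32)) -> tp.Numpy:
    weight, bias = get_linear_params("fc1", 128, 256, flow.float32)
    return flow.nn.bias_add(flow.matmul(x, weight), bias)  # -> (8, 256)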
def _AddClassficationLoss(
    input_blob, label_blob, hidden_size, label_num, initializer_range, scope_name="classification"
):
    with flow.scope.namespace(scope_name):
        output_weight_blob = flow.get_variable(
            name="output_weights",
            shape=[label_num, hidden_size],
            dtype=input_blob.dtype,
            # initializer=bert_util.CreateInitializer(initializer_range),
            initializer=flow.random_normal_initializer(
                mean=0.0, stddev=initializer_range, seed=None, dtype=None
            ),
        )
        output_bias_blob = flow.get_variable(
            name="output_bias",
            shape=[label_num],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
        )
        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_blob
        )
        loss = pre_example_loss
        return loss, pre_example_loss, logit_blob
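# Hedged usage sketch for `_AddClassficationLoss`: a typical caller feeds the
# pooled BERT output and averages the per-example loss before minimizing it.
# The job wrapper, shapes, and label count are illustrative; assumes the
# imports from the `dense` sketch above.
@flow.global_function()
def classification_loss_job(
    pooled: tp.Numpy.Placeholder((8, 768), dtype=flow.float32),
    labels: tp.Numpy.Placeholder((8,), dtype=flow.int32),
) -> tp.Numpy:
    loss, per_example_loss, logits = _AddClassficationLoss(
        pooled, labels, hidden_size=768, label_num=2, initializer_range=0.02
    )
    return flow.math.reduce_mean(loss)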
def broadcast_to_compatible_with_fn(
    x_def: oft.Numpy.Placeholder(x.shape, dtype=flow.float)
):
    x_var = flow.get_variable(
        "x_var",
        shape=x.shape,
        dtype=flow.float,
        initializer=flow.constant_initializer(0),
        trainable=True,
    )
    compatible_var = [
        flow.get_variable(
            "compatible_var_{}".format(i),
            shape=cp_shape,
            dtype=flow.float,
            initializer=flow.random_normal_initializer(),
            trainable=False,
        )
        for (i, cp_shape) in enumerate(compatible_shape)
    ]
    x_var = x_var + x_def
    y = flow.broadcast_to_compatible_with(x_var, compatible_var)
    flow.optimizer.SGD(
        flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
    ).minimize(y)
    flow.watch_diff(x_var, dx_watcher)
    return y
def get_variable(name):
    return flow.get_variable(
        name=name,
        shape=(10, 80, 40, 20),
        dtype=dtype,
        initializer=flow.random_normal_initializer(mean=10, stddev=1),
        distribute=flow.distribute.split(0),
    )
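# Hedged usage sketch: inside a @flow.global_function job, a two-GPU
# placement combined with split(0) shards the first axis, so each device
# holds a (5, 80, 40, 20) slice of the (10, 80, 40, 20) variable. The
# placement string is illustrative.
with flow.scope.placement("gpu", "0:0-1"):
    v = get_variable("v")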
def model() -> tp.Numpy:
    with get_placement():
        x = flow.get_variable(
            name="x",
            shape=(10, 801, 820, 4),
            dtype=dtype,
            initializer=flow.random_normal_initializer(mean=10, stddev=1),
            distribute=flow.distribute.split(0),
        )
        y = flow.get_variable(
            name="y",
            shape=(10, 801, 820, 4),
            dtype=dtype,
            initializer=flow.random_normal_initializer(mean=10, stddev=1),
            distribute=flow.distribute.split(0),
        )
        return flow.math.reduce_mean(x + y)
def model() -> tp.Numpy:
    with get_placement():
        x = flow.get_variable(
            name="x",
            shape=(4, 5),
            dtype=flow.float32,
            initializer=flow.random_normal_initializer(mean=10, stddev=1),
        )
        w = flow.get_variable(
            name="w",
            shape=(5, 6),
            dtype=flow.float32,
            initializer=flow.random_normal_initializer(mean=10, stddev=1),
            distribute=flow.distribute.split(0),
        )
        y = flow.matmul(x, w)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.01]), momentum=0.9
        ).minimize(y)
        return y
def add() -> tp.Numpy:
    with get_placement():
        x = flow.get_variable(
            name="x",
            shape=(9, 3),
            dtype=dtype,
            initializer=flow.random_normal_initializer(mean=10, stddev=1),
            distribute=flow.distribute.split(0),
        )
        y = flow.get_variable(
            name="y",
            shape=(9, 3),
            dtype=dtype,
            initializer=flow.constant_initializer(5, dtype=dtype),
        )
        z = flow.get_variable(
            name="z",
            shape=(9, 3),
            dtype=dtype,
            initializer=flow.random_normal_initializer(),
        )
        return flow.math.add_n([x, y, z])
def deconv2d(
    input,
    filters,
    size,
    name,
    strides=2,
    trainable=True,
    reuse=False,
    const_init=False,
    use_bias=False,
):
    name_ = name if not reuse else name + "_reuse"
    # weight: [in_channels, out_channels, height, width]
    weight_shape = (input.shape[1], filters, size, size)
    # The transposed conv produces `filters` output channels (the weight's
    # out_channels), not the input's channel count, and upsamples H and W by
    # the stride factor.
    output_shape = (
        input.shape[0],
        filters,
        input.shape[2] * strides,
        input.shape[3] * strides,
    )
    weight = flow.get_variable(
        name + "-weight",
        shape=weight_shape,
        dtype=input.dtype,
        initializer=flow.random_normal_initializer(stddev=0.02)
        if not const_init
        else get_const_initializer(),
        trainable=trainable,
    )
    output = flow.nn.conv2d_transpose(
        input,
        weight,
        strides=[strides, strides],
        output_shape=output_shape,
        padding="SAME",
        data_format="NCHW",
        name=name_,
    )
    if use_bias:
        bias = flow.get_variable(
            name + "-bias",
            shape=(filters,),
            dtype=input.dtype,
            initializer=flow.constant_initializer(0.0),
            trainable=trainable,
        )
        output = flow.nn.bias_add(output, bias, "NCHW")
    return output
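# Hedged usage sketch for `deconv2d`: one stride-2 transposed convolution
# that doubles the spatial dims of an NCHW input. Assumes the imports from
# the `dense` sketch above; shapes and names are illustrative.
@flow.global_function()
def deconv_job(x: tp.Numpy.Placeholder((1, 64, 16, 16), dtype=flow.float32)) -> tp.Numpy:
    return deconv2d(x, filters=32, size=5, name="g_deconv1")  # -> (1, 32, 32, 32)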
def __init__(self, batch_size, seq_length, hidden_size):
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.hidden_size = hidden_size
    args = get_args()
    self.multihead_attention_fusion = args.multihead_attention_fusion
    self.num_layers = args.num_layers
    self.layers = []
    for i in range(self.num_layers):
        self.layers.append(
            TransformerLayer(
                f"h{i}",
                i + 1,
                batch_size,
                seq_length,
                hidden_size,
                initializer=flow.random_normal_initializer(stddev=args.init_method_std),
                output_layer_initializer=flow.random_normal_initializer(
                    stddev=(args.init_method_std / math.sqrt(2.0 * self.num_layers))
                ),
            )
        )
def _get_initializer(model_name):
    # Pick an initializer by the role of the parameter; unknown names fall
    # through and return None.
    if model_name == "weight":
        return flow.variance_scaling_initializer(
            2.0, mode="fan_out", distribution="random_normal", data_format="NCHW"
        )
    elif model_name == "bias":
        return flow.zeros_initializer()
    elif model_name == "gamma":
        return flow.ones_initializer()
    elif model_name == "beta":
        return flow.zeros_initializer()
    elif model_name == "dense_weight":
        return flow.random_normal_initializer(0, 0.01)
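# Hedged usage sketch: selecting the initializer by parameter role when
# creating a variable. Assumes the imports from the `dense` sketch above;
# the variable name and shape are illustrative.
@flow.global_function()
def init_demo_job() -> tp.Numpy:
    conv_weight = flow.get_variable(
        "conv1-weight",
        shape=(64, 3, 7, 7),
        dtype=flow.float32,
        initializer=_get_initializer("weight"),
    )
    return flow.math.reduce_mean(conv_weight)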
def broadcast_to_compatible_with_fn(
    x_def: oft.ListNumpy.Placeholder(shape=x_shape, dtype=flow.float)
):
    compatible_var = [
        flow.get_variable(
            "compatible_var_{}".format(i),
            shape=cp_shape,
            dtype=flow.float,
            initializer=flow.random_normal_initializer(),
            trainable=False,
        )
        for (i, cp_shape) in enumerate(compatible_shape)
    ]
    return flow.broadcast_to_compatible_with(x_def, compatible_var)
def conv2d(
    input,
    filters,
    size,
    name,
    strides=2,
    padding="same",
    trainable=True,
    reuse=False,
    const_init=False,
    use_bias=True,
):
    name_ = name if not reuse else name + "_reuse"
    # NCHW weight layout: (filters, in_channels, k_h, k_w);
    # for NHWC it would be (filters, k_h, k_w, input.shape[3]).
    weight_shape = (filters, input.shape[1], size, size)
    weight = flow.get_variable(
        name + "-weight",
        shape=weight_shape,
        dtype=input.dtype,
        initializer=flow.random_normal_initializer(stddev=0.02)
        if not const_init
        else get_const_initializer(),
        trainable=trainable,
        reuse=reuse,
    )
    output = flow.nn.compat_conv2d(
        input,
        weight,
        strides=[strides, strides],
        padding=padding,
        data_format="NCHW",
        name=name_,
    )
    if use_bias:
        bias = flow.get_variable(
            name + "-bias",
            shape=(filters,),
            dtype=input.dtype,
            initializer=flow.constant_initializer(0.0),
            trainable=trainable,
            reuse=reuse,
        )
        output = flow.nn.bias_add(output, bias, "NCHW")
    return output
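# Hedged usage sketch for `conv2d`: a single stride-2 convolution that
# halves the spatial dims of an NCHW input. Assumes the imports from the
# `dense` sketch above; shapes and names are illustrative.
@flow.global_function()
def conv_job(x: tp.Numpy.Placeholder((1, 3, 32, 32), dtype=flow.float32)) -> tp.Numpy:
    return conv2d(x, filters=64, size=5, name="d_conv1")  # -> (1, 64, 16, 16)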
def _get_kernel_initializer():
    return flow.random_normal_initializer(stddev=0.01)
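# Hedged usage sketch: plugging the helper into flow.layers.conv2d, mirroring
# Example 2 in the kaiming_initializer docstring below. Assumes the imports
# from the `dense` sketch above; shapes and the layer name are illustrative.
@flow.global_function()
def conv_init_job(x: tp.Numpy.Placeholder((1, 16, 8, 8), dtype=flow.float32)) -> tp.Numpy:
    return flow.layers.conv2d(
        x,
        filters=32,
        kernel_size=3,
        strides=1,
        padding="SAME",
        kernel_initializer=_get_kernel_initializer(),
        name="conv_n01",
    )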
def kaiming_initializer(
    shape: Sequence[int],
    distribution: str = "random_normal",
    mode: str = "fan_in",
    nonlinearity: str = "leaky_relu",
    negative_slope: float = 0.0,
    data_format: str = "NCHW",
):
    """Initialize weights according to the method described in `Delving deep into
    rectifiers: Surpassing human-level performance on ImageNet classification`
    - He, K. et al. (2015), using a normal or uniform distribution.

    When distribution is "random_normal", the equation is:

    .. math::

        W \\sim N(0, \\sqrt{\\frac{2}{n}})

    When distribution is "random_uniform", the equation is:

    .. math::

        W \\sim U(-\\sqrt{\\frac{6}{n}}, \\sqrt{\\frac{6}{n}})

    If mode is "fan_in", "n" is the number of input units in the weight Blob.

    If mode is "fan_out", "n" is the number of output units in the weight Blob.

    If mode is "fan_avg", "n" is the average of the number of input and output
    units in the weight Blob.

    Args:
        shape (Sequence[int]): Blob shape.
        distribution (str, optional): 'random_normal' or 'random_uniform'. Defaults to "random_normal".
        mode (str, optional): 'fan_in', 'fan_out' or 'fan_avg'. Defaults to "fan_in".
        nonlinearity (str, optional): None, 'tanh', 'sigmoid', 'relu' or 'leaky_relu'. Defaults to "leaky_relu".
        negative_slope (float, optional): The negative slope of leaky_relu. Defaults to 0.0.
        data_format (str, optional): 'NCHW' or 'NHWC'. Defaults to "NCHW".

    Raises:
        NotImplementedError: Only normal and uniform distributions are supported.

    Returns:
        flow.random_normal_initializer or flow.random_uniform_initializer.

    For example:

    Example 1:

    .. code-block:: python

        import oneflow.compatible.single_client as flow
        import oneflow.compatible.single_client.typing as tp


        def watch_handler(y: tp.Numpy):
            print("out", y)


        @flow.global_function()
        def kaiming_Job() -> None:
            init = flow.kaiming_initializer(shape=(3, 3), mode="fan_avg", nonlinearity="relu")
            blob = flow.get_variable(
                "blob-weight",
                shape=(3, 3),
                initializer=init,
                trainable=True,
            )
            flow.watch(blob, watch_handler)


        checkpoint = flow.train.CheckPoint()
        checkpoint.init()
        kaiming_Job()

        # out [[ 0.54521346  0.32585594  1.3474437 ]
        #      [ 0.30729076 -0.19158769  0.2709008 ]
        #      [-0.95830524 -0.05093324  0.28178614]]

    Example 2:

    .. code-block:: python

        import oneflow.compatible.single_client as flow
        import numpy as np
        import oneflow.compatible.single_client.typing as tp


        @flow.global_function()
        def conv2d_kaiming_Job(x: tp.Numpy.Placeholder((1, 256, 32, 32))) -> tp.Numpy:
            initializer = flow.kaiming_initializer(shape=(1, 256, 32, 32))
            conv2d = flow.layers.conv2d(
                x,
                filters=128,
                kernel_size=3,
                strides=1,
                padding="SAME",
                kernel_initializer=initializer,
                name="Conv2d",
            )
            return conv2d


        x = np.random.randn(1, 256, 32, 32).astype(np.float32)
        out = conv2d_kaiming_Job(x)

        # out.shape (1, 128, 32, 32)

    """
    assert isinstance(shape, (tuple, flow.Size))
    assert len(shape) >= 2
    elem_cnt = functools.reduce(lambda a, b: a * b, shape, 1)
    assert elem_cnt > 0
    assert distribution in ["random_normal", "random_uniform"]
    assert mode in ["fan_in", "fan_out", "fan_avg"]
    assert nonlinearity in [None, "tanh", "sigmoid", "relu", "leaky_relu"]
    assert data_format in ["NCHW", "NHWC"]
    fan = _CalcFan(shape, mode, _get_data_format(data_format))
    gain = CalcGain(nonlinearity, negative_slope)
    std = gain / math.sqrt(fan)
    if distribution == "random_normal":
        return flow.random_normal_initializer(0.0, std)
    elif distribution == "random_uniform":
        bound = math.sqrt(3.0) * std
        return flow.random_uniform_initializer(-bound, bound)
    else:
        raise NotImplementedError("Only support normal and uniform distribution")