def __init__(self, params):
  super(BatchNormLayer, self).__init__(params)
  p = self.params
  assert p.name
  pc = py_utils.WeightParams(
      shape=[p.dim],
      init=py_utils.WeightInit.Constant(0.0),
      dtype=p.dtype,
      collections=[self.__class__.__name__ + '_vars'])
  with tf.variable_scope(p.name):
    if not p.use_moving_avg_in_training:
      self.CreateVariable('beta', pc)
      # Note, The real gamma to use is 1 + gamma.
      self.CreateVariable('gamma', pc, lambda x: 1.0 + x)

    # Two statistics.
    _, self._moving_mean = py_utils.CreateVariable(
        'moving_mean', pc, trainable=False)

    pc = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(1.0),
        dtype=p.dtype,
        collections=[self.__class__.__name__ + '_vars'])
    _, self._moving_variance = py_utils.CreateVariable(
        'moving_variance', pc, trainable=False)
  self._epsilon = 0.001
  self._decay = p.decay
def testRenamingRules(self):
  pc = py_utils.WeightParams([3, 3])
  with tf.variable_scope('model'):
    _, v1 = py_utils.CreateVariable('v1', pc)
    with py_utils.VariableRenameScope([('model/(.*)', 'data/%s')]):
      _, v2 = py_utils.CreateVariable('v2', pc)
    _, v3 = py_utils.CreateVariable('v3', pc)

  self.assertTrue(v1.name == 'model/v1/var:0')
  self.assertTrue(v2.name == 'data/v2/var:0')
  self.assertTrue(v3.name == 'model/v3/var:0')
def testNoConsting(self):
  with inference_graph_exporter.ConstGuaranteeScope():
    wp = py_utils.WeightParams(
        shape=[1],
        init=py_utils.WeightInit.Constant(0.0),
        dtype=tf.float32,
        collections=['v'])
    v = py_utils.CreateVariable('v', wp)
    self.assertEqual(tf.Tensor, type(v))
    with inference_graph_exporter.NoConstGuaranteeScope():
      v = py_utils.CreateVariable('v', wp, reuse=True)
      self.assertIsInstance(v, tf.Variable)
def testCreateVariableDifferentSeed(self):
  with self.session(use_gpu=False) as sess:
    tf.set_random_seed(3251343)
    pc = py_utils.WeightParams([2, 3], py_utils.WeightInit.Gaussian())
    with tf.variable_scope('layer0'):
      w0, _ = py_utils.CreateVariable('w', pc)
    with tf.variable_scope('layer1'):
      w1, _ = py_utils.CreateVariable('w', pc)
    sess.run(tf.global_variables_initializer())

    # w0_val and w1_val should be sufficiently different.
    w0_val, w1_val = sess.run([w0, w1])
    print(['diff = ', w0_val - w1_val])
    self.assertTrue(np.max(np.abs(w0_val - w1_val)) > 0.1)
def __init__(self, params):
  super(LinearModel, self).__init__(params)
  p = self.params
  with tf.variable_scope(p.name):
    w = py_utils.WeightParams(
        shape=[3],
        init=py_utils.WeightInit.Gaussian(scale=1.0, seed=123456),
        dtype=p.dtype)
    b = py_utils.WeightParams(
        shape=[],
        init=py_utils.WeightInit.Gaussian(scale=1.0, seed=234567),
        dtype=p.dtype)
    self._w, _ = py_utils.CreateVariable('w', w)
    self._b, _ = py_utils.CreateVariable('b', b)
def testCreateLocalTheta(self):
  methods = [py_utils.WeightInit.Gaussian, py_utils.WeightInit.Uniform]
  dtypes = [tf.float32, tf.complex64]
  shapes = [[2, 4], [3]]

  test_vars = py_utils.NestedMap()
  for i, (m, dt, sp) in enumerate(
      itertools.product(methods, dtypes, shapes)):
    pc = py_utils.WeightParams(sp, m(), dt, 'col1')
    test_vars['var_%d' % i] = py_utils.CreateVariable('var_%d' % i, pc)[0]

  test_devices = [
      '/job:worker/replica:0/device:GPU:0',
      '/job:worker/replica:0/device:GPU:1'
  ]

  sharded_local_vars = py_utils.CreateLocalTheta(test_vars, test_devices)
  sharded_local_vars_list = sharded_local_vars.Flatten()

  # assert the name is now Identity*
  for v in sharded_local_vars_list:
    self.assertTrue('Identity' in v.name)

  # assert proper device placement
  for i, v in enumerate(sharded_local_vars_list):
    expected_device = test_devices[i % len(test_devices)]
    self.assertEqual(v.device, expected_device)
def testCreateVariableNormal(self):
  with self.session(use_gpu=False, graph=tf.Graph()):
    tf.set_random_seed(832124)
    methods = [
        py_utils.WeightInit.Gaussian,
        py_utils.WeightInit.GaussianSqrtDim,
    ]
    dtypes = [tf.float32, tf.complex64]
    shapes = [[2, 3]]
    all_vars = []
    for i, (dt, m, sp) in enumerate(
        itertools.product(dtypes, methods, shapes)):
      pc = py_utils.WeightParams(sp, m(), dt)
      all_vars.append(py_utils.CreateVariable('var_%d' % i, pc)[0])

    v1_v_expted = [[-1.472208, 0.960204, -0.192588],
                   [-0.461884, 1.018134, 0.063719]]
    v2_v_expted = [[-0.862255, -0.688153, 0.82515],
                   [-0.07671, 0.613031, -0.020327]]
    v3_v_expted = [
        [1.005469 + 0.827639j, 1.249896 + 0.802671j, -0.026286 - 0.813836j],
        [0.865386 + 0.301172j, 0.876698 - 0.907293j, 1.996337 + 1.840192j],
    ]

    tf.global_variables_initializer().run()
    v1_v = all_vars[0].eval()
    v2_v = all_vars[1].eval()
    v3_v = all_vars[2].eval()
    self.assertAllClose(v1_v_expted, v1_v.tolist())
    self.assertAllClose(v2_v_expted, v2_v.tolist())
    self.assertAllClose(v3_v_expted, v3_v.tolist())
def CreateVariable(self, name, var_params, theta_fn=None, *args, **kwargs):
  """Create a variable of this layer according to the parameter `var_params`.

  E.g.::

      def __init__(self, ...):    # A layer's constructor
        self.CreateVariable(
            'weight', py_utils.WeightParams(shape=[100, 100]))

  `theta_fn` is used to apply a simple transformation on the created
  variable's value before it is used by the forward computation. E.g., to add
  the global variational noise according to this layer's parameter, one can
  do::

      def __init__(self, ...):    # A layer's constructor
        self.CreateVariable(
            name='weight',
            var_params=py_utils.WeightParams(shape=[100, 100]),
            theta_fn=self.AddGlobalVN)

  Args:
    name: Variable name which is used as the key into vars/theta.
    var_params: `Params` used to create the variable.
    theta_fn: A python function that takes a variable's value and returns a
      new value to be used later for computation. Its signature must be
      (tf.Tensor) -> (tf.Tensor).
    *args: List of args passed to `.py_utils.CreateVariable`.
    **kwargs: Keyword args passed to `.py_utils.CreateVariable`.
  """
  self._CheckName(name)
  value, var = py_utils.CreateVariable(name, var_params, *args, **kwargs)
  self._private_vars[name] = var
  if theta_fn is not None:
    value = theta_fn(value)
  self._private_theta[name] = value
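A minimal usage sketch based on the docstring above: a hypothetical layer whose constructor registers a weight with a theta_fn and whose FProp reads the transformed value back through theta. The class name, shape, and FProp body are illustrative assumptions, not taken from the snippet.

class ProjectionSketchLayer(base_layer.BaseLayer):
  """Hypothetical layer illustrating CreateVariable with a theta_fn."""

  def __init__(self, params):
    super(ProjectionSketchLayer, self).__init__(params)
    p = self.params
    with tf.variable_scope(p.name):
      # The raw variable is stored under self.vars['weight']; the value
      # passed through theta_fn (here, global variational noise) is what
      # ends up in theta.weight.
      self.CreateVariable(
          name='weight',
          var_params=py_utils.WeightParams(shape=[100, 100]),
          theta_fn=self.AddGlobalVN)

  def FProp(self, theta, inputs):
    # Uses the (noise-added) value registered by CreateVariable above.
    return tf.matmul(inputs, theta.weight)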
def _CreateVariableInternal(self, name, meta):
  """Immediately creates the variable described by `meta`.

  DO NOT OVERRIDE. For internal use only. Subclasses of BaseLayer should use
  self.CreateVariable() to create variables.

  Args:
    name: The variable name.
    meta: A CreateVariableMeta describing the variable to be created.
  """
  meta.kwargs.setdefault('default_seed', self.params.random_seed)
  var = py_utils.CreateVariable(name, meta.var_params, **meta.kwargs)
  self._private_vars[name] = var
  if self.cluster.params.worker.gpus_per_replica > 0:
    # On GPU (which always trains a single step per session.run()), reference
    # a tensor in FProp to cache it on device and avoid extraneous sends from
    # reading variables from ps multiple times.
    with tf.device(var.device):
      value = tf.identity(var)
  else:
    # Pass the resource variable directly into the training loop.
    value = var
  if meta.theta_fn is not None:
    value = meta.theta_fn(value)
  self._private_theta[name] = value
def testCreateVariableUniform(self):
  with self.session(use_gpu=False, graph=tf.Graph()):
    tf.set_random_seed(12345678)
    methods = [
        py_utils.WeightInit.Uniform,
        py_utils.WeightInit.UniformSqrtDim,
        py_utils.WeightInit.UniformUnitScaling,
    ]
    dtypes = [tf.float32, tf.complex64]
    shapes = [[2, 3]]
    all_vars = []
    for i, (dt, m, sp) in enumerate(
        itertools.product(dtypes, methods, shapes)):
      pc = py_utils.WeightParams(sp, m(0.1), dt)
      all_vars.append(py_utils.CreateVariable('var_%d' % i, pc)[0])

    v1_v_expted = [[0.069674, -0.072278, -0.021777],
                   [-0.052155, -0.050274, 0.086218]]
    v2_v_expted = [[0.005361, 0.036109, -0.036575],
                   [0.058314, 0.031438, 0.049196]]
    v4_v_expted = [
        [0.015448 + 0.068295j, -0.098710 - 0.054435j, 0.037030 - 0.048017j],
        [-0.047435 + 0.035301j, 0.041994 + 0.000279j, -0.029097 + 0.084902j],
    ]

    tf.global_variables_initializer().run()
    v1_v = all_vars[0].eval()
    v2_v = all_vars[1].eval()
    v4_v = all_vars[3].eval()
    self.assertAllClose(v1_v_expted, v1_v.tolist())
    self.assertAllClose(v2_v_expted, v2_v.tolist())
    self.assertAllClose(v4_v_expted, v4_v.tolist())
def testCreateVariableException(self):
  with self.session(use_gpu=False, graph=tf.Graph()):
    tf.set_random_seed(832124)
    pc = py_utils.WeightParams([2, 3], py_utils.WeightInit.Gaussian())
    var1 = py_utils.CreateVariable('var1', pc)[0]

    tf.get_variable_scope().reuse_variables()
    # Reuses an existing variable.
    var2 = py_utils.CreateVariable('var1', pc)[0]

    # An exception should be thrown in this case.
    pc = py_utils.WeightParams([2, 3], py_utils.WeightInit.Gaussian(2.0))
    with self.assertRaises(AssertionError):
      py_utils.CreateVariable('var1', pc)

    tf.global_variables_initializer().run()
    self.assertAllEqual(var1.eval(), var2.eval())
def __init__(self, name):
  self._name = name
  _, self._var = py_utils.CreateVariable(
      name=name,
      params=py_utils.WeightParams([], py_utils.WeightInit.Constant(0),
                                   tf.int64),
      trainable=False)
  self._value = self._var.value() + 0  # Makes a copy.
def _CreateQStateVar(self, t_name, suffix, params):
  name = t_name + '_' + suffix
  assert name not in self._qvars, 'QState var already exists: %s' % name
  var_name = self._qvars_scope.name + '/' + name
  with tf.variable_scope(py_utils.GetGlobalVariableScope()):
    v = py_utils.CreateVariable(var_name, params, trainable=False)
  self._qvars[name] = v
  return v
def __init__(self, params):
  super(MergerLayer, self).__init__(params)
  p = self.params
  if not p.name:
    raise ValueError('Layer must have a specified name!')
  if p.merger_op not in set(self.MERGER_OPS):
    raise ValueError('Merger op must be one of: ', self.MERGER_OPS)

  if p.merger_op == 'atten':
    atten_params = p.attention_tpl.Copy()
    atten_params.source_dim = p.source_dim
    atten_params.query_dim = p.query_dim
    atten_params.hidden_dim = p.hidden_dim
    atten_params.dtype = p.dtype
    if atten_params.params_init is None:
      atten_params.params_init = py_utils.WeightInit.Gaussian(
          1. / math.sqrt(atten_params.source_dim + atten_params.query_dim))
    self.CreateChild('atten', atten_params)

  if p.pre_proj_input_dims:
    if not p.pre_proj_output_dim:
      raise ValueError('Output dim should be specified for projection.')
    pre_proj_params = []
    for i, pre_proj_dim in enumerate(p.pre_proj_input_dims):
      proj_p = p.proj_tpl.Copy()
      proj_p.name = 'merger_pre_proj_%d' % i
      proj_p.input_dim = pre_proj_dim
      proj_p.output_dim = p.pre_proj_output_dim
      pre_proj_params.append(proj_p)
    self.CreateChildren('pre_proj', pre_proj_params)

  if p.merger_op == 'weighted_sum':
    assert p.num_sources > 0, ('For merger_op=weighted_sum, must specify '
                               'num_sources > 0.')
    params_init = py_utils.WeightInit.Constant(1.0 / p.num_sources)
    # Weights to be learned.
    pw = py_utils.WeightParams(
        shape=[p.num_sources],
        init=params_init,
        dtype=p.dtype,
        collections=[self.__class__.__name__ + '_vars'])
    with tf.variable_scope(p.name):
      _, self._sum_weight = py_utils.CreateVariable('sum_weight', pw)

  if p.merger_op == 'gated_avg':
    assert p.num_sources > 0, ('For merger_op=gated_avg, must specify '
                               'num_sources > 0.')
    params = p.gated_avg_tpl.Copy()
    params.name = 'g_avg_merger'
    params.num_nodes = p.source_dim
    params.num_inputs = p.num_sources
    self.CreateChild('gated_average', params)
def CreateTaskGlobalStep(params, task_name):
  """Create if needed and return the global_step."""
  with tf.name_scope(None), tf.variable_scope(
      py_utils.global_variable_scope):
    graph_collections = [tf.GraphKeys.GLOBAL_VARIABLES, 'TASK_GLOBAL_STEP']
    _, v = py_utils.CreateVariable(
        name=task_name + '_global_step',
        params=py_utils.WeightParams([], py_utils.WeightInit.Constant(0),
                                     tf.int64),
        trainable=False,
        collections=graph_collections)
    summary_utils.scalar(params, v.name, v)
    return v
def __init__(self, params):
  super().__init__(params)
  p = self.params

  with tf.variable_scope(p.name):
    wp = py_utils.WeightParams(
        shape=[],
        init=py_utils.WeightInit.Constant(1.0),
        collections=['DevBasedSchedule_vars'],
        dtype=tf.float32)
    self._cur_factor = py_utils.CreateVariable(
        'cur_factor', wp, trainable=False)
    wp = py_utils.WeightParams(
        shape=[],
        init=py_utils.WeightInit.Constant(0),
        collections=['DevBasedSchedule_vars'],
        dtype=tf.int64)
    self._ref_step = py_utils.CreateVariable('ref_step', wp, trainable=False)
    self._metric_history = early_stop.MetricHistory(p.metric_history)
    self._best_step = ops.best_step(self._metric_history.hist_file,
                                    p.tolerance)
def testCreateVariableBasics(self):
  with self.session(use_gpu=False, graph=tf.Graph()):
    methods = [
        py_utils.WeightInit.Gaussian,
        py_utils.WeightInit.Uniform,
        py_utils.WeightInit.Constant,
        py_utils.WeightInit.TruncatedGaussian,
        py_utils.WeightInit.GaussianSqrtDim,
        py_utils.WeightInit.UniformSqrtDim,
        py_utils.WeightInit.UniformUnitScaling,
        py_utils.WeightInit.TruncatedGaussianSqrtDim,
    ]
    dtypes = [tf.float32, tf.float64, tf.complex64]
    shapes = [[], [3], [2, 4]]
    collections = ['col1', 'col2']

    all_vars = []
    for i, (m, dt, sp) in enumerate(
        itertools.product(methods, dtypes, shapes)):
      pc = py_utils.WeightParams(sp, m(), dt, collections)
      all_vars.append(py_utils.CreateVariable('var_%d' % i, pc)[0])

    # To reuse existing variables.
    tf.get_variable_scope().reuse_variables()

    self.assertEqual(len(tf.all_variables()), len(all_vars))

    all_vars_copy = []
    for i, (m, dt, sp) in enumerate(
        itertools.product(methods, dtypes, shapes)):
      pc = py_utils.WeightParams(sp, m(), dt, collections)
      all_vars_copy.append(py_utils.CreateVariable('var_%d' % i, pc)[0])

    tf.global_variables_initializer().run()
    for v1, v2 in zip(all_vars, all_vars_copy):
      v1_v = v1.eval()
      v2_v = v2.eval()
      self.assertAllEqual(v1_v, v2_v)
def CreateVariable(self, name: str, var_params: hyperparams.Params,
                   **kwargs) -> None:
  """Create a variable of this layer according to the parameter `var_params`.

  E.g.::

      def __init__(self, ...):    # A layer's constructor
        self.CreateVariable(
            'weight', py_utils.WeightParams(shape=[100, 100]))

  Args:
    name: Variable name which is used as the key into vars/theta.
    var_params: `Params` used to create the variable.
    **kwargs: Keyword args passed to `.py_utils.CreateVariable`.
  """
  kwargs.setdefault('default_seed', self.params.random_seed)
  if self.params.device_mesh is not None:
    if (len([dim for dim in var_params.shape if dim > 1]) > 1 and
        var_params.tensor_split_dims_mapping is None):
      tf.logging.warning(
          'tensor_split_dims_mapping missing for %s.%s: shape=%s',
          self.path, name, var_params.shape)
  self._CheckName(name)
  if (self.params.skip_lp_regularization and
      py_utils.SKIP_LP_REGULARIZATION not in var_params.collections):
    var_params = py_utils.WeightParams(
        shape=var_params.shape,
        dtype=var_params.dtype,
        init=var_params.init,
        collections=(var_params.collections +
                     [py_utils.SKIP_LP_REGULARIZATION]))
  self._var_symbolic_shape_map[name] = var_params.shape
  var = py_utils.CreateVariable(name, var_params, **kwargs)
  self._private_vars[name] = var
  if py_utils.IsEagerMode():
    # With eager trainer, always use the variable directly.
    value = var
  else:
    if self.cluster.params.worker.gpus_per_replica > 0:
      # On GPU (which always trains a single step per session.run()),
      # reference a tensor in FProp to cache it on device and avoid
      # extraneous sends from reading variables from ps multiple times.
      with tf.device(var.device):
        value = tf.identity(var, name=name)
    else:
      value = var
  self._private_theta[name] = value
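A hedged sketch of how keyword arguments flow through this method: anything beyond name and var_params (for example trainable=False, as in several other snippets here) is forwarded to py_utils.CreateVariable. The layer class, variable names, and the fact that the calls happen in the constructor are assumptions for illustration; depending on the framework version, variable creation may instead belong in a dedicated variable-creation hook.

class CounterSketchLayer(base_layer.BaseLayer):
  """Hypothetical layer mixing trainable and non-trainable variables."""

  def __init__(self, params):
    super().__init__(params)
    p = self.params
    # Trainable weight; the skip_lp_regularization and device_mesh handling
    # in CreateVariable above applies to this call.
    self.CreateVariable(
        'weight', py_utils.WeightParams(shape=[4, 4], dtype=p.dtype))
    # Non-trainable step counter; trainable=False is passed through **kwargs
    # to py_utils.CreateVariable.
    self.CreateVariable(
        'steps',
        py_utils.WeightParams([], py_utils.WeightInit.Constant(0),
                              tf.int64),
        trainable=False)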
def _Acc(vg):
  """Updating accumulators."""
  v, g = vg
  with tf.variable_scope(v.op.name):
    _, a = py_utils.CreateVariable(
        'grad_accumulator',
        py_utils.WeightParams(v.get_shape(),
                              py_utils.WeightInit.Constant(0.0),
                              self.params.dtype),
        trainable=False)
    a = tf.assign_add(a, g)

  return py_utils.VarGrad(v, a)
def CreateVariable(self, name, var_params, theta_fn=None, *args, **kwargs):
  """Create a variable of this layer according to the parameter `var_params`.

  E.g.::

      def __init__(self, ...):    # A layer's constructor
        self.CreateVariable(
            'weight', py_utils.WeightParams(shape=[100, 100]))

  `theta_fn` is used to apply a simple transformation on the created
  variable's value before it is used by the forward computation. E.g., to add
  the global variational noise according to this layer's parameter, one can
  do::

      def __init__(self, ...):    # A layer's constructor
        self.CreateVariable(
            name='weight',
            var_params=py_utils.WeightParams(shape=[100, 100]),
            theta_fn=self.AddGlobalVN)

  Args:
    name: Variable name which is used as the key into vars/theta.
    var_params: `Params` used to create the variable.
    theta_fn: A python function that takes a variable's value and returns a
      new value to be used later for computation. Its signature must be
      (tf.Tensor) -> (tf.Tensor).
    *args: List of args passed to `.py_utils.CreateVariable`.
    **kwargs: Keyword args passed to `.py_utils.CreateVariable`.
  """
  self._CheckName(name)
  if (self.params.skip_lp_regularization and
      py_utils.SKIP_LP_REGULARIZATION not in var_params.collections):
    var_params = py_utils.WeightParams(
        shape=var_params.shape,
        dtype=var_params.dtype,
        init=var_params.init,
        collections=(var_params.collections +
                     [py_utils.SKIP_LP_REGULARIZATION]))
  self._var_symbolic_shape_map[name] = var_params.shape
  if (var_params.shape and
      any(symbolic.IsExpr(dim) for dim in var_params.shape)):
    var_params.shape = symbolic.EvalExpr(var_params.shape)
  value, var = py_utils.CreateVariable(name, var_params, *args, **kwargs)
  self._private_vars[name] = var
  if theta_fn is not None:
    value = theta_fn(value)
  self._private_theta[name] = value
def _CreateVariable(self, name, meta):
  """Immediately creates the variable described by `meta`.

  DO NOT OVERRIDE. For internal use only. Subclasses of BaseLayer should use
  self.CreateVariable() to create variables.

  Args:
    name: The variable name.
    meta: A CreateVariableMeta describing the variable to be created.
  """
  with tf.variable_scope(meta.var_scope):
    meta.kwargs.setdefault('default_seed', self.params.random_seed)
    value, var = py_utils.CreateVariable(name, meta.var_params, **meta.kwargs)
    self._private_vars[name] = var
    if meta.theta_fn is not None:
      value = meta.theta_fn(value)
    self._private_theta[name] = value
def _Acc(vg):
  """Updating accumulators."""
  v, g = vg
  scope_name = v.name
  if scope_name.endswith(':0'):
    scope_name = scope_name[:-2]
  with tf.variable_scope(scope_name):
    a = py_utils.CreateVariable(
        'grad_accumulator',
        py_utils.WeightParams(v.get_shape(),
                              py_utils.WeightInit.Constant(0.0),
                              self.params.dtype),
        trainable=False)
    a = tf.assign_add(a, g)

  return py_utils.VarGrad(v, a)
def testXavier3D(self):
  with self.session(use_gpu=False, graph=tf.Graph()):
    tf.set_random_seed(1618)
    methods = [py_utils.WeightInit.Xavier]
    dtypes = [tf.float32, tf.float16, tf.complex64]
    shapes = [[1, 1, 2]]
    all_vars = []
    for i, (m, dt, sp) in enumerate(
        itertools.product(methods, dtypes, shapes)):
      pc = py_utils.WeightParams(sp, m(), dt)
      all_vars.append(py_utils.CreateVariable('var_%d' % i, pc)[0])

    v1_v_expted = [[[1.357139, -1.23832]]]

    tf.global_variables_initializer().run()
    v1_v = all_vars[0].eval()
    self.assertAllClose(v1_v_expted, v1_v.tolist())
def testOpportunisticReuse(self):
  pc = py_utils.WeightParams([3, 3])
  _, v1 = py_utils.CreateVariable('v1', pc)
  with self.assertRaises(Exception):
    _ = py_utils.CreateVariable('v1', pc)
  with py_utils.OpportunisticVariableReuseScope(True):
    _, v2 = py_utils.CreateVariable('v1', pc)
    _, x1 = py_utils.CreateVariable('x1', pc)
    with py_utils.OpportunisticVariableReuseScope(False):
      with self.assertRaises(Exception):
        _ = py_utils.CreateVariable('v1', pc)
    _, v3 = py_utils.CreateVariable('v1', pc)
  with self.assertRaises(Exception):
    _ = py_utils.CreateVariable('v1', pc)

  for v in [v2, v3]:
    self.assertTrue(v1 is v)
  self.assertTrue(v1 is not x1)
def _CreateVariableInternal(self, name, meta):
  """Immediately creates the variable described by `meta`.

  DO NOT OVERRIDE. For internal use only. Subclasses of BaseLayer should use
  self.CreateVariable() to create variables.

  Args:
    name: The variable name.
    meta: A CreateVariableMeta describing the variable to be created.
  """
  meta.kwargs.setdefault('default_seed', self.params.random_seed)
  var = py_utils.CreateVariable(name, meta.var_params, **meta.kwargs)
  self._private_vars[name] = var
  if FLAGS.no_identity_on_vars:
    value = var
  else:
    with tf.device(var.device):
      value = tf.identity(var)
  if meta.theta_fn is not None:
    value = meta.theta_fn(value)
  self._private_theta[name] = value
def _CreateVariableInternal(self, name: str,
                            meta: CreateVariableMeta) -> None:
  """Immediately creates the variable described by `meta`.

  DO NOT OVERRIDE. For internal use only. Subclasses of BaseLayer should use
  self.CreateVariable() to create variables.

  Args:
    name: The variable name.
    meta: A CreateVariableMeta describing the variable to be created.
  """
  meta.kwargs.setdefault('default_seed', self.params.random_seed)
  var = py_utils.CreateVariable(name, meta.var_params, **meta.kwargs)
  self._private_vars[name] = var
  if self.cluster.params.worker.gpus_per_replica > 0:
    # On GPU (which always trains a single step per session.run()), reference
    # a tensor in FProp to cache it on device and avoid extraneous sends from
    # reading variables from ps multiple times.
    with tf.device(var.device):
      value = tf.identity(var)
  else:
    # Pass the resource variable directly into the training loop.
    value = var
  # Due to b/174956514, we have to annotate the use of the variable once,
  # otherwise, the sharding annotation on the var will be ignored.
  # TODO(yonghui): Get rid of this once b/174956514 is fixed.
  if (meta.var_params.device_mesh is not None and
      var.shape.rank == len(meta.var_params.tensor_split_dims_mapping)):
    value = gshard_utils.MeshSplit(
        value,
        meta.var_params.device_mesh,
        meta.var_params.tensor_split_dims_mapping,
        use_sharding_op=True)
  if meta.theta_fn is not None:
    self._private_theta_fn[name] = meta.theta_fn
  self._private_theta[name] = value
def testXavier(self):
  with self.session(use_gpu=False, graph=tf.Graph()):
    tf.set_random_seed(1618)
    methods = [py_utils.WeightInit.Xavier]
    dtypes = [tf.float32, tf.float16, tf.complex64]
    shapes = [[2, 3]]
    all_vars = []
    for i, (m, dt, sp) in enumerate(
        itertools.product(methods, dtypes, shapes)):
      pc = py_utils.WeightParams(sp, m(), dt)
      all_vars.append(py_utils.CreateVariable('var_%d' % i, pc)[0])

    v1_v_expted = [[1.051236, -0.959198, 0.796091],
                   [-0.685691, 0.230933, -1.006293]]
    v3_v_expted = [
        [0.149996 - 0.064369j, 0.689145 + 0.017257j, -0.502070 - 0.367683j],
        [0.519782 + 0.470412j, 0.738902 - 0.054006j, 0.028603 + 0.471832j],
    ]

    tf.global_variables_initializer().run()
    v1_v = all_vars[0].eval()
    v3_v = all_vars[2].eval()
    self.assertAllClose(v1_v_expted, v1_v.tolist())
    self.assertAllClose(v3_v_expted, v3_v.tolist())