def _get_state_dtype(self, *args):
  if self.params.state_dtype:
    return self.params.state_dtype
  if self.params.nested_map_fprop:
    inputs = args[0].Filter(lambda x: x is not None)
    return py_utils.Flatten(inputs)[0].dtype
  return args[0].dtype
def _get_input_shapes(self, *args):
  p = self.params
  if p.nested_map_fprop:
    assert len(args) == 1
    assert isinstance(args[0], py_utils.NestedMap)
    input_tensors = py_utils.Flatten(args[0])
  else:
    input_tensors = _ToTuple(args)
  # Get batch size from the first tensor which is not None.
  mini_batch_size = None
  for input_tensor in input_tensors:
    if input_tensor is not None:
      mini_batch_size = input_tensor.get_shape().as_list()[p.batch_dim]
  assert mini_batch_size is not None
  micro_batch_size = p.micro_batch_size
  if not micro_batch_size:
    if p.num_micro_batches > mini_batch_size:
      p.num_micro_batches = mini_batch_size
    micro_batch_size = mini_batch_size // p.num_micro_batches
  if mini_batch_size is not None:
    if micro_batch_size * p.num_micro_batches != mini_batch_size:
      raise ValueError('micro_batch_size * num_micro_batches != batch_size.')
  input_shapes = ()
  for input_tensor in input_tensors:
    if input_tensor is not None:
      input_shape = input_tensor.get_shape().as_list()
      input_shape[p.batch_dim] = micro_batch_size
      input_shapes += (tf.TensorShape(input_shape),)
    else:
      input_shapes += (None,)
  if p.nested_map_fprop:
    input_shapes = py_utils.Pack(args[0], input_shapes)
  return input_shapes
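# Illustrative sketch (not part of the original file; `_split_batch_shape` is a
# hypothetical helper): how the micro-batch arithmetic above plays out. A
# mini-batch of 32 split into num_micro_batches = 4 gives a micro-batch of
# 32 // 4 = 8 and the check 8 * 4 == 32 passes; a mini-batch of 30 with 4
# micro-batches would raise.
import tensorflow as tf


def _split_batch_shape(shape, batch_dim, num_micro_batches):
  """Returns the per-micro-batch shape, mirroring the logic above."""
  mini_batch_size = shape[batch_dim]
  micro_batch_size = mini_batch_size // num_micro_batches
  if micro_batch_size * num_micro_batches != mini_batch_size:
    raise ValueError('micro_batch_size * num_micro_batches != batch_size.')
  shape = list(shape)
  shape[batch_dim] = micro_batch_size
  return tf.TensorShape(shape)


# _split_batch_shape([32, 128], batch_dim=0, num_micro_batches=4)
# -> TensorShape([8, 128])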
def BuildDataSource(self, data_source_from_file_pattern_fn):
  """Read and return an input batch mixed from a list of file patterns.

  `p.file_patterns` is a list of file patterns, `p.weights` contains weights
  for each file pattern. If provided, `p.bprop_variable_filters` includes a
  bprop_variable_filter for each file pattern.

  Args:
    data_source_from_file_pattern_fn: a function that takes file_pattern as an
      argument and returns an input batch.

  Returns:
    A NestedMap containing:
      data: a tuple of tf.Tensor or `.NestedMap` of tf.Tensor
      source_selected: a tensor of size [batch_size, number of data sources]
      selected_bprop: a tensor of size [number of data sources]
      bprop_variable_filters: containing a list of bprop_variable filters for
        each source

  Raises:
    ValueError: If `p.sub` and `p.weights` have different lengths.
  """
  p = self.params
  if len(p.weights) != len(p.sub):
    raise ValueError(
        'Expected p.sub and p.weights to be the same length. '
        'Found %d sub, and %d weights' % (len(p.sub), len(p.weights)))

  def GetDatasourceFn(sub):

    def DatasourceFn():
      datasource = sub.BuildDataSource(data_source_from_file_pattern_fn)
      return datasource.data

    return DatasourceFn

  inputs = [GetDatasourceFn(sub) for sub in self.sub]
  if not p.bprop_variable_filters:
    bprop_variable_filters = [''] * len(inputs)
  else:
    bprop_variable_filters = p.bprop_variable_filters

  data_source, selected_bprop = py_utils.MixByWeight(
      inputs, p.weights, seed=p.random_seed)
  # TODO(neerajgaur): Remove _bprop_onehot and change code that uses it to
  # use source_selected from input_batch.
  batch_size = py_utils.GetShape(py_utils.Flatten(data_source)[0])[0]
  ret = py_utils.NestedMap()
  ret.data = data_source
  ret.bprop_variable_filters = bprop_variable_filters
  ret.selected_bprop = selected_bprop
  ret.source_selected = tf.tile(
      tf.expand_dims(selected_bprop, 0), [batch_size, 1])
  return ret
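# Illustrative sketch (standalone, not the lingvo API): the relationship
# between `selected_bprop` and `source_selected` built above. With two data
# sources and a batch of 3, the one-hot selection is tiled so every example in
# the batch carries the same per-source indicator.
import tensorflow as tf

selected_bprop = tf.constant([0., 1.])  # source 1 was sampled this step
batch_size = 3
source_selected = tf.tile(
    tf.expand_dims(selected_bprop, 0), [batch_size, 1])
# source_selected -> [[0., 1.], [0., 1.], [0., 1.]], shape [3, 2]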
def _VerifyChildren(self) -> None:
  """Verify all children created by this layer are via `CreateChild(ren)`."""
  created_children = py_utils.Flatten(self._private_children)
  for v in self._children_list:
    if v not in created_children:
      tf.logging.info([(child.params.name, type(child))
                       for child in created_children])
      raise ValueError(
          '%s is not created by BaseLayer.CreateChild(ren) in %r.' %
          (v.params.name, self))
def GetNext(self):
  p = self.params
  inputs = [sub.GetNext for sub in self.sub]
  data_source, self._selected_bprop = py_utils.MixByWeight(
      inputs, p.weights, seed=p.random_seed)
  # TODO(neerajgaur): Remove _bprop_onehot and change code that uses it to
  # use source_selected from input_batch.
  shape = py_utils.GetShape(py_utils.Flatten(data_source)[0])
  self._batch_size = shape[0] if shape != [] else 1  # pylint: disable=g-explicit-bool-comparison
  return data_source
def PostTrainingStepUpdate(self):
  """Returns a TF op which will be invoked at each training step.

  Subclasses of `BaseLayer` can implement this method. The method should
  return a TF op to be invoked during training after gradients are applied.
  """
  update_ops = [
      child.PostTrainingStepUpdate()
      for child in py_utils.Flatten(self._private_children)
  ]
  return tf.group(*update_ops)
def GetVariablesDict(self, visited=None):
  """Returns a dict of variables from the model and all its children."""
  if visited is None:
    visited = set()
  elif id(self) in visited:
    return {}
  visited.add(id(self))
  res = self._GetSelfVariablesDict()
  for child in py_utils.Flatten(self.children):
    res = py_utils.MergeDictsWithValueCheck(
        res, child.GetVariablesDict(visited))
  return res
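# Illustrative sketch (plain Python; `_Node` is a hypothetical class, not a
# lingvo layer): the visited-set pattern above prevents double-counting when
# the same child object is reachable from two parents.
class _Node:

  def __init__(self, name, children=()):
    self.name = name
    self.children = list(children)

  def collect(self, visited=None):
    if visited is None:
      visited = set()
    elif id(self) in visited:
      return {}
    visited.add(id(self))
    res = {self.name: self}
    for child in self.children:
      res.update(child.collect(visited))
    return res


shared = _Node('shared')
root = _Node('root', [_Node('a', [shared]), _Node('b', [shared])])
assert list(root.collect()) == ['root', 'a', 'shared', 'b']  # 'shared' counted once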
def AddNormSummary(name, vs_gs):
  """Creates summaries for the norms of vs and their gradients gs.

  Args:
    name: A name string for summary.
    vs_gs: A `.NestedMap` or a list of `.NestedMap` of (variable, gradient).

  Returns:
    The norm of the variables, and the norm of the gradients.
  """
  flatten = py_utils.Flatten(vs_gs)
  v_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
  scalar('var_norm/%s' % name, v_norm)
  g_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
  scalar('grad_norm/%s' % name, g_norm)
  return v_norm, g_norm
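# Illustrative sketch (assumption: py_utils.SumSquared adds up the sum of
# squared entries over a list of tensors). The same global norm can be written
# directly in plain TensorFlow as:
import tensorflow as tf


def _global_norm(tensors):
  """sqrt of the sum of squared entries across all tensors in the list."""
  return tf.sqrt(tf.add_n([tf.reduce_sum(tf.square(t)) for t in tensors]))


# e.g. _global_norm([tf.constant([3.0]), tf.constant([4.0])]) evaluates to 5.0.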
def ScaleGradients(self, var_grads, gradient_adjuster=None):
  """Scales gradients according to training params.

  Args:
    var_grads: a `.NestedMap` whose values are (var, grad) pairs.
    gradient_adjuster: if not None, a function that mutates a given var_grads.

  Returns:
    A `.NestedMap` containing:
    - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs, where
      gradients have already been scaled.
    - grad_scale: the gradient scale. 0 if gradient updates should be skipped
      for the step. (Optional, only returned in case global norm clipping is
      used.)
  """
  p = self.params

  # Computes gradients' norm and adds their summaries. Note that all_grad_norm
  # may be nan, which may cause grad_scale to be nan.
  for name, vg in var_grads.FlattenItems():
    summary_utils.AddNormSummary(
        py_utils.SanitizeScopeKey(name) + '/' + p.name, vg)
  flatten = py_utils.Flatten(var_grads)
  all_grad_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
  all_var_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
  grad_norm_is_nan_or_inf = tf.logical_or(
      tf.is_nan(all_grad_norm), tf.is_inf(all_grad_norm))

  # Optional gradient adjustment. Note that this happens after computing
  # all_grad_norm.
  if gradient_adjuster is not None:
    tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
    var_grads = gradient_adjuster(var_grads)

  # Handles NaN/Inf gradients.
  has_nan_or_inf = py_utils.HasNanOrInfGradient(var_grads)
  # Grad norm can still be inf even if none of the individual grad is inf.
  has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)
  self._AddEvalMetric('has_nan_or_inf', has_nan_or_inf, tf.constant(1.0))

  return_values = py_utils.NestedMap()
  if p.clip_gradient_single_norm_to_value:
    # Currently using both types of clipping simultaneously is unsupported.
    if p.clip_gradient_norm_to_value:
      raise ValueError(
          'Cannot use clip_gradient_single_norm_to_value=%f and '
          'clip_gradient_norm_to_value=%f.' %
          (p.clip_gradient_single_norm_to_value,
           p.clip_gradient_norm_to_value))
    final_var_grads = py_utils.ApplyGradNormClipping(
        var_grads, p.clip_gradient_single_norm_to_value)
  else:
    grad_scale = self._GetGlobalGradScale(all_grad_norm, has_nan_or_inf)
    self._AddEvalMetric('grad_norm/all', all_grad_norm, tf.constant(1.0))
    self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
    self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
    final_var_grads = py_utils.ApplyGradMultiplier(var_grads, grad_scale)
    return_values.grad_scale = grad_scale

  return_values.final_var_grads = final_var_grads
  return return_values
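# Illustrative sketch only: `_GetGlobalGradScale` is not shown here, so this is
# an assumed, typical global-norm gradient scale -- clip to `clip_norm` and
# zero out the step entirely when the norm is NaN/Inf -- not necessarily the
# exact lingvo implementation.
import tensorflow as tf


def _global_grad_scale(all_grad_norm, has_nan_or_inf, clip_norm=1.0):
  scale = tf.minimum(1.0, clip_norm / tf.maximum(all_grad_norm, 1e-30))
  # A NaN/Inf gradient turns the multiplier into 0, skipping the update.
  return scale * (1.0 - tf.cast(has_nan_or_inf, scale.dtype))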
def AddChild(self, name, children):
  """Add existing layer or layers as sublayer."""
  for child in py_utils.Flatten(children):
    assert isinstance(child, BaseLayer)
  self._CheckName(name)
  self._private_children[name] = children
def _BuildStackedRecurrentElman(self, seqlen, trailing_pad_len, batch, dims,
                                layers):
  tf.set_random_seed(342462)
  np.random.seed(32540)

  seqlen += trailing_pad_len
  dtype = tf.float64

  def CreateTheta():
    return py_utils.NestedMap(
        w=tf.constant(
            np.random.uniform(0, 0.2, (2 * dims, dims)), dtype=dtype),
        b=tf.constant(np.random.uniform(0, 0.2, (dims,)), dtype=dtype))

  def CreateState0():
    return py_utils.NestedMap(
        h=tf.constant(np.random.uniform(0, 0.2, (batch, dims)), dtype=dtype),
        padding=tf.constant([[0]] * batch, dtype=dtype))

  devices = ['/cpu:0'] * layers
  cell_fns = [Elman] * layers
  cell_grads = [ElmanGrad] * layers
  cell_outs = [ElmanOut] * layers
  cell_out_grads = [ElmanOutGrad] * layers
  thetas = [CreateTheta() for _ in range(layers)]
  init_states = [CreateState0() for _ in range(layers)]
  padding = np.zeros((seqlen, batch, 1))
  padding[-trailing_pad_len:, :, :] = 1.
  padding[-trailing_pad_len - 3:-trailing_pad_len - 1, :, :] = 1.
  inputs = py_utils.NestedMap(
      x=tf.constant(
          np.random.uniform(0, 0.2, (seqlen, batch, dims)), dtype=dtype),
      padding=tf.constant(padding, dtype=dtype))
  output, _ = recurrent.StackedRecurrent(
      devices=devices,
      cell_fns=cell_fns,
      cell_grads=cell_grads,
      cell_outs=cell_outs,
      cell_out_grads=cell_out_grads,
      thetas=thetas,
      init_states=init_states,
      inputs=inputs)
  o = output.x
  if 'padding' in inputs:
    o *= (1 - inputs.padding)
  loss = tf.reduce_sum(tf.square(o))

  xs = py_utils.Flatten(thetas + [py_utils.NestedMap(x=inputs.x)])
  dxs = tf.gradients(ys=loss, xs=xs)

  # Reference implementation using Recurrent().
  ref = inputs
  for i in range(layers):
    ref = ElmanOut(
        recurrent.Recurrent(
            cell_fn=cell_fns[i],
            cell_grad=cell_grads[i],
            theta=thetas[i],
            state0=init_states[i],
            inputs=ref)[0])
  return ref.x, output.x, loss, xs, dxs
def _GetSelfVariablesDict(self):
  """Returns a dict of variables from the model, excluding its children."""
  res = {}
  for var in py_utils.Flatten(self._private_vars.values()):
    res[var.name] = var
  return res