Example #1
  def _get_state_dtype(self, *args):
    if self.params.state_dtype:
      return self.params.state_dtype
    if self.params.nested_map_fprop:
      inputs = args[0].Filter(lambda x: x is not None)
      return py_utils.Flatten(inputs)[0].dtype
    return args[0].dtype
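A minimal sketch of the NestedMap branch above (hypothetical tensors, assuming a standard Lingvo setup): Filter drops the None leaves, Flatten linearizes the remaining tensors into a list, and the dtype of the first entry is used as the state dtype.

import tensorflow as tf
from lingvo.core import py_utils

inputs = py_utils.NestedMap(
    ids=tf.zeros([2, 3], dtype=tf.int32),
    paddings=None,
    embs=tf.zeros([2, 3, 4], dtype=tf.float32))
non_none = inputs.Filter(lambda x: x is not None)   # drops the None leaf
state_dtype = py_utils.Flatten(non_none)[0].dtype   # dtype of the first remaining tensor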
Example #2
  def _get_input_shapes(self, *args):
    p = self.params
    if p.nested_map_fprop:
      assert len(args) == 1
      assert isinstance(args[0], py_utils.NestedMap)
      input_tensors = py_utils.Flatten(args[0])
    else:
      input_tensors = _ToTuple(args)
    # Get the batch size from an input tensor that is not None.
    mini_batch_size = None
    for input_tensor in input_tensors:
      if input_tensor is not None:
        mini_batch_size = input_tensor.get_shape().as_list()[p.batch_dim]
    assert mini_batch_size is not None
    micro_batch_size = p.micro_batch_size
    if not micro_batch_size:
      if p.num_micro_batches > mini_batch_size:
        p.num_micro_batches = mini_batch_size
      micro_batch_size = mini_batch_size // p.num_micro_batches
    if mini_batch_size is not None:
      if micro_batch_size * p.num_micro_batches != mini_batch_size:
        raise ValueError('micro_batch_size * num_micro_batches != batch_size.')

    input_shapes = ()
    for input_tensor in input_tensors:
      if input_tensor is not None:
        input_shape = input_tensor.get_shape().as_list()
        input_shape[p.batch_dim] = micro_batch_size
        input_shapes += (tf.TensorShape(input_shape),)
      else:
        input_shapes += (None,)

    if p.nested_map_fprop:
      input_shapes = py_utils.Pack(args[0], input_shapes)
    return input_shapes
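A small worked example of the shape arithmetic above, with hypothetical sizes: a mini-batch of 32 split into 4 micro-batches gives a micro-batch size of 8, and each input shape has its batch dimension rewritten to that value.

import tensorflow as tf

mini_batch_size, num_micro_batches = 32, 4                       # hypothetical values
micro_batch_size = mini_batch_size // num_micro_batches          # 8
assert micro_batch_size * num_micro_batches == mini_batch_size   # mirrors the ValueError check

full_shape = [mini_batch_size, 10, 128]   # e.g. [batch, time, dim] with p.batch_dim == 0
micro_shape = list(full_shape)
micro_shape[0] = micro_batch_size
print(tf.TensorShape(micro_shape))        # (8, 10, 128)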
Example #3
    def BuildDataSource(self, data_source_from_file_pattern_fn):
        """Read and return input batch from a p.file_pattern list.

    `p.file_patterns` is a list of file patterns, `p.weights` contains
    weights for each file pattern.  If provided `p.bprop_variable_filters`
    includes a bprop_variable_filter for each file pattern.

    Args:
      data_source_from_file_pattern_fn: a function that takes file_pattern as an
        argument and returns an input batch.

    Returns:
      A NestedMap containing:
        data: a tuple of tf.Tensor or `.NestedMap` of tf.Tensor
        source_selected: a tensor of size [batch_size, number of data sources]
        selected_bprop: a tensor of size [number of data sources]
        bprop_variable_filters: containing a list of bprop_variable filters for
        each source

    Raises:
      ValueError: If unknown token type.
    """
        p = self.params

        if len(p.weights) != len(p.sub):
            raise ValueError(
                'Expected p.sub and p.weights to be the same length. '
                'Found %d sub, and %d weights' % (len(p.sub), len(p.weights)))

        def GetDatasourceFn(sub):
            def DatasourceFn():
                datasource = sub.BuildDataSource(
                    data_source_from_file_pattern_fn)
                return datasource.data

            return DatasourceFn

        inputs = [GetDatasourceFn(sub) for sub in self.sub]
        if not p.bprop_variable_filters:
            bprop_variable_filters = [''] * len(inputs)
        else:
            bprop_variable_filters = p.bprop_variable_filters

        data_source, selected_bprop = py_utils.MixByWeight(inputs,
                                                           p.weights,
                                                           seed=p.random_seed)
        # TODO(neerajgaur): Remove _bprop_onehot and change code that uses it to
        # use source_selected from input_batch.
        batch_size = py_utils.GetShape(py_utils.Flatten(data_source)[0])[0]
        ret = py_utils.NestedMap()
        ret.data = data_source
        ret.bprop_variable_filters = bprop_variable_filters
        ret.selected_bprop = selected_bprop
        ret.source_selected = tf.tile(tf.expand_dims(selected_bprop, 0),
                                      [batch_size, 1])
        return ret
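A minimal sketch (hypothetical values) of the source_selected broadcast at the end of the method: the one-hot selection over data sources is tiled so every example in the batch carries the same row.

import tensorflow as tf

selected_bprop = tf.constant([0., 1., 0.])   # hypothetical one-hot over 3 data sources
batch_size = 4
source_selected = tf.tile(tf.expand_dims(selected_bprop, 0), [batch_size, 1])
# source_selected has shape [4, 3]: one identical row per example in the batch.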
Example #4
  def _VerifyChildren(self) -> None:
    """Verify that all children of this layer were created via `CreateChild(ren)`."""
    created_children = py_utils.Flatten(self._private_children)
    for v in self._children_list:
      if v not in created_children:
        tf.logging.info([(child.params.name, type(child))
                         for child in created_children])
        raise ValueError(
            '%s is not created by BaseLayer.CreateChild(ren) in %r.' %
            (v.params.name, self))
Example #5
    def GetNext(self):
        p = self.params
        # Note: passes the bound GetNext methods (callables) to MixByWeight.
        inputs = [sub.GetNext for sub in self.sub]

        data_source, self._selected_bprop = py_utils.MixByWeight(
            inputs, p.weights, seed=p.random_seed)
        # TODO(neerajgaur): Remove _bprop_onehot and change code that uses it to
        # use source_selected from input_batch.
        shape = py_utils.GetShape(py_utils.Flatten(data_source)[0])
        self._batch_size = shape[0] if shape != [] else 1  # pylint: disable=g-explicit-bool-comparison
        return data_source
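A minimal sketch of the batch-size fallback above, assuming Lingvo's py_utils.GetShape: for a rank >= 1 tensor the leading dimension is used, while a scalar tensor yields an empty shape list and the batch size defaults to 1.

import tensorflow as tf
from lingvo.core import py_utils

x = tf.zeros([8, 16])
shape = py_utils.GetShape(x)            # [8, 16]
batch = shape[0] if shape != [] else 1  # 8; a scalar input would give shape == [] and batch == 1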
Example #6
    def PostTrainingStepUpdate(self):
        """Returns a TF op which will be invoked at each training step.

    Subclasses of `BaseLayer` can implement this method. The method should
    return a TF op to be invoked during training after gradients are applied.
    """
        update_ops = [
            child.PostTrainingStepUpdate()
            for child in py_utils.Flatten(self._private_children)
        ]
        return tf.group(*update_ops)
Example #7
    def GetVariablesDict(self, visited=None):
        """Returns a dict of variables from the model and all its children."""
        if visited is None:
            visited = set()
        elif id(self) in visited:
            return {}
        visited.add(id(self))

        res = self._GetSelfVariablesDict()
        for child in py_utils.Flatten(self.children):
            res = py_utils.MergeDictsWithValueCheck(
                res, child.GetVariablesDict(visited))

        return res
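A self-contained sketch of the visited-set pattern above, using a hypothetical _Node stand-in and a plain dict.update instead of MergeDictsWithValueCheck: sharing one visited set across the recursion ensures a child reachable through two parents is collected only once.

class _Node:
  """A tiny stand-in for a layer with named children."""

  def __init__(self, name, children=()):
    self.name = name
    self.children = list(children)

  def Collect(self, visited=None):
    if visited is None:
      visited = set()
    elif id(self) in visited:
      return {}
    visited.add(id(self))
    res = {self.name: self}
    for child in self.children:
      res.update(child.Collect(visited))
    return res

shared = _Node('shared')
root = _Node('root', [_Node('a', [shared]), _Node('b', [shared])])
assert set(root.Collect()) == {'root', 'a', 'b', 'shared'}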
Example #8
def AddNormSummary(name, vs_gs):
    """"Returns and creates summary for norms of vs and their gradients gs.

  Args:
    name: A name string for summary.
    vs_gs: A `.NestedMap` or a list of `.NestedMap` of (variable, gradient).

  Returns:
    norm of variables, and norm of gradients.
  """
    flatten = py_utils.Flatten(vs_gs)
    v_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
    scalar('var_norm/%s' % name, v_norm)
    g_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
    scalar('grad_norm/%s' % name, g_norm)
    return v_norm, g_norm
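For reference, the quantity summarized above is the global L2 norm of the flattened list: the square root of the summed squares over all tensors, equivalent to tf.linalg.global_norm. A minimal sketch with hypothetical tensors:

import tensorflow as tf

vs = [tf.constant([3.0, 0.0]), tf.constant([[4.0]])]
v_norm = tf.sqrt(sum(tf.reduce_sum(tf.square(v)) for v in vs))  # sqrt(9 + 16) == 5.0
# Same value as tf.linalg.global_norm(vs).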
Example #9
    def ScaleGradients(self, var_grads, gradient_adjuster=None):
        """Scales gradients according to training params.

    Args:
      var_grads: a `.NestedMap` whose values are (var, grad) pairs.
      gradient_adjuster: if not None, a function that mutates a given var_grads.

    Returns:
      A `.NestedMap` containing

      - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs,
        where gradients have already been scaled.
      - grad_scale: the gradient scale. 0 if gradient updates should be skipped
        for the step. (Optional, only returned in case global norm clipping is
        used.)
    """
        p = self.params

        # Computes gradients' norm and adds their summaries. Note that all_grad_norm
        # may be nan, which may cause grad_scale to be nan.
        for name, vg in var_grads.FlattenItems():
            summary_utils.AddNormSummary(
                py_utils.SanitizeScopeKey(name) + '/' + p.name, vg)
        flatten = py_utils.Flatten(var_grads)
        all_grad_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
        all_var_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
        grad_norm_is_nan_or_inf = tf.logical_or(tf.is_nan(all_grad_norm),
                                                tf.is_inf(all_grad_norm))

        # Optional gradient adjustment. Note that this happens after computing
        # all_grad_norm.
        if gradient_adjuster is not None:
            tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
            var_grads = gradient_adjuster(var_grads)

        # Handles NaN/Inf gradients.
        has_nan_or_inf = py_utils.HasNanOrInfGradient(var_grads)
        # Grad norm can still be inf even if none of the individual grads is inf.
        has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)
        self._AddEvalMetric('has_nan_or_inf', has_nan_or_inf, tf.constant(1.0))

        return_values = py_utils.NestedMap()
        if p.clip_gradient_single_norm_to_value:
            # Currently using both types of clipping simultaneously is unsupported.
            if p.clip_gradient_norm_to_value:
                raise ValueError(
                    'Cannot use clip_gradient_single_norm_to_value=%f and '
                    'clip_gradient_norm_to_value=%f.' %
                    (p.clip_gradient_single_norm_to_value,
                     p.clip_gradient_norm_to_value))
            final_var_grads = py_utils.ApplyGradNormClipping(
                var_grads, p.clip_gradient_single_norm_to_value)

        else:
            grad_scale = self._GetGlobalGradScale(all_grad_norm,
                                                  has_nan_or_inf)
            self._AddEvalMetric('grad_norm/all', all_grad_norm,
                                tf.constant(1.0))
            self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
            self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
            final_var_grads = py_utils.ApplyGradMultiplier(
                var_grads, grad_scale)
            return_values.grad_scale = grad_scale

        return_values.final_var_grads = final_var_grads
        return return_values
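A hedged sketch of a typical global-norm gradient scale; the actual _GetGlobalGradScale in Lingvo may differ. The scale clips the update to clip_norm / max(norm, clip_norm) and drops to zero when the norm is NaN or Inf, matching the "skip the step" semantics in the docstring.

import tensorflow as tf

def _GlobalGradScaleSketch(all_grad_norm, has_nan_or_inf, clip_norm=1.0):
  # Standard global-norm clipping: scale <= 1.0, shrinking updates whose norm
  # exceeds clip_norm.
  scale = clip_norm / tf.maximum(all_grad_norm, clip_norm)
  # A zero scale skips the parameter update for this step entirely.
  return tf.where(has_nan_or_inf, tf.zeros_like(scale), scale)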
Example #10
  def AddChild(self, name, children):
    """Adds an existing layer or layers as a sublayer."""
    for child in py_utils.Flatten(children):
      assert isinstance(child, BaseLayer)
    self._CheckName(name)
    self._private_children[name] = children
Example #11
    def _BuildStackedRecurrentElman(self, seqlen, trailing_pad_len, batch,
                                    dims, layers):
        tf.set_random_seed(342462)
        np.random.seed(32540)

        seqlen += trailing_pad_len
        dtype = tf.float64

        def CreateTheta():
            return py_utils.NestedMap(
                w=tf.constant(np.random.uniform(0, 0.2, (2 * dims, dims)),
                              dtype=dtype),
                b=tf.constant(np.random.uniform(0, 0.2, (dims, )),
                              dtype=dtype))

        def CreateState0():
            return py_utils.NestedMap(h=tf.constant(np.random.uniform(
                0, 0.2, (batch, dims)),
                                                    dtype=dtype),
                                      padding=tf.constant([[0]] * batch,
                                                          dtype=dtype))

        devices = ['/cpu:0'] * layers
        cell_fns = [Elman] * layers
        cell_grads = [ElmanGrad] * layers
        cell_outs = [ElmanOut] * layers
        cell_out_grads = [ElmanOutGrad] * layers
        thetas = [CreateTheta() for _ in range(layers)]
        init_states = [CreateState0() for _ in range(layers)]
        padding = np.zeros((seqlen, batch, 1))
        padding[-trailing_pad_len:, :, :] = 1.
        padding[-trailing_pad_len - 3:-trailing_pad_len - 1, :, :] = 1.
        inputs = py_utils.NestedMap(x=tf.constant(np.random.uniform(
            0, 0.2, (seqlen, batch, dims)),
                                                  dtype=dtype),
                                    padding=tf.constant(padding, dtype=dtype))
        output, _ = recurrent.StackedRecurrent(devices=devices,
                                               cell_fns=cell_fns,
                                               cell_grads=cell_grads,
                                               cell_outs=cell_outs,
                                               cell_out_grads=cell_out_grads,
                                               thetas=thetas,
                                               init_states=init_states,
                                               inputs=inputs)
        o = output.x
        if 'padding' in inputs:
            o *= (1 - inputs.padding)
        loss = tf.reduce_sum(tf.square(o))

        xs = py_utils.Flatten(thetas + [py_utils.NestedMap(x=inputs.x)])
        dxs = tf.gradients(ys=loss, xs=xs)

        # Reference implementation using Recurrent().
        ref = inputs
        for i in range(layers):
            ref = ElmanOut(
                recurrent.Recurrent(cell_fn=cell_fns[i],
                                    cell_grad=cell_grads[i],
                                    theta=thetas[i],
                                    state0=init_states[i],
                                    inputs=ref)[0])
        return ref.x, output.x, loss, xs, dxs
Example #12
  def _GetSelfVariablesDict(self):
    """Returns a dict of variables from the model, excluding its children."""
    res = {}
    for var in py_utils.Flatten(self._private_vars.values()):
      res[var.name] = var
    return res