  def _buildInitialVars(self, shape, dev_list):
    values = []
    num_devices = len(dev_list)
    dim = np.prod(shape, dtype=int) if shape else 1
    for d in range(0, num_devices):
      with ops.device(dev_list[d]):
        npt = np.zeros(shape).astype(np.float32)
        alias = np.frombuffer(npt.data, dtype=np.float32)
        for i in range(0, dim):
          alias[i] = i + 0.01 * d
        var = state_ops.variable_op(shape, types_pb2.DT_FLOAT)
        state_ops.init_variable(var, npt).op.run()
        values.append(var)
    return values
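A minimal usage sketch for the helper above, assuming it is a method of a tf.test.TestCase subclass; the device list, shape, and expected values are illustrative:

  def testBuildInitialVarsSketch(self):
    # Hypothetical test: build per-device variables and read one back.
    with self.test_session() as sess:
      dev_list = ['/cpu:0', '/cpu:0']   # illustrative device list
      var_list = self._buildInitialVars([2, 3], dev_list)
      # The variable on device d holds i + 0.01 * d at flat index i, so for
      # d = 1 this prints approximately [[0.01, 1.01, 2.01], [3.01, 4.01, 5.01]].
      print(sess.run(var_list[1]))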
    def _AddParam(self,
                  shape,
                  dtype,
                  name,
                  initializer=None,
                  return_average=False):
        """Add a model parameter w.r.t. we expect to compute gradients.

    _AddParam creates both regular parameters (usually for training) and
    averaged nodes (usually for inference). It returns one or the other based
    on the 'return_average' arg.

    Args:
      shape: int list, tensor shape of the parameter to create
      dtype: tf.DataType, data type of the parameter
      name: string, name of the parameter in the TF graph
      initializer: optional initializer for the parameter
      return_average: if False, return the parameter; otherwise return its moving average

    Returns:
      parameter or averaged parameter
    """
        if name not in self.params:
            with tf.device('/cpu:0'):
                step = tf.cast(self.GetStep(), tf.float32)
            # Put all parameters and their initializing ops in their own scope
            # irrespective of the current scope (training or eval).
            with tf.name_scope(self._param_scope):
                self.params[name] = tf.get_variable(name, shape, dtype,
                                                    initializer)
                param = self.params[name]
                if initializer is not None:
                    self.inits[name] = state_ops.init_variable(
                        param, initializer)
                if self._averaging_decay == 1:
                    #logging.info('Using vanilla averaging of parameters.')
                    ema = tf.train.ExponentialMovingAverage(
                        decay=(step / (step + 1.0)), num_updates=None)
                else:
                    ema = tf.train.ExponentialMovingAverage(
                        decay=self._averaging_decay, num_updates=step)
                self._averaging[name + '_avg_update'] = ema.apply([param])
                self.variables[name + '_avg_var'] = ema.average(param)
                self.inits[name + '_avg_init'] = state_ops.init_variable(
                    ema.average(param), tf.zeros_initializer)
        return (self.variables[name + '_avg_var']
                if return_average else self.params[name])
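A hedged sketch of how _AddParam is typically called when assembling a layer; the shape, name, and initializer below are illustrative rather than taken from this snippet:

# Training graph: fetch the raw parameter.
weights = self._AddParam([64, 32], tf.float32, 'layer_0_weights',
                         tf.random_normal_initializer(stddev=1e-4),
                         return_average=False)
# Inference graph: fetch the exponential-moving-average shadow of the same parameter.
avg_weights = self._AddParam([64, 32], tf.float32, 'layer_0_weights',
                             return_average=True)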
Example #4
  def AddTraining(self,
                  task_context,
                  batch_size,
                  learning_rate=0.1,
                  decay_steps=4000,
                  momentum=0.9,
                  corpus_name='documents'):
    """Builds a trainer to minimize the cross entropy cost function.

    Args:
      task_context: file path from which to read the task context
      batch_size: batch size to request from reader op
      learning_rate: initial value of the learning rate
      decay_steps: decay the learning rate by a factor of 0.96 every this many steps
      momentum: momentum parameter used when training with momentum
      corpus_name: name of the task input to read parses from

    Returns:
      Dictionary of named training nodes.
    """
    with tf.name_scope('training'):
      nodes = self.training
      nodes.update(self._AddGoldReader(task_context, batch_size, corpus_name))
      nodes.update(self._BuildNetwork(nodes['feature_endpoints'],
                                      return_average=False))
      nodes.update(self._AddCostFunction(batch_size, nodes['gold_actions'],
                                         nodes['logits']))
      # Add the optimizer
      if self._only_train:
        trainable_params = [v
                            for k, v in self.params.iteritems()
                            if k in self._only_train]
      else:
        trainable_params = self.params.values()
      lr = self._AddLearningRate(learning_rate, decay_steps)
      optimizer = tf.train.MomentumOptimizer(lr,
                                             momentum,
                                             use_locking=self._use_locking)
      train_op = optimizer.minimize(nodes['cost'], var_list=trainable_params)
      for param in trainable_params:
        slot = optimizer.get_slot(param, 'momentum')
        self.inits[slot.name] = state_ops.init_variable(slot,
                                                        tf.zeros_initializer)
        self.variables[slot.name] = slot
      numerical_checks = [
          tf.check_numerics(param,
                            message='Parameter is not finite.')
          for param in trainable_params
          if param.dtype.base_dtype in [tf.float32, tf.float64]
      ]
      check_op = tf.group(*numerical_checks)
      avg_update_op = tf.group(*self._averaging.values())
      train_ops = [train_op]
      if self._check_parameters:
        train_ops.append(check_op)
      if self._use_averaging:
        train_ops.append(avg_update_op)
      nodes['train_op'] = tf.group(*train_ops, name='train_op')
    return nodes
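A hedged driver-loop sketch for the dictionary returned above; the builder instance, task-context file name, and step count are assumptions for illustration, and in practice a Saver would periodically checkpoint the parameters:

# Hypothetical training driver, assuming `builder` is an instance of this class.
with tf.Session() as sess:
  nodes = builder.AddTraining('task_context.pbtxt', batch_size=32)
  sess.run(list(builder.inits.values()))   # run the collected init ops
  for _ in range(100):
    _, cost = sess.run([nodes['train_op'], nodes['cost']])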
 def _AddVariable(self, shape, dtype, name, initializer=None):
   if name in self.variables:
     return self.variables[name]
   self.variables[name] = tf.get_variable(name, shape, dtype, initializer)
   if initializer is not None:
     self.inits[name] = state_ops.init_variable(self.variables[name],
                                                initializer)
   return self.variables[name]
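A small usage note for the helper above: repeated calls with the same name return the same variable object, so later code can call it unconditionally. A hedged sketch with a hypothetical counter name:

counter_a = self._AddVariable([], tf.int32, 'example_counter', tf.zeros_initializer())
counter_b = self._AddVariable([], tf.int32, 'example_counter')   # reuses the existing entry
assert counter_a is counter_b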
  def _AddParam(self,
                shape,
                dtype,
                name,
                initializer=None,
                return_average=False):
    """Add a model parameter w.r.t. we expect to compute gradients.

    _AddParam creates both regular parameters (usually for training) and
    averaged nodes (usually for inference). It returns one or the other based
    on the 'return_average' arg.

    Args:
      shape: int list, tensor shape of the parameter to create
      dtype: tf.DataType, data type of the parameter
      name: string, name of the parameter in the TF graph
      initializer: optional initializer for the parameter
      return_average: if False, return the parameter; otherwise return its moving average

    Returns:
      parameter or averaged parameter
    """
    if name not in self.params:
      step = tf.cast(self.GetStep(), tf.float32)
      # Put all parameters and their initializing ops in their own scope
      # irrespective of the current scope (training or eval).
      with tf.name_scope(self._param_scope):
        self.params[name] = tf.get_variable(name, shape, dtype, initializer)
        param = self.params[name]
        if initializer is not None:
          self.inits[name] = state_ops.init_variable(param, initializer)
        if self._averaging_decay == 1:
          logging.info('Using vanilla averaging of parameters.')
          ema = tf.train.ExponentialMovingAverage(decay=(step / (step + 1.0)),
                                                  num_updates=None)
        else:
          ema = tf.train.ExponentialMovingAverage(decay=self._averaging_decay,
                                                  num_updates=step)
        self._averaging[name + '_avg_update'] = ema.apply([param])
        self.variables[name + '_avg_var'] = ema.average(param)
        self.inits[name + '_avg_init'] = state_ops.init_variable(
            ema.average(param), tf.zeros_initializer)
    return (self.variables[name + '_avg_var'] if return_average else
            self.params[name])
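A small numeric sketch (plain Python, not TF) of why decay = step / (step + 1) amounts to vanilla averaging: it applies the same update rule used by ExponentialMovingAverage, shadow = decay * shadow + (1 - decay) * value, ignoring initialization details:

values = [3.0, 5.0, 10.0]
shadow = 0.0
for step, value in enumerate(values):   # step = 0, 1, 2, ...
  decay = step / (step + 1.0)
  shadow = decay * shadow + (1.0 - decay) * value
print(shadow)   # 6.0, the arithmetic mean of the three values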
Example #8
    def addParam(self,
                 shape,
                 dtype,
                 name,
                 initializer=None,
                 return_average=False):
        # This isn't a problem: we reload variables if they already exist.
        #if name in self.params:
        #    self.logger.warning(name + ' already exists!')

        if name not in self.params:
            step = tf.cast(self.getStep(), tf.float32)
            with tf.name_scope(self._param_scope):
                # Put all parameters and their initializing ops in their own
                # scope irrespective of the current scope (training or eval).
                self.params[name] = tf.get_variable(name, shape, dtype,
                                                    initializer)
                param = self.params[name]

                if initializer is not None:
                    self.inits[name] = state_ops.init_variable(
                        param, initializer)
                if self.averaging_decay == 1:
                    self.logger.info('Using vanilla averaging of parameters.')
                    ema = tf.train.ExponentialMovingAverage(
                        decay=(step / (step + 1.0)), num_updates=None)
                else:
                    ema = tf.train.ExponentialMovingAverage(
                        decay=self.averaging_decay, num_updates=step)

                self.averaging[name + '_avg_update'] = ema.apply([param])
                self.variables[name + '_avg_var'] = ema.average(param)
                self.inits[name + '_avg_init'] = state_ops.init_variable(
                    ema.average(param), tf.zeros_initializer())
        return (self.variables[name + '_avg_var']
                if return_average else self.params[name])
  def AddTraining(self,
                  task_context,
                  batch_size,
                  learning_rate=0.1,
                  decay_steps=4000,
                  momentum=None,
                  corpus_name='documents'):
    with tf.name_scope('training'):
      n = self.training
      n['accumulated_alive_steps'] = self._AddVariable(
          [batch_size], tf.int32, 'accumulated_alive_steps',
          tf.zeros_initializer())
      n.update(self._AddBeamReader(task_context, batch_size, corpus_name))
      # This adds a required 'step' node too:
      learning_rate = tf.constant(learning_rate, dtype=tf.float32)
      n['learning_rate'] = self._AddLearningRate(learning_rate, decay_steps)
      # Call BuildNetwork *only* to set up the params outside of the main loop.
      self._BuildNetwork(list(n['features']))

      n.update(self._BuildSequence(batch_size, self._max_steps, n['features'],
                                   n['state']))

      flat_concat_scores = tf.reshape(n['concat_scores'], [-1])
      (indices_and_paths, beams_and_slots, n['gold_slot'],
       n['beam_path_scores']) = gen_parser_ops.beam_parser_output(n['state'])
      n['indices'] = tf.reshape(tf.gather(indices_and_paths, [0]), [-1])
      n['path_ids'] = tf.reshape(tf.gather(indices_and_paths, [1]), [-1])
      n['all_path_scores'] = tf.sparse_segment_sum(
          flat_concat_scores, n['indices'], n['path_ids'])
      n['beam_ids'] = tf.reshape(tf.gather(beams_and_slots, [0]), [-1])
      n.update(AddCrossEntropy(batch_size, n))

      if self._only_train:
        trainable_params = {k: v for k, v in self.params.iteritems()
                            if k in self._only_train}
      else:
        trainable_params = self.params
      for p in trainable_params:
        tf.logging.info('trainable_param: %s', p)

      regularized_params = [
          tf.nn.l2_loss(p) for k, p in trainable_params.iteritems()
          if k.startswith('weights') or k.startswith('bias')]
      l2_loss = 1e-4 * tf.add_n(regularized_params) if regularized_params else 0

      n['cost'] = tf.add(n['cross_entropy'], l2_loss, name='cost')

      n['gradients'] = tf.gradients(n['cost'], trainable_params.values())

      with tf.control_dependencies([n['alive_steps']]):
        update_accumulators = tf.group(
            tf.assign_add(n['accumulated_alive_steps'], n['alive_steps']))

      def ResetAccumulators():
        return tf.assign(
            n['accumulated_alive_steps'], tf.zeros([batch_size], tf.int32))
      n['reset_accumulators_func'] = ResetAccumulators

      optimizer = tf.train.MomentumOptimizer(n['learning_rate'],
                                             momentum,
                                             use_locking=self._use_locking)
      train_op = optimizer.minimize(n['cost'],
                                    var_list=trainable_params.values())
      for param in trainable_params.values():
        slot = optimizer.get_slot(param, 'momentum')
        self.inits[slot.name] = state_ops.init_variable(slot,
                                                        tf.zeros_initializer())
        self.variables[slot.name] = slot

      def NumericalChecks():
        return tf.group(*[
            tf.check_numerics(param, message='Parameter is not finite.')
            for param in trainable_params.values()
            if param.dtype.base_dtype in [tf.float32, tf.float64]])
      check_op = cf.cond(tf.equal(tf.mod(self.GetStep(), self._check_every), 0),
                         NumericalChecks, tf.no_op)
      avg_update_op = tf.group(*self._averaging.values())
      train_ops = [train_op]
      if self._check_parameters:
        train_ops.append(check_op)
      if self._use_averaging:
        train_ops.append(avg_update_op)
      with tf.control_dependencies([update_accumulators]):
        n['train_op'] = tf.group(*train_ops, name='train_op')
      n['alive_steps'] = tf.identity(n['alive_steps'], name='alive_steps')
    return n
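For clarity, a toy example of the tf.sparse_segment_sum call used above to total per-path scores; the numbers are illustrative only:

scores = tf.constant([0.1, 0.2, 0.3, 0.4, 0.5])   # stands in for flat_concat_scores
indices = tf.constant([0, 2, 3, 4])               # positions gathered for the paths
path_ids = tf.constant([0, 0, 1, 1])              # path each gathered position belongs to
path_scores = tf.sparse_segment_sum(scores, indices, path_ids)
# path_scores evaluates to [0.4, 0.9]: elements 0 and 2 sum into path 0, elements 3 and 4 into path 1.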
Example #11
    def buildNetwork(self, mode='train'):
        assert mode == 'train' or mode == 'eval'

        if mode == 'train':
            return_average = False
            nodes = self.training
        else:
            return_average = self.use_averaging
            nodes = self.evaluation

        learningRate = self.modelParams.cfg['learningRate']
        decaySteps = self.modelParams.cfg['decaySteps']
        # FIXME: does momentum/learning rate reload properly when retraining?
        momentum = self.modelParams.cfg['momentum']
        topK = self.modelParams.cfg['topK']
        hiddenLayerSizes = self.modelParams.cfg['hiddenLayerSizes']
        batchSize = self.modelParams.cfg['batchSize']

        with tf.name_scope(mode):
            weights = []
            biases = []
            embeddings = []
            nodes['feature_endpoints'] = []

            for i in range(len(self.featureMajorTypeGroups)):
                major_type = self.featureMajorTypeGroups[i]
                # shape will be [-1, number of sparse integer features in group]
                nodes['feature_endpoints'].append(tf.placeholder(tf.int32, \
                    [None, len(self.featureNames[i])],
                    name="ph_feature_endpoints_%s" % major_type))
                embeddings.append(self.addEmbedding( \
                                            nodes['feature_endpoints'][i],
                                            len(self.featureNames[i]),
                                            self.featureDomainSizes[i],
                                            self.featureEmbeddingSizes[i],
                                            major_type,
                                            return_average=return_average))

            # Input layer
            last_layer = tf.concat(embeddings, 1)
            last_layer_size = self.BAG_OF_FEATURES_LEN

            # Hidden layers
            for i in range(len(hiddenLayerSizes)):
                h = hiddenLayerSizes[i]

                weights.append(
                    self.addParam([last_layer_size, h],
                                  tf.float32,
                                  'layer_%d_weights' % i,
                                  tf.random_normal_initializer(stddev=1e-4,
                                                               seed=0),
                                  return_average=return_average))

                biases.append(
                    self.addParam([h],
                                  tf.float32,
                                  'layer_%d_biases' % i,
                                  tf.constant_initializer(0.2),
                                  return_average=return_average))

                last_layer = tf.nn.relu_layer(last_layer,
                                              weights[-1],
                                              biases[-1],
                                              name='layer_%d' % i)
                last_layer_size = h

            # Output layer
            weights.append(
                self.addParam([last_layer_size, self.ACTION_COUNT],
                              tf.float32,
                              'softmax_weights',
                              tf.random_normal_initializer(stddev=1e-4,
                                                           seed=0),
                              return_average=return_average))

            biases.append(
                self.addParam([self.ACTION_COUNT],
                              tf.float32,
                              'softmax_biases',
                              tf.zeros_initializer(),
                              return_average=return_average))

            logits = tf.nn.xw_plus_b(last_layer,
                                     weights[-1],
                                     biases[-1],
                                     name='logits')

            if mode == 'train':
                nodes['gold_actions'] = tf.placeholder(tf.int32, [None], \
                    name='ph_gold_actions')
                nodes['filled_slots'] = tf.placeholder(tf.int32, \
                    name='ph_filled_slots')

                # one-hot encoding for each batch
                dense_golden = batchedSparseToDense(nodes['gold_actions'], \
                    self.ACTION_COUNT)

                #cross_entropy = tf.div(
                #    tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
                #            logits=logits, labels=dense_golden)),
                #        tf.cast(nodes['filled_slots'], tf.float32))

                # We divide by batch size here rather than by filled slots;
                # this seems to fix the accuracy issue (though it is unclear
                # why), even though the cost briefly plummets when only a few
                # slots are filled.
                cross_entropy = tf.div(
                    tf.reduce_sum(
                        tf.nn.softmax_cross_entropy_with_logits(
                            logits=logits, labels=dense_golden)), batchSize)

                # regularize all parameters except output layer
                regularized_params = [tf.nn.l2_loss(p) for p in weights[:-1]]
                regularized_params += [tf.nn.l2_loss(p) for p in biases[:-1]]

                l2_loss = 1e-4 * tf.add_n(regularized_params) \
                    if regularized_params else 0

                cost = tf.add(cross_entropy, l2_loss, name='cost')

                lr = self.addLearningRate(learningRate, decaySteps)

                optimizer = tf.train.MomentumOptimizer(lr,
                                                       momentum,
                                                       use_locking=False)

                trainableParams = self.params.values()

                train_op = optimizer.minimize(cost, var_list=trainableParams)

                for param in trainableParams:
                    slot = optimizer.get_slot(param, 'momentum')
                    self.inits[slot.name] = state_ops.init_variable(
                        slot, tf.zeros_initializer())
                    self.variables[slot.name] = slot

                numerical_checks = [
                    tf.check_numerics(param,
                                      message='Parameter is not finite.')
                    for param in trainableParams
                    if param.dtype.base_dtype in [tf.float32, tf.float64]
                ]
                check_op = tf.group(*numerical_checks)
                avg_update_op = tf.group(*self.averaging.values())
                train_ops = [train_op]
                if self.check_parameters:
                    train_ops.append(check_op)
                if self.use_averaging:
                    train_ops.append(avg_update_op)

                nodes['train_op'] = tf.group(*train_ops, name='train_op')
                nodes['cost'] = cost
                nodes['logits'] = logits
                #nodes['softmax'] = tf.nn.softmax(logits)
            else:
                nodes['logits'] = logits
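Finally, a hedged sketch of driving the evaluation graph built above; `model` and `feature_batches` (one int32 array per feature group) are placeholders for illustration:

nodes = model.buildNetwork(mode='eval')
with tf.Session() as sess:
    # In practice trained weights would be restored from a checkpoint here;
    # running the collected init ops is only a stand-in.
    sess.run(list(model.inits.values()))
    feed = dict(zip(nodes['feature_endpoints'], feature_batches))
    logits = sess.run(nodes['logits'], feed_dict=feed)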