def testAccumulatorSizeEmpty(self):
   with self.cached_session():
     q = data_flow_ops.ConditionalAccumulator(dtypes_lib.float32, name="Q")
     self.assertEqual(q.num_accumulated().eval(), 0)
Esempio n. 2
0
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.

    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
    """
    if not grads_and_vars:
      raise ValueError("Must supply at least one variable")

    if global_step is None:
      raise ValueError("Global step is required to check staleness")

    self._global_step = global_step
    train_ops = []
    aggregated_grad = []
    var_list = []

    # local_anchor op will be placed on this worker task by default.
    local_anchor = control_flow_ops.no_op()
    # Colocating local_step variable prevents it being placed on the PS.
    distribution_strategy = distribution_strategy_context.get_strategy()
    with distribution_strategy.extended.colocate_vars_with(local_anchor):
      self._local_step = variable_scope.variable(
          initial_value=0,
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          dtype=global_step.dtype.base_dtype,
          name="sync_rep_local_step")

    self.local_step_init_op = state_ops.assign(self._local_step, global_step)
    chief_init_ops = [self.local_step_init_op]
    self.ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.global_variables())

    with ops.name_scope(None, self._name):
      for grad, var in grads_and_vars:
        var_list.append(var)
        with ops.device(var.device):
          # Dense gradients.
          if grad is None:
            aggregated_grad.append(None)  # pass-through.
            continue
          elif isinstance(grad, ops.Tensor):
            grad_accum = data_flow_ops.ConditionalAccumulator(
                grad.dtype,
                shape=var.get_shape(),
                shared_name=var.name + "/grad_accum")
            train_ops.append(grad_accum.apply_grad(
                grad, local_step=self._local_step))
            aggregated_grad.append(grad_accum.take_grad(
                self._replicas_to_aggregate))
          else:
            if not isinstance(grad, ops.IndexedSlices):
              raise ValueError("Unknown grad type!")
            grad_accum = data_flow_ops.SparseConditionalAccumulator(
                grad.dtype, shape=(), shared_name=var.name + "/grad_accum")
            train_ops.append(grad_accum.apply_indexed_slices_grad(
                grad, local_step=self._local_step))
            aggregated_grad.append(grad_accum.take_indexed_slices_grad(
                self._replicas_to_aggregate))

          self._accumulator_list.append((grad_accum, var.device))

      aggregated_grads_and_vars = zip(aggregated_grad, var_list)

      # sync_op will be assigned to the same device as the global step.
      with ops.device(global_step.device), ops.name_scope(""):
        update_op = self._opt.apply_gradients(aggregated_grads_and_vars,
                                              global_step)

      # Create token queue.
      with ops.device(global_step.device), ops.name_scope(""):
        sync_token_queue = (
            data_flow_ops.FIFOQueue(-1,
                                    global_step.dtype.base_dtype,
                                    shapes=(),
                                    name="sync_token_q",
                                    shared_name="sync_token_q"))
        self._sync_token_queue = sync_token_queue

        # dummy_queue is passed to the queue runner. Don't use the real queues
        # because the queue runner doesn't automatically reopen it once it
        # closed queues in PS devices.
        dummy_queue = (
            data_flow_ops.FIFOQueue(1,
                                    types_pb2.DT_INT32,
                                    shapes=(),
                                    name="dummy_queue",
                                    shared_name="dummy_queue"))

      with ops.device(global_step.device), ops.name_scope(""):
        # Replicas have to wait until they can get a token from the token queue.
        with ops.control_dependencies(train_ops):
          token = sync_token_queue.dequeue()
        train_op = state_ops.assign(self._local_step, token)

        with ops.control_dependencies([update_op]):
          # Sync_op needs to insert tokens to the token queue at the end of the
          # step so the replicas can fetch them to start the next step.
          tokens = array_ops.fill([self._tokens_per_step], global_step)
          sync_op = sync_token_queue.enqueue_many((tokens,))

        if self._variable_averages is not None:
          with ops.control_dependencies([sync_op]), ops.name_scope(""):
            sync_op = self._variable_averages.apply(
                self._variables_to_average)

        self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue,
                                                            [sync_op])
      for accum, dev in self._accumulator_list:
        with ops.device(dev):
          chief_init_ops.append(
              accum.set_global_step(
                  global_step, name="SetGlobalStep"))
      self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
      self._gradients_applied = True
      return train_op
 def testConstructorWithInvalidArg(self):
   with ops.Graph().as_default():
     with self.assertRaises(ValueError):
       data_flow_ops.ConditionalAccumulator(
           dtypes_lib.float32, name="Q", reduction_type="Invalid")
    def apply_gradients(self,
                        grads_and_vars,
                        worker_id,
                        global_step=None,
                        name=None,
                        collect_cdfs=False):
        """Apply gradients to variables.
    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.
    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.
    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.
    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
    """
        if not grads_and_vars:
            raise ValueError("Must supply at least one variable")

        if global_step is None:
            raise ValueError("Global step is required to check staleness")

        self._global_step = global_step
        train_ops = []
        aggregated_grad = []
        var_list = []
        printer_ops = []

        def f_pos():
            enq_total_ops = self._stop_queue.enqueue(global_step)
            '''
      for worker_id in range(self._total_num_replicas):
        enq_ops = self._should_stop_queues[worker_id].enqueue(global_step)
        with ops.control_dependencies([enq_ops]):
          L = []
      '''
            #      ret_pos = [tf.constant(i) for i in range(self._construtor)]
            with ops.control_dependencies([enq_total_ops]):
                return tf.Print(global_step, [global_step],
                                message="Enquequed to stop queue")
#        ret_pos = tf.Variable(33)
#        return ret_pos

        def f_neg():
            #      ret_neg = [tf.constant(i+5) for i in range(self._construtor)]
            ret_neg = tf.Variable(22)
            return tf.Print(global_step, [global_step],
                            message="Nothing to stop queue")


#      worker_id_list_printer = logging_ops.Print(global_step,
#                  [a for a in self._worker_idx_list] + [worker_id] + [global_step],
#                  message="Worker ID list status")
#      train_ops.append(worker_id_list_printer)

        self._local_step = variables.Variable(
            initial_value=0,
            trainable=False,
            collections=[ops.GraphKeys.LOCAL_VARIABLES],
            dtype=global_step.dtype.base_dtype,
            name="sync_rep_local_step")
        self.local_step_init_op = state_ops.assign(self._local_step,
                                                   global_step._ref())
        chief_init_ops = [self.local_step_init_op]

        self.ready_for_local_init_op = variables.report_uninitialized_variables(
            variables.all_variables())

        # The wait op waits for the current worker to dequeue a token from its respective token queue
        self._wait_op = self._sync_token_queues[worker_id].dequeue()

        # Replicas have to wait until they can get a token from the token queue
        # BEFORE begining to compute gradients.
        with ops.device(global_step.device):
            queue_size = self._sync_token_queues[worker_id].size()
            update_local_step_op = state_ops.assign(self._local_step,
                                                    global_step._ref())

        # Gradient accum creation
        with ops.name_scope(None, self._name):
            for grad, var in grads_and_vars:
                var_list.append(var)
                tf.logging.info("Grad " + str(grad) + " assigned to " +
                                str(var.device))
                with ops.device(var.device):
                    if grad is None:
                        continue
                    elif isinstance(grad, ops.Tensor):
                        grad_accum = data_flow_ops.ConditionalAccumulator(
                            grad.dtype,
                            shape=var.get_shape(),
                            shared_name=var.name + "/grad_accum")
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        grad_accum = data_flow_ops.SparseConditionalAccumulator(
                            grad.dtype,
                            shape=(),
                            shared_name=var.name + "/grad_accum")
                    self._accumulator_list.append((grad_accum, var))
            """# Phase 1 gradient computation
      with ops.control_dependencies([update_local_step_op]):
        for index, (grad, var) in enumerate(grads_and_vars):
          with ops.device(var.device):
            if grad is None:
              continue

            elif isinstance(grad, ops.Tensor):
              grad_accum = self._accumulator_list[index][0]

              train_ops.append(grad_accum.apply_grad(grad,
                                                     local_step=self._local_step._ref()))

            else:
              if not isinstance(grad, ops.IndexedSlices):
                raise ValueError("Unknown grad type!")
              grad_accum = self._accumulator_list[index][0]

              train_ops.append(grad_accum.apply_indexed_slices_grad(
                grad, local_step=self._local_step._ref()))"""

            # Phase 1 gradient computation
            with ops.control_dependencies([update_local_step_op]):
                for index, (grad, var) in enumerate(grads_and_vars):
                    print_start_op = logging_ops.Print(
                        global_step, [global_step],
                        message="Starting to apply grads for variable %d" %
                        index)
                    with ops.device(var.device):
                        if grad is None:
                            continue

                        elif isinstance(grad, ops.Tensor):
                            grad_accum = self._accumulator_list[index][0]

                            with ops.control_dependencies([print_start_op]):
                                with tf.device("job:worker/task:%d" %
                                               worker_id):
                                    apply_grad_op = grad_accum.apply_grad(
                                        grad,
                                        local_step=self._local_step._ref())
                                    with ops.control_dependencies(
                                        [apply_grad_op]):
                                        finished_print_op = logging_ops.Print(
                                            global_step, [global_step],
                                            message=
                                            "Done applying grads for variable %d"
                                            % index)
                                        train_ops.append(finished_print_op)

                        else:
                            if not isinstance(grad, ops.IndexedSlices):
                                raise ValueError("Unknown grad type!")
                            grad_accum = self._accumulator_list[index][0]

                            with ops.control_dependencies([print_start_op]):
                                with tf.device("job:worker/task:%d" %
                                               worker_id):
                                    apply_grad_op = grad_accum.apply_indexed_slices_grad(
                                        grad,
                                        local_step=self._local_step._ref())
                                    with ops.control_dependencies(
                                        [apply_grad_op]):
                                        finished_print_op = logging_ops.Print(
                                            global_step, [global_step],
                                            message=
                                            "Done applying grads for variable %d"
                                            % index)
                                        train_ops.append(finished_print_op)

                        with ops.control_dependencies([apply_grad_op]):
                            accum_sizes_printer = logging_ops.Print(
                                global_step,
                                [
                                    x[0].num_accumulated()
                                    for x in self._accumulator_list
                                ] + [worker_id] + [global_step],
                                message="Accum aggregated status on ps")
                            train_ops.append(accum_sizes_printer)
                            x = self._accumulator_list[0]
                            ret = tf.cond(
                                tf.greater_equal(
                                    x[0].num_accumulated(),
                                    self._constant_for_comparison), f_pos,
                                f_neg)

                            should_stop_list_printer = logging_ops.Print(
                                global_step, [ret],
                                message="Should stop ret val status on ps")
                            train_ops.append(should_stop_list_printer)
                            with ops.control_dependencies([ret]):
                                queue_total_printer = logging_ops.Print(
                                    global_step, [self._stop_queue.size()],
                                    message="shared should stop queue size")
                                train_ops.append(queue_total_printer)

            # Phase 2 gradient applying
            for index, (grad, var) in enumerate(grads_and_vars):
                with ops.device(var.device):
                    grad_accum = self._accumulator_list[index][0]
                    if grad is None:
                        aggregated_grad.append(None)
                    elif isinstance(grad, ops.Tensor):
                        if collect_cdfs:
                            aggregated_grad.append(
                                grad_accum.take_grad(self._total_num_replicas))
                        else:
                            aggregated_grad.append(grad_accum.take_grad(1))
                    else:
                        if collect_cdfs:
                            aggregated_grad.append(
                                grad_accum.take_grad(self._total_num_replicas))
                        else:
                            aggregated_grad.append(
                                grad_accum.take_indexed_slices_grad(1))

            aggregated_grads_and_vars = zip(aggregated_grad, var_list)

            # Some debug operations
            self.print_sizes = logging_ops.Print(global_step, [
                self._sync_token_queues[i].size()
                for i in range(self._total_num_replicas)
            ],
                                                 message="queue sizes")
            self.print_accum_sizes = logging_ops.Print(
                self._local_step,
                [x[0].num_accumulated()
                 for x in self._accumulator_list] + [worker_id],
                message="Accum sizes")
            self.print_local_step = logging_ops.Print(
                self._local_step,
                [self._local_step._ref(),
                 global_step._ref()],
                message="local vs global step")

            # sync_op will be assigned to the same device as the global step.
            with ops.device(global_step.device), ops.name_scope(""):
                with ops.control_dependencies([self.print_accum_sizes]):
                    update_op = self._opt.apply_gradients(
                        aggregated_grads_and_vars, global_step)
                    self._update_op = update_op
                    num_to_dequeue = self._stop_queue.size()
                    deq_ops = self._stop_queue.dequeue_many(num_to_dequeue)
                    with ops.control_dependencies([deq_ops]):
                        size_printer_2 = logging_ops.Print(
                            global_step, [self.print_accum_sizes],
                            message="Complelted the dequeue operation!")
                        printer_ops.append(size_printer_2)
                    with ops.control_dependencies(printer_ops):
                        with ops.control_dependencies([update_op]):
                            sync_op = []
                            for cur_worker_id in range(
                                    self._total_num_replicas):
                                sync_op.append(
                                    self._sync_token_queues[cur_worker_id].
                                    enqueue(global_step))
                            sync_op = control_flow_ops.group(*(sync_op))

                # dummy_queue is passed to the queue runner. Don't use the real queues
                # because the queue runner doesn't automatically reopen it once it
                # closed queues in PS devices.
                dummy_queue = (data_flow_ops.FIFOQueue(
                    1,
                    types_pb2.DT_INT32,
                    shapes=(),
                    shared_name="dummy_queue"))

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])

            with ops.device(global_step.device), ops.name_scope(""):
                with ops.control_dependencies(train_ops):
                    # Worker finished applying gradients. Add token to phase1_finished_queue
                    train_op = logging_ops.Print(
                        self._local_step._ref(), [
                            x[0].num_accumulated()
                            for x in self._accumulator_list
                        ] + [worker_id] + [global_step],
                        message="Finished worker updates",
                        name="FinishedWorkerUpdatesPrint")

            for accum, var in self._accumulator_list:
                with ops.device(var.device):
                    chief_init_ops.append(
                        accum.set_global_step(global_step,
                                              name="SetGlobalStep"))
            self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
            self._gradients_applied = True

            return train_op
Esempio n. 5
0
    def _train_op_fn(loss):
      """Run one training iteration."""
      if training_state_cache:
        train_op.append(training_state_cache.insert(tree_ids, node_ids, logits))
      if closed_form_grad_and_hess_fn:
        gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
      else:
        gradients = gradients_impl.gradients(loss, logits, name='Gradients')[0]
        hessians = gradients_impl.gradients(
            gradients, logits, name='Hessians')[0]

      stats_summaries_list = []
      for i, feature_ids in enumerate(feature_ids_list):
        num_buckets = bucket_size_list[i]
        summaries = [
            array_ops.squeeze(
                boosted_trees_ops.make_stats_summary(
                    node_ids=node_ids,
                    gradients=gradients,
                    hessians=hessians,
                    bucketized_features_list=[input_feature_list[f]],
                    max_splits=max_splits,
                    num_buckets=num_buckets),
                axis=0) for f in feature_ids
        ]
        stats_summaries_list.append(summaries)

      accumulators = []

      def grow_tree_from_stats_summaries(stats_summaries_list,
                                         feature_ids_list):
        """Updates ensemble based on the best gains from stats summaries."""
        node_ids_per_feature = []
        gains_list = []
        thresholds_list = []
        left_node_contribs_list = []
        right_node_contribs_list = []
        all_feature_ids = []

        assert len(stats_summaries_list) == len(feature_ids_list)

        for i, feature_ids in enumerate(feature_ids_list):
          (numeric_node_ids_per_feature, numeric_gains_list,
           numeric_thresholds_list, numeric_left_node_contribs_list,
           numeric_right_node_contribs_list) = (
               boosted_trees_ops.calculate_best_gains_per_feature(
                   node_id_range=last_layer_nodes_range,
                   stats_summary_list=stats_summaries_list[i],
                   l1=tree_hparams.l1,
                   l2=tree_hparams.l2,
                   tree_complexity=tree_hparams.tree_complexity,
                   min_node_weight=tree_hparams.min_node_weight,
                   max_splits=max_splits))

          all_feature_ids += feature_ids
          node_ids_per_feature += numeric_node_ids_per_feature
          gains_list += numeric_gains_list
          thresholds_list += numeric_thresholds_list
          left_node_contribs_list += numeric_left_node_contribs_list
          right_node_contribs_list += numeric_right_node_contribs_list

        grow_op = boosted_trees_ops.update_ensemble(
            # Confirm if local_tree_ensemble or tree_ensemble should be used.
            tree_ensemble.resource_handle,
            feature_ids=all_feature_ids,
            node_ids=node_ids_per_feature,
            gains=gains_list,
            thresholds=thresholds_list,
            left_node_contribs=left_node_contribs_list,
            right_node_contribs=right_node_contribs_list,
            learning_rate=tree_hparams.learning_rate,
            max_depth=tree_hparams.max_depth,
            pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
        return grow_op

      if train_in_memory and is_single_machine:
        train_op.append(distribute_lib.increment_var(global_step))
        train_op.append(
            grow_tree_from_stats_summaries(stats_summaries_list,
                                           feature_ids_list))
      else:
        dependencies = []

        for i, feature_ids in enumerate(feature_ids_list):
          stats_summaries = stats_summaries_list[i]
          accumulator = data_flow_ops.ConditionalAccumulator(
              dtype=dtypes.float32,
              # The stats consist of grads and hessians (the last dimension).
              shape=[len(feature_ids), max_splits, bucket_size_list[i], 2],
              shared_name='numeric_stats_summary_accumulator_' + str(i))
          accumulators.append(accumulator)

          apply_grad = accumulator.apply_grad(
              array_ops.stack(stats_summaries, axis=0), stamp_token)
          dependencies.append(apply_grad)

        def grow_tree_from_accumulated_summaries_fn():
          """Updates the tree with the best layer from accumulated summaries."""
          # Take out the accumulated summaries from the accumulator and grow.
          stats_summaries_list = []

          stats_summaries_list = [
              array_ops.unstack(accumulator.take_grad(1), axis=0)
              for accumulator in accumulators
          ]

          grow_op = grow_tree_from_stats_summaries(stats_summaries_list,
                                                   feature_ids_list)
          return grow_op

        with ops.control_dependencies(dependencies):
          train_op.append(distribute_lib.increment_var(global_step))
          if config.is_chief:
            min_accumulated = math_ops.reduce_min(
                array_ops.stack(
                    [acc.num_accumulated() for acc in accumulators]))

            train_op.append(
                control_flow_ops.cond(
                    math_ops.greater_equal(min_accumulated,
                                           n_batches_per_layer),
                    grow_tree_from_accumulated_summaries_fn,
                    control_flow_ops.no_op,
                    name='wait_until_n_batches_accumulated'))

      return control_flow_ops.group(train_op, name='train_op')
Esempio n. 6
0
        def _train_op_fn(loss):
            """Run one training iteration."""
            train_op = []
            if cache:
                train_op.append(cache.insert(tree_ids, node_ids, logits))
            if closed_form_grad_and_hess_fn:
                gradients, hessians = closed_form_grad_and_hess_fn(
                    logits, labels)
            else:
                gradients = gradients_impl.gradients(loss,
                                                     logits,
                                                     name='Gradients')[0]
                hessians = gradients_impl.gradients(gradients,
                                                    logits,
                                                    name='Hessians')[0]
            stats_summary_list = [
                array_ops.squeeze(boosted_trees_ops.make_stats_summary(
                    node_ids=node_ids,
                    gradients=gradients,
                    hessians=hessians,
                    bucketized_features_list=[input_feature_list[f]],
                    max_splits=max_splits,
                    num_buckets=num_buckets),
                                  axis=0) for f in range(num_features)
            ]

            def grow_tree_from_stats_summaries(stats_summary_list):
                """Updates ensemble based on the best gains from stats summaries."""
                (node_ids_per_feature, gains_list, thresholds_list,
                 left_node_contribs_list, right_node_contribs_list) = (
                     boosted_trees_ops.calculate_best_gains_per_feature(
                         node_id_range=array_ops.stack([
                             math_ops.reduce_min(node_ids),
                             math_ops.reduce_max(node_ids)
                         ]),
                         stats_summary_list=stats_summary_list,
                         l1=tree_hparams.l1,
                         l2=tree_hparams.l2,
                         tree_complexity=tree_hparams.tree_complexity,
                         max_splits=max_splits))
                grow_op = boosted_trees_ops.update_ensemble(
                    # Confirm if local_tree_ensemble or tree_ensemble should be used.
                    tree_ensemble.resource_handle,
                    feature_ids=math_ops.range(0,
                                               num_features,
                                               dtype=dtypes.int32),
                    node_ids=node_ids_per_feature,
                    gains=gains_list,
                    thresholds=thresholds_list,
                    left_node_contribs=left_node_contribs_list,
                    right_node_contribs=right_node_contribs_list,
                    learning_rate=tree_hparams.learning_rate,
                    max_depth=tree_hparams.max_depth,
                    pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
                return grow_op

            if train_in_memory and is_single_machine:
                train_op.append(distribute_lib.increment_var(global_step))
                train_op.append(
                    grow_tree_from_stats_summaries(stats_summary_list))
            else:
                summary_accumulator = data_flow_ops.ConditionalAccumulator(
                    dtype=dtypes.float32,
                    # The stats consist of gradients and hessians (the last dimension).
                    shape=[num_features, max_splits, num_buckets, 2],
                    shared_name='stats_summary_accumulator')
                apply_grad = summary_accumulator.apply_grad(
                    array_ops.stack(stats_summary_list, axis=0), stamp_token)

                def grow_tree_from_accumulated_summaries_fn():
                    """Updates the tree with the best layer from accumulated summaries."""
                    # Take out the accumulated summaries from the accumulator and grow.
                    stats_summary_list = array_ops.unstack(
                        summary_accumulator.take_grad(1), axis=0)
                    grow_op = grow_tree_from_stats_summaries(
                        stats_summary_list)
                    return grow_op

                with ops.control_dependencies([apply_grad]):
                    train_op.append(distribute_lib.increment_var(global_step))
                    if config.is_chief:
                        train_op.append(
                            control_flow_ops.cond(
                                math_ops.greater_equal(
                                    summary_accumulator.num_accumulated(),
                                    n_batches_per_layer),
                                grow_tree_from_accumulated_summaries_fn,
                                control_flow_ops.no_op,
                                name='wait_until_n_batches_accumulated'))

            return control_flow_ops.group(train_op, name='train_op')
Esempio n. 7
0
 def testAccumulatorApplyGradFloat32(self):
   with self.cached_session():
     q = data_flow_ops.ConditionalAccumulator(
         dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([1]))
     accum_op = q.apply_grad((10.0,))
     accum_op.run()
Esempio n. 8
0
 def testAccumulatorSetGlobalStep(self):
   with self.cached_session():
     q = data_flow_ops.ConditionalAccumulator(
         dtypes_lib.float32, name="Q", shape=tensor_shape.TensorShape([1]))
     set_global_step_op = q.set_global_step(1)
     set_global_step_op.run()
        def _apply_averages():  # pylint: disable=missing-docstring
            # Collect local and global vars
            local_vars = [v for g, v in grads_and_vars if g is not None]
            global_vars = ops.get_collection_ref("global_model")
            # sync queue, place it in the ps
            with ops.colocate_with(self._global_step):
                sync_queue = data_flow_ops.FIFOQueue(-1, [dtypes.bool],
                                                     shapes=[[]],
                                                     shared_name="sync_queue")
            train_ops = []
            aggregated_vars = []
            with ops.name_scope(None, self._name + "/global"):
                for var, gvar in zip(local_vars, global_vars):
                    # pylint: disable=protected-access
                    # Get reference to the tensor,
                    # this works with Variable and ResourceVariable
                    var = ops.convert_to_tensor(var)
                    # Place the accumulator in the same ps as the global_var
                    with ops.device(gvar.device):
                        var_accum = data_flow_ops.ConditionalAccumulator(
                            var.dtype,
                            shape=var.get_shape(),
                            shared_name=gvar.name + "/var_accum",
                        )
                        # Add op to push local_var to accumulator
                        train_ops.append(
                            var_accum.apply_grad(var, local_step=global_step))
                        # Op to average the vars in the accumulator
                        aggregated_vars.append(
                            var_accum.take_grad(self._replicas_to_aggregate))
                        # Remember accumulator and corresponding device
                        self._accumulator_list.append((var_accum, gvar.device))
            # chief worker updates global vars and enqueues tokens to the sync queue
            if self._is_chief:
                update_ops = []
                # Make sure train_ops are run
                with ops.control_dependencies(train_ops):
                    # Update global_vars with average values
                    for avg_var, gvar in zip(aggregated_vars, global_vars):
                        with ops.device(gvar.device):
                            update_ops.append(state_ops.assign(gvar, avg_var))
                    # Update shared global_step
                    with ops.device(global_step.device):
                        update_ops.append(
                            state_ops.assign_add(self._global_step, 1))
                # After averaging, push tokens to the queue
                with ops.control_dependencies(update_ops), ops.device(
                        global_step.device):
                    tokens = array_ops.fill([self._tokens_per_step],
                                            constant_op.constant(False))
                    sync_op = sync_queue.enqueue_many(tokens)
            # non chief workers deque a token, they will block here until chief is done
            else:
                # Make sure train_ops are run
                with ops.control_dependencies(train_ops), ops.device(
                        global_step.device):
                    sync_op = sync_queue.dequeue()

            # All workers pull averaged values
            with ops.control_dependencies([sync_op]):
                local_update_op = self._assign_vars(local_vars, global_vars)
            return local_update_op
Esempio n. 10
0
  def apply_gradients(self, grads_and_vars, worker_id, global_step=None, name=None, collect_cdfs=False,
    #  batch_idx_list=None, worker_kill_list=None, num_workers=None, num_batches_per_epoch=None):
    matrix_to_solve=None, num_batches_per_epoch=None):
    """Apply gradients to variables.
    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.
    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.
    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.
    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
    """
    if not grads_and_vars:
      raise ValueError("Must supply at least one variable")

    if global_step is None:
      raise ValueError("Global step is required to check staleness")

    self._global_step = global_step
    train_ops = []
    aggregated_grad = []
    var_list = []

    self._local_step = variables.Variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        dtype=global_step.dtype.base_dtype,
        name="sync_rep_local_step")
    self.local_step_init_op = state_ops.assign(self._local_step, global_step._ref())
    chief_init_ops = [self.local_step_init_op]
    self.ready_for_local_init_op = variables.report_uninitialized_variables(
      variables.all_variables())

    # The wait op waits for the current worker to dequeue a token from its respective token queue
    self._wait_op = self._sync_token_queues[worker_id].dequeue()

    # Replicas have to wait until they can get a token from the token queue
    # BEFORE begining to compute gradients.
    with ops.device(global_step.device):
      queue_size = self._sync_token_queues[worker_id].size()
      update_local_step_op = state_ops.assign(self._local_step, global_step._ref())

    # Gradient accum creation
    with ops.name_scope(None, self._name):
      for grad, var in grads_and_vars:
        var_list.append(var)
        tf.logging.info("Grad " + str(grad) + " assigned to " + str(var.device))
        with ops.device(var.device):
          if grad is None:
            continue
          elif isinstance(grad, ops.Tensor):
            grad_accum = data_flow_ops.ConditionalAccumulator(
              grad.dtype,
              shape=var.get_shape(),
              shared_name=var.name + "/grad_accum")
          else:
            if not isinstance(grad, ops.IndexedSlices):
              raise ValueError("Unknown grad type!")
            grad_accum = data_flow_ops.SparseConditionalAccumulator(
              grad.dtype, shape=(), shared_name=var.name + "/grad_accum")

          self._accumulator_list.append((grad_accum, var))

      """# Phase 1 gradient computation
      with ops.control_dependencies([update_local_step_op]):
        for index, (grad, var) in enumerate(grads_and_vars):
          with ops.device(var.device):
            if grad is None:
              continue
            elif isinstance(grad, ops.Tensor):
              grad_accum = self._accumulator_list[index][0]
              train_ops.append(grad_accum.apply_grad(grad,
                                                     local_step=self._local_step._ref()))
            else:
              if not isinstance(grad, ops.IndexedSlices):
                raise ValueError("Unknown grad type!")
              grad_accum = self._accumulator_list[index][0]
              train_ops.append(grad_accum.apply_indexed_slices_grad(
                grad, local_step=self._local_step._ref()))"""

      # Phase 1 gradient computation
      with ops.control_dependencies([update_local_step_op]):
        for index, (grad, var) in enumerate(grads_and_vars):
          print_start_op = logging_ops.Print(global_step, [global_step], message="Starting to apply grads for variable %d" % index)
          train_ops.append(print_start_op)
          with ops.device(var.device):
            ps_step_printer0 = logging_ops.Print(global_step, [global_step], message="global step printer0 on ps")
            train_ops.append(ps_step_printer0)
            '''Implement LS computation and solution here'''            
            #b = np.ones(int(num_batches_per_epoch))
            b = tf.ones([int(num_batches_per_epoch),1], tf.float32)         
            A = matrix_to_solve
#            A_for_calc = np.transpose(A)
            LS_solution = linalg_ops.matrix_solve_ls(A, b, fast=False)
            LS_calc = tf.reshape(LS_solution, [-1])
            weight = tf.slice(LS_calc, [worker_id], [1])
#            print_ls_op = logging_ops.Print(LS_calc, [LS_calc], message="Solution for LS!")
#            train_ops.append(print_ls_op)
            weighted_grad = tf.scalar_mul(weight[0], grad)
            '''Kill some workers'''
            if grad is None:
              continue

            elif isinstance(grad, ops.Tensor):
              grad_accum = self._accumulator_list[index][0]

              num_accum = grad_accum.num_accumulated()
              tf.logging.info("Grad Accumed %s, Worker ID: %s" % (str(num_accum), str(worker_id)))

              with ops.control_dependencies([print_start_op]):
                with tf.device("job:worker/task:%d" % worker_id):
                  apply_grad_op = grad_accum.apply_grad(grad,
#                  apply_grad_op = grad_accum.apply_grad(weighted_grad,
                                                        local_step=self._local_step._ref())
                  with ops.control_dependencies([apply_grad_op]):
                    finished_print_op = logging_ops.Print(global_step, [global_step], message="Done applying grads for variable %d" % index)
                    train_ops.append(finished_print_op)

            else:
              if not isinstance(grad, ops.IndexedSlices):
                raise ValueError("Unknown grad type!")
              grad_accum = self._accumulator_list[index][0]

              with ops.control_dependencies([print_start_op]):
                with tf.device("job:worker/task:%d" % worker_id):
                  apply_grad_op = grad_accum.apply_indexed_slices_grad(
                    grad, local_step=self._local_step._ref())
#                    weighted_grad, local_step=self._local_step._ref())
                  with ops.control_dependencies([apply_grad_op]):
                    finished_print_op = logging_ops.Print(global_step, [global_step], message="Done applying grads for variable %d" % index)
                    train_ops.append(finished_print_op)

      # Phase 2 gradient applying
      for index, (grad, var) in enumerate(grads_and_vars):
        with ops.device(var.device):
          work_idx_print1 = logging_ops.Print(worker_id, [worker_id], message="worker id for aggregate grad")
          ps_step_printer1 = logging_ops.Print(global_step, [global_step], message="global step printer1 on ps")
          num_replica_aggragate = logging_ops.Print(self._replicas_to_aggregate, [self._replicas_to_aggregate], message="num replica aggregate")
          train_ops.append(work_idx_print1)
          train_ops.append(ps_step_printer1)
          train_ops.append(num_replica_aggragate)
          grad_accum = self._accumulator_list[index][0]
       
          if grad is None:
            aggregated_grad.append(None)
          elif isinstance(grad, ops.Tensor):
            if collect_cdfs:
#              aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas))
              aggregated_grad.append(grad_accum.take_grad(self._replicas_to_aggregate))
            else:
              aggregated_grad.append(grad_accum.take_grad(1))
          else:
            if collect_cdfs:
#              aggregated_grad.append(grad_accum.take_grad(self._total_num_replicas))
              aggregated_grad.append(grad_accum.take_grad(self._replicas_to_aggregate))
            else:
              aggregated_grad.append(grad_accum.take_indexed_slices_grad(1))

      aggregated_grads_and_vars = zip(aggregated_grad, var_list)

      # Some debug operations
      self.print_sizes = logging_ops.Print(global_step, [self._sync_token_queues[i].size() for i in range(self._total_num_replicas)], message="queue sizes")
      self.print_accum_sizes = logging_ops.Print(self._local_step,
                                                 [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id],
                                                 message="Accum sizes")
      self.print_local_step = logging_ops.Print(self._local_step, [self._local_step._ref(), global_step._ref()], message="local vs global step")

      # sync_op will be assigned to the same device as the global step.
      with ops.device(global_step.device), ops.name_scope(""):
        with ops.control_dependencies([self.print_accum_sizes]):
          update_op = self._opt.apply_gradients(aggregated_grads_and_vars, global_step)
          self._update_op = update_op
          with ops.control_dependencies([update_op]):
            sync_op = []
            for cur_worker_id in range(self._total_num_replicas):
              sync_op.append(self._sync_token_queues[cur_worker_id].enqueue(global_step))
            sync_op = control_flow_ops.group(*(sync_op))

        # dummy_queue is passed to the queue runner. Don't use the real queues
        # because the queue runner doesn't automatically reopen it once it
        # closed queues in PS devices.
        dummy_queue = (
            data_flow_ops.FIFOQueue(1,
                                    types_pb2.DT_INT32,
                                    shapes=(),
                                    shared_name="dummy_queue"))

        self._chief_queue_runner = queue_runner.QueueRunner(dummy_queue,
                                                            [sync_op])

      with ops.device(global_step.device), ops.name_scope(""):
        with ops.control_dependencies(train_ops):
          # Worker finished applying gradients. Add token to phase1_finished_queue
          train_op = logging_ops.Print(self._local_step._ref(),
                                       [x[0].num_accumulated() for x in self._accumulator_list] + [worker_id],
                                       message="Finished worker updates",
                                       name="FinishedWorkerUpdatesPrint")

      for accum, var in self._accumulator_list:
        with ops.device(var.device):
          chief_init_ops.append(
              accum.set_global_step(
                  global_step, name="SetGlobalStep"))
      self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
      self._gradients_applied = True

      return train_op