Code Example #1
    def __init__(self,
                 workers,
                 num_sgd_iter=1,
                 train_batch_size=1,
                 sgd_minibatch_size=0,
                 standardize_fields=frozenset([]),
                 keep_local_weights_in_sync=True,
                 backend="gloo"):
        PolicyOptimizer.__init__(self, workers)
        self.learner_stats = {}
        self.num_sgd_iter = num_sgd_iter
        self.train_batch_size = train_batch_size
        self.sgd_minibatch_size = sgd_minibatch_size
        self.standardize_fields = standardize_fields
        self.keep_local_weights_in_sync = keep_local_weights_in_sync
        self.update_weights_timer = TimerStat()
        self.learn_timer = TimerStat()

        # Setup the distributed processes.
        if not self.workers.remote_workers():
            raise ValueError("This optimizer requires >0 remote workers.")
        ip = ray.get(workers.remote_workers()[0].get_node_ip.remote())
        port = ray.get(workers.remote_workers()[0].find_free_port.remote())
        address = "tcp://{ip}:{port}".format(ip=ip, port=port)
        logger.info(
            "Creating torch process group with leader {}".format(address))

        # Block on the setup tasks so that any failures raise errors here.
        ray.get([
            worker.setup_torch_data_parallel.remote(
                address, i, len(workers.remote_workers()), backend)
            for i, worker in enumerate(workers.remote_workers())
        ])
        logger.info("Torch process group init completed")
Code Example #2
    def __init__(self,
                 workers,
                 learning_starts=1000,
                 buffer_size=10000,
                 train_batch_size=32):
        """Initialize a batch replay optimizer.

        Arguments:
            workers (WorkerSet): set of all workers
            learning_starts (int): start learning after this number of
                timesteps have been collected
            buffer_size (int): max timesteps to keep in the replay buffer
            train_batch_size (int): number of timesteps to train on at once
        """
        PolicyOptimizer.__init__(self, workers)

        self.replay_starts = learning_starts
        self.max_buffer_size = buffer_size
        self.train_batch_size = train_batch_size
        assert self.max_buffer_size >= self.replay_starts

        # List of buffered sample batches
        self.replay_buffer = []
        self.buffer_size = 0

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.learner_stats = {}
Code Example #3
 def add_batch(self, batch):
     PolicyOptimizer._check_not_multiagent(batch)
     with self.add_batch_timer:
         for row in batch.rows():
             self.replay_buffer.add(row["obs"], row["actions"],
                                    row["rewards"], row["new_obs"],
                                    row["dones"], row["weights"])
Code Example #4
    def __init__(self,
                 workers,
                 num_sgd_iter=1,
                 train_batch_size=1,
                 sgd_minibatch_size=0,
                 standardize_fields=frozenset([]),
                 aux_loss_every_k=16,
                 aux_loss_num_sgd_iter=9,
                 aux_loss_start_after_num_steps=0):
        PolicyOptimizer.__init__(self, workers)

        self.update_weights_timer = TimerStat()
        self.standardize_fields = standardize_fields
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()
        self.num_sgd_iter = num_sgd_iter
        self.sgd_minibatch_size = sgd_minibatch_size
        self.train_batch_size = train_batch_size
        self.learner_stats = {}
        self.policies = dict(
            self.workers.local_worker().foreach_trainable_policy(lambda p, i:
                                                                 (i, p)))
        logger.debug("Policies to train: {}".format(self.policies))

        self.aux_loss_every_k = aux_loss_every_k
        self.aux_loss_num_sgd_iter = aux_loss_num_sgd_iter
        self.aux_loss_start_after_num_steps = aux_loss_start_after_num_steps
        self.memory = []
        # Assert that the train batch size is divisible by the SGD minibatch
        # size to make populating policy logits simpler.
        assert train_batch_size % sgd_minibatch_size == 0, (
            f"train_batch_size: {train_batch_size}, "
            f"sgd_minibatch_size: {sgd_minibatch_size}")
Code Example #5
    def __init__(self, workers, num_sgd_iter=1, train_batch_size=1):
        PolicyOptimizer.__init__(self, workers)

        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()
        self.num_sgd_iter = num_sgd_iter
        self.train_batch_size = train_batch_size
        self.learner_stats = {}
Code Example #6
    def __init__(self, local_evaluator, remote_evaluators, grads_per_step=100):
        PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators)

        self.apply_timer = TimerStat()
        self.wait_timer = TimerStat()
        self.dispatch_timer = TimerStat()
        self.grads_per_step = grads_per_step
        self.learner_stats = {}
        if not self.remote_evaluators:
            raise ValueError(
                "Async optimizer requires at least 1 remote evaluator")
Code Example #7
    def __init__(self, workers, grads_per_step=100):
        PolicyOptimizer.__init__(self, workers)

        self.apply_timer = TimerStat()
        self.wait_timer = TimerStat()
        self.dispatch_timer = TimerStat()
        self.grads_per_step = grads_per_step
        self.learner_stats = {}
        if not self.workers.remote_workers():
            raise ValueError(
                "Async optimizer requires at least 1 remote workers")
Code Example #8
File: sync_replay_optimizer.py (Project: wsjeon/ray)
    def __init__(self,
                 workers,
                 learning_starts=1000,
                 buffer_size=10000,
                 prioritized_replay=True,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta=0.4,
                 schedule_max_timesteps=100000,
                 beta_annealing_fraction=0.2,
                 final_prioritized_replay_beta=0.4,
                 prioritized_replay_eps=1e-6,
                 train_batch_size=32,
                 sample_batch_size=4,
                 before_learn_on_batch=None,
                 synchronize_sampling=False):
        PolicyOptimizer.__init__(self, workers)

        self.replay_starts = learning_starts
        # Linearly anneal beta as in the Rainbow paper.
        self.prioritized_replay_beta = LinearSchedule(
            schedule_timesteps=int(schedule_max_timesteps *
                                   beta_annealing_fraction),
            initial_p=prioritized_replay_beta,
            final_p=final_prioritized_replay_beta)
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size
        self.before_learn_on_batch = before_learn_on_batch
        self.synchronize_sampling = synchronize_sampling

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.learner_stats = {}

        # Set up replay buffer
        if prioritized_replay:

            def new_buffer():
                return PrioritizedReplayBuffer(buffer_size,
                                               alpha=prioritized_replay_alpha)
        else:

            def new_buffer():
                return ReplayBuffer(buffer_size)

        self.replay_buffers = collections.defaultdict(new_buffer)

        if buffer_size < self.replay_starts:
            logger.warning("buffer_size={} < replay_starts={}".format(
                buffer_size, self.replay_starts))
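A self-contained illustration of the buffer-factory pattern used above: collections.defaultdict takes a zero-argument factory and lazily creates one buffer per policy id on first access. SimpleBuffer here is a stand-in for illustration only, not RLlib's ReplayBuffer.

import collections

class SimpleBuffer:
    # Stand-in buffer, only to show the defaultdict factory pattern.
    def __init__(self, capacity):
        self.capacity = capacity
        self.storage = []

def new_buffer():
    return SimpleBuffer(10000)

replay_buffers = collections.defaultdict(new_buffer)
# The first access for a policy id creates its buffer on demand.
replay_buffers["default_policy"].storage.append({"obs": 0})
assert len(replay_buffers) == 1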
Code Example #9
 def stats(self):
     timing = {
         "{}_time_ms".format(k): round(1000 * self.timers[k].mean, 3)
         for k in self.timers
     }
     timing["learner_grad_time_ms"] = round(
         1000 * self.learner.grad_timer.mean, 3)
     timing["learner_load_time_ms"] = round(
         1000 * self.learner.load_timer.mean, 3)
     timing["learner_load_wait_time_ms"] = round(
         1000 * self.learner.load_wait_timer.mean, 3)
     timing["learner_dequeue_time_ms"] = round(
         1000 * self.learner.queue_timer.mean, 3)
     stats = {
         "sample_throughput": round(self.timers["sample"].mean_throughput,
                                    3),
         "train_throughput": round(self.timers["train"].mean_throughput, 3),
         "num_weight_syncs": self.num_weight_syncs,
         "num_steps_replayed": self.num_replayed,
         "timing_breakdown": timing,
         "learner_queue": self.learner.learner_queue_size.stats(),
     }
     if self.learner.stats:
         stats["learner"] = self.learner.stats
     return dict(PolicyOptimizer.stats(self), **stats)
Code Example #10
 def stats(self):
     replay_stats = ray.get(self.replay_actors[0].stats.remote(self.debug))
     timing = {
         "{}_time_ms".format(k): round(1000 * self.timers[k].mean, 3)
         for k in self.timers
     }
     timing["learner_grad_time_ms"] = round(
         1000 * self.learner.grad_timer.mean, 3)
     timing["learner_dequeue_time_ms"] = round(
         1000 * self.learner.queue_timer.mean, 3)
     stats = {
         "sample_throughput": round(self.timers["sample"].mean_throughput,
                                    3),
         "train_throughput": round(self.timers["train"].mean_throughput, 3),
         "num_weight_syncs": self.num_weight_syncs,
         "num_samples_dropped": self.num_samples_dropped,
         "learner_queue": self.learner.learner_queue_size.stats(),
         "replay_shard_0": replay_stats,
     }
     debug_stats = {
         "timing_breakdown": timing,
         "pending_sample_tasks": self.sample_tasks.count,
         "pending_replay_tasks": self.replay_tasks.count,
     }
     if self.debug:
         stats.update(debug_stats)
     if self.learner.stats:
         stats["learner"] = self.learner.stats
     return dict(PolicyOptimizer.stats(self), **stats)
Code Example #11
 def stats(self):
     return dict(PolicyOptimizer.stats(self), **{
         "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
         "load_time_ms": round(1000 * self.load_timer.mean, 3),
         "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
         "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
     })
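All of these stats() methods rely on the same dict(base, **extra) idiom: it copies the base dict returned by PolicyOptimizer.stats and overlays the optimizer-specific keys, with the extra keys winning on collisions. A quick standalone check of that behavior:

base = {"num_steps_trained": 100, "sample_time_ms": 1.0}
extra = {"sample_time_ms": 2.5, "grad_time_ms": 4.0}
merged = dict(base, **extra)
# Keys passed as **extra override any matching keys in base.
assert merged == {"num_steps_trained": 100,
                  "sample_time_ms": 2.5,
                  "grad_time_ms": 4.0}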
Code Example #12
 def stats(self):
     replay_stats = ray_get_and_free(self.replay_actors[0].stats.remote(
         self.debug))
     timing = {
         "{}_time_ms".format(k): round(1000 * self.timers[k].mean, 3)
         for k in self.timers
     }
     timing["learner_grad_time_ms"] = round(
         1000 * self.learner.grad_timer.mean, 3)
     timing["learner_dequeue_time_ms"] = round(
         1000 * self.learner.queue_timer.mean, 3)
     stats = {
         "sample_throughput": round(self.timers["sample"].mean_throughput,
                                    3),
         "train_throughput": round(self.timers["train"].mean_throughput, 3),
         "num_weight_syncs": self.num_weight_syncs,
         "num_samples_dropped": self.num_samples_dropped,
         "learner_queue": self.learner.learner_queue_size.stats(),
         "replay_shard_0": replay_stats,
     }
     debug_stats = {
         "timing_breakdown": timing,
         "pending_sample_tasks": self.sample_tasks.count,
         "pending_replay_tasks": self.replay_tasks.count,
     }
     if self.debug:
         stats.update(debug_stats)
     if self.learner.stats:
         stats["learner"] = self.learner.stats
     return dict(PolicyOptimizer.stats(self), **stats)
Code Example #13
 def stats(self):
     timing = {
         "{}_time_ms".format(k): round(1000 * self.timers[k].mean, 3)
         for k in self.timers
     }
     timing["learner_grad_time_ms"] = round(
         1000 * self.learner.grad_timer.mean, 3)
     timing["learner_load_time_ms"] = round(
         1000 * self.learner.load_timer.mean, 3)
     timing["learner_load_wait_time_ms"] = round(
         1000 * self.learner.load_wait_timer.mean, 3)
     timing["learner_dequeue_time_ms"] = round(
         1000 * self.learner.queue_timer.mean, 3)
     stats = {
         "sample_throughput": round(self.timers["sample"].mean_throughput,
                                    3),
         "train_throughput": round(self.timers["train"].mean_throughput, 3),
         "num_weight_syncs": self.num_weight_syncs,
         "num_steps_replayed": self.num_replayed,
         "timing_breakdown": timing,
         "learner_queue": self.learner.learner_queue_size.stats(),
     }
     if self.learner.stats:
         stats["learner"] = self.learner.stats
     return dict(PolicyOptimizer.stats(self), **stats)
Code Example #14
 def stats(self):
     return dict(
         PolicyOptimizer.stats(self), **{
             "wait_time_ms": round(1000 * self.wait_timer.mean, 3),
             "apply_time_ms": round(1000 * self.apply_timer.mean, 3),
             "dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3),
         })
Code Example #15
 def stats(self):
     return dict(
         PolicyOptimizer.stats(self), **{
             "update_weights_time_ms": round(
                 1000 * self.update_weights_timer.mean, 3),
             "learn_time_ms": round(1000 * self.learn_timer.mean, 3),
             "learner": self.learner_stats,
         })
Code Example #16
 def stats(self):
     return dict(PolicyOptimizer.stats(self), **{
         "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
         "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
         "update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
         "opt_peak_throughput": round(self.grad_timer.mean_throughput, 3),
         "opt_samples": round(self.grad_timer.mean_units_processed, 3),
     })
Code Example #17
 def stats(self):
     return dict(
         PolicyOptimizer.stats(self), **{
             "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
             "load_time_ms": round(1000 * self.load_timer.mean, 3),
             "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
             "update_time_ms": round(1000 * self.update_weights_timer.mean,
                                     3),
         })
Code Example #18
    def __init__(self, workers, grads_per_step=100):
        """Initialize an async gradients optimizer.

        Arguments:
            grads_per_step (int): The number of gradients to collect and apply
                per call to step(). This number should be sufficiently
                high to amortize the overhead of calling step().
        """
        PolicyOptimizer.__init__(self, workers)

        self.apply_timer = TimerStat()
        self.wait_timer = TimerStat()
        self.dispatch_timer = TimerStat()
        self.grads_per_step = grads_per_step
        self.learner_stats = {}
        if not self.workers.remote_workers():
            raise ValueError(
                "Async optimizer requires at least 1 remote workers")
Code Example #19
 def stats(self):
     return dict(
         PolicyOptimizer.stats(self), **{
             "sync_weights_up_time": round(1000 * self.sync_up_timer.mean,
                                           3),
             "sync_weights_down_time": round(
                 1000 * self.sync_down_timer.mean, 3),
             "learn_time_ms": round(1000 * self.learn_timer.mean, 3),
             "learner": self.learner_stats,
         })
Code Example #20
    def __init__(self, workers, train_batch_size=10000, microbatch_size=1000):
        PolicyOptimizer.__init__(self, workers)

        if train_batch_size <= microbatch_size:
            raise ValueError(
                "The microbatch size must be smaller than the train batch "
                "size, got {} vs {}".format(microbatch_size, train_batch_size))

        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()
        self.train_batch_size = train_batch_size
        self.microbatch_size = microbatch_size
        self.learner_stats = {}
        self.policies = dict(
            self.workers.local_worker().foreach_trainable_policy(lambda p, i:
                                                                 (i, p)))
        logger.debug("Policies to train: {}".format(self.policies))
Code Example #21
    def __init__(self,
                 workers,
                 num_sgd_iter=1,
                 train_batch_size=1,
                 sgd_minibatch_size=0,
                 standardize_fields=frozenset([])):
        PolicyOptimizer.__init__(self, workers)

        self.update_weights_timer = TimerStat()
        self.standardize_fields = standardize_fields
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.throughput = RunningStat()
        self.num_sgd_iter = num_sgd_iter
        self.sgd_minibatch_size = sgd_minibatch_size
        self.train_batch_size = train_batch_size
        self.learner_stats = {}
        self.policies = dict(self.workers.local_worker()
                             .foreach_trainable_policy(lambda p, i: (i, p)))
        logger.debug("Policies to train: {}".format(self.policies))
Code Example #22
    def __init__(self,
                 workers,
                 learning_starts=1000,
                 buffer_size=10000,
                 train_batch_size=32):
        PolicyOptimizer.__init__(self, workers)

        self.replay_starts = learning_starts
        self.max_buffer_size = buffer_size
        self.train_batch_size = train_batch_size
        assert self.max_buffer_size >= self.replay_starts

        # List of buffered sample batches
        self.replay_buffer = []
        self.buffer_size = 0

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.learner_stats = {}
Code Example #23
 def stats(self):
     return dict(
         PolicyOptimizer.stats(self), **{
             "sample_time_ms": round(1000 * self.sample_timer.mean, 3),
             "grad_time_ms": round(1000 * self.grad_timer.mean, 3),
             "update_time_ms": round(1000 * self.update_weights_timer.mean,
                                     3),
             "opt_peak_throughput": round(self.grad_timer.mean_throughput,
                                          3),
             "opt_samples": round(self.grad_timer.mean_units_processed, 3),
             "learner": self.learner_stats,
         })
Code Example #24
    def stats(self):
        def timer_to_ms(timer):
            return round(1000 * timer.mean, 3)

        stats = self.aggregator.stats()
        stats.update(self.get_mean_stats_and_reset())
        stats["timing_breakdown"] = {
            "optimizer_step_time_ms": timer_to_ms(self._optimizer_step_timer),
            "learner_grad_time_ms": timer_to_ms(self.learner.grad_timer),
            "learner_load_time_ms": timer_to_ms(self.learner.load_timer),
            "learner_load_wait_time_ms":
            timer_to_ms(self.learner.load_wait_timer),
            "learner_dequeue_time_ms": timer_to_ms(self.learner.queue_timer),
        }
        stats["learner_queue"] = self.learner.learner_queue_size.stats()
        if self.learner.stats:
            stats["learner"] = self.learner.stats
        return dict(PolicyOptimizer.stats(self), **stats)
Code Example #25
    def stats(self):
        def timer_to_ms(timer):
            return round(1000 * timer.mean, 3)

        stats_list = []
        learner_info = {}

        for ws_id in self.aggregator_set.keys():
            aggregator = self.aggregator_set[ws_id]
            learner = self.learner_set[ws_id]

            stats = aggregator.stats()
            stats.update(self.get_mean_stats_and_reset())
            stats["timing_breakdown"] = {
                "optimizer_step_time_ms": timer_to_ms(
                    self._optimizer_step_timer),
                "learner_grad_time_ms": timer_to_ms(learner.grad_timer),
                "learner_load_time_ms": timer_to_ms(learner.load_timer),
                "learner_load_wait_time_ms": timer_to_ms(
                    learner.load_wait_timer),
                "learner_dequeue_time_ms": timer_to_ms(learner.queue_timer),
            }
            stats["learner_queue"] = learner.learner_queue_size.stats()
            if learner.stats:
                policy_key = "policy{}".format(ws_id)
                learner_info[policy_key] = learner.stats
                if not self.sync_sampling:
                    learner_info[policy_key]["train_timesteps"] = int(
                        learner.stats["train_timesteps"] //
                        learner.num_sgd_iter)
                learner_info[policy_key]["sample_timesteps"] = \
                    stats["sample_timesteps"]
                learner_info[policy_key]["training_iteration"] = int(
                    stats["sample_timesteps"] // self.train_batch_size)
            stats.pop("sample_timesteps")

            stats_list.append(stats)

        ret_stat = wrap_dict_list(stats_list)
        ret_stat["learner"] = learner_info
        original_stat = PolicyOptimizer.stats(self)
        original_stat.update(ret_stat)
        return original_stat
Code Example #26
    def stats(self):
        def timer_to_ms(timer):
            return round(1000 * timer.mean, 3)

        timing = {
            "optimizer_step_time_ms": timer_to_ms(self._optimizer_step_timer),
            "learner_grad_time_ms": timer_to_ms(self.learner.grad_timer),
            "learner_load_time_ms": timer_to_ms(self.learner.load_timer),
            "learner_load_wait_time_ms": timer_to_ms(
                self.learner.load_wait_timer),
            "learner_dequeue_time_ms": timer_to_ms(self.learner.queue_timer),
        }
        stats = dict({
            "num_weight_syncs": self.num_weight_syncs,
            "num_steps_replayed": self.num_replayed,
            "timing_breakdown": timing,
            "learner_queue": self.learner.learner_queue_size.stats(),
        }, **self.get_mean_stats_and_reset())
        self._last_stats_val.clear()
        if self.learner.stats:
            stats["learner"] = self.learner.stats
        return dict(PolicyOptimizer.stats(self), **stats)
Code Example #27
    def stats(self):
        def timer_to_ms(timer):
            return round(1000 * timer.mean, 3)

        timing = {
            "optimizer_step_time_ms": timer_to_ms(self._optimizer_step_timer),
            "learner_grad_time_ms": timer_to_ms(self.learner.grad_timer),
            "learner_load_time_ms": timer_to_ms(self.learner.load_timer),
            "learner_load_wait_time_ms":
            timer_to_ms(self.learner.load_wait_timer),
            "learner_dequeue_time_ms": timer_to_ms(self.learner.queue_timer),
        }
        stats = dict(
            {
                "num_weight_syncs": self.num_weight_syncs,
                "num_steps_replayed": self.num_replayed,
                "timing_breakdown": timing,
                "learner_queue": self.learner.learner_queue_size.stats(),
            }, **self.get_mean_stats_and_reset())
        self._last_stats_val.clear()
        if self.learner.stats:
            stats["learner"] = self.learner.stats
        return dict(PolicyOptimizer.stats(self), **stats)
Code Example #28
    def __init__(self,
                 workers,
                 learning_starts=1000,
                 buffer_size=10000,
                 prioritized_replay=True,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta=0.4,
                 prioritized_replay_eps=1e-6,
                 train_batch_size=512,
                 rollout_fragment_length=50,
                 num_replay_buffer_shards=1,
                 max_weight_sync_delay=400,
                 debug=False,
                 batch_replay=False):
        """Initialize an async replay optimizer.

        Arguments:
            workers (WorkerSet): all workers
            learning_starts (int): wait until this many steps have been sampled
                before starting optimization.
            buffer_size (int): max size of the replay buffer
            prioritized_replay (bool): whether to enable prioritized replay
            prioritized_replay_alpha (float): replay alpha hyperparameter
            prioritized_replay_beta (float): replay beta hyperparameter
            prioritized_replay_eps (float): replay eps hyperparameter
            train_batch_size (int): size of batches to learn on
            rollout_fragment_length (int): size of batches to sample from
                workers.
            num_replay_buffer_shards (int): number of actors to use to store
                replay samples
            max_weight_sync_delay (int): update the weights of a rollout worker
                after collecting this number of timesteps from it
            debug (bool): return extra debug stats
            batch_replay (bool): replay entire sequential batches of
                experiences instead of sampling steps individually
        """
        PolicyOptimizer.__init__(self, workers)

        self.debug = debug
        self.batch_replay = batch_replay
        self.replay_starts = learning_starts
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps
        self.max_weight_sync_delay = max_weight_sync_delay

        self.learner = LearnerThread(self.workers.local_worker())
        self.learner.start()

        if self.batch_replay:
            replay_cls = BatchReplayActor
        else:
            replay_cls = ReplayActor
        self.replay_actors = create_colocated(replay_cls, [
            num_replay_buffer_shards,
            learning_starts,
            buffer_size,
            train_batch_size,
            prioritized_replay_alpha,
            prioritized_replay_beta,
            prioritized_replay_eps,
        ], num_replay_buffer_shards)

        # Stats
        self.timers = {
            k: TimerStat()
            for k in [
                "put_weights", "get_samples", "sample_processing",
                "replay_processing", "update_priorities", "train", "sample"
            ]
        }
        self.num_weight_syncs = 0
        self.num_samples_dropped = 0
        self.learning_started = False

        # Number of worker steps since the last weight update
        self.steps_since_update = {}

        # Kick off replay tasks for local gradient updates
        self.replay_tasks = TaskPool()
        for ra in self.replay_actors:
            for _ in range(REPLAY_QUEUE_DEPTH):
                self.replay_tasks.add(ra, ra.replay.remote())

        # Kick off async background sampling
        self.sample_tasks = TaskPool()
        if self.workers.remote_workers():
            self._set_workers(self.workers.remote_workers())
Code Example #29
    def __init__(self,
                 workers,
                 train_batch_size=500,
                 sample_batch_size=50,
                 # num_envs_per_worker=1,
                 num_gpus=0,
                 # lr=0.0005,
                 replay_buffer_num_slots=0,
                 replay_proportion=0.0,
                 num_data_loader_buffers=1,
                 max_sample_requests_in_flight_per_worker=2,
                 broadcast_interval=1,
                 num_sgd_iter=1,
                 sgd_minibatch_size=1,
                 learner_queue_size=16,
                 learner_queue_timeout=300,
                 num_aggregation_workers=0,
                 shuffle_sequences=True,
                 sync_sampling=False,
                 minibatch_buffer_size=1,
                 _fake_gpus=False):
        PolicyOptimizer.__init__(self, workers)

        self._stats_start_time = time.time()
        self._last_stats_time = {}
        self._last_stats_sum = {}

        self.learner_set = {}
        self.aggregator_set = {}

        self.sync_sampling = sync_sampling

        assert isinstance(workers, SuperWorkerSet)

        for ws_id, ws in workers.items():
            if num_gpus > 1 or num_data_loader_buffers > 1:
                # logger.info(
                #     "Enabling multi-GPU mode, {} GPUs, {} parallel
                #     loaders".format(
                #         num_gpus, num_data_loader_buffers))
                # if num_data_loader_buffers < minibatch_buffer_size:
                #     raise ValueError(
                #         "In multi-gpu mode you must have at least as many "
                #         "parallel data loader buffers as minibatch buffers: "
                #         "{} vs {}".format(num_data_loader_buffers,
                #                           minibatch_buffer_size))
                # self.learner = TFMultiGPULearner(
                #     self.workers.local_worker(),
                #     lr=lr,
                #     num_gpus=num_gpus,
                #     train_batch_size=train_batch_size,
                #     num_data_loader_buffers=num_data_loader_buffers,
                #     minibatch_buffer_size=minibatch_buffer_size,
                #     num_sgd_iter=num_sgd_iter,
                #     learner_queue_size=learner_queue_size,
                #     learner_queue_timeout=learner_queue_timeout,
                #     _fake_gpus=_fake_gpus)
                raise NotImplementedError()
            else:
                if self.sync_sampling:
                    learner = SyncLearnerThread(
                        ws.local_worker(),
                        minibatch_buffer_size=minibatch_buffer_size,
                        num_sgd_iter=num_sgd_iter,
                        learner_queue_size=learner_queue_size,
                        learner_queue_timeout=learner_queue_timeout,
                        num_gpus=num_gpus,
                        sgd_batch_size=sgd_minibatch_size
                    )
                else:
                    learner = AsyncLearnerThread(
                        ws.local_worker(),
                        minibatch_buffer_size=minibatch_buffer_size,
                        num_sgd_iter=num_sgd_iter,
                        learner_queue_size=learner_queue_size,
                        learner_queue_timeout=learner_queue_timeout,
                    )
            learner.start()
            self.learner_set[ws_id] = learner

            if num_aggregation_workers > 0:
                raise NotImplementedError()
                # self.aggregator = TreeAggregator(
                #     workers,
                #     num_aggregation_workers,
                #     replay_proportion=replay_proportion,
                #     max_sample_requests_in_flight_per_worker=(
                #         max_sample_requests_in_flight_per_worker),
                #     replay_buffer_num_slots=replay_buffer_num_slots,
                #     train_batch_size=train_batch_size,
                #     sample_batch_size=sample_batch_size,
                #     broadcast_interval=broadcast_interval)
            else:
                aggregator = DRAggregator(
                    ws,
                    replay_proportion=replay_proportion,
                    max_sample_requests_in_flight_per_worker=(
                        max_sample_requests_in_flight_per_worker if not
                        self.sync_sampling else 1),
                    replay_buffer_num_slots=replay_buffer_num_slots,
                    train_batch_size=train_batch_size,
                    sample_batch_size=sample_batch_size,
                    broadcast_interval=broadcast_interval,
                    sync_sampling=sync_sampling
                )
            self.aggregator_set[ws_id] = aggregator
        self.train_batch_size = train_batch_size
        self.shuffle_sequences = shuffle_sequences
        logger.debug("===== Do you in sync sampling mode? {} =====".format(
            sync_sampling))

        # Stats
        self._optimizer_step_timer = TimerStat()
        self._stats_start_time = time.time()
        self._last_stats_time = {}

        self.episode_history = {ws_id: [] for ws_id, _ in self.workers.items()}
        self.to_be_collected = {ws_id: [] for ws_id, _ in self.workers.items()}
Code Example #30
    def __init__(
        self,
        workers,
        learning_starts=1000,
        buffer_size=10000,
        prioritized_replay=True,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta=0.4,
        prioritized_replay_eps=1e-6,
        final_prioritized_replay_beta=0.4,
        train_batch_size=32,
        before_learn_on_batch=None,
        synchronize_sampling=False,
        prioritized_replay_beta_annealing_timesteps=100000 * 0.2,
    ):
        """Initialize an sync replay optimizer.

        Args:
            workers (WorkerSet): all workers
            learning_starts (int): wait until this many steps have been sampled
                before starting optimization.
            buffer_size (int): max size of the replay buffer
            prioritized_replay (bool): whether to enable prioritized replay
            prioritized_replay_alpha (float): replay alpha hyperparameter
            prioritized_replay_beta (float): replay beta hyperparameter
            prioritized_replay_eps (float): replay eps hyperparameter
            final_prioritized_replay_beta (float): Final value of beta.
            train_batch_size (int): size of batches to learn on
            before_learn_on_batch (function): callback to run before passing
                the sampled batch to learn on
            synchronize_sampling (bool): whether to sample the experiences for
                all policies with the same indices (used in MADDPG).
            prioritized_replay_beta_annealing_timesteps (int): The timestep at
                which PR-beta annealing should end.
        """
        PolicyOptimizer.__init__(self, workers)

        self.replay_starts = learning_starts

        # Linearly annealing beta used in Rainbow paper, stopping at
        # `final_prioritized_replay_beta`.
        self.prioritized_replay_beta = PiecewiseSchedule(
            endpoints=[(0, prioritized_replay_beta),
                       (prioritized_replay_beta_annealing_timesteps,
                        final_prioritized_replay_beta)],
            outside_value=final_prioritized_replay_beta,
            framework=None)
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size
        self.before_learn_on_batch = before_learn_on_batch
        self.synchronize_sampling = synchronize_sampling

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.learner_stats = {}

        # Set up replay buffer
        if prioritized_replay:

            def new_buffer():
                return PrioritizedReplayBuffer(buffer_size,
                                               alpha=prioritized_replay_alpha)
        else:

            def new_buffer():
                return ReplayBuffer(buffer_size)

        self.replay_buffers = collections.defaultdict(new_buffer)

        if buffer_size < self.replay_starts:
            logger.warning("buffer_size={} < replay_starts={}".format(
                buffer_size, self.replay_starts))
Code Example #31
    def __init__(self,
                 workers,
                 learning_starts=1000,
                 buffer_size=10000,
                 prioritized_replay=True,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta=0.4,
                 prioritized_replay_eps=1e-6,
                 schedule_max_timesteps=100000,
                 beta_annealing_fraction=0.2,
                 final_prioritized_replay_beta=0.4,
                 train_batch_size=32,
                 sample_batch_size=4,
                 before_learn_on_batch=None,
                 synchronize_sampling=False):
        """Initialize an sync replay optimizer.

        Arguments:
            workers (WorkerSet): all workers
            learning_starts (int): wait until this many steps have been sampled
                before starting optimization.
            buffer_size (int): max size of the replay buffer
            prioritized_replay (bool): whether to enable prioritized replay
            prioritized_replay_alpha (float): replay alpha hyperparameter
            prioritized_replay_beta (float): replay beta hyperparameter
            prioritized_replay_eps (float): replay eps hyperparameter
            schedule_max_timesteps (int): number of timesteps in the schedule
            beta_annealing_fraction (float): fraction of schedule to anneal
                beta over
            final_prioritized_replay_beta (float): final value of beta
            train_batch_size (int): size of batches to learn on
            sample_batch_size (int): size of batches to sample from workers
            before_learn_on_batch (function): callback to run before passing
                the sampled batch to learn on
            synchronize_sampling (bool): whether to sample the experiences for
                all policies with the same indices (used in MADDPG).
        """
        PolicyOptimizer.__init__(self, workers)

        self.replay_starts = learning_starts
        # Linearly anneal beta as in the Rainbow paper.
        self.prioritized_replay_beta = LinearSchedule(
            schedule_timesteps=int(schedule_max_timesteps *
                                   beta_annealing_fraction),
            initial_p=prioritized_replay_beta,
            final_p=final_prioritized_replay_beta)
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size
        self.before_learn_on_batch = before_learn_on_batch
        self.synchronize_sampling = synchronize_sampling

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.learner_stats = {}

        # Set up replay buffer
        if prioritized_replay:

            def new_buffer():
                return PrioritizedReplayBuffer(buffer_size,
                                               alpha=prioritized_replay_alpha)
        else:

            def new_buffer():
                return ReplayBuffer(buffer_size)

        self.replay_buffers = collections.defaultdict(new_buffer)

        if buffer_size < self.replay_starts:
            logger.warning("buffer_size={} < replay_starts={}".format(
                buffer_size, self.replay_starts))
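For reference, the linear beta annealing that LinearSchedule provides here can be written out by hand as below. This is an illustrative sketch of the schedule's behavior only, not the LinearSchedule implementation itself; the values in the asserts are arbitrary.

def linear_beta(t, schedule_timesteps, initial_p, final_p):
    # Interpolate linearly from initial_p to final_p over
    # schedule_timesteps steps, then hold at final_p.
    frac = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + frac * (final_p - initial_p)

assert linear_beta(0, 20000, 0.5, 1.0) == 0.5
assert linear_beta(10000, 20000, 0.5, 1.0) == 0.75
assert linear_beta(50000, 20000, 0.5, 1.0) == 1.0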
Code Example #32
    def __init__(self,
                 workers,
                 train_batch_size=500,
                 sample_batch_size=50,
                 num_envs_per_worker=1,
                 num_gpus=0,
                 lr=0.0005,
                 replay_buffer_num_slots=0,
                 replay_proportion=0.0,
                 num_data_loader_buffers=1,
                 max_sample_requests_in_flight_per_worker=2,
                 broadcast_interval=1,
                 num_sgd_iter=1,
                 minibatch_buffer_size=1,
                 learner_queue_size=16,
                 num_aggregation_workers=0,
                 _fake_gpus=False):
        PolicyOptimizer.__init__(self, workers)

        self._stats_start_time = time.time()
        self._last_stats_time = {}
        self._last_stats_sum = {}

        if num_gpus > 1 or num_data_loader_buffers > 1:
            logger.info(
                "Enabling multi-GPU mode, {} GPUs, {} parallel loaders".format(
                    num_gpus, num_data_loader_buffers))
            if num_data_loader_buffers < minibatch_buffer_size:
                raise ValueError(
                    "In multi-gpu mode you must have at least as many "
                    "parallel data loader buffers as minibatch buffers: "
                    "{} vs {}".format(num_data_loader_buffers,
                                      minibatch_buffer_size))
            self.learner = TFMultiGPULearner(
                self.workers.local_worker(),
                lr=lr,
                num_gpus=num_gpus,
                train_batch_size=train_batch_size,
                num_data_loader_buffers=num_data_loader_buffers,
                minibatch_buffer_size=minibatch_buffer_size,
                num_sgd_iter=num_sgd_iter,
                learner_queue_size=learner_queue_size,
                _fake_gpus=_fake_gpus)
        else:
            self.learner = LearnerThread(self.workers.local_worker(),
                                         minibatch_buffer_size, num_sgd_iter,
                                         learner_queue_size)
        self.learner.start()

        # Stats
        self._optimizer_step_timer = TimerStat()
        self._stats_start_time = time.time()
        self._last_stats_time = {}

        if num_aggregation_workers > 0:
            self.aggregator = TreeAggregator(
                workers,
                num_aggregation_workers,
                replay_proportion=replay_proportion,
                max_sample_requests_in_flight_per_worker=(
                    max_sample_requests_in_flight_per_worker),
                replay_buffer_num_slots=replay_buffer_num_slots,
                train_batch_size=train_batch_size,
                sample_batch_size=sample_batch_size,
                broadcast_interval=broadcast_interval)
        else:
            self.aggregator = SimpleAggregator(
                workers,
                replay_proportion=replay_proportion,
                max_sample_requests_in_flight_per_worker=(
                    max_sample_requests_in_flight_per_worker),
                replay_buffer_num_slots=replay_buffer_num_slots,
                train_batch_size=train_batch_size,
                sample_batch_size=sample_batch_size,
                broadcast_interval=broadcast_interval)
Code Example #33
    def __init__(self,
                 workers,
                 sgd_batch_size=128,
                 num_sgd_iter=10,
                 sample_batch_size=200,
                 num_envs_per_worker=1,
                 train_batch_size=1024,
                 num_gpus=0,
                 standardize_fields=[],
                 straggler_mitigation=False):
        PolicyOptimizer.__init__(self, workers)

        self.batch_size = sgd_batch_size
        self.num_sgd_iter = num_sgd_iter
        self.num_envs_per_worker = num_envs_per_worker
        self.sample_batch_size = sample_batch_size
        self.train_batch_size = train_batch_size
        self.straggler_mitigation = straggler_mitigation
        if not num_gpus:
            self.devices = ["/cpu:0"]
        else:
            self.devices = [
                "/gpu:{}".format(i) for i in range(int(math.ceil(num_gpus)))
            ]
        self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
            self.devices)
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()
        self.standardize_fields = standardize_fields

        logger.info("LocalMultiGPUOptimizer devices {}".format(self.devices))

        self.policies = dict(
            self.workers.local_worker().foreach_trainable_policy(lambda p, i:
                                                                 (i, p)))
        logger.debug("Policies to train: {}".format(self.policies))
        for policy_id, policy in self.policies.items():
            if not isinstance(policy, TFPolicy):
                raise ValueError(
                    "Only TF policies are supported with multi-GPU. Try using "
                    "the simple optimizer instead.")

        # Per-GPU graph copies created below must share vars with the policy.
        # Reuse is set to AUTO_REUSE because Adam nodes are created after
        # all of the device copies are created.
        self.optimizers = {}
        with self.workers.local_worker().tf_sess.graph.as_default():
            with self.workers.local_worker().tf_sess.as_default():
                for policy_id, policy in self.policies.items():
                    with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
                        if policy._state_inputs:
                            rnn_inputs = policy._state_inputs + [
                                policy._seq_lens
                            ]
                        else:
                            rnn_inputs = []
                        self.optimizers[policy_id] = (
                            LocalSyncParallelOptimizer(
                                policy._optimizer, self.devices,
                                [v
                                 for _, v in policy._loss_inputs], rnn_inputs,
                                self.per_device_batch_size, policy.copy))

                self.sess = self.workers.local_worker().tf_sess
                self.sess.run(tf.global_variables_initializer())
Code Example #34
File: multi_gpu_optimizer.py (Project: zhuohan123/ray)
    def __init__(self,
                 workers,
                 sgd_batch_size=128,
                 num_sgd_iter=10,
                 rollout_fragment_length=200,
                 num_envs_per_worker=1,
                 train_batch_size=1024,
                 num_gpus=0,
                 standardize_fields=[],
                 shuffle_sequences=True,
                 _fake_gpus=False):
        """Initialize a synchronous multi-gpu optimizer.

        Arguments:
            workers (WorkerSet): all workers
            sgd_batch_size (int): SGD minibatch size within train batch size
            num_sgd_iter (int): number of passes to learn on per train batch
            rollout_fragment_length (int): size of batches to sample from
                workers.
            num_envs_per_worker (int): num envs in each rollout worker
            train_batch_size (int): size of batches to learn on
            num_gpus (int): number of GPUs to use for data-parallel SGD
            standardize_fields (list): list of fields in the training batch
                to normalize
            shuffle_sequences (bool): whether to shuffle the train batch prior
                to SGD to break up correlations
            _fake_gpus (bool): Whether to use fake-GPUs (CPUs) instead of
                actual GPUs (should only be used for testing on non-GPU
                machines).
        """
        PolicyOptimizer.__init__(self, workers)

        self.batch_size = sgd_batch_size
        self.num_sgd_iter = num_sgd_iter
        self.num_envs_per_worker = num_envs_per_worker
        self.rollout_fragment_length = rollout_fragment_length
        self.train_batch_size = train_batch_size
        self.shuffle_sequences = shuffle_sequences

        # Collect actual devices to use.
        if not num_gpus:
            _fake_gpus = True
            num_gpus = 1
        type_ = "cpu" if _fake_gpus else "gpu"
        self.devices = [
            "/{}:{}".format(type_, i) for i in range(int(math.ceil(num_gpus)))
        ]

        self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
            self.devices)
        assert self.batch_size % len(self.devices) == 0
        assert self.batch_size >= len(self.devices), "batch size too small"
        self.per_device_batch_size = int(self.batch_size / len(self.devices))
        self.sample_timer = TimerStat()
        self.load_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.update_weights_timer = TimerStat()
        self.standardize_fields = standardize_fields

        logger.info("LocalMultiGPUOptimizer devices {}".format(self.devices))

        self.policies = dict(self.workers.local_worker()
                             .foreach_trainable_policy(lambda p, i: (i, p)))
        logger.debug("Policies to train: {}".format(self.policies))
        for policy_id, policy in self.policies.items():
            if not isinstance(policy, TFPolicy):
                raise ValueError(
                    "Only TF graph policies are supported with multi-GPU. "
                    "Try setting `simple_optimizer=True` instead.")

        # Per-GPU graph copies created below must share vars with the policy.
        # Reuse is set to AUTO_REUSE because Adam nodes are created after
        # all of the device copies are created.
        self.optimizers = {}
        with self.workers.local_worker().tf_sess.graph.as_default():
            with self.workers.local_worker().tf_sess.as_default():
                for policy_id, policy in self.policies.items():
                    with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
                        if policy._state_inputs:
                            rnn_inputs = policy._state_inputs + [
                                policy._seq_lens
                            ]
                        else:
                            rnn_inputs = []
                        self.optimizers[policy_id] = (
                            LocalSyncParallelOptimizer(
                                policy._optimizer, self.devices,
                                [v
                                 for _, v in policy._loss_inputs], rnn_inputs,
                                self.per_device_batch_size, policy.copy))

                self.sess = self.workers.local_worker().tf_sess
                self.sess.run(tf.global_variables_initializer())
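The device selection and batch rounding at the top of this constructor can be exercised on their own. The split_batch helper below is hypothetical and simply restates that logic so the rounding behavior is easy to verify outside the optimizer.

import math

def split_batch(sgd_batch_size, num_gpus, _fake_gpus=False):
    # Fall back to a single fake (CPU) device when no GPUs are requested,
    # then round the SGD batch down to a multiple of the device count.
    if not num_gpus:
        _fake_gpus = True
        num_gpus = 1
    type_ = "cpu" if _fake_gpus else "gpu"
    devices = ["/{}:{}".format(type_, i)
               for i in range(int(math.ceil(num_gpus)))]
    batch_size = int(sgd_batch_size / len(devices)) * len(devices)
    per_device_batch_size = batch_size // len(devices)
    return devices, batch_size, per_device_batch_size

assert split_batch(128, 0) == (["/cpu:0"], 128, 128)
assert split_batch(130, 2) == (["/gpu:0", "/gpu:1"], 130, 65)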
Code Example #35
    def __init__(self,
                 local_evaluator,
                 remote_evaluators,
                 learning_starts=1000,
                 buffer_size=10000,
                 prioritized_replay=True,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta=0.4,
                 prioritized_replay_eps=1e-6,
                 train_batch_size=512,
                 sample_batch_size=50,
                 num_replay_buffer_shards=1,
                 max_weight_sync_delay=400,
                 debug=False,
                 batch_replay=False):
        PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators)

        self.debug = debug
        self.batch_replay = batch_replay
        self.replay_starts = learning_starts
        self.prioritized_replay_beta = prioritized_replay_beta
        self.prioritized_replay_eps = prioritized_replay_eps
        self.max_weight_sync_delay = max_weight_sync_delay

        self.learner = LearnerThread(self.local_evaluator)
        self.learner.start()

        if self.batch_replay:
            replay_cls = BatchReplayActor
        else:
            replay_cls = ReplayActor
        self.replay_actors = create_colocated(replay_cls, [
            num_replay_buffer_shards,
            learning_starts,
            buffer_size,
            train_batch_size,
            prioritized_replay_alpha,
            prioritized_replay_beta,
            prioritized_replay_eps,
        ], num_replay_buffer_shards)

        # Stats
        self.timers = {
            k: TimerStat()
            for k in [
                "put_weights", "get_samples", "sample_processing",
                "replay_processing", "update_priorities", "train", "sample"
            ]
        }
        self.num_weight_syncs = 0
        self.num_samples_dropped = 0
        self.learning_started = False

        # Number of worker steps since the last weight update
        self.steps_since_update = {}

        # Kick off replay tasks for local gradient updates
        self.replay_tasks = TaskPool()
        for ra in self.replay_actors:
            for _ in range(REPLAY_QUEUE_DEPTH):
                self.replay_tasks.add(ra, ra.replay.remote())

        # Kick off async background sampling
        self.sample_tasks = TaskPool()
        if self.remote_evaluators:
            self._set_evaluators(self.remote_evaluators)