def testMultiplePasses(self):
    local, remotes = self._make_evs()
    optimizer = AsyncSamplesOptimizer(
        local, remotes, {
            "minibatch_buffer_size": 10,
            "num_sgd_iter": 10,
            "sample_batch_size": 10,
            "train_batch_size": 50,
        })
    self._wait_for(optimizer, 1000, 10000)
    self.assertLess(optimizer.stats()["num_steps_sampled"], 5000)
    self.assertGreater(optimizer.stats()["num_steps_trained"], 8000)

def testReplay(self):
    local, remotes = self._make_evs()
    optimizer = AsyncSamplesOptimizer(
        local, remotes, {
            "replay_buffer_num_slots": 100,
            "replay_proportion": 10,
            "sample_batch_size": 10,
            "train_batch_size": 10,
        })
    self._wait_for(optimizer, 1000, 1000)
    self.assertLess(optimizer.stats()["num_steps_sampled"], 5000)
    self.assertGreater(optimizer.stats()["num_steps_replayed"], 8000)
    self.assertGreater(optimizer.stats()["num_steps_trained"], 8000)

def _init(self, config, env_creator):
    for k in OPTIMIZER_SHARED_CONFIGS:
        if k not in config["optimizer"]:
            config["optimizer"][k] = config[k]
    policy_cls = self._get_policy_graph()
    self.local_evaluator = self.make_local_evaluator(
        env_creator, policy_cls)
    self.remote_evaluators = self.make_remote_evaluators(
        env_creator, policy_cls, config["num_workers"])
    self.optimizer = AsyncSamplesOptimizer(
        self.local_evaluator, self.remote_evaluators, config["optimizer"])
    if config["entropy_coeff"] < 0:
        raise DeprecationWarning("entropy_coeff must be >= 0")

def testReplayAndMultiplePasses(self):
    local, remotes = self._make_evs()
    optimizer = AsyncSamplesOptimizer(
        local, remotes, {
            "minibatch_buffer_size": 10,
            "num_sgd_iter": 10,
            "replay_buffer_num_slots": 100,
            "replay_proportion": 10,
            "sample_batch_size": 10,
            "train_batch_size": 10,
        })
    self._wait_for(optimizer, 1000, 1000)
    self.assertLess(optimizer.stats()["num_steps_sampled"], 5000)
    self.assertGreater(optimizer.stats()["num_steps_replayed"], 8000)
    self.assertGreater(optimizer.stats()["num_steps_trained"], 40000)

def _init(self):
    for k in OPTIMIZER_SHARED_CONFIGS:
        if k not in self.config["optimizer"]:
            self.config["optimizer"][k] = self.config[k]
    if self.config["vtrace"]:
        policy_cls = self._policy_graph
    else:
        policy_cls = A3CPolicyGraph
    self.local_evaluator = self.make_local_evaluator(
        self.env_creator, policy_cls)
    self.remote_evaluators = self.make_remote_evaluators(
        self.env_creator, policy_cls, self.config["num_workers"])
    self.optimizer = AsyncSamplesOptimizer(
        self.local_evaluator, self.remote_evaluators,
        self.config["optimizer"])

def testReplay(self):
    local, remotes = self._make_evs()
    optimizer = AsyncSamplesOptimizer(
        local, remotes, {
            "replay_buffer_num_slots": 100,
            "replay_proportion": 10,
            "sample_batch_size": 10,
            "train_batch_size": 10,
        })
    self._wait_for(optimizer, 1000, 1000)
    stats = optimizer.stats()
    self.assertLess(stats["num_steps_sampled"], 5000)
    replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
    self.assertGreater(replay_ratio, 0.7)
    self.assertLess(stats["num_steps_trained"], stats["num_steps_sampled"])

def testReplay(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(
        workers,
        replay_buffer_num_slots=100,
        replay_proportion=10,
        sample_batch_size=10,
        train_batch_size=10,
    )
    self._wait_for(optimizer, 1000, 1000)
    stats = optimizer.stats()
    self.assertLess(stats["num_steps_sampled"], 5000)
    replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
    self.assertGreater(replay_ratio, 0.7)
    self.assertLess(stats["num_steps_trained"], stats["num_steps_sampled"])

def testMultiTierAggregationBadConf(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    aggregators = TreeAggregator.precreate_aggregators(4)
    optimizer = AsyncSamplesOptimizer(workers, num_aggregation_workers=4)
    self.assertRaises(ValueError,
                      lambda: optimizer.aggregator.init(aggregators))

def testMultiTierAggregation(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    aggregators = TreeAggregator.precreate_aggregators(1)
    optimizer = AsyncSamplesOptimizer(workers, num_aggregation_workers=1)
    optimizer.aggregator.init(aggregators)
    self._wait_for(optimizer, 1000, 1000)

def testMultiGPU(self):
    local, remotes = self._make_evs()
    optimizer = AsyncSamplesOptimizer(
        local, remotes, num_gpus=2, _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)

def testMultiGPU(self):
    local, remotes = self._make_evs()
    optimizer = AsyncSamplesOptimizer(local, remotes, {
        "num_gpus": 2,
        "_fake_gpus": True
    })
    self._wait_for(optimizer, 1000, 1000)

def make_aggregators_and_optimizer(workers, config):
    if config["num_aggregation_workers"] > 0:
        # Create co-located aggregator actors first for placement pref
        aggregators = TreeAggregator.precreate_aggregators(
            config["num_aggregation_workers"])
    else:
        aggregators = None

    workers.add_workers(config["num_workers"])

    optimizer = AsyncSamplesOptimizer(
        workers,
        lr=config["lr"],
        num_gpus=config["num_gpus"],
        rollout_fragment_length=config["rollout_fragment_length"],
        train_batch_size=config["train_batch_size"],
        replay_buffer_num_slots=config["replay_buffer_num_slots"],
        replay_proportion=config["replay_proportion"],
        num_data_loader_buffers=config["num_data_loader_buffers"],
        max_sample_requests_in_flight_per_worker=config[
            "max_sample_requests_in_flight_per_worker"],
        broadcast_interval=config["broadcast_interval"],
        num_sgd_iter=config["num_sgd_iter"],
        minibatch_buffer_size=config["minibatch_buffer_size"],
        num_aggregation_workers=config["num_aggregation_workers"],
        learner_queue_size=config["learner_queue_size"],
        learner_queue_timeout=config["learner_queue_timeout"],
        **config["optimizer"])

    if aggregators:
        # Assign the pre-created aggregators to the optimizer
        optimizer.aggregator.init(aggregators)
    return optimizer

def testMultiTierAggregationBadConf(self):
    local, remotes = self._make_evs()
    aggregators = TreeAggregator.precreate_aggregators(4)
    optimizer = AsyncSamplesOptimizer(local, remotes,
                                      {"num_aggregation_workers": 4})
    self.assertRaises(ValueError,
                      lambda: optimizer.aggregator.init(aggregators))

def test_replay_and_multiple_passes(self):
    local, remotes = self._make_envs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(
        workers,
        minibatch_buffer_size=10,
        num_sgd_iter=10,
        replay_buffer_num_slots=100,
        replay_proportion=10,
        rollout_fragment_length=10,
        train_batch_size=10)
    self._wait_for(optimizer, 1000, 1000)

    stats = optimizer.stats()
    print(stats)
    self.assertLess(stats["num_steps_sampled"], 5000)
    replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
    self.assertGreater(replay_ratio, 0.7)

def testMultiTierAggregation(self):
    local, remotes = self._make_evs()
    aggregators = TreeAggregator.precreate_aggregators(1)
    optimizer = AsyncSamplesOptimizer(local, remotes, {
        "num_aggregation_workers": 1,
    })
    optimizer.aggregator.init(aggregators)
    self._wait_for(optimizer, 1000, 1000)

def testMultiGPUParallelLoad(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(
        workers, num_gpus=2, num_data_loader_buffers=2, _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)

def testMultiGPUParallelLoad(self):
    local, remotes = self._make_evs()
    optimizer = AsyncSamplesOptimizer(
        local, remotes, num_gpus=2, num_data_loader_buffers=2,
        _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)

def testLearnerQueueTimeout(self):
    local, remotes = self._make_envs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(
        workers,
        sample_batch_size=1000,
        train_batch_size=1000,
        learner_queue_timeout=1)
    self.assertRaises(AssertionError,
                      lambda: self._wait_for(optimizer, 1000, 1000))

class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @classmethod
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        return Resources(
            cpu=1,
            gpu=cf["num_gpus"] and cf["num_gpus"] * cf["gpu_fraction"] or 0,
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"],
            {"num_cpus": 1})
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while time.time() - start < self.config["min_iter_time_s"]:
            self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

def testReplayAndMultiplePasses(self):
    local, remotes = self._make_evs()
    optimizer = AsyncSamplesOptimizer(
        local, remotes,
        minibatch_buffer_size=10,
        num_sgd_iter=10,
        replay_buffer_num_slots=100,
        replay_proportion=10,
        sample_batch_size=10,
        train_batch_size=10)
    self._wait_for(optimizer, 1000, 1000)

    stats = optimizer.stats()
    print(stats)
    self.assertLess(stats["num_steps_sampled"], 5000)
    replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
    train_ratio = stats["num_steps_sampled"] / stats["num_steps_trained"]
    self.assertGreater(replay_ratio, 0.7)
    self.assertLess(train_ratio, 0.4)

class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Agent)
    def _init(self):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])
        if self.config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while (time.time() - start < self.config["min_iter_time_s"]
               or self.optimizer.num_steps_sampled == prev_steps):
            self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls

def _init(self):
    for k in OPTIMIZER_SHARED_CONFIGS:
        if k not in self.config["optimizer"]:
            self.config["optimizer"][k] = self.config[k]
    policy_cls = self._get_policy_graph()
    self.local_evaluator = self.make_local_evaluator(
        self.env_creator, policy_cls)
    self.remote_evaluators = self.make_remote_evaluators(
        self.env_creator, policy_cls, self.config["num_workers"])
    self.optimizer = AsyncSamplesOptimizer(
        self.local_evaluator, self.remote_evaluators,
        self.config["optimizer"])

class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Agent)
    def _init(self):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while time.time() - start < self.config["min_iter_time_s"]:
            self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls

def _init(self, config, env_creator):
    for k in OPTIMIZER_SHARED_CONFIGS:
        if k not in config["optimizer"]:
            config["optimizer"][k] = config[k]
    policy_cls = self._get_policy_graph()
    self.local_evaluator = self.make_local_evaluator(
        self.env_creator, policy_cls)

    if self.config["num_aggregation_workers"] > 0:
        # Create co-located aggregator actors first for placement pref
        aggregators = TreeAggregator.precreate_aggregators(
            self.config["num_aggregation_workers"])

    self.remote_evaluators = self.make_remote_evaluators(
        env_creator, policy_cls, config["num_workers"])
    self.optimizer = AsyncSamplesOptimizer(
        self.local_evaluator, self.remote_evaluators, config["optimizer"])
    if config["entropy_coeff"] < 0:
        raise DeprecationWarning("entropy_coeff must be >= 0")

    if self.config["num_aggregation_workers"] > 0:
        # Assign the pre-created aggregators to the optimizer
        self.optimizer.aggregator.init(aggregators)

class ImpalaTrainer(Trainer):
    """IMPALA implementation using DeepMind's V-trace."""

    _name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Trainer)
    def _init(self, config, env_creator):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in config["optimizer"]:
                config["optimizer"][k] = config[k]
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)

        if self.config["num_aggregation_workers"] > 0:
            # Create co-located aggregator actors first for placement pref
            aggregators = TreeAggregator.precreate_aggregators(
                self.config["num_aggregation_workers"])

        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            **config["optimizer"])
        if config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")

        if self.config["num_aggregation_workers"] > 0:
            # Assign the pre-created aggregators to the optimizer
            self.optimizer.aggregator.init(aggregators)

    @classmethod
    @override(Trainable)
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        Trainer._validate_config(cf)
        return Resources(
            cpu=cf["num_cpus_for_driver"],
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
            cf["num_aggregation_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    @override(Trainer)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while (time.time() - start < self.config["min_iter_time_s"]
               or self.optimizer.num_steps_sampled == prev_steps):
            self.optimizer.step()
        result = self.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls

    # MURFETD
    def reset_config(self, new_config):
        config = copy.deepcopy(DEFAULT_CONFIG)
        config.update(new_config)
        self.config = config
        # see LearningRateSchedule.__init__(self, self.config["lr"],
        # self.config["lr_schedule"]) in vtrace_policy_graph.py
        # see policy_evaluator.py
        ev = self.optimizer.local_evaluator
        p = ev.policy_map[DEFAULT_POLICY_ID]
        p.lr_schedule = ConstantSchedule(self.config["lr"])
        p.cur_lr.load(self.config["lr"], session=ev.tf_sess)
        return True

    @override(Trainer)
    def _try_recover(self):
        """Try to identify and blacklist any unhealthy workers.

        This method is called after an unexpected remote error is encountered
        from a worker. It issues check requests to all current workers and
        blacklists any that respond with error. If no healthy workers remain,
        an error is raised.

        MURFETD: some changes from Ray-0.7.0-dev2
        """
        if not self._has_policy_optimizer():
            raise NotImplementedError(
                "Recovery is not supported for this algorithm")

        logger.info("Health checking all workers...")
        checks = []
        for ev in self.optimizer.remote_evaluators:
            _, obj_id = ev.sample_with_count.remote()
            checks.append(obj_id)

        healthy_evaluators = []
        for i, obj_id in enumerate(checks):
            ev = self.optimizer.remote_evaluators[i]
            try:
                ray.get(obj_id)
                healthy_evaluators.append(ev)
                logger.info("Worker {} looks healthy".format(i + 1))
            except RayError:
                logger.exception("Blacklisting worker {}".format(i + 1))
                try:
                    ev.__ray_terminate__.remote()
                except Exception:
                    logger.exception("Error terminating unhealthy worker")

        if len(healthy_evaluators) < 1:
            raise RuntimeError(
                "Not enough healthy workers remain to continue.")

        # MURFETD (add additional new remote_evaluators)
        #num_new_evaluators = len(checks) - len(healthy_evaluators)
        #new_evaluators = self.make_remote_evaluators(
        #    self.env_creator, self._get_policy_graph(), num_new_evaluators)
        #healthy_evaluators.extend(new_evaluators)

        # MURFETD (keep our remote_evaluator list in sync with the
        # optimizer/aggregator)
        #self.remote_evaluators = healthy_evaluators

        self.optimizer.reset(healthy_evaluators)

class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Agent)
    def _init(self, config, env_creator):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in config["optimizer"]:
                config["optimizer"][k] = config[k]
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)

        if self.config["num_aggregation_workers"] > 0:
            # Create co-located aggregator actors first for placement pref
            aggregators = TreeAggregator.precreate_aggregators(
                self.config["num_aggregation_workers"])

        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators, config["optimizer"])
        if config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")

        if self.config["num_aggregation_workers"] > 0:
            # Assign the pre-created aggregators to the optimizer
            self.optimizer.aggregator.init(aggregators)

    @classmethod
    @override(Trainable)
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        Agent._validate_config(cf)
        return Resources(
            cpu=cf["num_cpus_for_driver"],
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
            cf["num_aggregation_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while (time.time() - start < self.config["min_iter_time_s"]
               or self.optimizer.num_steps_sampled == prev_steps):
            self.optimizer.step()
        result = self.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls

class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG

    @classmethod
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        return Resources(
            cpu=1,
            gpu=cf["gpu"] and 1 or 0,
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        if self.config["vtrace"]:
            policy_cls = VTracePolicyGraph
        else:
            policy_cls = A3CPolicyGraph
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"],
            {"num_cpus": 1})
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while time.time() - start < self.config["min_iter_time_s"]:
            self.optimizer.step()
        FilterManager.synchronize(self.local_evaluator.filters,
                                  self.remote_evaluators)
        result = self.optimizer.collect_metrics()
        result = result._replace(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps)
        return result

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for ev in self.remote_evaluators:
            ev.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(
            checkpoint_dir, "checkpoint-{}".format(self.iteration))
        agent_state = ray.get(
            [a.save.remote() for a in self.remote_evaluators])
        extra_data = {
            "remote_state": agent_state,
            "local_state": self.local_evaluator.save()
        }
        pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
        return checkpoint_path

    def _restore(self, checkpoint_path):
        extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
        ray.get([
            a.restore.remote(o)
            for a, o in zip(self.remote_evaluators,
                            extra_data["remote_state"])
        ])
        self.local_evaluator.restore(extra_data["local_state"])

def testSimple(self):
    local, remotes = self._make_evs()
    optimizer = AsyncSamplesOptimizer(local, remotes)
    self._wait_for(optimizer, 1000, 1000)

def testMultiGPU(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(workers, num_gpus=1, _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)

def testSimple(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(workers)
    self._wait_for(optimizer, 1000, 1000)