class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    def _init(self):
        # Propagate shared top-level settings into the optimizer config,
        # without overriding anything the user set there explicitly.
        for key in OPTIMIZER_SHARED_CONFIGS:
            self.config["optimizer"].setdefault(key, self.config[key])
        # V-trace may be disabled, falling back to plain A3C gradients.
        policy_cls = (self._policy_graph
                      if self.config["vtrace"] else A3CPolicyGraph)
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])

    def _train(self):
        """Run optimizer steps for at least `min_iter_time_s` seconds.

        Returns the collected metrics, augmented with the number of
        environment steps sampled during this iteration.
        """
        sampled_before = self.optimizer.num_steps_sampled
        deadline = time.time() + self.config["min_iter_time_s"]
        self.optimizer.step()
        while time.time() < deadline:
            self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      sampled_before)
        return result
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @classmethod
    def default_resource_request(cls, config):
        """Return the Resources this agent needs from the scheduler.

        One driver CPU plus a (possibly fractional) driver GPU, and the
        per-worker CPUs/GPUs for the remote evaluators.
        """
        cf = dict(cls._default_config, **config)
        return Resources(
            cpu=1,
            # Conditional expression instead of the fragile `x and y or 0`
            # idiom; numerically identical (falls back to 0 when no GPUs).
            gpu=cf["num_gpus"] * cf["gpu_fraction"] if cf["num_gpus"] else 0,
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        # Propagate shared top-level settings into the optimizer config,
        # without overriding anything the user set there explicitly.
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        # V-trace may be disabled, falling back to plain A3C gradients.
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        # Each remote evaluator is pinned to a single CPU.
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"],
            {"num_cpus": 1})
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])

    def _train(self):
        """Run optimizer steps for at least `min_iter_time_s` seconds.

        Returns the collected metrics, augmented with the number of
        environment steps sampled during this iteration.
        """
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while time.time() - start < self.config["min_iter_time_s"]:
            self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Agent)
    def _init(self, config, env_creator):
        # Propagate shared top-level settings into the optimizer config,
        # without overriding anything the user set there explicitly.
        for key in OPTIMIZER_SHARED_CONFIGS:
            config["optimizer"].setdefault(key, config[key])
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators, config["optimizer"])
        # NOTE(review): a Warning subclass raised as a hard error; callers
        # are not expected to catch it, so the type is preserved as-is.
        if config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")

    @override(Agent)
    def _train(self):
        """Step the optimizer until the minimum iteration time has elapsed
        AND at least one new sample batch has been collected."""
        sampled_before = self.optimizer.num_steps_sampled
        deadline = time.time() + self.config["min_iter_time_s"]
        self.optimizer.step()
        while (time.time() < deadline
               or self.optimizer.num_steps_sampled == sampled_before):
            self.optimizer.step()
        result = self.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      sampled_before)
        return result

    def _get_policy_graph(self):
        # V-trace may be disabled, falling back to plain A3C gradients.
        if self.config["vtrace"]:
            return self._policy_graph
        return A3CPolicyGraph
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Agent)
    def _init(self):
        # Propagate shared top-level settings into the optimizer config,
        # without overriding anything the user set there explicitly.
        for key in OPTIMIZER_SHARED_CONFIGS:
            self.config["optimizer"].setdefault(key, self.config[key])
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])

    @override(Agent)
    def _train(self):
        """Step the optimizer for at least `min_iter_time_s` seconds and
        report metrics plus the number of steps sampled this iteration."""
        sampled_before = self.optimizer.num_steps_sampled
        deadline = time.time() + self.config["min_iter_time_s"]
        self.optimizer.step()
        while time.time() < deadline:
            self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      sampled_before)
        return result

    def _get_policy_graph(self):
        # V-trace may be disabled, falling back to plain A3C gradients.
        if self.config["vtrace"]:
            return self._policy_graph
        return A3CPolicyGraph
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Agent)
    def _init(self, config, env_creator):
        # Propagate shared top-level settings into the optimizer config,
        # without overriding anything the user set there explicitly.
        for key in OPTIMIZER_SHARED_CONFIGS:
            config["optimizer"].setdefault(key, config[key])
        policy_cls = self._get_policy_graph()
        # NOTE(review): the local evaluator reads `self.env_creator` while
        # the remote ones use the `env_creator` argument — presumably the
        # same object; kept as-is to preserve behavior exactly.
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        if self.config["num_aggregation_workers"] > 0:
            # Create co-located aggregator actors first for placement pref
            aggregators = TreeAggregator.precreate_aggregators(
                self.config["num_aggregation_workers"])
        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators, config["optimizer"])
        # A Warning subclass deliberately raised as a hard error.
        if config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")
        if self.config["num_aggregation_workers"] > 0:
            # Assign the pre-created aggregators to the optimizer
            self.optimizer.aggregator.init(aggregators)

    @classmethod
    @override(Trainable)
    def default_resource_request(cls, config):
        """Driver resources plus per-worker and aggregator CPU/GPU needs."""
        cf = dict(cls._default_config, **config)
        Agent._validate_config(cf)
        return Resources(
            cpu=cf["num_cpus_for_driver"],
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
            cf["num_aggregation_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    @override(Agent)
    def _train(self):
        """Step the optimizer until the minimum iteration time has elapsed
        AND at least one new sample batch has been collected."""
        sampled_before = self.optimizer.num_steps_sampled
        deadline = time.time() + self.config["min_iter_time_s"]
        self.optimizer.step()
        while (time.time() < deadline
               or self.optimizer.num_steps_sampled == sampled_before):
            self.optimizer.step()
        result = self.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      sampled_before)
        return result

    def _get_policy_graph(self):
        # V-trace may be disabled, falling back to plain A3C gradients.
        return (self._policy_graph
                if self.config["vtrace"] else A3CPolicyGraph)
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG

    @classmethod
    def default_resource_request(cls, config):
        """Return the Resources this agent needs from the scheduler.

        One driver CPU (plus one GPU if enabled) and the per-worker
        CPUs/GPUs for the remote evaluators.
        """
        cf = dict(cls._default_config, **config)
        return Resources(
            cpu=1,
            # Conditional expression instead of the fragile `x and 1 or 0`
            # idiom; result is identical.
            gpu=1 if cf["gpu"] else 0,
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        # Propagate shared top-level settings into the optimizer config,
        # without overriding anything the user set there explicitly.
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        # V-trace may be disabled, falling back to plain A3C gradients.
        if self.config["vtrace"]:
            policy_cls = VTracePolicyGraph
        else:
            policy_cls = A3CPolicyGraph
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        # Each remote evaluator is pinned to a single CPU.
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"],
            {"num_cpus": 1})
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])

    def _train(self):
        """Step the optimizer for at least `min_iter_time_s` seconds, sync
        observation filters to workers, and report metrics."""
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while time.time() - start < self.config["min_iter_time_s"]:
            self.optimizer.step()
        FilterManager.synchronize(self.local_evaluator.filters,
                                  self.remote_evaluators)
        # Metrics object here is a namedtuple-style record (hence _replace).
        result = self.optimizer.collect_metrics()
        result = result._replace(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps)
        return result

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for ev in self.remote_evaluators:
            ev.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
        """Checkpoint local and remote evaluator state next to the returned
        checkpoint path (in a sibling ".extra_data" pickle file)."""
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "checkpoint-{}".format(self.iteration))
        agent_state = ray.get(
            [a.save.remote() for a in self.remote_evaluators])
        extra_data = {
            "remote_state": agent_state,
            "local_state": self.local_evaluator.save()
        }
        # `with` ensures the file is flushed and closed even if pickling
        # fails (the original leaked the open file handle).
        with open(checkpoint_path + ".extra_data", "wb") as f:
            pickle.dump(extra_data, f)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        """Restore local and remote evaluator state saved by `_save`."""
        # `with` closes the file promptly (the original leaked the handle).
        with open(checkpoint_path + ".extra_data", "rb") as f:
            extra_data = pickle.load(f)
        ray.get([
            a.restore.remote(o)
            for a, o in zip(self.remote_evaluators,
                            extra_data["remote_state"])
        ])
        self.local_evaluator.restore(extra_data["local_state"])
class ImpalaTrainer(Trainer):
    """IMPALA implementation using DeepMind's V-trace."""

    _name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Trainer)
    def _init(self, config, env_creator):
        # Propagate shared top-level settings into the optimizer config,
        # without overriding anything the user set there explicitly.
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in config["optimizer"]:
                config["optimizer"][k] = config[k]
        policy_cls = self._get_policy_graph()
        # NOTE(review): the local evaluator reads `self.env_creator` while
        # the remote ones use the `env_creator` argument — presumably the
        # same object set by Trainer._setup; confirm before unifying.
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        if self.config["num_aggregation_workers"] > 0:
            # Create co-located aggregator actors first for placement pref
            aggregators = TreeAggregator.precreate_aggregators(
                self.config["num_aggregation_workers"])
        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators,
            **config["optimizer"])
        # A Warning subclass deliberately raised as a hard error to force
        # migration off negative entropy_coeff values.
        if config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")
        if self.config["num_aggregation_workers"] > 0:
            # Assign the pre-created aggregators to the optimizer
            self.optimizer.aggregator.init(aggregators)

    @classmethod
    @override(Trainable)
    def default_resource_request(cls, config):
        """Driver resources plus per-worker and aggregator CPU/GPU needs."""
        cf = dict(cls._default_config, **config)
        Trainer._validate_config(cf)
        return Resources(
            cpu=cf["num_cpus_for_driver"],
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
            cf["num_aggregation_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    @override(Trainer)
    def _train(self):
        """Step the optimizer until the minimum iteration time has elapsed
        AND at least one new sample batch has been collected."""
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while (time.time() - start < self.config["min_iter_time_s"]
               or self.optimizer.num_steps_sampled == prev_steps):
            self.optimizer.step()
        result = self.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        # V-trace may be disabled, falling back to plain A3C gradients.
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls

    # MURFETD
    def reset_config(self, new_config):
        """Rebuild self.config from DEFAULT_CONFIG overlaid with new_config
        and push the new learning rate into the live local policy graph.

        Returns True so Tune treats the in-place reset as successful.
        """
        config = copy.deepcopy(DEFAULT_CONFIG)
        config.update(new_config)
        self.config = config
        # see LearningRateSchedule.__init__(self, self.config["lr"],
        # self.config["lr_schedule"]) in vtrace_policy_graph.py
        # see policy_evaluator.py
        ev = self.optimizer.local_evaluator
        p = ev.policy_map[DEFAULT_POLICY_ID]
        p.lr_schedule = ConstantSchedule(self.config["lr"])
        p.cur_lr.load(self.config["lr"], session=ev.tf_sess)
        return True

    @override(Trainer)
    def _try_recover(self):
        """Try to identify and blacklist any unhealthy workers.

        This method is called after an unexpected remote error is
        encountered from a worker. It issues check requests to all current
        workers and blacklists any that respond with error. If no healthy
        workers remain, an error is raised.

        MURFETD: some changes from Ray-0.7.0-dev2
        """
        if not self._has_policy_optimizer():
            raise NotImplementedError(
                "Recovery is not supported for this algorithm")
        logger.info("Health checking all workers...")
        # A successful sample round-trip is the health probe.
        checks = []
        for ev in self.optimizer.remote_evaluators:
            _, obj_id = ev.sample_with_count.remote()
            checks.append(obj_id)
        healthy_evaluators = []
        for i, obj_id in enumerate(checks):
            ev = self.optimizer.remote_evaluators[i]
            try:
                ray.get(obj_id)
                healthy_evaluators.append(ev)
                logger.info("Worker {} looks healthy".format(i + 1))
            except RayError:
                logger.exception("Blacklisting worker {}".format(i + 1))
                try:
                    ev.__ray_terminate__.remote()
                except Exception:
                    logger.exception("Error terminating unhealthy worker")
        if len(healthy_evaluators) < 1:
            raise RuntimeError(
                "Not enough healthy workers remain to continue.")
        # NOTE(MURFETD): spawning replacement evaluators for the
        # blacklisted ones (and syncing self.remote_evaluators with the
        # optimizer/aggregator) is deliberately disabled; the optimizer is
        # simply reset with the surviving healthy subset.
        self.optimizer.reset(healthy_evaluators)