Example #1
 def test_train_external_multi_agent_cartpole_many_policies(self):
     n = 20
     single_env = gym.make("CartPole-v0")
     act_space = single_env.action_space
     obs_space = single_env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = RolloutWorker(
         env_creator=lambda _: MultiAgentCartPole({"num_agents": n}),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         rollout_fragment_length=100)
     optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #2
 def testTrainMultiCartpoleManyPolicies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = PolicyEvaluator(
         env_creator=lambda _: MultiCartpole(n),
         policy_graph=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     optimizer = SyncSamplesOptimizer(ev, [], {})
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #3
File: pg.py Project: zdpau/ray-1
class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _agent_name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph

    @override(Agent)
    def _init(self):
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        optimizer_config = dict(
            self.config["optimizer"],
            **{"train_batch_size": self.config["train_batch_size"]})
        self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                              self.remote_evaluators,
                                              optimizer_config)

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
Example #4
class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _agent_name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph

    @classmethod
    def default_resource_request(cls, config):
        cf = merge_dicts(cls._default_config, config)
        return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"])

    def _init(self):
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"],
            {})
        self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                              self.remote_evaluators,
                                              self.config["optimizer"])

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        self.optimizer.step()
        result = self.optimizer.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
Example #5
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                policy = default_policy
            else:
                policy = get_policy_class(config)
            if before_init:
                before_init(self)
            if make_workers:
                self.workers = make_workers(self, env_creator, policy, config)
            else:
                self.workers = self._make_workers(env_creator, policy, config,
                                                  self.config["num_workers"])
            self.train_pipeline = None
            self.optimizer = None

            if training_pipeline:
                self.train_pipeline = training_pipeline(self.workers, config)
            elif make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)
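
The `_init` shown above closes over names like `validate_config`, `get_policy_class`, `make_policy_optimizer`, and `training_pipeline`; in the pre-1.0 RLlib versions these examples target, those correspond to arguments of the `build_trainer` template that generates this method. Below is a minimal sketch, not taken from any example above, of wiring a custom optimizer chooser into that template; the names `my_choose_optimizer` and `MyPGTrainer` are hypothetical, and the import paths assume the pre-1.0 layout.

# Sketch only: assumes the pre-1.0 build_trainer API and module layout.
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg import DEFAULT_CONFIG
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.optimizers import SyncSamplesOptimizer


def my_choose_optimizer(workers, config):
    # Same fallback as the generated _init above: a plain synchronous optimizer.
    return SyncSamplesOptimizer(
        workers, train_batch_size=config["train_batch_size"])


MyPGTrainer = build_trainer(
    name="MyPG",
    default_config=DEFAULT_CONFIG,
    default_policy=PGTFPolicy,
    make_policy_optimizer=my_choose_optimizer)

With `make_policy_optimizer` supplied, the generated `_init` takes that branch instead of building the default `SyncSamplesOptimizer` from `config["optimizer"]`.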
Example #6
class PGAgent(Agent):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _agent_name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph

    @override(Agent)
    def _init(self):
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, self._policy_graph, self.config["num_workers"])
        optimizer_config = dict(
            self.config["optimizer"],
            **{"train_batch_size": self.config["train_batch_size"]})
        self.optimizer = SyncSamplesOptimizer(
            self.local_evaluator, self.remote_evaluators, optimizer_config)

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
Example #7
 def testTrainMultiCartpoleManyPolicies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = PolicyEvaluator(
         env_creator=lambda _: MultiCartpole(n),
         policy_graph=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     optimizer = SyncSamplesOptimizer(ev, [], {})
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #8
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                policy = default_policy
            else:
                policy = get_policy_class(config)
            if before_init:
                before_init(self)
            if make_workers:
                self.workers = make_workers(self, env_creator, policy, config)
            else:
                self.workers = self._make_workers(env_creator, policy, config,
                                                  self.config["num_workers"])
            self.train_pipeline = None
            self.optimizer = None

            if training_pipeline and (self.config["use_pipeline_impl"] or
                                      "RLLIB_USE_PIPELINE_IMPL" in os.environ):
                logger.warning("Using experimental pipeline based impl.")
                self.train_pipeline = training_pipeline(self.workers, config)
            elif make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)
Example #9
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                policy = default_policy
            else:
                policy = get_policy_class(config)
            if before_init:
                before_init(self)
            if make_workers:
                self.workers = make_workers(self, env_creator, policy, config)
            else:
                self.workers = self._make_workers(env_creator, policy, config,
                                                  self.config["num_workers"])
            if make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)

            if after_init:
                after_init(self)
Example #10
 def test_train_multi_cartpole_many_policies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     worker = RolloutWorker(
         env_creator=lambda _: MultiCartpole(n),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     workers = WorkerSet._from_existing(worker, [])
     optimizer = SyncSamplesOptimizer(workers)
     for i in range(100):
         optimizer.step()
         result = collect_metrics(worker)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #11
 def _init(self):
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"])
     self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                           self.remote_evaluators,
                                           self.config["optimizer"])
Example #12
 def _init(self, config, env_creator):
     self._validate_config(config)
     self.workers = self._make_workers(env_creator,
                                       self._policy,
                                       config,
                                       num_workers=config["num_workers"])
     self.optimizer = SyncSamplesOptimizer(
         self.workers, train_batch_size=config["train_batch_size"])
Example #13
File: pg.py Project: zdpau/ray-1
 def _init(self):
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"])
     optimizer_config = dict(
         self.config["optimizer"],
         **{"train_batch_size": self.config["train_batch_size"]})
     self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                           self.remote_evaluators,
                                           optimizer_config)
Example #14
 def _init(self, config, env_creator):
     if config["use_pytorch"]:
         from ray.rllib.agents.pg.torch_pg_policy_graph import \
             PGTorchPolicyGraph
         policy_cls = PGTorchPolicyGraph
     else:
         policy_cls = self._policy_graph
     self.local_evaluator = self.make_local_evaluator(
         env_creator, policy_cls)
     self.remote_evaluators = self.make_remote_evaluators(
         env_creator, policy_cls, config["num_workers"])
     optimizer_config = dict(
         config["optimizer"],
         **{"train_batch_size": config["train_batch_size"]})
     self.optimizer = SyncSamplesOptimizer(
         self.local_evaluator, self.remote_evaluators, **optimizer_config)
Example #15
class TRPOTrainer(Trainer):
    """Single agent trainer for TRPO."""

    _name = "TRPO"
    _default_config = DEFAULT_CONFIG
    _policy = TRPOTorchPolicy

    # pylint:disable=attribute-defined-outside-init

    @override(Trainer)
    def _init(self, config, env_creator):
        self._validate_config(config)
        self.workers = self._make_workers(env_creator,
                                          self._policy,
                                          config,
                                          num_workers=config["num_workers"])
        self.optimizer = SyncSamplesOptimizer(
            self.workers, train_batch_size=config["train_batch_size"])

    @override(Trainer)
    def _train(self):
        while not self._iteration_done():
            _ = self.optimizer.step()

        res = self.collect_metrics()
        timesteps = self.optimizer.num_steps_sampled - self.global_vars[
            "timestep"]
        res.update(timesteps_this_iter=timesteps, info=res.get("info", {}))
        return res
Example #16
 def _init(self):
     self.optimizer = SyncSamplesOptimizer.make(
         evaluator_cls=CommonPolicyEvaluator,
         evaluator_args={
             "env_creator": self.env_creator,
             "policy_graph": (self.config["multiagent"]["policy_graphs"]
                              or PGPolicyGraph),
             "policy_mapping_fn": self.config["multiagent"]["policy_mapping_fn"],
             "batch_steps": self.config["batch_size"],
             "batch_mode": "truncate_episodes",
             "model_config": self.config["model"],
             "env_config": self.config["env_config"],
             "policy_config": self.config,
             "num_envs": self.config["num_envs"],
         },
         num_workers=self.config["num_workers"],
         optimizer_config=self.config["optimizer"])
Example #17
File: ppo.py Project: xlnwel/ray
 def _init(self):
     waste_ratio = (self.config["sample_batch_size"] *
                    self.config["num_workers"] /
                    self.config["train_batch_size"])
     if waste_ratio > 1:
         msg = ("sample_batch_size * num_workers >> train_batch_size. "
                "This means that many steps will be discarded. Consider "
                "reducing sample_batch_size, or increase train_batch_size.")
         if waste_ratio > 1.5:
             raise ValueError(msg)
         else:
             print("Warning: " + msg)
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"], {
             "num_cpus": self.config["num_cpus_per_worker"],
             "num_gpus": self.config["num_gpus_per_worker"]
         })
     if self.config["simple_optimizer"]:
         self.optimizer = SyncSamplesOptimizer(
             self.local_evaluator, self.remote_evaluators, {
                 "num_sgd_iter": self.config["num_sgd_iter"],
                 "train_batch_size": self.config["train_batch_size"]
             })
     else:
         self.optimizer = LocalMultiGPUOptimizer(
             self.local_evaluator, self.remote_evaluators, {
                 "sgd_batch_size": self.config["sgd_minibatch_size"],
                 "num_sgd_iter": self.config["num_sgd_iter"],
                 "num_gpus": self.config["num_gpus"],
                 "train_batch_size": self.config["train_batch_size"],
                 "standardize_fields": ["advantages"],
             })
Example #18
 def _init(self):
     self._validate_config()
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"])
     if self.config["simple_optimizer"]:
         self.optimizer = SyncSamplesOptimizer(
             self.local_evaluator, self.remote_evaluators, {
                 "num_sgd_iter": self.config["num_sgd_iter"],
                 "train_batch_size": self.config["train_batch_size"],
             })
     else:
         self.optimizer = LocalMultiGPUOptimizer(
             self.local_evaluator, self.remote_evaluators, {
                 "sgd_batch_size": self.config["sgd_minibatch_size"],
                 "num_sgd_iter": self.config["num_sgd_iter"],
                 "num_gpus": self.config["num_gpus"],
                 "sample_batch_size": self.config["sample_batch_size"],
                 "num_envs_per_worker": self.config["num_envs_per_worker"],
                 "train_batch_size": self.config["train_batch_size"],
                 "standardize_fields": ["advantages"],
                 "straggler_mitigation":
                 (self.config["straggler_mitigation"]),
             })
Example #19
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)

            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}

            # Override default policy if `get_policy_class` is provided.
            if get_policy_class is not None:
                self._policy = get_policy_class(config)

            if before_init:
                before_init(self)
            use_exec_api = (execution_plan
                            and (self.config["use_exec_api"]
                                 or "RLLIB_EXEC_API" in os.environ))

            # Creating all workers (excluding evaluation workers).
            if make_workers and not use_exec_api:
                self.workers = make_workers(self, env_creator, self._policy,
                                            config)
            else:
                self.workers = self._make_workers(env_creator, self._policy,
                                                  config,
                                                  self.config["num_workers"])
            self.train_exec_impl = None
            self.optimizer = None
            self.execution_plan = execution_plan

            if use_exec_api:
                logger.warning(
                    "The experimental distributed execution API is enabled "
                    "for this algorithm. Disable this by setting "
                    "'use_exec_api': False.")
                self.train_exec_impl = execution_plan(self.workers, config)
            elif make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)
Example #20
File: a2c.py Project: zqxyz73/ray
def choose_policy_optimizer(workers, config):
    if config["microbatch_size"]:
        return MicrobatchOptimizer(workers,
                                   train_batch_size=config["train_batch_size"],
                                   microbatch_size=config["microbatch_size"])
    else:
        return SyncSamplesOptimizer(
            workers, train_batch_size=config["train_batch_size"])
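
A hedged usage sketch for the chooser above, assuming the pre-1.0 `A2CTrainer` and its `microbatch_size` config key: with `microbatch_size` left at its default, `choose_policy_optimizer` falls through to `SyncSamplesOptimizer`.

# Sketch only: assumes the pre-1.0 A2CTrainer and the config keys shown above.
import ray
from ray.rllib.agents.a3c import A2CTrainer

ray.init()
# No "microbatch_size" in the config, so the else-branch above builds a
# SyncSamplesOptimizer with train_batch_size=500.
trainer = A2CTrainer(
    env="CartPole-v0",
    config={"num_workers": 2, "train_batch_size": 500})
print(trainer.train()["episode_reward_mean"])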
Example #21
 def _init(self, config, env_creator):
     if validate_config:
         validate_config(config)
     if get_policy_class is None:
         policy = default_policy
     else:
         policy = get_policy_class(config)
     self.workers = self._make_workers(env_creator, policy, config,
                                       self.config["num_workers"])
     if make_policy_optimizer:
         self.optimizer = make_policy_optimizer(self.workers, config)
     else:
         optimizer_config = dict(
             config["optimizer"],
             **{"train_batch_size": config["train_batch_size"]})
         self.optimizer = SyncSamplesOptimizer(self.workers,
                                               **optimizer_config)
Example #22
 def _init(self):
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, self._policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, self._policy_graph, self.config["num_workers"])
     optimizer_config = dict(
         self.config["optimizer"],
         **{"train_batch_size": self.config["train_batch_size"]})
     self.optimizer = SyncSamplesOptimizer(
         self.local_evaluator, self.remote_evaluators, optimizer_config)
Example #23
        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)

            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                self._policy = default_policy
            else:
                self._policy = get_policy_class(config)
            if before_init:
                before_init(self)
            use_exec_api = (execution_plan
                            and (self.config["use_exec_api"]
                                 or "RLLIB_EXEC_API" in os.environ))

            # Creating all workers (excluding evaluation workers).
            if make_workers and not use_exec_api:
                self.workers = make_workers(self, env_creator, self._policy,
                                            config)
            else:
                self.workers = self._make_workers(env_creator, self._policy,
                                                  config,
                                                  self.config["num_workers"])
            self.train_exec_impl = None
            self.optimizer = None
            self.execution_plan = execution_plan

            if use_exec_api:
                self.train_exec_impl = execution_plan(self.workers, config)
            elif make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)
Example #24
    def _init(self, config, env_creator):
        # Random seed
        seed = config['seed']
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env_config = config['env_config']
        self.num_sgd_iter = config['num_sgd_iter']
        self.num_workers = config['num_workers']
        self.sgd_minibatch_size = config['sgd_minibatch_size']
        self.train_batch_size = config['train_batch_size']

        # Set up workers
        policy_cls = policy_options[config['policy']]
        self.workers = self._make_workers(env_creator, policy_cls, config,
                                          self.num_workers)
        self.optimizer = SyncSamplesOptimizer(
            self.workers,
            num_sgd_iter=self.num_sgd_iter,
            train_batch_size=self.train_batch_size,
            sgd_minibatch_size=self.sgd_minibatch_size)
Example #25
def choose_policy_optimizer(workers, config):
    if config["simple_optimizer"]:
        return SyncSamplesOptimizer(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"])
    else:
        return SyncBatchesReplayOptimizer(
            workers,
            num_gradient_descents=config["num_sgd_iter"],
            learning_starts=config["learning_starts"],
            train_batch_size=config["train_batch_size"],
            buffer_size=config["buffer_size"])
Example #26
 def _init(self, config, env_creator):
     if validate_config:
         validate_config(config)
     if get_policy_class is None:
         policy_graph = default_policy
     else:
         policy_graph = get_policy_class(config)
     self.local_evaluator = self.make_local_evaluator(
         env_creator, policy_graph)
     self.remote_evaluators = self.make_remote_evaluators(
         env_creator, policy_graph, config["num_workers"])
     if make_policy_optimizer:
         self.optimizer = make_policy_optimizer(self.local_evaluator,
                                                self.remote_evaluators,
                                                config)
     else:
         optimizer_config = dict(
             config["optimizer"],
             **{"train_batch_size": config["train_batch_size"]})
         self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                               self.remote_evaluators,
                                               **optimizer_config)
Example #27
File: pg.py Project: wanghuimu/ray
class PGTrainer(Trainer):
    """Simple policy gradient agent.

    This is an example agent to show how to implement algorithms in RLlib.
    In most cases, you will probably want to use the PPO agent instead.
    """

    _name = "PG"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PGPolicyGraph

    @override(Trainer)
    def _init(self, config, env_creator):
        if config["use_pytorch"]:
            from ray.rllib.agents.pg.torch_pg_policy_graph import \
                PGTorchPolicyGraph
            policy_cls = PGTorchPolicyGraph
        else:
            policy_cls = self._policy_graph
        self.local_evaluator = self.make_local_evaluator(
            env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        optimizer_config = dict(
            config["optimizer"],
            **{"train_batch_size": config["train_batch_size"]})
        self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                              self.remote_evaluators,
                                              optimizer_config)

    @override(Trainer)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        self.optimizer.step()
        result = self.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
Example #28
class PPOTrainer(Trainer):

    _name = "PPO"
    _default_config = DEFAULT_CONFIG

    def _init(self, config, env_creator):
        # Random seed
        seed = config['seed']
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env_config = config['env_config']
        self.num_sgd_iter = config['num_sgd_iter']
        self.num_workers = config['num_workers']
        self.sgd_minibatch_size = config['sgd_minibatch_size']
        self.train_batch_size = config['train_batch_size']

        # Set up workers
        policy_cls = policy_options[config['policy']]
        self.workers = self._make_workers(env_creator, policy_cls, config,
                                          self.num_workers)
        self.optimizer = SyncSamplesOptimizer(
            self.workers,
            num_sgd_iter=self.num_sgd_iter,
            train_batch_size=self.train_batch_size,
            sgd_minibatch_size=self.sgd_minibatch_size)

    def _train(self):
        self.optimizer.step()

        res = dict(timesteps_this_iter=self.optimizer.num_steps_sampled,
                   info=self.optimizer.stats())
        return res

    def evaluate(self):
        return self.workers.local_worker().sample()
Example #29
def choose_policy_optimizer(workers, config):
    if config["simple_optimizer"]:
        return SyncSamplesOptimizer(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"])

    return LocalMultiGPUOptimizer(
        workers,
        sgd_batch_size=config["sgd_minibatch_size"],
        num_sgd_iter=config["num_sgd_iter"],
        num_gpus=config["num_gpus"],
        sample_batch_size=config["sample_batch_size"],
        num_envs_per_worker=config["num_envs_per_worker"],
        train_batch_size=config["train_batch_size"],
        standardize_fields=["advantages"],
        shuffle_sequences=config["shuffle_sequences"])
Example #30
def choose_policy_optimizer(workers, config):
    if config["distributed_data_parallel_optimizer"]:
        if not config["use_pytorch"]:
            raise ValueError(
                "Distributed data parallel is only supported for PyTorch")
        if config["num_gpus"]:
            raise ValueError(
                "When using distributed data parallel, you should set "
                "num_gpus=0 since all optimization "
                "is happening on workers. Enable GPUs for workers by setting "
                "num_gpus_per_worker=1.")
        if config["batch_mode"] != "truncate_episodes":
            raise ValueError(
                "Distributed data parallel requires truncate_episodes "
                "batch mode.")
        if config["sample_batch_size"] != config["train_batch_size"]:
            raise ValueError(
                "Distributed data parallel requires sample_batch_size to be "
                "equal to train_batch_size. Each worker will sample and learn "
                "on train_batch_size samples per iteration.")

        return TorchDistributedDataParallelOptimizer(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"],
            sgd_minibatch_size=config["sgd_minibatch_size"],
            standardize_fields=["advantages"])

    if config["simple_optimizer"]:
        return SyncSamplesOptimizer(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"],
            sgd_minibatch_size=config["sgd_minibatch_size"],
            standardize_fields=["advantages"])

    return LocalMultiGPUOptimizer(
        workers,
        sgd_batch_size=config["sgd_minibatch_size"],
        num_sgd_iter=config["num_sgd_iter"],
        num_gpus=config["num_gpus"],
        sample_batch_size=config["sample_batch_size"],
        num_envs_per_worker=config["num_envs_per_worker"],
        train_batch_size=config["train_batch_size"],
        standardize_fields=["advantages"],
        shuffle_sequences=config["shuffle_sequences"])
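
For the PPO-style chooser above, a hedged sketch of selecting the `SyncSamplesOptimizer` branch, assuming the pre-1.0 `PPOTrainer` and its `simple_optimizer` config key.

# Sketch only: assumes the pre-1.0 PPOTrainer and the config keys shown above.
import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
trainer = PPOTrainer(
    env="CartPole-v0",
    config={
        "num_workers": 2,
        # True makes choose_policy_optimizer() return SyncSamplesOptimizer
        # instead of LocalMultiGPUOptimizer.
        "simple_optimizer": True,
        "num_sgd_iter": 10,
        "train_batch_size": 4000,
    })
print(trainer.train()["episode_reward_mean"])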
Example #31
    class trainer_cls(Trainer):
        _name = name
        _default_config = default_config or COMMON_CONFIG
        _policy = default_policy

        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_policy_class is None:
                policy = default_policy
            else:
                policy = get_policy_class(config)
            self.workers = self._make_workers(env_creator, policy, config,
                                              self.config["num_workers"])
            if make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)

        @override(Trainer)
        def _train(self):
            if before_train_step:
                before_train_step(self)
            prev_steps = self.optimizer.num_steps_sampled

            start = time.time()
            while True:
                fetches = self.optimizer.step()
                if after_optimizer_step:
                    after_optimizer_step(self, fetches)
                if time.time() - start > self.config["min_iter_time_s"]:
                    break

            res = self.collect_metrics()
            res.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                       prev_steps,
                       info=res.get("info", {}))
            if after_train_result:
                after_train_result(self, res)
            return res
Example #32
File: ppo.py Project: locussam/ray
def make_optimizer(local_evaluator, remote_evaluators, config):
    if config["simple_optimizer"]:
        return SyncSamplesOptimizer(
            local_evaluator,
            remote_evaluators,
            num_sgd_iter=config["num_sgd_iter"],
            train_batch_size=config["train_batch_size"])

    return LocalMultiGPUOptimizer(
        local_evaluator,
        remote_evaluators,
        sgd_batch_size=config["sgd_minibatch_size"],
        num_sgd_iter=config["num_sgd_iter"],
        num_gpus=config["num_gpus"],
        sample_batch_size=config["sample_batch_size"],
        num_envs_per_worker=config["num_envs_per_worker"],
        train_batch_size=config["train_batch_size"],
        standardize_fields=["advantages"],
        straggler_mitigation=config["straggler_mitigation"])
Example #33
    class trainer_cls(Trainer):
        _name = name
        _default_config = default_config or Trainer.COMMON_CONFIG
        _policy_graph = default_policy

        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)
            if get_policy_class is None:
                policy_graph = default_policy
            else:
                policy_graph = get_policy_class(config)
            self.local_evaluator = self.make_local_evaluator(
                env_creator, policy_graph)
            self.remote_evaluators = self.make_remote_evaluators(
                env_creator, policy_graph, config["num_workers"])
            if make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.local_evaluator,
                                                       self.remote_evaluators,
                                                       config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
                                                      self.remote_evaluators,
                                                      **optimizer_config)

        @override(Trainer)
        def _train(self):
            if before_train_step:
                before_train_step(self)
            prev_steps = self.optimizer.num_steps_sampled
            fetches = self.optimizer.step()
            if after_optimizer_step:
                after_optimizer_step(self, fetches)
            res = self.collect_metrics()
            res.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                       prev_steps,
                       info=res.get("info", {}))
            if after_train_result:
                after_train_result(self, res)
            return res