    def __init__(self, render, sim_config, exp_config, checkpoint_path):

        import ray
        from ray.tune import run_experiments
        from ray.tune.registry import register_env
        from ray import tune
        import yaml
        from SingleLaneIDM.SimulatorCode.main_env import Wrapper

        from ray.rllib.agents.ppo.ppo import PPOAgent
        import os
        import pickle

        # Render flag: 1 enables on-screen rendering, anything else disables it.
        if render == 1:
            sim_config["config"]["render"] = True
        else:
            sim_config["config"]["render"] = False

        # Evaluate without acceleration noise.
        sim_config["config"]["acc-noise"] = False

        # Reuse the training experiment's config, but force a single CPU-only
        # worker with one environment for local evaluation.
        exp_name = list(exp_config.keys())[0]
        exp_config[exp_name]["config"]["num_gpus"] = 0
        exp_config[exp_name]["config"]["num_workers"] = 1
        exp_config[exp_name]["config"]["num_envs_per_worker"] = 1

        # Register the simulator under a fixed env id so RLlib can construct it.
        env_creator_name = "tsim-v0"
        register_env(env_creator_name, lambda config: Wrapper(sim_config))

        # Build the PPO agent and load the trained weights from the checkpoint.
        ray.init()
        self.agent = PPOAgent(env="tsim-v0",
                              config=exp_config[exp_name]["config"])
        self.agent.restore(checkpoint_path)
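
# A minimal sketch of the nested config dictionaries the constructors in this
# file expect. The structure below is an assumption reconstructed only from the
# keys the code reads; the variable names, experiment name, and values are
# hypothetical placeholders, not taken from the original project.
example_sim_config = {
    "config": {
        "render": False,      # overwritten from the `render` argument
        "acc-noise": False,   # acceleration noise, forced off for evaluation
    }
}

example_exp_config = {
    "example-experiment": {   # hypothetical experiment name
        "config": {
            "num_gpus": 0,
            "num_workers": 1,
            "num_envs_per_worker": 1,
        }
    }
}
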
class PPORLControllerWithActionProbs():
    def __init__(self, render, sim_config, exp_config, checkpoint_path):

        import ray
        from ray.tune import run_experiments
        from ray.tune.registry import register_env
        from ray import tune
        import yaml
        from SingleLaneIDM.SimulatorCode.main_env import Wrapper

        from ray.rllib.agents.ppo.ppo import PPOAgent
        import os
        import pickle

        if render == 1:
            sim_config["config"]["render"] = True
        else:
            sim_config["config"]["render"] = False

        sim_config["config"]["acc-noise"] = False

        exp_name = list(exp_config.keys())[0]
        exp_config[exp_name]["config"]["num_gpus"] = 0
        exp_config[exp_name]["config"]["num_workers"] = 1
        exp_config[exp_name]["config"]["num_envs_per_worker"] = 1

        env_creator_name = "tsim-v0"
        register_env(env_creator_name, lambda config: Wrapper(sim_config))

        ray.init()
        self.agent = PPOAgent(env="tsim-v0",
                              config=exp_config[exp_name]["config"])
        self.agent.restore(checkpoint_path)
        #self.agent.optimizer.foreach_evaluator(lambda ev: ev.for_policy(lambda pi:pi.set_epsilon(0.0), policy_id="default"))

    def getAction(self, observation):

        from scipy.special import softmax

        state = []
        # Run the observation through the same preprocessor and observation
        # filter the policy used during training (without updating the filter).
        preprocessed = self.agent.local_evaluator.preprocessors[
            "default"].transform(observation)
        filtered_obs = self.agent.local_evaluator.filters['default'](
            preprocessed, update=False)

        # Query the policy directly so the raw logits are returned alongside
        # the sampled action.
        result = self.agent.get_policy('default').compute_single_action(
            filtered_obs,
            state,
            None,
            None,
            None,
            clip_actions=self.agent.config["clip_actions"])

        action = result[0]
        probs = softmax(result[2]["logits"])  # logits -> action probabilities

        return action, probs
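
# A small, self-contained illustration (not from the original project) of the
# logits-to-probabilities step used in getAction above: softmax exponentiates
# the policy's raw logits and normalizes them so they sum to one.
import numpy as np
from scipy.special import softmax

logits = np.array([2.0, 0.5, -1.0])    # hypothetical logits for three actions
probs = softmax(logits)                # roughly [0.79, 0.18, 0.04]
assert np.isclose(probs.sum(), 1.0)    # a valid probability distribution
greedy_action = int(np.argmax(probs))  # index of the most likely action
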
class PPORLController():
    def __init__(self, render, sim_config, exp_config, checkpoint_path):

        import ray
        from ray.tune import run_experiments
        from ray.tune.registry import register_env
        from ray import tune
        import yaml
        from SingleLaneIDM.SimulatorCode.main_env import Wrapper

        from ray.rllib.agents.ppo.ppo import PPOAgent
        import os
        import pickle

        if render == 1:
            sim_config["config"]["render"] = True
        else:
            sim_config["config"]["render"] = False

        sim_config["config"]["acc-noise"] = False

        exp_name = list(exp_config.keys())[0]
        exp_config[exp_name]["config"]["num_gpus"] = 0
        exp_config[exp_name]["config"]["num_workers"] = 1
        exp_config[exp_name]["config"]["num_envs_per_worker"] = 1

        env_creator_name = "tsim-v0"
        register_env(env_creator_name, lambda config: Wrapper(sim_config))

        ray.init()
        self.agent = PPOAgent(env="tsim-v0",
                              config=exp_config[exp_name]["config"])
        self.agent.restore(checkpoint_path)
        #self.agent.optimizer.foreach_evaluator(lambda ev: ev.for_policy(lambda pi:pi.set_epsilon(0.0), policy_id="default"))

    def getAction(self, state):

        action = self.agent.compute_action(state)
        #print(action)
        return action
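
# A hypothetical rollout sketch for PPORLController. Everything below is an
# assumption for illustration: the checkpoint path is a placeholder, the
# example_* dicts are the sketch configs from above, and the Wrapper env is
# assumed to expose a gym-style reset()/step() interface, which this file
# does not show.
from SingleLaneIDM.SimulatorCode.main_env import Wrapper

controller = PPORLController(
    render=0,
    sim_config=example_sim_config,
    exp_config=example_exp_config,
    checkpoint_path="/path/to/checkpoint")    # hypothetical checkpoint path

env = Wrapper(example_sim_config)
state = env.reset()
done = False
while not done:
    action = controller.getAction(state)      # wraps agent.compute_action
    state, reward, done, info = env.step(action)
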
Example n. 4
#     def optimizer(self):
#         # return tf.train.GradientDescentOptimizer(0.001)
#         return tf.train.AdamOptimizer()

# def _train(self):
#     prev_steps = self.optimizer.num_steps_sampled
#     # start = time.time()
#     # while time.time() - start < self.config["min_iter_time_s"]:
#     self.optimizer.step()
#     result = self.optimizer.collect_metrics()
#     result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
#                   prev_steps)
#     return result

# PGAgent._policy_graph = NewPolicyGraph
# A2CAgent._policy_graph = NewPolicyGraph
# A2CAgent._train = _train

# agent = PGAgent(config=config, env="PointEnv")
# agent = A2CAgent(config=config, env="PointEnv")
agent = PPOAgent(config=config, env="PointEnv")
# res = agent.train()
for i in range(1000):
    print(i)
    res = agent.train()
    print(
        pretty_print({
            key: value
            for key, value in res.items() if key.startswith("episode")
        }))
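
# A possible extension of the loop above (an assumption, not part of the
# original script): periodically checkpoint the trainer with agent.save(),
# so the weights can later be reloaded with agent.restore(checkpoint_path)
# the way the controller classes above do.
for i in range(1000):
    res = agent.train()
    if (i + 1) % 100 == 0:
        checkpoint_path = agent.save()  # returns the path of the new checkpoint
        print("saved checkpoint:", checkpoint_path)
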
        "dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}),
    }

    def policy_mapping_fn(agent_id):
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
            return "dqn_policy"

    ppo_trainer = PPOAgent(
        env="multi_cartpole",
        config={
            "multiagent": {
                "policy_graphs": policy_graphs,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["ppo_policy"],
            },
            "simple_optimizer": True,
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "NoFilter",
        })

    dqn_trainer = DQNAgent(env="multi_cartpole",
                           config={
                               "multiagent": {
                                   "policy_graphs": policy_graphs,
                                   "policy_mapping_fn": policy_mapping_fn,
                                   "policies_to_train": ["dqn_policy"],
                               },
                               "gamma": 0.95,
        "ppo_policy": (PPOPolicyGraph, obs_space, act_space, {}),
        "dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}),
    }

    def policy_mapping_fn(agent_id):
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
            return "dqn_policy"

    ppo_trainer = PPOAgent(
        env="multi_cartpole",
        config={
            "multiagent": {
                "policy_graphs": policy_graphs,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["ppo_policy"],
            },
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "NoFilter",
        })

    dqn_trainer = DQNAgent(
        env="multi_cartpole",
        config={
            "multiagent": {
                "policy_graphs": policy_graphs,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
            "gamma": 0.95,