Example #1
def _register_if_needed(env_object):
    if isinstance(env_object, six.string_types):
        return env_object
    elif isinstance(env_object, type):
        name = env_object.__name__
        register_env(name, lambda config: env_object(config))
        return name
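
A minimal usage sketch for the helper above (hypothetical MyEnv class, not part of the original snippet): passing a class registers it under its own name via register_env, while a string is assumed to already be a registered env id and is returned unchanged.

# Hypothetical illustration only: MyEnv stands in for any env class whose
# constructor accepts an env_config dict.
class MyEnv:
    def __init__(self, env_config):
        self.size = env_config.get("size", 10)

env_id = _register_if_needed(MyEnv)           # registers "MyEnv" via register_env
same_id = _register_if_needed("CartPole-v0")  # strings pass through unchanged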
def check_support(alg, config, stats, check_bounds=False):
    for a_name, action_space in ACTION_SPACES_TO_TEST.items():
        for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items():
            print("=== Testing", alg, action_space, obs_space, "===")
            stub_env = make_stub_env(action_space, obs_space, check_bounds)
            register_env("stub_env", lambda c: stub_env())
            stat = "ok"
            a = None
            try:
                a = get_agent_class(alg)(config=config, env="stub_env")
                a.train()
            except UnsupportedSpaceException:
                stat = "unsupported"
            except Exception as e:
                stat = "ERROR"
                print(e)
                print(traceback.format_exc())
            finally:
                if a:
                    try:
                        a.stop()
                    except Exception as e:
                        print("Ignoring error stopping agent", e)
                        pass
            print(stat)
            print()
            stats[alg, a_name, o_name] = stat
    def testPyTorchModel(self):
        ModelCatalog.register_custom_model("composite", TorchSpyModel)
        register_env("nested", lambda _: NestedDictEnv())
        a2c = A2CAgent(
            env="nested",
            config={
                "num_workers": 0,
                "use_pytorch": True,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite",
                },
            })

        a2c.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "torch_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
    def doTestNestedTuple(self, make_env):
        ModelCatalog.register_custom_model("composite2", TupleSpyModel)
        register_env("nested2", make_env)
        pg = PGAgent(
            env="nested2",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite2",
                },
            })
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
    def doTestNestedDict(self, make_env, test_lstm=False):
        ModelCatalog.register_custom_model("composite", DictSpyModel)
        register_env("nested", make_env)
        pg = PGAgent(
            env="nested",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite",
                    "use_lstm": test_lstm,
                },
            })
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
Example #6
    def testMinibatchSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": False,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
        self.assertEqual(batch0["sequences"].tolist(), [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
        ])
        self.assertEqual(batch1["sequences"].tolist(), [
            [[8], [9], [10], [11]],
            [[12], [13], [14], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
        self.assertEqual(batch2["sequences"].tolist(), [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
        ])
        self.assertEqual(batch3["sequences"].tolist(), [
            [[13], [14], [0], [0]],
            [[0], [1], [2], [3]],
        ])
    def testTrainCartpole(self):
        register_env("test", lambda _: SimpleServing(gym.make("CartPole-v0")))
        pg = PGAgent(env="test", config={"num_workers": 0})
        for i in range(100):
            result = pg.train()
            print("Iteration {}, reward {}, timesteps {}".format(
                i, result["episode_reward_mean"], result["timesteps_total"]))
            if result["episode_reward_mean"] >= 100:
                return
        raise Exception("failed to improve reward")
Example #8
    def testSimpleOptimizerSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 10,
                "train_batch_size": 10,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": True,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        self.assertEqual(
            batch0["sequences"].tolist(),
            [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]])
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
        self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
        self.assertTrue(
            np.allclose(batch0["state_in"][0].tolist()[1:],
                        batch0["state_out"][0].tolist()[:-1]))
        self.assertTrue(
            np.allclose(batch0["state_in"][1].tolist()[1:],
                        batch0["state_out"][1].tolist()[:-1]))

        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        self.assertEqual(batch1["sequences"].tolist(), [
            [[10], [11], [12], [13]],
            [[14], [0], [0], [0]],
            [[0], [1], [2], [3]],
            [[4], [0], [0], [0]],
        ])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1])
        self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
        self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
def run(args, parser):
    def create_environment(env_config):
        # This import must happen inside the method so that worker processes import this code
        import roboschool
        return gym.make(args.env)

    if not args.config:
        # Load configuration from file
        config_dir = os.path.dirname(args.checkpoint)
        # params.json is saved in the model directory during ray training by default
        config_path = os.path.join(config_dir, "params.json")
        with open(config_path) as f:
            args.config = json.load(f)

    if not args.env:
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()

    register_env(args.env, create_environment)

    cls = get_agent_class(args.algorithm)
    config = args.config
    config["monitor"] = False
    config["num_workers"] = 1
    config["num_gpus"] = 0
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_episodes = int(args.evaluate_episodes)

    if args.algorithm == "DQN":
        env = gym.make(args.env)
        env = wrap_dqn(env, args.config.get("model", {}))
    else:
        env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env))
    env = wrappers.Monitor(env, OUTPUT_DIR, force=True, video_callable=lambda episode_id: True)
    all_rewards = []
    for episode in range(num_episodes):
        steps = 0
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done:
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            steps += 1
            state = next_state
        all_rewards.append(reward_total)
        print("Episode reward: %s. Episode steps: %s" % (reward_total, steps))
    print("Mean Reward:", np.mean(all_rewards))
    print("Max Reward:", np.max(all_rewards))
    print("Min Reward:", np.min(all_rewards))
    def testTrainMultiCartpoleSinglePolicy(self):
        n = 10
        register_env("multi_cartpole", lambda _: MultiCartpole(n))
        pg = PGAgent(env="multi_cartpole", config={"num_workers": 0})
        for i in range(100):
            result = pg.train()
            print("Iteration {}, reward {}, timesteps {}".format(
                i, result["episode_reward_mean"], result["timesteps_total"]))
            if result["episode_reward_mean"] >= 50 * n:
                return
        raise Exception("failed to improve reward")
def check_support_multiagent(alg, config):
    register_env("multi_mountaincar", lambda _: MultiMountainCar(2))
    register_env("multi_cartpole", lambda _: MultiCartpole(2))
    if "DDPG" in alg:
        a = get_agent_class(alg)(config=config, env="multi_mountaincar")
    else:
        a = get_agent_class(alg)(config=config, env="multi_cartpole")
    try:
        a.train()
    finally:
        a.stop()
Example #12
    def _register_if_needed(self, env_object):
        if isinstance(env_object, six.string_types):
            return env_object
        elif isinstance(env_object, type):
            name = env_object.__name__
            register_env(name, lambda config: env_object(config))
            return name
        raise ValueError(
            "{} is an invalid env specification. ".format(env_object) +
            "You can specify a custom env as either a class "
            "(e.g., YourEnvCls) or a registered env id (e.g., \"your_env\").")
    def testTrainCartpoleOffPolicy(self):
        register_env(
            "test3", lambda _: PartOffPolicyServing(
                gym.make("CartPole-v0"), off_pol_frac=0.2))
        dqn = DQNAgent(env="test3", config={"exploration_fraction": 0.001})
        for i in range(100):
            result = dqn.train()
            print("Iteration {}, reward {}, timesteps {}".format(
                i, result["episode_reward_mean"], result["timesteps_total"]))
            if result["episode_reward_mean"] >= 100:
                return
        raise Exception("failed to improve reward")
    def testQueryEvaluators(self):
        register_env("test", lambda _: gym.make("CartPole-v0"))
        pg = PGAgent(
            env="test", config={
                "num_workers": 2,
                "sample_batch_size": 5
            })
        results = pg.optimizer.foreach_evaluator(
            lambda ev: ev.sample_batch_size)
        results2 = pg.optimizer.foreach_evaluator_with_index(
            lambda ev, i: (i, ev.sample_batch_size))
        self.assertEqual(results, [5, 5, 5])
        self.assertEqual(results2, [(0, 5), (1, 5), (2, 5)])
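        # Three results come back for num_workers=2 because foreach_evaluator
        # also covers the local evaluator (index 0 in results2) in addition to
        # the two remote workers.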
Example #15
    def testMultiAgent(self):
        register_env("multi_cartpole", lambda _: MultiCartpole(10))
        single_env = gym.make("CartPole-v0")

        def gen_policy():
            obs_space = single_env.observation_space
            act_space = single_env.action_space
            return (PGPolicyGraph, obs_space, act_space, {})

        pg = PGAgent(
            env="multi_cartpole",
            config={
                "num_workers": 0,
                "output": self.test_dir,
                "multiagent": {
                    "policy_graphs": {
                        "policy_1": gen_policy(),
                        "policy_2": gen_policy(),
                    },
                    "policy_mapping_fn": (
                        lambda agent_id: random.choice(
                            ["policy_1", "policy_2"])),
                },
            })
        pg.train()
        self.assertEqual(len(os.listdir(self.test_dir)), 1)

        pg.stop()
        pg = PGAgent(
            env="multi_cartpole",
            config={
                "num_workers": 0,
                "input": self.test_dir,
                "input_evaluation": ["simulation"],
                "train_batch_size": 2000,
                "multiagent": {
                    "policy_graphs": {
                        "policy_1": gen_policy(),
                        "policy_2": gen_policy(),
                    },
                    "policy_mapping_fn": (
                        lambda agent_id: random.choice(
                            ["policy_1", "policy_2"])),
                },
            })
        for _ in range(50):
            result = pg.train()
            if not np.isnan(result["episode_reward_mean"]):
                return  # simulation ok
            time.sleep(0.1)
        assert False, "did not see any simulation results"
    def testRolloutDictSpace(self):
        register_env("nested", lambda _: NestedDictEnv())
        agent = PGAgent(env="nested")
        agent.train()
        path = agent.save()
        agent.stop()

        # Test train works on restore
        agent2 = PGAgent(env="nested")
        agent2.restore(path)
        agent2.train()

        # Test rollout works on restore
        rollout(agent2, "nested", 100)
    def testMultiAgentComplexSpaces(self):
        ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
        ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
        register_env("nested_ma", lambda _: NestedMultiAgentEnv())
        act_space = spaces.Discrete(2)
        pg = PGAgent(
            env="nested_ma",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "multiagent": {
                    "policy_graphs": {
                        "tuple_policy": (
                            PGPolicyGraph, TUPLE_SPACE, act_space,
                            {"model": {"custom_model": "tuple_spy"}}),
                        "dict_policy": (
                            PGPolicyGraph, DICT_SPACE, act_space,
                            {"model": {"custom_model": "dict_spy"}}),
                    },
                    "policy_mapping_fn": lambda a: {
                        "tuple_agent": "tuple_policy",
                        "dict_agent": "dict_policy"}[a],
                },
            })
        pg.train()

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
    def testQueryEvaluators(self):
        register_env("test", lambda _: gym.make("CartPole-v0"))
        pg = PGAgent(
            env="test",
            config={
                "num_workers": 2,
                "sample_batch_size": 5,
                "num_envs_per_worker": 2,
            })
        results = pg.optimizer.foreach_evaluator(
            lambda ev: ev.sample_batch_size)
        results2 = pg.optimizer.foreach_evaluator_with_index(
            lambda ev, i: (i, ev.sample_batch_size))
        results3 = pg.optimizer.foreach_evaluator(
            lambda ev: ev.foreach_env(lambda env: 1))
        self.assertEqual(results, [10, 10, 10])
        self.assertEqual(results2, [(0, 10), (1, 10), (2, 10)])
        self.assertEqual(results3, [[1, 1], [1, 1], [1, 1]])
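        # With num_envs_per_worker=2, each evaluator's effective
        # sample_batch_size doubles from 5 to 10, and foreach_env visits the
        # two envs on each evaluator, hence the [1, 1] entries.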
    def testTrainMultiCartpoleMultiPolicy(self):
        n = 10
        register_env("multi_cartpole", lambda _: MultiCartpole(n))
        single_env = gym.make("CartPole-v0")

        def gen_policy():
            config = {
                "gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]),
                "n_step": random.choice([1, 2, 3, 4, 5]),
            }
            obs_space = single_env.observation_space
            act_space = single_env.action_space
            return (PGPolicyGraph, obs_space, act_space, config)

        pg = PGAgent(
            env="multi_cartpole",
            config={
                "num_workers": 0,
                "multiagent": {
                    "policy_graphs": {
                        "policy_1": gen_policy(),
                        "policy_2": gen_policy(),
                    },
                    "policy_mapping_fn": lambda agent_id: "policy_1",
                },
            })

        # Just check that it runs without crashing
        for i in range(10):
            result = pg.train()
            print("Iteration {}, reward {}, timesteps {}".format(
                i, result["episode_reward_mean"], result["timesteps_total"]))
        self.assertTrue(
            pg.compute_action([0, 0, 0, 0], policy_id="policy_1") in [0, 1])
        self.assertTrue(
            pg.compute_action([0, 0, 0, 0], policy_id="policy_2") in [0, 1])
        self.assertRaises(
            KeyError,
            lambda: pg.compute_action([0, 0, 0, 0], policy_id="policy_3"))
Example #20
    register(
        id=env_name,
        entry_point='ray.rllib.examples:' + "MultiAgentMountainCarEnv",
        max_episode_steps=200,
        kwargs={}
    )


def create_env(env_config):
    pass_params_to_gym(env_name)
    env = gym.envs.make(env_name)
    return env


if __name__ == '__main__':
    register_env(env_name, lambda env_config: create_env(env_config))
    config = ppo.DEFAULT_CONFIG.copy()
    horizon = 10
    num_cpus = 4
    ray.init(num_cpus=num_cpus, redirect_output=True)
    config["num_workers"] = num_cpus
    config["timesteps_per_batch"] = 10
    config["num_sgd_iter"] = 10
    config["gamma"] = 0.999
    config["horizon"] = horizon
    config["use_gae"] = False
    config["model"].update({"fcnet_hiddens": [256, 256]})
    options = {"multiagent_obs_shapes": [2, 2],
               "multiagent_act_shapes": [1, 1],
               "multiagent_shared_model": False,
               "multiagent_fcnet_hiddens": [[32, 32]] * 2}
        return 0

    def step(self, action):
        return 0, 0, True, {}


def leaked_processes():
    """Returns whether any subprocesses were leaked."""
    result = subprocess.check_output(
        "ps aux | grep '{}' | grep -v grep || true".format(UNIQUE_CMD),
        shell=True)
    return result


if __name__ == "__main__":
    register_env("subproc", lambda config: EnvWithSubprocess(config))
    ray.init()
    assert os.path.exists(UNIQUE_FILE_0)
    assert os.path.exists(UNIQUE_FILE_1)
    assert not leaked_processes()
    run_experiments({
        "demo": {
            "run": "PG",
            "env": "subproc",
            "num_samples": 1,
            "config": {
                "num_workers": 1,
            },
            "stop": {
                "training_iteration": 1
            },
env_name = 'sonic_env'
# Note that the hyperparameters have been tuned for Sonic, which can be
# run by replacing the function below with:
#
#     register_env(env_name, lambda config: sonic_on_ray.make(
#                                game='SonicTheHedgehog-Genesis',
#                                state='GreenHillZone.Act1'))
#
# However, to try Sonic, you have to obtain the ROM yourself (see the
# instructions at https://github.com/openai/retro/blob/master/README.md).
# register_env(env_name,
#              lambda config: sonic_on_ray.make(game='Airstriker-Genesis',
#                                               state='Level1'))

register_env(env_name,
             lambda config: sonic_on_ray.make(game='BustAMove-Snes',
                                              state='BustAMove.1pplay.Level10'))

ray.init()

run_experiments({
    'sonic-ppo': {
        'run': 'PPO',
        'env': 'sonic_env',
        # 'trial_resources': {
        #     'gpu': 2,  # note, keep this in sync with 'devices' config value
        #     'cpu': lambda spec: spec.config.num_workers,  # one cpu per worker
        # },
        'config': {
            # grid search over learning rate
            'sgd_stepsize': grid_search([1e-4, 5e-5, 1e-5, 5e-6]),
Example #23
    def _reset(self):
        self.cur_pos = 0
        return [self.cur_pos]

    def _step(self, action):
        assert action in [0, 1]
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        elif action == 1:
            self.cur_pos += 1
        done = self.cur_pos >= self.end_pos
        return [self.cur_pos], 1 if done else 0, done, {}


if __name__ == "__main__":
    env_creator_name = "corridor"
    register_env(env_creator_name, lambda config: SimpleCorridor(config))
    ray.init()
    run_experiments({
        "demo": {
            "run": "PPO",
            "env": "corridor",
            "config": {
                "env_config": {
                    "corridor_length": 5,
                },
            },
        },
    })
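
The snippet above omits SimpleCorridor.__init__; as a rough sketch (an assumption, not the original code), the "env_config" dict from the experiment spec is passed to the creator lambda registered above and would typically be consumed like this:

# Hypothetical sketch only: register_env's creator lambda calls
# SimpleCorridor(config), where config is the "env_config" dict above.
class SimpleCorridorSketch:
    def __init__(self, config):
        self.end_pos = config.get("corridor_length", 10)
        self.cur_pos = 0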
Example #24
    def test_no_step_on_init(self):
        # Allow for Unittest run.
        ray.init(num_cpus=5, ignore_reinit_error=True)
        register_env("fail", lambda _: FailOnStepEnv())
        pg = PGTrainer(env="fail", config={"num_workers": 1})
        self.assertRaises(Exception, lambda: pg.train())
Example #25
                auxiliary_name_scope=False):
            last_layer = slim.fully_connected(
                input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1")
        last_layer = slim.fully_connected(
            last_layer, 64, activation_fn=tf.nn.relu, scope="fc2")
        output = slim.fully_connected(
            last_layer, num_outputs, activation_fn=None, scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (PPOPolicyGraph, obs_space, act_space, config)
Example #26
    return parser.parse_args()


if __name__ == '__main__':
    args = get_parser()

    # Start ray
    ray.init()
    # NOTE: We are using DuckietownLF environment because SteeringToWheelVelWrapper does not cooperate with multimap.
    ModelCatalog.register_custom_model(
        "image-ddpg",
        DDPGRLLibModel,
    )

    register_env("DuckieTown-MultiMap",
                 lambda _: MultiMapSteeringToWheelVelWrapper(MultiMapEnv()))

    csv_path = "searches/ddpg_results.csv"
    starting_idx = 0
    if os.path.exists(csv_path):
        with open(csv_path, mode="r") as f:
            starting_idx = len(f.readlines())

    for search_idx in trange(args.n_searches, desc="Searches"):
        config = {
            "framework": "torch",
            "model": {
                "custom_model": "image-ddpg",
            },
            # "use_state_preprocessor": True,
            "learning_starts": 0,
Example #27
            spaces.Box(
                low=-10, high=10,
                shape=(config["observation_size"],),
                dtype=np.float32))

    def run(self):
        print("Starting policy server at {}:{}".format(SERVER_ADDRESS,
                                                       SERVER_PORT))
        server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
        server.serve_forever()


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()
    register_env("srv", lambda config: SimpleServing(config))

    if args.run == "DQN":
        trainer = DQNTrainer(
            env="srv",
            config={
                # Use a single process to avoid needing a load balancer
                "num_workers": 0,
                # Configure the trainer to run short iterations for debugging
                "exploration_fraction": 0.01,
                "learning_starts": 100,
                "timesteps_per_iteration": 200,
                "env_config": {
                    "observation_size": args.observation_size,
                    "action_size": args.action_size,
                },
import ray
from ray.tune.registry import register_env
from ray.tune import run_experiments
from ray.rllib.agents import ppo


def env_creator(env_config={}):
    env = QuadLocEnv(dataDir='/home/Pearl/quantm/RL_env/data/', num=500)
    num = env.action_space.n
    # print("Action:", num)
    env.reset()
    return env


register_env("QuadLocEnv-v0", env_creator)

############
ray.init(use_raylet=True, redis_password=os.urandom(128).hex())
register_env("QuadLocEnv-v0", env_creator)

experiment_spec = {
    "custom_env": {
        "run": "A3C",
        "env": "QuadLocEnv-v0",
        #             "restore": checkpoint,
        "config": {
            "model": {
                "custom_model": "ConvNet2D",
            },
        },
Example #29
# save the flow params for replay
flow_json = json.dumps(flow_params,
                       cls=FlowParamsEncoder,
                       sort_keys=True,
                       indent=4)  # generating a string version of flow_params
config['env_config'][
    'flow_params'] = flow_json  # adding the flow_params to config dict
config['env_config']['run'] = alg_run

# Call the utility function make_create_env to be able to
# register the Flow env for this experiment
create_env, gym_name = make_create_env(params=flow_params, version=0)

config["env"] = gym_name
# Register as rllib env with Gym
register_env(gym_name, create_env)

exp = Experiment(
    flow_params["exp_tag"],
    **{
        "run": alg_run,
        "config": {
            **config
        },
        "checkpoint_freq": 5,  # number of iterations between checkpoints
        "checkpoint_at_end": True,  # generate a checkpoint at the end
        "max_failures": 5,
        "stop": {  # stopping conditions
            "training_iteration": 400,  # number of iterations to stop after
        },
        "num_samples": 1,
def run(args, parser):
    def create_environment(env_config):
        return gym.make(args.env)

    if not args.config:
        # Load configuration from file
        config_dir = os.path.dirname(args.checkpoint)
        # params.json is saved in the model directory during ray training by default
        config_path = os.path.join(config_dir, "params.json")
        with open(config_path) as f:
            args.config = json.load(f)

    if not args.env:
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()

    register_env(args.env, create_environment)

    if ray.__version__ >= "0.6.5":
        from ray.rllib.agents.registry import get_agent_class
    else:
        from ray.rllib.agents.agent import get_agent_class

    cls = get_agent_class(args.algorithm)
    config = args.config
    config["monitor"] = False
    config["num_workers"] = 1
    config["num_gpus"] = 0
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_episodes = int(args.evaluate_episodes)

    if ray.__version__ >= "0.6.5":
        env = gym.make(args.env)
    else:
        from ray.rllib.agents.dqn.common.wrappers import wrap_dqn

        if args.algorithm == "DQN":
            env = gym.make(args.env)
            env = wrap_dqn(env, args.config.get("model", {}))
        else:
            env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env))

    env = wrappers.Monitor(env, OUTPUT_DIR, force=True, video_callable=lambda episode_id: True)
    all_rewards = []
    for episode in range(num_episodes):
        steps = 0
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done:
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            steps += 1
            state = next_state
        all_rewards.append(reward_total)
        print("Episode reward: %s. Episode steps: %s" % (reward_total, steps))
    print("Mean Reward:", np.mean(all_rewards))
    print("Max Reward:", np.max(all_rewards))
    print("Min Reward:", np.min(all_rewards))
Example #31
from ray.tune.registry import register_env
from wanderer_roborobo import WandererRoborobo

if __name__ == "__main__":

    ray.init(num_cpus=1, num_gpus=1)

    #%%

    n_players = 1
    max_moves = 1000
    agents_id = ['player{:d}'.format(i) for i in range(n_players)]
    actions = {agents_id[i]: 1 for i in range(n_players)}

    register_env("wanderer_roborobo",
                 lambda _: WandererRoborobo(n_players, max_moves))
    act_space = WandererRoborobo.action_space

    obs_space = WandererRoborobo.observation_space

    policies = {
        agents_id[i]: (None, obs_space, act_space, {})
        for i in range(n_players)
    }

    def select_policy(agent_id):
        return agent_id

    config = {
        "num_gpus": 0,
        'num_workers': 0,
Example #32
from ray.tune.registry import register_env
from nes_py.wrappers import JoypadSpace
from ray.rllib.env.atari_wrappers import WarpFrame

import gym_tetris
from gym_tetris.actions import MOVEMENT


def tetris_env_creator(version="TetrisA-v0"):
    def env_creator(env_config):
        env = gym_tetris.make(version)
        env = JoypadSpace(env, MOVEMENT)
        env = WarpFrame(env, dim=84)
        return env
    return env_creator


register_env("TetrisA-v0", tetris_env_creator("TetrisA-v0"))
register_env("TetrisA-v1", tetris_env_creator("TetrisA-v1"))
register_env("TetrisA-v2", tetris_env_creator("TetrisA-v2"))
register_env("TetrisA-v3", tetris_env_creator("TetrisA-v3"))
Example #33
            sgd_minibatch_size=128))

    # Combined training flow
    train_op = Concurrently([ppo_train_op, dqn_train_op],
                            mode="async",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with 4 independent cartpole entities
    register_env("multi_agent_cartpole",
                 lambda _: MultiAgentCartPole({"num_agents": 4}))
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Note that since the trainer below does not include a default policy or
    # policy configs, we have to explicitly set it in the multiagent config:
    policies = {
        "ppo_policy": (PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
        "dqn_policy": (DQNTFPolicy, obs_space, act_space, DQN_CONFIG),
    }

    def policy_mapping_fn(agent_id):
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
Example #34
        self.dts_taken_so_far = 1
        return self.env.env.robot.calc_state() #return state of robot

    def step(self, action):
        input("Press Enter  .....")    
        print("Colisions for feet:					       ", self.env.env.robot.calc_state()[20], "   ", self.env.env.robot.calc_state()[21]) #returns states, last 2 numbers inticate whther foot is in contact with ground
        self.dts_taken_so_far += 1
        if self.debug:
            print("Time elapsed in episode: ", self.dts_taken_so_far * self.env.env.scene.dt)
            print("Number of dt's taken in episode: " , self.dts_taken_so_far)
        return self.env.step(action)



from ray.tune.registry import register_env
register_env("cartpolebulletenv", lambda config: MultiEnv(config))
register_env("reacherbulletenv", lambda config: ReacherEnv(config))
register_env("pusherbulletenv", lambda config: PusherEnv(config))
register_env("throwerbulletenv", lambda config: ThrowerEnv(config))
register_env("strikerbulletenv", lambda config: StrikerEnv(config))
register_env("walkerbulletenv", lambda config: WalkerEnv(config))


#register_env("octoenv", lambda config: OctoEnv(config))
#trainer = ppo.PPOTrainer(config=config, env="octoenv")

###########


class RolloutSaver:
    """Utility class for storing rollouts.
Example #35
def main(argv):
    ModelCatalog.register_custom_model("my_model", MyModelClass)

    model = {
        # custom model options
        "custom_model": "my_model",
        "custom_preprocessor": None,
        # Extra options to pass to the custom classes
        "custom_options": {},

        # built in options
        # Number of hidden layers for fully connected net
        "fcnet_hiddens": [256, 256, 256, 256],
    }

    num_workers = 2

    # read out command line arguments
    try:
        opts, args = getopt.getopt(argv, "hn:", ["number-worker="])
    except getopt.GetoptError:
        print('ray_server.py -n <number-worker>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('ray_server.py -n <number-worker>')
            print('-n --number-worker  - number of workers to start')
            sys.exit()
        elif opt in ("-n", "--number-worker"):
            num_workers = int(arg)

    ray.init()
    print("[RAY] Initialized")
    register_env("srv", lambda _: CartpoleServing())

    if ALGORITHM == "APEX":
        dqn = ApexTrainer(
            env="srv",
            config={
                # model
                "model": model,
                "gamma": 0.8,
                "noisy": False,
                "num_gpus": 1,

                # evaluation
                # everything default, see dqn.py

                #exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py

                #replay buffer
                # Size of the replay buffer. Note that if async_updates is set, then
                # each worker will have a replay buffer of this size. default 50000
                "buffer_size": 2000000,
                # If True prioritized replay buffer will be used.
                "prioritized_replay": True,
                # many more parameters here, left at their defaults (see dqn.py)

                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.01,
                # Size of rollout batch
                # Default sample batch size (unroll length). Batches of this size are
                # collected from workers until train_batch_size is met. When using
                # multiple envs per worker, this is multiplied by num_envs_per_worker.
                "sample_batch_size": 4,
                # Training batch size, if applicable. Should be >= sample_batch_size.
                # Sample batches will be concatenated together to this size for training.
                "train_batch_size": 256,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,

                #parallelism
                "num_workers": num_workers,
                # distribute epsilon over workers (default for apex)
                "per_worker_exploration": True,
                # compute on each worker which experiences should be prioritized
                # before handing them to the shared replay memory
                "worker_side_prioritization": True,

                # "schedule_max_timesteps": 100000, # was tut es?
                # "timesteps_per_iteration": 25000, # was tut es?
                # "min_iter_time_s": 30, # was tut es?
            })
    else:
        dqn = DQNTrainer(
            env="srv",
            config={
                # model
                # multiple threads for the workers! set to False for debugging
                # "sample_async": True,
                # "grad_clip": 0.5,
                "model": model,
                "gamma": 0.8,
                "noisy": False,
                "num_gpus": 1,

                # Whether to use dueling dqn
                "dueling": True,
                # Whether to use double dqn
                "double_q": True,

                # evaluation
                # everything default, see dqn.py

                # exploration
                "target_network_update_freq": 500000,
                # rest: everything default, see dqn.py

                # replay buffer
                # Size of the replay buffer. Note that if async_updates is set, then
                # each worker will have a replay buffer of this size. default 50000
                "buffer_size": 2000000,
                # If True prioritized replay buffer will be used.
                "prioritized_replay": True,
                # many more parameters here, left at their defaults (see dqn.py)

                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.01,
                # Update the replay buffer with this many samples at once. Note that
                # this setting applies per-worker if num_workers > 1.
                #"sample_batch_size": 1024,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,
                # Size of a batch sampled from the replay buffer for training. Note that
                # if async_updates is set, then each worker returns gradients for a
                # batch of this size. (Minibatch size) Should be >= sample_batch_size.
                # Sample batches will be concatenated together to this size for training.
                "train_batch_size": 256,

                # parallelism
                # Number of workers for collecting samples with. This only makes sense
                # to increase if your environment is particularly slow to sample, or if
                # you"re using the Async or Ape-X optimizers.
                "num_workers": num_workers,
                # distribute epsilon over workers
                "per_worker_exploration": True,
                # compute worker-side prioritization (DQN: False, DDQN: True, APEX: True!!)
                "worker_side_prioritization": True,
            })

    # write policy graph to tensorboard (for debugging purposes)
    policy_graph = dqn.local_evaluator.policy_map["default_policy"].sess.graph
    writer = tf.summary.FileWriter(dqn._result_logger.logdir, policy_graph)
    writer.close()

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
Example #36
    rllib rollout /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN
    --env CartPole-v0 --steps 1000000 --out rollouts.pkl

Example Usage via executable:
    ./rollout.py /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN
    --env CartPole-v0 --steps 1000000 --out rollouts.pkl
"""

ENV = "PathPlanningEnv"
if ENV == "CarlaRoadEnv":

    def env_creator(env_config):
        env = CarlaRoadEnv(env_config)
        return env

    register_env("CarlaRoadEnv-v0", env_creator)
    ModelCatalog.register_custom_model("carla_road_model", FactoredModel)
else:

    def env_creator(env_config):
        env = PathPlanningEnv(env_config)
        return env

    register_env("PathPlanningEnv-v0", env_creator)
    ModelCatalog.register_custom_model("path_planning_model",
                                       PathPlanningModel)

# Note: if you use any custom models or envs, register them here first, e.g.:
#
# ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
# register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))
Beispiel #37
0
def visualizer_rllib(args):
    """Visualizer for RLlib experiments.

    This function takes args (see function create_parser below for
    more detailed information on what information can be fed to this
    visualizer), and renders the experiment associated with it.
    """
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    config = get_rllib_config(result_dir)
    # TODO(ev) backwards compatibility hack
    try:
        pkl = get_rllib_pkl(result_dir)
    except Exception:
        pass

    # check if we have a multiagent scenario but in a
    # backwards compatible way
    if config.get('multiagent', {}).get('policy_graphs', {}):
        multiagent = True
        config['multiagent'] = pkl['multiagent']
    else:
        multiagent = False

    # Run on only one cpu for rendering purposes
    config['num_workers'] = 0

    flow_params = get_flow_params(config)

    # hack for old pkl files
    # TODO(ev) remove eventually
    sim_params = flow_params['sim']
    setattr(sim_params, 'num_clients', 1)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if args.run and config_run:
        if args.run != config_run:
            print('visualizer_rllib.py: error: run argument ' +
                  '\'{}\' passed in '.format(args.run) +
                  'differs from the one stored in params.json ' +
                  '\'{}\''.format(config_run))
            sys.exit(1)
    if args.run:
        agent_cls = get_agent_class(args.run)
    elif config_run:
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    sim_params.restart_instance = True
    dir_path = os.path.dirname(os.path.realpath(__file__))
    emission_path = '{0}/test_time_rollout/'.format(dir_path)
    sim_params.emission_path = emission_path if args.gen_emission else None

    # pick your rendering mode
    if args.render_mode == 'sumo_web3d':
        sim_params.num_clients = 2
        sim_params.render = False
    elif args.render_mode == 'drgb':
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
    elif args.render_mode == 'sumo_gui':
        sim_params.render = True
        print('NOTE: With render mode {}, an extra instance of the SUMO GUI '
              'will display before the GUI for visualizing the result. Click '
              'the green Play arrow to continue.'.format(args.render_mode))
    elif args.render_mode == 'no_render':
        sim_params.render = False
    if args.save_render:
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
        sim_params.save_render = True

    # Create and register a gym+rllib env
    create_env, env_name = make_create_env(params=flow_params, version=0)
    register_env(env_name, create_env)

    # check if the environment is a single or multiagent environment, and
    # get the right address accordingly
    # single_agent_envs = [env for env in dir(flow.envs)
    #                      if not env.startswith('__')]

    # if flow_params['env_name'] in single_agent_envs:
    #     env_loc = 'flow.envs'
    # else:
    #     env_loc = 'flow.multiagent_envs'

    # Start the environment with the gui turned on and a path for the
    # emission file
    env_params = flow_params['env']
    env_params.restart_instance = False
    if args.evaluate:
        env_params.evaluate = True

    # lower the horizon if testing
    if args.horizon:
        config['horizon'] = args.horizon
        env_params.horizon = args.horizon

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    agent.restore(checkpoint)

    if hasattr(agent, "local_evaluator") and \
            os.environ.get("TEST_FLAG") != 'True':
        env = agent.local_evaluator.env
    else:
        env = gym.make(env_name)

    if multiagent:
        rets = {}
        # map the agent id to its policy
        policy_map_fn = config['multiagent']['policy_mapping_fn'].func
        for key in config['multiagent']['policy_graphs'].keys():
            rets[key] = []
    else:
        rets = []

    if config['model']['use_lstm']:
        use_lstm = True
        if multiagent:
            state_init = {}
            # map the agent id to its policy
            policy_map_fn = config['multiagent']['policy_mapping_fn'].func
            size = config['model']['lstm_cell_size']
            for key in config['multiagent']['policy_graphs'].keys():
                state_init[key] = [
                    np.zeros(size, np.float32),
                    np.zeros(size, np.float32)
                ]
        else:
            state_init = [
                np.zeros(config['model']['lstm_cell_size'], np.float32),
                np.zeros(config['model']['lstm_cell_size'], np.float32)
            ]
    else:
        use_lstm = False

    env.restart_simulation(sim_params=sim_params, render=sim_params.render)

    # Simulate and collect metrics
    final_outflows = []
    final_inflows = []
    mean_speed = []
    std_speed = []
    for i in range(args.num_rollouts):
        vel = []
        state = env.reset()
        if multiagent:
            ret = {key: [0] for key in rets.keys()}
        else:
            ret = 0
        for j in range(env_params.horizon):
            vehicles = env.unwrapped.k.vehicle
            vel.append(np.mean(vehicles.get_speed(vehicles.get_ids())))
            if multiagent:
                action = {}
                for agent_id in state.keys():
                    if use_lstm:
                        action[agent_id], state_init[agent_id], logits = \
                            agent.compute_action(
                            state[agent_id], state=state_init[agent_id],
                            policy_id=policy_map_fn(agent_id))
                    else:
                        action[agent_id] = agent.compute_action(
                            state[agent_id], policy_id=policy_map_fn(agent_id))
                        if j == 0:
                            action[agent_id] = 0.2  # to prevent accident
                        #print("hello, ", j, action[agent_id])

            else:
                action = agent.compute_action(state)
            state, reward, done, _ = env.step(action)
            if multiagent:
                for actor, rew in reward.items():
                    ret[policy_map_fn(actor)][0] += rew
            else:
                ret += reward
            if multiagent and done['__all__']:
                break
            if not multiagent and done:
                break

        if multiagent:
            for key in rets.keys():
                rets[key].append(ret[key])
        else:
            rets.append(ret)
        outflow = vehicles.get_outflow_rate(500)
        final_outflows.append(outflow)
        inflow = vehicles.get_inflow_rate(500)
        final_inflows.append(inflow)
        if np.all(np.array(final_inflows) > 1e-5):
            throughput_efficiency = [
                x / y for x, y in zip(final_outflows, final_inflows)
            ]
        else:
            throughput_efficiency = [0] * len(final_inflows)
        mean_speed.append(np.mean(vel))
        std_speed.append(np.std(vel))
        if multiagent:
            for agent_id, rew in rets.items():
                print('Round {}, Return: {} for agent {}'.format(
                    i, ret, agent_id))
        else:
            print('Round {}, Return: {}'.format(i, ret))

    print('==== Summary of results ====')
    print("Return:")
    print(mean_speed)
    if multiagent:
        for agent_id, rew in rets.items():
            print('For agent', agent_id)
            print(rew)
            print('Average, std return: {}, {} for agent {}'.format(
                np.mean(rew), np.std(rew), agent_id))
    else:
        print(rets)
        print('Average, std: {}, {}'.format(np.mean(rets), np.std(rets)))

    print("\nSpeed, mean (m/s):")
    print(mean_speed)
    print('Average, std: {}, {}'.format(np.mean(mean_speed),
                                        np.std(mean_speed)))
    print("\nSpeed, std (m/s):")
    print(std_speed)
    print('Average, std: {}, {}'.format(np.mean(std_speed), np.std(std_speed)))

    # Compute arrival rate of vehicles in the last 500 sec of the run
    print("\nOutflows (veh/hr):")
    print(final_outflows)
    print('Average, std: {}, {}'.format(np.mean(final_outflows),
                                        np.std(final_outflows)))
    # Compute departure rate of vehicles in the last 500 sec of the run
    print("Inflows (veh/hr):")
    print(final_inflows)
    print('Average, std: {}, {}'.format(np.mean(final_inflows),
                                        np.std(final_inflows)))
    # Compute throughput efficiency in the last 500 sec of the
    print("Throughput efficiency (veh/hr):")
    print(throughput_efficiency)
    print('Average, std: {}, {}'.format(np.mean(throughput_efficiency),
                                        np.std(throughput_efficiency)))

    # terminate the environment
    env.unwrapped.terminate()

    # if prompted, convert the emission file into a csv file
    if args.gen_emission:
        time.sleep(0.1)

        dir_path = os.path.dirname(os.path.realpath(__file__))
        emission_filename = '{0}-emission.xml'.format(env.scenario.name)

        emission_path = \
            '{0}/test_time_rollout/{1}'.format(dir_path, emission_filename)

        # convert the emission file into a csv file
        emission_to_csv(emission_path)

        # delete the .xml version of the emission file
        os.remove(emission_path)

    # if we wanted to save the render, here we create the movie
    if args.save_render:
        dirs = os.listdir(os.path.expanduser('~') + '/flow_rendering')
        # Ignore hidden files
        dirs = [d for d in dirs if d[0] != '.']
        dirs.sort(key=lambda date: datetime.strptime(date, "%Y-%m-%d-%H%M%S"))
        recent_dir = dirs[-1]
        # create the movie
        movie_dir = os.path.expanduser('~') + '/flow_rendering/' + recent_dir
        save_dir = os.path.expanduser('~') + '/flow_movies'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        os_cmd = "cd " + movie_dir + " && ffmpeg -i frame_%06d.png"
        os_cmd += " -pix_fmt yuv420p " + dirs[-1] + ".mp4"
        os_cmd += "&& cp " + dirs[-1] + ".mp4 " + save_dir + "/"
        os.system(os_cmd)
Beispiel #38
0
        self.obs_in = input_dict["obs"]
        self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
                                           self.action_space, num_outputs,
                                           options)
        return self.fcnet.outputs, self.fcnet.last_layer


if __name__ == "__main__":

    board_sizes = (3, 3)
    diff = 2
    st = 100000000

    env_name = 'puzzle-v0'
    #my_board = gym.make('gym_puzzle:puzzle-v0')
    register_env(env_name, lambda config: PuzzleEnv(config))

    ray.init()
    # ModelCatalog.register_custom_model("my_model", CustomModel)
    tune.run(
        "PPO",
        stop={
            #"timesteps_total": 10000,
            #"episode_len_mean": 20.0,
            "training_iteration": 50,
        },
        config={
            "env": "puzzle-v0",  # or "puzzle-v0" if registered above
            # "model":
            #     "custom_model": "my_model",
            # },
    def register_env_creator(self):
        register_env("RoboschoolReacher-v1", create_environment)
Beispiel #41
0
            # Box(low=-1, high=1000, shape=(31,), dtype=np.float)
            Box(np.array(lower_bounds), np.array(upper_bounds)))

    def run(self):
        print("Starting policy server at {}:{}".format(SERVER_ADDRESS,
                                                       SERVER_PORT))
        server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
        server.serve_forever()


if __name__ == "__main__":

    # ray.init(redis_max_memory=10000000000, object_store_memory=3000000000, memory=2000000000)
    ray.init()

    register_env("srv", lambda _: MarketServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    # dqn = PGTrainer(
    #     env="srv",
    #     config={
    #         # Use a single process to avoid needing to set up a load balancer
    #         # "num_workers": 0,
    #         "evaluation_num_episodes": 1,
    #         # "sample_batch_size": 40,
    #         # "train_batch_size": 40,
    #         # "horizon": 40,
    #         "sample_batch_size": 15,
    #         "train_batch_size": 128,
    #     })
Beispiel #42
0
class CartpoleServing(ExternalEnv):
    def __init__(self):
        ExternalEnv.__init__(
            self, spaces.Discrete(2),
            spaces.Box(low=-10, high=10, shape=(4, ), dtype=np.float32))

    def run(self):
        print("Starting policy server at {}:{}".format(SERVER_ADDRESS,
                                                       SERVER_PORT))
        server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
        server.serve_forever()


if __name__ == "__main__":
    ray.init()
    register_env("srv", lambda _: CartpoleServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="srv",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
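Beispiel #42 only shows the serving side; a minimal client-side sketch, assuming the older ray.rllib.utils.policy_client.PolicyClient API that pairs with this PolicyServer (the address, port, and CartPole loop are illustrative):

import gym
from ray.rllib.utils.policy_client import PolicyClient

# sketch only: drive a local CartPole env and let the remote trainer pick actions
env = gym.make("CartPole-v0")
client = PolicyClient("http://localhost:9900")  # SERVER_ADDRESS:SERVER_PORT assumed

obs = env.reset()
eid = client.start_episode(training_enabled=True)
done = False
while not done:
    action = client.get_action(eid, obs)
    obs, reward, done, _ = env.step(action)
    client.log_returns(eid, reward)
client.end_episode(eid, obs)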
Beispiel #43
0
    def reset(self):
        self.cur_pos = 0
        return [self.cur_pos]

    def step(self, action):
        assert action in [0, 1], action
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        elif action == 1:
            self.cur_pos += 1
        done = self.cur_pos >= self.end_pos
        return [self.cur_pos], 1 if done else 0, done, {}


if __name__ == "__main__":
    print("CORRIDOR TEST")
    env_creator_name = "corridor"
    register_env(env_creator_name, lambda config: SimpleCorridor(config))
    ray.init()
    run_experiments({
        "demo": {
            "run": "PPO",
            "env": "corridor",
            "config": {
                "env_config": {
                    "corridor_length": 5,
                },
            },
        },
    })
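The experiment above forwards corridor_length through env_config, but the constructor is not part of the snippet; a hypothetical sketch of how the registered creator might consume that dict (everything other than corridor_length is an assumption):

import gym
import numpy as np
from gym.spaces import Box, Discrete

class SimpleCorridor(gym.Env):
    def __init__(self, config):
        # RLlib passes config["env_config"] as `config` to the registered creator
        self.end_pos = config.get("corridor_length", 10)
        self.cur_pos = 0
        self.action_space = Discrete(2)
        self.observation_space = Box(
            0.0, float(self.end_pos), shape=(1, ), dtype=np.float32)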
Beispiel #44
0
        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
        action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability)
        inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
        masked_logits = inf_mask + action_logits

        return masked_logits, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))
    if args.run == "PPO":
        cfg = {
            "observation_filter": "NoFilter",  # don't filter the action list
            "vf_share_layers": True,  # don't create duplicate value model
        }
    elif args.run == "DQN":
        cfg = {
            "hiddens": [],  # important: don't postprocess the action scores
        }
    else:
        cfg = {}  # PG, IMPALA, A2C, etc.
    run_experiments({
        "parametric_cartpole": {
            "run": args.run,
            "env": "pa_cartpole",
    def register_env_creator(self):
        register_env("RoboschoolHumanoid-v1", create_environment)
            rewards.append(reward)
        return {
            "episode_reward_mean": np.mean(rewards),
            "timesteps_this_iter": steps,
        }




import ray
from ray import tune
from ray.rllib.utils.seed import seed as rllib_seed
import rl_toy
from rl_toy.envs import RLToyEnv
from ray.tune.registry import register_env
register_env("RLToy-v0", lambda config: RLToyEnv(config))



from ray.rllib.models.preprocessors import OneHotPreprocessor
from ray.rllib.models import ModelCatalog
ModelCatalog.register_custom_preprocessor("ohe", OneHotPreprocessor)



# rllib_seed(0, 0, 0)  # IMPORTANT: likely does not propagate to worker processes; use config["seed"] instead
# np.random.seed(0)
# import random
# random.seed(0)
# import tensorflow as tf
# tf.set_random_seed(0)
import argparse

import gym

from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph
from ray.rllib.agents.ppo.ppo import PPOAgent
from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph
from ray.rllib.tests.test_multi_agent_env import MultiCartpole
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env

parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=20)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with 4 independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(4))
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # You can also have multiple policy graphs per trainer, but here we just
    # show one each for PPO and DQN.
    policy_graphs = {
        "ppo_policy": (PPOPolicyGraph, obs_space, act_space, {}),
        "dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}),
    }

    def policy_mapping_fn(agent_id):
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
            return "dqn_policy"
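The fragment stops before any trainer is built; a hedged sketch of how each algorithm might share the env while training only its own policy (the DQNAgent import and the policy_graphs-era multiagent keys are assumptions based on the imports above; newer RLlib renames them to "policies"):

    from ray.rllib.agents.dqn import DQNAgent

    # sketch only: each trainer optimizes just its own policy
    ppo_trainer = PPOAgent(
        env="multi_cartpole",
        config={
            "multiagent": {
                "policy_graphs": policy_graphs,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["ppo_policy"],
            },
        })
    dqn_trainer = DQNAgent(
        env="multi_cartpole",
        config={
            "multiagent": {
                "policy_graphs": policy_graphs,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
        })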
Beispiel #48
0
def setup_exps_rllib(flow_params,
                     n_cpus,
                     n_rollouts,
                     policy_graphs=None,
                     policy_mapping_fn=None,
                     policies_to_train=None):
    """Return the relevant components of an RLlib experiment.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters (see flow/utils/registry.py)
    n_cpus : int
        number of CPUs to run the experiment over
    n_rollouts : int
        number of rollouts per training iteration
    policy_graphs : dict, optional
        mapping from policy names to (policy_cls, obs_space, act_space, config)
        tuples, stored under config['multiagent']['policies']
    policy_mapping_fn : function, optional
        function mapping an agent ID to the name of the policy it controls
    policies_to_train : list of str, optional
        names of the policies whose weights are updated during training

    Returns
    -------
    str
        name of the training algorithm
    str
        name of the gym environment to be trained
    dict
        training configuration parameters
    """
    from ray import tune
    from ray.tune.registry import register_env
    try:
        from ray.rllib.agents.agent import get_agent_class
    except ImportError:
        from ray.rllib.agents.registry import get_agent_class

    horizon = flow_params['env'].horizon

    alg_run = "PPO"

    agent_cls = get_agent_class(alg_run)
    config = deepcopy(agent_cls._default_config)

    config["num_workers"] = n_cpus
    config["train_batch_size"] = horizon * n_rollouts
    config["gamma"] = 0.999  # discount rate
    config["model"].update({"fcnet_hiddens": [32, 32, 32]})
    config["use_gae"] = True
    config["lambda"] = 0.97
    config["kl_target"] = 0.02
    config["num_sgd_iter"] = 10
    config["horizon"] = horizon

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    # multiagent configuration
    if policy_graphs is not None:
        print("policy_graphs", policy_graphs)
        config['multiagent'].update({'policies': policy_graphs})
    if policy_mapping_fn is not None:
        config['multiagent'].update(
            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
    if policies_to_train is not None:
        config['multiagent'].update({'policies_to_train': policies_to_train})

    create_env, gym_name = make_create_env(params=flow_params)

    # Register as rllib env
    register_env(gym_name, create_env)
    return alg_run, gym_name, config
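For context, a hedged sketch of how a caller might consume the (alg_run, gym_name, config) triple returned above; the experiment tag, CPU counts, and stopping criteria are placeholders:

import ray
from ray.tune import run_experiments

alg_run, gym_name, config = setup_exps_rllib(flow_params, n_cpus=1, n_rollouts=1)
ray.init(num_cpus=2)
run_experiments({
    "flow_example": {  # hypothetical experiment tag
        "run": alg_run,
        "env": gym_name,
        "config": config,
        "checkpoint_freq": 20,
        "stop": {"training_iteration": 200},
    },
})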
    def testNoStepOnInit(self):
        register_env("fail", lambda _: FailOnStepEnv())
        pg = PGAgent(env="fail", config={"num_workers": 1})
        self.assertRaises(Exception, lambda: pg.train())
Beispiel #50
0
    def __init__(self):
        super(FeedingPandaEnv,
              self).__init__(robot=Panda(robot_arm),
                             human=Human(human_controllable_joint_indices,
                                         controllable=False))


class FeedingPR2HumanEnv(FeedingEnv, MultiAgentEnv):
    def __init__(self):
        super(FeedingPR2HumanEnv,
              self).__init__(robot=PR2(robot_arm),
                             human=Human(human_controllable_joint_indices,
                                         controllable=True))


register_env('assistive_gym:FeedingPR2Human-v1',
             lambda config: FeedingPR2HumanEnv())


class FeedingBaxterHumanEnv(FeedingEnv, MultiAgentEnv):
    def __init__(self):
        super(FeedingBaxterHumanEnv,
              self).__init__(robot=Baxter(robot_arm),
                             human=Human(human_controllable_joint_indices,
                                         controllable=True))


register_env('assistive_gym:FeedingBaxterHuman-v1',
             lambda config: FeedingBaxterHumanEnv())


class FeedingSawyerHumanEnv(FeedingEnv, MultiAgentEnv):
Beispiel #51
0
"""Integration test: (1) pendulum works, (2) single-agent multi-agent works."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import ray
from ray.rllib.test.test_multi_agent_env import make_multiagent
from ray.tune import run_experiments
from ray.tune.registry import register_env

if __name__ == "__main__":
    ray.init()
    MultiPendulum = make_multiagent("Pendulum-v0")
    register_env("multi_pend", lambda _: MultiPendulum(1))
    trials = run_experiments({
        "test": {
            "run": "PPO",
            "env": "multi_pend",
            "stop": {
                "timesteps_total": 500000,
                "episode_reward_mean": -200,
            },
            "config": {
                "train_batch_size": 2048,
                "vf_clip_param": 10.0,
                "num_workers": 0,
                "num_envs_per_worker": 10,
                "lambda": 0.1,
                "gamma": 0.95,
                "lr": 0.0003,
def setup_exps_rllib(flow_params,
                     n_cpus,
                     n_rollouts,
                     policy_graphs=None,
                     policy_mapping_fn=None,
                     policies_to_train=None,
                     flags=None):
    from ray import tune
    from ray.tune.registry import register_env
    try:
        from ray.rllib.agents.agent import get_agent_class
    except ImportError:
        from ray.rllib.agents.registry import get_agent_class
    import torch
    horizon = flow_params['env'].horizon

    from ray.rllib.agents.ddpg.ddpg import DEFAULT_CONFIG
    alg_run = "DDPG"
    agent_cls = get_agent_class(alg_run)
    config = deepcopy(agent_cls._default_config)
    config["num_workers"] = 1
    # model
    config['n_step'] = 1
    config['actor_hiddens'] = [64, 64]
    config['actor_lr'] = 0.0001  # as in the DDPG paper
    config['critic_lr'] = 0.0001
    config['critic_hiddens'] = [64, 64]
    config['gamma'] = 0.99
    config['model']['fcnet_hiddens'] = [64, 64]
    config['lr'] = 1e-5
    # exploration
    config['exploration_config']['final_scale'] = 0.05
    config['exploration_config']['scale_timesteps'] = 1500000
    config['exploration_config']['ou_base_scale'] = 0.1
    config['exploration_config']['ou_theta'] = 0.15
    config['exploration_config']['ou_sigma'] = 0.2
    # optimization
    config['tau'] = 0.001
    config['l2_reg'] = 1e-6
    config['train_batch_size'] = 64
    config['learning_starts'] = 3000
    # evaluation
    # config['evaluation_interval'] = 5
    config['buffer_size'] = 300000  # 3e5
    config['timesteps_per_iteration'] = 3000
    config['prioritized_replay'] = False

    # common config
    config['framework'] = 'torch'
    config['callbacks'] = {
        "on_episode_end": None,
        "on_episode_start": None,
        "on_episode_step": None,
        "on_postprocess_traj": None,
        "on_sample_end": None,
        "on_train_result": None
    }
    # config["opt_type"]= "adam" for impala and APPO, default is SGD
    # TrainOneStep class call SGD -->execution_plan function can have policy update function
    print("cuda is available: ", torch.cuda.is_available())
    print('Beginning training.')
    print("==========================================")
    print("running algorithm: ", alg_run)  # "Framework: ", "torch"

    # save the flow params for replay
    flow_json = json.dumps(flow_params,
                           cls=FlowParamsEncoder,
                           sort_keys=True,
                           indent=4)
    config['env_config']['flow_params'] = flow_json
    config['env_config']['run'] = alg_run

    # multiagent configuration
    if policy_graphs is not None:
        print("policy_graphs", policy_graphs)
        config['multiagent'].update({'policies': policy_graphs})
    if policy_mapping_fn is not None:
        config['multiagent'].update(
            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
    if policies_to_train is not None:
        config['multiagent'].update({'policies_to_train': policies_to_train})

    create_env, gym_name = make_create_env(params=flow_params)

    # Register as rllib env
    register_env(gym_name, create_env)
    return alg_run, gym_name, config
    def register_env_creator(self):
        register_env("NetworkCompression-v1", create_environment)
Beispiel #54
0
            obs[i], rew[i], done[i], info[i] = [
                featurize(step_obs[0][i]),
                step_obs[1][i],
                step_obs[1][i] == -1 or step_obs[2],
                step_obs[3],
            ]

        done["__all__"] = step_obs[2]
        return obs, rew, done, info

    def reset(self):
        obs = self.env.reset()
        return {i: featurize(obs[i]) for i in self.agents_index}


register_env("pommer_team", lambda _: MultiAgent())

sys.setrecursionlimit(1000)


class PhasePPO(PPOAgent):
    def __init__(self, config=None, env=None, logger_creator=None):
        super(PhasePPO, self).__init__(config=config,
                                       env=env,
                                       logger_creator=logger_creator)
        self.train_phase = 0


def on_episode_end(info):
    env = info["env"]
    episode = info["episode"]
    episode.custom_metrics["train_phase"] = env.get_phase()
Beispiel #55
0
def train(args, parser=None):
    """Train an agent.

    :param args: argparse.Namespace: user-defined arguments

    """

    # Set logging level
    logging.basicConfig(level=args.log_level,
                        format='%(message)s')

    # Initialize mail bot
    if args.email_updates:
        args.mail_bot = CameleonEmailBot(email_sender=args.email_sender,
                                         email_receiver=args.email_receiver,
                                         email_server=args.email_server)

    # Initialize Ray - and try to prevent OOM
    #Spin up Ray only if it is not already running
    if args.init_ray:
        ray.init(object_store_memory = args.ray_obj_store_mem)

    # Set up environment
    env = gym.make(args.env_name)

    # Wrap environment
    env = wrap_env(env, args.wrappers)

    # Register environment with Ray
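    # NOTE: the lambda below closes over the single env instance built above and ignores `config`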
    register_env(args.env_name, lambda config: env)

    # Set model and config
    model, config = str2model(args.model_name, config=True)

    #Add to config for compute resources
    config['num_workers'] = args.num_workers
    config['num_gpus'] = args.num_gpus
    config['framework'] = args.framework
    config['seed'] = args.seed
    _determine_stopping_criteria(args)

    #Update config if one was passed
    if args.config:
        config = update_config(config, args.config)

    # Update outdir
    args.outdir_root = args.outdir
    args.outdir = "{}{}_{}_{}_rs{}_w{}_{}".format(args.outdir,
                                    args.model_name,
                                    args.framework,
                                    args.env_name,
                                    args.seed,
                                    args.num_workers,
                                    dt.datetime.now().strftime("%Y.%m.%d"))

    args.tune_dirname = "{}_{}_rs{}_w{}_{}".format(
                                    args.model_name,
                                    args.framework,
                                    args.seed,
                                    args.num_workers,
                                    dt.datetime.now().strftime("%Y.%m.%d"))


    # Set up agent
    agent = model(env=args.env_name,
                  config=config,
                  logger_creator=cameleon_logger_creator(args.outdir))

    # Change to pretrained model if needed
    if args.checkpoint_path:
        agent.restore(args.checkpoint_path)

    if args.tune:
        agent = args.model_name


    # Train the agent
    train_agent(agent,
                args,
                config,
                tune = args.tune)

    # Shutdown Ray (ensures fresh start for random seeds)
    ray.shutdown()

    # Send email update, if necessary
    if args.email_updates and not args.failure_message:
        args.mail_bot.send_email("train_finished", args)
Beispiel #56
0
def main(args):
    ray.init(redis_max_memory=int(ray.utils.get_system_memory() * 0.4),
             memory=int(ray.utils.get_system_memory() * 0.2),
             object_store_memory=int(ray.utils.get_system_memory() * 0.2),
             num_gpus=args.num_gpus,
             num_cpus=6,
             temp_dir=args.temp_dir)

    discrete_action_input = False

    if args.trainer == 'dqn':
        trainer = DQNTrainer
        discrete_action_input = True
    else:
        raise Exception('Unknown trainer: "{}"'.format(args.trainer))

    def env_creator(mpe_args):
        return MultiAgentParticleEnv(**mpe_args)

    register_env("mpe", env_creator)

    env = env_creator({
        "scenario_name": args.scenario,
        "discrete_action_input": discrete_action_input
    })

    def gen_policy(i):
        return (None, env.observation_space_dict[i], env.action_space_dict[i],
                {
                    "agent_id": i,
                    "use_local_critic": False,
                    "obs_space_dict": env.observation_space_dict,
                    "act_space_dict": env.action_space_dict,
                })

    policies = {
        "policy_%d" % i: gen_policy(i)
        for i in range(len(env.observation_space_dict))
    }
    policy_ids = list(policies.keys())

    def policy_mapping_fn(agent_id):
        return policy_ids[agent_id]

    exp_name = "{}{}".format(
        args.scenario.replace("_", "").replace("-", ""),
        "_{}".format(args.add_postfix) if args.add_postfix != "" else "")

    run_experiments(
        {
            exp_name: {
                "run": trainer,
                "env": "mpe",
                "stop": {
                    "episodes_total": args.num_episodes,
                },
                "checkpoint_freq": args.checkpoint_freq,
                "local_dir": args.local_dir,
                "restore": args.restore,
                "config": {
                    # === Log ===
                    "log_level": "ERROR",

                    # === Environment ===
                    "env_config": {
                        "scenario_name": args.scenario,
                        "discrete_action_input": discrete_action_input
                    },
                    "num_envs_per_worker": args.num_envs_per_worker,
                    "horizon": args.max_episode_len,

                    # === Policy Config ===
                    # --- Model ---
                    # "good_policy": args.good_policy,
                    # "adv_policy": args.adv_policy,
                    # "actor_hiddens": [args.num_units] * 2,
                    # "actor_hidden_activation": "relu",
                    # "critic_hiddens": [args.num_units] * 2,
                    # "critic_hidden_activation": "relu",
                    "n_step": args.n_step,
                    "gamma": args.gamma,

                    # --- Exploration ---
                    # "tau": 0.01,

                    # --- Replay buffer ---
                    "buffer_size":
                    args.replay_buffer,  # int(10000), # int(1e6)

                    # --- Optimization ---
                    # "actor_lr": args.lr,
                    # "critic_lr": args.lr,
                    "learning_starts":
                    args.train_batch_size * args.max_episode_len,
                    "sample_batch_size": args.sample_batch_size,
                    "train_batch_size": args.train_batch_size,
                    "batch_mode": "truncate_episodes",

                    # --- Parallelism ---
                    "num_workers": args.num_workers,
                    "num_gpus": args.num_gpus,
                    "num_gpus_per_worker": 0,

                    # === Multi-agent setting ===
                    "multiagent": {
                        "policies": policies,
                        "policy_mapping_fn":
                        ray.tune.function(policy_mapping_fn)
                    },
                },
            },
        },
        verbose=0,
        reuse_actors=False)  # reuse_actors=True - messes up the results
Beispiel #57
0
    def register_env_creator(self):
        register_env(
            "stacked_procgen_env",  # this name should differ from procgen_env_wrapper
            lambda config: gym.wrappers.FrameStack(ProcgenEnvWrapper(config), 4))
    # vehicles to be placed in the network at the start of a rollout (see
    # flow.core.params.VehicleParams)
    veh=vehicles,

    # parameters specifying the positioning of vehicles upon initialization/
    # reset (see flow.core.params.InitialConfig)
    initial=InitialConfig(),
)

# SET UP RLLIB MULTI-AGENT FEATURES

create_env, env_name = make_create_env(params=flow_params, version=0)

# register as rllib env
register_env(env_name, create_env)

# multiagent configuration
test_env = create_env()
obs_space = test_env.observation_space
act_space = test_env.action_space

POLICY_GRAPHS = {'av': (PPOTFPolicy, obs_space, act_space, {})}

POLICIES_TO_TRAIN = ['av']


def policy_mapping_fn(_):
    """Map a policy in RLlib."""
    return 'av'
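These module-level objects are presumably handed to a setup helper like the setup_exps_rllib shown earlier; a hedged sketch of that hand-off (CPU and rollout counts are placeholders):

# sketch only: pass the multi-agent pieces into the experiment setup
alg_run, gym_name, config = setup_exps_rllib(
    flow_params,
    n_cpus=1,
    n_rollouts=20,
    policy_graphs=POLICY_GRAPHS,
    policy_mapping_fn=policy_mapping_fn,
    policies_to_train=POLICIES_TO_TRAIN)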
Beispiel #59
0
import os

import ray
from ray.rllib.agents.dqn import DQNAgent
from ray.rllib.models import ModelCatalog
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env

from algos.gym_halite import env_creator
from algos.model import ParametricActionsModel

CHECKPOINT_FILE = "last_checkpoint.out"

ray.init(local_mode=True)
ModelCatalog.register_custom_model("parametric", ParametricActionsModel)
register_env("halite_env", env_creator)
dqn = DQNAgent(
    env="halite_env",
    config={
        "env_config": {},
        "num_workers": 1,
        "num_cpus_per_worker": 1,
        "num_envs_per_worker": 1,
        "num_gpus": 1,
        "hiddens": [],
        "schedule_max_timesteps": 100000000,
        "timesteps_per_iteration": 1000,
        "exploration_fraction": 0.8,
        "exploration_final_eps": 0.02,
        "lr": 1e-3,
        "model": {
Beispiel #60
0
        else:
            reward = -1
        done = len(self.history) > 100
        return self._next_obs(), reward, done, {}

    def _next_obs(self):
        token = random.choice([0, 1])
        self.history.append(token)
        return token


if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("rnn", MyKerasRNN)
    register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
    register_env("RepeatInitialEnv", lambda _: RepeatInitialEnv())
    tune.run(args.run,
             stop={"episode_reward_mean": args.stop},
             config={
                 "env": args.env,
                 "env_config": {
                     "repeat_delay": 2,
                 },
                 "gamma": 0.9,
                 "num_workers": 0,
                 "num_envs_per_worker": 20,
                 "entropy_coeff": 0.001,
                 "num_sgd_iter": 5,
                 "vf_loss_coeff": 1e-5,
                 "model": {