Example 1
    def doTestNestedTuple(self, make_env):
        ModelCatalog.register_custom_model("composite2", TupleSpyModel)
        register_env("nested2", make_env)
        pg = PGAgent(
            env="nested2",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite2",
                },
            })
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
Example 2
    def testPyTorchModel(self):
        ModelCatalog.register_custom_model("composite", TorchSpyModel)
        register_env("nested", lambda _: NestedDictEnv())
        a2c = A2CAgent(
            env="nested",
            config={
                "num_workers": 0,
                "use_pytorch": True,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite",
                },
            })

        a2c.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "torch_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
Example 3
    def doTestNestedDict(self, make_env, test_lstm=False):
        ModelCatalog.register_custom_model("composite", DictSpyModel)
        register_env("nested", make_env)
        pg = PGAgent(
            env="nested",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite",
                    "use_lstm": test_lstm,
                },
            })
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
Example 4
    def testInvalidModel(self):
        ModelCatalog.register_custom_model("invalid", InvalidModel)
        self.assertRaises(ValueError, lambda: PGAgent(
            env="CartPole-v0", config={
                "model": {
                    "custom_model": "invalid",
                },
            }))
Example 5
    def testCustomModel(self):
        ray.init()
        ModelCatalog.register_custom_model("foo", CustomModel)
        p1 = ModelCatalog.get_model({
            "obs": tf.constant([1, 2, 3])
        }, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5,
                                    {"custom_model": "foo"})
        self.assertEqual(str(type(p1)), str(CustomModel))
Example 6
    def testMinibatchSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": False,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
        self.assertEqual(batch0["sequences"].tolist(), [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
        ])
        self.assertEqual(batch1["sequences"].tolist(), [
            [[8], [9], [10], [11]],
            [[12], [13], [14], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
        self.assertEqual(batch2["sequences"].tolist(), [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
        ])
        self.assertEqual(batch3["sequences"].tolist(), [
            [[13], [14], [0], [0]],
            [[0], [1], [2], [3]],
        ])
Example 7
    def testInvalidModel2(self):
        ModelCatalog.register_custom_model("invalid2", InvalidModel2)
        self.assertRaisesRegexp(
            ValueError, "Expected output.*",
            lambda: PGAgent(
                env="CartPole-v0", config={
                    "model": {
                        "custom_model": "invalid2",
                    },
                }))
Example 8
    def testSimpleOptimizerSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 10,
                "train_batch_size": 10,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": True,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        self.assertEqual(
            batch0["sequences"].tolist(),
            [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]])
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
        self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
        self.assertTrue(
            np.allclose(batch0["state_in"][0].tolist()[1:],
                        batch0["state_out"][0].tolist()[:-1]))
        self.assertTrue(
            np.allclose(batch0["state_in"][1].tolist()[1:],
                        batch0["state_out"][1].tolist()[:-1]))

        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        self.assertEqual(batch1["sequences"].tolist(), [
            [[10], [11], [12], [13]],
            [[14], [0], [0], [0]],
            [[0], [1], [2], [3]],
            [[4], [0], [0], [0]],
        ])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1])
        self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
        self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
Example 9
    def testMultiAgentComplexSpaces(self):
        ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
        ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
        register_env("nested_ma", lambda _: NestedMultiAgentEnv())
        act_space = spaces.Discrete(2)
        pg = PGAgent(
            env="nested_ma",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "multiagent": {
                    "policy_graphs": {
                        "tuple_policy": (
                            PGPolicyGraph, TUPLE_SPACE, act_space,
                            {"model": {"custom_model": "tuple_spy"}}),
                        "dict_policy": (
                            PGPolicyGraph, DICT_SPACE, act_space,
                            {"model": {"custom_model": "dict_spy"}}),
                    },
                    "policy_mapping_fn": lambda a: {
                        "tuple_agent": "tuple_policy",
                        "dict_agent": "dict_policy"}[a],
                },
            })
        pg.train()

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
Example 10
def get_config(args: Args):
    # num_rollouts = 2
    ModelCatalog.register_custom_model("SoftModularActorCriticNet", SoftModularActorCriticNet)
    ModelCatalog.register_custom_model("SimpleEnsembleActorCriticNet", SimpleEnsembleActorCriticNet)
    # 1. Gets default training configuration and specifies the POMgame to load.
    config = deepcopy(get_agent_class(args.alg_name)._default_config)

    # 2. Set environment config. This will be passed to
    # the env_creator function via the register env lambda below.
    # local_ratio specifies the ratio between the global reward and the local reward
    # config["env_config"] = {"local_ratio": 0.5}
    def env_creator():
        if args.game.__package__.endswith('atari'):
            if (args.game_name.startswith('foozpong') or
                args.game_name.startswith('basketball_pong') or
                args.game_name.startswith('volleyball_pong')
                ):
                env = args.game.env(obs_type=args.atari_obs_type,
                                    max_cycles=args.max_steps['atari'],
                                    full_action_space=False,
                                    num_players=2)
            else:
                env = args.game.env(obs_type=args.atari_obs_type,
                                    full_action_space=False,
                                    max_cycles=args.max_steps['atari'])
            env = frame_skip_v0(env, args.atari_frame_skip_num)
            env = frame_stack_v1(env, args.atari_frame_stack_num)

        else:
            env = args.game.env()
        if args.game_name.startswith('rps'):
            env = one_hot_obs_wrapper(env)
        env = dtype_v0(env, dtype=float32)
        env = pad_observations_v0(env)
        env = pad_action_space_v0(env)
        if args.game_name.startswith('connect_four') or args.game_name.startswith('tictactoe'):
            env = FlattenEnvWrapper(env)
        GAUSSIAN_STD = 1.0
        assert abs(GAUSSIAN_STD - 1.0) < 1e-5, "must be 1.0, otherwise simple ensemble implementation is wrong"
        env = LatentGaussianAugmentedEnvWrapper(env,
                                                latent_parameter_dim=args.latent_para_dim,
                                                gaussian_std=1.0,
                                                use_dict_obs_space=args.use_dict_obs_space)
        return env

    # 3. Register env, and get trainer_class
    register_env(args.game_name,
                 lambda config: PettingZooEnv(env_creator()))
    trainer_class = get_agent_class(args.alg_name)

    # 4. Extract space dimensions
    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    agents_id = test_env.agents
    print(f"obs_space: {obs_space}; act_space: {act_space}")

    # 5. Configuration for multiagent setup:
    config["framework"] = "torch"
    config["num_gpus"] = 0
    config["log_level"] = "INFO"
    config["num_workers"] = args.num_cpus // 2
    config["num_cpus_per_worker"] = 1
    config['num_envs_per_worker'] = 5
    # Fragment length, collected at once from each worker and for each agent!
    config["rollout_fragment_length"] = 100
    # Training batch size -> Fragments are concatenated up to this point.
    config["train_batch_size"] = 2000
    config["sgd_minibatch_size"] = 256
    config["entropy_coeff"] = 0.01
    config["lambda"] = 0.9
    config["vf_clip_param"] = 50
    config["num_sgd_iter"] = 10
    # After n steps, force reset simulation
    config["horizon"] = args.max_steps[args.game_type]
    # Default: False
    config["no_done_at_end"] = False
    # Info: If False, each agent's trajectory is expected to have at most
    # one done=True, in the last step of the trajectory.
    # If no_done_at_end = True, the environment is not reset
    # when dones[__all__] = True.
    config['ignore_worker_failures'] = True

    def get_main_and_test_config(config: Dict[str, Any]) -> Tuple[Dict[str, Any],
                                                                Dict[str, Any]]:

        main_policies = {}
        for i, agent_id in enumerate(agents_id):
            for j in range(1):
                main_policies[f'{agent_id}_{j}'] = (PPOTorchPolicy,
                                                    obs_space,
                                                    act_space,
                                                    {"framework": "torch"})
        test_policies = {
                'test_' + agent_id: (PPOTorchPolicy, obs_space, act_space, {"framework": "torch"})
                for agent_id in agents_id if is_adversary(agent_id)
                        }
        policies = {**main_policies, **test_policies}

        main_config, test_config = deepcopy(config), deepcopy(config)

        main_config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: f'{agent_id}_{0}',
            "policies_to_train": list(main_policies.keys())
        }

        def test_config_policy_mapping(agent_id: str) -> str:
            if is_adversary(agent_id):
                return 'test_' + agent_id
            return f'{agent_id}_{0}'

        test_config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": test_config_policy_mapping,
            "policies_to_train": list(test_policies.keys())
        }
        return main_config, test_config

    def get_simple_ensemble_training_config(config: Dict[str, Any], ensemble_size: int=3) -> Tuple[Dict[str, Any],
                                                                             Dict[str, Any]]:
        if ensemble_size > 1:
            config["model"] = {
                    "custom_model": "SimpleEnsembleActorCriticNet",
                    "custom_model_config": {
                                            "use_dict_obs_space": args.use_dict_obs_space,
                                            'ensemble_size': ensemble_size
                                            }
                            }
        main_config, test_config = get_main_and_test_config(config)
        return main_config, test_config

    def get_implicit_ensemble_training_config(config: Dict[str, Any]) -> Tuple[Dict[str, Any],
                                                                               Dict[str, Any]]:
        config["model"] = {
                "custom_model": "SoftModularActorCriticNet",
                "custom_model_config": {
                                        "use_latent_embedding": args.use_latent_embedding,
                                        "use_dict_obs_space": args.use_dict_obs_space,
                                        "base_type": MLPBase,
                                        "em_input_shape": args.latent_para_dim,
                                        "emb_shaping_net_hidden_shapes": args.emb_shaping_net_hidden_shapes,
                                        'emb_shaping_net_last_softmax': args.emb_shaping_net_last_softmax,
                                        'em_hidden_shapes': [args.soft_modular_net_hidden_dim,
                                                             args.soft_modular_net_hidden_dim], #[400],
                                        'hidden_shapes': [args.soft_modular_net_hidden_dim,
                                                          args.soft_modular_net_hidden_dim], #[400, 400],
                                        'num_layers': args.soft_modular_net_num_layers, #4,
                                        'num_modules': args.soft_modular_net_num_modules, #4,
                                        'module_hidden': args.soft_modular_net_hidden_dim, #128,
                                        'gating_hidden': args.soft_modular_net_hidden_dim, #256,
                                        'num_gating_layers': 2,  #with 1 gating layer, 500 step works for simple_spread
                                        'add_bn': False,
                                        }
                        }
        main_config, test_config = get_main_and_test_config(config)
        return main_config, test_config

    if args.train_setting == 'single_policy':
        main_config, test_config = get_simple_ensemble_training_config(config, ensemble_size=1)
    elif args.train_setting == 'simple_ensemble':
        main_config, test_config = get_simple_ensemble_training_config(config, ensemble_size=3)
    else:
        assert args.train_setting == 'implicit_ensemble'
        main_config, test_config = get_implicit_ensemble_training_config(config)

    return trainer_class, test_env, main_config, test_config
Example 11
        self.base_model = tf.keras.Model(self.inputs, layer_out)

    # Implement the core forward method.
    def forward(self, input_dict, state, seq_lens):
        model_out = self.base_model(input_dict["obs"])
        return model_out, state

    def metrics(self):
        return {"foo": tf.constant(42.0)}


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    ModelCatalog.register_custom_model(
        "keras_model",
        MyVisionNetwork if args.use_vision_network else MyKerasModel)
    ModelCatalog.register_custom_model(
        "keras_q_model",
        MyVisionNetwork if args.use_vision_network else MyKerasQModel)

    # Tests https://github.com/ray-project/ray/issues/7293
    def check_has_custom_metric(result):
        r = result["result"]["info"]["learner"]
        if DEFAULT_POLICY_ID in r:
            r = r[DEFAULT_POLICY_ID].get(LEARNER_STATS_KEY,
                                         r[DEFAULT_POLICY_ID])
        assert r["model"]["foo"] == 42, result

    if args.run == "DQN":
        extra_config = {"learning_starts": 0}
Example 12
    ray.init(local_mode=args.debug)
    if args.debug:
        tune_config = {
            'log_level': 'DEBUG',
            'num_workers': 1,
        }
    else:
        tune_config = {
            'num_workers': 1,
            'num_gpus': 1,
        }

    env_config = {'map_filename': args.map}
    if args.use_cnn:
        env_cls = SquarePycroRts3MultiAgentEnv
        ModelCatalog.register_custom_model('masked_actions_model', MaskedActionsCNN)
        model_config = {
            'custom_model': 'masked_actions_model',
            'conv_filters': [[16, [2, 2], 1], [32, [2, 2], 1], [64, [3, 3], 2]],
            'conv_activation': 'leaky_relu',
            'fcnet_hiddens': [128, 128],
            'fcnet_activation': 'leaky_relu',
        }
    else:
        env_cls = PycroRts3MultiAgentEnv
        ModelCatalog.register_custom_model('masked_actions_model', MaskedActionsMLP)
        model_config = {
            'custom_model': 'masked_actions_model',
            'fcnet_hiddens': [128, 128],
            'fcnet_activation': 'leaky_relu',
        }
Example 13
        lstm_out = self.lstm(
            x, [torch.unsqueeze(state[0], 0),
                torch.unsqueeze(state[1], 0)])
        action_out = self.action_branch(lstm_out[0])
        self._cur_value = torch.reshape(self.value_branch(lstm_out[0]), [-1])
        return action_out, [
            torch.squeeze(lstm_out[1][0], 0),
            torch.squeeze(lstm_out[1][1], 0)
        ]


if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(num_cpus=args.num_cpus or None)
    ModelCatalog.register_custom_model("rnn", RNNModel)
    tune.register_env("repeat_initial",
                      lambda _: RepeatInitialEnv(episode_len=100))
    tune.register_env("repeat_after_me",
                      lambda _: RepeatAfterMeEnv({"repeat_delay": 1}))
    tune.register_env("cartpole_stateless", lambda _: CartPoleStatelessEnv())

    config = {
        "env": args.env,
        "use_pytorch": True,
        "num_workers": 0,
        "num_envs_per_worker": 20,
        "gamma": 0.9,
        "entropy_coeff": 0.0001,
        "model": {
            "custom_model": "rnn",
Example 14
    if config["framework"] == "torch":
        return CCPPOTorchPolicy


CCTrainer = PPOTrainer.with_updates(
    name="CCPPOTrainer",
    default_policy=CCPPOTFPolicy,
    get_policy_class=get_policy_class,
)

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()

    ModelCatalog.register_custom_model(
        "cc_model", TorchCentralizedCriticModel
        if args.framework == "torch" else CentralizedCriticModel)

    config = {
        "env": TwoStepGame,
        "batch_mode": "complete_episodes",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "multiagent": {
            "policies": {
                "pol1": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": args.framework,
                }),
                "pol2": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": args.framework,
Example 15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval-num", type=int, default=5)
    parser.add_argument("--eval-every", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=1)
    parser.add_argument("--cpus-per-worker", type=float, default=0.5)
    parser.add_argument("--cpus-for-driver", type=float, default=0.5)
    parser.add_argument("--address", type=str, default=None)
    parser.add_argument(
        "--model-path",
        type=str,
        default="/home/jippo/ray_results/YanivTrainer_2021-05-02_16-44-14/YanivTrainer_yaniv_3ee8a_00000_0_2021-05-02_16-44-14/models",
    )
    parser.add_argument("--opponent", type=str, default="intermediate")
    args = parser.parse_args()

    register_env("yaniv", lambda config: YanivEnv(config))
    ModelCatalog.register_custom_model("yaniv_mask", YanivActionMaskModel)

    if args.opponent == "intermediate":
        stepfn = intermediate_rule_step
    elif args.opponent == "novice":
        stepfn = novice_rule_step
    else:
        raise ValueError("opponent not defined: {}".format(args.opponent))

    env_config = {
        "end_after_n_deck_replacements": 0,
        "end_after_n_steps": 130,
        "early_end_reward": 0,
        "use_scaled_negative_reward": True,
        "use_scaled_positive_reward": True,
        "max_negative_reward": -1,
        "negative_score_cutoff": 30,
        "single_step": False,
        "step_reward": 0,
        "use_unkown_cards_in_state": False,
        "use_dead_cards_in_state": True,
        "observation_scheme": 1,
        "n_players": 2,
        "state_n_players": 2,
        "player_step_fn": {"player_1": stepfn},
    }

    env = YanivEnv(env_config)
    obs_space = env.observation_space
    act_space = env.action_space

    config = {
        "callbacks": YanivCallbacks,
        "num_gpus": 1,
        "env": "yaniv",
        "env_config": env_config,
        "framework": "torch",
        "multiagent": {
            "policies": {
                "policy_1": (None, obs_space, act_space, {}),
            },
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["policy_1"],
        },
        "model": {
            "custom_model": "yaniv_mask",
            "fcnet_hiddens": [512, 512],
        },
        "num_envs_per_worker": 1,
        "num_cpus_per_worker": args.cpus_per_worker,
        "num_cpus_for_driver": args.cpus_for_driver,
        "num_workers": 1,
        "evaluation_num_workers": args.num_workers,
        "evaluation_num_episodes": args.eval_num,
        "evaluation_interval": 1,
    }

    ray.init(include_dashboard=False, address=args.address)
    trainer = A3CTrainer(env="yaniv", config=config)

    # models_path = "/home/jippo/ray_results/YanivTrainer_2021-05-02_16-44-14/YanivTrainer_yaniv_3ee8a_00000_0_2021-05-02_16-44-14/models"
    # models_path = "/scratch/student/models"
    models_path = args.model_path
    models = os.listdir(models_path)

    results = []

    for model in tqdm(sorted(models)):
        if not model.startswith("model"):
            print("idk", model)
            continue

        model_num = int(model[6:-4])

        if model_num % args.eval_every != 0:
            continue

        path = os.path.join(models_path, model)
        with open(path, "rb") as f:
            policy = pickle.load(f)

        trainer.get_policy("policy_1").set_state(policy)
        metrics = trainer._evaluate()
        metrics["evaluation"].pop("hist_stats")

        stats = {
            k: v
            for k, v in metrics["evaluation"]["custom_metrics"].items()
            if k.endswith("mean")
        }
        stats["model_number"] = model_num
        tqdm.write(
            "model: {: <6}: win_mean: {}, episodes: {}".format(
                model_num,
                stats["player_0_win_mean"],
                metrics["evaluation"]["episodes_this_iter"],
            )
        )
        results.append(stats)

    with open("{}_vs_models_{}.json".format(args.opponent, args.eval_num), "w") as f:
        json.dump(results, f, indent=4)
Example 16
class CartpoleModel(Model):
    def _build_layers_v2(self, input_dict, num_outputs, options):
        self.model = Sequential()
        self.model.add(layers.InputLayer(input_tensor=input_dict["obs"], input_shape=(4,)))
        self.model.add(layers.Dense(4, name='l1', activation='relu'))
        self.model.add(layers.Dense(10, name='l2', activation='relu'))
        self.model.add(layers.Dense(10, name='l3', activation='relu'))
        self.model.add(layers.Dense(10, name='l4', activation='relu'))
        self.model.add(layers.Dense(2, name='l5', activation='relu'))
        return self.model.get_layer("l5").output, self.model.get_layer("l4").output



ray.init()
ModelCatalog.register_custom_model("CartpoleModel", CartpoleModel)
CartpoleEnv = gym.make('CartPole-v0')
CartpoleEnv = ScaleReward(CartpoleEnv)
register_env("CP", lambda _: CartpoleEnv)

trainer = a3c.A3CTrainer(env="CP", config={
    # "model": {"custom_model": "CartpoleModel"},
    # "observation_filter": "MeanStdFilter",
    # "vf_share_layers": True,
}, logger_creator=lambda _: ray.tune.logger.NoopLogger({}, None))

if os.path.isfile('weights.pickle'):
    weights = pickle.load(open("weights.pickle", "rb"))
    trainer.restore_from_object(weights)
Example 17
        return self.value_module(model_out)

    # NOTE: customs
    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """ return action logits/scores # return embedding value
        NOTE: only output embedded output to fit the "compute_q_values" func signature
        from https://github.com/ray-project/ray/blob/master/rllib/agents/dqn/dqn_torch_policy.py
        """
        x, state = self.get_embeddings(input_dict, state, seq_lens)
        # logits = self.get_advantages_or_q_values(x)[0]
        return x, state

    def get_embeddings(self, input_dict, state, seq_lens, permute=True):
        """ encode observations 
        """
        x = input_dict["obs"].float()
        if permute:
            x = x.permute(0, 3, 1, 2)  # NHWC => NCHW
        x = self.encoder(x)
        return x, state


#######################################################################################################
#####################################   Misc   #####################################################
#######################################################################################################

# Register model in ModelCatalog
ModelCatalog.register_custom_model("baseline_rainbow",
                                   BaselineRainbowTorchModel)
Example 18
                    initializer=normc_initializer(1.0),
                    activation_fn=activation)

    def forward(self, input_dict: Dict[str, TensorType],
                state: List[TensorType],
                seq_lens: TensorType) -> (TensorType, List[TensorType]):

        obs = input_dict['obs']
        data, privates = obs['data'], obs['privates']
        b = privates.shape[0]
        N = data.shape[1]
        T = data.shape[2]
        # lstm
        # x1 = (td - torch.min(td)) / (torch.max(td) - torch.min(td)) normalize
        lstm_in = data.permute(0, 2, 1, 3).contiguous().view(b, T, -1)
        lstm_out = self.lstm_net(lstm_in)

        # cnn

        x = torch.cat([privates, lstm_out], dim=1)
        self._features = self._hidden_layers(x)
        logits = self._policy_net.forward(self._features)
        return logits, state

    def value_function(self):
        assert self._features is not None
        return self._value_net.forward(self._features).squeeze(-1)


ModelCatalog.register_custom_model("mlstm_net", MLSTM_NET)
        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
        action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability)
        inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
        masked_logits = inf_mask + action_logits

        return masked_logits, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))
    if args.run == "PPO":
        cfg = {
            "observation_filter": "NoFilter",  # don't filter the action list
            "vf_share_layers": True,  # don't create duplicate value model
        }
    elif args.run == "DQN":
        cfg = {
            "hiddens": [],  # important: don't postprocess the action scores
        }
    else:
        cfg = {}  # PG, IMPALA, A2C, etc.
    run_experiments({
        "parametric_cartpole": {
            "run": args.run,
Example 20
            last_layer = slim.fully_connected(
                input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1")
        last_layer = slim.fully_connected(
            last_layer, 64, activation_fn=tf.nn.relu, scope="fc2")
        output = slim.fully_connected(
            last_layer, num_outputs, activation_fn=None, scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (PPOPolicyGraph, obs_space, act_space, config)
Example 21
            last_layer = tf.layers.batch_normalization(
                last_layer, training=input_dict["is_training"])
        output = slim.fully_connected(
            last_layer,
            num_outputs,
            weights_initializer=normc_initializer(0.01),
            activation_fn=None,
            scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("bn_model", BatchNormModel)
    run_experiments({
        "batch_norm_demo": {
            "run": args.run,
            "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
            "stop": {
                "training_iteration": args.num_iters
            },
            "config": {
                "model": {
                    "custom_model": "bn_model",
                },
                "num_workers": 0,
            },
        },
    })
Example 22
    """

    def _build_layers_v2(self, input_dict, num_outputs, options):
        print(input_dict)
        self.obs_in = input_dict["obs"]
        self.fcnet = FullyConnectedNetwork(
            input_dict, self.obs_space, self.action_space, num_outputs, options
        )
        return self.fcnet.outputs, self.fcnet.last_layer


if __name__ == "__main__":
    print("THIS EXPERIMENT HAS NOT BEEN FULLY TESTED")
    kill_server()
    ray.init()
    ModelCatalog.register_custom_model("my_model", CustomModel)
    tune.run(
        "PPO",
        stop={"timesteps_total": 1000000},
        checkpoint_freq=1,
        config={
            "env": CarlaEnv,  # CarlaEnv, SimpleCorridor, or "corridor" if registered above
            "model": {"custom_model": "my_model"},
            "lr": grid_search([1e-2, 1e-4, 1e-6]),  # try different lrs
            "num_workers": 4,  # parallelism
            "num_gpus_per_worker": 0.2,
            "env_config": env_config,
        },
        resume=False,
    )
Example 23
import ray
from gym.wrappers import TimeLimit
from ray import tune
from ray.rllib.models import ModelCatalog
from ray.tune import register_env

from custom_envs.corridor_env import CorridorEnv
from custom_models.corridor_net import CorridorNet

if __name__ == '__main__':
    register_env('CorridorEnv', lambda env_config: TimeLimit(CorridorEnv(env_config['length']),
                                                             max_episode_steps=env_config['length']))
    ModelCatalog.register_custom_model('CorridorNet', CorridorNet)

    ray.init(local_mode=True)

    tune.run(
        'PPO',
        stop={'episode_reward_mean': 0.9},
        config={
            'env': 'CorridorEnv',
            'env_config': {
                'length': tune.grid_search([5, 10, 50]),
            },

            'model': {
                'custom_model': 'CorridorNet',
                'custom_options': {},
            },

            'use_pytorch': True,
Example 24
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.models import ModelCatalog

from training.hierarchical_learning.bomberman_arena_multi_env import BombermanArenaEnv
from training.hierarchical_learning.hierarchical_bomberman_multi_env import *
from ray import tune
from training.hierarchical_learning.arena_callback import MyCallbacks
from training.train_with_action_masking_2.tfnet_with_masking import ComplexInputNetwork


if __name__ == '__main__':
    ray.init(
        _redis_max_memory=1024 * 1024 * 100, num_gpus=1, object_store_memory=10 * 2**30)
    env = HierarchicalBombermanMultiEnv([f'agent_{i}_high' for i in range(4)])

    ModelCatalog.register_custom_model("custom_model", ComplexInputNetwork)
    tune.register_env('BomberMan-v0', lambda c: BombermanArenaEnv([f'agent_{i}' for i in range(4)]))


    def policy_mapping_fn(agent_id):
        if agent_id.startswith("agent_0"):
            return "policy_kill"
        else:
            return "policy_kill_opp"

    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        #trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-22_10-57-05mz9533ge\\checkpoint_000140\\checkpoint-140')
        iter = 0

        #def update_phase(ev):
Example 25
                              stride,
                              activation_fn=activation,
                              padding="VALID",
                              scope="fc1")
            fc2 = slim.conv2d(fc1,
                              num_outputs, [1, 1],
                              activation_fn=None,
                              normalizer_fn=None,
                              scope="fc2")
            print(fc1, fc2)
            print(flatten(fc1), flatten(fc2))
            # exit(123)
            return flatten(fc2), flatten(fc1)


ModelCatalog.register_custom_model("my_model", MyModelClass)

model = {
    "use_lstm":
    True,
    "conv_activation":
    "elu",
    "custom_model":
    "my_model",
    "dim":
    42,
    "grayscale":
    True,
    "zero_mean":
    False,
    # Reduced channel depth and kernel size from default
Example 26
                                           [a1_logits, a2_logits])
        self.action_model.summary()
        self.register_variables(self.action_model.variables)

    def forward(self, input_dict, state, seq_lens):
        context, self._value_out = self.base_model(input_dict["obs"])
        return context, state

    def value_function(self):
        return tf.reshape(self._value_out, [-1])


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    ModelCatalog.register_custom_model("autoregressive_model",
                                       AutoregressiveActionsModel)
    ModelCatalog.register_custom_action_dist("binary_autoreg_output",
                                             BinaryAutoregressiveOutput)
    tune.run(args.run,
             stop={"episode_reward_mean": args.stop},
             config={
                 "env": CorrelatedActionsEnv,
                 "gamma": 0.5,
                 "num_gpus": 0,
                 "model": {
                     "custom_model": "autoregressive_model",
                     "custom_action_dist": "binary_autoreg_output",
                 },
             })
Example 27
def persuasive_a3c_conf(rollout_size=10,
                        agents=100,
                        debug_folder=None,
                        eval_folder=None,
                        alpha=0.0001,
                        gamma=0.99):
    """
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/trainer.py#L44
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/a3c/a3c.py#L14
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/models/catalog.py#L37
    """

    ModelCatalog.register_custom_model('custom_rrn', RNNModel)
    ModelCatalog.register_custom_action_dist(
        "custom_action_distribution", PersuasiveActionDistribution)

    custom_configuration = DEFAULT_CONFIG

    custom_configuration['collect_metrics_timeout'] = 86400 # a day
    custom_configuration['framework'] = 'tf'
    custom_configuration['ignore_worker_failures'] = True
    custom_configuration['log_level'] = 'WARN'
    custom_configuration['monitor'] = True
    custom_configuration['num_cpus_for_driver'] = 1
    custom_configuration['num_cpus_per_worker'] = 1
    custom_configuration['num_envs_per_worker'] = 1
    custom_configuration['num_gpus_per_worker'] = 1
    custom_configuration['num_gpus'] = 1
    custom_configuration['num_workers'] = 1
    custom_configuration['output'] = debug_folder
    custom_configuration['remote_env_batch_wait_ms'] = 1000
    custom_configuration['remote_worker_envs'] = False
    custom_configuration['seed'] = 42
    custom_configuration['timesteps_per_iteration'] = 1

    # === Environment Settings ===
    custom_configuration['batch_mode'] = 'complete_episodes'
    custom_configuration['gamma'] = gamma
    custom_configuration['lr'] = alpha
    custom_configuration['no_done_at_end'] = False
    # Divide episodes into fragments of this many steps each during rollouts.
    # Sample batches of this size are collected from rollout workers and
    # combined into a larger batch of `train_batch_size` for learning.
    # For example, given rollout_fragment_length=100 and train_batch_size=1000:
    #   1. RLlib collects 10 fragments of 100 steps each from rollout workers.
    #   2. These fragments are concatenated and we perform an epoch of SGD.
    # When using multiple envs per worker, the fragment size is multiplied by
    # `num_envs_per_worker`. This is since we are collecting steps from
    # multiple envs in parallel. For example, if num_envs_per_worker=5, then
    # rollout workers will return experiences in chunks of 5*100 = 500 steps.
    # The dataflow here can vary per algorithm. For example, PPO further
    # divides the train batch into minibatches for multi-epoch SGD.
    custom_configuration['rollout_fragment_length'] = rollout_size
    # Training batch size, if applicable. Should be >= rollout_fragment_length.
    # Samples batches will be concatenated together to a batch of this size,
    # which is then passed to SGD.
    custom_configuration['train_batch_size'] = rollout_size * agents
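    # Worked example with the default arguments of this function (illustrative):
    # with rollout_size=10 and agents=100, fragments of 10 steps are collected
    # from the rollout workers and concatenated into training batches of
    # 10 * 100 = 1000 steps before each update.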

    # === Exploration Settings ===
    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/stochastic_sampling.py
    # custom_configuration['exploration_config']['type'] = 'StochasticSampling'

    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/epsilon_greedy.py
    custom_configuration['exploration_config']['type'] = 'EpsilonGreedy'
    custom_configuration['exploration_config']['initial_epsilon'] = 1.0
    custom_configuration['exploration_config']['final_epsilon'] = 0.0001

    # ==================== MODEL - DEFAULT ====================
    # custom_configuration['model']['fcnet_hiddens'] = [64, 64]
    # === Built-in options ===
    # Filter config. List of [out_channels, kernel, stride] for each filter
    # custom_configuration['model']['conv_filters'] = None
    # Nonlinearity for built-in convnet
    # custom_configuration['model']['conv_activation'] = "relu"
    # Nonlinearity for fully connected net (tanh, relu)
    # custom_configuration['model']['fcnet_activation'] = "tanh"
    # Number of hidden layers for fully connected net
    # custom_configuration['model']['fcnet_hiddens'] = [64, 64]
    # For DiagGaussian action distributions, make the second half of the model
    # outputs floating bias variables instead of state-dependent. This only
    # has an effect if using the default fully connected net.
    # custom_configuration['model']['free_log_std'] = False
    # Whether to skip the final linear layer used to resize the hidden layer
    # outputs to size `num_outputs`. If True, then the last hidden layer
    # should already match num_outputs.
    # custom_configuration['model']['no_final_linear'] = False
    # Whether layers should be shared for the value function.
    # custom_configuration['model']['vf_share_layers'] = True

    # == LSTM ==
    # Whether to wrap the model with an LSTM.
    # custom_configuration['model']['use_lstm'] = False
    # Max seq len for training the LSTM, defaults to 20.
    # custom_configuration['model']['max_seq_len'] = 20
    # Size of the LSTM cell.
    # custom_configuration['model']['lstm_cell_size'] = 64
    # Whether to feed a_{t-1}, r_{t-1} to LSTM.
    # custom_configuration['model']['lstm_use_prev_action_reward'] = False
    # When using modelv1 models with a modelv2 algorithm, you may have to
    # define the state shape here (e.g., [256, 256]).
    # custom_configuration['model']['state_shape'] = None # [64, 64]

    # == Atari ==
    # Whether to enable framestack for Atari envs
    # custom_configuration['model']['framestack'] = True
    # Final resized frame dimension
    # custom_configuration['model']['dim'] = 84
    # (deprecated) Converts ATARI frame to 1 Channel Grayscale image
    # custom_configuration['model']['grayscale'] = False
    # (deprecated) Changes frame to range from [-1, 1] if true
    # custom_configuration['model']['zero_mean'] = True

    # === Options for custom models ===
    # Name of a custom model to use
    custom_configuration['model']['custom_model'] = 'custom_rrn'
    # Extra options to pass to the custom classes.
    # These will be available in the Model's
    custom_configuration['model']['custom_model_config'] = {}
    # Name of a custom action distribution to use.
    # See: https://docs.ray.io/en/releases-1.0.0/rllib-models.html#custom-action-distributions
    custom_configuration['model']['custom_action_dist'] = 'custom_action_distribution'

    # == OPTIMIZER ==
    # Arguments to pass to the policy optimizer. These vary by optimizer.
    # custom_configuration['optimizer'] = {}

    # == Persuasive A3C ==
    custom_configuration['callbacks'] = PersuasiveCallbacks
    custom_configuration['min_iter_time_s'] = 5

    custom_configuration['use_gae'] = True

    # === Evaluation Settings ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    custom_configuration['evaluation_interval'] = 5

    # Number of episodes to run per evaluation period. If using multiple
    # evaluation workers, we will run at least this many episodes total.
    custom_configuration['evaluation_num_episodes'] = 5

    # Internal flag that is set to True for evaluation workers.
    # DEFAULT: 'in_evaluation': False,

    # Typical usage is to pass extra args to evaluation env creator
    # and to disable exploration by computing deterministic actions.
    # IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
    # policy, even if this is a stochastic one. Setting 'explore=False' here
    # will result in the evaluation workers not using this optimal policy!
    custom_configuration['evaluation_config']['explore'] = False
    custom_configuration['evaluation_config']['lr'] = 0
    custom_configuration['evaluation_config']['num_gpus_per_worker'] = 0
    custom_configuration['evaluation_config']['num_gpus'] = 0
    custom_configuration['evaluation_config']['output'] = eval_folder
    # custom_configuration['evaluation_config']['env_config'] = {...},

    # Number of parallel workers to use for evaluation. Note that this is set
    # to zero by default, which means evaluation will be run in the trainer
    # process. If you increase this, it will increase the Ray resource usage
    # of the trainer since evaluation workers are created separately from
    # rollout workers.
    custom_configuration['evaluation_num_workers'] = 1

    # Customize the evaluation method. This must be a function of signature
    # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the
    # Trainer._evaluate() method to see the default implementation. The
    # trainer guarantees all eval workers have the latest policy state before
    # this function is called.
    custom_configuration['custom_eval_function'] = None #custom_eval_function
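    # A minimal sketch of such a function (hypothetical; unused here because
    # `custom_eval_function` is left as None):
    #
    #   def custom_eval_function(trainer, eval_workers):
    #       # ...sample evaluation episodes via eval_workers here...
    #       return {"my_eval_metric": 0.0}  # reported under the "evaluation" key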

    return custom_configuration
Example 28
                             Tout=tf.float32)

    return penalty - tf.reduce_mean(action_dist.logp(actions) * rewards)


# <class 'ray.rllib.policy.tf_policy_template.MyTFPolicy'>
MyTFPolicy = build_tf_policy(
    name="MyTFPolicy",
    loss_fn=policy_gradient_loss,
)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTFPolicy,
)

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("eager_model", EagerModel)
    tune.run(MyTrainer,
             stop={"training_iteration": args.iters},
             config={
                 "env": "CartPole-v0",
                 "num_workers": 0,
                 "model": {
                     "custom_model": "eager_model"
                 },
             })
Example 29
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-reward", type=float, default=90)
parser.add_argument("--stop-iters", type=int, default=100)
parser.add_argument("--stop-timesteps", type=int, default=100000)

if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(num_cpus=args.num_cpus or None)

    ModelCatalog.register_custom_model(
        "rnn", TorchRNNModel if args.torch else RNNModel)
    register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
    register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv())

    config = {
        "env": args.env,
        "env_config": {
            "repeat_delay": 2,
        },
        "gamma": 0.9,
        "num_workers": 0,
        "num_envs_per_worker": 20,
        "entropy_coeff": 0.001,
        "num_sgd_iter": 5,
        "vf_loss_coeff": 1e-5,
        "model": {
Example 30
            reward = 1
        else:
            reward = -1
        done = len(self.history) > 100
        return self._next_obs(), reward, done, {}

    def _next_obs(self):
        token = random.choice([0, 1])
        self.history.append(token)
        return token


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    ModelCatalog.register_custom_model("rnn", MyKerasRNN)
    register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
    register_env("RepeatInitialEnv", lambda _: RepeatInitialEnv())

    config = {
        "env": args.env,
        "env_config": {
            "repeat_delay": 2,
        },
        "gamma": 0.9,
        "num_workers": 0,
        "num_envs_per_worker": 20,
        "entropy_coeff": 0.001,
        "num_sgd_iter": 5,
        "vf_loss_coeff": 1e-5,
        "model": {
Example 31
    multi_hunter_trainer = PPOTrainer(MultiHunterEnv, config)
    for _ in range(100):
        environment.simulate()
        result = multi_hunter_trainer.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = multi_hunter_trainer.save()
    multi_hunter_trainer.stop()


if __name__ == '__main__':
    training = True
    ray.init()

    ModelCatalog.register_custom_model("DQNModel", DQNModel_Hunter)
    config_hunter = {
        "num_gpus": 0,
        "num_workers": 1,
        "framework": "torch",
        "lr": 4e-3,
        # "lr": tune.grid_search([5e-3, 2e-3, 1e-3, 5e-4]),
        "gamma": 0.985,
        # "gamma": tune.grid_search([0.983, 0.985, 0.986, 0.987, 0.988, 0.989]),
        "epsilon": 1,
        "epsilon_decay": 0.99998,
        "epsilon_min": 0.01,
        "buffer_size": 20000,
        "batch_size": 2000,
        "env": MultiHunterEnv,
        "env_config": {
Example 32
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-reward", type=float, default=150.0)
parser.add_argument("--stop-timesteps", type=int, default=100000)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10))
    ModelCatalog.register_custom_model(
        "pa_model",
        TorchParametricActionsModel if args.torch else ParametricActionsModel)

    if args.run == "DQN":
        cfg = {
            # TODO(ekl) we need to set these to prevent the masked values
            # from being further processed in DistributionalQModel, which
            # would mess up the masking. It is possible to support these if we
            # defined a custom DistributionalQModel that is aware of masking.
            "hiddens": [],
            "dueling": False,
        }
    else:
        cfg = {}

    config = dict(
Example 33
    name="CCPPO",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        CentralizedValueMixin
    ])

CCTrainer = PPOTrainer.with_updates(
    name="CCPPOTrainer", default_policy=CCPPO, get_policy_class=None)

if __name__ == "__main__":
    args = parser.parse_args()
    ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
    tune.run(
        CCTrainer,
        stop={
            "timesteps_total": args.stop,
            "episode_reward_mean": 7.99,
        },
        config={
            "env": TwoStepGame,
            "batch_mode": "complete_episodes",
            "eager": False,
            "num_workers": 0,
            "multiagent": {
                "policies": {
                    "pol1": (None, Discrete(6), TwoStepGame.action_space, {}),
                    "pol2": (None, Discrete(6), TwoStepGame.action_space, {}),
Example 34
                                          activation_fn=tf.nn.relu,
                                          scope="fc2")
        output = slim.fully_connected(last_layer,
                                      num_outputs,
                                      activation_fn=None,
                                      scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (PPOPolicyGraph, obs_space, act_space, config)
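
    # Hypothetical continuation (the original snippet is truncated here): the
    # tuples produced by gen_policy are normally collected into a policies
    # dict keyed by policy id, e.g.:
    policies = {
        "policy_{}".format(i): gen_policy(i)
        for i in range(2)
    }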
Example no. 35
    env = clip_reward(env, lower_bound=-1, upper_bound=1)
    env = sticky_actions(env, repeat_action_probability=0.25)
    env = resize(env, 84, 84)
    #env = color_reduction(env, mode='full')
    #env = frame_skip(env, 4)
    env = frame_stack(env, 4)
    env = agent_indicator(env, type_only=False)
    return env

register_env(env_name, lambda config: PettingZooEnv(env_creator(config)))

test_env = PettingZooEnv(env_creator({}))
obs_space = test_env.observation_space
act_space = test_env.action_space

ModelCatalog.register_custom_model("AtariModel", AtariModel)
#ModelCatalog.register_custom_model("RandomPolicy", RandomPolicy)

def gen_policy(i):
    config = {
        "model": {
            "custom_model": "AtariModel",
        },
        "gamma": 0.99,
    }
    return (None, obs_space, act_space, config)

policies = {
    "policy_0": gen_policy(0),
    "random": (RandomPolicy, obs_space, act_space,
               {"ignore_action_bounds": True}),
}
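
# Hypothetical continuation (not in the original source): a multiagent config
# built from these policies also needs a policy_mapping_fn that routes each
# agent id to either "policy_0" or "random".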
Example no. 36
def main(args):
    # ====================================
    # init env config
    # ====================================
    if args.no_debug:
        ray.init()
    else:
        ray.init(local_mode=True)
    # use ray cluster for training
    # ray.init(
    #     address="auto" if args.address is None else args.address,
    #     redis_password="******",
    # )
    #
    # print(
    #     "--------------- Ray startup ------------\n{}".format(
    #         ray.state.cluster_resources()
    #     )
    # )

    agent_specs = {"AGENT-007": agent_spec}

    env_config = {
        "seed": 42,
        "scenarios": [scenario_paths],
        "headless": args.headless,
        "agent_specs": agent_specs,
    }

    # ====================================
    # init tune config
    # ====================================
    class MultiEnv(RLlibHiWayEnv):
        def __init__(self, env_config):
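            # Round-robin: give each rollout worker a single scenario, chosen
            # by its 1-based worker index, so scenarios are spread across workers.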
            env_config["scenarios"] = [
                scenario_paths[(env_config.worker_index - 1) % len(scenario_paths)]
            ]
            super(MultiEnv, self).__init__(config=env_config)

    ModelCatalog.register_custom_model("my_rnn", RNNModel)
    tune_config = {
        "env": MultiEnv,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "default_policy": (None, OBSERVATION_SPACE, ACTION_SPACE, {},)
            },
            "policy_mapping_fn": lambda agent_id: "default_policy",
        },
        "model": {
            "custom_model": "my_rnn",
        },
        "framework": "torch",
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
        "lr": 1e-4,
        "log_level": "WARN",
        "num_workers": args.num_workers,
        "horizon": args.horizon,
        "train_batch_size": 10240 * 3,

        "observation_filter": "MeanStdFilter",
        "batch_mode": "complete_episodes",
        "grad_clip": 0.5, 

        # "model":{
        #     "use_lstm": True,
        # },
    }

    tune_config.update(
        {
            "lambda": 0.95,
            "clip_param": 0.2,
            "num_sgd_iter": 10,
            "sgd_minibatch_size": 1024,
        }
    )

    # ====================================
    # init log and checkpoint dir_info
    # ====================================
    experiment_name = EXPERIMENT_NAME.format(
        scenario="multi_scenarios", algorithm="PPO", n_agent=1,
    )

    log_dir = Path(args.log_dir).expanduser().absolute() / RUN_NAME
    log_dir.mkdir(parents=True, exist_ok=True)
    print(f"Checkpointing at {log_dir}")

    if args.restore:
        restore_path = Path(args.restore).expanduser()
        print(f"Loading model from {restore_path}")
    else:
        restore_path = None

    # run experiments
    analysis = tune.run(
        PPOTrainer,
        # "PPO",
        name=experiment_name,
        stop={"time_total_s": 24 * 60 * 60},
        checkpoint_freq=2,
        checkpoint_at_end=True,
        local_dir=str(log_dir),
        resume=args.resume,
        restore=restore_path,
        max_failures=1000,
        export_formats=["model", "checkpoint"],
        config=tune_config,
    )

    print(analysis.dataframe().head())
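
    # Hypothetical follow-up (not part of the original script): with a Ray 1.x
    # style tune API the returned ExperimentAnalysis can also be queried for
    # the best trial/checkpoint; method names and signatures vary across
    # Ray versions.
    best_trial = analysis.get_best_trial("episode_reward_mean", mode="max")
    best_checkpoint = analysis.get_best_checkpoint(
        best_trial, metric="episode_reward_mean", mode="max")
    print(best_trial, best_checkpoint)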
Example no. 37
    metrics = summarize_episodes(episodes)
    eval_metrics.append(metrics)

    return metrics


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-iters", type=int, default=10)
    parser.add_argument("--num-workers", type=int, default=2)
    args = parser.parse_args()

    ray.init()

    register_env("dominion", lambda config: DominionEnv(config))
    ModelCatalog.register_custom_model("domraymodel", DomrayModel)

    config = {
        "env": DominionEnv,
        "env_config": env_config,
        "num_gpus": 1,
        "train_batch_size": 200,
        "model": {
            "custom_model": "domraymodel",
            "fcnet_hiddens":
            [256, 256, 34],  #TODO: 34 is the action space size, refactor
            "vf_share_layers": True,
        },
        "callbacks": DomCallbacks,

        # Evaluation settings
Example no. 38
def register_actor_mask_model():
    ModelCatalog.register_custom_model("action_mask", ActionMaskModel)
Example no. 39
tf1, tf, tfv = try_import_tf()

parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=150)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model(
        "bn_model", TorchBatchNormModel if args.torch else
        KerasBatchNormModel if args.run != "PPO" else BatchNormModel)

    config = {
        "env": "Pendulum-v0" if args.run in ["DDPG", "SAC"] else "CartPole-v0",
        "model": {
            "custom_model": "bn_model",
        },
        "lr": 0.0003,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "framework": "torch" if args.torch else "tf",
    }

    stop = {
Example no. 40
                                     shape=(84, 84, 4),
                                     dtype=np.float32)
        self.i = 0

    def reset(self):
        self.i = 0
        return self.zeros

    def step(self, action):
        self.i += 1
        return self.zeros, 1, self.i > 1000, {}


if __name__ == "__main__":
    ray.init()
    ModelCatalog.register_custom_model("fast_model", FastModel)
    run_experiments({
        "demo": {
            "run": "IMPALA",
            "env": FastImageEnv,
            "config": {
                "compress_observations":
                True,
                "model": {
                    "custom_model": "fast_model"
                },
                "num_gpus":
                0,
                "num_workers":
                2,
                "num_envs_per_worker":
Example no. 41
tf1, tf, tfv = try_import_tf()

parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=150)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model(
        "bn_model", TorchBatchNormModel if args.torch else BatchNormModel)

    config = {
        "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
        "model": {
            "custom_model": "bn_model",
        },
        "num_workers": 0,
        "framework": "torch" if args.torch else "tf",
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }
Example no. 42
    def forward_rnn(self, inputs, state, seq_lens):
        model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] +
                                                          state)
        return model_out, [h, c]

    def get_initial_state(self):
        return [
            np.zeros(self.cell_size, np.float32),
            np.zeros(self.cell_size, np.float32),
        ]

    @override(ModelV2)
    def value_function(self):
        return tf.reshape(self._value_out, [-1])

ModelCatalog.register_custom_model("my_model", MyKerasModel)


analysis = tune.run(
    PPOTrainer,
    stop={"timesteps_total": 100000},
    config={
        "env": BlackjackEnv,
        "gamma": 0.99,
        "num_workers": 1,
        "num_envs_per_worker": 8,
        "entropy_coeff": 0.001,
        "num_sgd_iter": 5,
        "vf_loss_coeff": 1e-5,
        "lr": tune.grid_search([0.0001, 0.0005, 0.00001, 0.00005]),
        "model": {
            "custom_model": "my_model",
Example no. 43
    def testCustomModel(self):
        ray.init()
        ModelCatalog.register_custom_model("foo", CustomModel)
        p1 = ModelCatalog.get_model(
            get_registry(), 1, 5, {"custom_model": "foo"})
        self.assertEqual(str(type(p1)), str(CustomModel))
Example no. 44
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss

    def custom_stats(self):
        return {
            "policy_loss": self.policy_loss,
            "imitation_loss": self.imitation_loss,
        }


if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()

    ModelCatalog.register_custom_model("custom_loss", CustomLossModel)
    run_experiments({
        "custom_loss": {
            "run": "PG",
            "env": "CartPole-v0",
            "stop": {
                "training_iteration": args.iters,
            },
            "config": {
                "num_workers": 0,
                "model": {
                    "custom_model": "custom_loss",
                    "custom_options": {
                        "input_files": args.input_files,
                    },
                },