Example #1
    def testPyTorchModel(self):
        ModelCatalog.register_custom_model("composite", TorchSpyModel)
        register_env("nested", lambda _: NestedDictEnv())
        a2c = A2CAgent(
            env="nested",
            config={
                "num_workers": 0,
                "use_pytorch": True,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite",
                },
            })

        a2c.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "torch_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
Example #2
    def doTestNestedDict(self, make_env, test_lstm=False):
        ModelCatalog.register_custom_model("composite", DictSpyModel)
        register_env("nested", make_env)
        pg = PGAgent(
            env="nested",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite",
                    "use_lstm": test_lstm,
                },
            })
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
Example #3
    def doTestNestedTuple(self, make_env):
        ModelCatalog.register_custom_model("composite2", TupleSpyModel)
        register_env("nested2", make_env)
        pg = PGAgent(
            env="nested2",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "model": {
                    "custom_model": "composite2",
                },
            })
        pg.train()

        # Check that the model sees the correct reconstructed observations
        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
Example #4
    def __init__(self,
                 sess,
                 action_space,
                 obs_space,
                 preprocessor,
                 observation_filter,
                 model_config,
                 action_noise_std=0.0):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, model_config, dist_type="deterministic")

        model = ModelCatalog.get_model({
            "obs": self.inputs
        }, obs_space, action_space, dist_dim, model_config)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example #5
 def testCustomModel(self):
     ray.init()
     ModelCatalog.register_custom_model("foo", CustomModel)
     p1 = ModelCatalog.get_model({
         "obs": tf.constant([1, 2, 3])
     }, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5,
                                 {"custom_model": "foo"})
     self.assertEqual(str(type(p1)), str(CustomModel))
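A minimal sketch of what a registered custom model such as "foo" could look like. The test above does not show CustomModel's body, so the class below is a generic illustration only: the layer sizes and the legacy Model/_build_layers_v2 interface of that RLlib version are assumptions, and it expects a batched float observation tensor rather than the test's constant input.

import tensorflow as tf
from ray.rllib.models import Model, ModelCatalog


class CustomModel(Model):
    def _build_layers_v2(self, input_dict, num_outputs, options):
        # input_dict["obs"] holds the (already preprocessed) observation batch.
        last_layer = tf.layers.dense(input_dict["obs"], 64, activation=tf.nn.relu)
        output = tf.layers.dense(last_layer, num_outputs, activation=None)
        return output, last_layer


ModelCatalog.register_custom_model("foo", CustomModel)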
Example #6
    def testGymPreprocessors(self):
        p1 = ModelCatalog.get_preprocessor(
            get_registry(), gym.make("CartPole-v0"))
        self.assertEqual(type(p1), NoPreprocessor)

        p2 = ModelCatalog.get_preprocessor(
            get_registry(), gym.make("FrozenLake-v0"))
        self.assertEqual(type(p2), OneHotPreprocessor)
Example #7
 def testInvalidModel(self):
     ModelCatalog.register_custom_model("invalid", InvalidModel)
     self.assertRaises(ValueError, lambda: PGAgent(
         env="CartPole-v0", config={
             "model": {
                 "custom_model": "invalid",
             },
         }))
Example #8
    def testMinibatchSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": False,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
        self.assertEqual(batch0["sequences"].tolist(), [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
        ])
        self.assertEqual(batch1["sequences"].tolist(), [
            [[8], [9], [10], [11]],
            [[12], [13], [14], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
        self.assertEqual(batch2["sequences"].tolist(), [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
        ])
        self.assertEqual(batch3["sequences"].tolist(), [
            [[13], [14], [0], [0]],
            [[0], [1], [2], [3]],
        ])
Example #9
 def testInvalidModel2(self):
     ModelCatalog.register_custom_model("invalid2", InvalidModel2)
     self.assertRaisesRegexp(
         ValueError, "Expected output.*",
         lambda: PGAgent(
             env="CartPole-v0", config={
                 "model": {
                     "custom_model": "invalid2",
                 },
             }))
Example #10
    def testSimpleOptimizerSequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOAgent(
            env="counter",
            config={
                "num_workers": 0,
                "sample_batch_size": 10,
                "train_batch_size": 10,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": True,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
            })
        ppo.train()
        ppo.train()

        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        self.assertEqual(
            batch0["sequences"].tolist(),
            [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]])
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
        self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
        self.assertTrue(
            np.allclose(batch0["state_in"][0].tolist()[1:],
                        batch0["state_out"][0].tolist()[:-1]))
        self.assertTrue(
            np.allclose(batch0["state_in"][1].tolist()[1:],
                        batch0["state_out"][1].tolist()[:-1]))

        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        self.assertEqual(batch1["sequences"].tolist(), [
            [[10], [11], [12], [13]],
            [[14], [0], [0], [0]],
            [[0], [1], [2], [3]],
            [[4], [0], [0], [0]],
        ])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1])
        self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
        self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
Example #11
    def testDefaultModels(self):
        ray.init()

        with tf.variable_scope("test1"):
            p1 = ModelCatalog.get_model(
                get_registry(), np.zeros((10, 3), dtype=np.float32), 5)
            self.assertEqual(type(p1), FullyConnectedNetwork)

        with tf.variable_scope("test2"):
            p2 = ModelCatalog.get_model(
                get_registry(), np.zeros((10, 80, 80, 3), dtype=np.float32), 5)
            self.assertEqual(type(p2), VisionNetwork)
Example #12
    def testDefaultModels(self):
        ray.init()

        with tf.variable_scope("test1"):
            p1 = ModelCatalog.get_model({
                "obs": tf.zeros((10, 3), dtype=tf.float32)
            }, Box(0, 1, shape=(3, ), dtype=np.float32), 5, {})
            self.assertEqual(type(p1), FullyConnectedNetwork)

        with tf.variable_scope("test2"):
            p2 = ModelCatalog.get_model({
                "obs": tf.zeros((10, 84, 84, 3), dtype=tf.float32)
            }, Box(0, 1, shape=(84, 84, 3), dtype=np.float32), 5, {})
            self.assertEqual(type(p2), VisionNetwork)
Example #13
    def testMultiAgentComplexSpaces(self):
        ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
        ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
        register_env("nested_ma", lambda _: NestedMultiAgentEnv())
        act_space = spaces.Discrete(2)
        pg = PGAgent(
            env="nested_ma",
            config={
                "num_workers": 0,
                "sample_batch_size": 5,
                "train_batch_size": 5,
                "multiagent": {
                    "policy_graphs": {
                        "tuple_policy": (
                            PGPolicyGraph, TUPLE_SPACE, act_space,
                            {"model": {"custom_model": "tuple_spy"}}),
                        "dict_policy": (
                            PGPolicyGraph, DICT_SPACE, act_space,
                            {"model": {"custom_model": "dict_spy"}}),
                    },
                    "policy_mapping_fn": lambda a: {
                        "tuple_agent": "tuple_policy",
                        "dict_agent": "dict_policy"}[a],
                },
            })
        pg.train()

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            self.assertEqual(seen[2][0].tolist(), task_i)
Example #14
    def __init__(self, state_values, cumulative_rewards, logits, actions,
                 action_space, beta):
        ma_adv_norm = tf.get_variable(
            name="moving_average_of_advantage_norm",
            dtype=tf.float32,
            initializer=100.0,
            trainable=False)
        # advantage estimation
        adv = cumulative_rewards - state_values
        # update averaged advantage norm
        update_adv_norm = tf.assign_add(
            ref=ma_adv_norm,
            value=1e-6 * (tf.reduce_mean(tf.square(adv)) - ma_adv_norm))

        # exponentially weighted advantages
        with tf.control_dependencies([update_adv_norm]):
            exp_advs = tf.exp(
                beta * tf.divide(adv, 1e-8 + tf.sqrt(ma_adv_norm)))

        # log\pi_\theta(a|s)
        dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
        action_dist = dist_cls(logits)
        logprobs = action_dist.logp(actions)

        self.loss = -1.0 * tf.reduce_mean(
            tf.stop_gradient(exp_advs) * logprobs)
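Read directly off the code above, with A = cumulative_rewards - state_values and n the exponentially moving estimate of E[A^2] maintained by update_adv_norm (update rate 1e-6), the objective being minimized is

    L(\theta) = -\,\mathbb{E}\big[ \exp\big( \beta \, A / (\sqrt{n} + 10^{-8}) \big) \, \log \pi_\theta(a \mid s) \big],

where the exponential advantage weights are passed through tf.stop_gradient, so gradients flow only through the log-probability term.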
Example #15
def _build_q_network(registry, inputs, num_actions, config):
    dueling = config["dueling"]
    hiddens = config["hiddens"]
    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])
    frontend_out = frontend.last_layer

    with tf.variable_scope("action_value"):
        action_out = frontend_out
        for hidden in hiddens:
            action_out = layers.fully_connected(
                action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
        action_scores = layers.fully_connected(
            action_out, num_outputs=num_actions, activation_fn=None)

    if dueling:
        with tf.variable_scope("state_value"):
            state_out = frontend_out
            for hidden in hiddens:
                state_out = layers.fully_connected(
                    state_out, num_outputs=hidden, activation_fn=tf.nn.relu)
            state_score = layers.fully_connected(
                state_out, num_outputs=1, activation_fn=None)
        action_scores_mean = tf.reduce_mean(action_scores, 1)
        action_scores_centered = action_scores - tf.expand_dims(
            action_scores_mean, 1)
        return state_score + action_scores_centered
    else:
        return action_scores
Example #16
 def _build_q_network(self, obs, obs_space, action_space, actions):
     q_net = QNetwork(
         ModelCatalog.get_model({
             "obs": obs,
             "is_training": self._get_is_training_placeholder(),
         }, obs_space, action_space, 1, self.config["model"]), actions,
         self.config["critic_hiddens"],
         self.config["critic_hidden_activation"])
     return q_net.value, q_net.model
Example #17
 def _build_p_network(self, obs, obs_space):
     policy_net = PNetwork(
         ModelCatalog.get_model({
             "obs": obs,
             "is_training": self._get_is_training_placeholder(),
         }, obs_space, 1, self.config["model"]), self.dim_actions,
         self.config["actor_hiddens"],
         self.config["actor_hidden_activation"])
     return policy_net.action_scores, policy_net.model
Example #18
 def __init__(self, registry, env_creator, config, logdir):
     env = ModelCatalog.get_preprocessor_as_wrapper(registry, env_creator(
         config["env_config"]), config["model"])
     self.dataset = ExperienceDataset(config["dataset_path"])
     # TODO(rliaw): should change this to be just env.observation_space
     self.policy = BCPolicy(registry, env.observation_space.shape,
                            env.action_space, config)
     self.config = config
     self.logdir = logdir
     self.metrics_queue = queue.Queue()
Example #19
 def _build_q_network(self, obs, space):
     qnet = QNetwork(
         ModelCatalog.get_model({
             "obs": obs,
             "is_training": self._get_is_training_placeholder(),
         }, space, self.num_actions, self.config["model"]),
         self.num_actions, self.config["dueling"], self.config["hiddens"],
         self.config["noisy"], self.config["num_atoms"],
         self.config["v_min"], self.config["v_max"], self.config["sigma0"])
     return qnet.value, qnet.logits, qnet.dist, qnet.model
Example #20
def run(args, parser):
    def create_environment(env_config):
        # This import must happen inside the method so that worker processes import this code
        import roboschool
        return gym.make(args.env)

    if not args.config:
        # Load configuration from file
        config_dir = os.path.dirname(args.checkpoint)
        # params.json is saved in the model directory during ray training by default
        config_path = os.path.join(config_dir, "params.json")
        with open(config_path) as f:
            args.config = json.load(f)

    if not args.env:
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()

    register_env(args.env, create_environment)

    cls = get_agent_class(args.algorithm)
    config = args.config
    config["monitor"] = False
    config["num_workers"] = 1
    config["num_gpus"] = 0
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_episodes = int(args.evaluate_episodes)

    if args.algorithm == "DQN":
        env = gym.make(args.env)
        env = wrap_dqn(env, args.config.get("model", {}))
    else:
        env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env))
    env = wrappers.Monitor(env, OUTPUT_DIR, force=True, video_callable=lambda episode_id: True)
    all_rewards = []
    for episode in range(num_episodes):
        steps = 0
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done:
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            steps += 1
            state = next_state
        all_rewards.append(reward_total)
        print("Episode reward: %s. Episode steps: %s" % (reward_total, steps))
    print("Mean Reward:", np.mean(all_rewards))
    print("Max Reward:", np.max(all_rewards))
    print("Min Reward:", np.min(all_rewards))
Example #21
    def testTuplePreprocessor(self):
        ray.init()

        class TupleEnv(object):
            def __init__(self):
                self.observation_space = Tuple(
                    [Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)])
        p1 = ModelCatalog.get_preprocessor(
            get_registry(), TupleEnv())
        self.assertEqual(p1.shape, (8,))
        self.assertEqual(
            list(p1.transform((0, [1, 2, 3]))),
            [float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]])
Example #22
 def testCustomPreprocessor(self):
     ray.init()
     ModelCatalog.register_custom_preprocessor("foo", CustomPreprocessor)
     ModelCatalog.register_custom_preprocessor("bar", CustomPreprocessor2)
     env = gym.make("CartPole-v0")
     p1 = ModelCatalog.get_preprocessor(env, {"custom_preprocessor": "foo"})
     self.assertEqual(str(type(p1)), str(CustomPreprocessor))
     p2 = ModelCatalog.get_preprocessor(env, {"custom_preprocessor": "bar"})
     self.assertEqual(str(type(p2)), str(CustomPreprocessor2))
     p3 = ModelCatalog.get_preprocessor(env)
     self.assertEqual(type(p3), NoPreprocessor)
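A minimal sketch of what a custom preprocessor such as "foo" or "bar" above could look like. The test does not include CustomPreprocessor's body, so the flattening behavior and the Preprocessor subclass hooks (_init_shape/transform) of that RLlib version are assumptions here.

import numpy as np
from ray.rllib.models import ModelCatalog
from ray.rllib.models.preprocessors import Preprocessor


class CustomPreprocessor(Preprocessor):
    def _init_shape(self, obs_space, options):
        # Output shape: a flat float vector the size of the observation space.
        return (int(np.product(obs_space.shape)),)

    def transform(self, observation):
        # Flatten and cast the raw observation.
        return np.asarray(observation, dtype=np.float32).flatten()


ModelCatalog.register_custom_preprocessor("foo", CustomPreprocessor)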
Example #23
 def _build_policy_map(self, policy_dict, policy_config):
     policy_map = {}
     preprocessors = {}
     for name, (cls, obs_space, act_space,
                conf) in sorted(policy_dict.items()):
         merged_conf = merge_dicts(policy_config, conf)
         if self.preprocessing_enabled:
             preprocessor = ModelCatalog.get_preprocessor_for_space(
                 obs_space, merged_conf.get("model"))
             preprocessors[name] = preprocessor
             obs_space = preprocessor.observation_space
         else:
             preprocessors[name] = NoPreprocessor(obs_space)
         if isinstance(obs_space, gym.spaces.Dict) or \
                 isinstance(obs_space, gym.spaces.Tuple):
             raise ValueError(
                 "Found raw Tuple|Dict space as input to policy graph. "
                 "Please preprocess these observations with a "
                 "Tuple|DictFlatteningPreprocessor.")
         with tf.variable_scope(name):
             policy_map[name] = cls(obs_space, act_space, merged_conf)
     return policy_map, preprocessors
Example #24
    def __init__(
            self, registry, env_creator, config, logdir, start_sampler=True):
        env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.env = env
        policy_cls = get_policy_cls(config)
        # TODO(rliaw): should change this to be just env.observation_space
        self.policy = policy_cls(
            registry, env.observation_space.shape, env.action_space, config)
        self.config = config

        # Technically not needed when not remote
        self.obs_filter = get_filter(
            config["observation_filter"], env.observation_space.shape)
        self.rew_filter = get_filter(config["reward_filter"], ())
        self.filters = {"obs_filter": self.obs_filter,
                        "rew_filter": self.rew_filter}
        self.sampler = AsyncSampler(env, self.policy, self.obs_filter,
                                    config["batch_size"])
        if start_sampler and self.sampler.async:
            self.sampler.start()
        self.logdir = logdir
Example #25
                        os.path.dirname(os.path.abspath(__file__)),
                        "../tests/data/cartpole_small"))

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()

    # Bazel makes it hard to find files specified in `args` (and `data`).
    # Look for them here.
    if not os.path.exists(args.input_files):
        # This script runs in the ray/rllib/examples dir.
        rllib_dir = Path(__file__).parent.parent
        input_dir = rllib_dir.absolute().joinpath(args.input_files)
        args.input_files = str(input_dir)

    ModelCatalog.register_custom_model(
        "custom_loss", TorchCustomLossModel if args.torch else CustomLossModel)

    config = {
        "env": "CartPole-v0",
        "num_workers": 0,
        "model": {
            "custom_model": "custom_loss",
            "custom_model_config": {
                "input_files": args.input_files,
            },
        },
        "framework": "torch" if args.torch else "tf",
    }

    stop = {
        "training_iteration": args.stop_iters,
Example #26
File: loss.py Project: adgirish/ray
    def __init__(
            self, observation_space, action_space,
            observations, value_targets, advantages, actions,
            prev_logits, prev_vf_preds, logit_dim,
            kl_coeff, distribution_class, config, sess, registry):
        self.prev_dist = distribution_class(prev_logits)

        # Saved so that we can compute actions given different observations
        self.observations = observations

        self.curr_logits = ModelCatalog.get_model(
            registry, observations, logit_dim, config["model"]).outputs
        self.curr_dist = distribution_class(self.curr_logits)
        self.sampler = self.curr_dist.sample()

        if config["use_gae"]:
            vf_config = config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    registry, observations, 1, vf_config).outputs
            self.value_function = tf.reshape(self.value_function, [-1])

        # Make loss functions.
        self.ratio = tf.exp(self.curr_dist.logp(actions) -
                            self.prev_dist.logp(actions))
        self.kl = self.prev_dist.kl(self.curr_dist)
        self.mean_kl = tf.reduce_mean(self.kl)
        self.entropy = self.curr_dist.entropy()
        self.mean_entropy = tf.reduce_mean(self.entropy)
        self.surr1 = self.ratio * advantages
        self.surr2 = tf.clip_by_value(self.ratio, 1 - config["clip_param"],
                                      1 + config["clip_param"]) * advantages
        self.surr = tf.minimum(self.surr1, self.surr2)
        self.mean_policy_loss = tf.reduce_mean(-self.surr)

        if config["use_gae"]:
            # We use a clipped squared-error loss here to be more robust
            # against outliers, which seem to occur when the rollouts get
            # longer (the variance scales superlinearly with the length of
            # the rollout).
            self.vf_loss1 = tf.square(self.value_function - value_targets)
            vf_clipped = prev_vf_preds + tf.clip_by_value(
                self.value_function - prev_vf_preds,
                -config["clip_param"], config["clip_param"])
            self.vf_loss2 = tf.square(vf_clipped - value_targets)
            self.vf_loss = tf.minimum(self.vf_loss1, self.vf_loss2)
            self.mean_vf_loss = tf.reduce_mean(self.vf_loss)
            self.loss = tf.reduce_mean(
                -self.surr + kl_coeff * self.kl +
                config["vf_loss_coeff"] * self.vf_loss -
                config["entropy_coeff"] * self.entropy)
        else:
            self.mean_vf_loss = tf.constant(0.0)
            self.loss = tf.reduce_mean(
                -self.surr +
                kl_coeff * self.kl -
                config["entropy_coeff"] * self.entropy)

        self.sess = sess

        if config["use_gae"]:
            self.policy_results = [
                self.sampler, self.curr_logits, self.value_function]
        else:
            self.policy_results = [
                self.sampler, self.curr_logits, tf.constant("NA")]
Example #27

def reward_adapter(env_obs, env_reward):
    return env_reward


def action_adapter(model_action):
    throttle, brake, steering = model_action
    return np.array([throttle, brake, steering])


class TrainingModel(FullyConnectedNetwork):
    NAME = "FullyConnectedNetwork"


ModelCatalog.register_custom_model(TrainingModel.NAME, TrainingModel)


class ModelPolicy(AgentPolicy):
    def __init__(self, path_to_model, observation_space):
        self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
        self._path_to_model = path_to_model

    def setup(self):
        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()
        tf.saved_model.load(self._sess,
                            export_dir=self._path_to_model,
                            tags=["serve"])

    def teardown(self):
Example #28
if __name__ == '__main__':

    args = parser.parse_args()

    sep = os.pathsep
    os.environ['PYTHONPATH'] = sep.join(sys.path)

    ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus)
    env_name = "ray-griddly-env"

    def _create_env(env_config):
        env = RLlibEnv(env_config)
        return FlatActionWrapper(env)

    register_env(env_name, _create_env)
    ModelCatalog.register_custom_model("SimpleConv", SimpleConvFlatAgent)

    wandbLoggerCallback = WandbLoggerCallback(
        project='conditional_action_trees_reproduce',
        api_key_file='~/.wandb_rc',
        dir=args.root_directory)

    max_training_steps = args.max_training_steps
    gdy_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            args.yaml_file)

    config = {
        'framework': 'torch',
        'seed': args.seed,
        'num_workers': args.num_workers,
        'num_envs_per_worker': args.num_envs_per_worker,
Example #29
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env

from .model import ReallocationModel, Dirichlet
from .env import create_env


register_env("TradingEnv", create_env)
ModelCatalog.register_custom_action_dist("dirichlet", Dirichlet)
ModelCatalog.register_custom_model("reallocate", ReallocationModel)
# <class 'ray.rllib.policy.tf_policy_template.MyTFPolicy'>
MyTFPolicy = build_tf_policy(
    name="MyTFPolicy",
    loss_fn=policy_gradient_loss,
)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTFPolicy,
)

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("eager_model", EagerModel)

    config = {
        "env": "CartPole-v0",
        "num_workers": 0,
        "model": {
            "custom_model": "eager_model"
        },
        "framework": "tfe",
    }
    stop = {
        "timesteps_total": args.stop_timesteps,
        "training_iteration": args.stop_iters,
        "episode_reward_mean": args.stop_reward,
    }
Example #31
from MultiAgentSimEnv import MultiAgentSimEnv
from hunter_dqn.dqn import DQNTrainer
from hunter_dqn.dqn_model import DQNModel
from hunter_dqn.hunter_policy import HunterPolicy
from prey_dqn.MultiPreyEnv import MultiPreyEnv
from prey_dqn.prey_model import DQNModelPrey
from prey_dqn.prey_policy import PreyPolicy


def env_creator(env_config):
    return MultiAgentSimEnv(env_config)


if __name__ == "__main__":
    ray.init()
    ModelCatalog.register_custom_model("DQNModel", DQNModel)

    config = {
        "num_hunters": 20,
        "num_preys": 100,
    }

    env = register_env("MultiAgentSimEnv-v0", env_creator)

    singleAgentEnv = MultiAgentSimEnv(config)
    policies = {
        "hunter": (HunterPolicy, singleAgentEnv.observation_space_hunter,
                   singleAgentEnv.action_space_hunter, config),
        "prey": (PreyPolicy, singleAgentEnv.observation_space_prey,
                 singleAgentEnv.action_space_prey, config)
    }
Example #32
def build_rnnsac_model(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> ModelV2:
    """Constructs the necessary ModelV2 for the Policy and returns it.

    Args:
        policy (Policy): The TFPolicy that will use the models.
        obs_space (gym.spaces.Space): The observation space.
        action_space (gym.spaces.Space): The action space.
        config (TrainerConfigDict): The SAC trainer's config dict.

    Returns:
        ModelV2: The ModelV2 to be used by the Policy. Note: An additional
            target model will be created in this function and assigned to
            `policy.target_model`.
    """
    # With separate state-preprocessor (before obs+action concat).
    num_outputs = int(np.product(obs_space.shape))

    # Force-ignore any additionally provided hidden layer sizes.
    # Everything should be configured using SAC's `q_model_config` and
    # `policy_model_config` config settings.
    policy_model_config = MODEL_DEFAULTS.copy()
    policy_model_config.update(config["policy_model_config"])
    q_model_config = MODEL_DEFAULTS.copy()
    q_model_config.update(config["q_model_config"])

    default_model_cls = RNNSACTorchModel

    model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model_cls,
        name="sac_model",
        policy_model_config=policy_model_config,
        q_model_config=q_model_config,
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"],
    )

    assert isinstance(model, default_model_cls)

    # Create an exact copy of the model and store it in `policy.target_model`.
    # This will be used for tau-synched Q-target models that run behind the
    # actual Q-networks and are used for target q-value calculations in the
    # loss terms.
    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model_cls,
        name="target_sac_model",
        policy_model_config=policy_model_config,
        q_model_config=q_model_config,
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"],
    )

    assert isinstance(policy.target_model, default_model_cls)

    return model
Example #33
 def wrap(env):
     return ModelCatalog.get_preprocessor_as_wrapper(
         env, model_config)
Example #34
def build_ddpg_models(policy, observation_space, action_space, config):
    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True

    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(action_space))
    elif len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    if policy.config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = NoopModel
        num_outputs = int(np.product(observation_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="target_ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    return policy.model
Example #35
    def forward(self, input_dict, state, seq_lens):
        model_out, self._value_out = self.model(
            [input_dict["obs"]["a"], input_dict["obs"]["b"]]
        )
        return model_out, state

    def value_function(self):
        return tf.reshape(self._value_out, [-1])


if __name__ == "__main__":
    # Can also register the env creator function explicitly with:
    # register_env("corridor", lambda config: SimpleCorridor(config))
    ray.init()
    ModelCatalog.register_custom_model("my_model", CustomModel)
    tune.run(
        "PPO",
        stop={"timesteps_total": 10000},
        config={
            # "log_level": "ERROR",
            "eager": False,
            "env": SimpleCorridor,  # or "corridor" if registered above
            "model": {"custom_model": "my_model"},
            "vf_share_layers": True,
            # "lr": grid_search([1e-2, 1e-4, 1e-6]),  # try different lrs
            # "lr": grid_search([1e-2, 1e-4, 1e-6]),  # try different lrs
            "num_workers": 1,  # parallelism
            "env_config": {"corridor_length": 5},
        },
    )
Example #36
def build_sac_model(policy, obs_space, action_space, config):
    if config["model"].get("custom_model"):
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True
    if not isinstance(action_space, (Box, Discrete)):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for SAC.".format(action_space))
    if isinstance(action_space, Box) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    # 2 cases:
    # 1) with separate state-preprocessor (before obs+action concat).
    # 2) no separate state-preprocessor: concat obs+actions right away.
    if config["use_state_preprocessor"]:
        num_outputs = 256  # Flatten last Conv2D to this many nodes.
    else:
        config["model"]["fcnet_hiddens"] = []
        num_outputs = 0

    # Force-ignore any additionally provided hidden layer sizes.
    # Everything should be configured using SAC's "Q_model" and "policy_model"
    # settings.
    policy.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch" if config["use_pytorch"] else "tf",
        model_interface=SACTorchModel if config["use_pytorch"] else SACTFModel,
        name="sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch" if config["use_pytorch"] else "tf",
        model_interface=SACTorchModel if config["use_pytorch"] else SACTFModel,
        name="target_sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    return policy.model
Example #37
        "--ppo-checkpoint",
        type=str,
        default=
        "/home/jippo/ray_results/YanivTrainer_2021-04-03_21-40-03/YanivTrainer_yaniv_c49f4_00000_0_2021-04-03_21-40-03/checkpoint_001580/checkpoint-225"
    )
    parser.add_argument(
        "--a3c-checkpoint",
        type=str,
        default=
        "/home/jippo/ray_results/YanivTrainer_2021-04-11_23-01-13/YanivTrainer_yaniv_6e345_00000_0_2021-04-11_23-01-13/checkpoint_021605/checkpoint-13385"
    )
    parser.add_argument("--obs-scheme", type=int, default=0)
    args = parser.parse_args()

    register_env("yaniv", lambda config: YanivEnv(config))
    ModelCatalog.register_custom_model("yaniv_mask", YanivActionMaskModel)

    env_config = {
        "end_after_n_deck_replacements": 0,
        "end_after_n_steps": 130,
        "early_end_reward": 0,
        "use_scaled_negative_reward": True,
        "use_scaled_positive_reward": True,
        "max_negative_reward": -1,
        "negative_score_cutoff": 30,
        "single_step": False,
        "step_reward": 0,
        "use_unkown_cards_in_state": False,
        "use_dead_cards_in_state": True,
        "observation_scheme": args.obs_scheme,
        "n_players": 2,
Example #38
                input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1")
        last_layer = slim.fully_connected(
            last_layer, 64, activation_fn=tf.nn.relu, scope="fc2")
        output = slim.fully_connected(
            last_layer, num_outputs, activation_fn=None, scope="fc_out")
        return output, last_layer
'''

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent entities
    register_env("collision_avoidance",
                 lambda _: Collision_Avoidance_Env(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    # ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make('collision_avoidance-v0')
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                # "custom_model": ["model1", "model2"][i % 2],
                "custom_model": "model1"
            },
            # "gamma": random.choice([0.95, 0.99]),
            "gamma": 0.95,
        }
Example #39
    return CCPPOTorchPolicy if config["framework"] == "torch" \
        else CCPPOTFPolicy


CCTrainer = PPOTrainer.with_updates(
    name="CCPPOTrainer",
    default_policy=CCPPOTFPolicy,
    get_policy_class=get_policy_class,
)

if __name__ == "__main__":
    ray.init(local_mode=True)
    args = parser.parse_args()

    ModelCatalog.register_custom_model(
        "cc_model",
        TorchCentralizedCriticModel if args.torch else CentralizedCriticModel)

    config = {
        "env": TwoStepGame,
        "batch_mode": "complete_episodes",
        "num_workers": 0,
        "multiagent": {
            "policies": {
                "pol1": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": "torch" if args.torch else "tf",
                }),
                "pol2": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": "torch" if args.torch else "tf",
                }),
            },
Example #40
    def test_gym_preprocessors(self):
        p1 = ModelCatalog.get_preprocessor(gym.make("CartPole-v0"))
        self.assertEqual(type(p1), NoPreprocessor)

        p2 = ModelCatalog.get_preprocessor(gym.make("FrozenLake-v0"))
        self.assertEqual(type(p2), OneHotPreprocessor)
Example #41
tf = try_import_tf()

cnn_shape = (4, 4, 3)
# The torch version of MobileNetV2 does channels first.
cnn_shape_torch = (3, 224, 224)

parser = argparse.ArgumentParser()
parser.add_argument("--torch", action="store_true")

if __name__ == "__main__":
    args = parser.parse_args()

    # Register our custom model.
    ModelCatalog.register_custom_model(
        "my_model",
        TorchMobileV2PlusRNNModel if args.torch else MobileV2PlusRNNModel)

    # Configure our Trainer.
    config = {
        "use_pytorch": args.torch,
        "model": {
            "custom_model": "my_model",
            # Extra config passed to the custom model's c'tor as kwargs.
            "custom_options": {
                "cnn_shape": cnn_shape_torch if args.torch else cnn_shape,
            },
            "max_seq_len": 20,
        },
        "vf_share_layers": True,
        "num_workers": 0,  # no parallelism
Example #42
def gen_trainer_from_params(params):
    # All ray environment set-up
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True, include_webui=False, temp_dir=params['ray_params']['temp_dir'])
    register_env("overcooked_multi_agent", params['ray_params']['env_creator'])
    ModelCatalog.register_custom_model(params['ray_params']['custom_model_id'], params['ray_params']['custom_model_cls'])

    # Parse params
    training_params = params['training_params']
    environment_params = params['environment_params']
    evaluation_params = params['evaluation_params']
    multi_agent_params = params['environment_params']['multi_agent_params']
    agent_params = params["agent_params"] # only ml based agents

    env = OvercookedMultiAgent.from_config(environment_params)

    # Returns a properly formatted policy tuple to be passed into ppotrainer config
    def gen_policy(policy_type="ppo"):
        return (
            agent_params[policy_type].get("policy_cls"),
            env.observation_spaces[policy_type],
            env.action_space,
            agent_params[policy_type]["config"]
            )

    # Rllib compatible way of setting the directory we store agent checkpoints in
    logdir_prefix = "{0}_{1}_{2}".format(params["experiment_name"], params['training_params']['seed'], timestr)
    def custom_logger_creator(config):
        """Creates a Unified logger that stores results in <params['results_dir']>/<params["experiment_name"]>_<seed>_<timestamp>
        """
        results_dir = params['results_dir']
        if not os.path.exists(results_dir):
            try:
                os.makedirs(results_dir)
            except Exception as e:
                print("error creating custom logging dir. Falling back to default logdir {}".format(DEFAULT_RESULTS_DIR))
                results_dir = DEFAULT_RESULTS_DIR
        logdir = tempfile.mkdtemp(
            prefix=logdir_prefix, dir=results_dir)
        logger = UnifiedLogger(config, logdir, loggers=None)
        return logger

    if "outer_shape" not in environment_params:
        environment_params["outer_shape"] = None

    if "mdp_params" in environment_params:
        environment_params["eval_mdp_params"] = environment_params["mdp_params"]
    
    # Create rllib compatible multi-agent config based on params
    multi_agent_config = {}

    if multi_agent_params.get('bc_schedule'):
        agents_schedule = OvercookedMultiAgent.bc_schedule_to_agents_schedule(multi_agent_params['bc_schedule'])
    else:
        agents_schedule = multi_agent_params['agents_schedule']
    all_policies = OvercookedMultiAgent.agents_from_schedule(agents_schedule)
    ml_policies = [p for p in all_policies if OvercookedMultiAgent.is_ml_agent(p)]

    multi_agent_config['policies'] = { policy : gen_policy(policy) for policy in ml_policies }
    
    def select_policy(agent_id):
        return OvercookedMultiAgent.agent_id_to_agent_name(agent_id)

    multi_agent_config['policy_mapping_fn'] = select_policy
    multi_agent_config['policies_to_train'] = 'ppo'

    eval_function = get_rllib_eval_function(evaluation_params, environment_params['eval_mdp_params'],
        environment_params['env_params'], environment_params["outer_shape"], multi_agent_params["featurize_fns"], shuffle=multi_agent_params["shuffle_agents"],
        )

    trainer = PPOTrainer(env="overcooked_multi_agent", config={
        "multiagent": multi_agent_config,
        "callbacks" : TrainingCallbacks,
        "custom_eval_function" : eval_function,
        "env_config" : environment_params,
        "eager" : False,
        **training_params
    }, logger_creator=custom_logger_creator)
    return trainer
Example #43
# Training Deep RL agents with custom models using Ray Tune
# Chapter 8, TensorFlow 2 Reinforcement Learning Cookbook | Praveen Palanisamy

import sys

import ray
import ray.rllib.agents.impala as impala
from ray.tune.logger import pretty_print
from ray.rllib.models import ModelCatalog

if not "." in sys.path:
    sys.path.insert(0, ".")
from custom_model import CustomModel

# Register custom-model in ModelCatalog
ModelCatalog.register_custom_model("CustomCNN", CustomModel)

ray.init()
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 1
config["model"]["custom_model"] = "CustomCNN"
config["log_level"] = "INFO"
config["framework"] = "tf2"
trainer = impala.ImpalaTrainer(config=config, env="procgen:procgen-coinrun-v0")

for step in range(1000):
    # Custom training loop
    result = trainer.train()
    print(pretty_print(result))
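An optional extension of the loop above (an assumption, not part of the cookbook snippet): checkpoint the trainer every so often using the Trainer's standard save() method, which writes a checkpoint and returns its path.

# Sketch: same custom training loop as above, extended with periodic checkpoints.
for step in range(1000):
    result = trainer.train()
    print(pretty_print(result))
    if (step + 1) % 100 == 0:
        checkpoint_path = trainer.save()
        print("checkpoint saved at", checkpoint_path)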
Example #44
        if self.prev_action_mode == "concat":
            policy_input = torch.cat((policy_input, a), axis=-1)
        logits = self.policy_fc2(policy_input)

        # Generate the value output.
        value_input = self.value_fc1(x)
        value_input = nn.functional.relu(value_input)
        value_input = self.dropout_fc(value_input)
        if self.prev_action_mode == "concat":
            value_input = torch.cat((value_input, a), axis=-1)
        value = self.value_fc2(value_input)

        self._cur_value = value.squeeze(-1)
        return logits, [h]

    def set_norm_layer_mode(self, mode):
        if mode == "train":
            self.dropout_fc.train()
        else:
            self.dropout_fc.eval()
        for conv_seq in self.conv_seqs:
            conv_seq.set_norm_layer_mode(mode)

    def _in_rollout(self, x):
        # Single timestep indicates rollout.
        return x.shape[1] == 1


ModelCatalog.register_custom_model("custom_impala_cnn_rnn_torch",
                                   CustomImpalaCNNRNN)
Example #45
                    help="Reward at which we stop training.")

if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(num_cpus=args.num_cpus or None)

    # Register the models to use.
    if args.framework == "torch":
        mod1 = mod2 = TorchSharedWeightsModel
    elif args.framework in ["tfe", "tf2"]:
        mod1 = mod2 = TF2SharedWeightsModel
    else:
        mod1 = SharedWeightsModel1
        mod2 = SharedWeightsModel2
    ModelCatalog.register_custom_model("model1", mod1)
    ModelCatalog.register_custom_model("model2", mod2)

    # Get obs- and action Spaces.
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model).
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
Example #46
    my_id = info["agent_id"]
    other_id = 1 if my_id == 0 else 0
    action_encoder = ModelCatalog.get_preprocessor_for_space(Discrete(2))

    # set the opponent actions into the observation
    _, opponent_batch = info["all_pre_batches"][other_id]
    opponent_actions = np.array([
        action_encoder.transform(a)
        for a in opponent_batch[SampleBatch.ACTIONS]
    ])
    to_update[:, -2:] = opponent_actions


if __name__ == "__main__":
    args = parser.parse_args()
    ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
    tune.run(
        "PPO",
        stop={
            "timesteps_total": args.stop,
            "episode_reward_mean": 7.99,
        },
        config={
            "env": GlobalObsTwoStepGame,
            "batch_mode": "complete_episodes",
            "callbacks": {
                "on_postprocess_traj": fill_in_actions,
            },
            "num_workers": 0,
            "multiagent": {
                "policies": {
Example #47
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
        self.config = config

        dist_cls, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        # Action inputs
        self.obs_t = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape)
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")

        with tf.variable_scope(P_SCOPE) as scope:
            self.model = ModelCatalog.get_model({
                "obs": self.obs_t,
                "prev_actions": prev_actions_ph,
                "prev_rewards": prev_rewards_ph,
                "is_training": self._get_is_training_placeholder(),
            }, observation_space, action_space, logit_dim,
                                                self.config["model"])
            logits = self.model.outputs
            self.p_func_vars = _scope_vars(scope.name)

        # Action outputs
        action_dist = dist_cls(logits)
        self.output_actions = action_dist.sample()

        # Training inputs
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

        # v network evaluation
        with tf.variable_scope(V_SCOPE) as scope:
            state_values = self.model.value_function()
            self.v_func_vars = _scope_vars(scope.name)
        self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
        self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                              logits, self.act_t, action_space)

        # which kind of objective to optimize
        objective = (
            self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
        self.explained_variance = tf.reduce_mean(
            explained_variance(self.cum_rew_t, state_values))

        # initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("advantages", self.cum_rew_t),
        ]
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=self.obs_t,
            action_sampler=self.output_actions,
            action_prob=action_dist.sampled_action_prob(),
            loss=objective,
            model=self.model,
            loss_inputs=self.loss_inputs,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions_ph,
            prev_reward_input=prev_rewards_ph)
        self.sess.run(tf.global_variables_initializer())

        self.stats_fetches = {
            "total_loss": objective,
            "vf_explained_var": self.explained_variance,
            "policy_loss": self.p_loss.loss,
            "vf_loss": self.v_loss.loss
        }
Example n. 48
0
def visualizer_rllib(args):
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    # config = get_rllib_config(result_dir + '/..')
    # pkl = get_rllib_pkl(result_dir + '/..')
    config = get_rllib_config(result_dir)
    # TODO(ev) backwards compatibility hack
    try:
        pkl = get_rllib_pkl(result_dir)
    except Exception:
        pkl = None

    # check if we have a multiagent scenario, but in a
    # backwards compatible way (older results may not include a pkl)
    if pkl and config.get('multiagent', {}).get('policy_graphs', {}):
        multiagent = True
        config['multiagent'] = pkl['multiagent']
    else:
        multiagent = False

    # Run on only one cpu for rendering purposes
    config['num_workers'] = 0

    flow_params = get_flow_params(config)

    # hack for old pkl files
    # TODO(ev) remove eventually
    sim_params = flow_params['sim']
    setattr(sim_params, 'num_clients', 1)

    # Create and register a gym+rllib env
    create_env, env_name = make_create_env(params=flow_params,
                                           version=0,
                                           render=False)
    register_env(env_name, create_env)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if args.run and config_run:
        if args.run != config_run:
            print('visualizer_rllib.py: error: run argument ' +
                  '\'{}\' passed in '.format(args.run) +
                  'differs from the one stored in params.json ' +
                  '\'{}\''.format(config_run))
            sys.exit(1)
    if args.run:
        agent_cls = get_agent_class(args.run)
    elif config_run:
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    sim_params.restart_instance = False

    sim_params.emission_path = './test_time_rollout/'

    # prepare for rendering
    if args.render_mode == 'sumo_web3d':
        sim_params.num_clients = 2
        sim_params.render = False
    elif args.render_mode == 'drgb':
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
    elif args.render_mode == 'sumo_gui':
        sim_params.render = True
    elif args.render_mode == 'no_render':
        sim_params.render = False

    if args.save_render:
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
        sim_params.save_render = True

    # Recreate the scenario from the pickled parameters
    exp_tag = flow_params['exp_tag']
    net_params = flow_params['net']
    vehicles = flow_params['veh']
    initial_config = flow_params['initial']
    module = __import__('flow.scenarios', fromlist=[flow_params['scenario']])
    scenario_class = getattr(module, flow_params['scenario'])

    scenario = scenario_class(name=exp_tag,
                              vehicles=vehicles,
                              net_params=net_params,
                              initial_config=initial_config)

    # check if the environment is a single or multiagent environment, and
    # get the right address accordingly
    single_agent_envs = [
        env for env in dir(flow.envs) if not env.startswith('__')
    ]

    if flow_params['env_name'] in single_agent_envs:
        env_loc = 'flow.envs'
    else:
        env_loc = 'flow.multiagent_envs'

    # Start the environment with the gui turned on and a path for the
    # emission file
    module = __import__(env_loc, fromlist=[flow_params['env_name']])
    env_class = getattr(module, flow_params['env_name'])
    env_params = flow_params['env']
    env_params.restart_instance = False
    if args.evaluate:
        env_params.evaluate = True

    # lower the horizon if testing
    if args.horizon:
        config['horizon'] = args.horizon
        env_params.horizon = args.horizon

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    agent.restore(checkpoint)

    env = ModelCatalog.get_preprocessor_as_wrapper(
        env_class(env_params=env_params,
                  sim_params=sim_params,
                  scenario=scenario))

    if multiagent:
        rets = {}
        # map the agent id to its policy
        policy_map_fn = config['multiagent']['policy_mapping_fn'].func
        for key in config['multiagent']['policy_graphs'].keys():
            rets[key] = []
    else:
        rets = []
    final_outflows = []
    mean_speed = []
    for i in range(args.num_rollouts):
        vel = []
        state = env.reset()
        if multiagent:
            ret = {key: [0] for key in rets.keys()}
        else:
            ret = 0
        for _ in range(env_params.horizon):
            vehicles = env.unwrapped.vehicles
            vel.append(np.mean(vehicles.get_speed(vehicles.get_ids())))
            if multiagent:
                action = {}
                for agent_id in state.keys():
                    action[agent_id] = agent.compute_action(
                        state[agent_id], policy_id=policy_map_fn(agent_id))
            else:
                action = agent.compute_action(state)
            state, reward, done, _ = env.step(action)
            if multiagent:
                for actor, rew in reward.items():
                    ret[policy_map_fn(actor)][0] += rew
            else:
                ret += reward
            if multiagent and done['__all__']:
                break
            if not multiagent and done:
                break

        if multiagent:
            for key in rets.keys():
                rets[key].append(ret[key])
        else:
            rets.append(ret)
        outflow = vehicles.get_outflow_rate(500)
        final_outflows.append(outflow)
        mean_speed.append(np.mean(vel))
        if multiagent:
            for agent_id in rets.keys():
                print('Round {}, Return: {} for agent {}'.format(
                    i, ret[agent_id], agent_id))
        else:
            print('Round {}, Return: {}'.format(i, ret))
    if multiagent:
        for agent_id, rew in rets.items():
            print('Average, std return: {}, {} for agent {}'.format(
                np.mean(rew), np.std(rew), agent_id))
    else:
        print('Average, std return: {}, {}'.format(np.mean(rets),
                                                   np.std(rets)))
    print('Average, std speed: {}, {}'.format(np.mean(mean_speed),
                                              np.std(mean_speed)))
    print('Average, std outflow: {}, {}'.format(np.mean(final_outflows),
                                                np.std(final_outflows)))

    # terminate the environment
    env.unwrapped.terminate()

    # if prompted, convert the emission file into a csv file
    if args.emission_to_csv:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        emission_filename = '{0}-emission.xml'.format(scenario.name)

        emission_path = \
            '{0}/test_time_rollout/{1}'.format(dir_path, emission_filename)

        emission_to_csv(emission_path)

    # if we wanted to save the render, here we create the movie
    if args.save_render:
        dirs = os.listdir(os.path.expanduser('~') + '/flow_rendering')
        dirs.sort(key=lambda date: datetime.strptime(date, "%Y-%m-%d-%H%M%S"))
        recent_dir = dirs[-1]
        # create the movie
        movie_dir = os.path.expanduser('~') + '/flow_rendering/' + recent_dir
        save_dir = os.path.expanduser('~') + '/flow_movies'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        os_cmd = "cd " + movie_dir + " && ffmpeg -i frame_%06d.png"
        os_cmd += " -pix_fmt yuv420p " + dirs[-1] + ".mp4"
        os_cmd += "&& cp " + dirs[-1] + ".mp4 " + save_dir + "/"
        os.system(os_cmd)
Example n. 49
0
 def __init__(self, path_to_model, observation_space):
     self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
     self._path_to_model = path_to_model
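A minimal usage sketch for the fragment above; the observation space and the raw observation are illustrative, not from the original snippet. The preprocessor returned by ModelCatalog.get_preprocessor_for_space exposes a transform() method that encodes a raw observation into the flat layout a trained model expects.

# Hypothetical sketch -- obs_space and raw_obs are placeholders; only the
# get_preprocessor_for_space / transform pattern comes from the snippet above.
import gym
import numpy as np
from ray.rllib.models import ModelCatalog

obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
prep = ModelCatalog.get_preprocessor_for_space(obs_space)

raw_obs = obs_space.sample()
flat_obs = prep.transform(raw_obs)  # shaped to match the model's input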
Example n. 50
0
def main(args):
    # ====================================
    # init env config
    # ====================================
    if args.no_debug:
        ray.init(webui_host="127.0.0.1")
    else:
        ray.init(local_mode=True, webui_host="127.0.0.1")
    # use ray cluster for training
    # ray.init(
    #     address="auto" if args.address is None else args.address,
    #     redis_password="******",
    # )
    #
    # print(
    #     "--------------- Ray startup ------------\n{}".format(
    #         ray.state.cluster_resources()
    #     )
    # )

    agent_specs = {"AGENT-007": agent_spec}

    env_config = {
        "seed": 42,
        "scenarios": [scenario_paths],
        "headless": args.headless,
        "agent_specs": agent_specs,
    }

    # ====================================
    # init tune config
    # ====================================
    class MultiEnv(RLlibHiWayEnv):
        def __init__(self, env_config):
            env_config["scenarios"] = [
                scenario_paths[(env_config.worker_index - 1) %
                               len(scenario_paths)]
            ]
            super(MultiEnv, self).__init__(config=env_config)

    ModelCatalog.register_custom_model("my_fc", FullyConnectedNetwork)
    tune_config = {
        "env": MultiEnv,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "default_policy": (
                    None,
                    OBSERVATION_SPACE,
                    ACTION_SPACE,
                    {},
                )
            },
            "policy_mapping_fn": lambda agent_id: "default_policy",
        },
        "model": {
            "custom_model": "my_fc",
        },
        "framework": "torch",
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
        "lr": 1e-4,
        "log_level": "WARN",
        "num_workers": args.num_workers,
        "horizon": args.horizon,
        "train_batch_size": 10240 * 3,

        # "observation_filter": "MeanStdFilter",
        # "batch_mode": "complete_episodes",
        # "grad_clip": 0.5,

        # "model":{
        #     "use_lstm": True,
        # },
    }

    tune_config.update({
        "lambda": 0.95,
        "clip_param": 0.2,
        "num_sgd_iter": 10,
        "sgd_minibatch_size": 1024,
        "gamma": 0.995,
        # "l2_coeff": 5e-4,
    })

    # ====================================
    # init log and checkpoint dir_info
    # ====================================
    experiment_name = EXPERIMENT_NAME.format(
        scenario="multi_scenarios",
        algorithm="PPO",
        n_agent=1,
    )

    log_dir = Path(args.log_dir).expanduser().absolute() / RUN_NAME
    log_dir.mkdir(parents=True, exist_ok=True)
    print(f"Checkpointing at {log_dir}")

    if args.restore:
        restore_path = Path(args.restore).expanduser()
        print(f"Loading model from {restore_path}")
    else:
        restore_path = None

    # run experiments
    analysis = tune.run(
        PPOTrainer,
        # "PPO",
        name=experiment_name,
        stop={"time_total_s": 24 * 60 * 60},
        checkpoint_freq=2,
        checkpoint_at_end=True,
        local_dir=str(log_dir),
        resume=args.resume,
        restore=restore_path,
        max_failures=1000,
        export_formats=["model", "checkpoint"],
        config=tune_config,
    )

    print(analysis.dataframe().head())
Example n. 51
0
def setup(env,
          hparams,
          num_cpus,
          num_gpus,
          num_agents,
          use_gpus_for_workers=False,
          use_gpu_for_driver=False,
          num_workers_per_device=1):

    if env == 'harvest':

        def env_creator(_):
            return HarvestEnv(num_agents=num_agents)

        single_env = HarvestEnv()
    else:

        def env_creator(_):
            return CleanupEnv(num_agents=num_agents)

        single_env = CleanupEnv()

    env_name = env + "_env"
    register_env(env_name, env_creator)

    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Setup PPO with an ensemble of `num_policies` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # register the custom model
    model_name = "conv_to_fc_net"
    ModelCatalog.register_custom_model(model_name, ConvToFCNet)

    algorithm = 'A3C'
    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = tune.function(env_creator)
    config['env_config']['env_name'] = env_name
    config['env_config']['run'] = algorithm

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    config.update({
        "train_batch_size": 128,
        "horizon": 1000,
        "lr_schedule": [[0, hparams['lr_init']],
                        [20000000, hparams['lr_final']]],
        "num_workers": num_workers,
        "num_gpus": gpus_for_driver,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        "entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": tune.function(policy_mapping_fn),
        },
        "model": {
            "custom_model": "conv_to_fc_net",
            "use_lstm": True,
            "lstm_cell_size": 128,
        },
    })
    return algorithm, env_name, config
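A hedged launch sketch for the setup() function above, written against the same old (pre-1.0) Ray/Tune API the example uses; the hparams values, experiment name, and stopping criterion are illustrative assumptions.

# Sketch only -- hparams values, the experiment name and the stop criterion
# are assumptions; setup() is the function defined above.
from ray.tune import run_experiments

hparams = {'lr_init': 1e-3, 'lr_final': 1e-4, 'entropy_coeff': 1e-3}

algorithm, env_name, config = setup(
    'harvest', hparams, num_cpus=4, num_gpus=0, num_agents=2)

run_experiments({
    'harvest_experiment': {
        'run': algorithm,
        'env': env_name,
        'config': config,
        'stop': {'training_iteration': 100},
        'checkpoint_freq': 25,
    }
})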
Example n. 52
0
            last_layer = slim.fully_connected(
                input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1")
        last_layer = slim.fully_connected(
            last_layer, 64, activation_fn=tf.nn.relu, scope="fc2")
        output = slim.fully_connected(
            last_layer, num_outputs, activation_fn=None, scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (PPOPolicyGraph, obs_space, act_space, config)
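A minimal sketch of how the gen_policy() tuples above are typically wired into a multiagent run; the policy names, the two-policy split, and the stop criterion are assumptions, since the original snippet is truncated before the tune.run call.

# Sketch only -- policy names, the 2-policy split and the stop criterion are
# assumptions; gen_policy() and "multi_cartpole" come from the snippet above.
policies = {"policy_{}".format(i): gen_policy(i) for i in range(2)}
policy_ids = list(policies.keys())

tune.run(
    "PPO",
    stop={"training_iteration": 20},
    config={
        "env": "multi_cartpole",
        "num_workers": 0,
        "multiagent": {
            "policy_graphs": policies,
            "policy_mapping_fn": tune.function(
                lambda agent_id: random.choice(policy_ids)),
        },
    })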
Example n. 53
0
            last_layer = tf.layers.batch_normalization(
                last_layer, training=input_dict["is_training"])
        output = slim.fully_connected(
            last_layer,
            num_outputs,
            weights_initializer=normc_initializer(0.01),
            activation_fn=None,
            scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("bn_model", BatchNormModel)
    run_experiments({
        "batch_norm_demo": {
            "run": args.run,
            "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
            "stop": {
                "training_iteration": args.num_iters
            },
            "config": {
                "model": {
                    "custom_model": "bn_model",
                },
                "num_workers": 0,
            },
        },
    })
Example n. 54
0
        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
        action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability)
        inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
        return action_logits + inf_mask, state

    def value_function(self):
        return self.action_embed_model.value_function()


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))
    if args.run == "DQN":
        cfg = {
            # TODO(ekl) we need to set these to prevent the masked values
            # from being further processed in DistributionalQModel, which
            # would mess up the masking. It is possible to support these if we
            # defined a custom DistributionalQModel that is aware of masking.
            "hiddens": [],
            "dueling": False,
        }
    else:
        cfg = {}
    tune.run(
        args.run,
        stop={
Example n. 55
0
import importlib
config = importlib.import_module(args.config_file, package=None)
print(config.num_seeds)

# Did not use ConfigParser or JSON because I want to pass the config programmatically
# config = configparser.ConfigParser()
# config.read(args.config_file)
# print(config.sections(), [i for i in config['ConfigSpace']])

# import json
# print([json.loads(config['ConfigSpace'][i]) for i in config['ConfigSpace']])

from ray.rllib.models.preprocessors import OneHotPreprocessor
from ray.rllib.models import ModelCatalog
ModelCatalog.register_custom_preprocessor("ohe", OneHotPreprocessor)

ray.init(local_mode=True)  #, object_id_seed=0)

# num_seeds = 10
# state_space_sizes = [8]#, 10, 12, 14] # [2**i for i in range(1,6)]
# action_space_sizes = [8]#2, 4, 8, 16] # [2**i for i in range(1,6)]
# delays = [0] + [2**i for i in range(4)]
# sequence_lengths = [1, 2, 3, 4]#i for i in range(1,4)]
# reward_densities = [0.25] # np.linspace(0.0, 1.0, num=5)
# # make_reward_dense = [True, False]
# terminal_state_densities = [0.25] # np.linspace(0.1, 1.0, num=5)
# algorithms = ["DQN"]
# seeds = [i for i in range(num_seeds)]
# # Others, keep the rest fixed for these: learning_starts, target_network_update_freq, double_dqn, fcnet_hiddens, fcnet_activation, use_lstm, lstm_seq_len, sample_batch_size/train_batch_size, learning rate
# # More others: adam_epsilon, exploration_final_eps/exploration_fraction, buffer_size
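For illustration, a hypothetical Python config module that the importlib call above could load as args.config_file; only num_seeds is actually read in this snippet, and every value here is a placeholder mirroring the commented-out block.

# Hypothetical contents of the config module (placeholder values based on the
# commented-out block above; only num_seeds is used in this snippet).
num_seeds = 10
state_space_sizes = [8]
action_space_sizes = [8]
delays = [0, 1, 2, 4, 8]
sequence_lengths = [1, 2, 3, 4]
reward_densities = [0.25]
terminal_state_densities = [0.25]
algorithms = ["DQN"]
seeds = list(range(num_seeds))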
Example n. 56
0
def register_mm_ray_policy(name: str, policy_model: Type[tf.keras.Model],
                           networks: Dict[str, tf.keras.Model]):
    """
    Constructs a Ray policy with multiple models as part of the collections. This
    allows for distributed training with multiple parameter sets (for example, 
    when using auxillary losses)
    
    Arguments:
        policy_model {Type[tf.keras.Model]} -- The policy model which is called to make predictions
        networks {Dict[str, tf.keras.Model]} -- A dictionary of additional networks
    
    Returns:
        ray.rllib.models.Model -- A model which can be used with Ray
    """
    class MMRayPolicy(ray.rllib.models.Model):
        @override(ray.rllib.models.Model)
        def _build_layers_v2(self, input_dict, num_outputs, options):

            # Setup the policy model
            if tf.get_collection('_rk_policy_model'):
                self.model = tf.get_collection('_rk_policy_model')[0]
            else:
                self.model = policy_model(num_outputs, **options)
                tf.add_to_collection('_rk_policy_model', self.model)

            # Add any other models to the collection. Always initialize the
            # bookkeeping dicts so the later .items() loops cannot fail when
            # no extra networks are given.
            self.networks = {}
            self.network_outputs = {}
            for key in (networks or {}).keys():
                if tf.get_collection('_rk_networks_{}'.format(key)):
                    self.networks[key] = tf.get_collection(
                        '_rk_networks_{}'.format(key))
                else:
                    self.networks[key] = [networks[key](**options), None, None]
                    tf.add_to_collection('_rk_networks_{}'.format(key),
                                         self.networks[key])

            if self.model.recurrent:
                self.state_init = [
                    np.zeros([state_size])
                    for state_size in self.model.state_size
                ]

                if not self.state_in:
                    self.state_in = [
                        tf.placeholder(tf.float32, [None, state_size])
                        for state_size in self.model.state_size
                    ]

                output = self.model(input_dict,
                                    seqlens=self.seq_lens,
                                    initial_state=self.state_in)
                self.state_out = list(output['state_out'])
            else:
                output = self.model(input_dict)
            self.policy_output = output

            # Update the input dict with the model outputs
            input_dict['model_outputs'] = output

            # Compute the outputs for each of the networks
            for key, net in self.networks.items():
                if net[0].recurrent:
                    net[1] = [
                        np.zeros([state_size])
                        for state_size in net[0].state_size
                    ]

                    if not net[2]:
                        net[2] = [
                            tf.placeholder(tf.float32, [None, state_size])
                            for state_size in net[0].state_size
                        ]

                    self.network_outputs[key] = net[0](input_dict,
                                                       seqlens=self.seq_lens,
                                                       initial_state=net[2])
                else:
                    self.network_outputs[key] = net[0](input_dict)

            return output['logits'], output['latent']

        @override(ray.rllib.models.Model)
        def custom_loss(self, policy_loss, loss_inputs):

            # Update the loss_inputs with all of the model outputs
            if self.networks:
                loss_inputs['network_outputs'] = {
                    k: self.network_outputs[k]
                    for k in self.networks.keys()
                }
                loss_inputs['network_outputs'][
                    'policy_model'] = self.policy_output

            total_loss = policy_loss
            if hasattr(self.model, 'custom_loss'):
                total_loss = self.model.custom_loss(policy_loss, loss_inputs)

            if self.networks:
                for _, net in self.networks.items():
                    if hasattr(net[0], 'custom_loss'):
                        total_loss = net[0].custom_loss(
                            total_loss,
                            loss_inputs,
                        )
            return total_loss

    MMRayPolicy.__name__ = name
    MMRayPolicy.__doc__ = "Wraped Multi-Network RAY policy"

    ModelCatalog.register_custom_model(name, MMRayPolicy)

    return MMRayPolicy
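A short usage sketch for register_mm_ray_policy; MyPolicyNet and AuxNet are placeholders for user-defined tf.keras.Model classes that follow the wrapper's conventions and are not part of the original snippet.

# Hypothetical usage -- MyPolicyNet / AuxNet are placeholders; only the
# register-then-reference-by-name pattern is taken from the function above.
MMPolicy = register_mm_ray_policy(
    "mm_policy", MyPolicyNet, {"aux_decoder": AuxNet})

trainer_config = {
    "model": {
        "custom_model": "mm_policy",  # the name registered with ModelCatalog
    },
}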
Example n. 57
0
 def testCustomModel(self):
     ray.init()
     ModelCatalog.register_custom_model("foo", CustomModel)
     p1 = ModelCatalog.get_model(
         get_registry(), 1, 5, {"custom_model": "foo"})
     self.assertEqual(str(type(p1)), str(CustomModel))
Example n. 58
0
            return np.array(transform_module(observation))

        elif flag == 2:
            return np.flip(observation, 1)

        elif flag == 3:
            h1 = np.random.randint(10, 20)
            w1 = np.random.randint(10, 20)
            observation[h1:h1 + h1, w1:w1 + w1, :] = 0
            return observation

        elif flag == 4:
            h1 = np.random.randint(10, 20)
            w1 = np.random.randint(10, 20)
            rand_color = np.random.randint(0, 255, size=(3, )) / 255.
            observation[h1:h1 + h1, w1:w1 + w1, :] = np.tile(
                rand_color.reshape(1, 1, -1),
                observation[h1:h1 + h1, w1:w1 + w1, :].shape[:2] + (1, ))
            return observation

        elif flag == 5:
            observation = (observation[:, :, 0] * 0.2989 +
                           observation[:, :, 1] * 0.587 +
                           observation[:, :, 2] * 0.114)
            observation = np.expand_dims(observation, axis=2)
            return observation


ModelCatalog.register_custom_preprocessor("my_prep", MyPreprocessorClass)
Example n. 59
0
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()

    cls = get_agent_class(args.run)
    agent = cls(env=args.env, config=args.config)
    agent.restore(args.checkpoint)
    num_steps = int(args.steps)

    if args.run == "DQN":
        env = gym.make(args.env)
        env = wrap_dqn(get_registry(), env, args.config.get("model", {}))
    else:
        env = ModelCatalog.get_preprocessor_as_wrapper(get_registry(),
                                                       gym.make(args.env))
    if args.out is not None:
        rollouts = []
    steps = 0
    while steps < (num_steps or steps + 1):
        if args.out is not None:
            rollout = []
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done and steps < (num_steps or steps + 1):
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            if not args.no_render:
                env.render()
Example n. 60
0
                    size,
                    weights_initializer=normc_initializer(1.0),
                    activation_fn=activation,
                    scope=label)
                i += 1
            label = "fc_out"
            output = slim.fully_connected(
                last_layer,
                num_outputs,
                weights_initializer=normc_initializer(0.01),
                activation_fn=None,
                scope=label)
            return output, last_layer


ModelCatalog.register_custom_model("my_model", MyModelClass)

#ray.init(num_gpus=2)
ray.init()
print("hello!")

def my_train_fn(config, reporter):
    agent = a3c.A3CAgent(config=config)
    policy_graph = agent.local_evaluator.policy_map["default"].sess.graph
    writer = tf.summary.FileWriter(agent._result_logger.logdir, policy_graph)
    writer.close()
    for _ in range(10):
        result = agent.train()
        reporter(**result)

    agent.stop()
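A hedged sketch of launching the custom training function above with Tune's old function-trainable API; the config values are illustrative assumptions.

# Sketch only -- config values are illustrative; my_train_fn(config, reporter)
# matches the old function-trainable signature used above.
from ray import tune

tune.run(
    my_train_fn,
    config={
        "env": "CartPole-v0",
        "num_workers": 1,
        "model": {"custom_model": "my_model"},
    },
)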