def setup(self):
        if self._sess:
            return

        if self._algorithm == "PPO":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif self._algorithm in ["A2C", "A3C"]:
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        elif self._algorithm == "PG":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif self._algorithm == "DQN":
            from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy as LoadPolicy
        else:
            raise TypeError("Unsupport algorithm")

        self._prep = ModelCatalog.get_preprocessor_for_space(
            self._observation_space)
        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()

        with tf.name_scope(self._policy_name):
            # obs_space needs to be flattened before being passed to PPOTFPolicy
            flat_obs_space = self._prep.observation_space
            self.policy = LoadPolicy(flat_obs_space, self._action_space, {})
            objs = pickle.load(open(self._load_path, "rb"))
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[self._policy_name]
            self.policy.set_weights(weights)
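A minimal inference sketch, assuming the loader above is used at rollout time; the `act` helper and its names are illustrative and not part of the original snippet. The preprocessor flattens the raw observation and the restored policy's `compute_actions` returns a batch of actions.

    def act(self, obs):
        # Hypothetical helper, not part of the original snippet.
        # Flatten the (possibly Dict/Tuple) observation the same way the
        # policy saw it during training, then query the restored policy.
        flat_obs = self._prep.transform(obs)
        actions, _, _ = self.policy.compute_actions([flat_obs])
        return actions[0]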
Example 2
    def __init__(self, obs_space, action_space, config):
        self.action_space = action_space
        self.action_noise_std = config["action_noise_std"]
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space)
        self.observation_filter = get_filter(config["observation_filter"],
                                             self.preprocessor.shape)
        self.single_threaded = config.get("single_threaded", False)
        self.sess = make_session(single_threaded=self.single_threaded)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, config["model"], dist_type="deterministic")
        model = ModelCatalog.get_model({SampleBatch.CUR_OBS: self.inputs},
                                       obs_space, action_space, dist_dim,
                                       config["model"])
        dist = dist_class(model.outputs, model)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example 3
    def __init__(self, load_path, observation_space, action_space):
        self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
        self._path_to_model = load_path

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupport action space")

        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()
        tf.saved_model.load(self._sess,
                            export_dir=self._path_to_model,
                            tags=["serve"])

        graph = tf.get_default_graph()

        if self.is_continuous:
            # These tensor names were found by inspecting the trained model
            # deterministic
            self.output_node = graph.get_tensor_by_name(
                "default_policy/split:0")
            # add Gaussian noise
            # output_node = graph.get_tensor_by_name("default_policy/add:0")
        else:
            self.output_node = graph.get_tensor_by_name(
                "default_policy/ArgMax:0")

        self.input_node = graph.get_tensor_by_name(
            "default_policy/observation:0")
Example 4
    def __init__(self, load_path, algorithm, policy_name, observation_space,
                 action_space):
        self._checkpoint_path = load_path
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._prep = ModelCatalog.get_preprocessor_for_space(
            self._observation_space)
        flat_obs_space = self._prep.observation_space

        ray.init(ignore_reinit_error=True, local_mode=True)

        from zhr_train_rllib.ppo_policy_modeldist_multiv_multiobj import PPOTorchPolicy as LoadPolicy
        config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
        config['num_workers'] = 0
        config['model']['free_log_std'] = False
        config["exploration_config"][
            "type"] = "zhr.utils.saved_model_simple.StochasticSampling"

        self.policy = LoadPolicy(flat_obs_space, self._action_space, config)
        objs = pickle.load(open(self._checkpoint_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        filters = objs["filters"]
        self.filters = filters[self._policy_name]
        weights = state[self._policy_name]
        weights.pop("_optimizer_variables")
        self.policy.set_weights(weights)
        self.model = self.policy.model
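A minimal usage sketch under the assumption that the restored observation filter should be applied before the policy is queried (the `act` helper is illustrative, not part of the original snippet):

    def act(self, obs):
        # Hypothetical helper, not part of the original snippet.
        # Apply the same preprocessing and running-mean filter that were
        # used during training, then compute an action.
        flat_obs = self._prep.transform(obs)
        filtered_obs = self.filters(flat_obs, update=False)
        actions, _, _ = self.policy.compute_actions([filtered_obs])
        return actions[0]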
Example 5
 def _build_policy_map(self, policy_dict, policy_config):
     policy_map = {}
     preprocessors = {}
     for name, (cls, obs_space, act_space,
                conf) in sorted(policy_dict.items()):
         logger.debug("Creating policy for {}".format(name))
         merged_conf = merge_dicts(policy_config, conf)
         if self.preprocessing_enabled:
             preprocessor = ModelCatalog.get_preprocessor_for_space(
                 obs_space, merged_conf.get("model"))
             preprocessors[name] = preprocessor
             obs_space = preprocessor.observation_space
         else:
             preprocessors[name] = NoPreprocessor(obs_space)
         if isinstance(obs_space, gym.spaces.Dict) or \
                 isinstance(obs_space, gym.spaces.Tuple):
             raise ValueError(
                 "Found raw Tuple|Dict space as input to policy. "
                 "Please preprocess these observations with a "
                 "Tuple|DictFlatteningPreprocessor.")
         if tf:
             with tf.variable_scope(name):
                 policy_map[name] = cls(obs_space, act_space, merged_conf)
         else:
             policy_map[name] = cls(obs_space, act_space, merged_conf)
     if self.worker_index == 0:
         logger.info("Built policy map: {}".format(policy_map))
         logger.info("Built preprocessor map: {}".format(preprocessors))
     return policy_map, preprocessors
Example 6
    def __init__(self, load_path, policy_name, observation_space,
                 action_space):
        self._checkpoint_path = load_path
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._sess = None

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupport action space")

        if self._sess:
            return

        self._prep = ModelCatalog.get_preprocessor_for_space(
            self._observation_space)
        self._sess = tf.compat.v1.Session(graph=tf.Graph())
        self._sess.__enter__()

        with tf.compat.v1.name_scope(self._policy_name):
            # obs_space needs to be flattened before being passed to PPOTFPolicy
            flat_obs_space = self._prep.observation_space
            self.policy = LoadPolicy(flat_obs_space, self._action_space, {})
            objs = pickle.load(open(self._checkpoint_path, "rb"))
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[self._policy_name]
            self.policy.set_weights(weights)
Example 7
    def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
        self._checkpoint_path = load_path
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
        flat_obs_space = self._prep.observation_space

        ray.init(ignore_reinit_error=True, local_mode=True)

        from utils.ppo_policy import PPOTorchPolicy as LoadPolicy
        config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
        config['num_workers'] = 0
        config["model"]["use_lstm"] = True
        config['model']['free_log_std'] = False

        self.policy = LoadPolicy(flat_obs_space, self._action_space, config)
        objs = pickle.load(open(self._checkpoint_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        filters = objs["filters"]
        self.filters = filters[self._policy_name]
        weights = state[self._policy_name]
        weights.pop("_optimizer_variables")
        self.policy.set_weights(weights)
        self.model = self.policy.model

        self.rnn_state = self.model.get_initial_state()
        self.rnn_state = [self.rnn_state[0].unsqueeze(0),self.rnn_state[1].unsqueeze(0)]
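A minimal rollout-step sketch for the LSTM variant above, assuming the recurrent state is threaded through `compute_actions` between calls (the `act` helper is illustrative, not part of the original snippet):

    def act(self, obs):
        # Hypothetical helper, not part of the original snippet.
        flat_obs = self._prep.transform(obs)
        filtered_obs = self.filters(flat_obs, update=False)
        actions, state_out, _ = self.policy.compute_actions(
            [filtered_obs], state_batches=self.rnn_state)
        self.rnn_state = state_out  # carry the LSTM state to the next step
        return actions[0]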
Example 8
    def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
        self._checkpoint_path = load_path
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
        flat_obs_space = self._prep.observation_space

        ray.init(ignore_reinit_error=True, local_mode=True)

        from utils.ppo_policy import PPOTorchPolicy as LoadPolicy
        from utils.fc_model import FCMultiLayerNetwork
        ModelCatalog.register_custom_model("my_fc", FCMultiLayerNetwork)
        config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
        config["vf_share_layers"] = True
        config['num_workers'] = 0
        config["model"]["custom_model"] = "my_fc"
        config['model']['free_log_std'] = False

        self.policy = LoadPolicy(flat_obs_space, self._action_space, config)
        objs = pickle.load(open(self._checkpoint_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        filters = objs["filters"]
        self.filters = filters[self._policy_name]
        weights = state[self._policy_name]
        weights.pop("_optimizer_variables")
        self.policy.set_weights(weights)
        self.model = self.policy.model
Example 9
    def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
        self._checkpoint_path = load_path
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
        flat_obs_space = self._prep.observation_space

        ray.init(ignore_reinit_error=True, local_mode=True)

        from utils.ppo_policy import PPOTorchPolicy as LoadPolicy
        from utils.rnn_model import RNNDVEModel
        ModelCatalog.register_custom_model("my_rnn", RNNDVEModel)
        config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
        config['num_workers'] = 0
        config["model"]["custom_model"] = "my_rnn"

        self.policy = LoadPolicy(flat_obs_space, self._action_space, config)
        objs = pickle.load(open(self._checkpoint_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        filters = objs["filters"]
        self.filters = filters[self._policy_name]
        weights = state[self._policy_name]
        weights.pop("_optimizer_variables")
        self.policy.set_weights(weights)
        self.model = self.policy.model

        self.rnn_state = self.model.get_initial_state()
        self.rnn_state = [torch.reshape(self.rnn_state[0], shape=(1, -1))]
Example 10
    def __init__(self, load_path, observation_space, action_space):
        self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
        self._checkpoint_path = load_path

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupport action space")

        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()
        saver = tf.train.import_meta_graph(
            os.path.join(os.path.dirname(self._checkpoint_path), "model.meta"))
        saver.restore(
            self._sess,
            os.path.join(os.path.dirname(self._checkpoint_path), "model"))

        graph = tf.get_default_graph()

        if self.is_continuous:
            # These tensor names were found by inspecting the trained model
            # deterministic
            self.output_node = graph.get_tensor_by_name(
                "default_policy/split:0")
            # add Gaussian noise
            # output_node = graph.get_tensor_by_name("default_policy/add:0")
        else:
            self.output_node = graph.get_tensor_by_name(
                "default_policy/ArgMax:0")

        self.input_node = graph.get_tensor_by_name(
            "default_policy/observation:0")
Example 11
    def __init__(
        self, load_path, algorithm, policy_name, observation_space, action_space
    ):
        load_path = str(load_path)
        if algorithm == "PPO":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif algorithm in ["A2C", "A3C"]:
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        elif algorithm == "PG":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif algorithm == "DQN":
            from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy as LoadPolicy
        else:
            raise ValueError(f"Unsupported algorithm: {algorithm}")

        self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
        self._sess = tf.compat.v1.Session(graph=tf.Graph())

        with tf.compat.v1.name_scope(policy_name):
            # obs_space needs to be flattened before being passed to PPOTFPolicy
            flat_obs_space = self._prep.observation_space
            policy = LoadPolicy(flat_obs_space, action_space, {})
            objs = pickle.load(open(load_path, "rb"))
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[policy_name]
            policy.set_weights(weights)

        # These tensor names were found by inspecting the trained model
        if algorithm == "PPO":
            # CRUCIAL FOR SAFETY:
            #   We use Tensor("split") instead of Tensor("add") to force
            #   PPO to be deterministic.
            self._input_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/observation:0"
            )
            self._output_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/split:0"
            )
        elif self._algorithm == "DQN":
            self._input_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/observations:0"
            )
            self._output_node = tf.argmax(
                input=self._sess.graph.get_tensor_by_name(
                    f"{policy_name}/value_out/BiasAdd:0"
                ),
                axis=1,
            )
        else:
            self._input_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/observations:0"
            )
            self._output_node = tf.argmax(
                input=self._sess.graph.get_tensor_by_name(
                    f"{policy_name}/fc_out/BiasAdd:0"
                ),
                axis=1,
            )
Example 12
    def __init__(self, obs_space, action_space, config):
        super().__init__(obs_space, action_space, config)
        self.action_noise_std = self.config["action_noise_std"]
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(
            self.observation_space)
        self.observation_filter = get_filter(self.config["observation_filter"],
                                             self.preprocessor.shape)

        self.single_threaded = self.config.get("single_threaded", False)
        if self.config["framework"] == "tf":
            self.sess = make_session(single_threaded=self.single_threaded)

            # Set graph-level seed.
            if config.get("seed") is not None:
                with self.sess.as_default():
                    tf1.set_random_seed(config["seed"])

            self.inputs = tf1.placeholder(tf.float32, [None] +
                                          list(self.preprocessor.shape))
        else:
            if not tf1.executing_eagerly():
                tf1.enable_eager_execution()
            self.sess = self.inputs = None
            if config.get("seed") is not None:
                # Tf2.x.
                if config.get("framework") == "tf2":
                    tf.random.set_seed(config["seed"])
                # Tf-eager.
                elif tf1 and config.get("framework") == "tfe":
                    tf1.set_random_seed(config["seed"])

        # Policy network.
        self.dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, self.config["model"], dist_type="deterministic")

        self.model = ModelCatalog.get_model_v2(
            obs_space=self.preprocessor.observation_space,
            action_space=self.action_space,
            num_outputs=dist_dim,
            model_config=self.config["model"],
        )

        self.sampler = None
        if self.sess:
            dist_inputs, _ = self.model({SampleBatch.CUR_OBS: self.inputs})
            dist = self.dist_class(dist_inputs, self.model)
            self.sampler = dist.sample()
            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                dist_inputs, self.sess)
            self.sess.run(tf1.global_variables_initializer())
        else:
            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                [], None, self.model.variables())

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
Example 13
    def on_postprocess_trajectory(self, worker, episode, agent_id, policy_id,
                                  policies, postprocessed_batch,
                                  original_batches, **kwargs):
        to_update = postprocessed_batch[SampleBatch.CUR_OBS]
        other_id = 1 if agent_id == 0 else 0
        action_encoder = ModelCatalog.get_preprocessor_for_space(
            Box(-np.inf, np.inf, (ACTION_VEC_SIZE,), np.float32))  # Unbounded
        _, opponent_batch = original_batches[other_id]
        opponent_actions = np.array([
            action_encoder.transform(a)
            for a in opponent_batch[SampleBatch.ACTIONS]
        ])
        to_update[:, -ACTION_VEC_SIZE:] = opponent_actions
Example 14
    def __init__(self, load_path, algorithm, policy_name, observation_space,
                 action_space):
        self._checkpoint_path = load_path
        self._algorithm = algorithm
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._sess = None

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupport action space")

        if self._sess:
            return

        if self._algorithm == "PPO":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif self._algorithm in ["A2C", "A3C"]:
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        elif self._algorithm == "PG":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif self._algorithm == "DQN":
            from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy as LoadPolicy
        else:
            raise TypeError("Unsupport algorithm")

        self._prep = ModelCatalog.get_preprocessor_for_space(
            self._observation_space)
        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()

        import ray.rllib.agents.ppo as ppo
        config = ppo.DEFAULT_CONFIG.copy()
        config['num_workers'] = 0
        config["model"]["use_lstm"] = True

        with tf.name_scope(self._policy_name):
            # obs_space needs to be flattened before being passed to PPOTFPolicy
            flat_obs_space = self._prep.observation_space
            self.policy = LoadPolicy(flat_obs_space, self._action_space,
                                     config)
            objs = pickle.load(open(self._checkpoint_path, "rb"))
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            filters = objs["filters"]
            self.filters = filters[self._policy_name]
            weights = state[self._policy_name]
            self.policy.set_weights(weights)

            self.model = self.policy.model
            # print(self.model.summary())
            self.rnn_state = self.model.get_initial_state()
            self.rnn_state = [[self.rnn_state[0]], [self.rnn_state[1]]]
Example 15
    def __init__(self, path_to_model, observation_space, action_space):
        self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
        self._path_to_model = path_to_model

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupport action space")
Example 16
def generate_policies(
    policy_id: str,
    policy_constructor_tuple: Tuple["PolicyClass", "gym.Space", "gym.Space",
                                    dict],
    policies: Dict[str, TFPolicy],
    policies_to_train: List[str],
    policy_config: dict,
    preprocessors: Dict[str, Any],
    obs_filters: Dict[str, Any],
    observation_filter: str,
):
    """
    Get policies for each ``agent_id``, and instantiate new ones
    for newly created agents.
    """

    policy_cls, obs_space, act_space, conf = policy_constructor_tuple

    if (policy_id in preprocessors) != (policy_id in policies):
        raise ValueError("'preprocessors' and 'policies' do not agree.")
    if (policy_id in obs_filters) != (policy_id in policies):
        raise ValueError("'obs_filters' and 'policies' do not agree.")

    # If we haven't seen this id, we instantiate a new policy.
    if policy_id not in policies:
        merged_conf = merge_dicts(policy_config, conf)

        # We assume ``self.preprocessing_enabled == True`` in ``RolloutWorker``.
        preprocessor = ModelCatalog.get_preprocessor_for_space(
            obs_space, merged_conf.get("model"))
        preprocessors[policy_id] = preprocessor
        obs_space = preprocessor.observation_space

        if tf and tf.executing_eagerly():
            if hasattr(policy_cls, "as_eager"):
                policy_cls = policy_cls.as_eager()
                if policy_config["eager_tracing"]:
                    policy_cls = policy_cls.with_tracing()
            elif not issubclass(policy_cls, TFPolicy):
                pass  # could be some other type of policy
            else:
                raise ValueError("This policy does not support eager "
                                 "execution: {}".format(policy_cls))
        if tf:
            with tf.variable_scope(policy_id):
                policies[policy_id] = policy_cls(obs_space, act_space,
                                                 merged_conf)
                policies_to_train.append(policy_id)
        else:
            policies[policy_id] = policy_cls(obs_space, act_space, merged_conf)
            policies_to_train.append(policy_id)

        obs_filters[policy_id] = get_filter(observation_filter,
                                            obs_space.shape)
    return policies, preprocessors, obs_filters, policies_to_train
Example 17
 def __init__(self, path_to_model, observation_space):
     path_to_model = str(path_to_model)  # might be a str or a Path, normalize to str
     self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
     self._sess = tf.compat.v1.Session(graph=tf.Graph())
     tf.compat.v1.saved_model.load(
         self._sess, export_dir=path_to_model, tags=["serve"]
     )
     self._output_node = self._sess.graph.get_tensor_by_name("default_policy/add:0")
     self._input_node = self._sess.graph.get_tensor_by_name(
         "default_policy/observation:0"
     )
Example 18
def before_init(policy, observation_space, action_space, config):
    policy.action_noise_std = config["action_noise_std"]
    policy.preprocessor = ModelCatalog.get_preprocessor_for_space(
        observation_space)
    policy.observation_filter = get_filter(config["observation_filter"],
                                           policy.preprocessor.shape)
    policy.single_threaded = config.get("single_threaded", False)

    def _set_flat_weights(policy, theta):
        pos = 0
        theta_dict = policy.model.state_dict()
        new_theta_dict = {}

        for k in sorted(theta_dict.keys()):
            shape = policy.param_shapes[k]
            num_params = int(np.prod(shape))
            new_theta_dict[k] = torch.from_numpy(
                np.reshape(theta[pos:pos + num_params], shape))
            pos += num_params
        policy.model.load_state_dict(new_theta_dict)

    def _get_flat_weights(policy):
        # Get the parameter tensors.
        theta_dict = policy.model.state_dict()
        # Flatten it into a single np.ndarray.
        theta_list = []
        for k in sorted(theta_dict.keys()):
            theta_list.append(torch.reshape(theta_dict[k], (-1, )))
        cat = torch.cat(theta_list, dim=0)
        return cat.numpy()

    type(policy).set_flat_weights = _set_flat_weights
    type(policy).get_flat_weights = _get_flat_weights

    def _compute_actions(policy, obs_batch, add_noise=False, update=True):
        observation = policy.preprocessor.transform(obs_batch)
        observation = policy.observation_filter(
            observation[None], update=update)

        observation = convert_to_torch_tensor(observation)
        dist_inputs, _ = policy.model({
            SampleBatch.CUR_OBS: observation
        }, [], None)
        dist = policy.dist_class(dist_inputs, policy.model)
        action = dist.sample().detach().numpy()
        action = unbatch_actions(action)
        if add_noise and isinstance(policy.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * policy.action_noise_std
        return action

    type(policy).compute_actions = _compute_actions
Example 19
    def on_postprocess_trajectory(self, worker, episode, agent_id, policy_id,
                                  policies, postprocessed_batch,
                                  original_batches, **kwargs):
        to_update = postprocessed_batch[SampleBatch.CUR_OBS]
        other_id = 1 if agent_id == 0 else 0
        action_encoder = ModelCatalog.get_preprocessor_for_space(Discrete(2))

        # set the opponent actions into the observation
        _, opponent_batch = original_batches[other_id]
        opponent_actions = np.array([
            action_encoder.transform(a)
            for a in opponent_batch[SampleBatch.ACTIONS]
        ])
        to_update[:, -2:] = opponent_actions
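For this callback to work, the agent's observation space must reserve its trailing two slots for the one-hot encoded opponent action. A hypothetical space matching the `[:, -2:]` slice above could look like this (sizes are illustrative, not from the original):

import gym
import numpy as np

# Base observation of size OBS_DIM plus 2 trailing slots for the
# one-hot encoded Discrete(2) opponent action (illustrative sizes).
OBS_DIM = 4
observation_space = gym.spaces.Box(
    low=-np.inf, high=np.inf, shape=(OBS_DIM + 2,), dtype=np.float32)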
Example 20
    def __init__(self, load_path, algorithm, policy_names, observation_space,
                 action_space):
        self._checkpoint_path = load_path
        self._algorithm = algorithm
        self._policy_mapping = dict.fromkeys(policy_names, None)
        self._observation_space = observation_space
        self._action_space = action_space
        self._sess = None

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupported action space")

        if self._sess:
            return

        if self._algorithm == "PPO":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif self._algorithm in ["A2C", "A3C"]:
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        elif self._algorithm == "PG":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif self._algorithm == "DQN":
            from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy as LoadPolicy
        else:
            raise TypeError("Unsupport algorithm")

        self._prep = ModelCatalog.get_preprocessor_for_space(
            self._observation_space)
        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()

        objs = pickle.load(open(self._checkpoint_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]

        for name in self._policy_mapping:
            with tf.variable_scope(name):
                # obs_space needs to be flattened before being passed to PPOTFPolicy
                flat_obs_space = self._prep.observation_space
                self._policy_mapping[name] = PolicyWrapper(
                    LoadPolicy,
                    params=(flat_obs_space, self._action_space, {}))
                self._policy_mapping[name].set_preprocessor(self._prep)
                weights = state[name]
                self._policy_mapping[name].set_weights(weights)
Example 21
def fill_in_actions(info):
    """Callback that saves opponent actions into the agent obs.

    If you don't care about opponent actions you can leave this out."""

    to_update = info["post_batch"][SampleBatch.CUR_OBS]
    my_id = info["agent_id"]
    other_id = 1 if my_id == 0 else 0
    action_encoder = ModelCatalog.get_preprocessor_for_space(Discrete(2))

    # set the opponent actions into the observation
    _, opponent_batch = info["all_pre_batches"][other_id]
    opponent_actions = np.array([
        action_encoder.transform(a)
        for a in opponent_batch[SampleBatch.ACTIONS]
    ])
    to_update[:, -2:] = opponent_actions
Example 22
 def _build_policy_map(
         self, policy_dict: MultiAgentPolicyConfigDict,
         policy_config: TrainerConfigDict
 ) -> Tuple[Dict[PolicyID, Policy], Dict[PolicyID, Preprocessor]]:
     policy_map = {}
     preprocessors = {}
     for name, (cls, obs_space, act_space,
                conf) in sorted(policy_dict.items()):
         logger.debug("Creating policy for {}".format(name))
         merged_conf = merge_dicts(policy_config, conf)
         merged_conf["num_workers"] = self.num_workers
         merged_conf["worker_index"] = self.worker_index
         if self.preprocessing_enabled:
             preprocessor = ModelCatalog.get_preprocessor_for_space(
                 obs_space, merged_conf.get("model"))
             preprocessors[name] = preprocessor
             obs_space = preprocessor.observation_space
         else:
             preprocessors[name] = NoPreprocessor(obs_space)
         if isinstance(obs_space, gym.spaces.Dict) or \
                 isinstance(obs_space, gym.spaces.Tuple):
             raise ValueError(
                 "Found raw Tuple|Dict space as input to policy. "
                 "Please preprocess these observations with a "
                 "Tuple|DictFlatteningPreprocessor.")
         if tf1 and tf1.executing_eagerly():
             if hasattr(cls, "as_eager"):
                 cls = cls.as_eager()
                 if policy_config.get("eager_tracing"):
                     cls = cls.with_tracing()
             elif not issubclass(cls, TFPolicy):
                 pass  # could be some other type of policy
             else:
                 raise ValueError("This policy does not support eager "
                                  "execution: {}".format(cls))
         if tf1:
             with tf1.variable_scope(name):
                 policy_map[name] = cls(obs_space, act_space, merged_conf)
         else:
             policy_map[name] = cls(obs_space, act_space, merged_conf)
     if self.worker_index == 0:
         logger.info("Built policy map: {}".format(policy_map))
         logger.info("Built preprocessor map: {}".format(preprocessors))
     return policy_map, preprocessors
Example 23
 def _build_policy_map(self, policy_dict, policy_config):
     policy_map = {}
     preprocessors = {}
     for name, (cls, obs_space, act_space,
                conf) in sorted(policy_dict.items()):
         merged_conf = merge_dicts(policy_config, conf)
         if self.preprocessing_enabled:
             preprocessor = ModelCatalog.get_preprocessor_for_space(
                 obs_space, merged_conf.get("model"))
             preprocessors[name] = preprocessor
             obs_space = preprocessor.observation_space
         else:
             preprocessors[name] = NoPreprocessor(obs_space)
         if isinstance(obs_space, gym.spaces.Dict) or \
                 isinstance(obs_space, gym.spaces.Tuple):
             raise ValueError(
                 "Found raw Tuple|Dict space as input to policy graph. "
                 "Please preprocess these observations with a "
                 "Tuple|DictFlatteningPreprocessor.")
         with tf.variable_scope(name):
             policy_map[name] = cls(obs_space, act_space, merged_conf)
     return policy_map, preprocessors
Example 25
 def __init__(self, load_path, algorithm, policy_name, observation_space):
     load_path = str(load_path)
     self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
     self._sess = tf.compat.v1.Session(graph=tf.Graph())
     tf.compat.v1.saved_model.load(
         self._sess, export_dir=load_path, tags=["serve"], clear_devices=True,
     )
     # These tensor names were found by inspecting the trained model
     if algorithm == "PPO":
         # CRUCIAL FOR SAFETY:
         #   We use Tensor("split") instead of Tensor("add") to force
         #   PPO to be deterministic.
         self._input_node = self._sess.graph.get_tensor_by_name(
             f"{policy_name}/observation:0"
         )
         self._output_node = self._sess.graph.get_tensor_by_name(
             f"{policy_name}/split:0"
         )
     # todo: need to check
     elif algorithm == "DQN":
         self._input_node = self._sess.graph.get_tensor_by_name(
             f"{policy_name}/observations:0"
         )
         self._output_node = tf.argmax(
             input=self._sess.graph.get_tensor_by_name(
                 f"{policy_name}/value_out/BiasAdd:0"
             ),
             axis=1,
         )
     else:
         self._input_node = self._sess.graph.get_tensor_by_name(
             f"{policy_name}/observations:0"
         )
         self._output_node = tf.argmax(
             input=self._sess.graph.get_tensor_by_name(
                 f"{policy_name}/fc_out/BiasAdd:0"
             ),
             axis=1,
         )
Example 26
    def __init__(self, load_path, algorithm, policy_name, yaml_path):
        load_path = str(load_path)
        if algorithm == "ppo":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif algorithm in "a2c":
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
            from ray.rllib.agents.a3c import DEFAULT_CONFIG
        elif algorithm == "pg":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif algorithm == "dqn":
            from ray.rllib.agents.dqn import DQNTFPolicy as LoadPolicy
        elif algorithm == "maac":
            from benchmark.agents.maac.tf_policy import CA2CTFPolicy as LoadPolicy
            from benchmark.agents.maac.tf_policy import DEFAULT_CONFIG
        elif algorithm == "maddpg":
            from benchmark.agents.maddpg.tf_policy import MADDPG2TFPolicy as LoadPolicy
            from benchmark.agents.maddpg.tf_policy import DEFAULT_CONFIG
        elif algorithm == "mfac":
            from benchmark.agents.mfac.tf_policy import MFACTFPolicy as LoadPolicy
            from benchmark.agents.mfac.tf_policy import DEFAULT_CONFIG
        elif algorithm == "networked_pg":
            from benchmark.agents.networked_pg.tf_policy import (
                NetworkedPG as LoadPolicy,
            )
            from benchmark.agents.networked_pg.tf_policy import (
                PG_DEFAULT_CONFIG as DEFAULT_CONFIG,
            )
        else:
            raise ValueError(f"Unsupported algorithm: {algorithm}")

        yaml_path = BASE_DIR / yaml_path
        load_path = BASE_DIR / f"log/results/run/{load_path}"

        config = load_config(yaml_path)
        observation_space = config["policy"][1]
        action_space = config["policy"][2]
        pconfig = DEFAULT_CONFIG

        pconfig["model"].update(config["policy"][-1].get("model", {}))
        pconfig["agent_id"] = policy_name

        self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
        self._sess = tf.Session(graph=tf.get_default_graph())

        with tf.name_scope(policy_name):
            # Observation space needs to be flattened before being passed to the policy
            flat_obs_space = self._prep.observation_space
            policy = LoadPolicy(flat_obs_space, action_space, pconfig)
            self._sess.run(tf.global_variables_initializer())
            objs = pickle.load(open(load_path, "rb"))
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[policy_name]
            policy.set_weights(weights)

        # for op in tf.get_default_graph().get_operations():
        #     print(str(op.name))

        # These tensor names were found by inspecting the trained model
        if algorithm == "ppo":
            # CRUCIAL FOR SAFETY:
            #   We use Tensor("split") instead of Tensor("add") to force
            #   PPO to be deterministic.
            self._input_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/observation:0"
            )
            self._output_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/split:0"
            )
        elif algorithm == "dqn":
            self._input_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/observations:0"
            )
            self._output_node = tf.argmax(
                self._sess.graph.get_tensor_by_name(
                    f"{policy_name}/value_out/BiasAdd:0"
                ),
                axis=1,
            )
        elif algorithm == "maac":
            self._input_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/policy-inputs:0"
            )
            self._output_node = tf.argmax(
                self._sess.graph.get_tensor_by_name(
                    f"{policy_name}/logits_out/BiasAdd:0"
                ),
                axis=1,
            )
        elif algorithm == "maddpg":
            self._input_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/obs_2:0"
            )
            self._output_node = tf.argmax(
                self._sess.graph.get_tensor_by_name(
                    f"{policy_name}/actor/AGENT_2_actor_RelaxedOneHotCategorical_1/sample/AGENT_2_actor_exp/forward/Exp:0"
                )
            )
        else:
            self._input_node = self._sess.graph.get_tensor_by_name(
                f"{policy_name}/observations:0"
            )
            self._output_node = tf.argmax(
                self._sess.graph.get_tensor_by_name(f"{policy_name}/fc_out/BiasAdd:0"),
                axis=1,
            )
Example 27
 def __init__(self, path_to_model, observation_space):
     self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
     self._path_to_model = path_to_model
Example 28
def before_init(policy, observation_space, action_space, config):
    policy.action_noise_std = config["action_noise_std"]
    policy.action_space_struct = get_base_struct_from_space(action_space)
    policy.preprocessor = ModelCatalog.get_preprocessor_for_space(
        observation_space)
    policy.observation_filter = get_filter(config["observation_filter"],
                                           policy.preprocessor.shape)
    policy.single_threaded = config.get("single_threaded", False)

    def _set_flat_weights(policy, theta):
        pos = 0
        theta_dict = policy.model.state_dict()
        new_theta_dict = {}

        for k in sorted(theta_dict.keys()):
            shape = policy.param_shapes[k]
            num_params = int(np.prod(shape))
            new_theta_dict[k] = torch.from_numpy(
                np.reshape(theta[pos:pos + num_params], shape))
            pos += num_params
        policy.model.load_state_dict(new_theta_dict)

    def _get_flat_weights(policy):
        # Get the parameter tensors.
        theta_dict = policy.model.state_dict()
        # Flatten it into a single np.ndarray.
        theta_list = []
        for k in sorted(theta_dict.keys()):
            theta_list.append(torch.reshape(theta_dict[k], (-1, )))
        cat = torch.cat(theta_list, dim=0)
        return cat.cpu().numpy()

    type(policy).set_flat_weights = _set_flat_weights
    type(policy).get_flat_weights = _get_flat_weights

    def _compute_actions(policy,
                         obs_batch,
                         add_noise=False,
                         update=True,
                         **kwargs):
        # Batch is given as list -> Try converting to numpy first.
        if isinstance(obs_batch, list) and len(obs_batch) == 1:
            obs_batch = obs_batch[0]
        observation = policy.preprocessor.transform(obs_batch)
        observation = policy.observation_filter(observation[None],
                                                update=update)

        observation = convert_to_torch_tensor(observation, policy.device)
        dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: observation}, [],
                                      None)
        dist = policy.dist_class(dist_inputs, policy.model)
        action = dist.sample()

        def _add_noise(single_action, single_action_space):
            single_action = single_action.detach().cpu().numpy()
            if add_noise and isinstance(single_action_space, gym.spaces.Box):
                single_action += np.random.randn(*single_action.shape) * \
                                 policy.action_noise_std
            return single_action

        action = tree.map_structure(_add_noise, action,
                                    policy.action_space_struct)
        action = unbatch(action)
        return action, [], {}

    def _compute_single_action(policy,
                               observation,
                               add_noise=False,
                               update=True,
                               **kwargs):
        action, state_outs, extra_fetches = policy.compute_actions(
            [observation], add_noise=add_noise, update=update, **kwargs)
        return action[0], state_outs, extra_fetches

    type(policy).compute_actions = _compute_actions
    type(policy).compute_single_action = _compute_single_action
Example 29
def generate_policies(
    policy_id: str,
    policy_constructor_tuple: Tuple["PolicyClass", "gym.Space", "gym.Space",
                                    dict],
    policies: Dict[str, TFPolicy],
    policies_to_train: List[str],
    dead_policies: Set[str],
    policy_config: dict,
    preprocessors: Dict[str, Any],
    obs_filters: Dict[str, Any],
    observation_filter: str,
    tf_sess,
):
    """
    Get policies for each ``agent_id``, and instantiate new ones
    for newly created agents.
    """

    policy_cls, obs_space, act_space, conf = policy_constructor_tuple

    if (policy_id in preprocessors) != (policy_id in policies):
        raise ValueError("'preprocessors' and 'policies' do not agree.")
    if (policy_id in obs_filters) != (policy_id in policies):
        raise ValueError("'obs_filters' and 'policies' do not agree.")

    # If we haven't seen this id, we instantiate a new policy.
    if policy_id not in policies:

        # We assume configs are homogeneous.
        # Use a dead policy for this new agent.
        if dead_policies:
            dead_policy_id = dead_policies.pop()
            dead_preprocessor = preprocessors.pop(dead_policy_id)
            dead_obs_space = dead_preprocessor.observation_space
            dead_policy = policies.pop(dead_policy_id)
            dead_obs_filter = obs_filters.pop(dead_policy_id)

            start = time.time()
            # Run variable initializer ops, assuming tf model.
            trainable_model_variables = dead_policy.model.trainable_variables()
            sess = dead_policy.get_session()
            sess.run([var.initializer for var in trainable_model_variables])

            preprocessors[policy_id] = dead_preprocessor
            policies[policy_id] = dead_policy
            obs_filters[policy_id] = dead_obs_filter

            policies_to_train.append(policy_id)
            # DEBUG
            print("sampler.py: Reinitializing dead model: %fs" %
                  (time.time() - start))
        else:
            merged_conf = merge_dicts(policy_config, conf)

            # We assume ``self.preprocessing_enabled == True`` in ``RolloutWorker``.
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[policy_id] = preprocessor
            obs_space = preprocessor.observation_space

            if tf and tf.executing_eagerly():
                if hasattr(policy_cls, "as_eager"):
                    policy_cls = policy_cls.as_eager()
                    if policy_config["eager_tracing"]:
                        policy_cls = policy_cls.with_tracing()
                elif not issubclass(policy_cls, TFPolicy):
                    pass  # could be some other type of policy
                else:
                    raise ValueError("This policy does not support eager "
                                     "execution: {}".format(policy_cls))

            if tf:
                # TODO: Is this necessary? Yes.
                with tf.variable_scope(policy_id):
                    # DEBUG
                    print("sampler.py: Default graph:", tf.get_default_graph())
                    print("sampler.py: Calling policy init.")
                    start = time.time()
                    policies[policy_id] = policy_cls(obs_space, act_space,
                                                     merged_conf)
                # DEBUG
                print("sampler.py: Done policy init: %fs" %
                      (time.time() - start))
                policies_to_train.append(policy_id)
            else:
                policies[policy_id] = policy_cls(obs_space, act_space,
                                                 merged_conf)
                policies_to_train.append(policy_id)

            # DEBUG
            # print("sampler.py: Getting new filter.")
            obs_filters[policy_id] = get_filter(observation_filter,
                                                obs_space.shape)
            # DEBUG
            # print("sampler.py: Got new filter.")
    return policies, preprocessors, obs_filters, policies_to_train, dead_policies
Example 30
    AGENT_ID = "Agent-007"
    env = gym.make(
        "smarts.env:hiway-v0",
        scenarios=scenario_paths,
        agent_specs={AGENT_ID: agent_spec},
        # set headless to False if you want to use Envision
        headless=False,
        visdom=False,
        seed=args.seed,
    )

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    preprocessor = ModelCatalog.get_preprocessor_for_space(OBSERVATION_SPACE)

    state_dim = 0
    for val in OBSERVATION_SPACE.spaces.values():
        state_dim += val.shape[0]
    state_dim = (state_dim, )

    if type(ACTION_SPACE) == gym.spaces.Discrete:
        act_dim = ACTION_SPACE.n
        action_max = 1
    else:
        act_dim = ACTION_SPACE.shape
        action_max = ACTION_SPACE.high[0]
    ppo = core.PPO(state_dim,
                   act_dim,
                   action_max,
Example 31
 def __init__(self, observation_space, action_space, model_config):
     self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
     self.model = TrainingModel(self._prep.observation_space, action_space,
                                num_outputs=3, model_config=model_config,
                                name="Name")