Example #1
    def __init__(self):
        # Game parameters
        self.num_turns = 150
        self.num_units = 100
        self.num_groups = 12
        self.num_nodes = 11
        self.num_actions_per_turn = 7
        self.unit_classes = ['controller', 'striker', 'tank', 'recon']

        # Integers are used to represent the unit type (e.g. 0: controller, 1: striker).
        # With 4 types of units, a group containing all 4 would have the maximum value
        # for this part of the observation space. If each unit is designated by index, then
        # this integer would be 3210.
        self.unit_config_high = 3210

        # Define the action space
        self.action_space = Tuple(
            (Discrete(self.num_groups), Discrete(self.num_nodes + 1)) *
            self.num_actions_per_turn)

        # Define the state space
        self.observation_space = self._build_observation_space()

        return
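A hedged, standalone sketch (assuming Tuple and Discrete come from gym.spaces) of what sampling this action space yields: because the (Discrete, Discrete) pair is repeated seven times, a sample is a flat tuple of 14 integers, i.e. seven (group, node) pairs.

from gym.spaces import Discrete, Tuple

num_groups, num_nodes, num_actions_per_turn = 12, 11, 7
action_space = Tuple(
    (Discrete(num_groups), Discrete(num_nodes + 1)) * num_actions_per_turn)

sample = action_space.sample()                 # flat tuple of 14 integers
assert len(sample) == 2 * num_actions_per_turn
# Re-pair the flat sample into (group, node) tuples, one per action.
actions = list(zip(sample[::2], sample[1::2]))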
Example #2
 def __init__(self, env):
     super(ScaledStateWrapper, self).__init__(env)
     obs = env.observation_space
     self.compound = False
     self.low = None
     self.high = None
     print(type(obs))
     print(obs)
     if isinstance(obs, gym.spaces.Box):
         self.low = env.observation_space.low
         self.high = env.observation_space.high
         self.observation_space = gym.spaces.Box(low=-np.ones(self.low.shape), high=np.ones(self.high.shape),
                                                 dtype=np.float32)
     elif isinstance(obs, Tuple):
         self.low = obs.spaces[0].low
         self.high = obs.spaces[0].high
         assert len(obs.spaces) == 2 and isinstance(obs.spaces[1], gym.spaces.Discrete)
         self.observation_space = Tuple(
             (gym.spaces.Box(low=-np.ones(self.low.shape), high=np.ones(self.high.shape),
                             dtype=np.float32),
              obs.spaces[1]))
         self.compound = False
     else:
         raise Exception("Unsupported observation space type: %s" % self.observation_space)
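The wrapper above only redefines the spaces; the actual rescaling presumably lives in a separate hook. A hedged sketch of the usual affine map from [low, high] to [-1, 1] (the method name and the Tuple handling are assumptions, and note that the Tuple branch above sets self.compound = False, which looks like it was meant to be True):

 def _scale(self, x):
     # Affine map from [self.low, self.high] to [-1, 1].
     return 2.0 * (x - self.low) / (self.high - self.low) - 1.0

 def observation(self, obs):
     # Assumed hook: scale the Box part, pass any Discrete part through.
     if self.compound:
         box_part, discrete_part = obs
         return self._scale(np.asarray(box_part, dtype=np.float32)), discrete_part
     return self._scale(np.asarray(obs, dtype=np.float32))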
Example #3
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-reward", type=float, default=7.0)
parser.add_argument("--stop-timesteps", type=int, default=50000)

if __name__ == "__main__":
    args = parser.parse_args()

    grouping = {
        "group_1": [0, 1],
    }
    obs_space = Tuple([
        Dict({
            "obs": MultiDiscrete([2, 2, 2, 3]),
            ENV_STATE: MultiDiscrete([2, 2, 2])
        }),
        Dict({
            "obs": MultiDiscrete([2, 2, 2, 3]),
            ENV_STATE: MultiDiscrete([2, 2, 2])
        }),
    ])
    act_space = Tuple([
        TwoStepGame.action_space,
        TwoStepGame.action_space,
    ])
    register_env(
        "grouped_twostep", lambda config: TwoStepGame(config).
        with_agent_groups(grouping, obs_space=obs_space, act_space=act_space))

    if args.run == "contrib/MADDPG":
        obs_space_dict = {
            "agent_1": Discrete(6),
Example #4
from gym.utils.seeding import RandomNumberGenerator

spaces = [
    Box(low=np.array(-1.0), high=np.array(1.0), dtype=np.float64),
    Box(low=np.array([0.0]), high=np.array([10.0]), dtype=np.float64),
    Box(low=np.array([-1.0, 0.0, 0.0]),
        high=np.array([1.0, 1.0, 1.0]),
        dtype=np.float64),
    Box(low=np.array([[-1.0, 0.0], [0.0, -1.0]]),
        high=np.ones((2, 2)),
        dtype=np.float64),
    Box(low=0, high=255, shape=(), dtype=np.uint8),
    Box(low=0, high=255, shape=(32, 32, 3), dtype=np.uint8),
    Discrete(2),
    Discrete(5, start=-2),
    Tuple((Discrete(3), Discrete(5))),
    Tuple((
        Discrete(7),
        Box(low=np.array([0.0, -1.0]),
            high=np.array([1.0, 1.0]),
            dtype=np.float64),
    )),
    MultiDiscrete([11, 13, 17]),
    MultiBinary(19),
    Dict({
        "position":
        Discrete(23),
        "velocity":
        Box(low=np.array([0.0]), high=np.array([1.0]), dtype=np.float64),
    }),
    Dict({
Example #5
    def _load_config_params(self):

        # Load Params from the desired Yaml file
        rospkg_path = rospkg.RosPack().get_path("parrot_ardrone_rl")
        config_file_name = "parrotdrone_goto.yaml"
        config_file_path = os.path.join(
            rospkg_path, "scripts/parrot_gym/parrotdrone_tasks/config/" +
            str(config_file_name))
        parameters_list = rosparam.load_file(config_file_path)
        for params, namespace in parameters_list:
            rosparam.upload_params(namespace, params)

        # Continuous action space
        hv_range = rospy.get_param('/parrotdrone/lxy_vel_range')
        vv_range = rospy.get_param('/parrotdrone/lz_vel_range')
        rv_range = rospy.get_param('/parrotdrone/rot_vel_range')

        self.action_low = np.array(
            [-1 * hv_range, -1 * hv_range, -1 * vv_range, -1 * rv_range])
        self.action_high = np.array([hv_range, hv_range, vv_range, rv_range])
        self.action_space = Box(low=self.action_low,
                                high=self.action_high,
                                dtype=np.float32)

        self.reward_range = (-np.inf, np.inf)

        self.init_vel_vec = Twist()
        self.init_vel_vec.linear.x = rospy.get_param(
            '/parrotdrone/init_velocity_vector/linear_x')
        self.init_vel_vec.linear.y = rospy.get_param(
            '/parrotdrone/init_velocity_vector/linear_y')
        self.init_vel_vec.linear.z = rospy.get_param(
            '/parrotdrone/init_velocity_vector/linear_z')
        self.init_vel_vec.angular.x = rospy.get_param(
            '/parrotdrone/init_velocity_vector/angular_x')
        self.init_vel_vec.angular.y = rospy.get_param(
            '/parrotdrone/init_velocity_vector/angular_y')
        self.init_vel_vec.angular.z = rospy.get_param(
            '/parrotdrone/init_velocity_vector/angular_z')

        # Get WorkSpace Cube Dimensions
        self.work_space_x_max = rospy.get_param(
            "/parrotdrone/work_space/x_max")
        self.work_space_x_min = rospy.get_param(
            "/parrotdrone/work_space/x_min")
        self.work_space_y_max = rospy.get_param(
            "/parrotdrone/work_space/y_max")
        self.work_space_y_min = rospy.get_param(
            "/parrotdrone/work_space/y_min")
        self.work_space_z_max = rospy.get_param(
            "/parrotdrone/work_space/z_max")
        self.work_space_z_min = rospy.get_param(
            "/parrotdrone/work_space/z_min")

        # Maximum Quaternion values
        self.max_qw = rospy.get_param("/parrotdrone/max_orientation/w")
        self.max_qx = rospy.get_param("/parrotdrone/max_orientation/x")
        self.max_qy = rospy.get_param("/parrotdrone/max_orientation/y")
        self.max_qz = rospy.get_param("/parrotdrone/max_orientation/z")

        # Maximum velocity values
        self.max_vel_lin_x = rospy.get_param(
            "/parrotdrone/max_velocity_vector/linear_x")
        self.max_vel_lin_y = rospy.get_param(
            "/parrotdrone/max_velocity_vector/linear_y")
        self.max_vel_lin_z = rospy.get_param(
            "/parrotdrone/max_velocity_vector/linear_z")
        self.max_vel_ang_x = rospy.get_param(
            "/parrotdrone/max_velocity_vector/angular_x")
        self.max_vel_ang_y = rospy.get_param(
            "/parrotdrone/max_velocity_vector/angular_y")
        self.max_vel_ang_z = rospy.get_param(
            "/parrotdrone/max_velocity_vector/angular_z")

        # Front camera resolution
        self.front_cam_h = rospy.get_param("/parrotdrone/front_cam_res/height")
        self.front_cam_w = rospy.get_param("/parrotdrone/front_cam_res/width")

        # Desired point to reach
        self.desired_pose = Pose()
        self.desired_pose.position.x = rospy.get_param(
            "/parrotdrone/desired_position/x")
        self.desired_pose.position.y = rospy.get_param(
            "/parrotdrone/desired_position/y")
        self.desired_pose.position.z = rospy.get_param(
            "/parrotdrone/desired_position/z")
        self.desired_pose.orientation.w = rospy.get_param(
            "/parrotdrone/desired_orientation/w")
        self.desired_pose.orientation.x = rospy.get_param(
            "/parrotdrone/desired_orientation/x")
        self.desired_pose.orientation.y = rospy.get_param(
            "/parrotdrone/desired_orientation/y")
        self.desired_pose.orientation.z = rospy.get_param(
            "/parrotdrone/desired_orientation/z")

        self.desired_pose_epsilon = rospy.get_param(
            "/parrotdrone/desired_point_epsilon")

        self.geo_distance = rospy.get_param("/parrotdrone/geodesic_distance")

        self.min_height = rospy.get_param("/parrotdrone/min_height")

        # Maximum and minimum values of the observation vector: position
        # (X, Y, Z), orientation quaternion (W, X, Y, Z) and the
        # linear/angular velocities

        numeric_high = np.array([
            self.work_space_x_max, self.work_space_y_max,
            self.work_space_z_max, self.max_qw, self.max_qx, self.max_qy,
            self.max_qz, self.max_vel_lin_x, self.max_vel_lin_y,
            self.max_vel_lin_z, self.max_vel_ang_x, self.max_vel_ang_y,
            self.max_vel_ang_z
        ])

        numeric_low = np.array([
            self.work_space_x_min, self.work_space_y_min,
            self.work_space_z_min, -1 * self.max_qw, -1 * self.max_qx,
            -1 * self.max_qy, -1 * self.max_qz, -1 * self.max_vel_lin_x,
            -1 * self.max_vel_lin_y, -1 * self.max_vel_lin_z,
            -1 * self.max_vel_ang_x, -1 * self.max_vel_ang_y,
            -1 * self.max_vel_ang_z
        ])

        self.numeric_obs_space = Box(numeric_low,
                                     numeric_high,
                                     dtype=np.float32)
        self.image_obs_space = Box(low=0,
                                   high=255,
                                   shape=(self.front_cam_h, self.front_cam_w,
                                          3),
                                   dtype=np.uint8)
        self.observation_space = Tuple(
            [self.numeric_obs_space, self.image_obs_space])

        # rospy.logdebug("ACTION SPACES TYPE===>"+str(self.action_space))
        # rospy.logdebug("OBSERVATION SPACES TYPE===>" +
        #             str(self.observation_space))

        # Rewards
        self.closer_to_point_reward = rospy.get_param(
            "/parrotdrone/closer_to_point_reward")
        self.not_ending_point_reward = rospy.get_param(
            "/parrotdrone/not_ending_point_reward")
        self.end_episode_points = rospy.get_param(
            "/parrotdrone/end_episode_points")
        self.cumulated_steps = 0.0
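A hedged sanity check (not part of the original task class) that could sit at the end of _load_config_params: sampling the Tuple space returns the 13-element numeric vector and the camera image in the order they were registered.

        sample_numeric, sample_image = self.observation_space.sample()
        assert sample_numeric.shape == (13,)
        assert sample_image.shape == (self.front_cam_h, self.front_cam_w, 3)
        assert sample_image.dtype == np.uint8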
Example #6
 def __init__(self, venv):
     """Init."""
     super().__init__(venv)
     self.observation_space = Tuple(
         [self.observation_space, self.observation_space])
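Since the wrapper doubles the observation space into a two-element Tuple, the matching reset/step presumably return the same observation twice. A minimal sketch (the method names follow the usual gym/VecEnv wrapper pattern and are an assumption here):

 def reset(self):
     obs = self.venv.reset()
     return (obs, obs)          # duplicate to match Tuple([space, space])

 def step(self, action):
     obs, reward, done, info = self.venv.step(action)
     return (obs, obs), reward, done, info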
Example #7
def main():
    args = parser.parse_args()
    config = generate_config(args)

    # env = CityFlowEnvRay(config)
    # eng = cityflow.Engine(config["cityflow_config_file"], thread_num = config["thread_num"])
    # config["eng"] = [eng,]
    # print(config["eng"])
    num_agents = 6
    grouping = {
        "group_1": [id_ for id_ in config["intersection_id"]]
    }
    obs_space = Tuple([
        CityFlowEnvRay.observation_space for _ in range(num_agents)
    ])
    act_space = Tuple([
        CityFlowEnvRay.action_space for _ in range(num_agents)
    ])
    register_env(
        "cityflow_multi",
        lambda config_: CityFlowEnvRay(config_).with_agent_groups(
            grouping, obs_space=obs_space, act_space=act_space))

    if args.algo == "QMIX":
        config_ = {
            # "num_workers": 2,
            "num_gpus_per_worker": 0,
            "sample_batch_size": 20,
            "num_cpus_per_worker": 8,
            "train_batch_size": 32,
            "exploration_final_eps": 0.0,
            "num_workers": 0,
            "mixer": grid_search(["qmix"]),
            "env_config": config
        }
        group = True
    elif args.algo == "APEX_QMIX":
        config_ = {
            "num_gpus": 1,
            "num_workers": 2,
            "optimizer": {
                "num_replay_buffer_shards": 1,
            },
            "min_iter_time_s": 3,
            "buffer_size": 2000,
            "learning_starts": 300,
            "train_batch_size": 64,
            "sample_batch_size": 32,
            "target_network_update_freq": 100,
            "timesteps_per_iteration": 1000,
            "env_config": config
        }
        group = True
    else:
        config_ = {}
        group = False

    ray.init()
    tune.run(
        args.algo,
        stop={
            "timesteps_total": args.epoch * args.num_step
        },
        checkpoint_freq=args.save_freq,
        config=dict(config_,
                    **{"env": "cityflow_multi"}),
    )
Example #8
import json  # note: ujson fails this test due to float equality
import copy

import numpy as np
import pytest

from gym.spaces import Tuple, Box, Discrete, MultiDiscrete, MultiBinary, Dict


@pytest.mark.parametrize(
    "space",
    [
        Discrete(3),
        Discrete(5, start=-2),
        Box(low=0.0, high=np.inf, shape=(2, 2)),
        Tuple([Discrete(5), Discrete(10)]),
        Tuple([
            Discrete(5),
            Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32),
        ]),
        Tuple((Discrete(5), Discrete(2), Discrete(2))),
        Tuple((Discrete(5), Discrete(2, start=6), Discrete(2, start=-4))),
        MultiDiscrete([2, 2, 100]),
        MultiBinary(10),
        Dict({
            "position":
            Discrete(5),
            "velocity":
            Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32),
        }),
    ],
Example #9
                return "high_level_policy"

        config = {
            "env": HierarchicalWindyMazeEnv,
            "num_workers": 0,
            "log_level": "INFO",
            "entropy_coeff": 0.01,
            "multiagent": {
                "policies": {
                    "high_level_policy": (None, maze.observation_space,
                                          Discrete(4), {
                                              "gamma": 0.9
                                          }),
                    "low_level_policy": (None,
                                         Tuple([
                                             maze.observation_space,
                                             Discrete(4)
                                         ]), maze.action_space, {
                                             "gamma": 0.0
                                         }),
                },
                "policy_mapping_fn": function(policy_mapping_fn),
            },
            "framework": "torch" if args.torch else "tf",
        }

        results = tune.run("PPO", stop=stop, config=config)

    if args.as_test:
        check_learning_achieved(results, args.stop_reward)

    ray.shutdown()
Example #10
import json  # note: ujson fails this test due to float equality

import numpy as np
from nose2 import tools

from gym.spaces import Tuple, Box, Discrete, HighLow


@tools.params(
    Discrete(3),
    Tuple([Discrete(5), Discrete(10)]),
    Tuple([Discrete(5), Box(np.array([0, 0]), np.array([1, 5]))]),
    Tuple((Discrete(5), Discrete(2), Discrete(2))),
    HighLow(np.matrix([[0, 1, 0], [0, 1, 0], [0.0, 100.0, 2]])),
)
def test_roundtripping(space):
    sample_1 = space.sample()
    sample_2 = space.sample()
    assert space.contains(sample_1)
    assert space.contains(sample_2)
    json_rep = space.to_jsonable([sample_1, sample_2])

    json_roundtripped = json.loads(json.dumps(json_rep))

    samples_after_roundtrip = space.from_jsonable(json_roundtripped)
    sample_1_prime, sample_2_prime = samples_after_roundtrip

    s1 = space.to_jsonable([sample_1])
    s1p = space.to_jsonable([sample_1_prime])
    s2 = space.to_jsonable([sample_2])
    s2p = space.to_jsonable([sample_2_prime])
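The excerpt stops before the assertions; a round-trip test like this typically closes by comparing the JSON encodings of the originals and the recovered samples (a hedged reconstruction, not the original file's exact lines):

    assert s1 == s1p, "Expected {} to equal {}".format(s1, s1p)
    assert s2 == s2p, "Expected {} to equal {}".format(s2, s2p)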
Example #11
class RockPaperScissors(MultiAgentEnv):
    """
    Two-player environment for the famous rock-paper-scissors game, modified:
    - The two agents alternate: the action of one agent provides the
        state for the next agent. Since one of the players moves first, the
        agent that moves second should learn to always win! The starting
        player is drawn randomly.
    - The action space changes: the game is divided into three rounds, and
        the same action cannot be reused across them.
    """

    # Action/State spaces
    ACTION_SPACE = Discrete(Actions.SIZE)

    OBSERVATION_SPACE = Dict({
        "real_obs": Tuple((
            # First round
            Tuple((Discrete(4), Discrete(4))),

            # Second round
            Tuple((Discrete(4), Discrete(4))),

            # Third round
            Tuple((Discrete(4), Discrete(4))),
        )),

        # we have to handle changing action spaces
        "action_mask": Box(0, 1, shape=(Actions.SIZE,)),
    })

    # Reward mapping
    rewards = {
        (Actions.ROCK, Actions.ROCK): (0, 0),
        (Actions.ROCK, Actions.PAPER): (-1, 1),
        (Actions.ROCK, Actions.SCISSORS): (1, -1),
        (Actions.PAPER, Actions.ROCK): (1, -1),
        (Actions.PAPER, Actions.PAPER): (0, 0),
        (Actions.PAPER, Actions.SCISSORS): (-1, 1),
        (Actions.SCISSORS, Actions.ROCK): (-1, 1),
        (Actions.SCISSORS, Actions.PAPER): (1, -1),
        (Actions.SCISSORS, Actions.SCISSORS): (0, 0),
    }

    def __init__(self, config=None):

        # state and action spaces
        self.action_space = self.ACTION_SPACE
        self.observation_space = self.OBSERVATION_SPACE

        self.players = ["player_1", "player_2"]

    def reset(self):
        self.player_scores = {p: 0 for p in self.players}  # just used to collect the scores
        self.curr_round = 0
        self.player_pointer = random.randint(0, 1)
        self.state = [
            [3, 3],
            [3, 3],
            [3, 3],
        ]

        # reward is given to the previous player with a one-step delay
        self.reward_buffer = {p: 0 for p in self.players}

        # actions cannot be reused within one game, so we keep a mask for each player
        self.action_mask = {p: [1 for _ in range(self.action_space.n)] for p in self.players}

        return {self.players[self.player_pointer]: self.get_state(self.players[self.player_pointer])}

    def step(self, action_dict):
        # Get current player
        curr_player_pointer = self.player_pointer
        curr_player = self.players[self.player_pointer]

        # Get next player
        next_player_pointer = (self.player_pointer + 1) % 2
        next_player = self.players[next_player_pointer]

        # Make sure the action dict contains only the current player's action
        assert curr_player in action_dict and len(action_dict) == 1, \
            "{} should be playing but action {} was received.".format(curr_player, action_dict)

        # Play the action
        curr_action = action_dict[curr_player]
        assert self.action_space.contains(curr_action), 'Action {} is not valid'.format(curr_action)
        assert self.state[self.curr_round][curr_player_pointer] == Actions.NA, \
            "Player {} has already played in round {}. Here the current state: {}".format(
                curr_player_pointer,
                self.curr_round,
                self.state
            )
        assert self.action_mask[curr_player][curr_action] == 1, \
            '{} has already played action {}. State: {}'.format(curr_player, curr_action, self.state)
        self.action_mask[curr_player][curr_action] = 0  # mask out this action
        self.state[self.curr_round][curr_player_pointer] = curr_action

        # We might not be done yet
        done = {"__all__": False}

        # If the next player has already played, the round is done
        game_done = False
        round_done = self.state[self.curr_round][next_player_pointer] != Actions.NA
        if round_done:
            # If the round is done we compute the rewards
            curr_rewards = self.rewards[tuple(self.state[self.curr_round])]
            self.player_scores["player_1"] += curr_rewards[0]
            self.player_scores["player_2"] += curr_rewards[1]
            self.reward_buffer[curr_player] = curr_rewards[curr_player_pointer]

            self.curr_round += 1
            if self.curr_round == 3:
                done = {"__all__": True}
                # Return reward and state for all players
                reward = self.reward_buffer
                obs = {p: self.get_state(p) for p in self.players}
                game_done = True

        # Get the state and reward for the next player
        if not game_done:
            obs = {next_player: self.get_state(next_player)}
            reward = {next_player: self.reward_buffer[next_player]}

        # Move pointer to next player
        self.player_pointer = next_player_pointer
        return obs, reward, done, {}

    def get_state(self, player):
        return {
            'real_obs': self.state,
            'action_mask': self.action_mask[player]
        }
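A hedged driver loop for the environment above (it assumes the random module and the Actions enum from the surrounding file, and that Actions.NA is the "not played" placeholder rather than a playable move): the env is turn-based, so each step() call passes a single-entry action dict keyed by whichever player the previous observation addressed.

env = RockPaperScissors()
obs = env.reset()
done = {"__all__": False}
while not done["__all__"]:
    player = next(iter(obs))                 # the player expected to act
    mask = obs[player]["action_mask"]
    # Pick any action the mask still allows, skipping the NA placeholder.
    legal = [a for a, m in enumerate(mask) if m == 1 and a != Actions.NA]
    obs, reward, done, info = env.step({player: random.choice(legal)})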
Example #12
    def test_multi_action_distribution(self):
        """Tests the MultiActionDistribution (across all frameworks)."""
        batch_size = 1000
        input_space = Tuple([
            Box(-10.0, 10.0, shape=(batch_size, 4)),
            Box(-2.0, 2.0, shape=(
                batch_size,
                6,
            )),
            Dict({"a": Box(-1.0, 1.0, shape=(batch_size, 4))}),
        ])
        std_space = Box(-0.05, 0.05, shape=(
            batch_size,
            3,
        ))

        low, high = -1.0, 1.0
        value_space = Tuple([
            Box(0, 3, shape=(batch_size, ), dtype=np.int32),
            Box(-2.0, 2.0, shape=(batch_size, 3), dtype=np.float32),
            Dict({"a": Box(0.0, 1.0, shape=(batch_size, 2), dtype=np.float32)})
        ])

        for fw, sess in framework_iterator(session=True):
            if fw == "torch":
                cls = TorchMultiActionDistribution
                child_distr_cls = [
                    TorchCategorical, TorchDiagGaussian,
                    partial(TorchBeta, low=low, high=high)
                ]
            else:
                cls = MultiActionDistribution
                child_distr_cls = [
                    Categorical,
                    DiagGaussian,
                    partial(Beta, low=low, high=high),
                ]

            inputs = list(input_space.sample())
            distr = cls(np.concatenate([inputs[0], inputs[1], inputs[2]["a"]],
                                       axis=1),
                        model={},
                        action_space=value_space,
                        child_distributions=child_distr_cls,
                        input_lens=[4, 6, 4])

            # Adjust inputs for the Beta distr just as Beta itself does.
            inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                     -np.log(SMALL_NUMBER))
            inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
            # Sample deterministically.
            expected_det = [
                np.argmax(inputs[0], axis=-1),
                inputs[1][:, :3],  # [:3]=Mean values.
                # Mean for a Beta distribution:
                # 1 / [1 + (beta/alpha)] * range + low
                (1.0 /
                 (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) *
                (high - low) + low,
            ]
            out = distr.deterministic_sample()
            if sess:
                out = sess.run(out)
            check(out[0], expected_det[0])
            check(out[1], expected_det[1])
            check(out[2]["a"], expected_det[2])

            # Stochastic sampling -> expect roughly the mean.
            inputs = list(input_space.sample())
            # Fix categorical inputs (not needed for distribution itself, but
            # for our expectation calculations).
            inputs[0] = softmax(inputs[0], -1)
            # Fix std inputs (shouldn't be too large for this test).
            inputs[1][:, 3:] = std_space.sample()
            # Adjust inputs for the Beta distr just as Beta itself does.
            inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                     -np.log(SMALL_NUMBER))
            inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
            distr = cls(np.concatenate([inputs[0], inputs[1], inputs[2]["a"]],
                                       axis=1),
                        model={},
                        action_space=value_space,
                        child_distributions=child_distr_cls,
                        input_lens=[4, 6, 4])
            expected_mean = [
                np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)),
                inputs[1][:, :3],  # [:3]=Mean values.
                # Mean for a Beta distribution:
                # 1 / [1 + (beta/alpha)] * range + low
                (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) *
                (high - low) + low,
            ]
            out = distr.sample()
            if sess:
                out = sess.run(out)
            out = list(out)
            if fw == "torch":
                out[0] = out[0].numpy()
                out[1] = out[1].numpy()
                out[2]["a"] = out[2]["a"].numpy()
            check(np.mean(out[0]), expected_mean[0], decimals=1)
            check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1)
            check(np.mean(out[2]["a"], 0),
                  np.mean(expected_mean[2], 0),
                  decimals=1)

            # Test log-likelihood outputs.
            # Make sure beta-values are within 0.0 and 1.0 for the numpy
            # calculation (which doesn't have scaling).
            inputs = list(input_space.sample())
            # Adjust inputs for the Beta distr just as Beta itself does.
            inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                     -np.log(SMALL_NUMBER))
            inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
            distr = cls(np.concatenate([inputs[0], inputs[1], inputs[2]["a"]],
                                       axis=1),
                        model={},
                        action_space=value_space,
                        child_distributions=child_distr_cls,
                        input_lens=[4, 6, 4])
            inputs[0] = softmax(inputs[0], -1)
            values = list(value_space.sample())
            log_prob_beta = np.log(
                beta.pdf(values[2]["a"], inputs[2]["a"][:, :2],
                         inputs[2]["a"][:, 2:]))
            # Now do the up-scaling for [2] (beta values) to be between
            # low/high.
            values[2]["a"] = values[2]["a"] * (high - low) + low
            inputs[1][:, 3:] = np.exp(inputs[1][:, 3:])
            expected_log_llh = np.sum(
                np.concatenate([
                    np.expand_dims(
                        np.log(
                            [i[values[0][j]]
                             for j, i in enumerate(inputs[0])]), -1),
                    np.log(
                        norm.pdf(values[1], inputs[1][:, :3],
                                 inputs[1][:, 3:])), log_prob_beta
                ], -1), -1)

            values[0] = np.expand_dims(values[0], -1)
            if fw == "torch":
                values = tree.map_structure(lambda s: torch.Tensor(s), values)
            # Test all flattened input.
            concat = np.concatenate(tree.flatten(values),
                                    -1).astype(np.float32)
            out = distr.logp(concat)
            if sess:
                out = sess.run(out)
            check(out, expected_log_llh, atol=15)
            # Test structured input.
            out = distr.logp(values)
            if sess:
                out = sess.run(out)
            check(out, expected_log_llh, atol=15)
            # Test flattened input.
            out = distr.logp(tree.flatten(values))
            if sess:
                out = sess.run(out)
            check(out, expected_log_llh, atol=15)
Example #13
    def test_sac_compilation(self):
        """Tests whether an SACTrainer can be built with all frameworks."""
        config = sac.DEFAULT_CONFIG.copy()
        config["Q_model"] = sac.DEFAULT_CONFIG["Q_model"].copy()
        config["num_workers"] = 0  # Run locally.
        config["n_step"] = 3
        config["twin_q"] = True
        config["clip_actions"] = False
        config["normalize_actions"] = True
        config["learning_starts"] = 0
        config["prioritized_replay"] = True
        config["rollout_fragment_length"] = 10
        config["train_batch_size"] = 10
        # If we use default buffer size (1e6), the buffer will take up
        # 169.445 GB memory, which is beyond travis-ci's current (Mar 19, 2021)
        # available system memory (8.34816 GB).
        config["buffer_size"] = 40000
        # Test with saved replay buffer.
        config["store_buffer_in_checkpoints"] = True
        num_iterations = 1

        ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel)
        ModelCatalog.register_custom_model("batch_norm_torch",
                                           TorchBatchNormModel)

        image_space = Box(-1.0, 1.0, shape=(84, 84, 3))
        simple_space = Box(-1.0, 1.0, shape=(3, ))

        for fw in framework_iterator(config):
            # Test for different env types (discrete w/ and w/o image, + cont).
            for env in [
                    RandomEnv,
                    "MsPacmanNoFrameskip-v4",
                    "CartPole-v0",
            ]:
                print("Env={}".format(env))
                if env == RandomEnv:
                    config["env_config"] = {
                        "observation_space":
                        Tuple((simple_space, Discrete(2), image_space)),
                        "action_space":
                        Box(-1.0, 1.0, shape=(1, )),
                    }
                else:
                    config["env_config"] = {}
                # Test making the Q-model a custom one for CartPole, otherwise,
                # use the default model.
                config["Q_model"]["custom_model"] = "batch_norm{}".format(
                    "_torch" if fw ==
                    "torch" else "") if env == "CartPole-v0" else None
                trainer = sac.SACTrainer(config=config, env=env)
                for i in range(num_iterations):
                    results = trainer.train()
                    print(results)
                check_compute_single_action(trainer)

                # Test, whether the replay buffer is saved along with
                # a checkpoint (no point in doing it for all frameworks since
                # this is framework agnostic).
                if fw == "tf" and env == "CartPole-v0":
                    checkpoint = trainer.save()
                    new_trainer = sac.SACTrainer(config, env=env)
                    new_trainer.restore(checkpoint)
                    # Get some data from the buffer and compare.
                    data = trainer.local_replay_buffer.replay_buffers[
                        "default_policy"]._storage[:42 + 42]
                    new_data = new_trainer.local_replay_buffer.replay_buffers[
                        "default_policy"]._storage[:42 + 42]
                    check(data, new_data)
                    new_trainer.stop()

                trainer.stop()
Example #14
            self.agent_2: global_rew / 2.0
        }
        obs = {self.agent_1: self.state, self.agent_2: self.state + 3}
        dones = {"__all__": done}
        infos = {}
        return obs, rewards, dones, infos


if __name__ == "__main__":
    args = parser.parse_args()

    grouping = {
        "group_1": [0, 1],
    }
    obs_space = Tuple([
        TwoStepGame.observation_space,
        TwoStepGame.observation_space,
    ])
    act_space = Tuple([
        TwoStepGame.action_space,
        TwoStepGame.action_space,
    ])
    register_env(
        "grouped_twostep", lambda config: TwoStepGame(config).
        with_agent_groups(grouping, obs_space=obs_space, act_space=act_space))

    if args.run == "contrib/MADDPG":
        obs_space_dict = {
            "agent_1": TwoStepGame.observation_space,
            "agent_2": TwoStepGame.observation_space,
        }
        act_space_dict = {
Example #15
    config = {
        "env": "ray.rllib.examples.env.random_env.RandomEnv",
        "env_config": {
            "config": {
                "observation_space":
                Dict({
                    "a":
                    Discrete(2),
                    "b":
                    Dict({
                        "ba": Discrete(3),
                        "bb": Box(-1.0, 1.0, (2, 3), dtype=np.float32),
                    }),
                    "c":
                    Tuple((MultiDiscrete([2, 3]), Discrete(2))),
                    "d":
                    Box(-1.0, 1.0, (2, ), dtype=np.int32),
                }),
            },
        },
        # Set this to True to enforce no preprocessors being used.
        # Complex observations now arrive directly in the model as
        # structures of batches, e.g. {"a": tensor, "b": [tensor, tensor]}
        # for obs-space=Dict(a=..., b=Tuple(..., ...)).
        "_disable_preprocessor_api": True,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", 0)),
        "framework": args.framework,
    }
Example #16
    def step(self, action_dict):
        if self.state > 0:
            assert action_dict["agent_1"] == self.avail, \
                "Failed to obey available actions mask!"
        self.state += 1
        rewards = {"agent_1": 1}
        obs = {"agent_1": {"obs": 0, "action_mask": self.action_mask}}
        dones = {"__all__": self.state > 20}
        return obs, rewards, dones, {}


if __name__ == "__main__":
    grouping = {
        "group_1": ["agent_1"],  # trivial grouping for testing
    }
    obs_space = Tuple([AvailActionsTestEnv.observation_space])
    act_space = Tuple([AvailActionsTestEnv.action_space])
    register_env(
        "action_mask_test", lambda config: AvailActionsTestEnv(config).
        with_agent_groups(grouping, obs_space=obs_space, act_space=act_space))

    ray.init()
    agent = QMixTrainer(
        env="action_mask_test",
        config={
            "num_envs_per_worker": 5,  # test with vectorization on
            "env_config": {
                "avail_action": 3,
            },
        })
    for _ in range(5):
Example #17
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--num-cpus", type=int, default=0)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    register_env("NestedSpaceRepeatAfterMeEnv",
                 lambda c: NestedSpaceRepeatAfterMeEnv(c))

    config = {
        "env": "NestedSpaceRepeatAfterMeEnv",
        "env_config": {
            "space": Dict({
                "a": Tuple(
                    [Dict({
                        "d": Box(-10.0, 10.0, ()),
                        "e": Discrete(2)
                    })]),
                "b": Box(-10.0, 10.0, (2, )),
                "c": Discrete(4)
            }),
        },
        "entropy_coeff": 0.00005,  # We don't want high entropy in this Env.
        "gamma": 0.0,  # No history in Env (bandit problem).
        "lr": 0.0005,
        "num_envs_per_worker": 20,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_sgd_iter": 4,
        "num_workers": 0,
        "vf_loss_coeff": 0.01,
import json # note: ujson fails this test due to float equality
from copy import copy

import numpy as np
import pytest

from gym.spaces import Tuple, Box, Discrete, MultiDiscrete, MultiBinary, Dict


@pytest.mark.parametrize("space", [
              Discrete(3),
              Tuple([Discrete(5), Discrete(10)]),
              Tuple([Discrete(5), Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32)]),
              Tuple((Discrete(5), Discrete(2), Discrete(2))),
              MultiDiscrete([2, 2, 100]),
              Dict({"position": Discrete(5),
                    "velocity": Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32)}),
              ])
def test_roundtripping(space):
    sample_1 = space.sample()
    sample_2 = space.sample()
    assert space.contains(sample_1)
    assert space.contains(sample_2)
    json_rep = space.to_jsonable([sample_1, sample_2])

    json_roundtripped = json.loads(json.dumps(json_rep))

    samples_after_roundtrip = space.from_jsonable(json_roundtripped)
    sample_1_prime, sample_2_prime = samples_after_roundtrip

    s1 = space.to_jsonable([sample_1])
Example #19
 def action_space(self):
     first = Discrete(
         2 - self.state[0]
     )  # For state value 0, we have 2 choices 0 and 1. For state value 1 we have only one choice.
     second = Discrete(2 - self.state[1])
     return Tuple((first, second))
Example #20
 def __init__(self):
     self.action_space = Discrete(4)
     self.world = np.zeros((4, 12))
     self.world[3, 1:-1] = CliffWalking.CLIFF
     self.observation_space = Tuple((Discrete(self.world.shape[0]), Discrete(self.world.shape[1])))
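A hedged usage note (assuming the CliffWalking.CLIFF constant referenced above is defined elsewhere in the class): the Tuple observation is simply a (row, column) pair on the 4x12 grid.

env = CliffWalking()
row, col = env.observation_space.sample()    # e.g. (2, 7)
assert 0 <= row < 4 and 0 <= col < 12
assert env.observation_space.contains((row, col))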
Example #21
def run(args, cl_args):
    obs_space_fake = Dict({'obs': Discrete(1)})
    n_signals = args.discrete_env_config.n_signals
    nfsp_env_cls = CoordinationEnvPerfectInfo
    sims_env_cls = CoordinationSignalerImperfectInfo

    # NFSP environment
    sample_env = nfsp_env_cls(args.env_config)
    obs_space_defender = sample_env.observation_space
    obs_space_attacker = sample_env.observation_space
    action_space_defender = sample_env.action_space
    action_space_attacker = sample_env.action_space

    register_env("nfsp_env", lambda _: sample_env)

    # SIMS environment
    signaled_sample_env = sims_env_cls(args.discrete_env_config)
    obs_space_signaled = signaled_sample_env.observation_space_signaled
    team_obs_space = Tuple([obs_space_signaled, obs_space_signaled])
    team_act_space = Tuple([action_space_defender, action_space_defender])
    groups = {
        "team": ["t1", "t2"],
        "signaler": ["signaler"],
        "opponent": ["opponent"]
    }
    grouped_env_eval = sims_env_cls(
        args.discrete_env_config).with_agent_groups(groups)
    grouped_env_eval.seed = seed_fn
    args.discrete_env_config.groups = groups
    args.env_config.groups = {
        "t1": ["t1"],
        "t2": ["t2"],
        "opponent": ["opponent"]
    }
    register_env("team_env", lambda _: grouped_env_eval)
    train_config = {
        "n_train_signals": n_signals,
        "use_exec_api": True,
        "seed": tune.sample_from(lambda x: np.random.randint(10000)),
        "env": CoordWrapperClass,
        "env_config": args.env_config,
        "rollout_fragment_length": args.env_config.horizon,
        "timesteps_per_iteration": 1,
        "batch_mode": "complete_episodes",
        "num_workers": 1,
        "num_envs_per_worker": 1,
        "train_batch_size": args.train_batch_size,
        "multiagent": {
            "policies": {
                "policy_team": (SIMSPolicy, team_obs_space, team_act_space, {
                    "train_obs_space":
                    Tuple([obs_space_defender] * 2)
                }),
                "policy_opp": (NFSPPolicy, Tuple([obs_space_attacker]),
                               Tuple([action_space_attacker]), {}),
                "policy_signaler":
                (LearnableSignalerPolicy, Tuple([obs_space_fake]),
                 Tuple([Discrete(n_signals)]), {}),
                "policy_t1": (NFSPPolicy, Tuple([obs_space_defender]),
                              Tuple([action_space_defender]), {
                                  "test_obs_keys": ["obs"],
                                  "train_obs_keys": ["obs"],
                              }),
            },
            "policies_to_train": ["policy_t1", "policy_opp", "policy_team"],
            "policy_mapping_fn": select_policy,
        },
        "callbacks": {
            "on_episode_start": on_episode_start,
            # "on_episode_step": on_episode_step,
            # "on_episode_end": on_episode_end,
            # "on_sample_end": on_sample_end,
            # "on_train_result": on_train_results,
            # "on_postprocess_traj": on_postprocess_traj,
        },
        "replay_train_every": 10,
        "reservoir_train_every": 66,
        "reservoir_train_every_sims": 66,
        "lr": 1e-3,
        "beta": 0.1,
        "clip_actions": False,
        "replay_buffer_size": int(2e4),
        "reservoir_buffer_size": int(1e5),
        "framework": "torch",
        "recurrent_dqn": False,
        "evaluation_interval": 100,
        "evaluation_num_episodes": 100,
        "evaluation_config": {
            'env_config': args.discrete_env_config,
            'anticipatory_param': 0.
        },
        "custom_eval_function": eval_function_sims,
        "log_stats": True,
        "logger_config": {
            "wandb": {
                "project": "sims_coordgame",
                "api_key_file": "/home/coordination/wandb_api",
                "log_config": False
            }
        },
    }

    ray.init(log_to_driver=False, local_mode=False)

    # Build loggers
    DEFAULT_DIR = "~/ray_results/coord_game"
    tune.run(
        SIMSiNFSPTrainer,
        config=train_config,
        local_dir=DEFAULT_DIR,
        stop={"timesteps_total": 3e6},
        checkpoint_at_end=True,
        num_samples=cl_args.num_samples,
        loggers=DEFAULT_LOGGERS + (WandbLogger, ),
    )

    ray.shutdown()
Example #22
def load_agent_config(args):
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    config = get_rllib_config(result_dir)
    pkl = get_rllib_pkl(result_dir)

    # check if we have a multiagent scenario but in a
    # backwards compatible way
    if config.get('multiagent', {}).get('policy_graphs', {}):
        multiagent = True
        config['multiagent'] = pkl['multiagent']
    else:
        multiagent = False

    single_env = env_creator(pkl['env_config'])
    env_name = pkl['env']
    # Create and register a gym+rllib env
    obs_space = Tuple(
        [single_env.observation_space for _ in range(single_env.num_agents)])
    act_space = Tuple(
        [single_env.action_space for _ in range(single_env.num_agents)])

    grouping = {
        "group_1": [f"agent-{i}" for i in range(single_env.num_agents)]
    }

    register_env(
        env_name, lambda env_config: env_creator(env_config).with_agent_groups(
            grouping, obs_space=obs_space, act_space=act_space))

    ModelCatalog.register_custom_model("conv_to_fc_net", ConvToFCNet)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if (args.run and config_run):
        if (args.run != config_run):
            print('visualizer_rllib.py: error: run argument ' +
                  '\'{}\' passed in '.format(args.run) +
                  'differs from the one stored in params.json ' +
                  '\'{}\''.format(config_run))
            sys.exit(1)
    if (args.run):
        agent_cls = get_agent_class(args.run)
    elif (config_run):
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    # Run on only one cpu for rendering purposes if possible; A3C requires two
    if config_run == 'A3C':
        config['num_workers'] = 1
        config["sample_async"] = False
    else:
        config['num_workers'] = 0

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    print('Loading checkpoint', checkpoint)
    agent.restore(checkpoint)
    return agent, config
Example #23
import gym
from gym.spaces import Box, Discrete, Tuple
from gym.envs.registration import EnvSpec
import numpy as np

import ray
from ray.rllib.agents.agent import get_agent_class
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.tune.registry import register_env

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
    "simple_tuple": Tuple([
        Box(0.0, 1.0, (5,), dtype=np.float32),
        Box(0.0, 1.0, (5,), dtype=np.float32)]),
    "implicit_tuple": [
        Box(0.0, 1.0, (5,), dtype=np.float32),
        Box(0.0, 1.0, (5,), dtype=np.float32)],
}

OBSERVATION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
    "image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32),
    "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
    "atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32),
    "simple_tuple": Tuple([
        Box(0.0, 1.0, (5,), dtype=np.float32),
        Box(0.0, 1.0, (5,), dtype=np.float32)]),
Example #24
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.offline.json_reader import JsonReader
from ray.rllib.utils.test_utils import framework_iterator

SPACES = {
    "dict":
    Dict({
        "a":
        Dict({
            "aa": Box(-1.0, 1.0, shape=(3, )),
            "ab": MultiDiscrete([4, 3]),
        }),
        "b":
        Discrete(3),
        "c":
        Tuple([Box(0, 10, (2, ), dtype=np.int32),
               Discrete(2)]),
        "d":
        Box(0, 3, (), dtype=np.int64),
    }),
    "tuple":
    Tuple([
        Tuple([
            Box(-1.0, 1.0, shape=(2, )),
            Discrete(3),
        ]),
        MultiDiscrete([4, 3]),
        Dict({
            "a": Box(0, 100, (), dtype=np.int32),
            "b": Discrete(2),
        }),
    ]),
Example #25
 def __init__(self):
     self.observation_space = Tuple(
         [Discrete(5),
          Box(0, 5, shape=(3, ), dtype=np.float32)])
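For reference, a hedged note on what this mixed Tuple yields when sampled: a plain Python tuple whose first element is an integer from the Discrete sub-space and whose second is a float32 vector from the Box sub-space.

import numpy as np
from gym.spaces import Box, Discrete, Tuple

space = Tuple([Discrete(5), Box(0, 5, shape=(3, ), dtype=np.float32)])
idx, vec = space.sample()
assert 0 <= idx < 5
assert vec.shape == (3,) and vec.dtype == np.float32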
Example #26
    def get_policy_configs_for_game(game_name):

        # The RLlib server must know about the Spaces that the Client will be
        # using inside Unity3D, up-front.
        obs_spaces = {
            # 3DBall.
            "3DBall":
            Box(float("-inf"), float("inf"), (8, )),
            # 3DBallHard.
            "3DBallHard":
            Box(float("-inf"), float("inf"), (45, )),
            # SoccerStrikersVsGoalie.
            "Goalie":
            Box(float("-inf"), float("inf"), (738, )),
            "Striker":
            Tuple([
                Box(float("-inf"), float("inf"), (231, )),
                Box(float("-inf"), float("inf"), (63, )),
            ]),
            # Tennis.
            "Tennis":
            Box(float("-inf"), float("inf"), (27, )),
            # VisualHallway.
            "VisualHallway":
            Box(float("-inf"), float("inf"), (84, 84, 3)),
            # Walker.
            "Walker":
            Box(float("-inf"), float("inf"), (212, )),
        }
        action_spaces = {
            # 3DBall.
            "3DBall": Box(float("-inf"), float("inf"), (2, ),
                          dtype=np.float32),
            # 3DBallHard.
            "3DBallHard": Box(float("-inf"),
                              float("inf"), (2, ),
                              dtype=np.float32),
            # SoccerStrikersVsGoalie.
            "Goalie": MultiDiscrete([3, 3, 3]),
            "Striker": MultiDiscrete([3, 3, 3]),
            # Tennis.
            "Tennis": Box(float("-inf"), float("inf"), (3, )),
            # VisualHallway.
            "VisualHallway": MultiDiscrete([5]),
            # Walker.
            "Walker": Box(float("-inf"), float("inf"), (39, )),
        }

        # Policies (Unity: "behaviors") and agent-to-policy mapping fns.
        if game_name == "SoccerStrikersVsGoalie":
            policies = {
                "Goalie":
                (None, obs_spaces["Goalie"], action_spaces["Goalie"], {}),
                "Striker":
                (None, obs_spaces["Striker"], action_spaces["Striker"], {}),
            }

            def policy_mapping_fn(agent_id):
                return "Striker" if "Striker" in agent_id else "Goalie"

        else:
            policies = {
                game_name:
                (None, obs_spaces[game_name], action_spaces[game_name], {}),
            }

            def policy_mapping_fn(agent_id):
                return game_name

        return policies, policy_mapping_fn
Example #27
     }),
     (6, ),
 ),
 (
     Dict({
         "key1": Box(shape=(2, 3), low=-1, high=1, dtype=np.float32),
         "key2": Box(shape=(), low=-1, high=1, dtype=np.float32),
         "key3": Box(shape=(2, ), low=-1, high=1, dtype=np.float32),
     }),
     (9, ),
 ),
 (
     Dict({
         "key1":
         Tuple((
             Box(shape=(2, ), low=-1, high=1, dtype=np.float32),
             Box(shape=(2, ), low=-1, high=1, dtype=np.float32),
         )),
         "key2":
         Box(shape=(), low=-1, high=1, dtype=np.float32),
         "key3":
         Box(shape=(2, ), low=-1, high=1, dtype=np.float32),
     }),
     (7, ),
 ),
 (
     Dict({
         "key1":
         Tuple((Box(shape=(2, ), low=-1, high=1, dtype=np.float32), )),
         "key2":
         Box(shape=(), low=-1, high=1, dtype=np.float32),
         "key3":
Example #28
parser.add_argument("--stop", type=int, default=20)

N_TOPICS = 15
TOPICS = ['T{}'.format(i) for i in range(N_TOPICS)]

CONTEXT_ATTRIBUTES = {
    'hour': ['0-7', '8-9', '10-12', '13-14', '15-18', '19-21', '22-23'],
    'week period': ['Weekday', 'Weekend'],
    'weather': ['Sunny', 'Cloudy', 'Raining'],
    'device': ['mobile ios', 'mac ios', 'mobile android', 'windows'],
}

OBSERVATION_0 = len(CONTEXT_ATTRIBUTES) * [0] + N_TOPICS * [0]

OBSERVATION_SPACE = Tuple((Discrete(7),
                           Discrete(2),
                           Discrete(3),
                           Discrete(4),
                           Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2),
                           Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2),
                           Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2)
                           ))

# ACTION_SPACE = Box(low=0.0, high=1.0, shape=(N_TOPICS,))

ACTION_SPACE = Tuple( (Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2),
                       Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2),
                       Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2)
                       ))

# Probability of a user click based on the distance between article topics
PROBAB = N_TOPICS*[0]
PROBAB[1:8] = [0.2, 0.5, 0.7, 0.4, 0.3, 0.2, 0.1]
Example #29
    def __init__(self,
                 rules_file,
                 leaf_threshold=16,
                 max_cuts_per_dimension=5,
                 max_actions_per_episode=5000,
                 max_depth=100,
                 partition_mode=None,
                 reward_shape="linear",
                 depth_weight=1.0,
                 dump_dir=None,
                 zero_obs=False):

        self.reward_shape = {
            "linear": lambda x: x,
            "log": lambda x: np.log(x),
        }[reward_shape]
        self.zero_obs = zero_obs

        assert partition_mode in [None, "simple", "efficuts", "cutsplit"]
        self.partition_enabled = partition_mode == "simple"
        if partition_mode in ["efficuts", "cutsplit"]:
            self.force_partition = partition_mode
        else:
            self.force_partition = False

        self.dump_dir = dump_dir and os.path.expanduser(dump_dir)
        if self.dump_dir:
            try:
                os.makedirs(self.dump_dir)
            except:
                pass
        self.best_time = float("inf")
        self.best_space = float("inf")

        self.depth_weight = depth_weight
        self.rules_file = rules_file
        self.rules = load_rules_from_file(rules_file)
        self.leaf_threshold = leaf_threshold
        self.max_actions_per_episode = max_actions_per_episode
        self.max_depth = max_depth
        self.num_actions = None
        self.tree = None
        self.node_map = None
        self.child_map = None
        self.max_cuts_per_dimension = max_cuts_per_dimension
        if self.partition_enabled:
            self.num_part_levels = NUM_PART_LEVELS
        else:
            self.num_part_levels = 0
        self.action_space = Tuple([
            Discrete(NUM_DIMENSIONS),
            Discrete(max_cuts_per_dimension + self.num_part_levels)
        ])
        self.observation_space = Dict({
            "real_obs":
            Box(0, 1, (279, ), dtype=np.float32),
            "action_mask":
            Box(0,
                1, (NUM_DIMENSIONS + max_cuts_per_dimension +
                    self.num_part_levels, ),
                dtype=np.float32),
        })
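A hedged sketch of how an action for this space decomposes (names follow the constructor above; NUM_DIMENSIONS and NUM_PART_LEVELS are assumed to be module-level constants, as the code implies):

cut_dimension, cut_or_partition = self.action_space.sample()
# cut_dimension picks one of the NUM_DIMENSIONS rule dimensions to cut on;
# cut_or_partition indexes either a cut count (< max_cuts_per_dimension) or,
# when partitioning is enabled, one of the extra NUM_PART_LEVELS partition levels.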
Example #30
ACTION_SPACES_TO_TEST = {
    "discrete":
    Discrete(5),
    "vector":
    Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "vector2":
    Box(-1.0, 1.0, (
        5,
        5,
    ), dtype=np.float32),
    "multidiscrete":
    MultiDiscrete([1, 2, 3, 4]),
    "tuple":
    Tuple([Discrete(2),
           Discrete(3),
           Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
}

OBSERVATION_SPACES_TO_TEST = {
    "discrete":
    Discrete(5),
    "vector":
    Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "vector2":
    Box(-1.0, 1.0, (5, 5), dtype=np.float32),
    "image":
    Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32),
    "atari":
    Box(-1.0, 1.0, (210, 160, 3), dtype=np.float32),
    "tuple":