def __init__(self):
    # Game parameters
    self.num_turns = 150
    self.num_units = 100
    self.num_groups = 12
    self.num_nodes = 11
    self.num_actions_per_turn = 7
    self.unit_classes = ['controller', 'striker', 'tank', 'recon']
    # Integers are used to represent the unit type (e.g. 0: controller,
    # 1: striker). With 4 types of units, a group containing all 4 would
    # have the maximum value for this part of the observation space. If
    # each unit is designated by index, then this integer would be 3210.
    self.unit_config_high = 3210

    # Define the action space
    self.action_space = Tuple(
        (Discrete(self.num_groups),
         Discrete(self.num_nodes + 1)) * self.num_actions_per_turn)

    # Define the state space
    self.observation_space = self._build_observation_space()
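# Hedged usage sketch: the (group, node) pair above is repeated seven
# times, so a sample from the action space is a flat 14-tuple that has
# to be re-paired when decoding actions. The literal sizes mirror the
# parameters set in __init__ (num_groups=12, num_nodes + 1 = 12).
from gym.spaces import Discrete, Tuple

action_space = Tuple((Discrete(12), Discrete(12)) * 7)
sample = action_space.sample()
pairs = [(sample[i], sample[i + 1]) for i in range(0, len(sample), 2)]
assert len(pairs) == 7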
def __init__(self, env):
    super(ScaledStateWrapper, self).__init__(env)
    obs = env.observation_space
    self.compound = False
    self.low = None
    self.high = None
    print(type(obs))
    print(obs)
    if isinstance(obs, gym.spaces.Box):
        self.low = obs.low
        self.high = obs.high
        self.observation_space = gym.spaces.Box(
            low=-np.ones(self.low.shape),
            high=np.ones(self.high.shape),
            dtype=np.float32)
    elif isinstance(obs, Tuple):
        assert len(obs.spaces) == 2 and \
            isinstance(obs.spaces[1], gym.spaces.Discrete)
        self.low = obs.spaces[0].low
        self.high = obs.spaces[0].high
        self.observation_space = Tuple(
            (gym.spaces.Box(
                low=-np.ones(self.low.shape),
                high=np.ones(self.high.shape),
                dtype=np.float32), obs.spaces[1]))
        self.compound = True  # (Box, Discrete) compound observation.
    else:
        raise Exception("Unsupported observation space type: %s" % obs)
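# Hedged sketch (not shown in the original snippet): a wrapper like the
# one above typically rescales raw observations with this linear map
# from [low, high] onto [-1, 1]. The helper name is illustrative.
import numpy as np

def scale_observation(obs, low, high):
    # Element-wise: low -> -1.0, high -> +1.0.
    return 2.0 * (obs - low) / (high - low) - 1.0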
parser.add_argument("--as-test", action="store_true") parser.add_argument("--torch", action="store_true") parser.add_argument("--stop-reward", type=float, default=7.0) parser.add_argument("--stop-timesteps", type=int, default=50000) if __name__ == "__main__": args = parser.parse_args() grouping = { "group_1": [0, 1], } obs_space = Tuple([ Dict({ "obs": MultiDiscrete([2, 2, 2, 3]), ENV_STATE: MultiDiscrete([2, 2, 2]) }), Dict({ "obs": MultiDiscrete([2, 2, 2, 3]), ENV_STATE: MultiDiscrete([2, 2, 2]) }), ]) act_space = Tuple([ TwoStepGame.action_space, TwoStepGame.action_space, ]) register_env( "grouped_twostep", lambda config: TwoStepGame(config). with_agent_groups(grouping, obs_space=obs_space, act_space=act_space)) if args.run == "contrib/MADDPG": obs_space_dict = { "agent_1": Discrete(6),
from gym.utils.seeding import RandomNumberGenerator

spaces = [
    Box(low=np.array(-1.0), high=np.array(1.0), dtype=np.float64),
    Box(low=np.array([0.0]), high=np.array([10.0]), dtype=np.float64),
    Box(low=np.array([-1.0, 0.0, 0.0]),
        high=np.array([1.0, 1.0, 1.0]),
        dtype=np.float64),
    Box(low=np.array([[-1.0, 0.0], [0.0, -1.0]]),
        high=np.ones((2, 2)),
        dtype=np.float64),
    Box(low=0, high=255, shape=(), dtype=np.uint8),
    Box(low=0, high=255, shape=(32, 32, 3), dtype=np.uint8),
    Discrete(2),
    Discrete(5, start=-2),
    Tuple((Discrete(3), Discrete(5))),
    Tuple((
        Discrete(7),
        Box(low=np.array([0.0, -1.0]), high=np.array([1.0, 1.0]),
            dtype=np.float64),
    )),
    MultiDiscrete([11, 13, 17]),
    MultiBinary(19),
    Dict({
        "position": Discrete(23),
        "velocity": Box(low=np.array([0.0]), high=np.array([1.0]),
                        dtype=np.float64),
    }),
    Dict({
def _load_config_params(self):
    # Load params from the desired YAML file.
    rospkg_path = rospkg.RosPack().get_path("parrot_ardrone_rl")
    config_file_name = "parrotdrone_goto.yaml"
    config_file_path = os.path.join(
        rospkg_path,
        "scripts/parrot_gym/parrotdrone_tasks/config/" + str(config_file_name))
    parameters_list = rosparam.load_file(config_file_path)
    for params, namespace in parameters_list:
        rosparam.upload_params(namespace, params)

    # Continuous action space
    hv_range = rospy.get_param('/parrotdrone/lxy_vel_range')
    vv_range = rospy.get_param('/parrotdrone/lz_vel_range')
    rv_range = rospy.get_param('/parrotdrone/rot_vel_range')
    self.action_low = np.array(
        [-1 * hv_range, -1 * hv_range, -1 * vv_range, -1 * rv_range])
    self.action_high = np.array([hv_range, hv_range, vv_range, rv_range])
    self.action_space = Box(
        low=self.action_low, high=self.action_high, dtype=np.float32)
    self.reward_range = (-np.inf, np.inf)

    self.init_vel_vec = Twist()
    self.init_vel_vec.linear.x = rospy.get_param(
        '/parrotdrone/init_velocity_vector/linear_x')
    self.init_vel_vec.linear.y = rospy.get_param(
        '/parrotdrone/init_velocity_vector/linear_y')
    self.init_vel_vec.linear.z = rospy.get_param(
        '/parrotdrone/init_velocity_vector/linear_z')
    self.init_vel_vec.angular.x = rospy.get_param(
        '/parrotdrone/init_velocity_vector/angular_x')
    self.init_vel_vec.angular.y = rospy.get_param(
        '/parrotdrone/init_velocity_vector/angular_y')
    self.init_vel_vec.angular.z = rospy.get_param(
        '/parrotdrone/init_velocity_vector/angular_z')

    # Get workspace cube dimensions.
    self.work_space_x_max = rospy.get_param("/parrotdrone/work_space/x_max")
    self.work_space_x_min = rospy.get_param("/parrotdrone/work_space/x_min")
    self.work_space_y_max = rospy.get_param("/parrotdrone/work_space/y_max")
    self.work_space_y_min = rospy.get_param("/parrotdrone/work_space/y_min")
    self.work_space_z_max = rospy.get_param("/parrotdrone/work_space/z_max")
    self.work_space_z_min = rospy.get_param("/parrotdrone/work_space/z_min")

    # Maximum quaternion values
    self.max_qw = rospy.get_param("/parrotdrone/max_orientation/w")
    self.max_qx = rospy.get_param("/parrotdrone/max_orientation/x")
    self.max_qy = rospy.get_param("/parrotdrone/max_orientation/y")
    self.max_qz = rospy.get_param("/parrotdrone/max_orientation/z")

    # Maximum velocity values
    self.max_vel_lin_x = rospy.get_param(
        "/parrotdrone/max_velocity_vector/linear_x")
    self.max_vel_lin_y = rospy.get_param(
        "/parrotdrone/max_velocity_vector/linear_y")
    self.max_vel_lin_z = rospy.get_param(
        "/parrotdrone/max_velocity_vector/linear_z")
    self.max_vel_ang_x = rospy.get_param(
        "/parrotdrone/max_velocity_vector/angular_x")
    self.max_vel_ang_y = rospy.get_param(
        "/parrotdrone/max_velocity_vector/angular_y")
    self.max_vel_ang_z = rospy.get_param(
        "/parrotdrone/max_velocity_vector/angular_z")

    # Front camera resolution
    self.front_cam_h = rospy.get_param("/parrotdrone/front_cam_res/height")
    self.front_cam_w = rospy.get_param("/parrotdrone/front_cam_res/width")

    # Get the desired goal pose.
    self.desired_pose = Pose()
    self.desired_pose.position.x = rospy.get_param(
        "/parrotdrone/desired_position/x")
    self.desired_pose.position.y = rospy.get_param(
        "/parrotdrone/desired_position/y")
    self.desired_pose.position.z = rospy.get_param(
        "/parrotdrone/desired_position/z")
    self.desired_pose.orientation.w = rospy.get_param(
        "/parrotdrone/desired_orientation/w")
    self.desired_pose.orientation.x = rospy.get_param(
        "/parrotdrone/desired_orientation/x")
    self.desired_pose.orientation.y = rospy.get_param(
        "/parrotdrone/desired_orientation/y")
    self.desired_pose.orientation.z = rospy.get_param(
        "/parrotdrone/desired_orientation/z")
    self.desired_pose_epsilon = rospy.get_param(
        "/parrotdrone/desired_point_epsilon")
    self.geo_distance = rospy.get_param("/parrotdrone/geodesic_distance")
    self.min_height = rospy.get_param("/parrotdrone/min_height")

    # Bounds for the numeric observation: pose (x, y, z, quaternion
    # w/x/y/z) followed by linear and angular velocities.
    numeric_high = np.array([
        self.work_space_x_max, self.work_space_y_max, self.work_space_z_max,
        self.max_qw, self.max_qx, self.max_qy, self.max_qz,
        self.max_vel_lin_x, self.max_vel_lin_y, self.max_vel_lin_z,
        self.max_vel_ang_x, self.max_vel_ang_y, self.max_vel_ang_z
    ])
    numeric_low = np.array([
        self.work_space_x_min, self.work_space_y_min, self.work_space_z_min,
        -1 * self.max_qw, -1 * self.max_qx, -1 * self.max_qy,
        -1 * self.max_qz, -1 * self.max_vel_lin_x, -1 * self.max_vel_lin_y,
        -1 * self.max_vel_lin_z, -1 * self.max_vel_ang_x,
        -1 * self.max_vel_ang_y, -1 * self.max_vel_ang_z
    ])
    self.numeric_obs_space = Box(numeric_low, numeric_high, dtype=np.float32)
    self.image_obs_space = Box(
        low=0,
        high=255,
        shape=(self.front_cam_h, self.front_cam_w, 3),
        dtype=np.uint8)
    self.observation_space = Tuple(
        [self.numeric_obs_space, self.image_obs_space])

    # rospy.logdebug("ACTION SPACES TYPE===>" + str(self.action_space))
    # rospy.logdebug("OBSERVATION SPACES TYPE===>" +
    #                str(self.observation_space))

    # Rewards
    self.closer_to_point_reward = rospy.get_param(
        "/parrotdrone/closer_to_point_reward")
    self.not_ending_point_reward = rospy.get_param(
        "/parrotdrone/not_ending_point_reward")
    self.end_episode_points = rospy.get_param(
        "/parrotdrone/end_episode_points")

    self.cumulated_steps = 0.0
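# Hedged sketch of an observation consistent with the Tuple space built
# above: a 13-dim numeric vector (pose + velocities) paired with an RGB
# front-camera frame. The camera resolution literals are illustrative.
import numpy as np

example_obs = (
    np.zeros(13, dtype=np.float32),
    np.zeros((360, 640, 3), dtype=np.uint8),  # (front_cam_h, front_cam_w, 3)
)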
def __init__(self, venv):
    """Duplicate the wrapped env's observation space into a 2-tuple."""
    super().__init__(venv)
    self.observation_space = Tuple(
        [self.observation_space, self.observation_space])
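# Hedged sketch of the matching reset(): a wrapper that doubles its
# observation space, as above, has to return a pair of observations.
# Duplicating the single underlying observation (and the `venv`
# attribute from the base wrapper) are assumptions, not source code.
def reset(self):
    obs = self.venv.reset()
    return (obs, obs)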
def main():
    args = parser.parse_args()
    config = generate_config(args)
    # env = CityFlowEnvRay(config)
    # eng = cityflow.Engine(config["cityflow_config_file"],
    #                       thread_num=config["thread_num"])
    # config["eng"] = [eng, ]
    # print(config["eng"])
    num_agents = 6
    grouping = {
        "group_1": [id_ for id_ in config["intersection_id"]]
    }
    obs_space = Tuple([
        CityFlowEnvRay.observation_space for _ in range(num_agents)
    ])
    act_space = Tuple([
        CityFlowEnvRay.action_space for _ in range(num_agents)
    ])
    register_env(
        "cityflow_multi",
        lambda config_: CityFlowEnvRay(config_).with_agent_groups(
            grouping, obs_space=obs_space, act_space=act_space))

    if args.algo == "QMIX":
        config_ = {
            # "num_workers": 2,
            "num_gpus_per_worker": 0,
            "sample_batch_size": 20,
            "num_cpus_per_worker": 8,
            "train_batch_size": 32,
            "exploration_final_eps": 0.0,
            "num_workers": 0,
            "mixer": grid_search(["qmix"]),
            "env_config": config,
        }
        group = True
    elif args.algo == "APEX_QMIX":
        config_ = {
            "num_gpus": 1,
            "num_workers": 2,
            "optimizer": {
                "num_replay_buffer_shards": 1,
            },
            "min_iter_time_s": 3,
            "buffer_size": 2000,
            "learning_starts": 300,
            "train_batch_size": 64,
            "sample_batch_size": 32,
            "target_network_update_freq": 100,
            "timesteps_per_iteration": 1000,
            "env_config": config,
        }
        group = True
    else:
        config_ = {}
        group = False

    ray.init()
    tune.run(
        args.algo,
        stop={
            "timesteps_total": args.epoch * args.num_step,
        },
        checkpoint_freq=args.save_freq,
        config=dict(config_, **{"env": "cityflow_multi"}),
    )
import json  # note: ujson fails this test due to float equality
import copy

import numpy as np
import pytest

from gym.spaces import Tuple, Box, Discrete, MultiDiscrete, MultiBinary, Dict


@pytest.mark.parametrize(
    "space",
    [
        Discrete(3),
        Discrete(5, start=-2),
        Box(low=0.0, high=np.inf, shape=(2, 2)),
        Tuple([Discrete(5), Discrete(10)]),
        Tuple([
            Discrete(5),
            Box(low=np.array([0, 0]), high=np.array([1, 5]),
                dtype=np.float32),
        ]),
        Tuple((Discrete(5), Discrete(2), Discrete(2))),
        Tuple((Discrete(5), Discrete(2, start=6), Discrete(2, start=-4))),
        MultiDiscrete([2, 2, 100]),
        MultiBinary(10),
        Dict({
            "position": Discrete(5),
            "velocity": Box(low=np.array([0, 0]), high=np.array([1, 5]),
                            dtype=np.float32),
        }),
    ],
return "high_level_policy" config = { "env": HierarchicalWindyMazeEnv, "num_workers": 0, "log_level": "INFO", "entropy_coeff": 0.01, "multiagent": { "policies": { "high_level_policy": (None, maze.observation_space, Discrete(4), { "gamma": 0.9 }), "low_level_policy": (None, Tuple([ maze.observation_space, Discrete(4) ]), maze.action_space, { "gamma": 0.0 }), }, "policy_mapping_fn": function(policy_mapping_fn), }, "framework": "torch" if args.torch else "tf", } results = tune.run("PPO", stop=stop, config=config) if args.as_test: check_learning_achieved(results, args.stop_reward) ray.shutdown()
import json  # note: ujson fails this test due to float equality

import numpy as np
from nose2 import tools

from gym.spaces import Tuple, Box, Discrete, HighLow


@tools.params(
    Discrete(3),
    Tuple([Discrete(5), Discrete(10)]),
    Tuple([Discrete(5), Box(np.array([0, 0]), np.array([1, 5]))]),
    Tuple((Discrete(5), Discrete(2), Discrete(2))),
    HighLow(np.matrix([[0, 1, 0], [0, 1, 0], [0.0, 100.0, 2]])),
)
def test_roundtripping(space):
    sample_1 = space.sample()
    sample_2 = space.sample()
    assert space.contains(sample_1)
    assert space.contains(sample_2)
    json_rep = space.to_jsonable([sample_1, sample_2])
    json_roundtripped = json.loads(json.dumps(json_rep))
    samples_after_roundtrip = space.from_jsonable(json_roundtripped)
    sample_1_prime, sample_2_prime = samples_after_roundtrip
    s1 = space.to_jsonable([sample_1])
    s1p = space.to_jsonable([sample_1_prime])
    s2 = space.to_jsonable([sample_2])
    s2p = space.to_jsonable([sample_2_prime])
class RockPaperScissors(MultiAgentEnv):
    """Two-player environment for the classic rock-paper-scissors game,
    modified as follows:

    - The two agents alternate: the action of one agent provides the
      state for the next agent. Because the second mover observes the
      first mover's choice, the agent that moves second should learn to
      always win! The starting player is drawn randomly.
    - The action space changes: the game is divided into three rounds,
      and the same action cannot be reused across rounds.
    """

    # Action/state spaces
    ACTION_SPACE = Discrete(Actions.SIZE)
    OBSERVATION_SPACE = Dict({
        "real_obs": Tuple((
            # First round
            Tuple((Discrete(4), Discrete(4))),
            # Second round
            Tuple((Discrete(4), Discrete(4))),
            # Third round
            Tuple((Discrete(4), Discrete(4))),
        )),
        # We have to handle changing action spaces.
        "action_mask": Box(0, 1, shape=(Actions.SIZE, )),
    })

    # Reward mapping
    rewards = {
        (Actions.ROCK, Actions.ROCK): (0, 0),
        (Actions.ROCK, Actions.PAPER): (-1, 1),
        (Actions.ROCK, Actions.SCISSORS): (1, -1),
        (Actions.PAPER, Actions.ROCK): (1, -1),
        (Actions.PAPER, Actions.PAPER): (0, 0),
        (Actions.PAPER, Actions.SCISSORS): (-1, 1),
        (Actions.SCISSORS, Actions.ROCK): (-1, 1),
        (Actions.SCISSORS, Actions.PAPER): (1, -1),
        (Actions.SCISSORS, Actions.SCISSORS): (0, 0),
    }

    def __init__(self, config=None):
        # State and action spaces
        self.action_space = self.ACTION_SPACE
        self.observation_space = self.OBSERVATION_SPACE
        self.players = ["player_1", "player_2"]

    def reset(self):
        # Just used to collect the scores.
        self.player_scores = {p: 0 for p in self.players}
        self.curr_round = 0
        self.player_pointer = random.randint(0, 1)
        self.state = [
            [3, 3],
            [3, 3],
            [3, 3],
        ]
        # Reward is given to the last player with a one-step delay.
        self.reward_buffer = {p: 0 for p in self.players}
        # Actions cannot be reused within one game; keep a mask per player.
        self.action_mask = {
            p: [1 for _ in range(self.action_space.n)]
            for p in self.players
        }
        return {
            self.players[self.player_pointer]:
                self.get_state(self.players[self.player_pointer])
        }

    def step(self, action_dict):
        # Get current player.
        curr_player_pointer = self.player_pointer
        curr_player = self.players[self.player_pointer]
        # Get next player.
        next_player_pointer = (self.player_pointer + 1) % 2
        next_player = self.players[next_player_pointer]

        # Make sure we have the action only for the current player.
        assert curr_player in action_dict and len(action_dict) == 1, \
            "{} should be playing but action {} was received.".format(
                curr_player, action_dict)

        # Play the action.
        curr_action = action_dict[curr_player]
        assert self.action_space.contains(curr_action), \
            "Action {} is not valid".format(curr_action)
        assert self.state[self.curr_round][curr_player_pointer] == Actions.NA, \
            "Player {} has already played in round {}. Current state: {}".format(
                curr_player_pointer, self.curr_round, self.state)
        assert self.action_mask[curr_player][curr_action] == 1, \
            "{} has already played action {}. State: {}".format(
                curr_player, curr_action, self.state)
        self.action_mask[curr_player][curr_action] = 0  # Mask out this action.
        self.state[self.curr_round][curr_player_pointer] = curr_action

        # We might not be done yet.
        done = {"__all__": False}
        game_done = False
        # If the next player has already played, the round is done.
        round_done = \
            self.state[self.curr_round][next_player_pointer] != Actions.NA
        if round_done:
            # If the round is done, compute the rewards.
            curr_rewards = self.rewards[tuple(self.state[self.curr_round])]
            self.player_scores["player_1"] += curr_rewards[0]
            self.player_scores["player_2"] += curr_rewards[1]
            self.reward_buffer[curr_player] = curr_rewards[curr_player_pointer]
            self.curr_round += 1
            if self.curr_round == 3:
                done = {"__all__": True}
                # Return reward and state for all players.
                reward = self.reward_buffer
                obs = {p: self.get_state(p) for p in self.players}
                game_done = True

        # Get the state and reward for the next player.
        if not game_done:
            obs = {next_player: self.get_state(next_player)}
            reward = {next_player: self.reward_buffer[next_player]}

        # Move pointer to the next player.
        self.player_pointer = next_player_pointer
        return obs, reward, done, {}

    def get_state(self, player):
        return {
            'real_obs': self.state,
            'action_mask': self.action_mask[player],
        }
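# Hedged sketch of the Actions constants the environment above relies
# on (ROCK, PAPER, SCISSORS, NA, SIZE). The exact values are
# assumptions, chosen so that NA == 3 matches the [3, 3] per-round
# initial state used in reset() and SIZE == 3 matches the action mask.
class Actions:
    ROCK = 0
    PAPER = 1
    SCISSORS = 2
    NA = 3    # "not yet played" sentinel stored in the round state
    SIZE = 3  # number of real actions (length of the action mask)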
def test_multi_action_distribution(self):
    """Tests the MultiActionDistribution (across all frameworks)."""
    batch_size = 1000
    input_space = Tuple([
        Box(-10.0, 10.0, shape=(batch_size, 4)),
        Box(-2.0, 2.0, shape=(batch_size, 6)),
        Dict({"a": Box(-1.0, 1.0, shape=(batch_size, 4))}),
    ])
    std_space = Box(-0.05, 0.05, shape=(batch_size, 3))
    low, high = -1.0, 1.0
    value_space = Tuple([
        Box(0, 3, shape=(batch_size, ), dtype=np.int32),
        Box(-2.0, 2.0, shape=(batch_size, 3), dtype=np.float32),
        Dict({"a": Box(0.0, 1.0, shape=(batch_size, 2), dtype=np.float32)}),
    ])

    for fw, sess in framework_iterator(session=True):
        if fw == "torch":
            cls = TorchMultiActionDistribution
            child_distr_cls = [
                TorchCategorical,
                TorchDiagGaussian,
                partial(TorchBeta, low=low, high=high),
            ]
        else:
            cls = MultiActionDistribution
            child_distr_cls = [
                Categorical,
                DiagGaussian,
                partial(Beta, low=low, high=high),
            ]

        inputs = list(input_space.sample())
        distr = cls(
            np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1),
            model={},
            action_space=value_space,
            child_distributions=child_distr_cls,
            input_lens=[4, 6, 4])

        # Adjust inputs for the Beta distr just as Beta itself does.
        inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                 -np.log(SMALL_NUMBER))
        inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0

        # Sample deterministically.
        expected_det = [
            np.argmax(inputs[0], axis=-1),
            inputs[1][:, :3],  # [:3]=Mean values.
            # Mean for a Beta distribution:
            # 1 / [1 + (beta/alpha)] * range + low
            (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) *
            (high - low) + low,
        ]
        out = distr.deterministic_sample()
        if sess:
            out = sess.run(out)
        check(out[0], expected_det[0])
        check(out[1], expected_det[1])
        check(out[2]["a"], expected_det[2])

        # Stochastic sampling -> expect roughly the mean.
        inputs = list(input_space.sample())
        # Fix categorical inputs (not needed for the distribution itself,
        # but for our expectation calculations).
        inputs[0] = softmax(inputs[0], -1)
        # Fix std inputs (shouldn't be too large for this test).
        inputs[1][:, 3:] = std_space.sample()
        # Adjust inputs for the Beta distr just as Beta itself does.
        inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                 -np.log(SMALL_NUMBER))
        inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
        distr = cls(
            np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1),
            model={},
            action_space=value_space,
            child_distributions=child_distr_cls,
            input_lens=[4, 6, 4])
        expected_mean = [
            np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)),
            inputs[1][:, :3],  # [:3]=Mean values.
            # Mean for a Beta distribution:
            # 1 / [1 + (beta/alpha)] * range + low
            (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) *
            (high - low) + low,
        ]
        out = distr.sample()
        if sess:
            out = sess.run(out)
        out = list(out)
        if fw == "torch":
            out[0] = out[0].numpy()
            out[1] = out[1].numpy()
            out[2]["a"] = out[2]["a"].numpy()
        check(np.mean(out[0]), expected_mean[0], decimals=1)
        check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1)
        check(np.mean(out[2]["a"], 0), np.mean(expected_mean[2], 0),
              decimals=1)

        # Test log-likelihood outputs.
        # Make sure beta values are within 0.0 and 1.0 for the numpy
        # calculation (which doesn't have scaling).
        inputs = list(input_space.sample())
        # Adjust inputs for the Beta distr just as Beta itself does.
inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER)) inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 distr = cls(np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), model={}, action_space=value_space, child_distributions=child_distr_cls, input_lens=[4, 6, 4]) inputs[0] = softmax(inputs[0], -1) values = list(value_space.sample()) log_prob_beta = np.log( beta.pdf(values[2]["a"], inputs[2]["a"][:, :2], inputs[2]["a"][:, 2:])) # Now do the up-scaling for [2] (beta values) to be between # low/high. values[2]["a"] = values[2]["a"] * (high - low) + low inputs[1][:, 3:] = np.exp(inputs[1][:, 3:]) expected_log_llh = np.sum( np.concatenate([ np.expand_dims( np.log( [i[values[0][j]] for j, i in enumerate(inputs[0])]), -1), np.log( norm.pdf(values[1], inputs[1][:, :3], inputs[1][:, 3:])), log_prob_beta ], -1), -1) values[0] = np.expand_dims(values[0], -1) if fw == "torch": values = tree.map_structure(lambda s: torch.Tensor(s), values) # Test all flattened input. concat = np.concatenate(tree.flatten(values), -1).astype(np.float32) out = distr.logp(concat) if sess: out = sess.run(out) check(out, expected_log_llh, atol=15) # Test structured input. out = distr.logp(values) if sess: out = sess.run(out) check(out, expected_log_llh, atol=15) # Test flattened input. out = distr.logp(tree.flatten(values)) if sess: out = sess.run(out) check(out, expected_log_llh, atol=15)
def test_sac_compilation(self):
    """Tests whether an SACTrainer can be built with all frameworks."""
    config = sac.DEFAULT_CONFIG.copy()
    config["Q_model"] = sac.DEFAULT_CONFIG["Q_model"].copy()
    config["num_workers"] = 0  # Run locally.
    config["n_step"] = 3
    config["twin_q"] = True
    config["clip_actions"] = False
    config["normalize_actions"] = True
    config["learning_starts"] = 0
    config["prioritized_replay"] = True
    config["rollout_fragment_length"] = 10
    config["train_batch_size"] = 10
    # If we use the default buffer size (1e6), the buffer will take up
    # 169.445 GB of memory, which is beyond travis-ci's current
    # (Mar 19, 2021) available system memory (8.34816 GB).
    config["buffer_size"] = 40000
    # Test with saved replay buffer.
    config["store_buffer_in_checkpoints"] = True
    num_iterations = 1

    ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel)
    ModelCatalog.register_custom_model("batch_norm_torch",
                                       TorchBatchNormModel)

    image_space = Box(-1.0, 1.0, shape=(84, 84, 3))
    simple_space = Box(-1.0, 1.0, shape=(3, ))

    for fw in framework_iterator(config):
        # Test for different env types (discrete w/ and w/o image, + cont).
        for env in [
                RandomEnv,
                "MsPacmanNoFrameskip-v4",
                "CartPole-v0",
        ]:
            print("Env={}".format(env))
            if env == RandomEnv:
                config["env_config"] = {
                    "observation_space": Tuple(
                        (simple_space, Discrete(2), image_space)),
                    "action_space": Box(-1.0, 1.0, shape=(1, )),
                }
            else:
                config["env_config"] = {}
            # Test making the Q-model a custom one for CartPole;
            # otherwise use the default model.
            config["Q_model"]["custom_model"] = "batch_norm{}".format(
                "_torch" if fw == "torch" else ""
            ) if env == "CartPole-v0" else None
            trainer = sac.SACTrainer(config=config, env=env)
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
            check_compute_single_action(trainer)

            # Test whether the replay buffer is saved along with
            # a checkpoint (no point in doing it for all frameworks
            # since this is framework agnostic).
            if fw == "tf" and env == "CartPole-v0":
                checkpoint = trainer.save()
                new_trainer = sac.SACTrainer(config, env=env)
                new_trainer.restore(checkpoint)
                # Get some data from the buffer and compare.
                data = trainer.local_replay_buffer.replay_buffers[
                    "default_policy"]._storage[:42 + 42]
                new_data = new_trainer.local_replay_buffer.replay_buffers[
                    "default_policy"]._storage[:42 + 42]
                check(data, new_data)
                new_trainer.stop()

            trainer.stop()
            self.agent_2: global_rew / 2.0,
        }
        obs = {self.agent_1: self.state, self.agent_2: self.state + 3}
        dones = {"__all__": done}
        infos = {}
        return obs, rewards, dones, infos


if __name__ == "__main__":
    args = parser.parse_args()

    grouping = {
        "group_1": [0, 1],
    }
    obs_space = Tuple([
        TwoStepGame.observation_space,
        TwoStepGame.observation_space,
    ])
    act_space = Tuple([
        TwoStepGame.action_space,
        TwoStepGame.action_space,
    ])
    register_env(
        "grouped_twostep",
        lambda config: TwoStepGame(config).with_agent_groups(
            grouping, obs_space=obs_space, act_space=act_space))

    if args.run == "contrib/MADDPG":
        obs_space_dict = {
            "agent_1": TwoStepGame.observation_space,
            "agent_2": TwoStepGame.observation_space,
        }
        act_space_dict = {
config = {
    "env": "ray.rllib.examples.env.random_env.RandomEnv",
    "env_config": {
        "config": {
            "observation_space": Dict({
                "a": Discrete(2),
                "b": Dict({
                    "ba": Discrete(3),
                    "bb": Box(-1.0, 1.0, (2, 3), dtype=np.float32),
                }),
                "c": Tuple((MultiDiscrete([2, 3]), Discrete(2))),
                "d": Box(-1.0, 1.0, (2, ), dtype=np.int32),
            }),
        },
    },
    # Set this to True to enforce no preprocessors being used.
    # Complex observations now arrive directly in the model as
    # structures of batches, e.g. {"a": tensor, "b": [tensor, tensor]}
    # for obs-space=Dict(a=..., b=Tuple(..., ...)).
    "_disable_preprocessor_api": True,
    # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
    "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", 0)),
    "framework": args.framework,
}
def step(self, action_dict):
    if self.state > 0:
        assert action_dict["agent_1"] == self.avail, \
            "Failed to obey available actions mask!"
    self.state += 1
    rewards = {"agent_1": 1}
    obs = {"agent_1": {"obs": 0, "action_mask": self.action_mask}}
    dones = {"__all__": self.state > 20}
    return obs, rewards, dones, {}


if __name__ == "__main__":
    grouping = {
        "group_1": ["agent_1"],  # trivial grouping for testing
    }
    obs_space = Tuple([AvailActionsTestEnv.observation_space])
    act_space = Tuple([AvailActionsTestEnv.action_space])
    register_env(
        "action_mask_test",
        lambda config: AvailActionsTestEnv(config).with_agent_groups(
            grouping, obs_space=obs_space, act_space=act_space))

    ray.init()

    agent = QMixTrainer(
        env="action_mask_test",
        config={
            "num_envs_per_worker": 5,  # test with vectorization on
            "env_config": {
                "avail_action": 3,
            },
        })

    for _ in range(5):
parser.add_argument("--stop-timesteps", type=int, default=100000) parser.add_argument("--num-cpus", type=int, default=0) if __name__ == "__main__": args = parser.parse_args() ray.init(num_cpus=args.num_cpus or None) register_env("NestedSpaceRepeatAfterMeEnv", lambda c: NestedSpaceRepeatAfterMeEnv(c)) config = { "env": "NestedSpaceRepeatAfterMeEnv", "env_config": { "space": Dict({ "a": Tuple( [Dict({ "d": Box(-10.0, 10.0, ()), "e": Discrete(2) })]), "b": Box(-10.0, 10.0, (2, )), "c": Discrete(4) }), }, "entropy_coeff": 0.00005, # We don't want high entropy in this Env. "gamma": 0.0, # No history in Env (bandit problem). "lr": 0.0005, "num_envs_per_worker": 20, # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), "num_sgd_iter": 4, "num_workers": 0, "vf_loss_coeff": 0.01,
import json  # note: ujson fails this test due to float equality
from copy import copy

import numpy as np
import pytest

from gym.spaces import Tuple, Box, Discrete, MultiDiscrete, MultiBinary, Dict


@pytest.mark.parametrize("space", [
    Discrete(3),
    Tuple([Discrete(5), Discrete(10)]),
    Tuple([
        Discrete(5),
        Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32),
    ]),
    Tuple((Discrete(5), Discrete(2), Discrete(2))),
    MultiDiscrete([2, 2, 100]),
    Dict({
        "position": Discrete(5),
        "velocity": Box(low=np.array([0, 0]), high=np.array([1, 5]),
                        dtype=np.float32),
    }),
])
def test_roundtripping(space):
    sample_1 = space.sample()
    sample_2 = space.sample()
    assert space.contains(sample_1)
    assert space.contains(sample_2)
    json_rep = space.to_jsonable([sample_1, sample_2])
    json_roundtripped = json.loads(json.dumps(json_rep))
    samples_after_roundtrip = space.from_jsonable(json_roundtripped)
    sample_1_prime, sample_2_prime = samples_after_roundtrip
    s1 = space.to_jsonable([sample_1])
def action_space(self):
    # For state value 0 we have two choices (0 and 1); for state value 1
    # we have only one choice.
    first = Discrete(2 - self.state[0])
    second = Discrete(2 - self.state[1])
    return Tuple((first, second))
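# Hedged illustration: with state [0, 1], the method above yields
# Tuple((Discrete(2), Discrete(1))), i.e. the second component
# collapses to a single legal action (0).
from gym.spaces import Discrete, Tuple

state = [0, 1]
space = Tuple((Discrete(2 - state[0]), Discrete(2 - state[1])))
assert space.spaces[1].n == 1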
def __init__(self):
    self.action_space = Discrete(4)
    self.world = np.zeros((4, 12))
    self.world[3, 1:-1] = CliffWalking.CLIFF
    self.observation_space = Tuple((Discrete(self.world.shape[0]),
                                    Discrete(self.world.shape[1])))
def run(args, cl_args):
    obs_space_fake = Dict({"obs": Discrete(1)})
    n_signals = args.discrete_env_config.n_signals
    nfsp_env_cls = CoordinationEnvPerfectInfo
    sims_env_cls = CoordinationSignalerImperfectInfo

    # NFSP environment
    sample_env = nfsp_env_cls(args.env_config)
    obs_space_defender = sample_env.observation_space
    obs_space_attacker = sample_env.observation_space
    action_space_defender = sample_env.action_space
    action_space_attacker = sample_env.action_space
    register_env("nfsp_env", lambda _: sample_env)

    # SIMS environment
    signaled_sample_env = sims_env_cls(args.discrete_env_config)
    obs_space_signaled = signaled_sample_env.observation_space_signaled
    team_obs_space = Tuple([obs_space_signaled, obs_space_signaled])
    team_act_space = Tuple([action_space_defender, action_space_defender])
    groups = {
        "team": ["t1", "t2"],
        "signaler": ["signaler"],
        "opponent": ["opponent"],
    }
    grouped_env_eval = sims_env_cls(
        args.discrete_env_config).with_agent_groups(groups)
    grouped_env_eval.seed = seed_fn
    args.discrete_env_config.groups = groups
    args.env_config.groups = {
        "t1": ["t1"],
        "t2": ["t2"],
        "opponent": ["opponent"],
    }
    register_env("team_env", lambda _: grouped_env_eval)

    train_config = {
        "n_train_signals": n_signals,
        "use_exec_api": True,
        "seed": tune.sample_from(lambda x: np.random.randint(10000)),
        "env": CoordWrapperClass,
        "env_config": args.env_config,
        "rollout_fragment_length": args.env_config.horizon,
        "timesteps_per_iteration": 1,
        "batch_mode": "complete_episodes",
        "num_workers": 1,
        "num_envs_per_worker": 1,
        "train_batch_size": args.train_batch_size,
        "multiagent": {
            "policies": {
                "policy_team": (SIMSPolicy, team_obs_space, team_act_space, {
                    "train_obs_space": Tuple([obs_space_defender] * 2),
                }),
                "policy_opp": (NFSPPolicy, Tuple([obs_space_attacker]),
                               Tuple([action_space_attacker]), {}),
                "policy_signaler": (LearnableSignalerPolicy,
                                    Tuple([obs_space_fake]),
                                    Tuple([Discrete(n_signals)]), {}),
                "policy_t1": (NFSPPolicy, Tuple([obs_space_defender]),
                              Tuple([action_space_defender]), {
                                  "test_obs_keys": ["obs"],
                                  "train_obs_keys": ["obs"],
                              }),
            },
            "policies_to_train": ["policy_t1", "policy_opp", "policy_team"],
            "policy_mapping_fn": select_policy,
        },
        "callbacks": {
            "on_episode_start": on_episode_start,
            # "on_episode_step": on_episode_step,
            # "on_episode_end": on_episode_end,
            # "on_sample_end": on_sample_end,
            # "on_train_result": on_train_results,
            # "on_postprocess_traj": on_postprocess_traj,
        },
        "replay_train_every": 10,
        "reservoir_train_every": 66,
        "reservoir_train_every_sims": 66,
        "lr": 1e-3,
        "beta": 0.1,
        "clip_actions": False,
        "replay_buffer_size": int(2e4),
        "reservoir_buffer_size": int(1e5),
        "framework": "torch",
        "recurrent_dqn": False,
        "evaluation_interval": 100,
        "evaluation_num_episodes": 100,
        "evaluation_config": {
            "env_config": args.discrete_env_config,
            "anticipatory_param": 0.,
        },
        "custom_eval_function": eval_function_sims,
        "log_stats": True,
        "logger_config": {
            "wandb": {
                "project": "sims_coordgame",
                "api_key_file": "/home/coordination/wandb_api",
                "log_config": False,
            },
        },
    }

    ray.init(log_to_driver=False, local_mode=False)

    # Build loggers
    DEFAULT_DIR = "~/ray_results/coord_game"
    tune.run(
        SIMSiNFSPTrainer,
        config=train_config,
        local_dir=DEFAULT_DIR,
        stop={"timesteps_total": 3e6},
        checkpoint_at_end=True,
        num_samples=cl_args.num_samples,
        loggers=DEFAULT_LOGGERS + (WandbLogger, ),
    )
    ray.shutdown()
def load_agent_config(args):
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    config = get_rllib_config(result_dir)
    pkl = get_rllib_pkl(result_dir)

    # Check if we have a multi-agent scenario, but in a
    # backwards-compatible way.
    if config.get('multiagent', {}).get('policy_graphs', {}):
        multiagent = True
        config['multiagent'] = pkl['multiagent']
    else:
        multiagent = False

    single_env = env_creator(pkl['env_config'])
    env_name = pkl['env']

    # Create and register a gym+rllib env.
    obs_space = Tuple(
        [single_env.observation_space for _ in range(single_env.num_agents)])
    act_space = Tuple(
        [single_env.action_space for _ in range(single_env.num_agents)])
    grouping = {
        "group_1": [f"agent-{i}" for i in range(single_env.num_agents)]
    }
    register_env(
        env_name,
        lambda env_config: env_creator(env_config).with_agent_groups(
            grouping, obs_space=obs_space, act_space=act_space))
    ModelCatalog.register_custom_model("conv_to_fc_net", ConvToFCNet)

    # Determine agent and checkpoint.
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if args.run and config_run:
        if args.run != config_run:
            print('visualizer_rllib.py: error: run argument ' +
                  '\'{}\' passed in '.format(args.run) +
                  'differs from the one stored in params.json ' +
                  '\'{}\''.format(config_run))
            sys.exit(1)
    if args.run:
        agent_cls = get_agent_class(args.run)
    elif config_run:
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    # Run on only one CPU for rendering purposes if possible; A3C requires two.
    if config_run == 'A3C':
        config['num_workers'] = 1
        config["sample_async"] = False
    else:
        config['num_workers'] = 0

    # Create the agent that will be used to compute the actions.
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    print('Loading checkpoint', checkpoint)
    agent.restore(checkpoint)
    return agent, config
import gym
from gym.spaces import Box, Discrete, Tuple
from gym.envs.registration import EnvSpec
import numpy as np

import ray
from ray.rllib.agents.agent import get_agent_class
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.tune.registry import register_env

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
    "simple_tuple": Tuple([
        Box(0.0, 1.0, (5, ), dtype=np.float32),
        Box(0.0, 1.0, (5, ), dtype=np.float32),
    ]),
    "implicit_tuple": [
        Box(0.0, 1.0, (5, ), dtype=np.float32),
        Box(0.0, 1.0, (5, ), dtype=np.float32),
    ],
}

OBSERVATION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
    "image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32),
    "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
    "atari_ram": Box(0.0, 1.0, (128, ), dtype=np.float32),
    "simple_tuple": Tuple([
        Box(0.0, 1.0, (5, ), dtype=np.float32),
        Box(0.0, 1.0, (5, ), dtype=np.float32),
    ]),
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.offline.json_reader import JsonReader
from ray.rllib.utils.test_utils import framework_iterator

SPACES = {
    "dict": Dict({
        "a": Dict({
            "aa": Box(-1.0, 1.0, shape=(3, )),
            "ab": MultiDiscrete([4, 3]),
        }),
        "b": Discrete(3),
        "c": Tuple([Box(0, 10, (2, ), dtype=np.int32), Discrete(2)]),
        "d": Box(0, 3, (), dtype=np.int64),
    }),
    "tuple": Tuple([
        Tuple([
            Box(-1.0, 1.0, shape=(2, )),
            Discrete(3),
        ]),
        MultiDiscrete([4, 3]),
        Dict({
            "a": Box(0, 100, (), dtype=np.int32),
            "b": Discrete(2),
        }),
    ]),
def __init__(self):
    self.observation_space = Tuple(
        [Discrete(5), Box(0, 5, shape=(3, ), dtype=np.float32)])
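# Hedged usage sketch: samples from a Tuple space are plain Python
# tuples, one entry per sub-space, and contains() checks them per part.
import numpy as np
from gym.spaces import Box, Discrete, Tuple

space = Tuple([Discrete(5), Box(0, 5, shape=(3, ), dtype=np.float32)])
idx, vec = space.sample()
assert space.contains((idx, vec))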
def get_policy_configs_for_game(game_name):
    # The RLlib server must know about the Spaces that the Client will be
    # using inside Unity3D, up-front.
    obs_spaces = {
        # 3DBall.
        "3DBall": Box(float("-inf"), float("inf"), (8, )),
        # 3DBallHard.
        "3DBallHard": Box(float("-inf"), float("inf"), (45, )),
        # SoccerStrikersVsGoalie.
        "Goalie": Box(float("-inf"), float("inf"), (738, )),
        "Striker": Tuple([
            Box(float("-inf"), float("inf"), (231, )),
            Box(float("-inf"), float("inf"), (63, )),
        ]),
        # Tennis.
        "Tennis": Box(float("-inf"), float("inf"), (27, )),
        # VisualHallway.
        "VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)),
        # Walker.
        "Walker": Box(float("-inf"), float("inf"), (212, )),
    }
    action_spaces = {
        # 3DBall.
        "3DBall": Box(float("-inf"), float("inf"), (2, ), dtype=np.float32),
        # 3DBallHard.
        "3DBallHard": Box(float("-inf"), float("inf"), (2, ),
                          dtype=np.float32),
        # SoccerStrikersVsGoalie.
        "Goalie": MultiDiscrete([3, 3, 3]),
        "Striker": MultiDiscrete([3, 3, 3]),
        # Tennis.
        "Tennis": Box(float("-inf"), float("inf"), (3, )),
        # VisualHallway.
        "VisualHallway": MultiDiscrete([5]),
        # Walker.
        "Walker": Box(float("-inf"), float("inf"), (39, )),
    }

    # Policies (Unity: "behaviors") and agent-to-policy mapping fns.
    if game_name == "SoccerStrikersVsGoalie":
        policies = {
            "Goalie": (None, obs_spaces["Goalie"], action_spaces["Goalie"],
                       {}),
            "Striker": (None, obs_spaces["Striker"],
                        action_spaces["Striker"], {}),
        }

        def policy_mapping_fn(agent_id):
            return "Striker" if "Striker" in agent_id else "Goalie"

    else:
        policies = {
            game_name: (None, obs_spaces[game_name],
                        action_spaces[game_name], {}),
        }

        def policy_mapping_fn(agent_id):
            return game_name

    return policies, policy_mapping_fn
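# Hedged usage sketch; the agent-id strings are illustrative only.
policies, mapping_fn = get_policy_configs_for_game("SoccerStrikersVsGoalie")
assert mapping_fn("Striker_0") == "Striker"
assert mapping_fn("Goalie_0") == "Goalie"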
        }),
        (6, ),
    ),
    (
        Dict({
            "key1": Box(shape=(2, 3), low=-1, high=1, dtype=np.float32),
            "key2": Box(shape=(), low=-1, high=1, dtype=np.float32),
            "key3": Box(shape=(2, ), low=-1, high=1, dtype=np.float32),
        }),
        (9, ),
    ),
    (
        Dict({
            "key1": Tuple((
                Box(shape=(2, ), low=-1, high=1, dtype=np.float32),
                Box(shape=(2, ), low=-1, high=1, dtype=np.float32),
            )),
            "key2": Box(shape=(), low=-1, high=1, dtype=np.float32),
            "key3": Box(shape=(2, ), low=-1, high=1, dtype=np.float32),
        }),
        (7, ),
    ),
    (
        Dict({
            "key1": Tuple((Box(shape=(2, ), low=-1, high=1,
                               dtype=np.float32), )),
            "key2": Box(shape=(), low=-1, high=1, dtype=np.float32),
            "key3":
parser.add_argument("--stop", type=int, default=20) N_TOPICS = 15 TOPICS = ['T{}'.format(i) for i in range(N_TOPICS)] CONTEXT_ATTRIBUTES = {'hour':['0-7', '8-9', '10-12','13-14','15-18','19-21','22-23'], 'week period': ['Weekday', 'Weekend'], 'weather': ['Sunny','Cloudy','Raining'], 'device':['mobile ios','mac ios','mobile android','windows']} OBSERVATION_0 = len(CONTEXT_ATTRIBUTES) * [0] + N_TOPICS * [0] OBSERVATION_SPACE = Tuple((Discrete(7), Discrete(2), Discrete(3), Discrete(4), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2) )) # ACTION_SPACE = Box(low=0.0, high=1.0, shape=(N_TOPICS,)) ACTION_SPACE = Tuple( (Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2) )) # Probability of a user click based on the distance bwteen article topics PROBAB = N_TOPICS*[0] PROBAB[1:8] = [0.2, 0.5, 0.7, 0.4, 0.3, 0.2, 0.1]
def __init__(self,
             rules_file,
             leaf_threshold=16,
             max_cuts_per_dimension=5,
             max_actions_per_episode=5000,
             max_depth=100,
             partition_mode=None,
             reward_shape="linear",
             depth_weight=1.0,
             dump_dir=None,
             zero_obs=False):
    self.reward_shape = {
        "linear": lambda x: x,
        "log": lambda x: np.log(x),
    }[reward_shape]
    self.zero_obs = zero_obs
    assert partition_mode in [None, "simple", "efficuts", "cutsplit"]
    self.partition_enabled = partition_mode == "simple"
    if partition_mode in ["efficuts", "cutsplit"]:
        self.force_partition = partition_mode
    else:
        self.force_partition = False
    self.dump_dir = dump_dir and os.path.expanduser(dump_dir)
    if self.dump_dir:
        os.makedirs(self.dump_dir, exist_ok=True)
    self.best_time = float("inf")
    self.best_space = float("inf")
    self.depth_weight = depth_weight
    self.rules_file = rules_file
    self.rules = load_rules_from_file(rules_file)
    self.leaf_threshold = leaf_threshold
    self.max_actions_per_episode = max_actions_per_episode
    self.max_depth = max_depth
    self.num_actions = None
    self.tree = None
    self.node_map = None
    self.child_map = None
    self.max_cuts_per_dimension = max_cuts_per_dimension
    if self.partition_enabled:
        self.num_part_levels = NUM_PART_LEVELS
    else:
        self.num_part_levels = 0
    self.action_space = Tuple([
        Discrete(NUM_DIMENSIONS),
        Discrete(max_cuts_per_dimension + self.num_part_levels),
    ])
    self.observation_space = Dict({
        "real_obs": Box(0, 1, (279, ), dtype=np.float32),
        "action_mask": Box(
            0, 1,
            (NUM_DIMENSIONS + max_cuts_per_dimension + self.num_part_levels, ),
            dtype=np.float32),
    })
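# Hedged sketch of how an "action_mask" observation like the one above
# is typically consumed downstream: logits of masked-out actions are
# pushed to a large negative value before sampling. The helper name is
# illustrative, not part of the original environment.
import numpy as np

def mask_logits(logits, action_mask):
    # Keep logits where mask == 1; drown the rest so their softmax
    # probability is effectively zero.
    return np.where(action_mask > 0, logits, -1e10)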
ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32),
    "multidiscrete": MultiDiscrete([1, 2, 3, 4]),
    "tuple": Tuple(
        [Discrete(2),
         Discrete(3),
         Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
}

OBSERVATION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32),
    "image": Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32),
    "atari": Box(-1.0, 1.0, (210, 160, 3), dtype=np.float32),
    "tuple":