import tree  # pip install dm_tree
import unittest

import numpy as np
from gym.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple

import ray
from ray.rllib.algorithms.bc import BC
from ray.rllib.algorithms.pg import PG, DEFAULT_CONFIG
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.offline.json_reader import JsonReader
from ray.rllib.utils.test_utils import framework_iterator

SPACES = {
    "dict": Dict({
        "a": Dict({
            "aa": Box(-1.0, 1.0, shape=(3,)),
            "ab": MultiDiscrete([4, 3]),
        }),
        "b": Discrete(3),
        "c": Tuple([Box(0, 10, (2,), dtype=np.int32), Discrete(2)]),
        "d": Box(0, 3, (), dtype=np.int64),
    }),
    "tuple": Tuple([
        Tuple([
            Box(-1.0, 1.0, shape=(2,)),
            Discrete(3),
        ]),
    ]),
}
def check_support(alg, config, stats, check_bounds=False, name=None):
    covered_a = set()
    covered_o = set()
    config["log_level"] = "ERROR"
    first_error = None
    torch = config.get("use_pytorch", False)
    for a_name, action_space in ACTION_SPACES_TO_TEST.items():
        for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items():
            print("=== Testing {} (torch={}) A={} S={} ===".format(
                alg, torch, action_space, obs_space))
            config.update(
                dict(
                    env_config=dict(
                        action_space=action_space,
                        observation_space=obs_space,
                        reward_space=Box(1.0, 1.0, shape=(), dtype=np.float32),
                        p_done=1.0,
                        check_action_bounds=check_bounds)))
            stat = "ok"
            a = None
            try:
                if a_name in covered_a and o_name in covered_o:
                    stat = "skip"  # Speed up tests by avoiding the full grid.
                else:
                    a = get_agent_class(alg)(config=config, env=RandomEnv)
                    if alg not in ["DDPG", "ES", "ARS", "SAC"]:
                        if o_name in ["atari", "image"]:
                            if torch:
                                assert isinstance(a.get_policy().model,
                                                  TorchVisionNetV2)
                            else:
                                assert isinstance(a.get_policy().model,
                                                  VisionNetV2)
                        elif o_name in ["vector", "vector2"]:
                            if torch:
                                assert isinstance(a.get_policy().model,
                                                  TorchFCNetV2)
                            else:
                                assert isinstance(a.get_policy().model,
                                                  FCNetV2)
                    a.train()
                    covered_a.add(a_name)
                    covered_o.add(o_name)
            except UnsupportedSpaceException:
                stat = "unsupported"
            except Exception as e:
                stat = "ERROR"
                print(e)
                print(traceback.format_exc())
                first_error = first_error if first_error is not None else e
            finally:
                if a:
                    try:
                        a.stop()
                    except Exception as e:
                        print("Ignoring error stopping agent", e)
            print(stat)
            print()
            stats[name or alg, a_name, o_name] = stat
    # If anything went wrong, raise the first error encountered.
    if first_error is not None:
        raise first_error
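# A minimal usage sketch for check_support (hypothetical: the test-driver
# function name and the config keys shown here are illustrative assumptions,
# not taken from the original file).
def run_ppo_space_grid():
    stats = {}
    check_support("PPO", {"num_workers": 0, "train_batch_size": 100}, stats)
    for (alg, a_name, o_name), stat in sorted(stats.items()):
        print("{} action={} obs={}: {}".format(alg, a_name, o_name, stat))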
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    self.original_space = (obs_space.original_space if hasattr(
        obs_space, "original_space") else obs_space)
    self.processed_obs_space = (
        self.original_space
        if model_config.get("_disable_preprocessor_api") else obs_space)
    nn.Module.__init__(self)
    TorchModelV2.__init__(self, self.original_space, action_space,
                          num_outputs, model_config, name)

    self.flattened_input_space = flatten_space(self.original_space)

    # Atari-type CNNs or IMPALA-type CNNs (with residual layers)?
    # self.cnn_type = self.model_config["custom_model_config"].get(
    #     "conv_type", "atari")

    # Build the CNN(s) given obs_space's image components.
    self.cnns = {}
    self.one_hot = {}
    self.flatten_dims = {}
    self.flatten = {}
    concat_size = 0
    for i, component in enumerate(self.flattened_input_space):
        # Image space.
        if len(component.shape) == 3:
            config = {
                "conv_filters": model_config["conv_filters"]
                if "conv_filters" in model_config else
                get_filter_config(obs_space.shape),
                "conv_activation": model_config.get("conv_activation"),
                "post_fcnet_hiddens": [],
            }
            # if self.cnn_type == "atari":
            self.cnns[i] = ModelCatalog.get_model_v2(
                component,
                action_space,
                num_outputs=None,
                model_config=config,
                framework="torch",
                name="cnn_{}".format(i),
            )
            # TODO (sven): add IMPALA-style option.
            # else:
            #     cnn = TorchImpalaVisionNet(
            #         component,
            #         action_space,
            #         num_outputs=None,
            #         model_config=config,
            #         name="cnn_{}".format(i))
            concat_size += self.cnns[i].num_outputs
            self.add_module("cnn_{}".format(i), self.cnns[i])
        # Discrete|MultiDiscrete inputs -> One-hot encode.
        elif isinstance(component, (Discrete, MultiDiscrete)):
            if isinstance(component, Discrete):
                size = component.n
            else:
                size = sum(component.nvec)
            config = {
                "fcnet_hiddens": model_config["fcnet_hiddens"],
                "fcnet_activation": model_config.get("fcnet_activation"),
                "post_fcnet_hiddens": [],
            }
            self.one_hot[i] = ModelCatalog.get_model_v2(
                Box(-1.0, 1.0, (size,), np.float32),
                action_space,
                num_outputs=None,
                model_config=config,
                framework="torch",
                name="one_hot_{}".format(i),
            )
            concat_size += self.one_hot[i].num_outputs
        # Everything else (1D Box).
        else:
            size = int(np.product(component.shape))
            config = {
                "fcnet_hiddens": model_config["fcnet_hiddens"],
                "fcnet_activation": model_config.get("fcnet_activation"),
                "post_fcnet_hiddens": [],
            }
            self.flatten[i] = ModelCatalog.get_model_v2(
                Box(-1.0, 1.0, (size,), np.float32),
                action_space,
                num_outputs=None,
                model_config=config,
                framework="torch",
                name="flatten_{}".format(i),
            )
            self.flatten_dims[i] = size
            concat_size += self.flatten[i].num_outputs

    # Optional post-concat FC-stack.
    post_fc_stack_config = {
        "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []),
        "fcnet_activation": model_config.get("post_fcnet_activation", "relu"),
    }
    self.post_fc_stack = ModelCatalog.get_model_v2(
        Box(float("-inf"),
            float("inf"),
            shape=(concat_size,),
            dtype=np.float32),
        self.action_space,
        None,
        post_fc_stack_config,
        framework="torch",
        name="post_fc_stack",
    )

    # Actions and value heads.
    self.logits_layer = None
    self.value_layer = None
    self._value_out = None

    if num_outputs:
        # Action-distribution head.
        self.logits_layer = SlimFC(
            in_size=self.post_fc_stack.num_outputs,
            out_size=num_outputs,
            activation_fn=None,
            initializer=torch_normc_initializer(0.01),
        )
        # Create the value branch model.
        self.value_layer = SlimFC(
            in_size=self.post_fc_stack.num_outputs,
            out_size=1,
            activation_fn=None,
            initializer=torch_normc_initializer(0.01),
        )
    else:
        self.num_outputs = concat_size
def obs_space(self):
    low = np.zeros([self._local_width, self._local_width, self._channel_num])
    high = np.ones([self._local_width, self._local_width, self._channel_num])
    return Box(low, high)
def _build_state_space(self, state_names):
    # Docstring of superclass
    low = -1 * np.ones_like(state_names, dtype=float)
    low[self.U_SUP_IDX] = 0.0
    high = np.ones_like(state_names, dtype=float)
    return Box(low, high)
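# Worked example of the bounds _build_state_space produces, using
# hypothetical state names with the supply voltage "u_sup" at index 1
# (the names and index are illustrative assumptions):
import numpy as np
from gym.spaces import Box

state_names = ["omega", "u_sup"]
U_SUP_IDX = 1

low = -1 * np.ones_like(state_names, dtype=float)  # [-1., -1.]
low[U_SUP_IDX] = 0.0                               # [-1.,  0.]
high = np.ones_like(state_names, dtype=float)      # [ 1.,  1.]
space = Box(low, high)                             # Per-dimension bounds.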
def observation_space(self):
    # return Discrete(11)
    return Box(low=0, high=1, shape=(1,))
def __init__(self,
             observation_space: gym.spaces.Space,
             action_space: gym.spaces.Space,
             num_outputs: Optional[int],
             model_config: ModelConfigDict,
             name: str,
             *,
             num_transformer_units: int = 1,
             attention_dim: int = 64,
             num_heads: int = 2,
             memory_inference: int = 50,
             memory_training: int = 50,
             head_dim: int = 32,
             position_wise_mlp_dim: int = 32,
             init_gru_gate_bias: float = 2.0):
    """Initializes a GTrXLNet.

    Args:
        num_transformer_units (int): The number of Transformer repeats to
            use (denoted L in [2]).
        attention_dim (int): The input and output dimensions of one
            Transformer unit.
        num_heads (int): The number of attention heads to use in parallel.
            Denoted as `H` in [3].
        memory_inference (int): The number of timesteps to concat (time
            axis) and feed into the next transformer unit as inference
            input. The first transformer unit will receive this number of
            past observations (plus the current one), instead.
        memory_training (int): The number of timesteps to concat (time
            axis) and feed into the next transformer unit as training
            input (plus the actual input sequence of len=max_seq_len).
            The first transformer unit will receive this number of past
            observations (plus the input sequence), instead.
        head_dim (int): The dimension of a single(!) attention head within
            a multi-head attention unit. Denoted as `d` in [3].
        position_wise_mlp_dim (int): The dimension of the hidden layer
            within the position-wise MLP (after the multi-head attention
            block within one Transformer unit). This is the size of the
            first of the two layers within the PositionwiseFeedforward.
            The second layer always has size=`attention_dim`.
        init_gru_gate_bias (float): Initial bias values for the GRU gates
            (two GRUs per Transformer unit, one after the MHA, one after
            the position-wise MLP).
    """
    super().__init__(observation_space, action_space, num_outputs,
                     model_config, name)
    nn.Module.__init__(self)

    self.num_transformer_units = num_transformer_units
    self.attention_dim = attention_dim
    self.num_heads = num_heads
    self.memory_inference = memory_inference
    self.memory_training = memory_training
    self.head_dim = head_dim
    self.max_seq_len = model_config["max_seq_len"]
    self.obs_dim = observation_space.shape[0]

    self.linear_layer = SlimFC(in_size=self.obs_dim,
                               out_size=self.attention_dim)

    self.layers = [self.linear_layer]

    attention_layers = []
    # 2) Create L Transformer blocks according to [2].
    for i in range(self.num_transformer_units):
        # RelativeMultiHeadAttention part.
        MHA_layer = SkipConnection(
            RelativeMultiHeadAttention(in_dim=self.attention_dim,
                                       out_dim=self.attention_dim,
                                       num_heads=num_heads,
                                       head_dim=head_dim,
                                       input_layernorm=True,
                                       output_activation=nn.ReLU),
            fan_in_layer=GRUGate(self.attention_dim, init_gru_gate_bias))

        # Position-wise MultiLayerPerceptron part.
        E_layer = SkipConnection(
            nn.Sequential(
                torch.nn.LayerNorm(self.attention_dim),
                SlimFC(in_size=self.attention_dim,
                       out_size=position_wise_mlp_dim,
                       use_bias=False,
                       activation_fn=nn.ReLU),
                SlimFC(in_size=position_wise_mlp_dim,
                       out_size=self.attention_dim,
                       use_bias=False,
                       activation_fn=nn.ReLU)),
            fan_in_layer=GRUGate(self.attention_dim, init_gru_gate_bias))

        # Build a list of all attention layers in order.
        attention_layers.extend([MHA_layer, E_layer])

    # Create a Sequential such that all parameters inside the attention
    # layers are automatically registered with this top-level model.
    self.attention_layers = nn.Sequential(*attention_layers)
    self.layers.extend(attention_layers)

    # Final layers if num_outputs is not None.
    self.logits = None
    self.values_out = None
    # Last value output.
    self._value_out = None

    # Postprocess GTrXL output with another hidden layer.
    if self.num_outputs is not None:
        self.logits = SlimFC(in_size=self.attention_dim,
                             out_size=self.num_outputs,
                             activation_fn=nn.ReLU)

        # Value function used by all RLlib Torch RL implementations.
        self.values_out = SlimFC(in_size=self.attention_dim,
                                 out_size=1,
                                 activation_fn=None)
    else:
        self.num_outputs = self.attention_dim

    # Setup trajectory views (`memory-inference` x past memory outs).
    for i in range(self.num_transformer_units):
        space = Box(-1.0, 1.0, shape=(self.attention_dim,))
        self.view_requirements["state_in_{}".format(i)] = \
            ViewRequirement(
                "state_out_{}".format(i),
                shift="-{}:-1".format(self.memory_inference),
                # Repeat the incoming state every max-seq-len times.
                batch_repeat_value=self.max_seq_len,
                space=space)
        self.view_requirements["state_out_{}".format(i)] = \
            ViewRequirement(space=space, used_for_training=False)
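# A minimal construction sketch for the torch GTrXLNet above (assumptions:
# RLlib's usual import path for this model and that a plain 1D Box
# observation space plus a bare `max_seq_len` model_config suffice):
import gym.spaces
from ray.rllib.models.torch.attention_net import GTrXLNet

model = GTrXLNet(
    observation_space=gym.spaces.Box(-1.0, 1.0, shape=(16,)),
    action_space=gym.spaces.Discrete(4),
    num_outputs=4,
    model_config={"max_seq_len": 20},
    name="gtrxl",
    num_transformer_units=2,
    attention_dim=64,
)
# One "state_in_{i}"/"state_out_{i}" view-requirement pair per unit.
print(sorted(k for k in model.view_requirements if k.startswith("state")))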
from supersuit.basic_transforms.flatten import check_param, change_observation
from gym.spaces import Box
import numpy as np
import pytest

test_obs_space = Box(low=np.float32(0.),
                     high=np.float32(1.),
                     shape=(4, 4, 3),
                     dtype=np.float32)
test_obs = np.zeros([4, 4, 3], dtype=np.float64) + np.arange(3)


def test_param_check():
    check_param(test_obs_space, True)
    with pytest.raises(AssertionError):
        check_param(test_obs_space, 6)


def test_change_observation():
    new_obs = change_observation(test_obs, test_obs_space, True)
    assert new_obs.shape == (4 * 4 * 3,)
def __init__(self, env):
    ObservationWrapper.__init__(self, env)
    self.unwrapped.observation_space = Box(-np.inf, np.inf, (7,))
def discrete2box4class(discrete_space):
    # Map a Discrete(n) space to a one-hot style Box of shape (n,).
    return Box(0.0, 1.0, (discrete_space.n,))
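# For illustration, how the helper behaves on a small class space. The
# `encode` companion below is a hypothetical one-hot encoder, not part of
# the original snippet:
import numpy as np
from gym.spaces import Discrete

box = discrete2box4class(Discrete(4))
assert box.shape == (4,)


def encode(label, n):
    # One-hot encode a class label into the converted Box space.
    one_hot = np.zeros(n, dtype=np.float32)
    one_hot[label] = 1.0
    return one_hot


assert box.contains(encode(2, 4))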
def __init__(self, env, skip=4):
    super(CustomSkipFrame, self).__init__(env)
    self.observation_space = Box(low=0, high=255, shape=(skip, 84, 84))
    self.skip = skip
    self.states = np.zeros((skip, 84, 84), dtype=np.float32)
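# The snippet only shows the constructor. A typical companion `step` for
# this kind of frame-skip wrapper might look like the following sketch
# (hypothetical, assuming the wrapped env returns single 84x84 frames):
def step(self, action):
    total_reward = 0.0
    for i in range(self.skip):
        obs, reward, done, info = self.env.step(action)
        total_reward += reward
        self.states[i] = obs
        if done:
            # Pad the remaining slots with the last frame.
            self.states[i:] = obs
            break
    return self.states.copy(), total_reward, done, info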
def __init__(
        self,
        hand_low=(-0.5, 0.40, 0.05),
        hand_high=(0.5, 1, 0.5),
        obj_low=(0., 0.6, 0.015),
        obj_high=(0., 0.6, 0.015),
        random_init=False,
        # tasks = [{'goal': 0.75, 'obj_init_pos': -0.2,
        #           'obj_init_angle': 0.3}],
        # tasks = [{'goal': np.array([0., 0.75, 0.04]),
        #           'obj_init_pos': np.array([0., 0.9, 0.04]),
        #           'obj_init_angle': 0.3}],
        tasks=[{
            'goal': np.array([0., 0.85, 0.02]),
            'obj_init_pos': np.array([0., 0.6, 0.015]),
            'obj_init_angle': 0.3
        }],
        goal_low=(-0.1, 0.85, 0.02),
        goal_high=(0.1, 0.9, 0.02),
        hand_init_pos=(0, 0.6, 0.2),
        rotMode='fixed',
        obs_type='plain',
        multitask=False,
        multitask_num=1,
        if_render=False,
        **kwargs):
    self.quick_init(locals())
    SawyerXYZEnv.__init__(self,
                          frame_skip=5,
                          action_scale=1. / 100,
                          hand_low=hand_low,
                          hand_high=hand_high,
                          model_name=self.model_name,
                          **kwargs)
    if obj_low is None:
        obj_low = self.hand_low
    if goal_low is None:
        goal_low = self.hand_low
    if obj_high is None:
        obj_high = self.hand_high
    if goal_high is None:
        goal_high = self.hand_high

    assert obs_type in OBS_TYPE
    if multitask:
        obs_type = 'with_goal_and_id'
    self.obs_type = obs_type

    self.random_init = random_init
    self.max_path_length = 150
    self.tasks = tasks
    self.num_tasks = len(tasks)
    self.rotMode = rotMode
    self.hand_init_pos = np.array(hand_init_pos)
    self.multitask = multitask
    self.multitask_num = multitask_num
    self._state_goal_idx = np.zeros(self.multitask_num)
    self.if_render = if_render

    if rotMode == 'fixed':
        self.action_space = Box(
            np.array([-1, -1, -1, -1]),
            np.array([1, 1, 1, 1]),
        )
    elif rotMode == 'rotz':
        self.action_rot_scale = 1. / 50
        self.action_space = Box(
            np.array([-1, -1, -1, -np.pi, -1]),
            np.array([1, 1, 1, np.pi, 1]),
        )
    elif rotMode == 'quat':
        self.action_space = Box(
            np.array([-1, -1, -1, 0, -1, -1, -1, -1]),
            np.array([1, 1, 1, 2 * np.pi, 1, 1, 1, 1]),
        )
    else:
        self.action_space = Box(
            np.array([-1, -1, -1, -np.pi / 2, -np.pi / 2, 0, -1]),
            np.array([1, 1, 1, np.pi / 2, np.pi / 2, np.pi * 2, 1]),
        )
    self.obj_and_goal_space = Box(
        np.hstack((obj_low, goal_low)),
        np.hstack((obj_high, goal_high)),
    )
    self.goal_space = Box(np.array(goal_low), np.array(goal_high))
    if not multitask and self.obs_type == 'with_goal_id':
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low, goal_low,
                       np.zeros(len(tasks)))),
            np.hstack((self.hand_high, obj_high, goal_high,
                       np.ones(len(tasks)))),
        )
    elif not multitask and self.obs_type == 'plain':
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low)),
            np.hstack((self.hand_high, obj_high)),
        )
    elif not multitask and self.obs_type == 'with_goal':
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low, goal_low)),
            np.hstack((self.hand_high, obj_high, goal_high)),
        )
    else:
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low, goal_low,
                       np.zeros(multitask_num))),
            np.hstack((self.hand_high, obj_high, goal_high,
                       np.zeros(multitask_num))),
        )
    self.reset()
def __init__(self,
             seed=None,
             spawn_rate=20,
             num_archers=2,
             num_knights=2,
             killable_knights=True,
             killable_archers=True,
             pad_observation=True,
             black_death=True,
             line_death=False,
             max_frames=900):
    # Game Constants
    self.ZOMBIE_SPAWN = spawn_rate
    self.FPS = 90
    self.WIDTH = 1280
    self.HEIGHT = 720
    self.max_frames = max_frames
    self.frames = 0
    self.pad_observation = pad_observation
    self.killable_knights = killable_knights
    self.killable_archers = killable_archers
    self.black_death = black_death
    self.line_death = line_death
    self.has_reset = False
    self.np_random, seed = seeding.np_random(seed)

    # Dictionaries for holding new players and their weapons
    self.archer_dict = {}
    self.knight_dict = {}
    self.arrow_dict = {}
    self.sword_dict = {}

    # Game Variables
    self.frame_count = 0
    self.score = 0
    self.run = True
    self.arrow_spawn_rate = self.sword_spawn_rate = self.zombie_spawn_rate = 0
    self.knight_player_num = self.archer_player_num = 0
    self.archer_killed = False
    self.knight_killed = False
    self.sword_killed = False
    self.closed = False

    # Creating Sprite Groups
    self.all_sprites = pygame.sprite.Group()
    self.zombie_list = pygame.sprite.Group()
    self.arrow_list = pygame.sprite.Group()
    self.sword_list = pygame.sprite.Group()
    self.archer_list = pygame.sprite.Group()
    self.knight_list = pygame.sprite.Group()

    # If black_death, this represents agents to remove at end of cycle
    self.kill_list = []

    self.num_archers = num_archers
    self.num_knights = num_knights

    # Initializing Pygame
    self.render_on = False
    pygame.init()
    # self.WINDOW = pygame.display.set_mode([self.WIDTH, self.HEIGHT])
    self.WINDOW = pygame.Surface((self.WIDTH, self.HEIGHT))
    pygame.display.set_caption("Knights, Archers, Zombies")
    self.clock = pygame.time.Clock()
    self.left_wall = get_image(os.path.join('img', 'left_wall.png'))
    self.right_wall = get_image(os.path.join('img', 'right_wall.png'))
    self.right_wall_rect = self.right_wall.get_rect()
    self.right_wall_rect.left = self.WIDTH - self.right_wall_rect.width
    self.floor_patch1 = get_image(os.path.join('img', 'patch1.png'))
    self.floor_patch2 = get_image(os.path.join('img', 'patch2.png'))
    self.floor_patch3 = get_image(os.path.join('img', 'patch3.png'))
    self.floor_patch4 = get_image(os.path.join('img', 'patch4.png'))

    self.agent_list = []
    self.agents = []

    for i in range(num_archers):
        name = "archer_" + str(i)
        archer_key = "archer{0}".format(self.archer_player_num)
        self.archer_dict[archer_key] = Archer(agent_name=name)
        self.archer_dict[archer_key].offset(i * 50, 0)
        self.archer_list.add(self.archer_dict[archer_key])
        self.all_sprites.add(self.archer_dict[archer_key])
        self.agent_list.append(self.archer_dict[archer_key])
        if i != num_archers - 1:
            self.archer_player_num += 1

    for i in range(num_knights):
        name = "knight_" + str(i)
        knight_key = "knight{0}".format(self.knight_player_num)
        self.knight_dict[knight_key] = Knight(agent_name=name)
        self.knight_dict[knight_key].offset(i * 50, 0)
        self.knight_list.add(self.knight_dict[knight_key])
        self.all_sprites.add(self.knight_dict[knight_key])
        self.agent_list.append(self.knight_dict[knight_key])
        if i != num_knights - 1:
            self.knight_player_num += 1

    self.agent_name_mapping = {}
    a_count = 0
    for i in range(num_archers):
        a_name = "archer_" + str(i)
        self.agents.append(a_name)
        self.agent_name_mapping[a_name] = a_count
        a_count += 1
    for i in range(num_knights):
        k_name = "knight_" + str(i)
        self.agents.append(k_name)
        self.agent_name_mapping[k_name] = a_count
        a_count += 1

    self.observation_spaces = dict(
        zip(self.agents, [
            Box(low=0, high=255, shape=(512, 512, 3), dtype=np.uint8)
            for _ in enumerate(self.agents)
        ]))
    self.action_spaces = dict(
        zip(self.agents, [Discrete(6) for _ in enumerate(self.agents)]))
    self.display_wait = 0.0

    self._agent_selector = agent_selector(self.agents)
    self.num_agents = len(self.agents)
    self.reinit()
def __init__(self):
    """Initialize the Baxter reacher agent and its ROS interfaces."""
    rospy.init_node('baxter_reacher')
    self.dt = BAXTER['dt']
    self.RATE = 1. / self.dt
    self.using_torque = BAXTER['using_torque']
    self.using_left_arm = BAXTER['using_left_arm']

    # Init Baxter interface.
    self._llimb = baxter_interface.Limb("left")
    self._rlimb = baxter_interface.Limb("right")
    self.kin_left = baxter_kinematics('left')
    self.kin_right = baxter_kinematics("right")
    print("Getting robot state... ")
    self._rs = baxter_interface.RobotEnable(CHECK_VERSION)
    self._init_state = self._rs.state().enabled
    print("Enabling robot... ")
    self._rs.enable()
    self._pub_rate = rospy.Publisher('robot/joint_state_publish_rate',
                                     UInt16,
                                     queue_size=10)
    self._pub_rate.publish(self.RATE)
    self.rate = rospy.Rate(self.RATE)
    self._max_time_step = 50
    self._step_counter = 0
    self.action_norm_factor = REAL_ACT_SPACE_HIGH

    # To control the orientation, we set 3 points on the gripper frame.
    self.ee_points = np.array([[0., -0.04, 0.07], [0., 0.04, 0.07],
                               [0., 0., 0.]])

    self.observation_space = Box(-np.inf * np.ones(10), np.inf * np.ones(10))
    self.action_space = Box(-np.ones(7, dtype=np.float32),
                            np.ones(7, dtype=np.float32))
    self.indicator_flag = False

    self.num_envs = 1
    self.clipob = 10
    self.cliprew = 10
    self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
    self.ret_rms = RunningMeanStd(shape=())
    self.ret = np.zeros(1)
    self.eprew = np.zeros(1)
    self.gamma = BAXTER["gamma"]
    self.epsilon = BAXTER["epsilon"]
    self._done = np.array(False).repeat(self.eprew.shape)
    self.prev_dones = np.array(False).repeat(self.eprew.shape)
    self._reward = np.zeros(1, dtype=np.float32)
    self._obs = np.zeros((1,) + self.observation_space.shape,
                         dtype=np.float32)
class ContB6BridgeConverter(ContDynamicallyAveragedConverter):
    """
    The continuous B6 bridge converter (B6C) is simulated with three
    continuous 2QCs.

    Key:
        'Cont-B6C'

    Actions:
        The duty cycle for each half bridge in the range of (-1, 1).

    Action Space:
        Box(-1, 1, shape=(3,))

    Output Voltages and Currents:
        | voltages: (-1, 1)
        | currents: (-1, 1)

    Output Voltage Space:
        Box(-0.5, 0.5, shape=(3,))
    """
    action_space = Box(-1, 1, shape=(3,))

    # Positive and negative voltages can be applied.
    voltages = (-1, 1)

    # Positive and negative currents are possible.
    currents = (-1, 1)

    _reset_action = [0, 0, 0]

    def __init__(self, tau=1e-4, **kwargs):
        # Docstring in base class
        super().__init__(tau=tau, **kwargs)
        self._subconverters = [
            ContTwoQuadrantConverter(tau=tau, **kwargs),
            ContTwoQuadrantConverter(tau=tau, **kwargs),
            ContTwoQuadrantConverter(tau=tau, **kwargs),
        ]

    def reset(self):
        # Docstring in base class
        return [
            self._subconverters[0].reset()[0] - 0.5,
            self._subconverters[1].reset()[0] - 0.5,
            self._subconverters[2].reset()[0] - 0.5,
        ]

    def convert(self, i_out, t):
        # Docstring in base class
        u_out = [
            self._subconverters[0].convert([i_out[0]], t)[0] - 0.5,
            self._subconverters[1].convert([i_out[1]], t)[0] - 0.5,
            self._subconverters[2].convert([i_out[2]], t)[0] - 0.5,
        ]
        return u_out

    def set_action(self, action, t):
        # Docstring in base class
        times = []
        times += self._subconverters[0].set_action([0.5 * (action[0] + 1)], t)
        times += self._subconverters[1].set_action([0.5 * (action[1] + 1)], t)
        times += self._subconverters[2].set_action([0.5 * (action[2] + 1)], t)
        return sorted(list(set(times)))

    def _convert(self, i_in, t):
        # Not used
        pass

    def i_sup(self, i_out):
        # Docstring in base class
        return sum(
            subconverter.i_sup([i_out_])
            for subconverter, i_out_ in zip(self._subconverters, i_out))
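# The duty-cycle mapping above is just an affine rescale: each phase action
# in [-1, 1] is shifted into a 2QC duty cycle in [0, 1], and each
# sub-converter output is shifted back down by 0.5. A standalone check of
# that arithmetic (assuming the ideal averaged model where a 2QC's output
# equals its duty cycle):
import numpy as np

action = np.array([-1.0, 0.0, 1.0])  # One duty cycle per half bridge.
duty = 0.5 * (action + 1)            # -> [0.0, 0.5, 1.0] for the 2QCs.
u_out = duty - 0.5                   # Ideal phase voltages.
print(u_out)                         # [-0.5  0.   0.5]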
def __init__(self, env):
    super().__init__(env)
    self.n = None
    if isinstance(self.env.observation_space, Discrete):
        self.n = self.env.observation_space.n
        self.observation_space = Box(0, 1, (self.n,))
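# Since this constructor only swaps the space, an `observation` method that
# actually one-hot encodes each Discrete observation presumably lives
# alongside it. A hypothetical version (not part of the original snippet):
def observation(self, obs):
    if self.n is None:
        return obs
    one_hot = np.zeros(self.n, dtype=np.float32)
    one_hot[obs] = 1.0
    return one_hot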
def action_space(self):
    return Box(low=0, high=1, shape=(1,))
def observation_space(self):
    return Box(low=0, high=1.0, shape=(25,), dtype=np.float32)
def __init__(
    self,
    input_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    *,
    name: str,
    max_seq_len: int = 20,
    num_transformer_units: int = 1,
    attention_dim: int = 64,
    num_heads: int = 2,
    memory_inference: int = 50,
    memory_training: int = 50,
    head_dim: int = 32,
    position_wise_mlp_dim: int = 32,
    init_gru_gate_bias: float = 2.0,
):
    """Initializes a GTrXLNet instance.

    Args:
        num_transformer_units (int): The number of Transformer repeats to
            use (denoted L in [2]).
        attention_dim (int): The input and output dimensions of one
            Transformer unit.
        num_heads (int): The number of attention heads to use in parallel.
            Denoted as `H` in [3].
        memory_inference (int): The number of timesteps to concat (time
            axis) and feed into the next transformer unit as inference
            input. The first transformer unit will receive this number of
            past observations (plus the current one), instead.
        memory_training (int): The number of timesteps to concat (time
            axis) and feed into the next transformer unit as training
            input (plus the actual input sequence of len=max_seq_len).
            The first transformer unit will receive this number of past
            observations (plus the input sequence), instead.
        head_dim (int): The dimension of a single(!) attention head within
            a multi-head attention unit. Denoted as `d` in [3].
        position_wise_mlp_dim (int): The dimension of the hidden layer
            within the position-wise MLP (after the multi-head attention
            block within one Transformer unit). This is the size of the
            first of the two layers within the PositionwiseFeedforward.
            The second layer always has size=`attention_dim`.
        init_gru_gate_bias (float): Initial bias values for the GRU gates
            (two GRUs per Transformer unit, one after the MHA, one after
            the position-wise MLP).
    """
    super().__init__(name=name)

    self.num_transformer_units = num_transformer_units
    self.attention_dim = attention_dim
    self.num_heads = num_heads
    self.memory_inference = memory_inference
    self.memory_training = memory_training
    self.head_dim = head_dim
    self.max_seq_len = max_seq_len
    self.obs_dim = input_space.shape[0]

    # Raw observation input (plus (None) time axis).
    input_layer = tf.keras.layers.Input(
        shape=(
            None,
            self.obs_dim,
        ),
        name="inputs",
    )
    memory_ins = [
        tf.keras.layers.Input(
            shape=(
                None,
                self.attention_dim,
            ),
            dtype=tf.float32,
            name="memory_in_{}".format(i),
        ) for i in range(self.num_transformer_units)
    ]

    # Map observation dim to input/output transformer (attention) dim.
    E_out = tf.keras.layers.Dense(self.attention_dim)(input_layer)
    # Output, collected and concat'd to build the internal, tau-len
    # memory units used for additional contextual information.
    memory_outs = [E_out]

    # 2) Create L Transformer blocks according to [2].
    for i in range(self.num_transformer_units):
        # RelativeMultiHeadAttention part.
        MHA_out = SkipConnection(
            RelativeMultiHeadAttention(
                out_dim=self.attention_dim,
                num_heads=num_heads,
                head_dim=head_dim,
                input_layernorm=True,
                output_activation=tf.nn.relu,
            ),
            fan_in_layer=GRUGate(init_gru_gate_bias),
            name="mha_{}".format(i + 1),
        )(E_out, memory=memory_ins[i])
        # Position-wise MLP part.
        E_out = SkipConnection(
            tf.keras.Sequential((
                tf.keras.layers.LayerNormalization(axis=-1),
                PositionwiseFeedforward(
                    out_dim=self.attention_dim,
                    hidden_dim=position_wise_mlp_dim,
                    output_activation=tf.nn.relu,
                ),
            )),
            fan_in_layer=GRUGate(init_gru_gate_bias),
            name="pos_wise_mlp_{}".format(i + 1),
        )(MHA_out)
        # Output of the position-wise MLP == E(l-1), which is concat'd
        # to the current Mem block (M(l-1)) to yield E~(l-1), which is
        # then used by the next transformer block.
        memory_outs.append(E_out)

    self._logits = None
    self._value_out = None

    self.trxl_model = tf.keras.Model(inputs=[input_layer] + memory_ins,
                                     outputs=[E_out] + memory_outs[:-1])

    self.view_requirements = {
        SampleBatch.OBS: ViewRequirement(space=input_space),
    }
    # Setup trajectory views (`memory-inference` x past memory outs).
    for i in range(self.num_transformer_units):
        space = Box(-1.0, 1.0, shape=(self.attention_dim,))
        self.view_requirements["state_in_{}".format(i)] = ViewRequirement(
            "state_out_{}".format(i),
            shift="-{}:-1".format(self.memory_inference),
            # Repeat the incoming state every max-seq-len times.
            batch_repeat_value=self.max_seq_len,
            space=space,
        )
        self.view_requirements["state_out_{}".format(i)] = ViewRequirement(
            space=space, used_for_training=False)
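# Construction sketch for the keras-style GTrXLNet above (hypothetical:
# assumes the class is already in scope as `GTrXLNet`; the import path is
# deliberately omitted since it varies across RLlib versions):
import gym.spaces

net = GTrXLNet(
    input_space=gym.spaces.Box(-1.0, 1.0, shape=(16,)),
    action_space=gym.spaces.Discrete(4),
    name="gtrxl",
    max_seq_len=20,
    num_transformer_units=2,
    attention_dim=64,
)
# The wrapped keras model takes [obs] + one memory tensor per unit:
print(len(net.trxl_model.inputs))  # 1 + num_transformer_units = 3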
def __init__(self):
    super(DEED, self).__init__()
    # Per-unit generator data (one row per unit).
    self.u_holder = np.array([
        [150, 470, 786.7988, 38.5397, 0.1524, 450, 0.041, 103.3908,
         -2.4444, 0.0312, 0.5035, 0.0207, 80, 80],
        [135, 470, 451.3251, 46.1591, 0.1058, 600, 0.036, 103.3908,
         -2.4444, 0.0312, 0.5035, 0.0207, 80, 80],
        [73, 340, 1049.9977, 40.3965, 0.0280, 320, 0.038, 300.3910,
         -4.0695, 0.0509, 0.4968, 0.0202, 80, 80],
        [60, 300, 1243.5311, 38.3055, 0.0354, 260, 0.052, 300.3910,
         -4.0695, 0.0509, 0.4968, 0.0202, 50, 50],
        [73, 243, 1658.5696, 36.3278, 0.0211, 280, 0.063, 320.0006,
         -3.8132, 0.0344, 0.4972, 0.0200, 50, 50],
        [57, 160, 1356.6592, 38.2704, 0.0179, 310, 0.048, 320.0006,
         -3.8132, 0.0344, 0.4972, 0.0200, 50, 50],
        [20, 130, 1450.7045, 36.5104, 0.0121, 300, 0.086, 330.0056,
         -3.9023, 0.0465, 0.5163, 0.0214, 30, 30],
        [47, 120, 1450.7045, 36.5104, 0.0121, 340, 0.082, 330.0056,
         -3.9023, 0.0465, 0.5163, 0.0214, 30, 30],
        [20, 80, 1455.6056, 39.5804, 0.1090, 270, 0.098, 350.0056,
         -3.9524, 0.0465, 0.5475, 0.0234, 30, 30],
        [10, 55, 1469.4026, 40.5407, 0.1295, 380, 0.094, 360.0012,
         -3.9864, 0.0470, 0.5475, 0.0234, 30, 30],
    ])
    # Hourly power demand.
    self.pdm_hold = np.array([
        1036, 1110, 1258, 1406, 1480, 1628, 1702, 1776, 1924, 2022, 2106,
        2150, 2072, 1924, 1776, 1554, 1480, 1628, 1776, 1972, 1924, 1628,
        1332, 1184
    ])
    # Transmission-loss coefficient matrix (B-matrix).
    self.b = np.array([
        [0.000049, 0.000015, 0.000015, 0.000015, 0.000016, 0.000017,
         0.000017, 0.000018, 0.000019, 0.000020],
        [0.000014, 0.000045, 0.000016, 0.000016, 0.000017, 0.000015,
         0.000015, 0.000016, 0.000018, 0.000018],
        [0.000015, 0.000016, 0.000039, 0.000010, 0.000012, 0.000012,
         0.000014, 0.000014, 0.000016, 0.000016],
        [0.000015, 0.000016, 0.000010, 0.000040, 0.000014, 0.000010,
         0.000011, 0.000012, 0.000014, 0.000015],
        [0.000016, 0.000017, 0.000012, 0.000014, 0.000035, 0.000011,
         0.000013, 0.000013, 0.000015, 0.000016],
        [0.000017, 0.000015, 0.000012, 0.000010, 0.000011, 0.000036,
         0.000012, 0.000012, 0.000014, 0.000015],
        [0.000017, 0.000015, 0.000014, 0.000011, 0.000013, 0.000012,
         0.000038, 0.000016, 0.000016, 0.000018],
        [0.000018, 0.000016, 0.000014, 0.000012, 0.000013, 0.000012,
         0.000016, 0.000040, 0.000015, 0.000016],
        [0.000019, 0.000018, 0.000016, 0.000014, 0.000015, 0.000014,
         0.000016, 0.000015, 0.000042, 0.000019],
        [0.000020, 0.000018, 0.000016, 0.000014, 0.000016, 0.000015,
         0.000018, 0.000016, 0.000019, 0.000044],
    ])
    self.violation_multiplier = 1e6  # constant
    self.n_actions = 101

    # [Actions, Agents] transpose -> [Agents, Actions]
    self.action_powers = np.linspace(self.u_holder[:, 0],
                                     self.u_holder[:, 1],
                                     self.n_actions).T
    self.action_space = MultiDiscrete(
        np.ones(len(self.u_holder) - 1) * self.n_actions)
    self.observation_space = MultiDiscrete(
        np.ones((len(self.u_holder) - 1, 2)) * 20)
    self.reward_space = Box(-np.ones(3) * np.inf, np.zeros(3))
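# To make the discretization concrete: each unit in the action space gets a
# 101-point power grid between its own min and max output (note the action
# space covers only len(u_holder) - 1 = 9 units; the tenth presumably acts
# as the slack generator). For unit 0, the grid runs 150..470 in steps of
# 3.2. Pure-numpy check:
import numpy as np

u_min, u_max, n_actions = 150.0, 470.0, 101
grid = np.linspace(u_min, u_max, n_actions)
print(grid[0], grid[1], grid[-1])          # 150.0 153.2 470.0
print((u_max - u_min) / (n_actions - 1))   # Step size: 3.2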
def __init__(
        self,
        hand_low=(-0.5, 0.40, 0.05),
        hand_high=(0.5, 1, 0.5),
        obj_low=(-0.1, 0.5, 0.02),
        obj_high=(0.1, 0.6, 0.02),
        random_init=False,
        tasks=[{
            'stick_init_pos': np.array([0., 0.6, 0.02])
        }],
        goal_low=None,
        goal_high=None,
        hand_init_pos=(0, 0.6, 0.2),
        liftThresh=0.04,
        rotMode='fixed',
        rewMode='orig',
        obs_type='plain',
        multitask=False,
        multitask_num=1,
        if_render=False,
        **kwargs):
    self.quick_init(locals())
    SawyerXYZEnv.__init__(self,
                          frame_skip=5,
                          action_scale=1. / 100,
                          hand_low=hand_low,
                          hand_high=hand_high,
                          model_name=self.model_name,
                          **kwargs)
    assert obs_type in OBS_TYPE
    if multitask:
        obs_type = 'with_goal_and_id'
    self.obs_type = obs_type

    if obj_low is None:
        obj_low = self.hand_low
    if goal_low is None:
        goal_low = self.hand_low
    if obj_high is None:
        obj_high = self.hand_high
    if goal_high is None:
        goal_high = self.hand_high

    self.random_init = random_init
    self.liftThresh = liftThresh
    self.max_path_length = 150
    self.tasks = tasks
    self.num_tasks = len(tasks)
    self.rewMode = rewMode
    self.rotMode = rotMode
    self.hand_init_pos = np.array(hand_init_pos)
    self.multitask = multitask
    self.multitask_num = multitask_num
    self.if_render = if_render
    self._state_goal_idx = np.zeros(self.multitask_num)

    if rotMode == 'fixed':
        self.action_space = Box(
            np.array([-1, -1, -1, -1]),
            np.array([1, 1, 1, 1]),
        )
    elif rotMode == 'rotz':
        self.action_rot_scale = 1. / 50
        self.action_space = Box(
            np.array([-1, -1, -1, -np.pi, -1]),
            np.array([1, 1, 1, np.pi, 1]),
        )
    elif rotMode == 'quat':
        self.action_space = Box(
            np.array([-1, -1, -1, 0, -1, -1, -1, -1]),
            np.array([1, 1, 1, 2 * np.pi, 1, 1, 1, 1]),
        )
    else:
        self.action_space = Box(
            np.array([-1, -1, -1, -np.pi / 2, -np.pi / 2, 0, -1]),
            np.array([1, 1, 1, np.pi / 2, np.pi / 2, np.pi * 2, 1]),
        )
    # For now, fix the object initial position.
    self.obj_init_pos = np.array([0.2, 0.6, 0.04])
    self.obj_init_qpos = np.array([0.0, 0.0])
    self.obj_space = Box(np.array(obj_low), np.array(obj_high))
    self.goal_space = Box(np.array(goal_low)[:2], np.array(goal_high)[:2])
    if not multitask and self.obs_type == 'with_goal_id':
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low, np.zeros(len(tasks)))),
            np.hstack((self.hand_high, obj_high, np.ones(len(tasks)))),
        )
    elif not multitask and self.obs_type == 'plain':
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low, obj_low)),
            np.hstack((self.hand_high, obj_high, obj_high)),
        )
    elif not multitask and self.obs_type == 'with_goal':
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low, goal_low)),
            np.hstack((self.hand_high, obj_high, goal_high)),
        )
    else:
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low, goal_low,
                       np.zeros(multitask_num))),
            np.hstack((self.hand_high, obj_high, goal_high,
                       np.zeros(multitask_num))),
        )
    self.reset()
def get_policy_configs_for_game(
        game_name: str) -> Tuple[dict, Callable[[AgentID], PolicyID]]:

    # The RLlib server must know about the Spaces that the Client will be
    # using inside Unity3D, up-front.
    obs_spaces = {
        # 3DBall.
        "3DBall": Box(float("-inf"), float("inf"), (8,)),
        # 3DBallHard.
        "3DBallHard": Box(float("-inf"), float("inf"), (45,)),
        # Pyramids.
        "Pyramids": TupleSpace([
            Box(float("-inf"), float("inf"), (56,)),
            Box(float("-inf"), float("inf"), (56,)),
            Box(float("-inf"), float("inf"), (56,)),
            Box(float("-inf"), float("inf"), (4,)),
        ]),
        # SoccerStrikersVsGoalie.
        "Goalie": Box(float("-inf"), float("inf"), (738,)),
        "Striker": TupleSpace([
            Box(float("-inf"), float("inf"), (231,)),
            Box(float("-inf"), float("inf"), (63,)),
        ]),
        # Tennis.
        "Tennis": Box(float("-inf"), float("inf"), (27,)),
        # VisualHallway.
        "VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)),
        # Walker.
        "Walker": Box(float("-inf"), float("inf"), (212,)),
        # FoodCollector.
        "FoodCollector": TupleSpace([
            Box(float("-inf"), float("inf"), (49,)),
            Box(float("-inf"), float("inf"), (4,)),
        ]),
    }
    action_spaces = {
        # 3DBall.
        "3DBall": Box(float("-inf"), float("inf"), (2,), dtype=np.float32),
        # 3DBallHard.
        "3DBallHard": Box(float("-inf"), float("inf"), (2,),
                          dtype=np.float32),
        # Pyramids.
        "Pyramids": MultiDiscrete([5]),
        # SoccerStrikersVsGoalie.
        "Goalie": MultiDiscrete([3, 3, 3]),
        "Striker": MultiDiscrete([3, 3, 3]),
        # Tennis.
        "Tennis": Box(float("-inf"), float("inf"), (3,)),
        # VisualHallway.
        "VisualHallway": MultiDiscrete([5]),
        # Walker.
        "Walker": Box(float("-inf"), float("inf"), (39,)),
        # FoodCollector.
        "FoodCollector": MultiDiscrete([3, 3, 3, 2]),
    }

    # Policies (Unity: "behaviors") and agent-to-policy mapping fns.
    if game_name == "SoccerStrikersVsGoalie":
        policies = {
            "Goalie": (None, obs_spaces["Goalie"], action_spaces["Goalie"],
                       {}),
            "Striker": (None, obs_spaces["Striker"],
                        action_spaces["Striker"], {}),
        }

        def policy_mapping_fn(agent_id):
            return "Striker" if "Striker" in agent_id else "Goalie"

    else:
        policies = {
            game_name: (None, obs_spaces[game_name],
                        action_spaces[game_name], {}),
        }

        def policy_mapping_fn(agent_id):
            return game_name

    return policies, policy_mapping_fn
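# A short sketch of how the return values are typically plugged into an
# RLlib multiagent config (the config keys beyond "multiagent" are
# illustrative assumptions, not taken from the original file):
policies, policy_mapping_fn = get_policy_configs_for_game("3DBall")

config = {
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "num_workers": 0,
}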
def __init__(self, obs_space, bricks_rows=3, bricks_cols=3):
    output_space = Box(low=0,
                       high=1,
                       shape=(bricks_cols, bricks_rows),
                       dtype=np.uint8)
    super().__init__(obs_space, output_space)
def test_get_single_step_input_dict_batch_repeat_value_1(self):
    """Test whether a SampleBatch produces the correct 1-step input dict."""
    space = Box(-1.0, 1.0, ())

    # With batch-repeat-value==1: state_in_0 is built each timestep.
    view_reqs = {
        "state_in_0": ViewRequirement(
            data_col="state_out_0",
            shift="-5:-1",
            space=space,
            batch_repeat_value=1,
        ),
        "state_out_0": ViewRequirement(space=space,
                                       used_for_compute_actions=False),
    }

    # Trajectory of 1 ts (0) (we would like to compute the 1st).
    batch = SampleBatch({
        "state_in_0": np.array([
            [0, 0, 0, 0, 0],  # ts=0
        ]),
        "state_out_0": np.array([1]),
    })
    input_dict = batch.get_single_step_input_dict(
        view_requirements=view_reqs, index="last")
    check(
        input_dict,
        {
            "state_in_0": [[0, 0, 0, 0, 1]],  # ts=1
            "seq_lens": [1],
        },
    )

    # Trajectory of 6 ts (0-5) (we would like to compute the 6th).
    batch = SampleBatch({
        "state_in_0": np.array([
            [0, 0, 0, 0, 0],  # ts=0
            [0, 0, 0, 0, 1],  # ts=1
            [0, 0, 0, 1, 2],  # ts=2
            [0, 0, 1, 2, 3],  # ts=3
            [0, 1, 2, 3, 4],  # ts=4
            [1, 2, 3, 4, 5],  # ts=5
        ]),
        "state_out_0": np.array([1, 2, 3, 4, 5, 6]),
    })
    input_dict = batch.get_single_step_input_dict(
        view_requirements=view_reqs, index="last")
    check(
        input_dict,
        {
            "state_in_0": [[2, 3, 4, 5, 6]],  # ts=6
            "seq_lens": [1],
        },
    )

    # Trajectory of 12 ts (0-11) (we would like to compute the 12th).
    batch = SampleBatch({
        "state_in_0": np.array([
            [0, 0, 0, 0, 0],  # ts=0
            [0, 0, 0, 0, 1],  # ts=1
            [0, 0, 0, 1, 2],  # ts=2
            [0, 0, 1, 2, 3],  # ts=3
            [0, 1, 2, 3, 4],  # ts=4
            [1, 2, 3, 4, 5],  # ts=5
            [2, 3, 4, 5, 6],  # ts=6
            [3, 4, 5, 6, 7],  # ts=7
            [4, 5, 6, 7, 8],  # ts=8
            [5, 6, 7, 8, 9],  # ts=9
            [6, 7, 8, 9, 10],  # ts=10
            [7, 8, 9, 10, 11],  # ts=11
        ]),
        "state_out_0": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
    })
    input_dict = batch.get_single_step_input_dict(
        view_requirements=view_reqs, index="last")
    check(
        input_dict,
        {
            "state_in_0": [[8, 9, 10, 11, 12]],  # ts=12
            "seq_lens": [1],
        },
    )
from ray.rllib.examples.env.multi_agent import MultiAgentMountainCar
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNetV2
from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNetV2
from ray.rllib.models.torch.visionnet import VisionNetwork as \
    TorchVisionNetV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as \
    TorchFCNetV2
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.framework import try_import_tf
from ray.tune.registry import register_env

tf = try_import_tf()

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(-1.0, 1.0, (5,), dtype=np.float32),
    "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32),
    "multidiscrete": MultiDiscrete([1, 2, 3, 4]),
    "tuple": Tuple(
        [Discrete(2),
         Discrete(3),
         Box(-1.0, 1.0, (5,), dtype=np.float32)]),
    "dict": Dict({
        "action_choice": Discrete(3),
    }),
}
def test_predict_target():
    '''Test predict_target method of Policy.'''
    policy_module = Policy(Box(0, 1, shape=(4,)), Box(0, 1, shape=(2,)))
    observation = tf.placeholder(tf.float32, shape=(None, 4))
    predict_target_op = policy_module.predict_target(observation)
    assert predict_target_op.shape.as_list() == [None, 2]
def __init__(self, config):
    self.zeros = np.zeros((84, 84, 4))
    self.action_space = Discrete(2)
    self.observation_space = Box(0.0,
                                 1.0,
                                 shape=(84, 84, 4),
                                 dtype=np.float32)
    self.i = 0
def __init__(
        self,
        reward_info=None,
        frame_skip=50,
        pos_action_scale=2. / 100,
        randomize_goals=True,
        hide_goal=False,
        init_block_low=(-0.05, 0.55),
        init_block_high=(0.05, 0.65),
        puck_goal_low=(-0.05, 0.55),
        puck_goal_high=(0.05, 0.65),
        hand_goal_low=(-0.05, 0.55),
        hand_goal_high=(0.05, 0.65),
        fixed_puck_goal=(0.05, 0.6),
        fixed_hand_goal=(-0.05, 0.6),
        mocap_low=(-0.1, 0.5, 0.0),
        mocap_high=(0.1, 0.7, 0.5),
        force_puck_in_goal_space=False,
):
    self.quick_init(locals())
    self.reward_info = reward_info
    self.randomize_goals = randomize_goals
    self._pos_action_scale = pos_action_scale
    self.hide_goal = hide_goal

    self.init_block_low = np.array(init_block_low)
    self.init_block_high = np.array(init_block_high)
    self.puck_goal_low = np.array(puck_goal_low)
    self.puck_goal_high = np.array(puck_goal_high)
    self.hand_goal_low = np.array(hand_goal_low)
    self.hand_goal_high = np.array(hand_goal_high)
    self.fixed_puck_goal = np.array(fixed_puck_goal)
    self.fixed_hand_goal = np.array(fixed_hand_goal)
    self.mocap_low = np.array(mocap_low)
    self.mocap_high = np.array(mocap_high)
    self.force_puck_in_goal_space = force_puck_in_goal_space

    self._goal_xyxy = self.sample_goal_xyxy()
    # MultitaskEnv.__init__(self, distance_metric_order=2)
    MujocoEnv.__init__(self, self.model_name, frame_skip=frame_skip)

    self.action_space = Box(
        np.array([-1, -1]),
        np.array([1, 1]),
    )
    self.obs_box = Box(
        np.array([-0.2, 0.5, -0.2, 0.5]),
        np.array([0.2, 0.7, 0.2, 0.7]),
    )
    goal_low = np.concatenate((self.hand_goal_low, self.puck_goal_low))
    goal_high = np.concatenate((self.hand_goal_high, self.puck_goal_high))
    self.goal_box = Box(
        goal_low,
        goal_high,
    )
    self.observation_space = Dict([
        ('observation', self.obs_box),
        ('state_observation', self.obs_box),
        ('desired_goal', self.goal_box),
        ('state_desired_goal', self.goal_box),
        ('achieved_goal', self.goal_box),
        ('state_achieved_goal', self.goal_box),
    ])
    # Hack for state-based experiments for other envs:
    # self.observation_space = Box(
    #     np.array([-0.2, 0.5, -0.2, 0.5, -0.2, 0.5]),
    #     np.array([0.2, 0.7, 0.2, 0.7, 0.2, 0.7]),
    # )
    # self.goal_space = Box(
    #     np.array([-0.2, 0.5, -0.2, 0.5, -0.2, 0.5]),
    #     np.array([0.2, 0.7, 0.2, 0.7, 0.2, 0.7]),
    # )

    self.reset()
    self.reset_mocap_welds()
def __init__(self,
             random_init=True,
             obs_type='with_goal',
             goal_low=(-0.04, 0.8, -0.08),
             goal_high=(0.04, 0.88, -0.08),
             rotMode='fixed',
             **kwargs):
    self.quick_init(locals())
    hand_low = (-0.5, 0.40, -0.15)
    hand_high = (0.5, 1, 0.5)
    obj_low = (-0.1, 0.6, 0.02)
    obj_high = (0.1, 0.7, 0.02)

    SawyerXYZEnv.__init__(self,
                          frame_skip=5,
                          action_scale=1. / 100,
                          hand_low=hand_low,
                          hand_high=hand_high,
                          model_name=self.model_name,
                          **kwargs)
    self.init_config = {
        'obj_init_pos': np.array([0, 0.6, 0.02]),
        'obj_init_angle': 0.3,
        'hand_init_pos': np.array([0, 0.6, 0.2], dtype=np.float32),
    }
    self.goal = np.array([0., 0.84, -0.08], dtype=np.float32)
    self.obj_init_pos = self.init_config['obj_init_pos']
    self.obj_init_angle = self.init_config['obj_init_angle']
    self.hand_init_pos = self.init_config['hand_init_pos']

    assert obs_type in OBS_TYPE
    self.obs_type = obs_type

    if goal_low is None:
        goal_low = self.hand_low
    if goal_high is None:
        goal_high = self.hand_high

    self.random_init = random_init
    self.max_path_length = 200
    self.rotMode = rotMode

    if rotMode == 'fixed':
        self.action_space = Box(
            np.array([-1, -1, -1, -1]),
            np.array([1, 1, 1, 1]),
        )
    elif rotMode == 'rotz':
        self.action_rot_scale = 1. / 50
        self.action_space = Box(
            np.array([-1, -1, -1, -np.pi, -1]),
            np.array([1, 1, 1, np.pi, 1]),
        )
    elif rotMode == 'quat':
        self.action_space = Box(
            np.array([-1, -1, -1, 0, -1, -1, -1, -1]),
            np.array([1, 1, 1, 2 * np.pi, 1, 1, 1, 1]),
        )
    else:
        self.action_space = Box(
            np.array([-1, -1, -1, -np.pi / 2, -np.pi / 2, 0, -1]),
            np.array([1, 1, 1, np.pi / 2, np.pi / 2, np.pi * 2, 1]),
        )
    self.obj_and_goal_space = Box(
        np.hstack((obj_low, goal_low)),
        np.hstack((obj_high, goal_high)),
    )
    self.goal_space = Box(np.array(goal_low), np.array(goal_high))
    if self.obs_type == 'plain':
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low)),
            np.hstack((self.hand_high, obj_high)),
        )
    elif self.obs_type == 'with_goal':
        self.observation_space = Box(
            np.hstack((self.hand_low, obj_low, goal_low)),
            np.hstack((self.hand_high, obj_high, goal_high)),
        )
    else:
        raise NotImplementedError
    self.reset()
parser.add_argument("--num-cpus", type=int, default=0)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)

    register_env("NestedSpaceRepeatAfterMeEnv",
                 lambda c: NestedSpaceRepeatAfterMeEnv(c))

    config = {
        "env": "NestedSpaceRepeatAfterMeEnv",
        "env_config": {
            "space": Dict({
                "a": Tuple(
                    [Dict({
                        "d": Box(-10.0, 10.0, ()),
                        "e": Discrete(2),
                    })]),
                "b": Box(-10.0, 10.0, (2,)),
                "c": Discrete(4),
            }),
        },
        "entropy_coeff": 0.00005,  # We don't want high entropy in this Env.
        "gamma": 0.0,  # No history in Env (bandit problem).
        "lr": 0.0005,
        "num_envs_per_worker": 20,
        "num_sgd_iter": 4,
        "num_workers": 0,
        "vf_loss_coeff": 0.01,
    }