def __init__(self):
    space = Box(0.0, 1.0, shape=(84, 84, 3), dtype=np.float32)
    super().__init__(space)
def __init__(self,
             obj_low=None,
             obj_high=None,
             reward_type='hand_and_obj_distance',
             indicator_threshold=0.06,
             obj_init_positions=((0, 0.6, 0.02), ),
             random_init=False,
             fix_goal=False,
             fixed_goal=(0.15, 0.6, 0.055, -0.15, 0.6),
             goal_low=None,
             goal_high=None,
             reset_free=False,
             hide_goal_markers=False,
             oracle_reset_prob=0.0,
             presampled_goals=None,
             num_goals_presampled=10,
             p_obj_in_hand=.75,
             **kwargs):
    self.quick_init(locals())
    MultitaskEnv.__init__(self)
    SawyerXYZEnv.__init__(self, model_name=self.model_name, **kwargs)
    if obj_low is None:
        obj_low = self.hand_low
    if obj_high is None:
        obj_high = self.hand_high
    self.obj_low = obj_low
    self.obj_high = obj_high
    if goal_low is None:
        goal_low = np.hstack((self.hand_low, obj_low))
    if goal_high is None:
        goal_high = np.hstack((self.hand_high, obj_high))
    self.reward_type = reward_type
    self.random_init = random_init
    self.p_obj_in_hand = p_obj_in_hand
    self.indicator_threshold = indicator_threshold
    self.obj_init_z = obj_init_positions[0][2]
    self.obj_init_positions = np.array(obj_init_positions)
    self.last_obj_pos = self.obj_init_positions[0]
    self.fix_goal = fix_goal
    self.fixed_goal = np.array(fixed_goal)
    self._state_goal = None
    self.reset_free = reset_free
    self.oracle_reset_prob = oracle_reset_prob
    self.hide_goal_markers = hide_goal_markers

    self.action_space = Box(np.array([-1, -1, -1, -1]),
                            np.array([1, 1, 1, 1]),
                            dtype=np.float32)
    self.hand_and_obj_space = Box(np.hstack((self.hand_low, obj_low)),
                                  np.hstack((self.hand_high, obj_high)),
                                  dtype=np.float32)
    self.hand_space = Box(self.hand_low, self.hand_high, dtype=np.float32)
    self.gripper_and_hand_and_obj_space = Box(
        np.hstack(([0.0], self.hand_low, obj_low)),
        np.hstack(([0.04], self.hand_high, obj_high)),
        dtype=np.float32)

    self.observation_space = Dict([
        ('observation', self.gripper_and_hand_and_obj_space),
        ('desired_goal', self.hand_and_obj_space),
        ('achieved_goal', self.hand_and_obj_space),
        ('state_observation', self.gripper_and_hand_and_obj_space),
        ('state_desired_goal', self.hand_and_obj_space),
        ('state_achieved_goal', self.hand_and_obj_space),
        ('proprio_observation', self.hand_space),
        ('proprio_desired_goal', self.hand_space),
        ('proprio_achieved_goal', self.hand_space),
    ])
    self.hand_reset_pos = np.array([0, .6, .2])

    if presampled_goals is not None:
        self._presampled_goals = presampled_goals
        self.num_goals_presampled = len(
            list(self._presampled_goals.values())[0])
    else:
        # presampled_goals will be created when sample_goal is first called
        self._presampled_goals = None
        self.num_goals_presampled = num_goals_presampled
    return args


if __name__ == "__main__":
    args = get_cli_args()

    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # main part: configure the ActionMaskEnv and ActionMaskModel
    config = {
        # random env with 100 discrete actions and 5x [-1,1] observations
        # some actions are declared invalid and lead to errors
        "env": ActionMaskEnv,
        "env_config": {
            "action_space": Discrete(100),
            "observation_space": Box(-1.0, 1.0, (5,)),
        },
        # the ActionMaskModel retrieves the invalid actions and avoids them
        "model": {
            "custom_model": ActionMaskModel
            if args.framework != "torch" else TorchActionMaskModel,
            # disable action masking according to CLI
            "custom_model_config": {"no_masking": args.no_masking},
        },
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
        # Run with tracing enabled for tfe/tf2?
        "eager_tracing": args.eager_tracing,
    }
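    # A minimal sketch of how this config could be launched with ray.tune,
    # assuming PPO and a fixed iteration budget; the actual script may use a
    # different algorithm or read the stop criteria from the CLI args.
    from ray import tune

    stop = {"training_iteration": 10}  # hypothetical stop criterion
    results = tune.run("PPO", config=config, stop=stop, verbose=1)
    ray.shutdown()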
def __init__(self, random_init=True, obs_type='plain', goal_low=None, goal_high=None, rotMode='fixed', **kwargs): self.quick_init(locals()) hand_low = (-0.5, 0.40, 0.05) hand_high = (0.5, 1, 0.5) obj_low = (-0.05, 0.85, 0.12) obj_high = (0.05, 0.9, 0.12) SawyerXYZEnv.__init__(self, frame_skip=5, action_scale=1. / 100, hand_low=hand_low, hand_high=hand_high, model_name=self.model_name, **kwargs) self.init_config = { 'obj_init_pos': np.array([0., 0.9, 0.12], dtype=np.float32), 'hand_init_pos': np.array([0, 0.6, 0.2], dtype=np.float32), } self.goal = np.array([0, 0.84, 0.12]) self.obj_init_pos = self.init_config['obj_init_pos'] self.hand_init_pos = self.init_config['hand_init_pos'] assert obs_type in OBS_TYPE self.obs_type = obs_type if goal_low is None: goal_low = self.hand_low if goal_high is None: goal_high = self.hand_high self.random_init = random_init self.max_path_length = 150 self.rotMode = rotMode if rotMode == 'fixed': self.action_space = Box( np.array([-1, -1, -1, -1]), np.array([1, 1, 1, 1]), ) elif rotMode == 'rotz': self.action_rot_scale = 1. / 50 self.action_space = Box( np.array([-1, -1, -1, -np.pi, -1]), np.array([1, 1, 1, np.pi, 1]), ) elif rotMode == 'quat': self.action_space = Box( np.array([-1, -1, -1, 0, -1, -1, -1, -1]), np.array([1, 1, 1, 2 * np.pi, 1, 1, 1, 1]), ) else: self.action_space = Box( np.array([-1, -1, -1, -np.pi / 2, -np.pi / 2, 0, -1]), np.array([1, 1, 1, np.pi / 2, np.pi / 2, np.pi * 2, 1]), ) self.obj_and_goal_space = Box( np.array(obj_low), np.array(obj_high), ) self.goal_space = Box(np.array(goal_low), np.array(goal_high)) if self.obs_type == 'plain': self.observation_space = Box( np.hstack(( self.hand_low, obj_low, )), np.hstack(( self.hand_high, obj_high, )), ) elif self.obs_type == 'with_goal': self.observation_space = Box( np.hstack((self.hand_low, obj_low, goal_low)), np.hstack((self.hand_high, obj_high, goal_high)), ) else: raise NotImplementedError self.reset()
def __init__(self, env, active_handles, names, map_size, max_cycles, reward_range, minimap_mode, extra_features): self.map_size = map_size self.max_cycles = max_cycles self.minimap_mode = minimap_mode self.extra_features = extra_features self.env = env self.handles = active_handles self._all_handles = self.env.get_handles() env.reset() self.generate_map() self.team_sizes = team_sizes = [ env.get_num(handle) for handle in self.handles ] self.agents = [ f"{names[j]}_{i}" for j in range(len(team_sizes)) for i in range(team_sizes[j]) ] self.possible_agents = self.agents[:] num_actions = [ env.get_action_space(handle)[0] for handle in self.handles ] action_spaces_list = [ Discrete(num_actions[j]) for j in range(len(team_sizes)) for i in range(team_sizes[j]) ] # may change depending on environment config? Not sure. team_obs_shapes = self._calc_obs_shapes() state_shape = self._calc_state_shape() observation_space_list = [ Box(low=0., high=2., shape=team_obs_shapes[j], dtype=np.float32) for j in range(len(team_sizes)) for i in range(team_sizes[j]) ] self.state_space = Box(low=0., high=2., shape=state_shape, dtype=np.float32) reward_low, reward_high = reward_range if extra_features: for space in observation_space_list: idx = space.shape[2] - 3 if minimap_mode else space.shape[2] - 1 space.low[:, :, idx] = reward_low space.high[:, :, idx] = reward_high idx_state = self.state_space.shape[ 2] - 3 if minimap_mode else self.state_space.shape[2] - 1 self.state_space.low[:, :, idx_state] = reward_low self.state_space.high[:, :, idx_state] = reward_high self.action_spaces = { agent: space for agent, space in zip(self.agents, action_spaces_list) } self.observation_spaces = { agent: space for agent, space in zip(self.agents, observation_space_list) } self._zero_obs = { agent: np.zeros_like(space.low) for agent, space in self.observation_spaces.items() } self.base_state = np.zeros(self.state_space.shape) walls = self.env._get_walls_info() wall_x, wall_y = zip(*walls) self.base_state[wall_x, wall_y, 0] = 1 self._renderer = None self.frames = 0
def __init__(self, config):
    self.action_space = Box(0.0, 1.0, (1, ))
    self.observation_space = Box(0.0, 1.0, (1, ))
    self.max_steps = config.get("max_steps", 100)
    self.state = None
    self.steps = None
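# Hedged sketch of matching reset/step methods, assuming the episode simply
# samples a fresh state each step and terminates after max_steps; the original
# environment's dynamics and reward are not shown above and may differ.
def reset(self):
    self.steps = 0
    self.state = self.observation_space.sample()
    return self.state

def step(self, action):
    self.steps += 1
    self.state = self.observation_space.sample()
    reward = 0.0  # placeholder; the real reward function is not shown
    done = self.steps >= self.max_steps
    return self.state, reward, done, {}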
def __init__(self, env, training=True):
    super().__init__(env)
    H, W, C = self.env.render('rgb_array').shape
    self.observation_space = Box(0, 255, (H, W, C), dtype=np.uint8)
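# Hedged sketch of the corresponding observation hook, assuming this is a
# gym.ObservationWrapper that replaces the native observation with the rendered
# RGB frame matching the Box space declared above.
def observation(self, observation):
    frame = self.env.render('rgb_array')
    return frame.astype(np.uint8)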
import numpy as np
import unittest

from gym.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple

import ray
from ray.rllib.agents.registry import get_trainer_class
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNetV2
from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNetV2
from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVisionNetV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNetV2
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.test_utils import framework_iterator

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "vector2": Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "int_actions": Box(0, 3, (2, 3), dtype=np.int32),
    "multidiscrete": MultiDiscrete([1, 2, 3, 4]),
    "tuple": Tuple(
        [Discrete(2), Discrete(3), Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
    "dict": Dict({
        "action_choice": Discrete(3),
        "parameters": Box(-1.0, 1.0, (1, ), dtype=np.float32),
        "yet_another_nested_dict": Dict({
            "a": Tuple([Discrete(2), Discrete(3)])
        })
    }),
}
def observation_space(self):
    return Box(-np.ones(2 * self.agent_num),
               np.ones(2 * self.agent_num),
               dtype=np.float32)
def __init__(self, env):
    super().__init__(env)
    assert isinstance(env.observation_space, Discrete)
    self.observation_space = Box(0.0, 1.0, (env.observation_space.n,),
                                 dtype=np.float32)
""" import sys sys.path.append('../rt_erg_lib') ########################################### # basic function test import numpy as np import numpy.random as npr from basis import Basis from gym.spaces import Box # define the exploration space as gym.Box explr_space = Box(np.array([0.0, 0.0]), np.array([1.0, 1.0]), dtype=np.float32) # define the basis object basis = Basis(explr_space=explr_space, num_basis=5) # simulate/randomize a trajectory xt = [explr_space.sample() for _ in range(10)] # print indices for all basis functions print('indices for all basis functions: ') print(basis.k) # amount is square of num_basis # test basis function, the input is a pose print(basis.fk(xt[0])) # test derivative of basis function wrt a pose print(basis.dfk(xt[0])) # hk, even computed in the source code, is not # used in the end, so we temporarily ignore it ###########################################
class SubepisodedReferenceGenerator(ReferenceGenerator): """ Base Class for Reference Generators, which change its parameters in certain ranges after a random number of time steps and can pre-calculate their references in these "sub episodes". """ reference_space = Box(-1, 1, shape=(1, )) _reference = None def __init__(self, reference_state='omega', episode_lengths=(500, 2000), limit_margin=None, *_, **__): """ Args: reference_state(str): Name of the state that this reference generator is referencing. episode_lengths(Tuple(int,int)): Minimum and maximum length of a sub episode. limit_margin(Tuple(float,float)/float/None): Factor, how close the references should get to the limits. If a tuple is passed, then the lower[0] and upper[1] margin might differ. If a float is passed, both margins are equal. If None(default), the limit margin equals (nominal values/limits). In general, the limit margin should not exceed (-1, 1) """ super().__init__() self._limit_margin = limit_margin self._reference_value = 0.0 self._reference_state = reference_state.lower() self._episode_len_range = episode_lengths self._current_episode_length = int( self._get_current_value(episode_lengths)) self._k = 0 def set_modules(self, physical_system): super().set_modules(physical_system) self._referenced_states = set_state_array({ self._reference_state: 1 }, physical_system.state_names).astype(bool) rs = self._referenced_states ps = physical_system if self._limit_margin is None: upper_margin = (ps.nominal_state[rs] / ps.limits[rs])[0] * ps.state_space.high[rs] lower_margin = (ps.nominal_state[rs] / ps.limits[rs])[0] * ps.state_space.low[rs] self._limit_margin = lower_margin[0], upper_margin[0] elif type(self._limit_margin) in [float, int]: upper_margin = self._limit_margin * ps.state_space.high[rs] lower_margin = self._limit_margin * ps.state_space.low[rs] self._limit_margin = lower_margin[0], upper_margin[0] elif type(self._limit_margin) is tuple: lower_margin = self._limit_margin[0] * ps.state_space.low[rs] upper_margin = self._limit_margin[1] * ps.state_space.high[rs] self._limit_margin = lower_margin[0], upper_margin[0] else: raise Exception('Unknown type for the limit margin.') self.reference_space = Box(lower_margin[0], upper_margin[0], shape=(1, )) def reset(self, initial_state=None, initial_reference=None): """ The references are reset. If an initial reference is passed, this value will be the first reference value of the next episode. Otherwise it will be 0. Args: initial_state(ndarray(float)): The initial state of the physical system. initial_reference(ndarray(float)): (Optional) The first reference value. Returns: initial_reference(ndarray(float)): initial reference array. initial_reference_observation(element of reference_space): An initial observation of the next reference. trajectory(None): No initial trajectory is passed. 
""" if initial_reference is not None: self._reference_value = initial_reference[ self._referenced_states][0] else: self._reference_value = 0.0 self._current_episode_length = -1 return super().reset(initial_state) def get_reference(self, *_, **__): reference = np.zeros_like(self._referenced_states, dtype=float) reference[self._referenced_states] = self._reference_value return reference def get_reference_observation(self, *_, **__): if self._k >= self._current_episode_length: self._k = 0 self._current_episode_length = int( self._get_current_value(self._episode_len_range)) self._reset_reference() self._reference_value = self._reference[self._k] self._k += 1 return np.array([self._reference_value]) def _reset_reference(self): """ Subclasses implement in this method its generation of the references for the next self._current_episode_length time steps and write it into self._reference. """ raise NotImplementedError @staticmethod def _get_current_value(value_range): """ Return a uniform distributed value for the next sub episode. If float or int is passed this value will be returned. Otherwise a uniform distributed value between value_range[0] and value_range[1] is returned. """ if type(value_range) in [int, float]: return value_range elif type(value_range) in [list, tuple, np.ndarray]: return (value_range[1] - value_range[0]) * np.random.rand() + value_range[0]
def init_observation_space(self):
    arm = dict()
    arm['joint_pos_list'] = Box(low=-np.inf, high=np.inf,
                                shape=(self.arm_dof_num, 3), dtype=np.float32)
    arm['joint_vel_list'] = Box(low=-np.inf, high=np.inf,
                                shape=(self.arm_dof_num, 3), dtype=np.float32)
    arm['link_glb_pos_list'] = Box(low=-np.inf, high=np.inf,
                                   shape=(self.arm_link_num, 3), dtype=np.float32)
    arm['link_glb_orn_list'] = Box(low=-np.inf, high=np.inf,
                                   shape=(self.arm_link_num, 4), dtype=np.float32)
    arm['link_loc_pos_list'] = Box(low=-np.inf, high=np.inf,
                                   shape=(self.arm_link_num, 3), dtype=np.float32)
    arm['link_loc_orn_list'] = Box(low=-np.inf, high=np.inf,
                                   shape=(self.arm_link_num, 4), dtype=np.float32)

    gripper = dict()
    gripper['joint_pos_list'] = Box(low=-np.inf, high=np.inf,
                                    shape=(self.gripper_dof_num, 3), dtype=np.float32)
    gripper['joint_vel_list'] = Box(low=-np.inf, high=np.inf,
                                    shape=(self.gripper_dof_num, 3), dtype=np.float32)
    gripper['link_glb_pos_list'] = Box(low=-np.inf, high=np.inf,
                                       shape=(self.gripper_link_num, 3), dtype=np.float32)
    gripper['link_glb_orn_list'] = Box(low=-np.inf, high=np.inf,
                                       shape=(self.gripper_link_num, 4), dtype=np.float32)
    gripper['link_loc_pos_list'] = Box(low=-np.inf, high=np.inf,
                                       shape=(self.gripper_link_num, 3), dtype=np.float32)
    gripper['link_loc_orn_list'] = Box(low=-np.inf, high=np.inf,
                                       shape=(self.gripper_link_num, 4), dtype=np.float32)

    super().observation['arm'] = arm
    super().observation['gripper'] = gripper
def __init__(self):
    self.observation_space = Tuple(
        [Discrete(5), Box(0, 5, shape=(3, ), dtype=np.float32)])
def __init__(self, obs_space, action_space, num_outputs, model_config, name): # TODO: (sven) Support Dicts as well. self.original_space = obs_space.original_space if \ hasattr(obs_space, "original_space") else obs_space assert isinstance(self.original_space, (Tuple)), \ "`obs_space.original_space` must be Tuple!" super().__init__(self.original_space, action_space, num_outputs, model_config, name) self.new_obs_space = obs_space # Build the CNN(s) given obs_space's image components. self.cnns = {} self.one_hot = {} self.flatten = {} concat_size_p, concat_size_v = 0, 0 for i, component in enumerate(self.original_space): # Image space. if len(component.shape) == 3: config = { "conv_filters": model_config.get( "conv_filters"), "conv_activation": model_config.get("conv_activation"), "post_fcnet_hiddens": [], } cnn = CustomVisionNetwork(component, action_space, None, config, "cnn_{}".format(i)) ''' cnn = ModelCatalog.get_model_v2( component, action_space, num_outputs=None, model_config=config, framework="tf", name="cnn_{}".format(i)) ''' cnn.base_model.summary() concat_size_p += cnn.num_outputs_p concat_size_v += cnn.num_outputs_v self.cnns[i] = cnn # Discrete inputs -> One-hot encode. elif isinstance(component, Discrete): self.one_hot[i] = True concat_size_p += component.n concat_size_v += component.n # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers). # Everything else (1D Box). else: self.flatten[i] = int(np.product(component.shape)) concat_size_p += self.flatten[i] concat_size_v += self.flatten[i] # Optional post-concat FC-stack. post_fc_stack_config = { "fcnet_hiddens": model_config.get("post_fcnet_hiddens", []), "fcnet_activation": model_config.get("post_fcnet_activation", "relu"), "vf_share_layers": 'True' } self.post_fc_stack = ModelCatalog.get_model_v2( Box(float("-inf"), float("inf"), shape=(concat_size_p,), dtype=np.float32), self.action_space, None, post_fc_stack_config, framework="tf", name="post_fc_stack") self.post_fc_stack_vf = ModelCatalog.get_model_v2( Box(float("-inf"), float("inf"), shape=(concat_size_v,), dtype=np.float32), self.action_space, None, post_fc_stack_config, framework="tf", name="post_fc_stack_vf") self.post_fc_stack.base_model.summary() self.post_fc_stack_vf.base_model.summary() # Actions and value heads. self.logits_and_value_model = None self._value_out = None if num_outputs: # Action-distribution head. p_layer = tf.keras.layers.Input( (self.post_fc_stack.num_outputs,)) v_layer = tf.keras.layers.Input( (self.post_fc_stack_vf.num_outputs,)) logits_layer = tf.keras.layers.Dense( num_outputs, activation=tf.keras.activations.linear, name="logits")(p_layer) # Create the value branch model. value_layer = tf.keras.layers.Dense( 1, name="value_out", activation=tf.keras.activations.tanh, kernel_initializer=normc_initializer(0.01))(v_layer) self.logits_model = tf.keras.models.Model( p_layer, [logits_layer]) self.value_model = tf.keras.models.Model( v_layer, [value_layer] ) self.logits_model.summary() self.value_model.summary() else: self.num_outputs = self.post_fc_stack.num_outputs
def action_space(self):
    return Box(-np.ones(1), np.ones(1), dtype=np.float32)
def __init__( self, random_init=False, task_types=['pick_place', 'reach', 'push'], task_type='pick_place', obs_type='plain', goal_low=(-0.1, 0.8, 0.05), goal_high=(0.1, 0.9, 0.3), liftThresh = 0.04, sampleMode='equal', rewMode = 'orig', rotMode='fixed',#'fixed', **kwargs ): self.quick_init(locals()) hand_low=(-0.5, 0.40, 0.05) hand_high=(0.5, 1, 0.5) obj_low=(-0.1, 0.6, 0.02) obj_high=(0.1, 0.7, 0.02) SawyerXYZEnv.__init__( self, frame_skip=5, action_scale=1./100, hand_low=hand_low, hand_high=hand_high, model_name=self.model_name, **kwargs ) self.task_type = task_type self.init_config = { 'obj_init_angle': .3, 'obj_init_pos': np.array([0, 0.6, 0.02]), 'hand_init_pos': np.array([0, .6, .2]), } # we only do one task from [pick_place, reach, push] # per instance of SawyerReachPushPickPlaceEnv. # Please only set task_type from constructor. if self.task_type == 'pick_place': self.goal = np.array([0.1, 0.8, 0.2]) elif self.task_type == 'reach': self.goal = np.array([-0.1, 0.8, 0.2]) elif self.task_type == 'push': self.goal = np.array([0.1, 0.8, 0.02]) else: raise NotImplementedError self.obj_init_angle = self.init_config['obj_init_angle'] self.obj_init_pos = self.init_config['obj_init_pos'] self.hand_init_pos = self.init_config['hand_init_pos'] assert obs_type in OBS_TYPE self.obs_type = obs_type if goal_low is None: goal_low = self.hand_low if goal_high is None: goal_high = self.hand_high self.random_init = random_init self.liftThresh = liftThresh self.max_path_length = 150 self.rewMode = rewMode self.rotMode = rotMode self.sampleMode = sampleMode self.task_types = task_types if rotMode == 'fixed': self.action_space = Box( np.array([-1, -1, -1, -1]), np.array([1, 1, 1, 1]), ) elif rotMode == 'rotz': self.action_rot_scale = 1./50 self.action_space = Box( np.array([-1, -1, -1, -np.pi, -1]), np.array([1, 1, 1, np.pi, 1]), ) elif rotMode == 'quat': self.action_space = Box( np.array([-1, -1, -1, 0, -1, -1, -1, -1]), np.array([1, 1, 1, 2*np.pi, 1, 1, 1, 1]), ) else: self.action_space = Box( np.array([-1, -1, -1, -np.pi/2, -np.pi/2, 0, -1]), np.array([1, 1, 1, np.pi/2, np.pi/2, np.pi*2, 1]), ) self.obj_and_goal_space = Box( np.hstack((obj_low, goal_low)), np.hstack((obj_high, goal_high)), ) self.goal_space = Box(np.array(goal_low), np.array(goal_high)) if self.obs_type == 'plain': self.observation_space = Box( np.hstack((self.hand_low, obj_low,)), np.hstack((self.hand_high, obj_high,)), ) elif self.obs_type == 'with_goal': self.observation_space = Box( np.hstack((self.hand_low, obj_low, goal_low)), np.hstack((self.hand_high, obj_high, goal_high)), ) else: raise NotImplementedError('If you want to use an observation\ with_obs_idx, please discretize the goal space after instantiate an environment.') self.num_resets = 0 self.reset()
def __init__(self, random_init=False, obs_type='plain', goal_low=(0., 0.85, 0.05), goal_high=(0.3, 0.9, 0.05), liftThresh=0.09, rotMode='fixed', rewMode='orig', **kwargs): self.quick_init(locals()) hand_low = (-0.5, 0.40, 0.05) hand_high = (0.5, 1, 0.5) obj_low = (-0.1, 0.5, 0.02) obj_high = (0.1, 0.6, 0.02) SawyerXYZEnv.__init__(self, frame_skip=5, action_scale=1. / 100, hand_low=hand_low, hand_high=hand_high, model_name=self.model_name, **kwargs) # TODO should we put this to goal instead of initial config? self.init_config = { 'hammer_init_pos': np.array([0, 0.6, 0.02]), 'hand_init_pos': np.array([0, 0.6, 0.2]), } self.goal = self.init_config['hammer_init_pos'] # TODO: check this self.hammer_init_pos = self.init_config['hammer_init_pos'] self.hand_init_pos = self.init_config['hand_init_pos'] if goal_low is None: goal_low = self.hand_low if goal_high is None: goal_high = self.hand_high assert obs_type in OBS_TYPE self.obs_type = obs_type self.random_init = random_init self.liftThresh = liftThresh self.max_path_length = 200 self.rewMode = rewMode self.rotMode = rotMode if rotMode == 'fixed': self.action_space = Box( np.array([-1, -1, -1, -1]), np.array([1, 1, 1, 1]), ) elif rotMode == 'rotz': self.action_rot_scale = 1. / 50 self.action_space = Box( np.array([-1, -1, -1, -np.pi, -1]), np.array([1, 1, 1, np.pi, 1]), ) elif rotMode == 'quat': self.action_space = Box( np.array([-1, -1, -1, 0, -1, -1, -1, -1]), np.array([1, 1, 1, 2 * np.pi, 1, 1, 1, 1]), ) else: self.action_space = Box( np.array([-1, -1, -1, -np.pi / 2, -np.pi / 2, 0, -1]), np.array([1, 1, 1, np.pi / 2, np.pi / 2, np.pi * 2, 1]), ) self.obj_and_goal_space = Box( np.hstack((obj_low, goal_low)), np.hstack((obj_high, goal_high)), ) self.goal_space = Box(np.array(goal_low), np.array(goal_high)) if self.obs_type == 'plain': self.observation_space = Box( np.hstack(( self.hand_low, obj_low, )), np.hstack(( self.hand_high, obj_high, )), ) elif self.obs_type == 'with_goal': self.observation_space = Box( np.hstack((self.hand_low, obj_low, goal_low)), np.hstack((self.hand_high, obj_high, goal_high)), ) else: raise NotImplementedError self.reset()
class CityFlowEnvRay(MultiAgentEnv): """ multi intersection cityflow environment, for the Ray framework """ observation_space = Box(0.0 * np.ones((29, )), 150 * np.ones((29, ))) action_space = Discrete(8) # num of agents config = json.load( open( '/home/skylark/PycharmRemote/Gamma-Reward-Perfect/config/global_config.json' )) cityflow_config = json.load(open(config['cityflow_config_file'])) roadnetFile = cityflow_config['dir'] + cityflow_config['roadnetFile'] config["lane_phase_info"] = parse_roadnet(roadnetFile) intersection_id = list(config['lane_phase_info'].keys()) def __init__(self, env_config): config = json.load( open( '/home/skylark/PycharmRemote/Gamma-Reward-Perfect/config/global_config.json' )) cityflow_config = json.load(open(config['cityflow_config_file'])) self.roadnetFile = cityflow_config['dir'] + cityflow_config[ 'roadnetFile'] self.record_dir = '/home/skylark/PycharmRemote/Gamma-Reward-Perfect/record/' + env_config[ "Name"] if not os.path.exists(self.record_dir): os.makedirs(self.record_dir) self.dic_traffic_env_conf = { 'ADJACENCY_BY_CONNECTION_OR_GEO': False, 'TOP_K_ADJACENCY': 5 } self.dic_lane_waiting_vehicle_count_previous_step = {} self.dic_vehicle_speed_previous_step = {} self.dic_vehicle_distance_previous_step = {} self.dic_lane_vehicle_previous_step = {} self.traffic_light_node_dict = self._adjacency_extraction() self.dic_lane_vehicle_current_step = {} self.dic_lane_waiting_vehicle_count_current_step = {} self.dic_vehicle_speed_current_step = {} self.dic_vehicle_distance_current_step = {} self.list_lane_vehicle_previous_step = {} self.list_lane_vehicle_current_step = {} self.dic_vehicle_arrive_leave_time = {} config["lane_phase_info"] = parse_roadnet(self.roadnetFile) intersection_id = list(config['lane_phase_info'].keys()) self.Gamma_Reward = env_config["Gamma_Reward"] self.threshold = env_config["threshold"] self.min_action_time = env_config["MIN_ACTION_TIME"] self.road_sort = env_config['road_sort'] self.eng = cityflow.Engine(config['cityflow_config_file'], thread_num=config["thread_num"]) # self.eng = config["eng"][0] self.num_step = 3600 self.intersection_id = intersection_id # list, [intersection_id, ...] 
self.num_agents = len(self.intersection_id) self.state_size = None self.lane_phase_info = config["lane_phase_info"] # "intersection_1_1" # self.score = [] # self.score_file = './utils/score_' + str(datetime.datetime.now()) + '.csv' self.current_phase = {} self.current_phase_time = {} self.start_lane = {} self.end_lane = {} self.phase_list = {} self.phase_startLane_mapping = {} self.intersection_lane_mapping = {} # {id_:[lanes]} self.empty = {} self.dic_num_id_inter = {} num_id = 0 for id_ in self.intersection_id: self.start_lane[id_] = self.lane_phase_info[id_]['start_lane'] self.end_lane[id_] = self.lane_phase_info[id_]['end_lane'] self.phase_startLane_mapping[id_] = self.lane_phase_info[id_][ "phase_startLane_mapping"] self.phase_list[id_] = self.lane_phase_info[id_]["phase"] self.current_phase[id_] = self.phase_list[id_][0] self.current_phase_time[id_] = 0 self.dic_lane_vehicle_current_step[id_] = {} self.dic_lane_waiting_vehicle_count_current_step[id_] = {} self.dic_vehicle_arrive_leave_time[id_] = {} self.empty[id_] = {} self.dic_num_id_inter[num_id] = id_ num_id += 1 self.reset_count = 0 self.get_state() # set self.state_size self.num_actions = len(self.phase_list[self.intersection_id[0]]) self.count = 0 self.congestion_count = 0 # self.done = False self.congestion = False self.iteration_count = [] self.reset() def reset(self): print("\033[1;34m=================================\033[0m") print("\033[1;34mreset_count: {0}, iteration: {1}\033[0m".format( self.reset_count, self.count)) # self.iteration_count.append(self.count) # if self.reset_count >= 102: # df = pd.DataFrame(self.iteration_count) # df.to_csv(os.path.join(self.record_dir, 'iteration_count.csv')) if not operator.eq(self.dic_vehicle_arrive_leave_time, self.empty): path_to_log = self.record_dir + '/train_results/episode_{0}/'.format( self.reset_count) if not os.path.exists(path_to_log): os.makedirs(path_to_log) self.log(path_to_log) print("Log is saved !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") print(path_to_log) self.eng.reset() self.dic_vehicle_arrive_leave_time = copy.deepcopy(self.empty) self.dic_lane_vehicle_current_step = copy.deepcopy(self.empty) self.dic_lane_waiting_vehicle_count_current_step = copy.deepcopy( self.empty) # self.traffic_light_node_dict = self._adjacency_extraction() # self.done = False self.congestion = False self.count = 0 self.congestion_count = 0 self.reset_count += 1 return { id_: np.zeros((self.state_size, )) for id_ in self.intersection_id } def step(self, action): """ Calculate the state in time """ step_start_time = time.time() for i in range(self.min_action_time): self._inner_step(action) state = self.get_state() reward = self.get_raw_reward() # 判断是否已经出现拥堵 self.congestion = self.compute_congestion() self.done = {id_: False for id_ in self.intersection_id} self.done['__all__'] = False # if self.count >= self.num_step: # self.done = {id_: True for id_ in self.intersection_id} # self.done['__all__'] = True # if self.count == 3600: # self.reset() return state, reward, self.done, {} def _inner_step(self, action): self.update_previous_measurements() for id_, a in action.items(): # intersection_id, corresponding action if self.current_phase[id_] == self.phase_list[id_][a]: self.current_phase_time[id_] += 1 else: self.current_phase[id_] = self.phase_list[id_][a] self.current_phase_time[id_] = 1 self.eng.set_tl_phase( id_, self.current_phase[id_]) # set phase of traffic light self.eng.next_step() self.count += 1 # print(self.count) self.system_states = { "get_lane_vehicles": self.eng.get_lane_vehicles(), 
"get_lane_waiting_vehicle_count": self.eng.get_lane_waiting_vehicle_count(), "get_vehicle_speed": None, "get_vehicle_distance": None } for id_ in self.intersection_id: self.update_current_measurements_map(id_, self.system_states) def update_previous_measurements(self): self.dic_lane_vehicle_previous_step = copy.deepcopy( self.dic_lane_vehicle_current_step) self.dic_lane_waiting_vehicle_count_previous_step = copy.deepcopy( self.dic_lane_waiting_vehicle_count_current_step) self.dic_vehicle_speed_previous_step = copy.deepcopy( self.dic_vehicle_speed_current_step) self.dic_vehicle_distance_previous_step = copy.deepcopy( self.dic_vehicle_distance_current_step) def update_current_measurements_map(self, id_, simulator_state): ## need change, debug in seeing format def _change_lane_vehicle_dic_to_list(dic_lane_vehicle): list_lane_vehicle = [] for value in dic_lane_vehicle.values(): list_lane_vehicle.extend(value) return list_lane_vehicle for lane in self.lane_phase_info[id_]['start_lane']: self.dic_lane_vehicle_current_step[id_][lane] = simulator_state[ "get_lane_vehicles"][lane] self.dic_lane_waiting_vehicle_count_current_step[id_][lane] = \ simulator_state["get_lane_waiting_vehicle_count"][ lane] for lane in self.lane_phase_info[id_]['end_lane']: self.dic_lane_vehicle_current_step[id_][lane] = simulator_state[ "get_lane_vehicles"][lane] self.dic_lane_waiting_vehicle_count_current_step[id_][lane] = \ simulator_state["get_lane_waiting_vehicle_count"][ lane] self.dic_vehicle_speed_current_step[id_] = simulator_state[ 'get_vehicle_speed'] self.dic_vehicle_distance_current_step[id_] = simulator_state[ 'get_vehicle_distance'] # get vehicle list self.list_lane_vehicle_current_step[ id_] = _change_lane_vehicle_dic_to_list( self.dic_lane_vehicle_current_step[id_]) self.list_lane_vehicle_previous_step[ id_] = _change_lane_vehicle_dic_to_list( self.dic_lane_vehicle_previous_step[id_]) list_vehicle_new_arrive = list( set(self.list_lane_vehicle_current_step[id_]) - set(self.list_lane_vehicle_previous_step[id_])) # if id_ == 'intersection_6_1': # print('list_lane_vehicle_current_step: ' + str(self.list_lane_vehicle_current_step)) # print('list_lane_vehicle_previous_step: ' + str(self.list_lane_vehicle_previous_step)) list_vehicle_new_left = list( set(self.list_lane_vehicle_previous_step[id_]) - set(self.list_lane_vehicle_current_step[id_])) list_vehicle_new_left_entering_lane_by_lane = self._update_leave_entering_approach_vehicle( id_) list_vehicle_new_left_entering_lane = [] for l in list_vehicle_new_left_entering_lane_by_lane: list_vehicle_new_left_entering_lane += l # print('list_vehicle_new_arrive' + str(list_vehicle_new_arrive)) # print('list_vehicle_new_left_entering_lane' + str(list_vehicle_new_left_entering_lane)) # update vehicle arrive and left time self._update_arrive_time(id_, list_vehicle_new_arrive) self._update_left_time(id_, list_vehicle_new_left_entering_lane) # update vehicle minimum speed in history, # to be implemented # self._update_vehicle_min_speed() # update feature # self._update_feature_map(simulator_state) def compute_congestion(self): index = False intersection_info = {} for id_ in self.intersection_id: intersection_info[id_] = self.intersection_info(id_) congestion = {id_: False for id_ in self.intersection_id} for id_ in self.intersection_id: if np.max( list(intersection_info[id_] ["start_lane_waiting_vehicle_count"].values()) ) > self.threshold: congestion[id_] = True index = True return index def get_state(self): state = {id_: self._get_state(id_) for id_ in 
self.intersection_id} # self.score.append(self.get_score()) # if self.reset_count >= 100: # ''' # TODO: save fn may be a bug # ''' # score = pd.DataFrame(self.score) # score.to_csv(self.score_file) return state def _get_state(self, id_): state = self.intersection_info(id_) if self.Gamma_Reward: #### dw #### keys = state['end_lane_vehicle_count'].keys() start_index = id_.find('_') s0 = 'road' + id_[start_index:start_index + 4] + '_0' # To East s1 = 'road' + id_[start_index:start_index + 4] + '_1' # To North s2 = 'road' + id_[start_index:start_index + 4] + '_2' # To West s3 = 'road' + id_[start_index:start_index + 4] + '_3' # To South num_w_e = 0 num_e_w = 0 num_n_s = 0 num_s_n = 0 for i in keys: if i.startswith(s0): num_w_e += state['end_lane_vehicle_count'][i] elif i.startswith(s1): num_n_s += state['end_lane_vehicle_count'][i] elif i.startswith(s2): num_e_w += state['end_lane_vehicle_count'][i] elif i.startswith(s3): num_s_n += state['end_lane_vehicle_count'][i] end_lane_dict = { s0: num_w_e, s1: num_n_s, s2: num_e_w, s3: num_s_n } end_lane_sorted_keys = sorted(end_lane_dict.keys()) state_dict_waiting = state['start_lane_waiting_vehicle_count'] state_dict = state['start_lane_vehicle_count'] # 12-dim start lanes car + 12-dim start lanes waiting car number + current phase + 4-dim end_lanes # waiting car number return_state = [state_dict[key] for key in self.road_sort[id_]] + [ state_dict_waiting[key] for key in self.road_sort[id_] ] + [state['current_phase'] ] + [end_lane_dict[key] for key in end_lane_sorted_keys] # return_state = [state_dict[key] for key in sorted_keys] + [state['current_phase']] + \ # [0, 0, 0, 0] else: state_dict = state['start_lane_waiting_vehicle_count'] return_state = [state_dict[key] for key in self.road_sort[id_]] + [0] * 12 + [state['current_phase']] + \ [0, 0, 0, 0] return self.preprocess_state(return_state) def intersection_info(self, id_): """ info of intersection 'id_' """ state = {} get_lane_vehicle_count = self.eng.get_lane_vehicle_count() get_lane_waiting_vehicle_count = self.eng.get_lane_waiting_vehicle_count( ) # get_lane_vehicles = self.eng.get_lane_vehicles() # get_vehicle_speed = self.eng.get_vehicle_speed() state['start_lane_vehicle_count'] = { lane: get_lane_vehicle_count[lane] for lane in self.start_lane[id_] } state['end_lane_vehicle_count'] = { lane: get_lane_vehicle_count[lane] for lane in self.end_lane[id_] } # state['lane_vehicle_count'] = state['start_lane_vehicle_count'].copy() # state['lane_vehicle_count'].update(state['end_lane_vehicle_count'].items()) state['start_lane_waiting_vehicle_count'] = { lane: get_lane_waiting_vehicle_count[lane] for lane in self.start_lane[id_] } # state['end_lane_waiting_vehicle_count'] = {lane: get_lane_waiting_vehicle_count[lane] for lane in # self.end_lane[id_]} # # state['start_lane_vehicles'] = {lane: get_lane_vehicles[lane] for lane in self.start_lane[id_]} # state['end_lane_vehicles'] = {lane: get_lane_vehicles[lane] for lane in self.end_lane[id_]} # # state['start_lane_speed'] = { # lane: np.sum(list(map(lambda vehicle: get_vehicle_speed[vehicle], get_lane_vehicles[lane]))) / ( # get_lane_vehicle_count[lane] + 1e-5) for lane in # self.start_lane[id_]} # compute start lane mean speed # state['end_lane_speed'] = { # lane: np.sum(list(map(lambda vehicle: get_vehicle_speed[vehicle], get_lane_vehicles[lane]))) / ( # get_lane_vehicle_count[lane] + 1e-5) for lane in # self.end_lane[id_]} # compute end lane mean speed state['current_phase'] = self.current_phase[id_] # state['current_phase_time'] = 
self.current_phase_time[id_] state['adjacency_matrix'] = self.traffic_light_node_dict[id_][ 'adjacency_row'] return state def preprocess_state(self, state): return_state = np.array(state) if self.state_size is None: self.state_size = len(return_state.flatten()) return_state = np.reshape(np.array(return_state), [1, self.state_size]).flatten() return return_state def get_raw_reward(self): reward = { id_: self._get_raw_reward(id_) for id_ in self.intersection_id } # mean_global_sum = np.mean(list(reward.values())) return reward def _get_raw_reward(self, id_): # every agent/intersection's reward state = self.intersection_info(id_) # r = max(list(state['start_lane_vehicle_count'].values())) r = max(list(state['start_lane_waiting_vehicle_count'].values())) return -r # def get_score(self): # score = {id_: self._get_score(id_) for id_ in self.intersection_id} # score = self.dict_Avg(score) # return score # def _get_score(self, id_): # state = self.intersection_info(id_) # start_lane_speed = state['start_lane_speed'] # end_lane_speed = state['end_lane_speed'] # score = (self.dict_Avg(start_lane_speed) + self.dict_Avg(end_lane_speed)) / 2 # # score = (1 / (1 + np.exp(-1 * x))) / self.num_step # return score def sigmoid(self, x): return 1 / (1 + math.exp(-x)) def dict_Avg(self, Dict): Len = len(Dict) # 取字典中键值对的个数 Sum = sum(Dict.values()) # 取字典中键对应值的总和 Avg = Sum / Len return Avg def _adjacency_extraction(self): traffic_light_node_dict = {} file = self.roadnetFile with open('{0}'.format(file)) as json_data: net = json.load(json_data) # print(net) # build the info dict for intersections for inter in net['intersections']: if not inter['virtual']: traffic_light_node_dict[inter['id']] = { 'location': { 'x': float(inter['point']['x']), 'y': float(inter['point']['y']) }, "total_inter_num": None, 'adjacency_row': None, "inter_id_to_index": None, "neighbor_ENWS": None, "entering_lane_ENWS": None, } top_k = self.dic_traffic_env_conf["TOP_K_ADJACENCY"] total_inter_num = len(traffic_light_node_dict.keys()) inter_id_to_index = {} edge_id_dict = {} for road in net['roads']: if road['id'] not in edge_id_dict.keys(): edge_id_dict[road['id']] = {} edge_id_dict[road['id']]['from'] = road['startIntersection'] edge_id_dict[road['id']]['to'] = road['endIntersection'] edge_id_dict[road['id']]['num_of_lane'] = len(road['lanes']) edge_id_dict[road['id']]['length'] = np.sqrt( np.square(pd.DataFrame(road['points'])).sum(axis=1)).sum() index = 0 for i in traffic_light_node_dict.keys(): inter_id_to_index[i] = index index += 1 for i in traffic_light_node_dict.keys(): traffic_light_node_dict[i][ 'inter_id_to_index'] = inter_id_to_index traffic_light_node_dict[i]['neighbor_ENWS'] = [] traffic_light_node_dict[i]['entering_lane_ENWS'] = { "lane_ids": [], "lane_length": [] } for j in range(4): road_id = i.replace("intersection", "road") + "_" + str(j) neighboring_node = edge_id_dict[road_id]['to'] # calculate the neighboring intersections if neighboring_node not in traffic_light_node_dict.keys( ): # virtual node traffic_light_node_dict[i]['neighbor_ENWS'].append( None) else: traffic_light_node_dict[i]['neighbor_ENWS'].append( neighboring_node) # calculate the entering lanes ENWS for key, value in edge_id_dict.items(): if value['from'] == neighboring_node and value[ 'to'] == i: neighboring_road = key neighboring_lanes = [] for k in range(value['num_of_lane']): neighboring_lanes.append(neighboring_road + "_{0}".format(k)) traffic_light_node_dict[i]['entering_lane_ENWS'][ 'lane_ids'].append(neighboring_lanes) 
traffic_light_node_dict[i]['entering_lane_ENWS'][ 'lane_length'].append(value['length']) for i in traffic_light_node_dict.keys(): location_1 = traffic_light_node_dict[i]['location'] # TODO return with Top K results if not self.dic_traffic_env_conf[ 'ADJACENCY_BY_CONNECTION_OR_GEO']: # use geo-distance row = np.array([0] * total_inter_num) # row = np.zeros((self.dic_traffic_env_conf["NUM_ROW"],self.dic_traffic_env_conf["NUM_col"])) for j in traffic_light_node_dict.keys(): location_2 = traffic_light_node_dict[j]['location'] dist = self._cal_distance(location_1, location_2) row[inter_id_to_index[j]] = dist if len(row) == top_k: adjacency_row_unsorted = np.argpartition( row, -1)[:top_k].tolist() elif len(row) > top_k: adjacency_row_unsorted = np.argpartition( row, top_k)[:top_k].tolist() else: adjacency_row_unsorted = [ k for k in range(total_inter_num) ] adjacency_row_unsorted.remove(inter_id_to_index[i]) traffic_light_node_dict[i]['adjacency_row'] = [ inter_id_to_index[i] ] + adjacency_row_unsorted else: # use connection infomation traffic_light_node_dict[i]['adjacency_row'] = [ inter_id_to_index[i] ] for j in traffic_light_node_dict[i][ 'neighbor_ENWS']: ## TODO if j is not None: traffic_light_node_dict[i]['adjacency_row'].append( inter_id_to_index[j]) else: traffic_light_node_dict[i]['adjacency_row'].append( -1) traffic_light_node_dict[i]['total_inter_num'] = total_inter_num path_to_save = os.path.join(self.record_dir, 'traffic_light_node_dict.conf') with open(path_to_save, 'w') as f: f.write(str(traffic_light_node_dict)) print("\033[1;33mSaved traffic_light_node_dict\033[0m") return traffic_light_node_dict def _update_leave_entering_approach_vehicle(self, id_): list_entering_lane_vehicle_left = [] # update vehicles leaving entering lane if not self.dic_lane_vehicle_previous_step[id_]: for lane in self.lane_phase_info[id_]['start_lane']: list_entering_lane_vehicle_left.append([]) else: last_step_vehicle_id_list = [] current_step_vehilce_id_list = [] for lane in self.lane_phase_info[id_]['start_lane']: last_step_vehicle_id_list.extend( self.dic_lane_vehicle_previous_step[id_][lane]) current_step_vehilce_id_list.extend( self.dic_lane_vehicle_current_step[id_][lane]) list_entering_lane_vehicle_left.append( list( set(last_step_vehicle_id_list) - set(current_step_vehilce_id_list))) return list_entering_lane_vehicle_left def _update_arrive_time(self, id_, list_vehicle_arrive): ts = self.get_current_time() # get dic vehicle enter leave time for vehicle in list_vehicle_arrive: if vehicle not in self.dic_vehicle_arrive_leave_time[id_]: self.dic_vehicle_arrive_leave_time[id_][vehicle] = \ {"enter_time": ts, "leave_time": np.nan} else: # print("vehicle: %s already exists in entering lane!"%vehicle) # sys.exit(-1) pass def _update_left_time(self, id_, list_vehicle_left): ts = self.get_current_time() # update the time for vehicle to leave entering lane for vehicle in list_vehicle_left: try: self.dic_vehicle_arrive_leave_time[id_][vehicle][ "leave_time"] = ts except KeyError: print("vehicle not recorded when entering") sys.exit(-1) @staticmethod def _cal_distance(loc_dict1, loc_dict2): a = np.array((loc_dict1['x'], loc_dict1['y'])) b = np.array((loc_dict2['x'], loc_dict2['y'])) return np.sqrt(np.sum((a - b)**2)) def get_current_time(self): return self.eng.get_current_time() def log(self, path_to_log): for id_ in self.intersection_id: # print("log for ", id_) path_to_log_file = os.path.join(path_to_log, "vehicle_{0}.csv".format(id_)) dic_vehicle = self.get_dic_vehicle_arrive_leave_time(id_) df = 
pd.DataFrame.from_dict(dic_vehicle, orient='index') df.to_csv(path_to_log_file, na_rep="nan") # path_to_log_file = os.path.join(self.path_to_log, "inter_{0}.pkl".format(inter_ind)) # f = open(path_to_log_file, "wb") # # # Use pickle to pack data flow into # pickle.dump(self.list_inter_log[inter_ind], f) # f.close() def get_dic_vehicle_arrive_leave_time(self, id_): return self.dic_vehicle_arrive_leave_time[id_]
def learn_test_multi_agent_plus_rollout(algo): for fw in framework_iterator(frameworks=("tf", "torch")): tmp_dir = os.popen("mktemp -d").read()[:-1] if not os.path.exists(tmp_dir): # Last resort: Resolve via underlying tempdir (and cut tmp_. tmp_dir = ray._private.utils.tempfile.gettempdir() + tmp_dir[4:] if not os.path.exists(tmp_dir): sys.exit(1) print("Saving results to {}".format(tmp_dir)) rllib_dir = str(Path(__file__).parent.parent.absolute()) print("RLlib dir = {}\nexists={}".format(rllib_dir, os.path.exists(rllib_dir))) def policy_fn(agent_id, episode, **kwargs): return "pol{}".format(agent_id) observation_space = Box(float("-inf"), float("inf"), (4, )) action_space = Discrete(2) config = { "num_gpus": 0, "num_workers": 1, "evaluation_config": { "explore": False }, "framework": fw, "env": MultiAgentCartPole, "multiagent": { "policies": { "pol0": (None, observation_space, action_space, {}), "pol1": (None, observation_space, action_space, {}), }, "policy_mapping_fn": policy_fn, }, } stop = {"episode_reward_mean": 150.0} tune.run( algo, config=config, stop=stop, checkpoint_freq=1, checkpoint_at_end=True, local_dir=tmp_dir, verbose=1) # Find last checkpoint and use that for the rollout. checkpoint_path = os.popen("ls {}/PPO/*/checkpoint_*/" "checkpoint-*".format(tmp_dir)).read()[:-1] checkpoint_paths = checkpoint_path.split("\n") assert len(checkpoint_paths) > 0 checkpoints = [ cp for cp in checkpoint_paths if re.match(r"^.+checkpoint-\d+$", cp) ] # Sort by number and pick last (which should be the best checkpoint). last_checkpoint = sorted( checkpoints, key=lambda x: int(re.match(r".+checkpoint-(\d+)", x).group(1)))[-1] assert re.match(r"^.+checkpoint_\d+/checkpoint-\d+$", last_checkpoint) if not os.path.exists(last_checkpoint): sys.exit(1) print("Best checkpoint={} (exists)".format(last_checkpoint)) ray.shutdown() # Test rolling out n steps. result = os.popen( "python {}/rollout.py --run={} " "--steps=400 " "--out=\"{}/rollouts_n_steps.pkl\" --no-render \"{}\"".format( rllib_dir, algo, tmp_dir, last_checkpoint)).read()[:-1] if not os.path.exists(tmp_dir + "/rollouts_n_steps.pkl"): sys.exit(1) print("Rollout output exists -> Checking reward ...") episodes = result.split("\n") mean_reward = 0.0 num_episodes = 0 for ep in episodes: mo = re.match(r"Episode .+reward: ([\d\.\-]+)", ep) if mo: mean_reward += float(mo.group(1)) num_episodes += 1 mean_reward /= num_episodes print("Rollout's mean episode reward={}".format(mean_reward)) assert mean_reward >= 150.0 # Cleanup. os.popen("rm -rf \"{}\"".format(tmp_dir)).read()
self.observation_space = observation_space def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): super().reset(seed=seed) self.observation = self.observation_space.sample() return self.observation OBSERVATION_SPACES = ( ( Dict( OrderedDict([ ("key1", Box(shape=(2, 3), low=0, high=0, dtype=np.float32)), ("key2", Box(shape=(), low=1, high=1, dtype=np.float32)), ("key3", Box(shape=(2, ), low=2, high=2, dtype=np.float32)), ])), True, ), ( Dict( OrderedDict([ ("key2", Box(shape=(), low=0, high=0, dtype=np.float32)), ("key3", Box(shape=(2, ), low=1, high=1, dtype=np.float32)), ("key1", Box(shape=(2, 3), low=2, high=2, dtype=np.float32)), ])), True, ), (
def __init__(self, env, width, height):
    assert isinstance(env.observation_space, Box)
    super(ResizeWrapper, self).__init__(env)
    self.width = width
    self.height = height
    self.observation_space = Box(
        0, 255, (height, width) + env.observation_space.low.shape[2:])
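# Hedged sketch of the matching observation hook, assuming a gym.ObservationWrapper
# and OpenCV for the actual resizing (cv2 is an assumption; it is not imported above).
import cv2

def observation(self, frame):
    # cv2.resize expects the target size as (width, height)
    return cv2.resize(frame, (self.width, self.height))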
def __init__( self, obj_low=None, obj_high=None, random_init=False, tasks=[{ 'goal': np.array([0.1, 0.8, 0.2]), 'obj_init_pos': np.array([0, 0.6, 0.02]), 'obj_init_angle': 0.3 }], goal_low=None, goal_high=None, hand_init_pos=(0, 0.6, 0.2), liftThresh=0.04, rewMode='orig', rotMode='rotz', #'fixed', **kwargs): hand_low = (-0.5, 0.40, 0.05) hand_high = (0.5, 1, 0.5) obj_low = (-0.5, 0.40, 0.05) obj_high = (0.5, 1, 0.5) SawyerXYZEnv.__init__(self, frame_skip=5, action_scale=1. / 100, hand_low=hand_low, hand_high=hand_high, model_name=self.model_name, **kwargs) if obj_low is None: obj_low = self.hand_low if goal_low is None: goal_low = self.hand_low if obj_high is None: obj_high = self.hand_high if goal_high is None: goal_high = self.hand_high self.random_init = random_init self.liftThresh = liftThresh self.max_path_length = 200 #150 self.tasks = tasks self.num_tasks = len(tasks) self.rewMode = rewMode self.rotMode = rotMode self.hand_init_pos = np.array(hand_init_pos) if rotMode == 'fixed': self.action_space = Box( np.array([-1, -1, -1, -1]), np.array([1, 1, 1, 1]), ) elif rotMode == 'rotz': self.action_rot_scale = 1. / 50 self.action_space = Box( np.array([-1, -1, -1, -np.pi, -1]), np.array([1, 1, 1, np.pi, 1]), ) elif rotMode == 'quat': self.action_space = Box( np.array([-1, -1, -1, 0, -1, -1, -1, -1]), np.array([1, 1, 1, 2 * np.pi, 1, 1, 1, 1]), ) else: self.action_space = Box( np.array([-1, -1, -1, -np.pi / 2, -np.pi / 2, 0, -1]), np.array([1, 1, 1, np.pi / 2, np.pi / 2, np.pi * 2, 1]), ) self.hand_and_obj_space = Box( np.hstack((self.hand_low, obj_low)), np.hstack((self.hand_high, obj_high)), ) self.goal_space = Box(goal_low, goal_high) self.observation_space = Box( np.hstack((self.hand_low, obj_low, obj_low)), np.hstack((self.hand_high, obj_high, obj_high)), )
import numpy as np
from gym.spaces import Discrete, Box

action_space = Discrete(8)
observation_space = Box(-np.inf, np.inf, shape=(240, 320, 1), dtype=np.float32)
def __init__(self, goal_low=None, goal_high=None, action_reward_scale=0, reward_type='angle_difference', indicator_threshold=(.02, .03), fix_goal=False, fixed_goal=(0, .45, .12, -.25), reset_free=False, fixed_hand_z=0.12, hand_low=(-0.25, 0.3, .12), hand_high=(0.25, 0.6, .12), target_pos_scale=1, target_angle_scale=1, min_angle=-1.5708, max_angle=0, xml_path='sawyer_xyz/sawyer_door_pull.xml', **sawyer_xyz_kwargs): self.quick_init(locals()) self.model_name = get_asset_full_path(xml_path) SawyerXYZEnv.__init__(self, self.model_name, hand_low=hand_low, hand_high=hand_high, **sawyer_xyz_kwargs) MultitaskEnv.__init__(self) self.reward_type = reward_type self.indicator_threshold = indicator_threshold self.fix_goal = fix_goal self.fixed_goal = np.array(fixed_goal) self._state_goal = None self.fixed_hand_z = fixed_hand_z self.action_space = Box(np.array([-1, -1]), np.array([1, 1]), dtype=np.float32) self.state_space = Box( np.concatenate((hand_low, [min_angle])), np.concatenate((hand_high, [max_angle])), dtype=np.float32, ) if goal_low is None: goal_low = self.state_space.low if goal_high is None: goal_high = self.state_space.high self.goal_space = Box( np.array(goal_low), np.array(goal_high), dtype=np.float32, ) self.observation_space = Dict([ ('observation', self.state_space), ('desired_goal', self.goal_space), ('achieved_goal', self.state_space), ('state_observation', self.state_space), ('state_desired_goal', self.goal_space), ('state_achieved_goal', self.state_space), ]) self.action_reward_scale = action_reward_scale self.target_pos_scale = target_pos_scale self.target_angle_scale = target_angle_scale self.reset_free = reset_free self.door_angle_idx = self.model.get_joint_qpos_addr('doorjoint') self.reset()
('/rl/on_pedestrian', 'std_msgs.msg.Bool'), ('/rl/obs_factor', 'std_msgs.msg.Float32'), ], defs_action=[('/autoDrive_KeyboardMode', 'std_msgs.msg.Char')], rate_action=10.0, window_sizes={'obs': 2, 'reward': 3}, buffer_sizes={'obs': 2, 'reward': 3}, func_compile_obs=func_compile_obs, func_compile_reward=func_compile_reward, func_compile_action=func_compile_action, step_delay_target=0.5, is_dummy_action=False) # TODO: define these Gym related params insode DrivingSimulatorEnv env.observation_space = Box(low=0, high=255, shape=(350, 350, 3)) env.reward_range = (-np.inf, np.inf) env.metadata = {} env.action_space = Discrete(len(ACTIONS)) env = FrameStack(env, 3) n_interactive = 0 n_skip = 1 n_additional_learn = 4 n_ep = 0 # last ep in the last run, if restart use 0 n_test = 10 # num of episode per test run (no exploration) state_shape = (350, 350, 9) tf.app.flags.DEFINE_string("logdir", "/home/pirate03/PycharmProjects/hobotrl/playground/initialD/imitaion_learning/fnet_rename_learn_q_no_skip_tmp_debug", """save tmp model""") tf.app.flags.DEFINE_string("savedir",
def get_manager_ac_space(ob_space, relative_goals, env_name, use_fingerprints, fingerprint_dim): """Compute the action space for the Manager. If the fingerprint terms are being appended onto the observations, this should be removed from the action space. Parameters ---------- ob_space : gym.spaces.* the observation space of the environment relative_goals : bool specifies whether the goal issued by the Manager is meant to be a relative or absolute goal, i.e. specific state or change in state env_name : str the name of the environment. Used for special cases to assign the Manager action space to only ego observations in the observation space. use_fingerprints : bool specifies whether to add a time-dependent fingerprint to the observations fingerprint_dim : tuple of int the shape of the fingerprint elements, if they are being used Returns ------- gym.spaces.Box the action space of the Manager policy """ if env_name in [ "AntMaze", "AntPush", "AntFall", "AntGather", "AntFourRooms" ]: manager_ac_space = Box( low=np.array([ -10, -10, -0.5, -1, -1, -1, -1, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3 ]), high=np.array([ 10, 10, 0.5, 1, 1, 1, 1, 0.5, 0.3, 0.5, 0.3, 0.5, 0.3, 0.5, 0.3 ]), dtype=np.float32, ) elif env_name == "UR5": manager_ac_space = Box( low=np.array([-2 * np.pi, -2 * np.pi, -2 * np.pi, -4, -4, -4]), high=np.array([2 * np.pi, 2 * np.pi, 2 * np.pi, 4, 4, 4]), dtype=np.float32, ) elif env_name == "Pendulum": manager_ac_space = Box(low=np.array([-np.pi, -15]), high=np.array([np.pi, 15]), dtype=np.float32) elif env_name in ["ring0", "ring1"]: if relative_goals: manager_ac_space = Box(-.5, .5, shape=(1, ), dtype=np.float32) else: manager_ac_space = Box(0, 1, shape=(1, ), dtype=np.float32) elif env_name == "figureeight0": if relative_goals: manager_ac_space = Box(-.5, .5, shape=(1, ), dtype=np.float32) else: manager_ac_space = Box(0, 1, shape=(1, ), dtype=np.float32) elif env_name == "figureeight1": if relative_goals: manager_ac_space = Box(-.5, .5, shape=(7, ), dtype=np.float32) else: manager_ac_space = Box(0, 1, shape=(7, ), dtype=np.float32) elif env_name == "figureeight2": if relative_goals: manager_ac_space = Box(-.5, .5, shape=(14, ), dtype=np.float32) else: manager_ac_space = Box(0, 1, shape=(14, ), dtype=np.float32) elif env_name == "merge0": if relative_goals: manager_ac_space = Box(-.5, .5, shape=(5, ), dtype=np.float32) else: manager_ac_space = Box(0, 1, shape=(5, ), dtype=np.float32) elif env_name == "merge1": if relative_goals: manager_ac_space = Box(-.5, .5, shape=(13, ), dtype=np.float32) else: manager_ac_space = Box(0, 1, shape=(13, ), dtype=np.float32) elif env_name == "merge2": if relative_goals: manager_ac_space = Box(-.5, .5, shape=(17, ), dtype=np.float32) else: manager_ac_space = Box(0, 1, shape=(17, ), dtype=np.float32) elif env_name == "PD-Biped3D-HLC-Soccer-v1": manager_ac_space = Box( low=np.array( [0, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -1, -2]), high=np.array([1.5, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2]), dtype=np.float32) else: if use_fingerprints: low = np.array(ob_space.low)[:-fingerprint_dim[0]] high = ob_space.high[:-fingerprint_dim[0]] manager_ac_space = Box(low=low, high=high, dtype=np.float32) else: manager_ac_space = ob_space return manager_ac_space
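# Hedged usage sketch: for an environment name not covered by the special cases
# above, the Manager action space falls back to the observation space, optionally
# with the fingerprint dimensions stripped. "MyCustomEnv" is a hypothetical name.
ob_space = Box(low=-1.0, high=1.0, shape=(10, ), dtype=np.float32)
manager_ac_space = get_manager_ac_space(
    ob_space, relative_goals=True, env_name="MyCustomEnv",
    use_fingerprints=True, fingerprint_dim=(2, ))
# manager_ac_space is now a Box over the first 8 observation dimensions.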
def __init__(self, env):
    super().__init__(env)
    self.n = None
    if isinstance(self.env.observation_space, Discrete):
        self.n = self.env.observation_space.n
        self.observation_space = Box(0, 1, (self.n,))
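# Hedged sketch of the matching observation hook, assuming the wrapper one-hot
# encodes Discrete observations into the Box space above and passes anything
# else through unchanged.
def observation(self, obs):
    if self.n is None:
        return obs
    one_hot = np.zeros(self.n, dtype=np.float32)
    one_hot[obs] = 1.0
    return one_hot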
def __init__(self, random_init=False, task_type='pick_place'): liftThresh = 0.04 goal_low = (-0.1, 0.8, 0.05) goal_high = (0.1, 0.9, 0.3) hand_low = (-0.5, 0.40, 0.05) hand_high = (0.5, 1, 0.5) obj_low = (-0.1, 0.6, 0.02) obj_high = (0.1, 0.7, 0.02) self.task_types = ['pick_place', 'reach', 'push'] super().__init__( self.model_name, hand_low=hand_low, hand_high=hand_high, ) self.task_type = task_type self.init_config = { 'obj_init_angle': .3, 'obj_init_pos': np.array([0, 0.6, 0.02]), 'hand_init_pos': np.array([0, .6, .2]), } # we only do one task from [pick_place, reach, push] # per instance of SawyerReachPushPickPlaceEnv. # Please only set task_type from constructor. if self.task_type == 'pick_place': self.goal = np.array([0.1, 0.8, 0.2]) elif self.task_type == 'reach': self.goal = np.array([-0.1, 0.8, 0.2]) elif self.task_type == 'push': self.goal = np.array([0.1, 0.8, 0.02]) else: raise NotImplementedError self.obj_init_angle = self.init_config['obj_init_angle'] self.obj_init_pos = self.init_config['obj_init_pos'] self.hand_init_pos = self.init_config['hand_init_pos'] self.random_init = random_init self.liftThresh = liftThresh self.max_path_length = 150 self.action_space = Box( np.array([-1, -1, -1, -1]), np.array([1, 1, 1, 1]), ) self.obj_and_goal_space = Box( np.hstack((obj_low, goal_low)), np.hstack((obj_high, goal_high)), ) self.goal_space = Box(np.array(goal_low), np.array(goal_high)) self.observation_space = Box( np.hstack(( self.hand_low, obj_low, )), np.hstack(( self.hand_high, obj_high, )), ) self.num_resets = 0 self.reset()
class TestC51Network(TestBaseNetwork): __test__ = True network = C51Network list_work = [[Discrete(3), Discrete(1)], [Discrete(3), Discrete(3)], [Discrete(10), Discrete(50)], [MultiDiscrete([3]), MultiDiscrete([1])], [MultiDiscrete([3, 3]), MultiDiscrete([3, 3])], [MultiDiscrete([4, 4, 4]), MultiDiscrete([50, 4, 4])], [ MultiDiscrete([[100, 3], [3, 5]]), MultiDiscrete([[100, 3], [3, 5]]) ], [ MultiDiscrete([[[100, 3], [3, 5]], [[100, 3], [3, 5]]]), MultiDiscrete([[[100, 3], [3, 5]], [[100, 3], [3, 5]]]) ]] list_fail = [ [None, None], ["dedrfe", "qdzq"], [1215.4154, 157.48], ["zdzd", (Discrete(1))], [Discrete(1), "zdzd"], ["zdzd", (1, 4, 7)], [(1, 4, 7), "zdzd"], [152, 485], [MultiBinary(1), MultiBinary(1)], [MultiBinary(3), MultiBinary(3)], # [MultiBinary([3, 2]), MultiBinary([3, 2])], # Don't work yet because gym don't implemented this [Box(low=0, high=10, shape=[1]), Box(low=0, high=10, shape=[1])], [Box(low=0, high=10, shape=[2, 2]), Box(low=0, high=10, shape=[2, 2])], [ Box(low=0, high=10, shape=[2, 2, 2]), Box(low=0, high=10, shape=[2, 2, 2]) ], [ Tuple([Discrete(1), MultiDiscrete([1, 1])]), Tuple([Discrete(1), MultiDiscrete([1, 1])]) ], [ Dict({ "first": Discrete(1), "second": MultiDiscrete([1, 1]) }), Dict({ "first": Discrete(1), "second": MultiDiscrete([1, 1]) }) ] ] def test_init(self): for ob, ac in self.list_fail: with pytest.raises(TypeError): self.network(observation_space=ob, action_space=ac) for ob, ac in self.list_work: self.network(observation_space=ob, action_space=ac) def test_forward(self): for ob, ac in self.list_work: network = self.network(observation_space=ob, action_space=ac) network.forward(torch.rand((1, flatdim(ob)))) def test_str_(self): for ob, ac in self.list_work: network = self.network(observation_space=ob, action_space=ac) assert 'C51Network-' + str(ob) + "-" + str(ac) == network.__str__()