Example #1
def test_box_dtype_check():
    # Related Issues:
    # https://github.com/openai/gym/issues/2357
    # https://github.com/openai/gym/issues/2298

    space = Box(0, 2, tuple(), dtype=np.float32)

    # a Python float is cast to the space's dtype, so it is contained
    assert space.contains(0.5)

    # float64 is not in float32 space
    assert not space.contains(np.array(0.5))
    assert not space.contains(np.array(1))
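For contrast, a hedged sketch (same gym/numpy imports as the test above; exact behavior may vary across gym versions) showing that a value which already has the space's dtype and shape should be contained:

def test_box_dtype_check_matching_value():
    # assumes gym.spaces.Box and numpy are imported as in the test above
    space = Box(0, 2, tuple(), dtype=np.float32)

    # a float32 scalar array has the right dtype and shape, so it should be contained
    assert space.contains(np.array(0.5, dtype=np.float32))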
Example #2
class Basic:
    """Mean, stddev, min, max, for the last `lookback` bars, deltas for the last `deltas` bars, and most
    recent actual values of `fields`."""
    def __init__(self, lookback, deltas=4, fields=('bid', 'bidsize', 'ask', 'asksize')):
        self.lookback = int(lookback)
        self.fields = [Obs._fields.index(field) for field in fields]        # Can't be tuple for ndarray slicing to work
        self.vals = deque(maxlen=lookback)
        self.deltas = int(deltas)
        assert self.lookback > self.deltas >= 0
        #obs_size = len(fields) * (5 + deltas)       # mean, std, min, max, last, [deltas] for each field
        self.observation_space = Box(low=-np.inf, high=np.inf, shape=(5 + deltas, len(fields)))
        #       bid ask bidsize asksize
        # mean
        # std
        # min
        # max
        # last
        # delta1
        # ...

    def __call__(self, obs):
        obs = np.asarray(obs)
        assert obs.shape == (len(Obs._fields),)
        self.vals.append(obs[self.fields])
        vals = np.array(self.vals)
        xobs = np.vstack([vals.mean(axis=0), vals.std(axis=0), vals.min(axis=0), vals.max(axis=0), vals[-1], np.diff(vals, axis=0)[-self.deltas:]])
        if len(self.vals) > self.deltas and np.all(np.isfinite(xobs)):
            assert self.observation_space.contains(xobs), 'shape {} expected {}\n{}'.format(xobs.shape, self.observation_space.shape, xobs)
        else:
            xobs = None
        LOG.debug('XFORM %s', xobs)
        return xobs
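A minimal arithmetic sketch of the observation layout described in the comment above (values chosen for illustration only):

# Illustrative only: shows why the Box above has shape (5 + deltas, len(fields)).
lookback, deltas, fields = 10, 4, ('bid', 'bidsize', 'ask', 'asksize')
obs_shape = (5 + deltas, len(fields))   # mean, std, min, max, last, plus `deltas` difference rows
assert obs_shape == (9, 4)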
Example #3
class ContinuousCartPoleEnv(CartPoleEnv):

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 50
    }

    def __init__(self):
        super().__init__()
        self.action_space = Box(-1, 1, shape=[1], dtype=np.float32)

    def step(self, action):
        assert self.action_space.contains(
            action), "%r (%s) invalid" % (action, type(action))
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag * action.item()
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot *
                sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.length *
            (4.0 / 3.0 -
             self.masspole * costheta * costheta / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        if self.kinematics_integrator == 'euler':
            x = x + self.tau * x_dot
            x_dot = x_dot + self.tau * xacc
            theta = theta + self.tau * theta_dot
            theta_dot = theta_dot + self.tau * thetaacc
        else:  # semi-implicit euler
            x_dot = x_dot + self.tau * xacc
            x = x + self.tau * x_dot
            theta_dot = theta_dot + self.tau * thetaacc
            theta = theta + self.tau * theta_dot
        self.state = (x, x_dot, theta, theta_dot)
        done = x < -self.x_threshold \
               or x > self.x_threshold \
               or theta < -self.theta_threshold_radians \
               or theta > self.theta_threshold_radians
        done = bool(done)

        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warn(
                    "You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior."
                )
            self.steps_beyond_done += 1
            reward = 0.0

        return np.array(self.state), reward, done, {}
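A hedged usage sketch, assuming `CartPoleEnv` is gym's classic-control cart-pole so that `reset()` and the physics constants are inherited:

env = ContinuousCartPoleEnv()
obs = env.reset()
action = env.action_space.sample()           # float32 array of shape (1,) in [-1, 1]
obs, reward, done, info = env.step(action)   # applied force is force_mag * action.item()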
Example #4
class Instrument(gym.Env):
    def __init__(self, polyphony: float = np.inf, key_range=(0, 127)):
        self.__polyphony = polyphony
        self.__key_start, self.__key_end = key_range
        note_region_length = self.__key_end - self.__key_start + 1
        self.__status = np.zeros((note_region_length,), dtype=np.int8)
        self.__teacher: Optional[Teacher] = Teacher()
        self.observation_space = Box(0, 1, (note_region_length,), dtype=np.int8)
        self.action_space = Box(-1, 1, (note_region_length,), dtype=np.int8)
        self.num_envs = 1
        self.remotes = []

    def reset(self):
        self.__status = np.zeros(self.__key_end - self.__key_start + 1, dtype=np.int8)
        return self.__status

    def step(self, action: np.ndarray):
        if self.action_space.contains(action):
            a = np.abs(action)
            if np.sum(a) > self.__polyphony:
                t = a[a > 0]
                t[self.__polyphony:] = 0
                np.random.shuffle(t)
                a[a > 0.5] = t
            self.__status = a.astype(np.int8)
        else:
            raise ValueError('Action not in action_space')
        return self.__status, self.reward(a), False, dict()

    def render(self, mode='human'):
        print(self.__status)

    def reward(self, action: np.ndarray) -> float:
        if np.sum(action) > self.__polyphony:
            return -1.0
        if self.__teacher is None:
            return 0.0
        else:
            return self.__teacher.evaluate(action)

    @property
    def polyphony(self):
        return self.__polyphony

    @property
    def region_length(self):
        return self.__key_end - self.__key_start + 1

    @property
    def teacher(self):
        return self.__teacher
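A hedged usage sketch, assuming the surrounding module provides the `Teacher` class that `Instrument` instantiates:

inst = Instrument(polyphony=4, key_range=(60, 71))   # twelve keys, at most four sounding at once
action = inst.action_space.sample()                  # int8 array of shape (12,) sampled within [-1, 1]
obs, reward, done, info = inst.step(action)          # raises ValueError if the action leaves the space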
Example #5
def _rescale_from_one_space_to_other(
        input_val: np.ndarray, input_space: spaces.Box, output_space: spaces.Box) -> np.ndarray:
    """
    Transforms a point in an input space into the corresponding point in the
    output space: the output point sits at the same relative position between
    the output bounds as the input point does between the input bounds.
    :param input_space: bounded Box
    :param output_space: bounded Box
    :param input_val: The input point
    :return: The output point in the output space
    """
    assert input_space.shape == output_space.shape
    assert input_space.contains(input_val)
    slope = (output_space.high-output_space.low) / (input_space.high-input_space.low)
    return slope * (input_val - input_space.high) + output_space.high
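A worked sketch of the linear rescaling above, assuming `spaces` is `gym.spaces` and both boxes are bounded:

import numpy as np
from gym import spaces

in_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float64)
out_space = spaces.Box(low=0.0, high=10.0, shape=(2,), dtype=np.float64)

point = np.array([0.0, 1.0])
rescaled = _rescale_from_one_space_to_other(point, in_space, out_space)
# the midpoint maps to the midpoint and the upper bound maps to the upper bound
assert np.allclose(rescaled, [5.0, 10.0])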
Example #6
class Delta:
    """The difference between the last `lookback` bid, ask, last, and volume from `lookback + 1` bars ago."""
    def __init__(self, lookback):
        self.lookback = lookback
        self.vals = deque(maxlen=lookback + 1)
        obs_size = lookback * 4 + 2     # diffs for bid/ask/last/vol + unrealized/pos
        self.observation_space = Box(low=-np.inf, high=np.inf, shape=(obs_size,))

    def __call__(self, obs):
        obs = Obs._make(obs)
        self.vals.append([obs.bid, obs.ask, obs.last, obs.volume / 100])
        xobs = np.hstack([np.diff(np.array(self.vals), axis=0).flatten(), obs.unrealized_gain / 100, obs.position])
        if len(self.vals) == self.lookback + 1 and np.isfinite(xobs).all():
            assert self.observation_space.contains(xobs), 'shape {} expected {}\n{}'.format(xobs.shape, self.observation_space.shape, xobs)
        else:
            xobs = None
        LOG.debug('XFORM %s', xobs)
        return xobs
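A short numpy sketch of the `obs_size` arithmetic above (illustrative values):

import numpy as np

lookback = 5
vals = np.zeros((lookback + 1, 4))          # lookback + 1 bars of [bid, ask, last, volume / 100]
diffs = np.diff(vals, axis=0).flatten()     # lookback * 4 consecutive-bar differences
assert diffs.size + 2 == lookback * 4 + 2   # plus unrealized gain and position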
Example #7
class SawyerReachTorqueEnv(MujocoEnv, Serializable, MultitaskEnv):
    """Implements a torque-controlled Sawyer environment"""
    def __init__(
        self,
        frame_skip=30,
        action_scale=10,
        keep_vel_in_obs=True,
        use_safety_box=False,
        fix_goal=False,
        fixed_goal=(0.05, 0.6, 0.15),
        reward_type='hand_distance',
        indicator_threshold=.05,
        goal_low=None,
        goal_high=None,
    ):
        self.quick_init(locals())
        MultitaskEnv.__init__(self)
        self.action_scale = action_scale
        MujocoEnv.__init__(self, self.model_name, frame_skip=frame_skip)
        bounds = self.model.actuator_ctrlrange.copy()
        low = bounds[:, 0]
        high = bounds[:, 1]
        self.action_space = Box(low=low, high=high)
        if goal_low is None:
            goal_low = np.array([-0.1, 0.5, 0.02])
        else:
            goal_low = np.array(goal_low)
        if goal_high is None:
            goal_high = np.array([0.1, 0.7, 0.2])
        else:
            goal_high = np.array(goal_high)
        self.safety_box = Box(goal_low, goal_high)
        self.keep_vel_in_obs = keep_vel_in_obs
        self.goal_space = Box(goal_low, goal_high)
        obs_size = self._get_env_obs().shape[0]
        high = np.inf * np.ones(obs_size)
        low = -high
        self.obs_space = Box(low, high)
        self.achieved_goal_space = Box(-np.inf * np.ones(3),
                                       np.inf * np.ones(3))
        self.observation_space = Dict([
            ('observation', self.obs_space),
            ('desired_goal', self.goal_space),
            ('achieved_goal', self.achieved_goal_space),
            ('state_observation', self.obs_space),
            ('state_desired_goal', self.goal_space),
            ('state_achieved_goal', self.achieved_goal_space),
        ])
        self.fix_goal = fix_goal
        self.fixed_goal = np.array(fixed_goal)
        self.use_safety_box = use_safety_box
        self.prev_qpos = self.init_angles.copy()
        self.reward_type = reward_type
        self.indicator_threshold = indicator_threshold
        self.reset()

    @property
    def model_name(self):
        return get_asset_full_path('sawyer_xyz/sawyer_reach_torque.xml')

    def reset_to_prev_qpos(self):
        angles = self.data.qpos.copy()
        velocities = self.data.qvel.copy()
        angles[:] = self.prev_qpos.copy()
        velocities[:] = 0
        self.set_state(angles.flatten(), velocities.flatten())

    def is_outside_box(self):
        pos = self.get_endeff_pos()
        return not self.safety_box.contains(pos)

    def set_to_qpos(self, qpos):
        angles = self.data.qpos.copy()
        velocities = self.data.qvel.copy()
        angles[:] = qpos
        velocities[:] = 0
        self.set_state(angles.flatten(), velocities.flatten())

    def viewer_setup(self):
        self.viewer.cam.trackbodyid = 0
        self.viewer.cam.distance = 1.0

        # 3rd person view
        cam_dist = 0.3
        rotation_angle = 270
        cam_pos = np.array([0, 1.0, 0.5, cam_dist, -45, rotation_angle])

        for i in range(3):
            self.viewer.cam.lookat[i] = cam_pos[i]
        self.viewer.cam.distance = cam_pos[3]
        self.viewer.cam.elevation = cam_pos[4]
        self.viewer.cam.azimuth = cam_pos[5]
        self.viewer.cam.trackbodyid = -1

    def step(self, action):
        action = action * self.action_scale
        self.do_simulation(action, self.frame_skip)
        if self.use_safety_box:
            if self.is_outside_box():
                self.reset_to_prev_qpos()
            else:
                self.prev_qpos = self.data.qpos.copy()
        ob = self._get_obs()
        info = self._get_info()
        reward = self.compute_reward(action, ob)
        done = False
        return ob, reward, done, info

    def _get_env_obs(self):
        if self.keep_vel_in_obs:
            return np.concatenate([
                self.sim.data.qpos.flat,
                self.sim.data.qvel.flat,
                self.get_endeff_pos(),
            ])
        else:
            return np.concatenate([
                self.sim.data.qpos.flat,
                self.get_endeff_pos(),
            ])

    def _get_obs(self):
        ee_pos = self.get_endeff_pos()
        state_obs = self._get_env_obs()
        return dict(
            observation=state_obs,
            desired_goal=self._state_goal,
            achieved_goal=ee_pos,
            state_observation=state_obs,
            state_desired_goal=self._state_goal,
            state_achieved_goal=ee_pos,
        )

    def _get_info(self):
        hand_distance = np.linalg.norm(self._state_goal -
                                       self.get_endeff_pos())
        return dict(
            hand_distance=hand_distance,
            hand_success=float(hand_distance < self.indicator_threshold),
        )

    def get_endeff_pos(self):
        return self.data.body_xpos[self.endeff_id].copy()

    def reset_model(self):
        angles = self.data.qpos.copy()
        velocities = self.data.qvel.copy()
        angles[:] = self.init_angles
        velocities[:] = 0
        self.set_state(angles.flatten(), velocities.flatten())
        self.sim.forward()
        self.prev_qpos = self.data.qpos.copy()

    def reset(self):
        self.reset_model()
        self.set_goal(self.sample_goal())
        self.sim.forward()
        self.prev_qpos = self.data.qpos.copy()
        return self._get_obs()

    @property
    def init_angles(self):
        return [
            1.02866769e+00,
            -6.95207647e-01,
            4.22932911e-01,
            1.76670458e+00,
            -5.69637604e-01,
            6.24117280e-01,
            3.53404635e+00,
        ]

    @property
    def endeff_id(self):
        return self.model.body_names.index('leftclaw')

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'hand_distance',
                'hand_success',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    """
    Multitask functions
    """

    @property
    def goal_dim(self) -> int:
        return 3

    def get_goal(self):
        return {
            'desired_goal': self._state_goal,
            'state_desired_goal': self._state_goal,
        }

    def set_goal(self, goal):
        self._state_goal = goal['state_desired_goal']

    def sample_goals(self, batch_size):
        if self.fix_goal:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            goals = np.random.uniform(
                self.goal_space.low,
                self.goal_space.high,
                size=(batch_size, self.goal_space.low.size),
            )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['achieved_goal']
        desired_goals = obs['desired_goal']
        hand_pos = achieved_goals
        goals = desired_goals
        distances = np.linalg.norm(hand_pos - goals, axis=1)
        if self.reward_type == 'hand_distance':
            r = -distances
        elif self.reward_type == 'hand_success':
            r = -(distances > self.indicator_threshold).astype(float)
        else:
            raise NotImplementedError("Invalid/no reward type.")
        return r

    def get_env_state(self):
        joint_state = self.sim.get_state()
        goal = self._state_goal.copy()
        return joint_state, goal

    def set_env_state(self, state):
        state, goal = state
        self.sim.set_state(state)
        self.sim.forward()
        self._state_goal = goal
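A hedged sketch of the batched goal sampling used in `sample_goals` above, reduced to numpy and gym so it runs without MuJoCo (bounds copied from the defaults in `__init__`):

import numpy as np
from gym.spaces import Box

goal_space = Box(np.array([-0.1, 0.5, 0.02]), np.array([0.1, 0.7, 0.2]), dtype=np.float64)
batch_size = 8
goals = np.random.uniform(goal_space.low, goal_space.high,
                          size=(batch_size, goal_space.low.size))
assert goals.shape == (batch_size, 3)
assert all(goal_space.contains(g) for g in goals)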
Example #8
    def update(self, tasks, continuous_competence, all_raw_rewards):
        if not isinstance(continuous_competence, list):
            continuous_competence = [continuous_competence]
        if not isinstance(tasks[0], list):
            tasks = [tasks]

        #print(continuous_competence)
        if len(tasks) > 0:
            new_split = False
            all_order = None
            regions = [None] * len(tasks)
            for i, task in enumerate(tasks):
                for j, rb in enumerate(self.region_bounds):
                    if rb.contains(task):
                        regions[i] = j
                        break

            cps = continuous_competence

            # add new outcomes and tasks to regions
            for reg, cp, task in zip(regions, cps, tasks):
                self.regions[reg][0].append(cp)
                self.regions[reg][1].append(task)
                self.update_nb += 1

            # check if need to split
            ind_split = []
            new_bounds = []
            new_sub_regions = []
            for reg in range(self.nb_regions):
                if len(self.regions[reg][0]) > self.maxlen:
                    # try nb_split_attempts splits
                    best_split_score = 0
                    best_abs_interest_diff = 0
                    best_bounds = None
                    best_sub_regions = None
                    is_split = False
                    for i in range(self.nb_split_attempts):
                        sub_reg1 = [deque(), deque()]
                        sub_reg2 = [deque(), deque()]

                        # repeat until the two sub regions contain at least 1/4 of the mother region
                        while len(sub_reg1[0]) < self.maxlen / 4 or len(sub_reg2[0]) < self.maxlen / 4:
                            # decide on dimension
                            dim = np.random.choice(range(self.nb_dims))
                            threshold = self.region_bounds[reg].sample()[dim]
                            bounds1 = Box(self.region_bounds[reg].low, self.region_bounds[reg].high, dtype=np.float32)
                            bounds1.high[dim] = threshold
                            bounds2 = Box(self.region_bounds[reg].low, self.region_bounds[reg].high, dtype=np.float32)
                            bounds2.low[dim] = threshold
                            bounds = [bounds1, bounds2]
                            valid_bounds = True
                            if np.any(bounds1.high - bounds1.low < self.init_size / 15): # to enforce not too small boxes ADHOC TODO #5
                                valid_bounds = False
                            if np.any(bounds2.high - bounds2.low < self.init_size / 15):
                                valid_bounds = False

                            # perform split in sub regions
                            sub_reg1 = [deque(), deque()]
                            sub_reg2 = [deque(), deque()]
                            for i, task in enumerate(self.regions[reg][1]):
                                if bounds1.contains(task):
                                    sub_reg1[1].append(task)
                                    sub_reg1[0].append(self.regions[reg][0][i])
                                else:
                                    sub_reg2[1].append(task)
                                    sub_reg2[0].append(self.regions[reg][0][i])
                            sub_regions = [sub_reg1, sub_reg2]

                        # compute interest
                        interest, _ = self.compute_interests(sub_regions)

                        # compute score
                        split_score = len(sub_reg1) * len(sub_reg2) * np.abs(interest[0] - interest[1])
                        if split_score >= best_split_score and np.abs(interest[0] - interest[1]) >= self.max_difference / 8 and valid_bounds:
                            best_abs_interest_diff = np.abs(interest[0] - interest[1])
                            #print(interest)

                            best_split_score = split_score
                            best_sub_regions = sub_regions
                            best_bounds = bounds
                            is_split = True
                            if interest[0] >= interest[1]:
                                order = [1, -1]
                            else:
                                order = [-1, 1]
                    if is_split:
                        ind_split.append(reg)
                        if best_abs_interest_diff > self.max_difference:
                            self.max_difference = best_abs_interest_diff
                    else:
                        self.regions[reg][0] = deque(np.array(self.regions[reg][0])[-int(3 * len(self.regions[reg][0]) / 4):], maxlen=self.maxlen + 1)
                        self.regions[reg][1] = deque(np.array(self.regions[reg][1])[-int(3 * len(self.regions[reg][1]) / 4):], maxlen=self.maxlen + 1)
                    new_bounds.append(best_bounds)
                    new_sub_regions.append(best_sub_regions)

            # implement splits
            for i, reg in enumerate(ind_split):
                all_order = [0] * self.nb_regions
                all_order.pop(reg)
                all_order.insert(reg, order[0])
                all_order.insert(reg, order[1])

                new_split = True
                self.region_bounds.pop(reg)
                self.region_bounds.insert(reg, new_bounds[i][0])
                self.region_bounds.insert(reg, new_bounds[i][1])

                self.regions.pop(reg)
                self.regions.insert(reg, new_sub_regions[i][0])
                self.regions.insert(reg, new_sub_regions[i][1])

                self.interest.pop(reg)
                self.interest.insert(reg, 0)
                self.interest.insert(reg, 0)

                self.probas.pop(reg)
                self.probas.insert(reg, 0)
                self.probas.insert(reg, 0)

            # recompute interest
            self.interest, self.probas = self.compute_interests(self.regions)

            assert len(self.probas) == len(self.regions)
            # book-keeping
            if new_split:
                self.all_boxes.append(copy.copy(self.region_bounds))
                self.all_interests.append(copy.copy(self.interest))
                self.split_iterations.append(self.update_nb)
            return new_split, all_order

        else:
            return False, None
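A minimal sketch of the region lookup at the top of `update`: each task is assigned to the first region whose bounds contain it (gym and numpy assumed):

import numpy as np
from gym.spaces import Box

# two 2-D regions produced by a split along the first dimension
region_bounds = [Box(np.array([0.0, 0.0]), np.array([0.5, 1.0]), dtype=np.float32),
                 Box(np.array([0.5, 0.0]), np.array([1.0, 1.0]), dtype=np.float32)]

task = np.array([0.7, 0.3], dtype=np.float32)
region = next(j for j, rb in enumerate(region_bounds) if rb.contains(task))
assert region == 1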
Example #9
class Everglades(gym.Env):
    def __init__(self, env_config):
        """

        :param player_num:
        :param game_config: a dict containing config values
        """
        game_config = env_config['game_config']
        player_num = env_config['player_num']

        self.server_address = None
        self.sub_socket = None
        self.pub_socket = None
        self.unit_configs = None
        self.await_connection_time = None

        self.parse_game_config(game_config)

        # todo: this needs to be built via info over the server
        self.node_defs = {
            1: {
                'defense': 1,
                'is_watchtower': False,
                'is_fortress': False
            },
            2: {
                'defense': 1.25,
                'is_watchtower': True,
                'is_fortress': False
            },
            3: {
                'defense': 1.5,
                'is_watchtower': False,
                'is_fortress': False
            },
            4: {
                'defense': 1.5,
                'is_watchtower': False,
                'is_fortress': True
            },
            5: {
                'defense': 1.5,
                'is_watchtower': False,
                'is_fortress': False
            },
            6: {
                'defense': 1.5,
                'is_watchtower': False,
                'is_fortress': False
            },
            7: {
                'defense': 1.5,
                'is_watchtower': False,
                'is_fortress': False
            },
            8: {
                'defense': 1.5,
                'is_watchtower': False,
                'is_fortress': True
            },
            9: {
                'defense': 1.5,
                'is_watchtower': False,
                'is_fortress': False
            },
            10: {
                'defense': 1.25,
                'is_watchtower': True,
                'is_fortress': False
            },
            11: {
                'defense': 1,
                'is_watchtower': False,
                'is_fortress': False
            }
        }

        self.player_num = player_num
        self.opposing_player_num = 1 if self.player_num == 0 else 0

        self.max_game_time = 300  # todo is this correct?
        self.num_units = 100
        self.num_nodes = 11
        self.unit_classes = ['controller', 'striker', 'tank']

        if self.unit_configs is None:
            self.unit_configs = {
                1: self.num_units // len(self.unit_classes),
                2: self.num_units // len(self.unit_classes),
                3:
                self.num_units - 2 * (self.num_units // len(self.unit_classes))
            }
        assert sum(self.unit_configs.values()) == self.num_units

        # self.group_definitions = {}
        # self.opp_group_definitions = {}

        self.setup_state_trackers()

        self.observation_space = None

        self.message_parsing_funcs = {
            evgtypes.PY_GROUP_MoveUpdate.__name__: self.update_group_locs,
            evgtypes.PY_GROUP_Initialization.__name__: self.initialize_groups,
            evgtypes.PY_GROUP_CombatUpdate.__name__: self.update_group_combat,
            evgtypes.PY_GROUP_CreateNew.__name__: self.create_new_group,
            evgtypes.PY_GROUP_TransferUnits.__name__: self.transfer_units,
            evgtypes.PY_NODE_ControlUpdate.__name__: self.update_control_state,
            evgtypes.PY_GAME_Scores.__name__: self.update_game_score,
            evgtypes.PY_NODE_Knowledge.__name__: self.update_node_knowledge,
            evgtypes.PY_GROUP_Knowledge.__name__:
            self.update_opp_group_knowledge,
            evgtypes.PubError.__name__: self.handle_bad_message,
        }

        self.build_obs_space()

        self.action_space = Box(low=np.ones(self.num_units),
                                high=np.repeat(self.num_nodes + 1,
                                               self.num_units))

        self.pub_addr = f'tcp://*:{self.pub_socket}'
        self.sub_addr = f'tcp://{self.server_address}:{self.sub_socket}'

        self.conn = None

    def parse_game_config(self, config):
        """
        Loads the dict config file
        :param config:
            server_address: IP address of the server
            pub_socket: publish socket number for the server
            sub_socket: subscribe socket number for the server
            unit_config: dict mapping unit class numbers ([1-3] by default) to the number of units
                of that type; the values must sum to the total number of units
        :return:
        """
        self.server_address = config.get('server_address')
        self.pub_socket = config.get('pub_socket', '5555')
        self.sub_socket = config.get('sub_socket', '5563')
        self.unit_configs = config.get('unit_config')
        self.await_connection_time = config.get('await_connection_time', 60)

    def build_obs_space(self):
        # control points state
        # [is fortress, is watchtower, pct controlled by me, pct controlled by opp]
        # n x 100 (num_units)
        # [node loc, health, group?, in transit? in combat?]
        # what should node loc be during transit? fraction of node left or node arriving at?
        #todo class should be a boolean
        unit_portion_low = np.array(
            [1, 0, 0, 0, 0])  # node loc, class, health, in transit, in combat
        unit_portion_high = np.array(
            [self.num_nodes,
             len(self.unit_classes) - 1, 100, 1, 1])
        unit_state_low = np.tile(unit_portion_low, self.num_units)
        unit_state_high = np.tile(unit_portion_high, self.num_units)

        control_point_portion_low = np.array([
            0, 0, -100, -1
        ])  # is fortress, is watchtower, percent controlled, num opp units
        control_point_portion_high = np.array([1, 1, 100, self.num_units])

        control_point_state_low = np.tile(control_point_portion_low,
                                          self.num_nodes)
        control_point_state_high = np.tile(control_point_portion_high,
                                           self.num_nodes)

        self.observation_space = Box(
            low=np.concatenate([[0], control_point_state_low, unit_state_low]),
            high=np.concatenate([[self.max_game_time],
                                 control_point_state_high, unit_state_high]))

    def setup_state_trackers(self):
        self.units = UnitDefs()
        self.opp_units_at_nodes = [0] * self.num_nodes

        self.control_states = [0] * self.num_nodes
        self.global_control_states = [0] * self.num_nodes

        self.game_time = 0
        self.game_scores = np.zeros(2)
        self.game_conclusion = 0
        self.winning_player = -1

        self.turn_num = 1

    def step(self, action):
        logger.debug('Action shape {}'.format(action.shape))
        [self.act(index, node_id) for index, node_id in enumerate(action)]

        self.turn_num += 1

        self.conn.send(evgcommands.EndTurn(self.conn.guid))

        waiting_for_action, is_game_over = False, False
        _counter = 0
        while not waiting_for_action and not is_game_over:
            waiting_for_action, is_game_started, is_game_over = self.update_state(
            )
            _counter += 1
            if _counter > 100:
                raise TimeoutError('Game not prompted for action')

        state = self.build_state()

        info = {
            'friendly_units_rem':
            len(np.where(self.units.get_all_states()[:, 2] > 0)),
            # 'opp_units_rem': len(np.where(self.opp_units.get_all_states()[:, 2] > 0)),
        }

        if is_game_over:
            logger.info('Game ended')
            info['game_ending_condition'] = self.game_conclusion

            reward = self.game_scores[self.player_num]
        else:
            reward = 0

        return state, reward, is_game_over, info

    def act(self, index, node_id):
        units = self.units.get_units(inds=index)
        for unit in units:
            if unit.state.health > 0:
                logger.debug(
                    'Acting for unit (ind={}, group={}) to node {}'.format(
                        index, unit.group_id, node_id))
                self.conn.send(
                    evgcommands.PY_GROUP_MoveToNode(self.conn.guid,
                                                    unit.group_id, node_id))

    def seed(self, seed=None):
        if seed is not None and type(seed) == dict:
            self.unit_configs = seed

    def reset(self):
        if self.conn is None:
            logger.info(
                f'Connecting to server at:\n\tPub Addr {self.pub_addr}\n\tSub Addr {self.sub_addr}'
            )
            self.conn = Connection(
                self.pub_addr,
                self.sub_addr,
                self.player_num,
                await_connection_time=self.await_connection_time)

        self.setup_state_trackers()

        for class_type, count in self.unit_configs.items():
            for i in range(count):
                self.conn.send(
                    evgcommands.PY_GROUP_InitGroup(self.conn.guid,
                                                   [class_type], [1],
                                                   [f'{class_type}-{i}']))

        waiting_for_action = False
        _counter = 0
        while len(self.units) < self.num_units:
            _waiting_for_action, _, _ = self.update_state()
            waiting_for_action = _waiting_for_action if _waiting_for_action else False

            _counter += 1
            if _counter > 50:
                logger.warning(
                    'Still awaiting receipt of group initialization. Attempt {}'
                    .format(_counter))
            sleep(.1)

        logger.info(
            'Received group initialization receipts ({} Units).'.format(
                len(self.units)))
        logger.info('Sending Go')

        self.conn.send(evgcommands.Go(self.conn.guid))

        _counter = 0
        while not waiting_for_action:
            waiting_for_action, _, _ = self.update_state()

            _counter += 1
            if _counter > 50:
                logger.warning(
                    'Still waiting for turn to start. Attempt {}'.format(
                        _counter))
            sleep(.1)

        logger.info('Game initialized')

        state = self.build_state()

        return state

    def render(self, mode='human'):
        if mode == 'string':
            return self.print_game_state()

    def close(self):
        if self.conn is not None:
            self.conn.close()

    def update_state(self):
        msgs = self.conn.receive_and_parse()
        waiting_for_action = False
        is_game_over = False
        is_game_started = False

        self.opp_units_at_nodes = [0] * self.num_nodes

        msgs_by_type = {}
        for msg in msgs:
            if hasattr(msg, 'player') and msg.player != self.player_num:
                continue
            msg_type = msg.__class__.__name__
            if msg_type not in msgs_by_type:
                msgs_by_type[msg_type] = []
            msgs_by_type[msg_type].append(msg)

        logger.debug('Message received types {}'.format(msgs_by_type.keys()))

        for type in msgs_by_type:
            if type in self.message_parsing_funcs:
                [
                    self.message_parsing_funcs[type](msg)
                    for msg in msgs_by_type[type]
                ]
            elif type == str(evgtypes.TurnStart.__name__):
                waiting_for_action = True
            elif type == str(evgtypes.GoodBye.__name__):
                logger.info('Goodbye received. Turn {}'.format(self.turn_num))
                is_game_over = True
            elif type == str(evgtypes.NewClientACK.__name__):
                # is_game_started = True
                pass
            elif type == str(evgtypes.TimeStamp.__name__):
                self.game_time = max([msg.time for msg in msgs_by_type[type]])
                # NOTE: This is to avoid setting a bad time value when the server sends a bad timestamp
                if self.game_time > self.max_game_time:
                    self.game_time = self.max_game_time
            else:
                logger.debug(f'Unrecognized message type {type}')

        return waiting_for_action, is_game_started, is_game_over

    def build_state(self):
        state = np.array([self.max_game_time - self.game_time])
        # control points
        # is fortress, is watchtower, percent controlled
        for i in range(self.num_nodes):
            node_num = i + 1
            node_def = self.node_defs[node_num]
            is_fortress = 1 if node_def['is_fortress'] else 0
            is_watchtower = 1 if node_def['is_watchtower'] else 0
            control_pct = float(self.control_states[i])
            num_opp_units = self.opp_units_at_nodes[i]

            node_state = np.array(
                [is_fortress, is_watchtower, control_pct, num_opp_units])
            state = np.concatenate([state, node_state])

        # unit state
        # node loc, class, health, in transit, in combat
        state = np.concatenate([state, self.units.get_all_states().flatten()])

        self.verify_state(state)

        return state

    def initialize_groups(self, msg):
        logger.debug(f'~~~Initializing groups:\n{json.dumps(msg.__dict__)}')
        if msg.player != self.player_num:
            return

        group_num = msg.group

        for start_id, group_type, count in zip(msg.start, msg.types,
                                               msg.count):
            if count == 0:
                continue

            group_type_num = self.unit_classes.index(group_type.lower())

            state = UnitState(node_loc=int(msg.node),
                              health=100,
                              class_type=group_type_num,
                              in_combat=0,
                              in_transit=0)

            self.units.add_units([start_id + i for i in range(count)],
                                 [group_num] * count, [state] * count)

    def update_group_locs(self, msg):
        logger.debug(
            f'~~~Updating group locations:\n{json.dumps(msg.__dict__)}')
        if msg.player != self.player_num:
            return

        in_transit = msg.status == 'IN_TRANSIT'
        if in_transit:
            node_loc = (msg.start + msg.destination) / 2.
        elif msg.status == 'ARRIVED':
            node_loc = msg.destination
        else:
            node_loc = msg.start

        units = self.units.get_units(group_ids=msg.group)
        for unit in units:
            state = unit.state
            state.node_loc = node_loc
            state.in_transit = in_transit
            unit.add_state(state)

    def update_group_combat(self, msg):
        logger.debug(f'~~~Group combat update:\n{json.dumps(msg.__dict__)}')
        if msg.player != self.player_num:
            return

        units = self.units.get_units(unit_ids=msg.units)

        for unit, health in zip(units, msg.health):
            state = unit.state
            state.health = health
            unit.add_state(state)

    def create_new_group(self, msg):
        logger.warning('Create new group not supported')

    def transfer_units(self, msg):
        logger.warning('Transferring units not supported')

    def update_control_state(self, msg):
        logger.debug(f'~~~Control state update:\n{json.dumps(msg.__dict__)}')
        value = msg.controlvalue if msg.faction == self.player_num else -msg.controlvalue
        value = int(value * 100)
        if msg.player == self.player_num:
            self.control_states[msg.node - 1] = value
        self.global_control_states[msg.node - 1] = value

    def update_game_score(self, msg):
        logger.info(f'~~~Game score update:\n{json.dumps(msg.__dict__)}')
        self.game_scores = np.array([msg.player1, msg.player2])
        # 1: time expired, 2: base captured, 3: units eliminated
        self.game_conclusion = msg.status
        self.winning_player = -1 if (self.game_scores[0]
                                     == self.game_scores).all() else np.argmax(
                                         self.game_scores)

    def update_node_knowledge(self, msg):
        logger.debug(f'~~~Node knowledge update:\n{json.dumps(msg.__dict__)}')
        is_players = msg.player == self.player_num
        for node, knowledge, controller, percent in zip(
                msg.nodes, msg.knowledge, msg.controller, msg.percent):
            if controller == self.opposing_player_num:
                percent = -percent

            self.global_control_states[node - 1] = percent
            if is_players:
                self.control_states[node - 1] = percent

    def update_opp_group_knowledge(self, msg):
        if msg.player != self.player_num:
            return
        logger.debug(
            f'~~~Opponent knowledge update:\n{json.dumps(msg.__dict__)}')

        num_units = sum(msg.ucount)
        node_num = msg.node1 - 1
        tar_node_num = msg.node2 - 1

        if tar_node_num < 0:
            self.opp_units_at_nodes[node_num] += num_units
        else:
            self.opp_units_at_nodes[node_num] += num_units / 2.
            self.opp_units_at_nodes[tar_node_num] += num_units / 2.

    def handle_bad_message(self, msg):
        if 'movetonode' not in msg.message.lower():
            logger.info('Bad message {}'.format(json.dumps(msg.__dict__)))

    def verify_state(self, state):
        if not self.observation_space.contains(state):
            logger.info('Observation not within defined bounds')
            logger.info(self.print_game_state())
            logger.info(state)

        assert self.observation_space.shape == state.shape

    def print_game_state(self):
        units_at_node = {}
        for i in range(1, self.num_nodes + 1):
            units_at_node[i] = 0

        for i, unit in enumerate(self.units.get_all_units()):
            node_loc = unit.state.node_loc
            if node_loc not in units_at_node:
                units_at_node[node_loc] = 0
            units_at_node[node_loc] += 1

        string = 'Game state at turn {}'.format(self.turn_num)
        for node_num, num_units in sorted(units_at_node.items()):
            node_num = float(node_num)
            string += '\n{} ({:.1f} %): {} units, {} opp units'.format(
                int(node_num) if node_num.is_integer() else node_num,
                self.control_states[int(node_num) -
                                    1] if node_num.is_integer() else 0,
                num_units,
                self.opp_units_at_nodes[int(node_num) -
                                        1] if node_num.is_integer() else 0)

        return string
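An illustrative `env_config` for the `Everglades` constructor above; key names follow `parse_game_config`, and the address and socket values are placeholders rather than real server settings:

env_config = {
    'player_num': 0,
    'game_config': {
        'server_address': 'localhost',            # placeholder address
        'pub_socket': '5555',
        'sub_socket': '5563',
        'await_connection_time': 60,
        # 'unit_config': {1: 33, 2: 33, 3: 34},   # optional; values must sum to num_units (100)
    },
}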
Example #10
class SawyerPushAndReachXYZDoublePuckEnv(MultitaskEnv, SawyerXYZEnv):
    def __init__(self,
                 puck_low=(-.4, .2),
                 puck_high=(.4, 1),
                 reward_type='state_distance',
                 norm_order=1,
                 indicator_threshold=0.06,
                 hand_low=(-0.28, 0.3, 0.05),
                 hand_high=(0.28, 0.9, 0.3),
                 fix_goal=False,
                 fixed_goal=(0.15, 0.6, 0.02, -0.15, 0.6),
                 goal_low=(-0.25, 0.3, 0.02, -.2, .4, -.2, .4),
                 goal_high=(0.25, 0.875, 0.02, .2, .8, .2, .8),
                 hide_goal_markers=False,
                 init_puck_z=0.02,
                 num_resets_before_puck_reset=1,
                 num_resets_before_hand_reset=1,
                 always_start_on_same_side=True,
                 goal_always_on_same_side=True,
                 **kwargs):
        self.quick_init(locals())
        MultitaskEnv.__init__(self)
        SawyerXYZEnv.__init__(self,
                              hand_low=hand_low,
                              hand_high=hand_high,
                              model_name=self.model_name,
                              **kwargs)
        if puck_low is None:
            puck_low = self.hand_low[:2]
        if puck_high is None:
            puck_high = self.hand_high[:2]
        puck_low = np.array(puck_low)
        puck_high = np.array(puck_high)

        self.puck_low = puck_low
        self.puck_high = puck_high

        if goal_low is None:
            goal_low = np.hstack((self.hand_low, puck_low, puck_low))
        if goal_high is None:
            goal_high = np.hstack((self.hand_high, puck_high, puck_high))
        self.goal_low = np.array(goal_low)
        self.goal_high = np.array(goal_high)

        self.reward_type = reward_type
        self.norm_order = norm_order
        self.indicator_threshold = indicator_threshold

        self.fix_goal = fix_goal
        self.fixed_goal = np.array(fixed_goal)
        self._state_goal = None

        self.hide_goal_markers = hide_goal_markers

        self.action_space = Box(np.array([-1, -1, -1]),
                                np.array([1, 1, 1]),
                                dtype=np.float32)
        self.hand_and_two_puck_space = Box(np.hstack(
            (self.hand_low, puck_low, puck_low)),
                                           np.hstack((self.hand_high,
                                                      puck_high, puck_high)),
                                           dtype=np.float32)
        self.hand_space = Box(self.hand_low, self.hand_high, dtype=np.float32)
        self.puck_space = Box(self.puck_low, self.puck_high, dtype=np.float32)
        self.observation_space = Dict([
            ('observation', self.hand_and_two_puck_space),
            ('desired_goal', self.hand_and_two_puck_space),
            ('achieved_goal', self.hand_and_two_puck_space),
            ('state_observation', self.hand_and_two_puck_space),
            ('state_desired_goal', self.hand_and_two_puck_space),
            ('state_achieved_goal', self.hand_and_two_puck_space),
            ('proprio_observation', self.hand_space),
            ('proprio_desired_goal', self.hand_space),
            ('proprio_achieved_goal', self.hand_space),
        ])
        self.init_puck_z = init_puck_z
        self.reset_counter = 0
        self.num_resets_before_puck_reset = num_resets_before_puck_reset
        self.num_resets_before_hand_reset = num_resets_before_hand_reset
        self._always_start_on_same_side = always_start_on_same_side
        self._goal_always_on_same_side = goal_always_on_same_side

        self._set_puck_xys(self._sample_puck_xys())

    @property
    def model_name(self):
        return get_asset_full_path('sawyer_xyz/sawyer_push_two_puck.xml')

    def viewer_setup(self):
        self.viewer.cam.trackbodyid = 0
        self.viewer.cam.lookat[0] = 0
        self.viewer.cam.lookat[1] = 1.0
        self.viewer.cam.lookat[2] = 0.5
        self.viewer.cam.distance = 0.3
        self.viewer.cam.elevation = -45
        self.viewer.cam.azimuth = 270
        self.viewer.cam.trackbodyid = -1

    def step(self, action):
        self.set_xyz_action(action)
        u = np.zeros(7)
        self.do_simulation(u)
        self._set_goal_marker(self._state_goal)
        ob = self._get_obs()
        reward = self.compute_reward(action, ob)
        info = self._get_info()
        done = False
        return ob, reward, done, info

    def _get_obs(self):
        e = self.get_endeff_pos()
        b = self.get_puck1_pos()[:2]
        c = self.get_puck2_pos()[:2]
        flat_obs = np.concatenate((e, b, c))

        return dict(
            observation=flat_obs,
            desired_goal=self._state_goal,
            achieved_goal=flat_obs,
            state_observation=flat_obs,
            state_desired_goal=self._state_goal,
            state_achieved_goal=flat_obs,
            proprio_observation=flat_obs[:3],
            proprio_desired_goal=self._state_goal[:3],
            proprio_achieved_goal=flat_obs[:3],
        )

    def _get_info(self):
        hand_goal = self._state_goal[:3]
        puck1_goal = self._state_goal[3:5]
        puck2_goal = self._state_goal[5:7]

        # hand distance
        hand_diff = hand_goal - self.get_endeff_pos()
        hand_distance = np.linalg.norm(hand_diff, ord=self.norm_order)

        # puck1 distance
        puck1_diff = puck1_goal - self.get_puck1_pos()[:2]
        puck1_distance = np.linalg.norm(puck1_diff, ord=self.norm_order)

        # puck2 distance
        puck2_diff = puck2_goal - self.get_puck2_pos()[:2]
        puck2_distance = np.linalg.norm(puck2_diff, ord=self.norm_order)

        # state distance
        state_diff = np.hstack(
            (self.get_endeff_pos(), self.get_puck1_pos()[:2],
             self.get_puck2_pos()[:2])) - self._state_goal
        state_distance = np.linalg.norm(state_diff, ord=self.norm_order)

        return dict(
            hand_distance=hand_distance,
            puck1_distance=puck1_distance,
            puck2_distance=puck2_distance,
            puck_distance_sum=puck1_distance + puck2_distance,
            hand_and_puck_distance=hand_distance + puck1_distance +
            puck2_distance,
            state_distance=state_distance,
            hand_success=float(hand_distance < self.indicator_threshold),
            puck1_success=float(puck1_distance < self.indicator_threshold),
            puck2_success=float(puck2_distance < self.indicator_threshold),
            hand_and_puck_success=float(
                hand_distance + puck1_distance +
                puck2_distance < self.indicator_threshold),
            state_success=float(state_distance < self.indicator_threshold),
        )

    def get_puck1_pos(self):
        return self.data.get_body_xpos('puck1').copy()

    def get_puck2_pos(self):
        return self.data.get_body_xpos('puck2').copy()

    def _sample_puck_xys(self):
        if self._always_start_on_same_side:
            return np.array([-.1, 0.6]), np.array([0.1, 0.6])
        else:
            if np.random.randint(0, 2) == 0:
                return np.array([0.1, 0.6]), np.array([-.1, 0.6])
            else:
                return np.array([-.1, 0.6]), np.array([0.1, 0.6])

    def _set_goal_marker(self, goal):
        """
        This should be used ONLY for visualization. Use self._state_goal for
        logging, learning, etc.
        """
        self.data.site_xpos[self.model.site_name2id('hand-goal-site')] = (
            goal[:3])
        self.data.site_xpos[self.model.site_name2id('puck1-goal-site')][:2] = (
            goal[3:5])
        self.data.site_xpos[self.model.site_name2id('puck2-goal-site')][:2] = (
            goal[5:7])
        if self.hide_goal_markers:
            self.data.site_xpos[self.model.site_name2id('hand-goal-site'),
                                2] = (-1000)
            self.data.site_xpos[self.model.site_name2id('puck1-goal-site'),
                                2] = (-1000)
            self.data.site_xpos[self.model.site_name2id('puck2-goal-site'),
                                2] = (-1000)

    def _set_puck_xys(self, puck_xys):
        pos1, pos2 = puck_xys
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[7:10] = np.hstack((pos1.copy(), np.array([self.init_puck_z])))
        qpos[10:14] = np.array([1, 0, 0, 0])

        qpos[14:17] = np.hstack((pos2.copy(), np.array([self.init_puck_z])))
        qpos[17:21] = np.array([1, 0, 0, 0])
        qvel[14:21] = 0
        self.set_state(qpos, qvel)

    def reset_model(self):
        if self.reset_counter % self.num_resets_before_hand_reset == 0:
            self._reset_hand()
        if self.reset_counter % self.num_resets_before_puck_reset == 0:
            self._set_puck_xys(self._sample_puck_xys())

        if not (self.puck_space.contains(self.get_puck1_pos()[:2])
                and self.puck_space.contains(self.get_puck2_pos()[:2])):
            self._set_puck_xys(self._sample_puck_xys())
        goal = self.sample_goal()
        self.set_goal(goal)
        self.reset_counter += 1
        self.reset_mocap_welds()
        return self._get_obs()

    def _reset_hand(self):
        velocities = self.data.qvel.copy()
        angles = self.data.qpos.copy()
        angles[:7] = self.init_angles[:7]
        self.set_state(angles.flatten(), velocities.flatten())
        for _ in range(10):
            self.data.set_mocap_pos('mocap', np.array([0, 0.4, 0.02]))
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))
        sim = self.sim
        if sim.model.nmocap > 0 and sim.model.eq_data is not None:
            for i in range(sim.model.eq_data.shape[0]):
                if sim.model.eq_type[i] == mujoco_py.const.EQ_WELD:
                    # Define the xyz + quat of the mocap relative to the hand
                    sim.model.eq_data[i, :] = np.array(
                        [0., 0., 0., 1., 0., 0., 0.])

    def reset(self):
        ob = self.reset_model()
        if self.viewer is not None:
            self.viewer_setup()
        return ob

    @property
    def init_angles(self):
        return [
            1.78026069e+00, -6.84415781e-01, -1.54549231e-01, 2.30672090e+00,
            1.93111471e+00, 1.27854012e-01, 1.49353907e+00, 1.80196716e-03,
            7.40415706e-01, 2.09895360e-02, 1, 0, 0, 0, 1.80196716e-03 + .3,
            7.40415706e-01, 2.09895360e-02, 1, 0, 0, 0
        ]

    """
    Multitask functions
    """

    def get_goal(self):
        return {
            'desired_goal': self._state_goal,
            'state_desired_goal': self._state_goal,
        }

    def set_goal(self, goal):
        self._state_goal = goal['state_desired_goal']
        self._set_goal_marker(self._state_goal)

    def set_to_goal(self, goal):
        hand_goal = goal['state_desired_goal'][:3]
        for _ in range(10):
            self.data.set_mocap_pos('mocap', hand_goal)
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))
            self.do_simulation(None, self.frame_skip)
        puck_goal = goal['state_desired_goal'][3:]
        self._set_puck_xys((puck_goal[:2], puck_goal[2:]))
        self.sim.forward()

    def sample_goals(self, batch_size):
        if self.fix_goal:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            if self._goal_always_on_same_side:
                goal_low = self.goal_low.copy()
                goal_high = self.goal_high.copy()
                # first puck
                goal_high[3] = 0
                # second puck
                goal_low[5] = 0
                goals = np.random.uniform(
                    goal_low,
                    goal_high,
                    size=(batch_size, self.goal_low.size),
                )
            else:
                goals = np.random.uniform(
                    self.goal_low,
                    self.goal_high,
                    size=(batch_size, self.goal_low.size),
                )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        hand_pos = achieved_goals[:, :3]
        puck1_pos = achieved_goals[:, 3:5]
        puck2_pos = achieved_goals[:, 5:7]
        hand_goals = desired_goals[:, :3]
        puck1_goals = desired_goals[:, 3:5]
        puck2_goals = desired_goals[:, 5:7]

        hand_distances = np.linalg.norm(hand_goals - hand_pos,
                                        ord=self.norm_order,
                                        axis=1)
        puck1_distances = np.linalg.norm(puck1_goals - puck1_pos,
                                         ord=self.norm_order,
                                         axis=1)
        puck2_distances = np.linalg.norm(puck2_goals - puck2_pos,
                                         ord=self.norm_order,
                                         axis=1)

        if self.reward_type == 'hand_distance':
            r = -hand_distances
        elif self.reward_type == 'hand_success':
            r = -(hand_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'puck1_distance':
            r = -puck1_distances
        elif self.reward_type == 'puck1_success':
            r = -(puck1_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'puck2_distance':
            r = -puck2_distances
        elif self.reward_type == 'puck2_success':
            r = -(puck2_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'state_distance':
            r = -np.linalg.norm(
                achieved_goals - desired_goals, ord=self.norm_order, axis=1)
        elif self.reward_type == 'vectorized_state_distance':
            r = -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError("Invalid/no reward type.")
        return r

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'hand_distance',
                'puck1_distance',
                'puck2_distance',
                'puck_distance_sum',
                'hand_and_puck_distance',
                'state_distance',
                'hand_success',
                'puck1_success',
                'puck2_success',
                'hand_and_puck_success',
                'state_success',
        ]:
            stat_name = stat_name
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_env_state(self):
        base_state = super().get_env_state()
        goal = self._state_goal.copy()
        return base_state, goal

    def set_env_state(self, state):
        base_state, goal = state
        super().set_env_state(base_state)
        self._state_goal = goal
        self._set_goal_marker(goal)
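
For reference, the reward branches above operate row-wise on batched achieved and desired goals. A minimal standalone numpy sketch of the 'puck1_distance' and 'puck1_success' cases with norm_order=1 (the arrays and the 0.05 threshold below are toy values, not taken from the environment):

import numpy as np

achieved = np.array([[0.0, 0.0, 0.9, 0.10, 0.00, -0.10, 0.00],
                     [0.1, 0.0, 0.9, 0.00, 0.05,  0.20, 0.10]])
desired = np.array([[0.0, 0.0, 0.9, 0.10, 0.02, -0.10, 0.00],
                    [0.0, 0.0, 0.9, 0.10, 0.00,  0.20, 0.10]])

# per-row L1 distance between puck1 and its goal (columns 3:5)
puck1_distances = np.linalg.norm(desired[:, 3:5] - achieved[:, 3:5], ord=1, axis=1)
reward_distance = -puck1_distances                        # 'puck1_distance'
reward_success = -(puck1_distances > 0.05).astype(float)  # 'puck1_success'
print(reward_distance, reward_success)
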
Example #11
    def split(self, nid):
        # try nb_split_attempts splits
        reg = self.tree.get_node(nid).data
        best_split_score = 0
        best_abs_interest_diff = 0
        best_bounds = None
        best_sub_regions = None
        is_split = False
        for i in range(self.nb_split_attempts):
            sub_reg1 = [deque(maxlen=self.maxlen + 1), deque(maxlen=self.maxlen + 1)]
            sub_reg2 = [deque(maxlen=self.maxlen + 1), deque(maxlen=self.maxlen + 1)]

            # repeat until both sub-regions contain at least minlen points from the mother region (TRICK NB 1)
            while len(sub_reg1[0]) < self.minlen or len(sub_reg2[0]) < self.minlen:
                # decide on dimension
                dim = np.random.choice(range(self.nb_dims))
                threshold = reg.bounds.sample()[dim]
                bounds1 = Box(reg.bounds.low, reg.bounds.high, dtype=np.float32)
                bounds1.high[dim] = threshold
                bounds2 = Box(reg.bounds.low, reg.bounds.high, dtype=np.float32)
                bounds2.low[dim] = threshold
                bounds = [bounds1, bounds2]
                valid_bounds = True
                if np.any(bounds1.high - bounds1.low < self.init_size / 15):  # enforce a minimum box size (TRICK NB 2)
                    valid_bounds = False
                if np.any(bounds2.high - bounds2.low < self.init_size / 15):
                    valid_bounds = False

                # perform split in sub regions
                sub_reg1 = [deque(maxlen=self.maxlen + 1), deque(maxlen=self.maxlen + 1)]
                sub_reg2 = [deque(maxlen=self.maxlen + 1), deque(maxlen=self.maxlen + 1)]
                for k, task in enumerate(reg.cps_gs[1]):
                    if bounds1.contains(task):
                        sub_reg1[1].append(task)
                        sub_reg1[0].append(reg.cps_gs[0][k])
                    else:
                        sub_reg2[1].append(task)
                        sub_reg2[0].append(reg.cps_gs[0][k])
                sub_regions = [sub_reg1, sub_reg2]

            # compute interest
            interest = [self.compute_interest(sub_reg1), self.compute_interest(sub_reg2)]

            # compute score
            split_score = len(sub_reg1) * len(sub_reg2) * np.abs(interest[0] - interest[1])
            if split_score >= best_split_score and valid_bounds: # TRICK NB 3, max diff #and np.abs(interest[0] - interest[1]) >= self.max_difference / 8
                is_split = True
                best_abs_interest_diff = np.abs(interest[0] - interest[1])
                best_split_score = split_score
                best_sub_regions = sub_regions
                best_bounds = bounds

        if is_split:
            if best_abs_interest_diff > self.max_difference:
                self.max_difference = best_abs_interest_diff
            # add new nodes to tree
            for i, (cps_gs, bounds) in enumerate(zip(best_sub_regions, best_bounds)):
                self.tree.create_node(parent=nid, data=Region(self.maxlen, cps_gs=cps_gs, bounds=bounds, interest=interest[i]))
        else:
            # TRICK NB 6: discard the oldest samples when no valid split was found
            assert len(reg.cps_gs[0]) == (self.maxlen + 1)
            reg.cps_gs[0] = deque(islice(reg.cps_gs[0], int(self.maxlen / 4), self.maxlen + 1))
            reg.cps_gs[1] = deque(islice(reg.cps_gs[1], int(self.maxlen / 4), self.maxlen + 1))

        return is_split
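
The inner loop above splits a region's bounding Box along one randomly chosen dimension at a threshold sampled inside the region. A minimal standalone sketch of just that splitting step (illustrative 2-D unit box; the names are not from the class):

import numpy as np
from gym.spaces import Box

region = Box(low=np.zeros(2, dtype=np.float32),
             high=np.ones(2, dtype=np.float32),
             dtype=np.float32)
dim = np.random.choice(region.shape[0])    # random split dimension
threshold = region.sample()[dim]           # split point sampled inside the region
lower_half = Box(region.low.copy(), region.high.copy(), dtype=np.float32)
lower_half.high[dim] = threshold
upper_half = Box(region.low.copy(), region.high.copy(), dtype=np.float32)
upper_half.low[dim] = threshold

# every point of the original region falls into (at least) one of the halves
point = region.sample()
assert lower_half.contains(point) or upper_half.contains(point)
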
Example #12
class FourWayGridWorld(gym.Env):
    def __init__(self, env_config=None):
        self.config = copy.deepcopy(default_config)
        if isinstance(env_config, dict):
            self.config.update(env_config)
        self.N = self.config['N']
        self.observation_space = Box(0, self.N, shape=(2, ))
        self.action_space = Box(-1, 1, shape=(2, ))
        self.map = np.ones((self.N + 1, self.N + 1), dtype=np.float32)
        self._fill_map()
        self.walls = []
        self._fill_walls()
        self.reset()

    def _fill_walls(self):
        """Let suppose you have three walls, two vertical, one horizontal."""
        if self.config['use_walls']:
            print('Building Walls!!!')
            self.walls.append(Wall([4, 6], [12, 6]))
            self.walls.append(Wall([4, 10], [12, 10]))
            self.walls.append(Wall([12, 6], [12, 10]))

    def _fill_map(self):
        self.map.fill(-0.1)
        self.map[int(self.N / 2), 0] = self.config['down']
        self.map[0, int(self.N / 2)] = self.config['left']
        self.map[self.N, int(self.N / 2)] = self.config['right']
        self.map[int(self.N / 2), self.N] = self.config['up']
        self.traj = []

    def is_done(self, reward):
        if self.config['early_done']:
            return (not (0 < self.x < self.N)) or \
                   (not (0 < self.y < self.N)) or \
                   (self.step_num >= 2 * self.N) or \
                   (reward > 0.1)
        return self.step_num >= 2 * self.N

    def step(self, action):
        x = _clip(self.x + action[0], max(0, self.x - 1),
                  min(self.N, self.x + 1))
        y = _clip(self.y + action[1], max(0, self.y - 1),
                  min(self.N, self.y + 1))

        if any(w.intersect((self.x, self.y), (x, y)) for w in self.walls):
            pass
        else:
            self.x = x
            self.y = y

        loc = np.array((self.x, self.y))
        reward = self.map[int(round(self.x)), int(round(self.y))]
        self.step_num += 1
        if self.config['record_trajectory']:
            self.traj.append(loc)
        return loc, reward, self.is_done(reward), {}

    def render(self, mode=None, **plt_kwargs):
        import matplotlib.pyplot as plt
        fig = plt.figure(**plt_kwargs)
        ax = fig.add_subplot()
        img = ax.imshow(np.transpose(self.map)[::-1, :],
                        aspect=1,
                        extent=[-0.5, self.N + 0.5, -0.5, self.N + 0.5],
                        cmap=plt.cm.hot_r)
        fig.colorbar(img)
        ax.set_aspect(1)
        for w in self.walls:
            x = [w.start[0], w.end[0]]
            y = [w.start[1], w.end[1]]
            ax.plot(x, y, c='orange')
        if len(self.traj) > 1:
            traj = np.asarray(self.traj)
            ax.plot(traj[:, 0], traj[:, 1], c='blue', alpha=0.75)
        ax.set_xlabel('X-coordinate')
        ax.set_ylabel('Y-coordinate')
        ax.set_xlim(0, self.N)
        ax.set_ylim(0, self.N)
        if self.config["_show"]:
            plt.show()
        return fig, ax

    def reset(self):
        if self.config['init_loc'] is not None:
            loc = self.set_loc(self.config['init_loc'])
        else:
            if self.config['int_initialize']:
                loc = np.random.randint(0, self.N + 1,
                                        size=(2, )).astype(np.float32)
            else:
                loc = np.random.uniform(0, self.N,
                                        size=(2, )).astype(np.float32)
        self.x, self.y = loc[0], loc[1]
        self.step_num = 0
        self.traj = [loc]
        return loc

    def seed(self, s=None):
        if s is not None:
            np.random.seed(s)

    def set_loc(self, loc):
        loc = np.asarray(loc)
        assert self.observation_space.contains(loc)
        self.x = loc[0]
        self.y = loc[1]
        return loc
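
A minimal rollout sketch for the grid world above, assuming FourWayGridWorld and its module-level helpers (default_config, Wall, _clip) are importable from the surrounding module; the config overrides are illustrative only:

env = FourWayGridWorld({'N': 16, 'use_walls': True})
obs = env.reset()
done, episode_return = False, 0.0
while not done:
    action = env.action_space.sample()        # 2-D move clipped to [-1, 1]^2
    obs, reward, done, _ = env.step(action)
    episode_return += reward
print('episode return:', episode_return)
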
class UR5ePushAndReachXYZEnv(MultitaskEnv, UR5eXYZEnv):
    def __init__(
            self,
            puck_low=(-0.2, -0.2),  # x, y
            puck_high=(0.2, 0.2),
            reward_type='state_distance',
            norm_order=1,
            indicator_threshold=0.05,
            hand_low=(-0.35, -0.35, 0.84),
            hand_high=(0.35, 0.3, 1.2),
            fix_goal=False,
            fixed_goal=(
                0, 0, 0.84, 0,
                0),  # table top surface height .82, puck half height .02
            goal_low=(-.2, -.2, .84, -.2, -.2),
            goal_high=(.2, .2, .84, .2, .2),
            hide_goal_markers=False,
            init_puck_z=0.84,
            init_hand_xyz=(0, 0, 0.9),
            reset_free=False,
            xml_path='ur5e_sim/ur5e_push_puck.xml',
            clamp_puck_on_step=True,
            puck_radius=.04,
            **kwargs):
        self.quick_init(locals())
        self.model_name = get_asset_full_path(xml_path)
        MultitaskEnv.__init__(self)
        UR5eXYZEnv.__init__(self,
                            hand_low=hand_low,
                            hand_high=hand_high,
                            model_name=self.model_name,
                            **kwargs)
        if puck_low is None:
            puck_low = self.hand_low[:2]
        if puck_high is None:
            puck_high = self.hand_high[:2]
        puck_low = np.array(puck_low)
        puck_high = np.array(puck_high)

        self.puck_low = puck_low
        self.puck_high = puck_high

        # The goal range is composed of 5 values:
        # hand_goal_x, hand_goal_y, hand_goal_z, puck_goal_x, puck_goal_y
        # The robot first moves the gripper to the puck goal and then pushes the puck to the hand goal.
        if goal_low is None:
            goal_low = np.hstack((self.hand_low, puck_low))
        if goal_high is None:
            goal_high = np.hstack((self.hand_high, puck_high))
        self.goal_low = np.array(goal_low)
        self.goal_high = np.array(goal_high)

        self.reward_type = reward_type
        self.norm_order = norm_order
        self.indicator_threshold = indicator_threshold

        self.fix_goal = fix_goal
        self.fixed_goal = np.array(fixed_goal)
        self._state_goal = None

        self.hide_goal_markers = hide_goal_markers

        self.action_space = Box(np.array([-1, -1, -1]),
                                np.array([1, 1, 1]),
                                dtype=np.float32)
        self.hand_and_puck_space = Box(np.hstack((self.hand_low, puck_low)),
                                       np.hstack((self.hand_high, puck_high)),
                                       dtype=np.float32)
        self.hand_space = Box(self.hand_low, self.hand_high, dtype=np.float32)
        self.observation_space = Dict([
            ('observation', self.hand_and_puck_space),
            ('desired_goal', self.hand_and_puck_space),
            ('achieved_goal', self.hand_and_puck_space),
            ('state_observation', self.hand_and_puck_space),
            ('state_desired_goal', self.hand_and_puck_space),
            ('state_achieved_goal', self.hand_and_puck_space),
            ('proprio_observation', self.hand_space),
            ('proprio_desired_goal', self.hand_space),
            ('proprio_achieved_goal', self.hand_space),
        ])
        self.init_puck_z = init_puck_z
        self.init_hand_xyz = np.array(init_hand_xyz)

        self._set_puck_xy(self.sample_puck_xy())
        self.reset_free = reset_free
        self.reset_counter = 0
        self.puck_space = Box(self.puck_low, self.puck_high, dtype=np.float32)
        self.clamp_puck_on_step = clamp_puck_on_step
        self.puck_radius = puck_radius
        self.reset()

    def viewer_setup(self):
        # FIXME change the cam according to the env
        self.viewer.cam.trackbodyid = 0
        self.viewer.cam.lookat[0] = 0
        self.viewer.cam.lookat[1] = -1
        self.viewer.cam.lookat[2] = 2
        self.viewer.cam.distance = 0.5
        self.viewer.cam.elevation = -45
        self.viewer.cam.azimuth = 90
        self.viewer.cam.trackbodyid = -1

    def step(self, action):
        self.set_xyz_action(action)
        # FIXME refer DOF
        u = np.zeros(6)
        # Since the robot has no actuators here, u has no effect;
        # this just advances the simulation by frame_skip steps.
        self.do_simulation(u)
        if self.clamp_puck_on_step:
            curr_puck_pos = self.get_puck_pos()[:2]
            curr_puck_pos = np.clip(curr_puck_pos, self.puck_space.low,
                                    self.puck_space.high)
            self._set_puck_xy(curr_puck_pos)
        self._set_goal_marker(self._state_goal)
        ob = self._get_obs()
        reward = self.compute_reward(action, ob)
        info = self._get_info()
        done = False
        return ob, reward, done, info

    def _get_obs(self):
        e = self.get_endeff_pos()
        b = self.get_puck_pos()[:2]
        flat_obs = np.concatenate((e, b))

        return dict(
            observation=flat_obs,
            desired_goal=self._state_goal,
            achieved_goal=flat_obs,
            state_observation=flat_obs,
            state_desired_goal=self._state_goal,
            state_achieved_goal=flat_obs,
            proprio_observation=flat_obs[:3],
            proprio_desired_goal=self._state_goal[:3],
            proprio_achieved_goal=flat_obs[:3],
        )

    def _get_info(self):
        hand_goal = self._state_goal[:3]
        puck_goal = self._state_goal[3:]

        # hand distance
        hand_diff = hand_goal - self.get_endeff_pos()
        hand_distance = np.linalg.norm(hand_diff, ord=self.norm_order)
        hand_distance_l1 = np.linalg.norm(hand_diff, 1)
        hand_distance_l2 = np.linalg.norm(hand_diff, 2)

        # puck distance
        puck_diff = puck_goal - self.get_puck_pos()[:2]
        puck_distance = np.linalg.norm(puck_diff, ord=self.norm_order)
        puck_distance_l1 = np.linalg.norm(puck_diff, 1)
        puck_distance_l2 = np.linalg.norm(puck_diff, 2)

        # touch distance
        touch_diff = self.get_endeff_pos() - self.get_puck_pos()
        touch_distance = np.linalg.norm(touch_diff, ord=self.norm_order)
        touch_distance_l1 = np.linalg.norm(touch_diff, ord=1)
        touch_distance_l2 = np.linalg.norm(touch_diff, ord=2)

        # state distance
        state_diff = np.hstack((self.get_endeff_pos(),
                                self.get_puck_pos()[:2])) - self._state_goal
        state_distance = np.linalg.norm(state_diff, ord=self.norm_order)
        state_distance_l1 = np.linalg.norm(state_diff, ord=1)
        state_distance_l2 = np.linalg.norm(state_diff, ord=2)

        return dict(
            hand_distance=hand_distance,
            hand_distance_l1=hand_distance_l1,
            hand_distance_l2=hand_distance_l2,
            puck_distance=puck_distance,
            puck_distance_l1=puck_distance_l1,
            puck_distance_l2=puck_distance_l2,
            hand_and_puck_distance=hand_distance + puck_distance,
            hand_and_puck_distance_l1=hand_distance_l1 + puck_distance_l1,
            hand_and_puck_distance_l2=hand_distance_l2 + puck_distance_l2,
            touch_distance=touch_distance,
            touch_distance_l1=touch_distance_l1,
            touch_distance_l2=touch_distance_l2,
            state_distance=state_distance,
            state_distance_l1=state_distance_l1,
            state_distance_l2=state_distance_l2,
            hand_success=float(hand_distance < self.indicator_threshold),
            puck_success=float(puck_distance < self.indicator_threshold),
            hand_and_puck_success=float(
                hand_distance + puck_distance < self.indicator_threshold),
            touch_success=float(touch_distance < self.indicator_threshold),
            state_success=float(state_distance < self.indicator_threshold),
        )

    def get_puck_pos(self):
        return self.data.get_body_xpos('puck').copy()

    def sample_puck_xy(self):
        return np.array([0, -0.1])

    def _set_goal_marker(self, goal):
        """This should be use ONLY for visualization. Use self._state_goal for
        logging, learning, etc.
        """
        self.data.site_xpos[self.model.site_name2id('hand-goal-site')] = (
            goal[:3])
        self.data.site_xpos[self.model.site_name2id('puck-goal-site')][:2] = (
            goal[3:])
        if self.hide_goal_markers:
            self.data.site_xpos[self.model.site_name2id('hand-goal-site'),
                                2] = (-1000)
            self.data.site_xpos[self.model.site_name2id('puck-goal-site'),
                                2] = (-1000)

    def _set_puck_xy(self, pos):
        """
        WARNING: this also resets the sites (because set_state does).
        """
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        # FIXME for the UR5e-2F85 model in the push-puck env, qpos[0:6] are arm joints,
        #  [6:12] are gripper joints, [12:15] is the puck position, [15:19] is the puck orientation
        qpos[12:15] = np.hstack((pos.copy(), np.array([self.init_puck_z])))
        qpos[15:19] = np.array([1, 0, 0, 0])
        qvel[12:19] = 0  # zero out all puck velocities
        self.set_state(qpos, qvel)

    def reset_model(self):
        self._reset_hand()
        if not self.reset_free:
            self._set_puck_xy(self.sample_puck_xy())

        if not (self.puck_space.contains(self.get_puck_pos()[:2])):
            self._set_puck_xy(self.sample_puck_xy())

        goal = self.sample_valid_goal()
        self.set_goal(goal)
        self.reset_counter += 1
        self.reset_mocap_welds()
        return self._get_obs()

    def _reset_hand(self):
        velocities = self.data.qvel.copy()
        angles = self.data.qpos.copy()
        # FIXME refer DOF
        angles[:6] = self.init_angles[:6]
        self.set_state(angles.flatten(), velocities.flatten())
        # TODO meaning of the range?
        for _ in range(9):
            self.data.set_mocap_pos('mocap', self.init_hand_xyz.copy())
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))

    def reset(self):
        ob = self.reset_model()
        if self.viewer is not None:
            self.viewer_setup()
        return ob

    @property
    def init_angles(self):
        # FIXME refer DOF
        return [
            0, 0, 0, 0, 0, 0, 5.05442647e-04, 6.00496057e-01, 3.06443862e-02,
            1, 0, 0, 0
        ]

    """
    Multitask functions
    """

    def get_goal(self):
        return {
            'desired_goal': self._state_goal,
            'state_desired_goal': self._state_goal,
        }

    def set_goal(self, goal):
        self._state_goal = goal['state_desired_goal']
        self._set_goal_marker(self._state_goal)

    def set_to_goal(self, goal):
        hand_goal = goal['state_desired_goal'][:3]
        # FIXME meaning of the range?
        for _ in range(9):
            self.data.set_mocap_pos('mocap', hand_goal)
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))
            self.do_simulation(None, self.frame_skip)
        puck_goal = goal['state_desired_goal'][3:]
        self._set_puck_xy(puck_goal)
        self.sim.forward()

    def sample_valid_goal(self):
        goal = self.sample_goal()
        hand_goal_xy = goal['state_desired_goal'][:2]
        puck_goal_xy = goal['state_desired_goal'][3:]
        dist = np.linalg.norm(hand_goal_xy - puck_goal_xy)
        while dist <= self.puck_radius:
            goal = self.sample_goal()
            hand_goal_xy = goal['state_desired_goal'][:2]
            puck_goal_xy = goal['state_desired_goal'][3:]
            dist = np.linalg.norm(hand_goal_xy - puck_goal_xy)
        return goal

    def sample_goals(self, batch_size):
        if self.fix_goal:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            goals = np.random.uniform(
                self.goal_low,
                self.goal_high,
                size=(batch_size, self.goal_low.size),
            )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        hand_pos = achieved_goals[:, :3]
        puck_pos = achieved_goals[:, 3:]
        hand_goals = desired_goals[:, :3]
        puck_goals = desired_goals[:, 3:]

        hand_distances = np.linalg.norm(hand_goals - hand_pos,
                                        ord=self.norm_order,
                                        axis=1)
        puck_distances = np.linalg.norm(puck_goals - puck_pos,
                                        ord=self.norm_order,
                                        axis=1)
        puck_zs = self.init_puck_z * np.ones((desired_goals.shape[0], 1))
        touch_distances = np.linalg.norm(
            hand_pos - np.hstack((puck_pos, puck_zs)),
            ord=self.norm_order,
            axis=1,
        )

        if self.reward_type == 'hand_distance':
            r = -hand_distances
        elif self.reward_type == 'hand_success':
            r = -(hand_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'puck_distance':
            r = -puck_distances
        elif self.reward_type == 'puck_success':
            r = -(puck_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'hand_and_puck_distance':
            r = -(puck_distances + hand_distances)
        elif self.reward_type == 'state_distance':
            r = -np.linalg.norm(
                achieved_goals - desired_goals, ord=self.norm_order, axis=1)
        elif self.reward_type == 'vectorized_state_distance':
            r = -np.abs(achieved_goals - desired_goals)
        elif self.reward_type == 'touch_distance':
            r = -touch_distances
        elif self.reward_type == 'touch_success':
            r = -(touch_distances > self.indicator_threshold).astype(float)
        else:
            raise NotImplementedError("Invalid/no reward type.")
        return r

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'hand_distance',
                'hand_distance_l1',
                'hand_distance_l2',
                'puck_distance',
                'puck_distance_l1',
                'puck_distance_l2',
                'hand_and_puck_distance',
                'hand_and_puck_distance_l1',
                'hand_and_puck_distance_l2',
                'state_distance',
                'state_distance_l1',
                'state_distance_l2',
                'touch_distance',
                'touch_distance_l1',
                'touch_distance_l2',
                'hand_success',
                'puck_success',
                'hand_and_puck_success',
                'state_success',
                'touch_success',
        ]:
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_env_state(self):
        base_state = super().get_env_state()
        goal = self._state_goal.copy()
        return base_state, goal

    def set_env_state(self, state):
        base_state, goal = state
        super().set_env_state(base_state)
        self._state_goal = goal
        self._set_goal_marker(goal)
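
The 'touch_distance' term above pads the 2-D puck positions with the fixed puck height before comparing them with the 3-D hand positions. A standalone numpy sketch with toy values (positions, height and norm_order=1 are illustrative, not taken from the environment):

import numpy as np

hand_pos = np.array([[0.00, 0.00, 0.90],
                     [0.05, -0.10, 0.86]])
puck_pos = np.array([[0.00, -0.10],
                     [0.05, -0.10]])
init_puck_z = 0.84
puck_zs = init_puck_z * np.ones((puck_pos.shape[0], 1))
touch_distances = np.linalg.norm(hand_pos - np.hstack((puck_pos, puck_zs)),
                                 ord=1, axis=1)
print(touch_distances)    # per-row L1 distance between hand and puck centre
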
class SawyerPushAndReachXYZEnv(MultitaskEnv, SawyerXYZEnv):
    def __init__(self,
                 puck_low=(-.4, .2),
                 puck_high=(.4, 1),
                 reward_type='state_distance',
                 norm_order=1,
                 indicator_threshold=0.06,
                 hand_low=(-0.28, 0.3, 0.05),
                 hand_high=(0.28, 0.9, 0.3),
                 fix_goal=False,
                 fixed_goal=(0.15, 0.6, 0.02, -0.15, 0.6),
                 goal_low=(-0.25, 0.3, 0.02, -.2, .4),
                 goal_high=(0.25, 0.875, 0.02, .2, .8),
                 hide_goal_markers=False,
                 init_puck_z=0.02,
                 init_hand_xyz=(0, 0.4, 0.07),
                 reset_free=False,
                 xml_path='sawyer_xyz/sawyer_push_puck.xml',
                 clamp_puck_on_step=False,
                 puck_radius=.07,
                 **kwargs):
        self.quick_init(locals())
        self.model_name = get_asset_full_path(xml_path)
        MultitaskEnv.__init__(self)
        SawyerXYZEnv.__init__(self,
                              hand_low=hand_low,
                              hand_high=hand_high,
                              model_name=self.model_name,
                              **kwargs)
        if puck_low is None:
            puck_low = self.hand_low[:2]
        if puck_high is None:
            puck_high = self.hand_high[:2]
        puck_low = np.array(puck_low)
        puck_high = np.array(puck_high)

        self.puck_low = puck_low
        self.puck_high = puck_high

        if goal_low is None:
            goal_low = np.hstack((self.hand_low, puck_low))
        if goal_high is None:
            goal_high = np.hstack((self.hand_high, puck_high))
        self.goal_low = np.array(goal_low)
        self.goal_high = np.array(goal_high)

        self.reward_type = reward_type
        self.norm_order = norm_order
        self.indicator_threshold = indicator_threshold

        self.fix_goal = fix_goal
        self.fixed_goal = np.array(fixed_goal)
        self._state_goal = None

        self.hide_goal_markers = hide_goal_markers

        self.action_space = Box(np.array([-1, -1, -1]),
                                np.array([1, 1, 1]),
                                dtype=np.float32)
        self.hand_and_puck_space = Box(np.hstack((self.hand_low, puck_low)),
                                       np.hstack((self.hand_high, puck_high)),
                                       dtype=np.float32)
        self.hand_space = Box(self.hand_low, self.hand_high, dtype=np.float32)
        self.observation_space = Dict([
            ('observation', self.hand_and_puck_space),
            ('desired_goal', self.hand_and_puck_space),
            ('achieved_goal', self.hand_and_puck_space),
            ('state_observation', self.hand_and_puck_space),
            ('state_desired_goal', self.hand_and_puck_space),
            ('state_achieved_goal', self.hand_and_puck_space),
            ('proprio_observation', self.hand_space),
            ('proprio_desired_goal', self.hand_space),
            ('proprio_achieved_goal', self.hand_space),
        ])
        self.init_puck_z = init_puck_z
        self.init_hand_xyz = np.array(init_hand_xyz)
        self._set_puck_xy(self.sample_puck_xy())
        self.reset_free = reset_free
        self.reset_counter = 0
        self.puck_space = Box(self.puck_low, self.puck_high, dtype=np.float32)
        self.clamp_puck_on_step = clamp_puck_on_step
        self.puck_radius = puck_radius
        self.reset()

    def viewer_setup(self):
        self.viewer.cam.trackbodyid = 0
        self.viewer.cam.lookat[0] = 0
        self.viewer.cam.lookat[1] = 1.0
        self.viewer.cam.lookat[2] = 0.5
        self.viewer.cam.distance = 0.3
        self.viewer.cam.elevation = -45
        self.viewer.cam.azimuth = 270
        self.viewer.cam.trackbodyid = -1

    def step(self, action):
        self.set_xyz_action(action)
        u = np.zeros(8)
        u[7] = 1
        self.do_simulation(u)
        if self.clamp_puck_on_step:
            curr_puck_pos = self.get_puck_pos()[:2]
            curr_puck_pos = np.clip(curr_puck_pos, self.puck_space.low,
                                    self.puck_space.high)
            self._set_puck_xy(curr_puck_pos)
        self._set_goal_marker(self._state_goal)
        ob = self._get_obs()
        reward = self.compute_reward(action, ob)
        info = self._get_info()
        done = False
        return ob, reward, done, info

    def _get_obs(self):
        e = self.get_endeff_pos()
        b = self.get_puck_pos()[:2]
        flat_obs = np.concatenate((e, b))

        return dict(
            observation=flat_obs,
            desired_goal=self._state_goal,
            achieved_goal=flat_obs,
            state_observation=flat_obs,
            state_desired_goal=self._state_goal,
            state_achieved_goal=flat_obs,
            proprio_observation=flat_obs[:3],
            proprio_desired_goal=self._state_goal[:3],
            proprio_achieved_goal=flat_obs[:3],
        )

    def _get_info(self):
        hand_goal = self._state_goal[:3]
        puck_goal = self._state_goal[3:]

        # hand distance
        hand_diff = hand_goal - self.get_endeff_pos()
        hand_distance = np.linalg.norm(hand_diff, ord=self.norm_order)
        hand_distance_l1 = np.linalg.norm(hand_diff, 1)
        hand_distance_l2 = np.linalg.norm(hand_diff, 2)

        # puck distance
        puck_diff = puck_goal - self.get_puck_pos()[:2]
        puck_distance = np.linalg.norm(puck_diff, ord=self.norm_order)
        puck_distance_l1 = np.linalg.norm(puck_diff, 1)
        puck_distance_l2 = np.linalg.norm(puck_diff, 2)

        # touch distance
        touch_diff = self.get_endeff_pos() - self.get_puck_pos()
        touch_distance = np.linalg.norm(touch_diff, ord=self.norm_order)
        touch_distance_l1 = np.linalg.norm(touch_diff, ord=1)
        touch_distance_l2 = np.linalg.norm(touch_diff, ord=2)

        # state distance
        state_diff = np.hstack((self.get_endeff_pos(),
                                self.get_puck_pos()[:2])) - self._state_goal
        state_distance = np.linalg.norm(state_diff, ord=self.norm_order)
        state_distance_l1 = np.linalg.norm(state_diff, ord=1)
        state_distance_l2 = np.linalg.norm(state_diff, ord=2)

        return dict(
            hand_distance=hand_distance,
            hand_distance_l1=hand_distance_l1,
            hand_distance_l2=hand_distance_l2,
            puck_distance=puck_distance,
            puck_distance_l1=puck_distance_l1,
            puck_distance_l2=puck_distance_l2,
            hand_and_puck_distance=hand_distance + puck_distance,
            hand_and_puck_distance_l1=hand_distance_l1 + puck_distance_l1,
            hand_and_puck_distance_l2=hand_distance_l2 + puck_distance_l2,
            touch_distance=touch_distance,
            touch_distance_l1=touch_distance_l1,
            touch_distance_l2=touch_distance_l2,
            state_distance=state_distance,
            state_distance_l1=state_distance_l1,
            state_distance_l2=state_distance_l2,
            hand_success=float(hand_distance < self.indicator_threshold),
            puck_success=float(puck_distance < self.indicator_threshold),
            hand_and_puck_success=float(
                hand_distance + puck_distance < self.indicator_threshold),
            touch_success=float(touch_distance < self.indicator_threshold),
            state_success=float(state_distance < self.indicator_threshold),
        )

    def get_puck_pos(self):
        return self.data.get_body_xpos('puck').copy()

    def sample_puck_xy(self):
        return np.array([0, 0.6])

    def _set_goal_marker(self, goal):
        """
        This should be used ONLY for visualization. Use self._state_goal for
        logging, learning, etc.
        """
        self.data.site_xpos[self.model.site_name2id('hand-goal-site')] = (
            goal[:3])
        self.data.site_xpos[self.model.site_name2id('puck-goal-site')][:2] = (
            goal[3:])
        if self.hide_goal_markers:
            self.data.site_xpos[self.model.site_name2id('hand-goal-site'),
                                2] = (-1000)
            self.data.site_xpos[self.model.site_name2id('puck-goal-site'),
                                2] = (-1000)

    def _set_puck_xy(self, pos):
        """
        WARNING: this also resets the sites (because set_state does).
        """
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[8:11] = np.hstack((pos.copy(), np.array([self.init_puck_z])))
        qpos[11:15] = np.array([1, 0, 0, 0])
        qvel[8:15] = 0
        self.set_state(qpos, qvel)

    def reset_model(self):
        self._reset_hand()
        if not self.reset_free:
            self._set_puck_xy(self.sample_puck_xy())

        if not (self.puck_space.contains(self.get_puck_pos()[:2])):
            self._set_puck_xy(self.sample_puck_xy())

        goal = self.sample_valid_goal()
        self.set_goal(goal)
        self.reset_counter += 1
        self.reset_mocap_welds()
        return self._get_obs()

    def _reset_hand(self):
        velocities = self.data.qvel.copy()
        angles = self.data.qpos.copy()
        angles[:7] = self.init_angles[:7]
        self.set_state(angles.flatten(), velocities.flatten())
        for _ in range(10):
            self.data.set_mocap_pos('mocap', self.init_hand_xyz.copy())
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))

    def reset(self):
        ob = self.reset_model()
        if self.viewer is not None:
            self.viewer_setup()
        return ob

    @property
    def init_angles(self):
        return [
            1.7244448, -0.92036369, 0.10234232, 2.11178144, 2.97668632,
            -0.38664629, 0.54065733, 5.05442647e-04, 6.00496057e-01,
            3.06443862e-02, 1, 0, 0, 0
        ]

    """
    Multitask functions
    """

    def get_goal(self):
        return {
            'desired_goal': self._state_goal,
            'state_desired_goal': self._state_goal,
        }

    def set_goal(self, goal):
        self._state_goal = goal['state_desired_goal']
        self._set_goal_marker(self._state_goal)

    def set_to_goal(self, goal):
        hand_goal = goal['state_desired_goal'][:3]
        for _ in range(10):
            self.data.set_mocap_pos('mocap', hand_goal)
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))
            self.do_simulation(None, self.frame_skip)
        puck_goal = goal['state_desired_goal'][3:]
        self._set_puck_xy(puck_goal)
        self.sim.forward()

    def sample_valid_goal(self):
        goal = self.sample_goal()
        hand_goal_xy = goal['state_desired_goal'][:2]
        puck_goal_xy = goal['state_desired_goal'][3:]
        dist = np.linalg.norm(hand_goal_xy - puck_goal_xy)
        while (dist <= self.puck_radius and not self.fix_goal):
            goal = self.sample_goal()
            hand_goal_xy = goal['state_desired_goal'][:2]
            puck_goal_xy = goal['state_desired_goal'][3:]
            dist = np.linalg.norm(hand_goal_xy - puck_goal_xy)
        return goal

    def sample_goals(self, batch_size):
        if self.fix_goal:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            goals = np.random.uniform(
                self.goal_low,
                self.goal_high,
                size=(batch_size, self.goal_low.size),
            )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        hand_pos = achieved_goals[:, :3]
        puck_pos = achieved_goals[:, 3:]
        hand_goals = desired_goals[:, :3]
        puck_goals = desired_goals[:, 3:]

        hand_distances = np.linalg.norm(hand_goals - hand_pos,
                                        ord=self.norm_order,
                                        axis=1)
        puck_distances = np.linalg.norm(puck_goals - puck_pos,
                                        ord=self.norm_order,
                                        axis=1)
        puck_zs = self.init_puck_z * np.ones((desired_goals.shape[0], 1))
        touch_distances = np.linalg.norm(
            hand_pos - np.hstack((puck_pos, puck_zs)),
            ord=self.norm_order,
            axis=1,
        )

        if self.reward_type == 'hand_distance':
            r = -hand_distances
        elif self.reward_type == 'hand_success':
            r = -(hand_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'hand_success_positive':
            r = (hand_distances < self.indicator_threshold).astype(float)
        elif self.reward_type == 'puck_distance':
            r = -puck_distances
        elif self.reward_type == 'puck_success':
            r = -(puck_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'puck_success_positive':
            r = (puck_distances < self.indicator_threshold).astype(float)
        elif self.reward_type == 'hand_and_puck_distance':
            r = -(puck_distances + hand_distances)
        elif self.reward_type == 'state_distance':
            r = -np.linalg.norm(
                achieved_goals - desired_goals, ord=self.norm_order, axis=1)
        elif self.reward_type == 'vectorized_state_distance':
            r = -np.abs(achieved_goals - desired_goals)
        elif self.reward_type == 'touch_distance':
            r = -touch_distances
        elif self.reward_type == 'touch_success':
            r = -(touch_distances > self.indicator_threshold).astype(float)
        else:
            raise NotImplementedError("Invalid/no reward type.")
        return r

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'hand_distance',
                'hand_distance_l1',
                'hand_distance_l2',
                'puck_distance',
                'puck_distance_l1',
                'puck_distance_l2',
                'hand_and_puck_distance',
                'hand_and_puck_distance_l1',
                'hand_and_puck_distance_l2',
                'state_distance',
                'state_distance_l1',
                'state_distance_l2',
                'touch_distance',
                'touch_distance_l1',
                'touch_distance_l2',
                'hand_success',
                'puck_success',
                'hand_and_puck_success',
                'state_success',
                'touch_success',
        ]:
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_env_state(self):
        base_state = super().get_env_state()
        goal = self._state_goal.copy()
        return base_state, goal

    def set_env_state(self, state):
        base_state, goal = state
        super().set_env_state(base_state)
        self._state_goal = goal
        self._set_goal_marker(goal)
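
get_env_state and set_env_state above bundle whatever the base class returns with a copy of the 5-D goal. A minimal save-and-restore sketch (assuming env is an already constructed SawyerPushAndReachXYZEnv):

snapshot = env.get_env_state()          # (base simulator state, copy of the goal)
for _ in range(5):                      # perturb the simulation
    env.step(env.action_space.sample())
env.set_env_state(snapshot)             # restores the state and re-places the goal markers
obs = env._get_obs()
assert (obs['state_desired_goal'] == snapshot[1]).all()
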
Example #15
class SawyerPushAndReachXYZEnv(MultitaskEnv, SawyerXYZEnv):
    def __init__(
            self,
            puck_low=(-.35, .25),
            puck_high=(.35, .95),

            norm_order=1,
            indicator_threshold=0.06,
            touch_threshold=0.1,  # I just chose this number after doing a few runs and looking at a histogram

            hand_low=(-0.28, 0.3, 0.05),
            hand_high=(0.28, 0.9, 0.3),

            fix_goal=True,
            fixed_goal=(0.15, 0.6, 0.02, -0.15, 0.6),
            goal_low=(-0.25, 0.3, 0.02, -0.2, .4),
            goal_high=(0.25, 0.875, 0.02, .2, .8),

            init_puck_z=0.035,
            init_hand_xyz=(0, 0.4, 0.07),

            reset_free=False,
            xml_path='sawyer_xyz/sawyer_push_puck.xml',
            clamp_puck_on_step=False,

            puck_radius=.07,
            **kwargs
    ):
        self.quick_init(locals())
        self.model_name = get_asset_full_path(xml_path)

        self.goal_type = kwargs.pop('goal_type', 'puck')
        self.dense_reward = kwargs.pop('dense_reward', False)
        self.indicator_threshold = kwargs.pop('goal_tolerance', indicator_threshold)
        self.fixed_goal = np.array(kwargs.pop('goal', fixed_goal))
        self.task_agnostic = kwargs.pop('task_agnostic', False)
        self.init_puck_xy = np.array(kwargs.pop('init_puck_xy', [0, 0.6]))
        self.init_hand_xyz = np.array(kwargs.pop('init_hand_xyz', init_hand_xyz))

        MultitaskEnv.__init__(self)
        SawyerXYZEnv.__init__(
            self,
            hand_low=hand_low,
            hand_high=hand_high,
            model_name=self.model_name,
            **kwargs
        )
        if puck_low is None:
            puck_low = self.hand_low[:2]
        if puck_high is None:
            puck_high = self.hand_high[:2]

        self.puck_low = np.array(puck_low)
        self.puck_high = np.array(puck_high)

        if goal_low is None:
            goal_low = np.hstack((self.hand_low, puck_low))
        if goal_high is None:
            goal_high = np.hstack((self.hand_high, puck_high))
        self.goal_low = np.array(goal_low)
        self.goal_high = np.array(goal_high)

        self.norm_order = norm_order
        self.touch_threshold = touch_threshold
        self.fix_goal = fix_goal

        self._state_goal = None

        self.hide_goal_markers = self.task_agnostic

        self.action_space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]), dtype=np.float32)
        self.hand_and_puck_space = Box(
            np.hstack((self.hand_low, puck_low)),
            np.hstack((self.hand_high, puck_high)),
            dtype=np.float32
        )
        self.hand_space = Box(self.hand_low, self.hand_high, dtype=np.float32)
        self.observation_space = Dict([
            ('observation', self.hand_and_puck_space),
            ('desired_goal', self.hand_and_puck_space),
            ('achieved_goal', self.hand_and_puck_space),
            ('state_observation', self.hand_and_puck_space),
            ('state_desired_goal', self.hand_and_puck_space),
            ('state_achieved_goal', self.hand_and_puck_space),
            ('proprio_observation', self.hand_space),
            ('proprio_desired_goal', self.hand_space),
            ('proprio_achieved_goal', self.hand_space),
        ])

        self.init_puck_z = init_puck_z
        self._set_puck_xy(self.sample_puck_xy())
        self.reset_free = reset_free
        self.reset_counter = 0
        self.puck_space = Box(self.puck_low, self.puck_high, dtype=np.float32)
        self.clamp_puck_on_step = clamp_puck_on_step
        self.puck_radius = puck_radius
        self.reset()

    def viewer_setup(self):
        self.viewer.cam.trackbodyid = 0
        self.viewer.cam.lookat[0] = 0
        self.viewer.cam.lookat[1] = 1.0
        self.viewer.cam.lookat[2] = 0.5
        self.viewer.cam.distance = 0.3
        self.viewer.cam.elevation = -45
        self.viewer.cam.azimuth = 270
        self.viewer.cam.trackbodyid = -1

    def step(self, action):
        self.set_xyz_action(action)
        u = np.zeros(8)
        u[7] = 1
        self.do_simulation(u)
        if self.clamp_puck_on_step:
            curr_puck_pos = self.get_puck_pos()[:2]
            curr_puck_pos = np.clip(curr_puck_pos, self.puck_space.low, self.puck_space.high)
            self._set_puck_xy(curr_puck_pos)
        self._set_goal_marker(self._state_goal)
        next_state = self._get_obs()['state_observation']
        # reward = self.compute_rewards(action, ob) if not self.task_agnostic else 0.
        # info = self._get_info()
        # done = self.is_goal_state(ob['observation']) if not self.task_agnostic else False
        return next_state

    def _get_obs(self):
        e = self.get_endeff_pos()
        b = self.get_puck_pos()[:2]
        flat_obs = np.concatenate((e, b))

        return dict(
            observation=flat_obs,
            desired_goal=self._state_goal,
            achieved_goal=flat_obs,
            state_observation=flat_obs,
            state_desired_goal=self._state_goal,
            state_achieved_goal=flat_obs,
            proprio_observation=flat_obs[:3],
            proprio_desired_goal=self._state_goal[:3],
            proprio_achieved_goal=flat_obs[:3],
        )

    def _get_info(self, state=None):
        hand_goal = self._state_goal[:3]
        puck_goal = self._state_goal[3:]
        endeff_pos = self.get_endeff_pos()
        puck_pos = self.get_puck_pos()
        if state is not None:
            endeff_pos = state[:3]
            puck_pos[:2] = state[3:]

        # hand distance
        hand_diff = hand_goal - endeff_pos
        hand_distance = np.linalg.norm(hand_diff, ord=self.norm_order)
        hand_distance_l1 = np.linalg.norm(hand_diff, 1)
        hand_distance_l2 = np.linalg.norm(hand_diff, 2)

        # puck distance
        puck_diff = puck_goal - puck_pos[:2]
        puck_distance = np.linalg.norm(puck_diff, ord=self.norm_order)
        puck_distance_l1 = np.linalg.norm(puck_diff, 1)
        puck_distance_l2 = np.linalg.norm(puck_diff, 2)

        # touch distance
        touch_diff = endeff_pos - puck_pos
        touch_distance = np.linalg.norm(touch_diff, ord=self.norm_order)
        touch_distance_l1 = np.linalg.norm(touch_diff, ord=1)
        touch_distance_l2 = np.linalg.norm(touch_diff, ord=2)

        # state distance
        state_diff = np.hstack((endeff_pos, puck_pos[:2])) - self._state_goal
        state_distance = np.linalg.norm(state_diff, ord=self.norm_order)
        state_distance_l1 = np.linalg.norm(state_diff, ord=1)
        state_distance_l2 = np.linalg.norm(state_diff, ord=2)

        return dict(
            hand_distance=hand_distance,
            hand_distance_l1=hand_distance_l1,
            hand_distance_l2=hand_distance_l2,
            puck_distance=puck_distance,
            puck_distance_l1=puck_distance_l1,
            puck_distance_l2=puck_distance_l2,
            hand_and_puck_distance=hand_distance+puck_distance,
            hand_and_puck_distance_l1=hand_distance_l1+puck_distance_l1,
            hand_and_puck_distance_l2=hand_distance_l2+puck_distance_l2,
            touch_distance=touch_distance,
            touch_distance_l1=touch_distance_l1,
            touch_distance_l2=touch_distance_l2,
            state_distance=state_distance,
            state_distance_l1=state_distance_l1,
            state_distance_l2=state_distance_l2,
            hand_success=float(hand_distance < self.indicator_threshold),
            puck_success=float(puck_distance < self.indicator_threshold),
            hand_and_puck_success=float(
                hand_distance+puck_distance < self.indicator_threshold
            ),
            touch_success=float(touch_distance < self.indicator_threshold),
            state_success=float(state_distance < self.indicator_threshold),
        )

    def distance_from_goal(self, state=None, goal=None):
        if state is None:
            endeff_pos, puck_pos = self.get_endeff_pos(), self.get_puck_pos()
        else:
            endeff_pos, puck_pos = state[:3], state[3:]
        if goal is None:
            hand_goal, puck_goal = self._state_goal[:3], self._state_goal[3:]
        else:
            hand_goal, puck_goal = goal[:3], goal[3:]

        distances = self._get_info(state)
        dict_key = self.goal_type + '_distance'
        try:
            return distances[dict_key]
        except KeyError:
            raise NotImplementedError("Invalid/no reward type.")

    def is_goal_state(self, state):
        """
        Only used by deep skill chaining.
        Args:
            state (np.ndarray): state array [endeff_x, endeff_x, endeff_x, puck_x, puck_y]
        Returns:
            True if is goal state, false otherwise
        """
        dist = self.distance_from_goal(state=state)
        tolerance = self.indicator_threshold if self.goal_type != 'touch' else self.touch_threshold
        return dist < tolerance

    def get_puck_pos(self):
        return self.data.get_body_xpos('puck').copy()

    def sample_puck_xy(self):
        return self.init_puck_xy

    def _set_goal_marker(self, goal):
        """
        This should be used ONLY for visualization. Use self._state_goal for
        logging, learning, etc.
        """
        self.data.site_xpos[self.model.site_name2id('hand-goal-site')] = (
            goal[:3]
        )
        self.data.site_xpos[self.model.site_name2id('puck-goal-site')][:2] = (
            goal[3:]
        )
        # if self.hide_goal_markers or self.goal_type == 'touch' or self.goal_type == 'puck':
        self.data.site_xpos[self.model.site_name2id('hand-goal-site'), 2] = (
            -1000
        )
        if self.hide_goal_markers or self.goal_type == 'touch' or self.goal_type == 'hand':
            self.data.site_xpos[self.model.site_name2id('puck-goal-site'), 2] = (
                -1000
            )

    def _set_puck_xy(self, pos):
        """
        WARNING: this also resets the sites (because set_state does).
        """
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[8:11] = np.hstack((pos.copy(), np.array([self.init_puck_z])))
        qpos[11:15] = np.array([1, 0, 0, 0])
        qvel[8:15] = 0
        self.set_state(qpos, qvel)

    def reset_model(self, start_pos=None, goal_pos=None):
        # Sets reset vars but doesn't actually reset hand position
        self._reset_hand(start_pos)

        if goal_pos is None:
            goal_pos = self.sample_valid_goal()['state_desired_goal']
        self.set_goal(goal_pos)

        # Actually moves the hand's position. Could knock the puck out of the way,
        # so needs to be done BEFORE resetting puck position.
        self.step([0, 0])

        if not self.reset_free:
            if start_pos is None:
                puck_xy = self.sample_puck_xy()
            else:
                puck_xy = start_pos[3:]
            self._set_puck_xy(puck_xy)

        if not (self.puck_space.contains(self.get_puck_pos()[:2])):
            self._set_puck_xy(self.sample_puck_xy())
        self.reset_counter += 1
        self.reset_mocap_welds()
        return self._get_obs()

    def _reset_hand(self, start_pos=None):
        velocities = self.data.qvel.copy()
        angles = self.data.qpos.copy()
        angles[:7] = self.init_angles[:7]
        self.set_state(angles.flatten(), velocities.flatten())

        if start_pos is None:
            hand_pos = self.init_hand_xyz.copy()
        else:
            hand_pos = start_pos[:3]
        for _ in range(10):
            self.data.set_mocap_pos('mocap', hand_pos)
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))

    def reset(self, start_pos=None, goal_puck_pos=None):
        ob = self.reset_model(start_pos, goal_puck_pos)
        if self.viewer is not None:
            self.viewer_setup()
        return ob['state_observation']


    @property
    def init_angles(self):
        return [1.7244448, -0.92036369,  0.10234232,  2.11178144,  2.97668632, -0.38664629, 0.54065733,
                5.05442647e-04, 6.00496057e-01, 3.06443862e-02,
                1, 0, 0, 0]

    """
    Multitask functions
    """
    def get_goal(self):
        return {
            'desired_goal': self._state_goal,
            'state_desired_goal': self._state_goal,
        }

    def set_goal(self, goal):
        self._state_goal = goal
        self._set_goal_marker(self._state_goal)

    def set_to_goal(self, goal):
        hand_goal = goal['state_desired_goal'][:3]
        for _ in range(10):
            self.data.set_mocap_pos('mocap', hand_goal)
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))
            self.do_simulation(None, self.frame_skip)
        puck_goal = goal['state_desired_goal'][3:]
        self._set_puck_xy(puck_goal)
        self.sim.forward()

    def sample_valid_goal(self):
        goal = self.sample_goal()
        hand_goal_xy = goal['state_desired_goal'][:2]
        puck_goal_xy = goal['state_desired_goal'][3:]
        dist = np.linalg.norm(hand_goal_xy-puck_goal_xy)
        while dist <= self.puck_radius:
            goal = self.sample_goal()
            hand_goal_xy = goal['state_desired_goal'][:2]
            puck_goal_xy = goal['state_desired_goal'][3:]
            dist = np.linalg.norm(hand_goal_xy - puck_goal_xy)
        return goal

    def sample_goals(self, batch_size):
        if self.fix_goal:
            goals = np.repeat(
                self.fixed_goal.copy()[None],
                batch_size,
                0
            )
        else:
            goals = np.random.uniform(
                self.goal_low,
                self.goal_high,
                size=(batch_size, self.goal_low.size),
            )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def compute_rewards(self, actions, obs):
        state = obs['observation']
        dist = self.distance_from_goal(state)
        tolerance = self.indicator_threshold if self.goal_type != 'touch' else self.touch_threshold

        if self.dense_reward:
            return -dist
        else:
            return -(dist > tolerance).astype(float)

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
            'hand_distance',
            'hand_distance_l1',
            'hand_distance_l2',
            'puck_distance',
            'puck_distance_l1',
            'puck_distance_l2',
            'hand_and_puck_distance',
            'hand_and_puck_distance_l1',
            'hand_and_puck_distance_l2',
            'state_distance',
            'state_distance_l1',
            'state_distance_l2',
            'touch_distance',
            'touch_distance_l1',
            'touch_distance_l2',
            'hand_success',
            'puck_success',
            'hand_and_puck_success',
            'state_success',
            'touch_success',
        ]:
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(create_stats_ordered_dict(
                '%s%s' % (prefix, stat_name),
                stat,
                always_show_all_stats=True,
                ))
            statistics.update(create_stats_ordered_dict(
                'Final %s%s' % (prefix, stat_name),
                [s[-1] for s in stat],
                always_show_all_stats=True,
                ))
        return statistics

    def get_env_state(self):
        base_state = super().get_env_state()
        goal = self._state_goal.copy()
        return base_state, goal

    def set_env_state(self, state):
        base_state, goal = state
        super().set_env_state(base_state)
        self._state_goal = goal
        self._set_goal_marker(goal)
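
compute_rewards in this variant reduces to either a dense negative distance or a sparse 0/-1 indicator, depending on dense_reward. A toy numpy illustration (the distances and tolerance below are made up):

import numpy as np

dist = np.array([0.02, 0.10, 0.30])      # distance_from_goal for three states
tolerance = 0.06                         # indicator_threshold for non-touch goal types
dense_reward = -dist                                 # dense: negative distance
sparse_reward = -(dist > tolerance).astype(float)    # sparse: 0 inside tolerance, -1 outside
print(dense_reward, sparse_reward)
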
class CartPoleContinuousEnv(CartPoleEnv):
    def __init__(self):
        super().__init__()
        self.action_space = Box(low=-1.,
                                high=1.,
                                shape=(1, ),
                                dtype=np.float32)

    def step(self, action):
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg

        x, x_dot, theta, theta_dot = self.state
        # ===
        # This replaces the discrete CartPole behaviour: the continuous
        # action directly scales the force magnitude.
        force = action[0] * self.force_mag
        # ===
        costheta = math.cos(theta)
        sintheta = math.sin(theta)

        # For the interested reader:
        # https://coneural.org/florian/papers/05_cart_pole.pdf
        temp = (force + self.polemass_length * theta_dot**2 *
                sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.length *
            (4.0 / 3.0 - self.masspole * costheta**2 / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass

        if self.kinematics_integrator == 'euler':
            x = x + self.tau * x_dot
            x_dot = x_dot + self.tau * xacc
            theta = theta + self.tau * theta_dot
            theta_dot = theta_dot + self.tau * thetaacc
        else:  # semi-implicit euler
            x_dot = x_dot + self.tau * xacc
            x = x + self.tau * x_dot
            theta_dot = theta_dot + self.tau * thetaacc
            theta = theta + self.tau * theta_dot

        self.state = (x, x_dot, theta, theta_dot)

        done = bool(x < -self.x_threshold or x > self.x_threshold
                    or theta < -self.theta_threshold_radians
                    or theta > self.theta_threshold_radians)

        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warn(
                    "You are calling 'step()' even though this "
                    "environment has already returned done = True. You "
                    "should always call 'reset()' once you receive 'done = "
                    "True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0

        return np.array(self.state), reward, done, {}
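
A minimal rollout sketch for the continuous cart-pole above; it relies only on the gym classic-control base class already used for CartPoleEnv and drives the environment with random continuous actions:

env = CartPoleContinuousEnv()
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()         # a force scale in [-1, 1], shape (1,)
    obs, reward, done, info = env.step(action)
env.close()
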
    def update(self, goals, binary_competence, continuous_competence=None):
        if len(goals) > 0:
            new_split = False
            all_order = None
            regions = [None] * len(goals)
            for i, goal in enumerate(goals):
                for j, rb in enumerate(self.region_bounds):
                    if rb.contains(goal):
                        regions[i] = j
                        break

            if self.continuous_competence:
                c_or_cps = continuous_competence
            else:
                c_or_cps = binary_competence

            # add new outcomes and goals to regions
            for reg, c_or_cp, goal in zip(regions, c_or_cps, goals):
                self.regions[reg][0].append(c_or_cp)
                self.regions[reg][1].append(goal)

            # check if need to split
            ind_split = []
            new_bounds = []
            new_sub_regions = []
            for reg in range(self.nb_regions):
                if len(self.regions[reg][0]) > self.maxlen:
                    # try nb_split_attempts splits
                    best_split_score = 0
                    best_abs_interest_diff = 0
                    best_bounds = None
                    best_sub_regions = None
                    is_split = False
                    for i in range(self.nb_split_attempts):
                        sub_reg1 = [deque(), deque()]
                        sub_reg2 = [deque(), deque()]

                        # repeat until the two sub regions each contain at least a quarter (maxlen / 4) of the mother region
                        while len(sub_reg1[0]) < self.maxlen / 4 or len(
                                sub_reg2[0]) < self.maxlen / 4:
                            # decide on dimension
                            dim = np.random.choice(range(self.nb_dims))
                            threshold = self.region_bounds[reg].sample()[dim]
                            bounds1 = Box(self.region_bounds[reg].low,
                                          self.region_bounds[reg].high,
                                          dtype=np.float32)
                            bounds1.high[dim] = threshold
                            bounds2 = Box(self.region_bounds[reg].low,
                                          self.region_bounds[reg].high,
                                          dtype=np.float32)
                            bounds2.low[dim] = threshold
                            bounds = [bounds1, bounds2]
                            valid_bounds = True
                            if np.any(bounds1.high -
                                      bounds1.low < self.init_size / 5):
                                valid_bounds = False
                            if np.any(bounds2.high -
                                      bounds2.low < self.init_size / 5):
                                valid_bounds = False

                            # perform split in sub regions
                            sub_reg1 = [deque(), deque()]
                            sub_reg2 = [deque(), deque()]
                            for i, goal in enumerate(self.regions[reg][1]):
                                if bounds1.contains(goal):
                                    sub_reg1[1].append(goal)
                                    sub_reg1[0].append(self.regions[reg][0][i])
                                else:
                                    sub_reg2[1].append(goal)
                                    sub_reg2[0].append(self.regions[reg][0][i])
                            sub_regions = [sub_reg1, sub_reg2]

                        # compute interest
                        interest = np.zeros([2])
                        for i in range(2):
                            if self.continuous_competence:
                                cp_window = min(len(sub_regions[i][0]),
                                                self.window_cp)
                                cp = np.abs(
                                    np.array(
                                        sub_regions[i][0])[-cp_window:].mean())
                            else:
                                cp_window = min(
                                    len(sub_regions[i][0]) // 2,
                                    self.window_cp)
                                cp = np.array(
                                    sub_regions[i][0]
                                )[-2 * cp_window:-cp_window].mean() - np.array(
                                    sub_regions[i][0])[-cp_window:].mean()
                            interest[i] = np.abs(cp)

                        # compute score
                        split_score = len(sub_reg1) * len(sub_reg2) * np.abs(
                            interest[0] - interest[1])
                        if split_score >= best_split_score and np.abs(
                                interest[0] - interest[1]
                        ) >= self.max_difference / 2 and valid_bounds:
                            best_abs_interest_diff = np.abs(interest[0] -
                                                            interest[1])
                            print(interest)

                            best_split_score = split_score
                            best_sub_regions = sub_regions
                            best_bounds = bounds
                            is_split = True
                            if interest[0] >= interest[1]:
                                order = [1, -1]
                            else:
                                order = [-1, 1]
                    if is_split:
                        ind_split.append(reg)
                        if best_abs_interest_diff > self.max_difference:
                            self.max_difference = best_abs_interest_diff
                    else:
                        self.regions[reg][0] = deque(
                            np.array(self.regions[reg][0])
                            [-int(3 * len(self.regions[reg][0]) / 4):],
                            maxlen=self.maxlen + 1)
                        self.regions[reg][1] = deque(
                            np.array(self.regions[reg][1])
                            [-int(3 * len(self.regions[reg][1]) / 4):],
                            maxlen=self.maxlen + 1)
                    new_bounds.append(best_bounds)
                    new_sub_regions.append(best_sub_regions)

            # implement splits
            for i, reg in enumerate(ind_split):
                all_order = [0] * self.nb_regions
                all_order.pop(reg)
                all_order.insert(reg, order[0])
                all_order.insert(reg, order[1])

                new_split = True
                self.region_bounds.pop(reg)
                self.region_bounds.insert(reg, new_bounds[i][0])
                self.region_bounds.insert(reg, new_bounds[i][1])

                self.regions.pop(reg)
                self.regions.insert(reg, new_sub_regions[i][0])
                self.regions.insert(reg, new_sub_regions[i][1])

                self.interest.pop(reg)
                self.interest.insert(reg, 0)
                self.interest.insert(reg, 0)

                self.probas.pop(reg)
                self.probas.insert(reg, 0)
                self.probas.insert(reg, 0)

            # recompute interest
            for i in range(self.nb_regions):
                if len(self.regions[i][0]) > 10:
                    if self.continuous_competence:
                        cp_window = min(len(self.regions[i][0]),
                                        self.window_cp)
                        cp = np.abs(
                            np.array(self.regions[i][0])[-cp_window:].mean())
                    else:
                        cp_window = min(
                            len(self.regions[i][0]) // 2, self.window_cp)
                        cp = np.array(
                            self.regions[i]
                            [0])[-2 * cp_window:-cp_window].mean() - np.array(
                                self.regions[i][0])[-cp_window:].mean()
                else:
                    cp = 0
                self.interest[i] = np.abs(cp)

            # recompute sampling probabilities once all interests are updated
            exp_int = np.exp(self.temperature * np.array(self.interest))
            probas = exp_int / exp_int.sum()
            self.probas = probas.tolist()

            assert len(self.probas) == len(self.regions)
            return new_split, all_order

        else:
            return False, None
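
The last block of update() turns the per-region interest values into sampling probabilities with a temperature-scaled softmax; the standalone sketch below isolates just that step (temperature and interest values are made-up illustrative numbers):

import numpy as np

temperature = 20.0
interest = np.array([0.01, 0.05, 0.02])
exp_int = np.exp(temperature * interest)
probas = exp_int / exp_int.sum()   # regions with higher interest are sampled more often
print(probas)                      # roughly [0.22, 0.50, 0.27]
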
class DiscreteQuadEnv(gym.Env):
    """
	Gym environment for our custom system (Crazyflie quad).

	"""
    def __init__(self):
        gym.Env.__init__(self)
        self.dyn_nn = torch.load(
            "_models/temp/2018-12-05--15-51-52.2_train-acc-old-50-ensemble_stack3_.pth"
        )

        self.dyn_nn.eval()

        data_file = open(
            "_models/temp/2018-12-05--15-57-55.6_train-acc-old-50-ensemble_stack3_--data.pkl",
            'rb')

        self.n = 3
        self.state = None

        df = pickle.load(data_file)
        self.equilibrium = np.array([31687.1, 37954.7, 33384.8, 36220.11])

        self.action_dict = self.init_action_dict()

        print(df.columns.values)

        r = 500
        print("State Before")
        print(df.iloc[r, 12:12 + 9].values)
        print("State Change")
        print(df.iloc[r, :9].values)
        print("State After")
        print(df.iloc[r + 1, 12:12 + 9].values)
        print("Is this the same")
        print(df.iloc[r, :9].values + df.iloc[r, 12:12 + 9].values)
        print("Compare to State Before")
        print(df.iloc[r + 1, 12 + 9:12 + 9 + 9].values)

        s_stacked = df.iloc[2, 12:12 + 9 * self.n].values
        a_stacked = df.iloc[2, 12 + 9 * self.n:12 + 9 * self.n +
                            4 * self.n].values
        print(a_stacked)
        print("\n Prediction")
        print(self.dyn_nn.predict(s_stacked, a_stacked))

        v_bats = df.iloc[:, -1]
        print(min(v_bats), max(v_bats))

        self.dyn_data = df

        all_states = df.iloc[:, 12:12 + 9].values
        all_actions = df.iloc[:, 12 + 9 * self.n:12 + 9 * self.n + 4].values

        min_state_bounds = [
            min(all_states[:, i]) for i in range(len(all_states[0, :]))
        ]
        max_state_bounds = [
            max(all_states[:, i]) for i in range(len(all_states[0, :]))
        ]
        min_action_bounds = [
            min(all_actions[:, i]) for i in range(len(all_actions[0, :]))
        ]
        max_action_bounds = [
            max(all_actions[:, i]) for i in range(len(all_actions[0, :]))
        ]

        low_state_s = np.tile(min_state_bounds, self.n)
        low_state_a = np.tile(min_action_bounds, self.n - 1)
        high_state_s = np.tile(max_state_bounds, self.n)
        high_state_a = np.tile(max_action_bounds, self.n - 1)

        # self.action_space = Box(low=np.array(min_action_bounds), high=np.array(max_action_bounds))
        self.action_space = gym.spaces.Discrete(81)  # 3^4 discretized motor settings (see init_action_dict)
        self.observation_space = Box(low=np.append(low_state_s, low_state_a), \
          high=np.append(high_state_s, high_state_a))

    def state_failed(self, s):
        """
		Check whether a state has failed, so the trajectory should terminate.
		This happens when either the roll or pitch angle > 30
		Returns: failure flag [bool]
		"""
        if abs(s[3]) > 30.0 or abs(s[4]) > 30.0:
            return True
        return False

    def init_action_dict(self):
        motor_discretized = [[-5000, 0, 5000], [-5000, 0, 5000],
                             [-5000, 0, 5000], [-5000, 0, 5000]]
        action_dict = dict()
        for ac, action in enumerate(list(
                itertools.product(*motor_discretized))):
            action_dict[ac] = np.asarray(action) + self.equilibrium
        print(action_dict)
        return action_dict

    def get_reward_state(self, s_next):
        """
		Returns reward of being in a certain state.
		"""
        pitch = s_next[3]
        roll = s_next[4]
        if self.state_failed(s_next):
            # return -1 * 1000
            return 0
        loss = pitch**2 + roll**2
        loss = np.sqrt(loss)
        reward = 1800 - loss  # This should be positive. TODO: Double check
        reward = 30 - loss
        # reward = 1
        return reward

    def sample_random_state(self):
        """
		Samples random state from previous logs. Ensures that the sampled
		state is not in a failed state.
		"""
        state_failed = True
        acceptable = False
        while not acceptable:
            # random_state = self.observation_space.sample()

            row_idx = np.random.randint(self.dyn_data.shape[0])
            random_state = self.dyn_data.iloc[row_idx,
                                              12:12 + 9 * self.n].values
            random_state_s = self.dyn_data.iloc[row_idx,
                                                12:12 + 9 * self.n].values
            random_state_a = self.dyn_data.iloc[row_idx, 12 + 9 * self.n +
                                                4:12 + 9 * self.n + 4 + 4 *
                                                (self.n - 1)].values
            random_state = np.append(random_state_s, random_state_a)

            # if abs(random_state[3]) < 5 and abs(random_state[4]) < 5:
            # 	acceptable = True

            state_failed = self.state_failed(random_state)
            if not state_failed:
                acceptable = True
        return random_state

    def next_state(self, state, action):
        # Note that the states defined in env are different
        state_dynamics = state[:9 * self.n]
        action_dynamics = np.append(
            action, state[9 * self.n:9 * self.n + 4 * (self.n - 1)])
        state_change = self.dyn_nn.predict(state_dynamics, action_dynamics)

        next_state = state_change
        next_state[3:6] = state[3:6] + state_change[3:6]
        # next_state = state[:9] + state_change
        past_state = state[:9 * (self.n - 1)]

        new_state = np.concatenate((next_state, state[:9 * (self.n - 1)]))
        new_action = np.concatenate(
            (action, state[9 * self.n:9 * self.n + 4 * (self.n - 2)]))  #

        return np.concatenate((new_state, new_action))

    def step(self, ac):
        action = self.action_dict[ac]
        new_state = self.next_state(self.state, action)
        self.state = new_state
        reward = self.get_reward_state(new_state)
        done = False
        if self.state_failed(
                new_state) or not self.observation_space.contains(new_state):
            done = True
        return self.state, reward, done, {}

    def reset(self):
        self.state = self.sample_random_state()
        return self.state
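
The discrete action space of DiscreteQuadEnv is just an index into the lookup table built by init_action_dict(); the standalone sketch below reproduces that mapping with the same offsets and equilibrium values, outside the class:

import itertools
import numpy as np

# Every motor gets an offset in {-5000, 0, +5000}, applied around the hover equilibrium.
equilibrium = np.array([31687.1, 37954.7, 33384.8, 36220.11])
offsets = [[-5000, 0, 5000]] * 4
action_dict = {i: np.asarray(combo) + equilibrium
               for i, combo in enumerate(itertools.product(*offsets))}
print(len(action_dict))   # 81 = 3**4 discrete motor commands
print(action_dict[0])     # all four motors 5000 below equilibrium
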
class Color:
    def __init__(self, color, shade):
        """
        Implements a color class characterized by a color and shade attributes.
        Parameters
        ----------
        color: str
            Color in red, blue, green.
        shade: str
            Shade in light, dark.
        """
        self.color = color
        self.shade = shade
        if color == 'blue':
            if shade == 'light':
                self.space = Box(low=np.array([0.3, 0.7, 0.9]),
                                 high=np.array([0.5, 0.8, 1.]),
                                 dtype=np.float32)
            elif shade == 'dark':
                self.space = Box(low=np.array([0.0, 0., 0.8]),
                                 high=np.array([0.2, 0.2, 0.9]),
                                 dtype=np.float32)
            else:
                raise NotImplementedError("shade is either 'light' or 'dark'")
        elif color == 'red':
            if shade == 'light':
                self.space = Box(low=np.array([0.9, 0.4, 0.35]),
                                 high=np.array([1, 0.6, 0.65]),
                                 dtype=np.float32)
            elif shade == 'dark':
                self.space = Box(low=np.array([0.5, 0., 0.]),
                                 high=np.array([0.7, 0.1, 0.1]),
                                 dtype=np.float32)
            else:
                raise NotImplementedError("shade is either 'light' or 'dark'")
        elif color == 'green':
            if shade == 'light':
                self.space = Box(low=np.array([0.4, 0.8, 0.4]),
                                 high=np.array([0.6, 1, 0.5]),
                                 dtype=np.float32)
            elif shade == 'dark':
                self.space = Box(low=np.array([0., 0.4, 0.]),
                                 high=np.array([0.1, 0.6, 0.1]),
                                 dtype=np.float32)
            else:
                raise NotImplementedError
        elif color == 'dark':
            if shade == 'dark':
                self.space = Box(low=np.array([0., 0., 0.]),
                                 high=np.array([0.3, 0.3, 0.3]),
                                 dtype=np.float32)
            elif shade == 'light':
                self.space = Box(low=np.array([1., 1., 1.]),
                                 high=np.array([2., 2., 2.]),
                                 dtype=np.float32)
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError("color must be 'red', 'blue', 'green' or 'dark'")

    def contains(self, rgb):
        """
        Whether the class contains a given rgb code.
        Parameters
        ----------
        rgb: 1D nd.array of size 3

        Returns
        -------
        contains: Bool
            True if rgb code in given Color class.
        """
        contains = self.space.contains(rgb)
        if self.color == 'red' and self.shade == 'light':
            contains = contains and (rgb[2] - rgb[1] <= 0.05)
        return contains

    def sample(self):
        """
        Sample an rgb code from the Color class

        Returns
        -------
        rgb: 1D nd.array of size 3
        """
        rgb = np.random.uniform(self.space.low, self.space.high, 3)
        if self.color == 'red' and self.shade == 'light':
            rgb[2] = rgb[1] + np.random.uniform(-0.05, 0.05)
        return rgb
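
A short usage sketch for the Color helper above (it only needs numpy and the gym Box already imported by this example):

light_blue = Color('blue', 'light')
rgb = light_blue.sample().astype(np.float32)   # cast so the float32 Box accepts the array
print(rgb, light_blue.contains(rgb))           # a code drawn from the light-blue box -> True
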
Example #20
class Environment(gym.Env):
    """
    Markov Decision Process associated to the microgrid.
        Parameters
        ----------
            microgrid: microgrid, mandatory
                The controlled microgrid.
            random_seed: int, optional
                Seed to be used to generate the needed random numbers to size microgrids.
    """
    def __init__(self, env_config, seed=42):
        # Set seed
        np.random.seed(seed)

        self.states_normalization = Preprocessing.normalize_environment_states(
            env_config['microgrid'])

        self.TRAIN = True
        # Microgrid
        self.env_config = env_config
        self.mg = env_config['microgrid']
        # State space
        self.mg.train_test_split()
        #np.zeros(2+self.mg.architecture['grid']*3+self.mg.architecture['genset']*1)
        # Number of states
        self.Ns = len(self.mg._df_record_state.keys()) + 1
        # Number of actions

        # training reward smoothing
        self.training_reward_smoothing = env_config.get(
            'training_reward_smoothing', 'sqrt')

        self.resampling_on_reset = env_config.get('resampling_on_reset', True)

        if self.resampling_on_reset == True:
            self.forecast_args = env_config['forecast_args']
            self.baseline_sampling_args = env_config['baseline_sampling_args']
            self.saa = generate_sampler(self.mg, self.forecast_args)

        self.observation_space = Box(low=-1,
                                     high=np.inf,
                                     shape=(self.Ns, ),
                                     dtype=np.float64)
        #np.zeros(len(self.mg._df_record_state.keys()))
        # Action space
        self.metadata = {"render.modes": ["human"]}

        self.state, self.reward, self.done, self.info, self.round = None, None, None, None, None

        # Start the first round
        self.seed()
        self.reset()

        try:
            assert (self.observation_space.contains(self.state))
        except AssertionError:
            print("ERROR : INVALID STATE", self.state)

    def get_reward(self):
        if self.TRAIN == True:
            if self.training_reward_smoothing == 'sqrt':
                return -(self.mg.get_cost()**0.5)
            if self.training_reward_smoothing == 'peak_load':
                return -self.mg.get_cost(
                ) / self.mg.parameters['load'].values[0]
        return -self.mg.get_cost()

    def get_cost(self):
        return sum(self.mg._df_record_cost['cost'])

    def step(self, action):

        # CONTROL
        if self.done:
            print("WARNING : EPISODE DONE")  # should never reach this point
            return self.state, self.reward, self.done, self.info
        try:
            assert (self.observation_space.contains(self.state))
        except AssertionError:
            print("ERROR : INVALID STATE", self.state)

        try:
            assert (self.action_space.contains(action))
        except AssertionError:
            print("ERROR : INVALD ACTION", action)

        # UPDATE THE MICROGRID
        control_dict = self.get_action(action)
        self.mg.run(control_dict)

        # COMPUTE NEW STATE AND REWARD
        self.state = self.transition()
        self.reward = self.get_reward()
        self.done = self.mg.done
        self.info = {}
        self.round += 1

        return self.state, self.reward, self.done, self.info

#         control_dict = self.get_action(action)
#         self.mg.run(control_dict)
#         reward = self.reward()
#         s_ = self.transition()
#         self.state = s_
#         done = self.mg.done
#         self.round += 1
#         return s_, reward, done, {}

    def reset(self, testing=False):
        if "testing" in self.env_config:
            testing = self.env_config["testing"]
        self.round = 1
        # Reseting microgrid
        self.mg.reset(testing=testing)
        if testing == True:
            self.TRAIN = False
        elif self.resampling_on_reset == True:
            Preprocessing.sample_reset(self.mg.architecture['grid'] == 1,
                                       self.saa,
                                       self.mg,
                                       sampling_args=self.baseline_sampling_args)

        self.state, self.reward, self.done, self.info = self.transition(
        ), 0, False, {}

        return self.state

    def get_action(self, action):
        """
        :param action: current action
        :return: control_dict : dict of controls
        """
        '''
        States are:
        binary variable whether charging or discharging
        battery power, normalized to 1
        binary variable whether importing or exporting
        grid power, normalized to 1
        binary variable whether genset is on or off
        genset power, normalized to 1
        '''

        control_dict = []

        return control_dict

    def states(self):  # soc, price, load, pv 'df status?'
        observation_space = []
        return observation_space

    # Transition function
    def transition(self):
        #         net_load = round(self.mg.load - self.mg.pv)
        #         soc = round(self.mg.battery.soc,1)
        #         s_ = (net_load, soc)  # next state
        updated_values = self.mg.get_updated_values()
        updated_values = {
            x: float(updated_values[x]) / self.states_normalization[x]
            for x in self.states_normalization
        }
        updated_values['hour_sin'] = np.sin(
            2 * np.pi * updated_values['hour']
        )  # the hour is already divided by 24 in the line above
        updated_values['hour_cos'] = np.cos(2 * np.pi * updated_values['hour'])
        updated_values.pop('hour', None)

        s_ = np.array(list(updated_values.values()))
        #np.array(self.mg.get_updated_values().values)#.astype(np.float)#self.mg.get_updated_values()
        #s_ = [ s_[key] for key in s_.keys()]
        return s_

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def render(self, mode="human"):
        txt = "state: " + str(self.state) + " reward: " + str(
            self.reward) + " info: " + str(self.info)
        print(txt)

    # Mapping between action and the control_dict
    def get_action_continuous(self, action):
        """
        :param action: current action
        :return: control_dict : dict of controls
        """
        '''
        Actions are:
        binary variable whether charging or discharging
        battery power, normalized to 1
        binary variable whether importing or exporting
        grid power, normalized to 1
        binary variable whether genset is on or off
        genset power, normalized to 1
        '''
        print(action)

        mg = self.mg
        pv = mg.pv
        load = mg.load
        net_load = load - pv
        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge = max(0, min(-net_load, capa_to_charge, p_charge_max))

        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))

        control_dict = {}

        if mg.architecture['battery'] == 1:
            control_dict['battery_charge'] = max(
                0, action[0] *
                min(action[1] * mg.battery.capacity, mg.battery.capa_to_charge,
                    mg.battery.power_charge))
            control_dict['battery_discharge'] = max(
                0, (1 - action[0]) *
                min(action[1] * mg.battery.capacity,
                    mg.battery.capa_to_discharge, mg.battery.power_discharge))

        if mg.architecture['grid'] == 1:
            if mg.grid.status == 1:
                control_dict['grid_import'] = max(
                    0, action[2] * min(action[3] * mg.grid.power_import,
                                       mg.grid.power_import))
                control_dict['grid_export'] = max(0, (1 - action[2]) * min(
                    action[3] * mg.grid.power_export, mg.grid.power_export))

        if mg.architecture['genset'] == 1:
            control_dict['genset'] = max(
                0, action[4] * min(action[5] * mg.genset.rated_power,
                                   mg.genset.rated_power))

        return control_dict

    def get_action_discrete(self, action):
        """
        :param action: current action
        :return: control_dict : dict of controls
        """
        '''
        Actions are:
        binary variable whether charging or discharging
        battery power, normalized to 1
        binary variable whether importing or exporting
        grid power, normalized to 1
        binary variable whether genset is on or off
        genset power, normalized to 1
        '''
        control_dict = {}

        control_dict['pv_consumed'] = action[0]
        if self.mg.architecture['battery'] == 1:
            control_dict['battery_charge'] = action[1] * action[3]
            control_dict['battery_discharge'] = action[2] * (1 - action[3])

        if self.mg.architecture['genset'] == 1:
            control_dict['genset'] = action[4]

            if self.mg.architecture['grid'] == 1:
                control_dict['grid_import'] = action[5] * action[7]
                control_dict['grid_export'] = action[6] * (1 - action[7])

        elif self.mg.architecture['grid'] == 1:
            control_dict['grid_import'] = action[4] * action[6]
            control_dict['grid_export'] = action[5] * (1 - action[6])

        return control_dict

    # Mapping between action and the control_dict
    def get_action_priority_list(self, action):
        """
        :param action: current action
        :return: control_dict : dict of controls
        """
        '''
        States are:
        binary variable whether charging or discharging
        battery power, normalized to 1
        binary variable whether importing or exporting
        grid power, normalized to 1
        binary variable whether genset is on or off
        genset power, normalized to 1
        '''

        mg = self.mg
        pv = mg.pv
        load = mg.load
        net_load = load - pv
        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge = max(0, min(-net_load, capa_to_charge, p_charge_max))

        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))

        control_dict = {}

        control_dict = self.actions_agent_discret(mg, action)

        return control_dict

    def actions_agent_discret(self, mg, action):
        if mg.architecture['genset'] == 1 and mg.architecture['grid'] == 1:
            control_dict = self.action_grid_genset(mg, action)

        elif mg.architecture['genset'] == 1 and mg.architecture['grid'] == 0:
            control_dict = self.action_genset(mg, action)

        else:
            control_dict = self.action_grid(mg, action)

        return control_dict

    def action_grid(self, mg, action):
        # slack is grid

        pv = mg.pv
        load = mg.load

        net_load = load - pv

        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge_pv = max(0, min(-net_load, capa_to_charge, p_charge_max))
        p_charge_grid = max(0, min(capa_to_charge, p_charge_max))

        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))

        # Charge
        if action == 0:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge_pv,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': max(0, pv - min(pv, load) - p_charge_pv),
                'genset': 0
            }

        if action == 4:
            load = load + p_charge_grid
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge_grid,
                'battery_discharge': 0,
                'grid_import': max(0, load - min(pv, load)),
                'grid_export': max(0, pv - min(pv, load) - p_charge_grid),
                'genset': 0
            }

        # Discharge fully
        elif action == 1:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': p_discharge,
                'grid_import': max(0, load - min(pv, load) - p_discharge),
                'grid_export': 0,
                'genset': 0
            }

        # Import
        elif action == 2:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': max(0, net_load),
                'grid_export': 0,
                'genset': 0
            }
        # Export
        elif action == 3:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': abs(min(net_load, 0)),
                'genset': 0
            }

        return control_dict

    def action_grid_genset(self, mg, action):
        # slack is grid

        pv = mg.pv
        load = mg.load

        net_load = load - pv
        status = mg.grid.status  # whether there is an outage or not
        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge_pv = max(0, min(-net_load, capa_to_charge, p_charge_max))
        p_charge_grid = max(0, min(capa_to_charge, p_charge_max))

        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))

        capa_to_genset = mg.genset.rated_power * mg.genset.p_max
        p_genset = max(0, min(net_load, capa_to_genset))

        # Charge
        if action == 0:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge_pv,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export':
                max(0, pv - min(pv, load) - p_charge_pv) * status,
                'genset': 0
            }
        if action == 5:
            load = load + p_charge_grid

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge_grid,
                'battery_discharge': 0,
                'grid_import': max(0, load - min(pv, load)) * status,
                'grid_export':
                max(0, pv - min(pv, load) - p_charge_grid) * status,
                'genset': 0
            }

        # Discharge fully
        elif action == 1:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': p_discharge,
                'grid_import':
                max(0, load - min(pv, load) - p_discharge) * status,
                'grid_export': 0,
                'genset': 0
            }

        # Import
        elif action == 2:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': max(0, net_load) * status,
                'grid_export': 0,
                'genset': 0
            }
        # Export
        elif action == 3:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': abs(min(net_load, 0)) * status,
                'genset': 0
            }
        # Genset
        elif action == 4:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': 0,
                'genset': max(net_load, 0)
            }

        elif action == 6:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': p_discharge,
                'grid_import': 0,
                'grid_export': 0,
                'genset': max(0, load - min(pv, load) - p_discharge),
            }

        return control_dict

    def action_genset(self, mg, action):
        # slack is genset

        pv = mg.pv
        load = mg.load

        net_load = load - pv

        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge = max(0, min(-net_load, capa_to_charge, p_charge_max))

        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))

        capa_to_genset = mg.genset.rated_power * mg.genset.p_max
        p_genset = max(0, min(net_load, capa_to_genset))

        # Charge
        if action == 0:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': 0,
                'genset': 0
            }

        # Discharge fully
        elif action == 1:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': p_discharge,
                'grid_import': 0,
                'grid_export': 0,
                'genset': max(0, load - min(pv, load) - p_discharge)
            }

        # Genset
        elif action == 2:

            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': 0,
                'genset': max(0, load - min(pv, load))
            }

        return control_dict
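
Each of the action_* helpers above first derives how much the battery may charge or discharge from the current net load and the battery limits; the standalone numeric sketch below isolates that computation (all values are illustrative):

# Illustrative numbers only (same formulas as in action_grid / action_genset above).
pv, load = 60.0, 45.0
net_load = load - pv                       # negative: PV surplus is available
capa_to_charge, p_charge_max = 30.0, 20.0
capa_to_discharge, p_discharge_max = 25.0, 20.0

p_charge = max(0, min(-net_load, capa_to_charge, p_charge_max))          # 15.0: charge from the PV surplus
p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))  # 0.0: no deficit to cover
print(p_charge, p_discharge)
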
Example #21
class CopterEnvBase(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 50
    }

    def __init__(self, strict_actions=False):
        self.viewer = None
        self.setup = CopterSetup()
        self._seed()
        self._strict_action_space = strict_actions
        self.allowed_fly_range = Box(np.array([-10, -10, 0]),
                                     np.array([10, 10, 10]))
        self.observation_space = Box(-np.inf, np.inf, (4 * 3 + 4, ))

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _step(self, action):
        if self._strict_action_space:
            assert self.action_space.contains(
                action), "%r (%s) invalid" % (action, type(action))
        else:
            action = np.clip(action, self.action_space.low,
                             self.action_space.high)

        self._control = self._control_from_action(action)
        for i in range(2):
            simulate(self.copterstatus, self.setup, self._control, 0.01)

        done = not self.allowed_fly_range.contains(self.copterstatus.position)
        reward = -1 if done else 0
        return self._get_state(), reward, done, {
            "rotor-speed": self.copterstatus.rotor_speeds
        }

    def _get_state(self):
        s = self.copterstatus
        return np.concatenate([
            s.position, s.velocity, s.attitude, s.angular_velocity,
            s.rotor_speeds
        ])

    def _reset(self):
        self.copterstatus = CopterStatus()
        return self._get_state()

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        if self.viewer is None:
            self.viewer = rendering.Viewer(500, 500)
            self.center = self.copterstatus.position[0]

        self.center = 0.9 * self.center + 0.1 * self.copterstatus.position[0]
        self.viewer.set_bounds(-7 + self.center, 7 + self.center, -1, 13)

        # draw ground
        _draw_ground(self.viewer, self.center)
        _draw_copter(self.viewer, self.setup, self.copterstatus)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

    def _control_from_action(self, action):
        raise NotImplementedError()
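
When strict_actions is disabled, _step() above silently clips out-of-range actions to the Box bounds instead of asserting; a standalone sketch of that lenient handling:

import numpy as np
from gym.spaces import Box

action_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
raw_action = np.array([1.7, -0.2, -3.0, 0.5], dtype=np.float32)
action = np.clip(raw_action, action_space.low, action_space.high)
print(action)   # values outside [-1, 1] are clamped to the nearest bound
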
Example #22
class FarmEnv(gym.Env):
    """
    Description:
        A wind farm is controlled by yaw angles
    Observation:
        Type: Box(n+2)
        Num     Observation                        Min                Max
        0       current yaw angle (°)             -max_yaw            max_yaw
        ...     ...                               ...                 ...
        n       current yaw angle (°)             -max_yaw            max_yaw
        n+1     wind angle (°)                     0                  359
        n+2     wind speed (kts)                   min_wind_speed     max_wind_speed

    Actions:
        Type: Box(n) if continuous_action_space else MultiDiscrete(n)
        Num   Action                     Min                Max
        0     yaw angle (°)             -max_yaw            max_yaw
        ...
        n     yaw angle (°)             -max_yaw            max_yaw

    Reward:
        Reward is power increase for every step taken, including the termination step
    Starting State:
        TBD
    Episode Termination:
        Episode length is greater than 30 steps.
    """
    def __init__(self, config):
        self.farm = config['farm']
        self.numwt = config['num_wind_turbines']

        # initialize yaw boundaries
        self.allowed_yaw = config["max_yaw"]

        self.min_wind_speed = config["min_wind_speed"]
        self.max_wind_speed = config["max_wind_speed"]
        self.min_wind_angle = config["min_wind_angle"]
        self.max_wind_angle = config["max_wind_angle"]

        self.continuous_action_space = config["continuous_action_space"]

        self.best_explored_power = {}
        self.count_steps = 0
        self.initialized_yaw_angle = 0
        self.cur_yaws = np.full((self.numwt, ), 0, dtype=np.int32)

        self.turbine_powers = np.full((self.numwt, ), 0, dtype=np.float64)
        self.turbulent_intensities = np.full((self.numwt, ),
                                             0,
                                             dtype=np.float64)
        self.thrust_coefs = np.full((self.numwt, ), 0, dtype=np.float64)
        self.wt_speed_u = np.full((self.numwt, ), 0, dtype=np.float64)
        self.wt_speed_v = np.full((self.numwt, ), 0, dtype=np.float64)

        self.cur_wind_speed = [8.]  # in kts
        self.cur_wind_angle = [270.]  # in degrees

        self.initial_wind_angle = 0  # in degrees
        self.max_wind_direction_variation = (10, )  # max wind angle variation during episode (degrees)

        self.cur_nominal_power = 0
        self.cur_power = 0
        self.cur_power_ratio = 0
        self.cur_nominal_ti_sum = 0

        if self.continuous_action_space:
            # action space is the yaw angles for the wt , to be multiplied by allowed_yaw°
            action_low = np.full((self.numwt, ), -1., dtype=np.float32)
            action_high = np.full((self.numwt, ), 1., dtype=np.float32)
            self.action_space = Box(low=action_low,
                                    high=action_high,
                                    shape=(self.numwt, ))
        else:
            # discrete action space
            self.action_space = MultiDiscrete(
                np.full((self.numwt, ), 2 * self.allowed_yaw,
                        dtype=np.float32))

        print(f'action space : {self.action_space}')
        print(f'action space : {self.action_space.sample()}')

        # observation space TODO
        observation_high = np.concatenate(
            (
                #  np.array([self.max_wind_angle, self.max_wind_speed])),
                np.array([1.] * self.numwt
                         ),  # yaw max positions for all wind turbines
                #  np.array([1] * self.numwt),  # x axis wind speed for all wind turbines
                #  np.array([1] * self.numwt),  # y axis wind speed for all wind turbines
                np.array([1] * self.numwt
                         ),  # max turbulence intensity for all wind turbines
                np.array([1]),  # max power ratio
                #  np.array([1] * self.numwt),  # max thrust coef for all wind turbines
                #  np.array([1] * self.numwt),  # max power coef for all wind turbines
                np.array([1]),  # max sinus wind angle
                np.array([1]),  # max cosinus wind angle
                np.array(
                    [1])),  # max normalized wind speed (range 2 to 25.5 m.s-1)
            axis=0)
        observation_low = np.concatenate(
            (
                #  np.array([self.min_wind_angle, self.min_wind_speed])),
                np.array([-1] * self.numwt
                         ),  # yaw min positions for all wind turbines
                #  np.array([-1] * self.numwt),  # x axis wind speed for all wind turbines
                #  np.array([-1] * self.numwt),  # y axis wind speed for all wind turbines
                np.array([0.] * self.numwt
                         ),  # min turbulence intensity for all wind turbines
                np.array([-1]),  # min power ratio
                #  np.array([0] * self.numwt),  # min thrust coef for all wind turbines
                #  np.array([0] * self.numwt),  # min power coef for all wind turbines
                np.array([-1]),  # min sinus wind angle
                np.array([-1]),  # min cosinus wind angle
                np.array(
                    [0])),  # min normalized wind speed (range 2 to 25.5 m.s-1)
            axis=0)
        print(f'observation low : {observation_low}')
        print(f'observation high : {observation_high}')
        self.observation_space = Box(low=observation_low,
                                     high=observation_high,
                                     shape=(self.numwt * 2 + 4, ),
                                     dtype=np.float64)
        print(f'observation space : {self.observation_space}')

    def reset(self, wd=None, ws=None):
        self.count_steps = 0
        self.cur_yaws = np.full((self.numwt, ), 0, dtype=np.int32)

        # Define wind conditions for this episode

        # check wind speed in range (2 to 25,5)
        assert self.max_wind_speed < 25.5, "max wind speed too high"
        assert self.min_wind_speed > 2., "min wind speed too low"

        if wd:
            self.cur_wind_angle = wd
        else:
            self.cur_wind_angle = np.random.randint(self.min_wind_angle,
                                                    self.max_wind_angle)

        if ws:
            self.cur_wind_speed = ws
        else:
            self.cur_wind_speed = np.random.uniform(self.min_wind_speed,
                                                    self.max_wind_speed)

        self.initial_wind_angle = self.cur_wind_angle

        # Update the flow in the model
        print(f'wind angle {self.cur_wind_angle}')
        print(f'wind speed {self.cur_wind_speed}')
        self.farm.reinitialize_flow_field(wind_direction=[self.cur_wind_angle],
                                          wind_speed=[self.cur_wind_speed])
        self.farm.calculate_wake()
        self.cur_nominal_power = self.farm.get_farm_power()
        self.best_explored_power[self.cur_wind_angle] = self.cur_nominal_power
        self.cur_nominal_ti_sum = np.sum(self.farm.get_turbine_ti())

        state = self.get_observation()
        # print(f'initial state is {state}')
        # print(f'observation space is {self.observation_space}')

        return state  # return current state of the environment

    def get_observation(self):
        self.turbulent_intensities = (np.array(self.farm.get_turbine_ti()) -
                                      0.055) / 0.07  #rescaling
        self.cur_power = self.farm.get_farm_power()

        # self.thrust_coefs = self.farm.get_turbine_ct()

        #
        # turbine_powers = self.farm.get_turbine_power()
        # self.turbine_powers = turbine_powers / np.max(turbine_powers)
        #
        # wind_speed_points_at_wt = pd.DataFrame(self.farm.get_set_of_points(self.farm_layout[0], self.farm_layout[1], [80.] * self.numwt).head(self.numwt))
        # u_wind_speed_points_at_wt = np.array(wind_speed_points_at_wt.u)
        # v_wind_speed_points_at_wt = np.array(wind_speed_points_at_wt.v)
        # self.wt_speed_u = u_wind_speed_points_at_wt / self.cur_wind_speed
        # self.wt_speed_v = v_wind_speed_points_at_wt / self.cur_wind_speed

        current_yaws = self.cur_yaws / self.allowed_yaw
        self.cur_power_ratio = (
            self.cur_power - self.cur_nominal_power) / self.cur_nominal_power
        observation = np.concatenate(
            (
                #  self.wt_speed_u,
                # self.wt_speed_v,
                current_yaws,
                self.turbulent_intensities,
                # self.thrust_coefs,
                # self.turbine_powers,
                np.array([self.cur_power_ratio]),
                np.array([sind(self.cur_wind_angle)]),
                np.array([cosd(self.cur_wind_angle)]),
                np.array([self.cur_wind_speed / 25.5]),
            ),
            axis=0)

        return observation

    def step(self, action, no_variation=False):

        # check actions validity
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg

        #  Execute the actions
        if self.continuous_action_space:
            self.cur_yaws = action * self.allowed_yaw
        else:
            self.cur_yaws = action - self.allowed_yaw

        print(f'current yaws {self.cur_yaws}')

        if not no_variation:
            # Apply wind variation, keeping the angle within the allowed band
            lower = self.initial_wind_angle - self.max_wind_direction_variation[0]
            upper = self.initial_wind_angle + self.max_wind_direction_variation[0]
            if lower <= self.cur_wind_angle <= upper:
                self.cur_wind_angle = self.cur_wind_angle + np.random.randint(
                    -1, 2)
            self.farm.reinitialize_flow_field(
                wind_direction=[self.cur_wind_angle],
                wind_speed=[self.cur_wind_speed])
            print(f'new {self.cur_wind_angle}')
            self.farm.calculate_wake()
            self.cur_nominal_power = self.farm.get_farm_power()
        # Get the Observations from the simulation
        self.farm.calculate_wake(yaw_angles=self.cur_yaws)

        observation = self.get_observation()

        # check observation
        err_msg = "%r (%s) invalid" % (observation, type(observation))
        assert self.observation_space.contains(observation), err_msg

        # reward calc
        # power_ratio = (self.cur_power - self.best_explored_power[self.cur_wind_angle]) / self.best_explored_power[self.cur_wind_angle]

        reward = self.cur_power_ratio * 100
        print(f'power ratio       {self.cur_power_ratio}')

        # if self.cur_power > self.best_explored_power[self.cur_wind_angle]:
        #     self.best_explored_power[self.cur_wind_angle] = self.cur_power

        self.count_steps += 1

        # Done Evaluation
        if self.count_steps == 30:
            done = True
        else:
            done = False

        return observation, reward, done, {}
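
FarmEnv switches between a continuous Box and a MultiDiscrete action space depending on the config; the sketch below builds both variants and shows how step() maps samples back to yaw angles (numwt and allowed_yaw are made-up illustrative values):

import numpy as np
from gym.spaces import Box, MultiDiscrete

numwt, allowed_yaw = 3, 20
continuous_space = Box(low=np.full((numwt,), -1.0, dtype=np.float32),
                       high=np.full((numwt,), 1.0, dtype=np.float32))
discrete_space = MultiDiscrete(np.full((numwt,), 2 * allowed_yaw))

print(continuous_space.sample() * allowed_yaw)   # continuous case: yaw angles in degrees
print(discrete_space.sample() - allowed_yaw)     # discrete case: yaw angles in degrees
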
Example #23
class PhyHumanoid(gym.Env):
    def __init__(self, config):

        args = ['--arg_file']
        args.append(os.path.join("data", config["config_file"]))

        self.enable_draw = config.get("enable_draw", False)
        self.show_kin = config.get("show_kin", False)
        self.draw_frame_skip = config.get("draw_frame_skip", 5)

        # self.enable_draw = config["enable_draw"]
        # self.show_kin = config["show_kin"]

        if self.enable_draw:
            self.num_draw_frame = 0
            self.win_width = 800
            self.win_height = int(self.win_width * 9.0 / 16.0)
            self._init_draw()

        self.playback_speed = 1

        self._core = DeepMimicCore.cDeepMimicCore(self.enable_draw)
        rand_seed = np.random.randint(np.iinfo(np.int32).max)
        self._core.SeedRand(rand_seed)

        self._core.ParseArgs(args)

        self._core.Init()
        self._core.SetPlaybackSpeed(self.playback_speed)

        fps = 60
        self.update_timestep = 1.0 / fps

        num_substeps = self._core.GetNumUpdateSubsteps()
        self.timestep = self.update_timestep / num_substeps

        self.agent_id = 0
        self.act_size = self._core.GetActionSize(self.agent_id)
        print(self.act_size)
        self.action_space = Box(-2 * np.pi,
                                2 * np.pi,
                                shape=(self.act_size, ),
                                dtype=np.float32)

        self.state_size = self._core.GetStateSize(self.agent_id)
        self.goal_size = self._core.GetGoalSize(self.agent_id)
        self.ground_size = 0
        self.obs_size = self.state_size + self.goal_size + self.ground_size
        self.observation_space = Box(-np.inf,
                                     np.inf,
                                     shape=(self.obs_size, ),
                                     dtype=np.float32)

        if self.enable_draw:
            self.reshaping = False
            self._setup_draw()
            if self.show_kin:
                self._core.Keyboard(107, 0, 0)

    def _init_draw(self):
        glutInit()

        # glutInitContextVersion(3, 2)
        glutInitContextFlags(GLUT_FORWARD_COMPATIBLE)
        glutInitContextProfile(GLUT_CORE_PROFILE)

        glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH)
        glutInitWindowSize(self.win_width, self.win_height)
        glutCreateWindow(b'ExtDeepMimic')
        return

    def _setup_draw(self):
        self._reshape(self.win_width, self.win_height)
        self._core.Reshape(self.win_width, self.win_height)

        return

    def _draw(self):

        if self.num_draw_frame % self.draw_frame_skip == 0:
            self._update_intermediate_buffer()
            self._core.Draw()
            glutSwapBuffers()
            self.reshaping = False
        self.num_draw_frame += 1
        return

    def _reshape(self, w, h):

        self.reshaping = True
        self.win_width = w
        self.win_height = h

        return

    def _shutdown(self):

        print('Shutting down...')
        self._core.Shutdown()
        sys.exit(0)
        return

    def _update_intermediate_buffer(self):
        if not (self.reshaping):
            if (self.win_width != self._core.GetWinWidth()
                    or self.win_height != self._core.GetWinHeight()):
                self._core.Reshape(self.win_width, self.win_height)

        return

    def reset(self):
        self._core.Reset()
        if self.enable_draw:
            self.num_draw_frame = 0
            self._draw()
        obs = self._get_observation()
        return obs

    def step(self, action):
        prev_obs = self._get_observation()
        self._core.SetAction(self.agent_id, np.asarray(action).tolist())

        # self._core.Update(self.timestep)
        need_update = True
        while need_update:
            self._core.Update(self.timestep)
            if self.enable_draw:
                self._draw()

            valid_episode = self._core.CheckValidEpisode()
            if valid_episode:
                done = self._core.IsEpisodeEnd()
                if done:

                    obs = self._get_observation()
                    valid_obs, obs = self._check_observation(obs, prev_obs)
                    if valid_obs:
                        reward = self._core.CalcReward(self.agent_id)
                        return obs, reward, True, {"valid": 1.0}
                    else:
                        return obs, 0.0, True, {"valid": 0.0}

                else:
                    need_update = not self._core.NeedNewAction(self.agent_id)

            else:

                obs = self._get_observation()
                valid_obs, obs = self._check_observation(obs, prev_obs)
                return obs, 0.0, True, {"valid": 0.0}

        obs = self._get_observation()
        valid_obs, obs = self._check_observation(obs, prev_obs)
        if valid_obs:
            reward = self._core.CalcReward(self.agent_id)
            return obs, reward, False, {"valid": 1.0}
        else:
            return obs, 0.0, True, {"valid": 0.0}

    def _get_observation(self):
        state = np.array(self._core.RecordState(self.agent_id))
        goal = np.array(self._core.RecordGoal(self.agent_id))
        # ground = np.array(self._core.RecordGround(self.agent_id))
        obs = np.concatenate([state, goal], axis=0)
        return obs

    def _check_observation(self, obs, prev_obs):
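        # Reject observations that contain NaNs or fall outside observation_space;
        # in either case, fall back to the previous (known-good) observation.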
        if np.isnan(obs).any():
            return False, prev_obs
        else:
            if self.observation_space.contains(obs):
                return True, obs
            else:
                return False, prev_obs

    def get_state_size(self, agent_id):
        return self._core.GetStateSize(agent_id)

    def get_goal_size(self, agent_id):
        return self._core.GetGoalSize(agent_id)

    def get_action_size(self, agent_id):
        return self._core.GetActionSize(agent_id)

    def get_num_actions(self, agent_id):
        return self._core.GetNumActions(agent_id)

    def build_state_offset(self, agent_id):
        return np.array(self._core.BuildStateOffset(agent_id))

    def build_state_scale(self, agent_id):
        return np.array(self._core.BuildStateScale(agent_id))

    def build_goal_offset(self, agent_id):
        return np.array(self._core.BuildGoalOffset(agent_id))

    def build_goal_scale(self, agent_id):
        return np.array(self._core.BuildGoalScale(agent_id))

    def build_action_offset(self, agent_id):
        return np.array(self._core.BuildActionOffset(agent_id))

    def build_action_scale(self, agent_id):
        return np.array(self._core.BuildActionScale(agent_id))

    def build_action_bound_min(self, agent_id):
        return np.array(self._core.BuildActionBoundMin(agent_id))

    def build_action_bound_max(self, agent_id):
        return np.array(self._core.BuildActionBoundMax(agent_id))

    def build_state_norm_groups(self, agent_id):
        return np.array(self._core.BuildStateNormGroups(agent_id))

    def build_goal_norm_groups(self, agent_id):
        return np.array(self._core.BuildGoalNormGroups(agent_id))

    def get_reward_min(self, agent_id):
        return self._core.GetRewardMin(agent_id)

    def get_reward_max(self, agent_id):
        return self._core.GetRewardMax(agent_id)
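
A minimal rollout sketch for the environment above. The constructor name `ExtDeepMimicEnv` and the presence of an `action_space` attribute are assumptions, since neither appears in this excerpt:

env = ExtDeepMimicEnv()                      # assumed constructor; the real signature may differ
obs = env.reset()
done = False
episode_return = 0.0
while not done:
    action = env.action_space.sample()       # stand-in for a trained policy
    obs, reward, done, info = env.step(action)
    if info["valid"] == 1.0:                 # step() flags NaN / out-of-space observations as invalid
        episode_return += reward
print('return:', episode_return)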
Beispiel #24
0
    def split(self, nid):
        # Try nb_split_attempts splits on region corresponding to node <nid>
        reg = self.tree.get_node(nid).data
        best_split_score = 0
        best_bounds = None
        best_sub_regions = None
        is_split = False
        for i in range(self.nb_split_attempts):
            sub_reg1 = [
                deque(maxlen=self.maxlen + 1),
                deque(maxlen=self.maxlen + 1)
            ]
            sub_reg2 = [
                deque(maxlen=self.maxlen + 1),
                deque(maxlen=self.maxlen + 1)
            ]

            # repeat until both sub-regions contain at least minlen points of the mother region
            while len(sub_reg1[0]) < self.minlen or len(
                    sub_reg2[0]) < self.minlen:
                # decide on dimension
                dim = self.random_state.choice(range(self.nb_dims))
                threshold = reg.bounds.sample()[dim]
                bounds1 = Box(reg.bounds.low,
                              reg.bounds.high,
                              dtype=np.float32)
                bounds1.high[dim] = threshold
                bounds2 = Box(reg.bounds.low,
                              reg.bounds.high,
                              dtype=np.float32)
                bounds2.low[dim] = threshold
                bounds = [bounds1, bounds2]
                valid_bounds = True

                if np.any(bounds1.high - bounds1.low < self.dims_ranges *
                          self.min_dims_range_ratio):
                    valid_bounds = False
                if np.any(bounds2.high - bounds2.low < self.dims_ranges *
                          self.min_dims_range_ratio):
                    valid_bounds = False

                # perform split in sub regions
                sub_reg1 = [
                    deque(maxlen=self.maxlen + 1),
                    deque(maxlen=self.maxlen + 1)
                ]
                sub_reg2 = [
                    deque(maxlen=self.maxlen + 1),
                    deque(maxlen=self.maxlen + 1)
                ]
                # use a separate index so we don't shadow the split-attempt counter `i`
                for j, task in enumerate(reg.r_t_pairs[1]):
                    if bounds1.contains(task):
                        sub_reg1[1].append(task)
                        sub_reg1[0].append(reg.r_t_pairs[0][j])
                    else:
                        sub_reg2[1].append(task)
                        sub_reg2[0].append(reg.r_t_pairs[0][j])
                sub_regions = [sub_reg1, sub_reg2]

            # compute alp
            alp = [self.compute_alp(sub_reg1), self.compute_alp(sub_reg2)]

            # compute score
            split_score = len(sub_reg1) * len(sub_reg2) * np.abs(alp[0] -
                                                                 alp[1])
            if split_score >= best_split_score and valid_bounds:
                is_split = True
                best_split_score = split_score
                best_sub_regions = sub_regions
                best_bounds = bounds
                best_alp = alp

        if is_split:
            # add new nodes to tree
            for i, (r_t_pairs,
                    bounds) in enumerate(zip(best_sub_regions, best_bounds)):
                self.tree.create_node(identifier=self.tree.size(),
                                      parent=nid,
                                      data=Region(self.maxlen,
                                                  r_t_pairs=r_t_pairs,
                                                  bounds=bounds,
                                                  alp=best_alp[i]))
        else:
            assert len(reg.r_t_pairs[0]) == (self.maxlen + 1)
            reg.r_t_pairs[0] = deque(
                islice(reg.r_t_pairs[0], int(self.maxlen * self.discard_ratio),
                       self.maxlen + 1))
            reg.r_t_pairs[1] = deque(
                islice(reg.r_t_pairs[1], int(self.maxlen * self.discard_ratio),
                       self.maxlen + 1))

        return is_split
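
The bounds-splitting step above can be read in isolation: a parent Box is cut along one sampled dimension at a sampled threshold, producing two child Boxes. A standalone sketch (the helper name `split_box` is illustrative only, not part of the class above):

import numpy as np
from gym.spaces import Box

def split_box(parent, dim, threshold):
    # Mirror the bounds1 / bounds2 construction in split(): copy the parent's
    # bounds, then move one face of each child to the threshold.
    left = Box(parent.low, parent.high, dtype=np.float32)
    left.high[dim] = threshold
    right = Box(parent.low, parent.high, dtype=np.float32)
    right.low[dim] = threshold
    return left, right

parent = Box(low=np.zeros(2), high=np.ones(2), dtype=np.float32)
cut = parent.sample()[1]                     # sample a cut point, as split() does
left, right = split_box(parent, dim=1, threshold=cut)
assert left.contains(left.sample()) and right.contains(right.sample())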
Beispiel #25
0
class ParametrizedWaves(object):
    """The main OpenAI Gym class. It encapsulates an environment with
    arbitrary behind-the-scenes dynamics. An environment can be
    partially or fully observed.
    The main API methods that users of this class need to know are:
        step
        reset
        render
        close
        seed
    And set the following attributes:
        action_space: The Space object corresponding to valid actions
        observation_space: The Space object corresponding to valid observations
        reward_range: A tuple corresponding to the min and max possible rewards
    Note: a default reward range set to [-inf,+inf] already exists. Set it if you want a narrower range.
    The methods are accessed publicly as "step", "reset", etc. The
    non-underscored versions are wrapper methods to which we may add
    functionality over time.
    """

    # Set this in SOME subclasses
    metadata = {'render.modes': []}
    reward_range = (-float('inf'), float('inf'))
    spec = None

    def __init__(self):
        super().__init__()
        self.amplitude = np.random.randint(100) / 10
        self.frequency = np.random.randint(100) / 10
        self.action_space = Box(low=-10 * np.ones(2), high=10 * np.ones(2))
        self.observation_space = Box(low=-10 * np.ones(2),
                                     high=10 * np.ones(2))

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        Args:
            action (object): an action provided by the agent
        Returns:
            observation (object): agent's observation of the current environment
            reward (float) : amount of reward returned after previous action
            done (boolean): whether the episode has ended, in which case further step() calls will return undefined results
            info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning)
        """
        if not self.action_space.contains(action):
            raise Exception("Action %s is not in action space." % action)

        x = np.linspace(0, 1, 1000)
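        # Note: the reconstructed waves below are currently unused; the reward
        # compares the (amplitude, frequency) parameters directly.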
        wave_ground_truth = eval_wave(x, self.amplitude, self.frequency)
        wave = eval_wave(x, action[0], action[1])
        #reward = -squared_loss(wave_ground_truth, wave)
        reward = -squared_loss(np.array([self.amplitude, self.frequency]),
                               np.array([action[0], action[1]]))
        observation = np.array([self.amplitude, self.frequency])
        done = True

        return observation, reward, done, {}

    def reset(self):
        """Resets the state of the environment and returns an initial observation.
        Returns: observation (object): the initial observation of the
            space.
        """
        self.amplitude = np.random.randint(100) / 10
        self.frequency = np.random.randint(100) / 10
        observation = np.array([self.amplitude, self.frequency])

        return observation

    def render(self, mode='human'):
        """Renders the environment.
        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.) By convention,
        if mode is:
        - human: render to the current display or terminal and
          return nothing. Usually for human consumption.
        - rgb_array: Return an numpy.ndarray with shape (x, y, 3),
          representing RGB values for an x-by-y pixel image, suitable
          for turning into a video.
        - ansi: Return a string (str) or StringIO.StringIO containing a
          terminal-style text representation. The text can include newlines
          and ANSI escape sequences (e.g. for colors).
        Note:
            Make sure that your class's metadata 'render.modes' key includes
              the list of supported modes. It's recommended to call super()
              in implementations to use the functionality of this method.
        Args:
            mode (str): the mode to render with
            close (bool): close all open renderings
        Example:
        class MyEnv(Env):
            metadata = {'render.modes': ['human', 'rgb_array']}
            def render(self, mode='human'):
                if mode == 'rgb_array':
                    return np.array(...) # return RGB frame suitable for video
                elif mode == 'human':
                    ... # pop up a window and render
                else:
                    super(MyEnv, self).render(mode=mode) # just raise an exception
        """
        raise NotImplementedError

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        return

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Note:
            Some environments use multiple pseudorandom number generators.
            We want to capture all such seeds used in order to ensure that
            there aren't accidental correlations between multiple generators.
        Returns:
            list<bigint>: Returns the list of seeds used in this env's random
              number generators. The first value in the list should be the
              "main" seed, or the value which a reproducer should pass to
              'seed'. Often, the main seed equals the provided 'seed', but
              this won't be true if seed=None, for example.
        """
        logger.warn("Could not seed environment %s", self)
        return

    @property
    def unwrapped(self):
        """Completely unwrap this env.
        Returns:
            gym.Env: The base non-wrapped gym.Env instance
        """
        return self

    def __str__(self):
        if self.spec is None:
            return '<{} instance>'.format(type(self).__name__)
        else:
            return '<{}<{}>>'.format(type(self).__name__, self.spec.id)
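
ParametrizedWaves depends on two helpers, eval_wave and squared_loss, that are not included in the snippet. Plausible stand-ins plus a one-step episode might look like this (a sketch under those assumptions, not the original helpers):

import numpy as np

def eval_wave(x, amplitude, frequency):
    # assumed form: a sine wave parametrized by amplitude and frequency
    return amplitude * np.sin(2 * np.pi * frequency * x)

def squared_loss(target, prediction):
    return float(np.sum((target - prediction) ** 2))

env = ParametrizedWaves()
obs = env.reset()                            # obs == [amplitude, frequency] of the hidden wave
action = env.action_space.sample()           # guess an (amplitude, frequency) pair
obs, reward, done, info = env.step(action)
assert done                                  # every episode lasts exactly one step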