Code example #1
File: encoding.py Project: fossabot/yarllib
class Encoder:
    """Encode/Decode a multi-discrete space into a discrete one."""
    def __init__(self, input_space: MultiDiscrete):
        """
        Initialize the space_encoder.

        :param input_space: the space to encode.
        """
        self.input_space = input_space
        self.output_space = Discrete(np.prod(input_space.nvec))

    def encode(self, s: Sequence[int]) -> int:
        """
        Do the encoding.

        :param s: a point in the multi-discrete space.
        :return: the encoded version of the point.
        """
        if type(s) in {list, tuple}:
            s = np.array(s)
        assert_(self.input_space.contains(s),
                f"{s} is not contained in the input space.")
        dims = self.input_space.nvec
        result = 0
        for i, dim in zip(s, dims[1:]):
            result += i
            result *= dim
        result += s[-1]
        assert_(
            self.output_space.contains(result),
            f"{result} is not contained in the output space.",
        )
        return result

    def decode(self, s: int) -> Sequence[int]:
        """
        Do the decoding.

        :param s: a point in a discrete space.
        :return: the decoded version of the point into a multi-discrete space.
        """
        assert_(self.output_space.contains(s),
                f"{s} is not contained in the output space.")
        dims = self.input_space.nvec
        result = []
        tmp = s
        for dim in reversed(dims[1:]):
            result.append(tmp % dim)
            tmp = tmp // dim
        result.append(tmp)
        result = np.array(list(reversed(result)))
        assert_(
            self.input_space.contains(result),
            f"{result} is not contained in the input space.",
        )
        return result
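A minimal usage sketch for the encoder above (assuming numpy, gym.spaces, and yarllib's assert_ helper are importable as in the original module); the numbers follow directly from the mixed-radix arithmetic in encode/decode:

from gym.spaces import MultiDiscrete

space = MultiDiscrete([3, 4, 5])
encoder = Encoder(space)

# encode flattens the point with mixed-radix weights: 1*(4*5) + 2*5 + 3 = 33
code = encoder.encode([1, 2, 3])    # -> 33

# decode inverts the mapping back to the multi-discrete point
point = encoder.decode(code)        # -> array([1, 2, 3])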
Code example #2
class MyEnv(Env):
    def __init__(self):
        self.action_space = Discrete(4)
        self.prev_action = 0
        self.state = np.zeros(4).astype(np.int32)

    def render(self):
        pass
        # print('action: ', self.prev_action, 'state:', self.state)

    def step(self, action):
        self.prev_action = action
        assert self.action_space.contains(action)
        self.state[action] += 1

        reward = 0
        if self.state[action] == 10:
            reward = 1
        elif self.state[action] > 10:
            reward = -1
        is_done = np.all(self.state > 9) or np.any(self.state > 15)
        if is_done:
            reward = 1 if np.all(self.state > 9) else -1
        return np.copy(self.state), reward, is_done, {}

    def reset(self):
        self.state = np.zeros(4).astype(np.int32)
        return np.copy(self.state)  # Gym's reset() is expected to return the initial observation
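A possible rollout sketch for the toy counter environment above (assuming numpy and gym's Env/Discrete are imported as the class requires); a random policy is stepped until the episode terminates:

env = MyEnv()
obs = env.reset()
done = False
total_reward = 0
while not done:
    action = env.action_space.sample()            # random action in {0, 1, 2, 3}
    obs, reward, done, info = env.step(action)
    total_reward += reward
print(obs, total_reward)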
Code example #3
File: action.py Project: jake-aft/tradeflow
class DiscreteActionToMarketOrder(Kernel):
    """
    Maps gym.spaces.Discrete actions to executable Market Orders.
    """

    # TODO: speed up by turning off action validation

    def __init__(
        self,
        assets,
        name='AssetActionMap',
        task=0,
        log=None,
        log_level=INFO,
    ):
        super().__init__(name=name, task=task, log=log, log_level=log_level)
        assets = list(assets)
        try:
            assert len(assets) == 1

        except AssertionError:
            e = 'This class currently supports single asset trading only'
            self.log.error(e)
            raise ValueError(e)

        self.assets = assets

        self.space = Discrete(4)
        self.action_map = {0: None, 1: 'buy', 2: 'sell', 3: 'close'}

    def update_state(self, input_state, reset):
        if reset:
            self._start(input_state)

        else:
            self._update_state(input_state)

        return self.state

    def _start(self, action):
        self.state = []

    def _update_state(self, action):
        try:
            assert self.space.contains(action)

        except AssertionError:
            e = 'Provided action `{}` is not a valid member of defined action space `{}`'.format(
                action, self.space)
            self.log.error(e)
            raise TypeError(e)

        if action != 0:
            self.state = [MarketOrder(self.assets[0], self.action_map[action])]

        else:
            self.state = []
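A hypothetical usage sketch (the Kernel base class and MarketOrder come from the surrounding project, so this only runs there; 'EURUSD' is an illustrative asset name): action 2 of the Discrete(4) space is translated into a single 'sell' market order for the configured asset.

mapper = DiscreteActionToMarketOrder(assets=['EURUSD'])   # hypothetical asset name
mapper.update_state(0, reset=True)            # reset clears the pending-order list
orders = mapper.update_state(2, reset=False)
# orders == [MarketOrder('EURUSD', 'sell')]; action 0 would leave the list empty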
Code example #4
File: multi_step.py Project: frt03/pic
class MultiStepEnv(Env):
    def __init__(self, horizon=1):
        self.horizon = horizon
        self.action_space = Discrete(2)
        self.observation_space = Box(low=0, high=1, shape=(3, ))
        self.state_dict = {
            's_1': np.array([1, 0, 0]),
            's_2': np.array([0, 1, 0]),
            's_3': np.array([0, 0, 1]),
            's_4': np.array([1, 1, 1]),
            's_5': np.array([0, 0, 0]),
        }
        self.seed()
        self.reset()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.state = 's_1'
        self.time_step = 0
        return self.state_dict[self.state]

    def step(self, action):
        assert self.action_space.contains(action)
        reward = 0.0
        if self.state == 's_1':
            if action == 0:
                self.state = 's_2'
            elif action == 1:
                self.state = 's_3'
        elif self.state == 's_2':
            if action == 1:
                self.state = 's_4'
        elif self.state == 's_4':
            if action == 0:
                self.state = 's_5'

        self.time_step += 1
        if self.time_step == self.horizon:
            done = True
            if self.state == 's_2' and self.time_step == 1:
                reward = 1.0
            elif self.state == 's_4' and self.time_step == 2:
                reward = 1.0
            elif self.state == 's_5' and self.time_step == 3:
                reward = 1.0
        else:
            done = False

        return self.state_dict[self.state], reward, done, {'state': self.state}
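A short illustration of the state chain above (a sketch assuming the Discrete/Box/seeding imports used by the class): with horizon=3, the action sequence 0, 1, 0 walks s_1 -> s_2 -> s_4 -> s_5 and collects the terminal reward of 1.0.

env = MultiStepEnv(horizon=3)
obs = env.reset()                        # one-hot encoding of 's_1'
for action in (0, 1, 0):
    obs, reward, done, info = env.step(action)
print(info['state'], reward, done)       # 's_5' 1.0 True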
Code example #5
class Discretizer:
    def __init__(self, space, n_bins):
        assert isinstance(space, Box), \
            "Discretization is only implemented for Box spaces."
        assert space.is_bounded()
        self.space = space
        self.n_bins = n_bins

        # initialize bins
        assert n_bins > 0, "Discretizer requires n_bins > 0"
        n_elements = 1
        tol = 1e-8
        self.dim = len(self.space.low)
        n_elements = n_bins**self.dim
        self._bins = []
        self._open_bins = []
        for dd in range(self.dim):
            range_dd = self.space.high[dd] - self.space.low[dd]
            epsilon = range_dd / n_bins
            bins_dd = []
            for bb in range(n_bins + 1):
                val = self.space.low[dd] + epsilon * bb
                bins_dd.append(val)
            self._open_bins.append(tuple(bins_dd[1:]))
            bins_dd[-1] += tol  # "close" the last interval
            self._bins.append(tuple(bins_dd))

        # set observation space
        self.discrete_space = Discrete(n_elements)

        # List of discretized elements
        self.discretized_elements = np.zeros((self.dim, n_elements))
        for ii in range(n_elements):
            self.discretized_elements[:, ii] = self.get_coordinates(ii, False)

    def discretize(self, coordinates):
        return binary_search_nd(coordinates, self._bins)

    def get_coordinates(self, flat_index, randomize=False):
        assert self.discrete_space.contains(flat_index), "invalid flat_index"
        # get multi-index
        index = unravel_index_uniform_bin(flat_index, self.dim, self.n_bins)

        # get coordinates
        coordinates = np.zeros(self.dim)
        for dd in range(self.dim):
            coordinates[dd] = self._bins[dd][index[dd]]
            if randomize:
                range_dd = self.space.high[dd] - self.space.low[dd]
                epsilon = range_dd / self.n_bins
                coordinates[dd] += epsilon * self.space.rng.uniform()
        return coordinates
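A usage sketch for the discretizer above (it depends on the binary_search_nd and unravel_index_uniform_bin helpers referenced in the class, so it only runs inside the original project): a bounded 2-D Box with 4 bins per dimension yields a Discrete space of 4**2 = 16 flat indices.

space = Box(low=np.array([0.0, 0.0]), high=np.array([1.0, 1.0]))
disc = Discretizer(space, n_bins=4)
print(disc.discrete_space.n)                   # 16
flat = disc.discretize(np.array([0.3, 0.9]))   # flat index of the bin containing the point
corner = disc.get_coordinates(flat)            # lower corner of that bin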
Code example #6
class MyEnv(Env):
    def __init__(self):
        self.action_space = Discrete(5)
        self.prev_action = 0
        self.prev_state = np.zeros(4).astype(np.int32)
        self.state = np.zeros(4).astype(np.int32)

    def render(self):
        pass
        # print('action: ', self.prev_action, 'state:', self.state)

    def step(self, action):
        self.prev_action = action
        assert self.action_space.contains(action)
        reward = 0
        if action < 4:
            self.prev_state = np.copy(self.state)
            self.state[action] += 1

        if np.sum(self.state == 10) > np.sum(self.prev_state == 10):
            reward = 1
        elif np.sum(self.state > 10) > np.sum(self.prev_state > 10):
            reward = -1

        is_done = np.all(self.state > 9) or np.any(self.state > 15)
        if is_done:
            reward = 1 if np.all(self.state > 9) else -1

        return np.copy(self.state), reward, is_done, {}

    def random_effect(self):
        if random() < 0.2:
            print('random effect')
            random_index = randint(0, 3)
            if self.state[random_index] < 9:
                self.state[random_index] += 1

    def reset(self):
        self.prev_state = np.zeros(4).astype(np.int32)
        self.state = np.zeros(4).astype(np.int32)
        return np.copy(self.state)  # Gym's reset() is expected to return the initial observation
Code example #7
class Version9(MontageWfEnv):
    """
        @version 9.0:
            - Update version of V5.0
            - Required update V5 to only take in CS
            - Required verify it's convergence by simulation before deployment
    """
    def __init__(self):
        # Montage Experiment Variable
        super(Version9, self).__init__()

        # 3: cs, 5: cs & cn
        self.action_range = 5
        # 10 = 10 processors available
        self.cluster_range = 10
        self.action_space = Discrete(self.action_range)
        self.observation_space = Discrete(
            self.cluster_range)  #, Discrete(self.cluster_range)
        self.cluster_size = 1
        self.cluster_num = 1

        # Episode Conf
        # Best exec_time: None or 1, depends on reward version
        self.best_overhead = None
        self.last_overhead = None
        self.last_action = None
        self.reward = 0
        self.total_reward = 0.0
        self.is_training = False

        self.exec_records = {}
        self.all_exec_record = list()
        self.all_overhead_record = list()
        self.all_makespan_record = list()
        self.seed()
        self.reset()

    def step(self, action, training=False):
        assert self.action_space.contains(action)
        self.is_training = training
        done = False
        reward = 0.0

        if action == 1:
            self.cluster_size += 1
        elif action == 2:
            self.cluster_size -= 1
        elif action == 3:
            self.cluster_num += 1
        elif action == 4:
            self.cluster_num -= 1

        # Range Guarding Function
        if self.cluster_size <= 0:
            # reward = -100.0
            self.cluster_size = 1
        elif self.cluster_size > 10:
            # reward = -100.0
            self.cluster_size = 10
        elif self.cluster_num <= 0:
            # reward = -100.0
            self.cluster_num = 1
        elif self.cluster_num > 10:
            # reward = -100.0
            self.cluster_num = 10
        else:
            # Return all the data collected
            # status, jb, wt, cwt = self.run_experiment(self.clusters_size, self.clusters_num)

            # Experiment run failed -> High Penalty
            # if not status:
            #     return self._get_obs(), -10, True, {}
            # jb = 0 if jb is None else jb
            # wt = 0 if wt is None else wt
            # cwt = 0 if cwt is None else cwt

            jb = self.run_gen_experiment(self.cluster_size, self.cluster_num)
            # jb = self.run_gen_experiment(self.cluster_size)
            wt = 0
            cwt = 0

            # self.exec_time = jb
            self.exec_time = float(jb)
            overhead = self.exec_time

            self.all_overhead_record.append(jb)
            self.all_makespan_record.append(jb)
            self.all_exec_record.append(jb)

            if not training:
                # Setting up best exec time
                if self.best_overhead is None:
                    self.best_overhead = overhead
                if self.last_overhead is None:
                    self.last_overhead = overhead

                # Rewarding / Penalty Judgement
                if overhead < np.percentile(self.all_overhead_record, 20):
                    self.best_overhead = overhead
                    reward = 200
                else:
                    reward = -100
                self.last_overhead = overhead

            self.total_reward += reward
            self.last_action = action
            if self.total_reward > 2000 or self.total_reward < -1000:
                done = True

        return self._get_obs(), reward, done, {
            "exec": self.exec_time,
            "overhead": self.exec_time,
            "makespan": self.exec_time
        }

    def reset(self):
        if self.last_overhead is not None:
            self.last_overhead = self.exec_time
        self.cluster_size = np.random.randint(1, self.cluster_range)
        self.cluster_num = np.random.randint(1, self.cluster_range)
        self.total_reward = 0
        # For cs only
        # return self.cluster_size
        # For cs & cn
        return self.cluster_size, self.cluster_num

    def _get_obs(self):
        # For cs
        # return self.cluster_size
        # For cs & cn
        return self.cluster_size, self.cluster_num
Code example #8
class PocEnv(Env):
    def __init__(self, maze, obs_array=False):
        self.board = select_maze(maze)
        self.grid = PocGrid(board=self.board["_maze"])
        self._get_init_state()
        self.action_space = Discrete(4)
        self.observation_space = Discrete(1 << 10)  # 1024
        # self.observation_space = Discrete(14)
        self._reward_range = 100
        self._discount = .95
        self.done = False

        self.gui = None

        if obs_array:
            self._set_flags = set_flags_array
            self._zero = lambda: [0] * 10
        else:
            self._set_flags = set_flags
            self._zero = lambda: 0

    def seed(self, seed=None):
        np.random.seed(seed)

    def is_power(self, idx):
        return self.board['_maze'][idx] == 3

    def is_passable(self, idx):
        return self.board['_maze'][idx] != 0

    def _is_valid(self):

        assert self.grid.is_inside(self.state.agent_pos)
        assert self.is_passable(self.state.agent_pos)
        for ghost in self.state.ghosts:
            assert self.grid.is_inside(ghost.pos)
            assert self.is_passable(ghost.pos)

    def _set_state(self, state):
        self.done = False
        self.state = state

    def _generate_legal(self):
        actions = []
        for action in range(self.action_space.n):
            if self.grid.is_inside(self.state.agent_pos +
                                   Moves.get_coord(action)):
                actions.append(action)
        return actions

    def step(self, action):
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg
        assert self.done is False

        self.state.action = action

        reward = -1
        next_pos = self._next_pos(self.state.agent_pos, action)
        if next_pos.is_valid():
            self.state.agent_pos = next_pos
        else:
            reward += -25

        if self.state.power_step > 0:
            self.state.power_step -= 1

        hit_ghost = -1
        for g, ghost in enumerate(self.state.ghosts):
            if ghost.pos == self.state.agent_pos:
                hit_ghost = g
            else:
                # move ghost
                self._move_ghost(g, ghost_range=self.board["_ghost_range"])
                if ghost.pos == self.state.agent_pos:
                    hit_ghost = g

        if hit_ghost >= 0:
            if self.state.power_step > 0:
                reward += 25
                self.state.ghosts[hit_ghost].reset()
            else:
                reward += -100
                self.done = True
        # don't eat power up when hit by a ghost already
        elif self.is_power(self.state.agent_pos):
            self.state.power_step = config["_power_steps"]
            reward += 10
        # same for food
        elif self.state.food_pos[self.grid.get_index(self.state.agent_pos)]:
            self.state.food_pos[self.grid.get_index(self.state.agent_pos)] = 0
            if sum(self.state.food_pos) == 0:
                reward += 1000
                self.done = True

        obs = self._make_ob()

        return obs, reward, self.done, {"state": self.state}

    def _make_ob(self):
        obs = self._zero()
        for d in range(self.action_space.n):
            if self._see_ghost(d) >= 0:
                obs = self._set_flags(obs, d)
            next_pos = self._next_pos(self.state.agent_pos, direction=d)
            if next_pos.is_valid() and self.is_passable(next_pos):
                obs = self._set_flags(obs, d + self.action_space.n)
        if self._smell_food():
            obs = self._set_flags(obs, 8)
        if self._hear_ghost(self.state):
            obs = self._set_flags(obs, 9)
        return obs

    def _encode_state(self, state):
        poc_idx = self.grid.get_index(state.agent_pos)
        ghosts = [(self.grid.get_index(ghost.pos), ghost.direction)
                  for ghost in state.ghosts]

        return np.concatenate([[poc_idx], *ghosts, state.food_pos,
                               [state.power_step]])

    def _decode_state(self, state):
        poc_state = PocState(Coord(*self.grid.get_coord(state[0])))
        ghosts = np.split(state[1:self.board["_num_ghosts"] * 3], 1)
        for g in ghosts:
            poc_state.ghosts.append(
                Ghost(pos=self.grid.get_coord(g[0]), direction=g[1]))
        poc_state.power_step = state[-1]
        poc_state.food_pos = state[self.board["_num_ghosts"] * 3:-1].tolist()
        return poc_state

    def _see_ghost(self, action):
        eye_pos = self.state.agent_pos + Moves.get_coord(action)
        while True:
            for g, ghost in enumerate(self.state.ghosts):
                if ghost.pos == eye_pos:
                    return g
            eye_pos += Moves.get_coord(action)
            if not (self.grid.is_inside(eye_pos)
                    and self.is_passable(eye_pos)):
                break
        return -1

    def _smell_food(self, smell_range=1):
        for x in range(-smell_range, smell_range + 1):
            for y in range(-smell_range, smell_range + 1):
                smell_pos = Coord(x, y)
                idx = self.grid.get_index(self.state.agent_pos + smell_pos)
                if self.grid.is_inside(self.state.agent_pos + smell_pos) and\
                        self.state.food_pos[idx]:
                    return True
        return False

    @staticmethod
    def _hear_ghost(poc_state, hear_range=2):
        for ghost in poc_state.ghosts:
            if Grid.manhattan_distance(ghost.pos,
                                       poc_state.agent_pos) <= hear_range:
                return True
        return False

    def render(self, mode='human', close=False):
        if close:
            return
        if mode == 'human':
            if self.gui is None:
                self.gui = PocGui(board_size=self.grid.get_size,
                                  maze=self.board["_maze"],
                                  state=self.state)
            else:
                self.gui.render(state=self.state)

    def reset(self):
        self.done = False
        self._get_init_state()
        return self._make_ob()

    def close(self):
        pass

    def _get_init_state(self):
        self.state = PocState()
        self.state.agent_pos = Coord(*self.board["_poc_home"])
        ghost_home = Coord(*self.board["_ghost_home"])

        for g in range(self.board["_num_ghosts"]):
            pos = Coord(ghost_home.x + g % 2, ghost_home.y + g // 2)
            self.state.ghosts.append(Ghost(pos, direction=-1))

        self.state.food_pos = np.random.binomial(1,
                                                 config["_food_prob"],
                                                 size=self.grid.n_tiles)
        # only make free space food
        idx = (self.board["_maze"] > 0) &\
              (self.state.food_pos.reshape(self.board["_maze"].shape) > 0)
        self.board["_maze"][idx] = 4
        self.state.power_step = 0
        return self.state

    def _next_pos(self, pos, direction):
        direction = Moves.get_coord(direction)
        if pos.x == 0 and pos.y == self.board['_passage_y'] and\
                direction == Moves.EAST:
            next_pos = Coord(self.grid.x_size - 1, pos.y)
        elif pos.x == self.grid.x_size - 1 and\
                pos.y == self.board['_passage_y'] and direction == Moves.WEST:
            next_pos = Coord(0, pos.y)
        else:
            next_pos = pos + direction

        if self.grid.is_inside(next_pos) and self.is_passable(next_pos):
            return next_pos
        else:
            return Coord(-1, -1)

    def _move_ghost(self, g, ghost_range):
        if Grid.manhattan_distance(self.state.agent_pos,
                                   self.state.ghosts[g].pos) < ghost_range:
            if self.state.power_step > 0:
                self._move_defensive(g)
            else:
                self._move_aggressive(g)
        else:
            self._move_random(g)

    def _move_aggressive(self, g):
        if not np.random.binomial(1, p=config["_chase_prob"]):
            return self._move_random(g)

        best_dist = self.grid.x_size + self.grid.y_size
        best_pos = self.state.ghosts[g].pos
        best_dir = -1
        for d in range(self.action_space.n):
            dist = Grid.directional_distance(self.state.agent_pos,
                                             self.state.ghosts[g].pos, d)

            new_pos = self._next_pos(self.state.ghosts[g].pos, d)
            if dist <= best_dist and new_pos.is_valid() and\
                    can_move(self.state.ghosts[g], d):
                best_pos = new_pos
                best_dist = dist
                best_dir = d

        self.state.ghosts[g].update(best_pos, best_dir)

    def _move_defensive(self, g, defensive_prob=.5):
        if np.random.binomial(1, defensive_prob) and\
                self.state.ghosts[g].direction >= 0:
            self.state.ghosts[g].direction = -1
            return

        best_dist = 0
        best_pos = self.state.ghosts[g].pos
        best_dir = -1
        for d in range(self.action_space.n):
            dist = Grid.directional_distance(self.state.agent_pos,
                                             self.state.ghosts[g].pos, d)

            new_pos = self._next_pos(self.state.ghosts[g].pos, d)
            if dist >= best_dist and new_pos.is_valid() and\
                    can_move(self.state.ghosts[g], d):
                best_pos = new_pos
                best_dist = dist
                best_dir = d

        self.state.ghosts[g].update(best_pos, best_dir)

    def _move_random(self, g):
        # there are !!! dead ends
        # only switch to opposite direction when it failed 10 times (hack)
        ghost_pos = self.state.ghosts[g].pos
        i = 0
        while True:
            d = self.action_space.sample()
            next_pos = self._next_pos(ghost_pos, d)
            # normal map has dead ends:
            if next_pos.is_valid() and (can_move(self.state.ghosts[g], d)
                                        or i > 10):
                break
            i += 1

        self.state.ghosts[g].update(next_pos, d)
Code example #9
File: rock.py Project: muthissar/gym_pomdp
class RockEnv(Env):
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self, board_size=7, num_rocks=8, use_heuristic=False):

        assert board_size in list(
            config.keys()) and num_rocks in config[board_size]['size']

        self.num_rocks = num_rocks
        self._use_heuristic = use_heuristic

        self._rock_pos = [
            Coord(*rock) for rock in config[board_size]['rock_pos']
        ]
        self._agent_pos = Coord(*config[board_size]['init_pos'])
        self.grid = Grid(board_size, board_size)

        for idx, rock in enumerate(self._rock_pos):
            self.grid.board[rock] = idx

        self.action_space = Discrete(len(Action) + self.num_rocks)
        self.observation_space = Discrete(len(Obs))
        self._discount = .95
        self._reward_range = 20
        self._penalization = -100
        self._query = 0

    def seed(self, seed=None):
        np.random.seed(seed)

    def step(self, action):

        assert self.action_space.contains(action)
        assert self.done is False

        self.last_action = action
        self._query += 1

        reward = 0
        ob = Obs.NULL.value

        if action < Action.SAMPLE.value:
            if action == Action.EAST.value:
                if self.state.agent_pos.x + 1 < self.grid.x_size:
                    self.state.agent_pos += Moves.EAST.value
                else:
                    reward = 10
                    self.done = True
                    return ob, reward, self.done, {
                        "state": self._encode_state(self.state)
                    }
            elif action == Action.NORTH.value:
                if self.state.agent_pos.y + 1 < self.grid.y_size:
                    self.state.agent_pos += Moves.NORTH.value
                else:
                    reward = self._penalization
            elif action == Action.SOUTH.value:
                if self.state.agent_pos.y - 1 >= 0:
                    self.state.agent_pos += Moves.SOUTH.value
                else:
                    reward = self._penalization
            elif action == Action.WEST.value:
                if self.state.agent_pos.x - 1 >= 0:
                    self.state.agent_pos += Moves.WEST.value
                else:
                    reward = self._penalization
            else:
                raise NotImplementedError()

        if action == Action.SAMPLE.value:
            rock = self.grid[self.state.agent_pos]
            if rock >= 0 and not self.state.rocks[
                    rock].status == 0:  # collected
                if self.state.rocks[rock].status == 1:
                    reward = 10
                else:
                    reward = -10
                self.state.rocks[rock].status = 0
            else:
                reward = self._penalization

        if action > Action.SAMPLE.value:
            rock = action - Action.SAMPLE.value - 1
            assert rock < self.num_rocks

            ob = self._sample_ob(self.state.agent_pos, self.state.rocks[rock])

            self.state.rocks[rock].measured += 1

            eff = self._efficiency(self.state.agent_pos,
                                   self.state.rocks[rock].pos)

            if ob == Obs.GOOD.value:
                self.state.rocks[rock].count += 1
                self.state.rocks[rock].lkv *= eff
                self.state.rocks[rock].lkw *= (1 - eff)
            else:
                self.state.rocks[rock].count -= 1
                self.state.rocks[rock].lkw *= eff
                self.state.rocks[rock].lkv *= (1 - eff)

            denom = (.5 * self.state.rocks[rock].lkv) + (
                .5 * self.state.rocks[rock].lkw)
            self.state.rocks[rock].prob_valuable = (
                .5 * self.state.rocks[rock].lkv) / denom

        self.done = self._penalization == reward
        return ob, reward, self.done, {"state": self._encode_state(self.state)}

    def _decode_state(self, state, as_array=False):

        agent_pos = Coord(*state['agent_pos'])
        rock_state = RockState(agent_pos)
        for r in state['rocks']:
            rock = Rock(pos=0)
            rock.__dict__.update(r)
            rock_state.rocks.append(rock)

        if as_array:
            rocks = []
            for rock in rock_state.rocks:
                rocks.append(rock.status)

            return np.concatenate([[self.grid.get_index(agent_pos)], rocks])

        return rock_state

    def _encode_state(self, state):
        # use dictionary for state encoding

        return _encode_dict(state)
        # rocks can take 3 values: -1, 1, 0 if collected

    def render(self, mode='human', close=False):
        if close:
            return
        if mode == "human":
            if not hasattr(self, "gui"):
                start_pos = self.grid.get_index(self.state.agent_pos)
                obj_pos = [(self.grid.get_index(rock.pos), rock.status)
                           for rock in self.state.rocks]
                self.gui = RockGui((self.grid.x_size, self.grid.y_size),
                                   start_pos=start_pos,
                                   obj=obj_pos)

            if self.last_action > Action.SAMPLE.value:
                rock = self.last_action - Action.SAMPLE.value - 1
                print("Rock S: {} P:{}".format(self.state.rocks[rock].status,
                                               self.state.rocks[rock].pos))
            # msg = "Action : " + action_to_str(self.last_action) + " Step: " + str(self.t) + " Rw: " + str(self.total_rw)
            agent_pos = self.grid.get_index(self.state.agent_pos)
            self.gui.render(agent_pos)

    def reset(self):
        self.done = False
        self._query = 0
        self.last_action = Action.SAMPLE.value
        self.state = self._get_init_state(should_encode=False)
        return Obs.NULL.value

    def _set_state(self, state):
        self.done = False
        self.state = self._decode_state(state)

    def close(self):
        self.render(close=True)

    def _compute_prob(self, action, next_state, ob):

        next_state = self._decode_state(next_state)

        if action <= Action.SAMPLE.value:
            return int(ob == Obs.NULL.value)

        eff = self._efficiency(
            next_state.agent_pos,
            next_state.rocks[action - Action.SAMPLE.value - 1].pos)

        if ob == Obs.GOOD.value and next_state.rocks[action -
                                                     Action.SAMPLE.value -
                                                     1].status == 1:
            return eff
        elif ob == Obs.BAD.value and next_state.rocks[action -
                                                      Action.SAMPLE.value -
                                                      1].status == -1:
            return eff
        else:
            return 1 - eff

    def _get_init_state(self, should_encode=True):

        rock_state = RockState(self._agent_pos)
        for idx in range(self.num_rocks):
            rock_state.rocks.append(Rock(self._rock_pos[idx]))
        return self._encode_state(rock_state) if should_encode else rock_state

    def _generate_legal(self):
        legal = [Action.EAST.value]  # can always go east
        if self.state.agent_pos.y + 1 < self.grid.y_size:
            legal.append(Action.NORTH.value)

        if self.state.agent_pos.y - 1 >= 0:
            legal.append(Action.SOUTH.value)
        if self.state.agent_pos.x - 1 >= 0:
            legal.append(Action.WEST.value)

        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0:
            legal.append(Action.SAMPLE.value)

        for rock in self.state.rocks:
            assert self.grid[rock.pos] != -1
            if rock.status != 0:
                legal.append(self.grid[rock.pos] + 1 + Action.SAMPLE.value)
        return legal

    def _generate_preferred(self, history):
        if not self._use_heuristic:
            return self._generate_legal()

        actions = []

        # sample rocks with high likelihood of being good
        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0 and history.size:
            total = 0
            # history
            for t in range(history.size):
                if history[t].action == rock + 1 + Action.SAMPLE.value:
                    if history[t].ob == Obs.GOOD.value:
                        total += 1
                    elif history[t].ob == Obs.BAD.value:
                        total -= 1
            if total > 0:
                actions.append(Action.SAMPLE.value)
                return actions

        # process the rocks

        all_bad = True
        direction = {
            "north": False,
            "south": False,
            "west": False,
            "east": False
        }
        for idx in range(self.num_rocks):
            rock = self.state.rocks[idx]
            if rock.status != 0:
                total = 0
                for t in range(history.size):
                    if history[t].action == idx + 1 + Action.SAMPLE.value:
                        if history[t].ob == Obs.GOOD.value:
                            total += 1
                        elif history[t].ob == Obs.BAD.value:
                            total -= 1
                if total >= 0:
                    all_bad = False

                    if rock.pos.y > self.state.agent_pos.y:
                        direction['north'] = True
                    elif rock.pos.y < self.state.agent_pos.y:
                        direction['south'] = True
                    elif rock.pos.x < self.state.agent_pos.x:
                        direction['west'] = True
                    elif rock.pos.x > self.state.agent_pos.x:
                        direction['east'] = True

        if all_bad:
            actions.append(Action.EAST.value)
            return actions

        # generate a random legal move
        # do not measure a collected rock
        # do no measure a rock too often
        # do not measure clearly bad rocks
        # don't move in a direction that puts you closer to bad rocks
        # never sample a rock

        if self.state.agent_pos.y + 1 < self.grid.y_size and direction['north']:
            actions.append(Action.NORTH.value)

        if direction['east']:
            actions.append(Action.EAST.value)

        if self.state.agent_pos.y - 1 >= 0 and direction['south']:
            actions.append(Action.SOUTH.value)

        if self.state.agent_pos.x - 1 >= 0 and direction['west']:
            actions.append(Action.WEST.value)

        for idx, rock in enumerate(self.state.rocks):
            if not rock.status == 0 and rock.measured < 5 and abs(
                    rock.count) < 2 and 0 < rock.prob_valuable < 1:
                actions.append(idx + 1 + Action.SAMPLE.value)

        if len(actions) == 0:
            return self._generate_legal()

        return actions

    def __dict2np__(self, state):
        idx = self.grid.get_index(Coord(*state['agent_pos']))
        rocks = []
        for rock in state['rocks']:
            rocks.append(rock['status'])
        return np.concatenate([[idx], rocks])

    @staticmethod
    def _efficiency(agent_pos, rock_pos, hed=20):
        # TODO check me
        d = Grid.euclidean_distance(agent_pos, rock_pos)
        eff = (1 + pow(2, -d / hed)) * .5
        return eff

    @staticmethod
    def _select_target(rock_state, x_size):
        best_dist = x_size * 2
        best_rock = -1  # Coord(-1, -1)
        for idx, rock in enumerate(rock_state.rocks):
            if rock.status != 0 and rock.count >= 0:
                d = Grid.manhattan_distance(rock_state.agent_pos, rock.pos)
                if d < best_dist:
                    best_dist = d
                    best_rock = idx  # rock.pos
        return best_rock

    @staticmethod
    def _sample_ob(agent_pos, rock, hed=20):
        eff = RockEnv._efficiency(agent_pos, rock.pos, hed=hed)
        if np.random.binomial(1, eff):
            return Obs.GOOD.value if rock.status == 1 else Obs.BAD.value
        else:
            return Obs.BAD.value if rock.status == 1 else Obs.GOOD.value
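A usage sketch for the RockSample environment above (assuming the gym_pomdp project's Action/Obs enums, config, and grid helpers referenced by the class): move actions sit below Action.SAMPLE, Action.SAMPLE samples the rock under the agent, and indices above it check individual rocks with a noisy sensor.

env = RockEnv(board_size=7, num_rocks=8)
ob = env.reset()                                    # Obs.NULL.value
# check rock 0; sensor accuracy decays with distance to the rock
ob, reward, done, info = env.step(Action.SAMPLE.value + 1)
# move east (toward the exit area on the right edge)
ob, reward, done, info = env.step(Action.EAST.value)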
Code example #10
class MarketEnv(Env):
    """
    The class implements an Env class interface from the Gym (OpenAI) package,
    to communicate with real market data.
    """

    reward_range = (-np.inf, np.inf)
    action_space = None
    observation_space = None
    actions = {'hold': 0, 'buy': 1, 'sell': 2}
    metadata = {'render.modes': ['human', 'rgb_array'],
                'video.frames_per_second' : 15}
    viewer = None

    def __init__(self, market: AbstractMarket,
                    use_deposit=False, use_last_action=False,
                    **kwargs):
        """MarketEnv constructor.
        
        Arguments
            market (AbstractMarket): Wrapper for access to the market through API, or other solutions.
            window (int): This is the size of the state of the environment (The number of time intervals).
        """

        self.market = market
        self.use_deposit = use_deposit
        self.use_last_action = use_last_action
        
        self.last_action = dict()

        # TODO: action space for multi-symbol agents
        self.action_space = Discrete(3 * self.market.symbols_count)
        self.observation_space = Space(shape=self.market.shape, dtype=float)  # np.float is a removed alias for float

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.

        Accepts an action and returns a tuple (observation, reward, done, info).

        Arguments
            action (object): an action provided by the agent

        Returns
            observation (object): agent's observation of the current environment
            reward (float) : amount of reward returned after previous action
            done (boolean): whether the episode has ended, in which case further step() calls will return undefined results
            info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning)
        """
        # TODO: multi-symbol action support
        assert self.action_space.contains(action[0]), "%r (%s) invalid"%(action, type(action))

        done = False
        reward = 0.0
        info = dict()

        observation = self.market.observation()
        feedback = []

        # action is the max index from the model output (from three neurons)
        idx = 0
        for symbol in self.market.symbols:
            if action[idx] == self.actions['buy']:
                self.market.buy_order(symbol)
            elif action[idx] == self.actions['hold']:
                pass
            elif action[idx] == self.actions['sell']:
                self.market.sell_order(symbol)

            if self.use_deposit:
                feedback.append(self.market.deposit(symbol))
            if self.use_last_action:
                feedback.append(self.last_action[symbol])
                self.last_action[symbol] = action[idx]

            info[symbol] = {
                'action': action[idx],
                'reward': self.market.profit,
                'deposit': self.market.deposit(symbol)
            }

            reward += self.market.profit
            idx += 1

        if self.use_deposit or self.use_last_action:
            observation.append(np.array(feedback))
        if self.market.done or self.market.balance <= 0:
            done = True

        info['sum_reward'] = reward
        info['balance'] = self.market.balance
        
        return (observation, reward, done, info)

    def reset(self):
        """Resets the state of the environment and returns an initial observation.

        Returns
            observation (object): the initial observation of the space.
        """

        self.market.reset()
        observation = self.market.observation()
        
        feedback = []
        for symbol in self.market.symbols:
            self.last_action[symbol] = 0
            if self.use_deposit:
                feedback.append(self.market.deposit(symbol))
            if self.use_last_action:
                feedback.append(self.last_action[symbol])

        if self.use_deposit or self.use_last_action:
            observation.append(np.array(feedback))

        return observation

    @property
    def feedback_shape(self):
        """"""

        return ((len(self.market.symbols) if self.use_deposit else 0) +
                (len(self.market.symbols) if self.use_last_action else 0))

    def render(self, mode='human', close=False):
        """Renders the environment.
        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.) By convention,
        if mode is:
        - human: render to the current display or terminal and
          return nothing. Usually for human consumption.
        - rgb_array: Return an numpy.ndarray with shape (x, y, 3),
          representing RGB values for an x-by-y pixel image, suitable
          for turning into a video.
        - ansi: Return a string (str) or StringIO.StringIO containing a
          terminal-style text representation. The text can include newlines
          and ANSI escape sequences (e.g. for colors).

        Note
            Make sure that your class's metadata 'render.modes' key includes
              the list of supported modes. It's recommended to call super()
              in implementations to use the functionality of this method.

        Arguments
            mode (str): The mode to render with.
            close (bool): Close all open renderings.
        """

        if mode == 'rgb_array':
            # data = self.market.get_window(len=20)
            return np.array([]) # return RGB frame suitable for video
        elif mode == 'human':
            pass # pop up a window and render
        else:
            super(MarketEnv, self).render(mode=mode) # just raise an exception

    def configure(self, **kwargs):
        """Provides runtime configuration to the environment.
        This configuration should consist of data that tells your
        environment how to run (such as an address of a remote server,
        or path to your ImageNet data). It should not affect the
        semantics of the environment.
        """
        
        if kwargs['market']:
            del self.market
            self.market = kwargs['market']

        self.reset()

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.

        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        if self.viewer: self.viewer.close()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        
        Returns
            list<bigint>: Returns the list of seeds used in this env's random
              number generators. The first value in the list should be the
              "main" seed, or the value which a reproducer should pass to
              'seed'. Often, the main seed equals the provided 'seed', but
              this won't be true if seed=None, for example.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def __del__(self):
        self.close()

    def __str__(self):
        return '<{} instance>'.format(type(self).__name__)
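A heavily hedged usage sketch (it needs a concrete AbstractMarket implementation, named market here only for illustration, exposing a single symbol): the environment expects one sub-action per symbol, indexed in the order of market.symbols.

env = MarketEnv(market, use_deposit=True, use_last_action=True)
obs = env.reset()
# one sub-action per symbol; here the single symbol is bought
obs, reward, done, info = env.step([env.actions['buy']])
print(info['sum_reward'], info['balance'])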
Code example #11
File: version_10.py Project: rayson1223/gym-workflow
class Version10(MontageWfEnv):
    """
        @version 10.0:
            - Design based on pegasus
            - Reward scheme same as v8
            - Action scheme
    """
    def __init__(self):
        # Montage Experiment Variable
        super(Version10, self).__init__()

        self.action_range = 30
        self.cluster_range = 30
        self.action_space = Discrete(self.action_range)
        self.observation_space = Discrete(self.cluster_range)
        self.cluster_size = 1

        # Episode Conf
        # Best exec_time: None or 1, depends on reward version
        self.best_makespan = None
        self.last_makespan = None
        self.last_action = None
        self.reward = 0
        self.total_reward = 0.0

        self.exec_records = {}
        self.all_exec_record = list()
        self.all_overhead_record = list()
        self.all_makespan_record = list()
        self.seed()
        self.reset()

    def step(self, action, training=False):
        assert self.action_space.contains(action)
        action += 1
        done = False
        reward = 0.0

        # Return all the data collected
        makespan = self.run_gen_experiment(action)
        # status, jb, wt, cwt = self.run_experiment(action)

        # Experiment run failed -> High Penalty
        # if not status:
        #     return self._get_obs(), 0, True, {}
        #
        # jb = 0 if jb is None else jb
        # wt = 0 if wt is None else wt
        # cwt = 0 if cwt is None else cwt
        #
        # makespan = jb
        self.all_makespan_record.append(makespan)

        if not training:
            # Setting up best exec
            if self.best_makespan is None:
                self.best_makespan = makespan
            if self.last_makespan is None:
                self.last_makespan = makespan

            # Rewarding / Penalty Judgement
            if makespan < np.percentile(self.all_makespan_record, 20):
                self.best_makespan = makespan
                reward = 10
            # elif makespan < np.percentile(self.all_makespan_record, 40):
            #     reward = 1
            else:
                reward = -5
            self.last_makespan = makespan

            self.total_reward += reward
            self.last_action = action
            if self.total_reward > 50 or self.total_reward < -100:
                done = True

        return self._get_obs(), reward, done, {
            "exec": makespan,
            "overhead": makespan,
            "makespan": makespan,
            "benchmark": np.percentile(self.all_makespan_record, 20)
        }

    def reset(self):
        self.total_reward = 0.0
        # # This part is crucial, as it determine what data going to be stay and what's going to be remove to
        # # maintain the data stability
        # self.all_makespan_record.sort()
        # for x in self.all_makespan_record:
        #     # if wanna get reward while mantain the data, must determine a suitable range of data
        #     # if x < np.percentile(self.all_makespan_record, 20) or x > np.percentile(self.all_makespan_record, 80):
        #     # Unable to obtain proper reward if without remove the extra datas
        #     if x > np.percentile(self.all_makespan_record, 40):
        #         self.all_makespan_record.remove(x)
        # if len(self.all_makespan_record) > 20:
        #     while len(self.all_makespan_record) > 20:
        #         # try to remove the median data
        #         # self.all_makespan_record.remove(self.all_makespan_record[int(len(self.all_makespan_record) / 2) + 1])
        #         # self.all_makespan_record.remove(self.all_makespan_record[np.random.randint(1, len(self.all_makespan_record))])
        #         self.all_makespan_record.remove(self.all_makespan_record[0])
        #         # self.all_makespan_record.remove(self.all_makespan_record[len(self.all_makespan_record)-1])
        return np.random.randint(1,
                                 self.cluster_range + 1)  # , self.clusters_num

    def _get_obs(self):
        return np.random.randint(1, self.cluster_range + 1)
Code example #12
File: battleship.py Project: recurse-id/battleship
class BattleShipEnv(Env):
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self, board_size=(5, 5), ship_sizes=[5, 4, 3, 3, 2]):
        self.grid = BattleGrid(board_size)
        self.action_space = Discrete(self.grid.n_tiles)
        self.observation_space = Discrete(len(Obs))
        self.num_obs = 2
        self._reward_range = self.action_space.n / 4.
        self._discount = 1.
        self.ship_sizes = ship_sizes
        self.total_remaining = sum(self.ship_sizes)

    def seed(self, seed=None):
        np.random.seed(seed)

    def _compute_prob(self, action, next_state, ob):

        action_pos = self.grid.get_coord(action)
        cell = self.grid[action_pos]
        if ob == Obs.NULL.value and cell.visited:
            return 1
        elif ob == Obs.HIT.value and cell.occupied:
            return 1
        else:
            return int(ob == Obs.NULL.value)

    def step(self, action):

        assert self.done is False
        assert self.action_space.contains(action)
        assert self.total_remaining > 0
        self.last_action = action
        self.t += 1
        action_pos = self.grid.get_coord(action)
        # cell = self.grid.get_value(action_pos)
        cell = self.grid[action_pos]
        reward = 0
        if cell.visited:
            reward -= 10
            obs = Obs.NULL.value
        else:
            if cell.occupied:
                reward -= 1
                obs = 1
                self.state.total_remaining -= 1

                for d in range(4, 8):
                    if self.grid[action_pos + Compass.get_coord(d)]:
                        self.grid[action_pos +
                                  Compass.get_coord(d)].diagonal = False
            else:
                reward -= 1
                obs = Obs.NULL.value
            cell.visited = True
        if self.state.total_remaining == 0:
            reward += self.grid.n_tiles
            self.done = True
        self.tot_rw += reward
        return obs, reward, self.done, {"state": self.state}

    def _set_state(self, state):
        self.reset()
        self.state = state

    def close(self):
        return

    def reset(self):
        self.done = False
        self.tot_rw = 0
        self.t = 0
        self.last_action = -1
        self.state = self._get_init_state()
        return Obs.NULL.value

    def render(self, mode='human', close=False):
        if close:
            return
        if mode == 'human':
            if not hasattr(self, "gui"):
                obj_pos = []
                for ship in self.state.ships:
                    pos = ship.pos
                    obj_pos.append(self.grid.get_index(pos))
                    for i in range(ship.length):
                        pos += Compass.get_coord(ship.direction)
                        obj_pos.append(self.grid.get_index(pos))
                self.gui = ShipGui(board_size=self.grid.get_size,
                                   obj_pos=obj_pos)
            if self.t > 0:
                msg = "A: " + str(self.grid.get_coord(
                    self.last_action)) + "T: " + str(self.t) + "Rw :" + str(
                        self.tot_rw)
                self.gui.render(state=self.last_action, msg=msg)

    def _generate_legal(self):
        # assert self.state.total_remaining > 0
        actions = []
        for action in range(self.action_space.n):
            action_pos = self.grid.get_coord(action)
            if not self.grid[action_pos].visited:
                actions.append(action)
        # assert len(actions) > 0
        return actions

    def _get_init_state(self):
        bsstate = ShipState()
        self.grid.build_board()
        for length in self.ship_sizes:
            # num_ships = 1
            # for idx in range(num_ships):
            while True:  # add one ship of each kind
                ship = Ship(coord=self.grid.sample(), length=length)
                if not self.collision(ship, self.grid, bsstate):
                    break
            self.mark_ship(ship, self.grid, bsstate)
            bsstate.ships.append(ship)
        return bsstate

    @staticmethod
    def mark_ship(ship, grid, state):

        pos = ship.pos  # .copy()

        for i in range(ship.length + 1):
            cell = grid[pos]
            assert not cell.occupied
            cell.occupied = True
            if not cell.visited:
                state.total_remaining += 1
            pos += Compass.get_coord(ship.direction)

    @staticmethod
    def collision(ship, grid, state):

        pos = ship.pos  # .copy()
        for i in range(ship.length):
            if not grid.is_inside(pos + Compass.get_coord(ship.direction)):
                return True
            # cell = grid.get_value(pos)
            cell = grid[pos]
            if cell.occupied:
                return True
            for adj in range(8):
                coord = pos + Compass.get_coord(adj)
                if grid.is_inside(coord) and grid[coord].occupied:
                    return True
            pos += Compass.get_coord(ship.direction)
        return False
Code example #13
class GraphSamplingEnv(Env):

    metadata = {
        'render.modes': ('human', ),
    }

    def __init__(self, max_samples=3, render_depth=3, graph_args=None):
        self.graph_args = {**DEFAULT_GRAPH_ARGS, **(graph_args or {})}
        self._generate_new_graph()
        self.sampling_set = set()

        # actions: 0: sample 1: next edge 2: move
        self.action_space = Discrete(3)

        self._current_node = 0
        self._current_edge_idx = 0
        self._clustering_coefficients = None
        self._max_samples = max_samples
        self._render_depth = render_depth
        self._screen = None

        self.reset()

        observation_length = self.num_nodes**2 + 5 * self.num_nodes
        self.observation_space = Box(0, 1, observation_length)

    def _generate_new_graph(self):
        graph = generate_appm.main(self.graph_args)
        self.graph = graph

    def _randomize_position(self):
        self._current_node = random.sample(self.graph.nodes(), 1)[0]
        self._current_edge_idx = np.random.randint(
            len(self.graph.neighbors(self._current_node)))

    def _reset(self):
        self._generate_new_graph()
        self.num_nodes = self.graph.number_of_nodes()
        self.sampling_set = set()
        self._clustering_coefficients = nx.clustering(self.graph)
        self._current_node = np.random.randint(self.graph.number_of_nodes())
        self._current_edge_idx = 0
        self._num_actions = 0
        self.error = 0
        self._adjacency_matrix = np.array(
            nx.adjacency_matrix(self.graph, range(self.num_nodes)).todense())
        self._degree_vector = np.reshape(
            np.array([self.graph.degree(i) for i in range(self.num_nodes)],
                     dtype=np.float32), (-1, 1))
        self._clustering_coefficient_vector = np.reshape(
            np.array([
                self._clustering_coefficients[i] for i in range(self.num_nodes)
            ]), (-1, 1))
        x = [self.graph.node[idx]['value'] for idx in range(self.num_nodes)]
        return self._get_observation()

    def _validate_action(self, action):
        if not self.action_space.contains(action):
            raise ValueError("{} ({}) invalid".format(action, type(action)))

    def _get_next_node(self):
        neighbors = list(self.graph.neighbors(self._current_node))
        return neighbors[self._current_edge_idx]

    def _get_observation(self):
        state_descriptor = np.zeros((self.num_nodes, 3))
        # current node indicator
        state_descriptor[self._current_node, 0] = 1.
        # sampling set indicator
        for node in self.sampling_set:
            state_descriptor[node, 1] = 1.
        # next node indicator
        current_neighbor = self._get_next_node()
        state_descriptor[current_neighbor, 2] = 1.

        observation = np.reshape(
            np.concatenate(
                (self._adjacency_matrix, self._degree_vector,
                 self._clustering_coefficient_vector, state_descriptor), 1),
            (-1))
        return observation

    def _do_action(self, action):
        # actions: { 0: sample, 1: next edge 2: next node }
        if action == 0:
            self.sampling_set.add(self._current_node)
        elif action == 1:
            self._current_edge_idx = ((self._current_edge_idx + 1) %
                                      self.graph.degree(self._current_node))
        elif action == 2:
            self._current_node = self._get_next_node()
            self._current_edge_idx = 0
        self._num_actions += 1

    def _reward(self):
        x_hat = sparse_label_propagation(self.graph, list(self.sampling_set))
        x = [
            self.graph.node[idx]['value']
            for idx in range(self.graph.number_of_nodes())
        ]
        error = nmse(x, x_hat)
        tv = total_variation(self.graph.edges(), x)
        self.error = error

        # this + all nodes in action seems best
        reward = 1.0 - error

        return reward

    def _step(self, action):
        self._validate_action(action)
        self._do_action(action)
        observation = self._get_observation()

        num_samples = len(self.sampling_set)
        reward = 0.0

        done = (num_samples >= self._max_samples
                or num_samples >= self.graph.number_of_nodes())

        if done:
            reward = self._reward()

        return observation, reward, done, {}

    def get_current_nmse(self):
        graph = self.graph
        sampling_set = self.sampling_set

        x = [
            self.graph.node[idx]['value']
            for idx in range(self.graph.number_of_nodes())
        ]
        x_hat = sparse_label_propagation(graph, list(sampling_set))

        return nmse(x, x_hat)

    def _render(self, mode='human', close=False):
        if close:
            if self._screen is not None:
                pygame.display.quit()
                pygame.quit()
            return

        if self._screen is None:
            pygame.init()
            self._screen = pygame.display.set_mode((800, 800))

        subgraph_paths = nx.single_source_shortest_path(
            self.graph, self._current_node, cutoff=self._render_depth)
        subgraph = self.graph.subgraph(subgraph_paths)

        nodelist = [[] for _ in range(self._render_depth + 1)]
        nodelist_1d = []
        node_color = []
        for key, path in subgraph_paths.items():
            level = len(path) - 1
            nodelist[level].append(key)
            nodelist_1d.append(key)
            if key in self.sampling_set:
                node_color.append('r')
            else:
                node_color.append('g')

        # drop empty shells without mutating the list while indexing into it
        nodelist = [level_nodes for level_nodes in nodelist if len(level_nodes) > 0]

        local_labels = {
            key: str(self._clustering_coefficients[key])[:4]
            for key in subgraph.node
        }

        edge_color = []
        edge_list = []
        neighbors = list(self.graph.neighbors(self._current_node))
        edge_to = neighbors[self._current_edge_idx]
        for edge in subgraph.edges():
            edge_list.append(edge)
            if (edge_to in edge and self._current_node in edge):
                edge_color.append('r')
            elif (self._current_node in edge):
                edge_color.append('k')
            else:
                edge_color.append('b')

        fig = plt.figure(figsize=[4, 4], dpi=200)
        layout = nx.shell_layout(subgraph, nodelist)
        nx.draw(subgraph,
                edgelist=edge_list,
                edge_color=edge_color,
                style='dashed',
                nodelist=nodelist_1d,
                node_color=node_color,
                width=0.5,
                with_labels=True,
                center=self._current_node,
                pos=layout,
                labels=local_labels,
                font_size=8)

        canvas = agg.FigureCanvasAgg(fig)
        canvas.draw()
        size = canvas.get_width_height()
        raw_data = fig.canvas.tostring_rgb()
        plt.close(fig)
        surface = pygame.image.fromstring(raw_data, size, "RGB")
        self._screen.blit(surface, (0, 0))
        pygame.display.flip()

        # TODO: the event loop should be continuously pumped somewhere to avoid UI
        # freezing after every call to _render()
        pygame.event.get()
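A minimal random-policy rollout sketch for the graph sampling environment above. The class name, constructor arguments and action space are not shown in this excerpt, so `env` stands for an already-constructed instance, and the old-gym underscore methods are called directly.

# hedged sketch: `env` is an instance of the graph sampling environment above
observation = env._reset()
done = False
while not done:
    action = env.action_space.sample()  # 0: sample, 1: next edge, 2: next node
    observation, reward, done, _ = env._step(action)
print("sampled nodes:", len(env.sampling_set), "NMSE:", env.get_current_nmse())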
コード例 #14
class GazeboPionnerWrapper(gym.Env):
    def __init__(self):
        rospy.init_node("rl_robot_node")
        self.pub, self.reset_robot_state = reset_node_init()
        self.act_pub = rospy.Publisher('/pioneer/cmd_vel', Twist, queue_size=1)
        # self.observation_space = Discrete(100)  # written this way it would mean 100 independent discrete states
        # the two velocities are dropped for now; the data is already discretized (so no normalization needed?), maybe convert further to integers?  # the velocity values look very different from the other state components, so whether to keep them in the state needs more thought
        # self.observation_space = gym.spaces.Box(-10.0, 10.0, (14, ), dtype=np.dtype(np.float32))
        self.observation_space = gym.spaces.Box(-100,
                                                100, (14, ),
                                                dtype=np.dtype(
                                                    np.int32))  # integer version

        # self.observation_space = gym.spaces.Box(-10.0, 10.0, (4, ), dtype=np.dtype(np.float32))
        self.action_space = Discrete(5)  # values 0-4; should combined actions be added?
        self.state = None
        # self.laser_data = [0 for i in range(0, 10)]
        self.num_laser = 10  # do not change this casually; some related code does not use this parameter
        self.laser_data = None
        self.sample_laser_data = [0 for x in range(0, self.num_laser)
                                  ]  # selected laser readings, ordered from right to left
        self.int_sample_laser_data = [0 for x in range(0, self.num_laser)
                                      ]  # selected laser readings, right to left, integer version
        # the following 5 are used to validate the laser data (for the reward function)
        self.last_laser_data = [0 for x in range(0, self.num_laser)]
        # self.num_rew_sample_laser_data = 3
        # self.rew_sample_laser_data = [[] for x in range(0, self.num_rew_sample_laser_data)]
        # self.next_nrsld_index = 0
        # self.check_threshold = 0.3
        # self.bool_check_laser_data = [0 for x in range(0, self.num_laser)]
        self.current_state = None
        # self.speed = 0.0  # after running for a while, the recorded speed/angle drifts noticeably from the actual (gazebo) values
        self.min_speed = 0.0  # -0.75  # linear velocity in m/s; angular velocity in rad/s (about 6 rad is a full turn); reversing would require a rear-facing range sensor
        self.max_speed = 0.75  # the speed cannot go any higher; errors already start to appear
        # self.angle = 0.0
        self.min_angle = -0.5
        self.max_angle = 0.5
        self.cmd_twist = Twist()
        self.done = False
        self.thread1 = LaserNodeThread()
        self.thread1.start()
        # service for reading the model state (position)
        rospy.wait_for_service('gazebo/get_model_state')
        self.get_state_srv = rospy.ServiceProxy('/gazebo/get_model_state',
                                                GetModelState)
        # service for resetting the model state (position)
        rospy.wait_for_service('gazebo/set_model_state')
        self.set_obstacle_srv = rospy.ServiceProxy('/gazebo/set_model_state',
                                                   SetModelState)
        self.obstacle_pose = [[0 for x in range(0, 2)]
                              for x in range(0, 20)]  # [x, y] positions of the 20 obstacles
        # robot data returned by gazebo
        self.gazebo_robot_pose_x = None
        self.gazebo_robot_pose_y = None
        self.gazebo_robot_speed = 0.0
        self.gazebo_robot_angle = 0.0
        # laser_data_node()
        # reward related
        self.arrive_reward = 100.0
        self.collision_reward = -100.0
        self.road_reward_times = 0.1
        self.last_distance_to_goal = 0.0  # distance to the goal at the previous action
        # desired final robot pose
        self.goal_x = 0.0
        self.goal_y = 10.0
        self.desire_final_speed = None
        self.desire_final_angle = None
        self.eps = 0  # counts the current episode
        # self.seed()
        # self.reset()  # could initialize here, but DQN calls reset() first anyway

    # def seed(self, seed=None):
    #     self.np_random, seed = seeding.np_random(seed)
    #     return [seed]

    # using random goals should speed up learning and improve generalization
    def random_goal(self):
        # print('call')
        self.goal_x = (20 * (np.random.ranf() - 0.5))  # x~(-10,10)
        # note this design: whenever goal_x fails a check it is re-sampled and then
        # re-compared against the spawn point and *all* obstacle positions
        i = -1
        while i < 19:
            i += 1
            while abs(self.goal_x) < 1.0 or (
                    abs(self.obstacle_pose[i][0] - self.goal_x) <
                    1.2):  # the 1.2 margin is debatable
                self.goal_x = (20 * (np.random.ranf() - 0.5))  # too close to the start position or an obstacle
                i = -1

        i = -1
        self.goal_y = (20 * (np.random.ranf() - 0.5))  # y~(-10,10)
        while i < 19:
            i += 1
            while abs(self.goal_y) < 1.0 or (
                    abs(self.obstacle_pose[i][1] - self.goal_y) <
                    1.2):  # the 1.2 margin is debatable
                self.goal_y = (20 * (np.random.ranf() - 0.5))  # too close to the start position or an obstacle
                i = -1
        self.goal_x = round(self.goal_x, 1)  # discretize
        self.goal_y = round(self.goal_y, 1)
        # print('finish')

    # def distance_to_obstacle(self, num_x, num_y, count):
    #     return math.sqrt((self.obstacle_pose[count][0] - num_x) ** 2 + (self.obstacle_pose[count][1] - num_y) ** 2)

    def reset(self):
        self.eps += 1
        # print('call_reset')
        self.done = False
        # reset the robot's speed and angle; note the order of these steps
        self.cmd_twist.linear.x = 0.0
        self.cmd_twist.angular.z = 0.0
        # move the robot back to its original position
        start_time = time.time()
        while (time.time() - start_time) < 0.5:  #
            self.act_pub.publish(self.cmd_twist)  # reset the velocity first
            self.pub.publish(self.reset_robot_state)
        # reset the pose/speed/angle in the state
        self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle = \
            self.get_gazebo_state()
        self.gazebo_robot_speed = 0.0
        self.gazebo_robot_angle = 0.0
        self.laser_data = self.thread1.laser_data
        self.sample_fuc()
        self.get_rid_of_bad_data()
        self.int_sample_fuc()
        self.state = np.array(self.int_sample_laser_data[:])
        # self.state = np.array([])  # used when only the goal point forms the state
        self.state = np.append(self.state, [
            int(self.goal_x * 10),
            int(self.goal_y * 10),
            int(self.gazebo_robot_pose_x * 10),
            int(self.gazebo_robot_pose_y * 10)
        ])
        # self.gazebo_robot_speed, self.gazebo_robot_angle])

        # the reset function must return an initial state
        if self.eps % 30 == 0 or self.eps == 1:  # re-randomize every 30 episodes
            # self.reset_obstacle_pose()  # the order of these 3 steps must not be changed
            # self.reset_obstacle_pose_srv()
            self.random_goal()
            print('x: ' + str(self.goal_x) + ' y: ' + str(self.goal_y))

        # print(self.goal_x, self.goal_y)
        self.last_distance_to_goal = self.distance_to_goal()
        self.last_laser_data = self.sample_laser_data[:]
        self.state = self.state.reshape((14, ))
        # self.state = self.state.reshape((4,))
        return self.state

    def step(self, action):
        assert self.action_space.contains(
            action), "%r (%s) invalid" % (action, type(action))
        # the robot's acceleration is limited, so these increments must be chosen carefully; two options:
        # either make the increments very small (undesirable: the robot responds to the environment too slowly),
        # or cap the publish rate below 20 Hz (for the +0.1 case)
        if action == 0:
            self.gazebo_robot_speed += 0.05
        elif action == 1:
            self.gazebo_robot_speed -= 0.05
        elif action == 2:
            self.gazebo_robot_angle += 0.05
        elif action == 3:
            self.gazebo_robot_angle -= 0.05
        elif action == 4:
            pass  # keep the current state
        # elif action == 5:  # test_action ,later remove
        #     pass
        else:
            gym.logger.warn("Unknown situation !")

        self.gazebo_robot_speed = np.clip(self.gazebo_robot_speed,
                                          self.min_speed, self.max_speed)
        self.gazebo_robot_angle = np.clip(self.gazebo_robot_angle,
                                          self.min_angle, self.max_angle)

        # unlike many other setups, once p2os has set cmd_vel the robot keeps moving at that velocity
        # self.cmd_twist.linear.x = 0
        # even with no command at all, gazebo reports tiny nonzero cmd_twist.linear.x / cmd_twist.angular.z values; publishing them back directly only makes the error grow over time
        if abs(self.gazebo_robot_speed) < 0.025:
            self.gazebo_robot_speed = 0.0
        if abs(self.gazebo_robot_angle) < 0.025:
            self.gazebo_robot_angle = 0.0
        self.cmd_twist.linear.x = self.gazebo_robot_speed
        self.cmd_twist.angular.z = self.gazebo_robot_angle
        # print("gazebo: speed" + str(self.gazebo_robot_speed) + ",  angle: " + str(self.gazebo_robot_angle))
        # better: create a dedicated service (or a thread) that receives the latest action and publishes cmd_twist
        time.sleep(0.075)
        self.act_pub.publish(self.cmd_twist)

        # after executing the action, read the state back (possibly not quite right, since this is not strictly one action -> one observation)
        self.laser_data = self.thread1.laser_data
        self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle = \
            self.get_gazebo_state()
        self.sample_fuc()
        self.get_rid_of_bad_data()
        self.int_sample_fuc()
        # print(self.sample_laser_data)
        # print("first")
        # print(self.rew_sample_laser_data)
        self.state = np.array(self.int_sample_laser_data[:])
        # self.state = np.array([])
        self.state = np.append(self.state, [
            int(self.goal_x * 10),
            int(self.goal_y * 10),
            int(self.gazebo_robot_pose_x * 10),
            int(self.gazebo_robot_pose_y * 10)
        ])
        # self.gazebo_robot_speed, self.gazebo_robot_angle])
        # self.state = (self.laser_data[0:10], self.gazebo_robot_pose.x, self.gazebo_robot_pose.y,
        #               self.gazebo_robot_speed, self.gazebo_robot_angle)
        # self.state.append(self.gazebo_robot_pose.x)
        # self.state.append(self.gazebo_robot_pose.y)
        # self.state.append(self.gazebo_robot_speed)
        # self.state.append(self.gazebo_robot_angle)
        # print(self.sample_laser_data)
        # the final pose should also be used as a terminal condition
        if self.distance_to_goal() < 0.5 or min(
                self.sample_laser_data
        ) < 0.5:  # and (self.gazebo_robot_speed < 0.1):
            self.done = True
        self.state = self.state.reshape((14, ))
        # self.state = self.state.reshape((4, ))
        reward_num = self.reward()  # the reward must be computed before last_distance_to_goal is updated
        self.last_distance_to_goal = self.distance_to_goal(
        )  # update the distance for the next reward computation
        self.last_laser_data = self.sample_laser_data[:]  # careful here, a classic pitfall... copy the list!!!

        return self.state, reward_num, self.done, {}

    def distance_to_goal(self):
        return math.sqrt((self.goal_x - self.gazebo_robot_pose_x)**2 +
                         (self.goal_y - self.gazebo_robot_pose_y)**2)

    def reward(self):
        # print(self.sample_laser_data)
        # print(min(self.sample_laser_data) < 0.5)
        # the final pose should also constrain the reward, consistent with done; should the goal-distance
        # and obstacle-distance terms interact?
        if self.distance_to_goal() < 0.5:
            return self.arrive_reward
        elif min(self.sample_laser_data
                 ) < 0.5:  # the raw laser data is noisy (or otherwise unreliable, sometimes reading down to 0.2 ???), so it cannot be used directly for this check
            # print(self.sample_laser_data)
            # print("error")
            return self.collision_reward
        # choose the distance-based reward carefully to avoid reward hacking by the robot (its ratio to arrive_reward, sign, etc.)
        # the goal-distance term should speed up learning but is tricky: once past the goal, moving further away should give a negative reward
        # option 1
        # else:
        #     return self.road_reward_times * (self.last_distance_ro_goal-self.distance_to_goal())
        # option 2
        else:
            if self.last_distance_to_goal - self.distance_to_goal(
            ) > 0:  # getting closer
                return 1.0 * self.road_reward_times
            else:  # with this, standing still also earns a negative reward
                return -1.0 * self.road_reward_times  # a larger penalty for moving away; remove it later?
        # else:
        #     return 0.0

    def render(self, mode='human'):
        pass

    # should this be replaced with the robot's own sensor data? a real robot does not have access to these
    # @staticmethod
    def get_gazebo_state(self):
        # first argument is model_name, second is relative_entity_name
        model_state = self.get_state_srv(
            'robot_description', '')  # note that model_state has type GetModelStateResponse
        robot_pose_x = round(model_state.pose.position.x,
                             1)  # note pose is (x, y, z); discretized
        robot_pose_y = round(model_state.pose.position.y, 1)
        robot_twist_linear_x = round(model_state.twist.linear.x, 4)  # discretize
        robot_twist_angular_z = round(model_state.twist.angular.z,
                                      4)  # angular.z is the planar robot's angular velocity, since z is the rotation axis
        return robot_pose_x, robot_pose_y, robot_twist_linear_x, robot_twist_angular_z

    def get_obstacle_pose(self):
        pass

    # randomly reset the obstacle position coordinates
    def reset_obstacle_pose(self):
        for i in range(0, 20):
            self.obstacle_pose[i][0] = (20 * (np.random.ranf() - 0.5)
                                        )  # x~(-10,10)
            while abs(self.obstacle_pose[i][0]) < 1.0:
                self.obstacle_pose[i][0] = (20 * (np.random.ranf() - 0.5)
                                            )  # too close to the start position
            self.obstacle_pose[i][0] = round(self.obstacle_pose[i][0],
                                             1)  # discretize
            self.obstacle_pose[i][1] = (20 * (np.random.ranf() - 0.5)
                                        )  # y~(-10,10)
            while abs(self.obstacle_pose[i][1]) < 1.0:
                self.obstacle_pose[i][1] = (20 * (np.random.ranf() - 0.5)
                                            )  # too close to the start position
            self.obstacle_pose[i][1] = round(self.obstacle_pose[i][1], 1)

    # reset the obstacles via the gazebo service
    def reset_obstacle_pose_srv(self):
        new_obstacle_state = ModelState()

        new_obstacle_state.pose.orientation.x = 0
        new_obstacle_state.pose.orientation.y = 0
        new_obstacle_state.pose.orientation.z = 0
        new_obstacle_state.pose.orientation.w = 0

        new_obstacle_state.twist.linear.x = 0
        new_obstacle_state.twist.linear.y = 0
        new_obstacle_state.twist.linear.z = 0

        new_obstacle_state.twist.angular.x = 0
        new_obstacle_state.twist.angular.y = 0
        new_obstacle_state.twist.angular.z = 0

        new_obstacle_state.reference_frame = "world"
        for i in range(0, 15):  # 15 unit_box obstacles
            new_obstacle_state.model_name = "unit_box_" + str(i)
            new_obstacle_state.pose.position.x = self.obstacle_pose[i][0]
            new_obstacle_state.pose.position.y = self.obstacle_pose[i][1]
            new_obstacle_state.pose.position.z = 0

            self.set_obstacle_srv(new_obstacle_state)

        for i in range(0, 5):  # 5 cylinders; spheres are not used because they roll around
            new_obstacle_state.model_name = "unit_cylinder_" + str(i)
            new_obstacle_state.pose.position.x = self.obstacle_pose[i + 15][0]
            new_obstacle_state.pose.position.y = self.obstacle_pose[i + 15][1]
            new_obstacle_state.pose.position.z = 0

            self.set_obstacle_srv(new_obstacle_state)

    # extract 10 laser readings (one-decimal form) from the raw laser data
    def sample_fuc(self):
        self.sample_laser_data[0] = round(self.laser_data[0], 1)  # maybe 2 decimals would be better?
        self.sample_laser_data[-1] = round(self.laser_data[-1], 1)
        for i in range(1, 9):
            self.sample_laser_data[i] = round(
                self.laser_data[i * 72], 1)  # discretize to 1 decimal; note round() is neither truncation nor plain round-half-up
        for j in range(0, 10):
            # if self.sample_laser_data[i] == float('inf'):
            if self.sample_laser_data[j] > 10.0:
                self.sample_laser_data[j] = 10.0

    # # this should be called after sample_fuc(); it records several laser scans in one array; self.rew_sample_laser_data is 3x10 (for now)
    # def add_check_laser_data(self):
    #     data = self.sample_laser_data
    #     print(data)
    #     if self.next_nrsld_index >= len(self.rew_sample_laser_data):
    #         self.rew_sample_laser_data.append(data)
    #     else:
    #         self.rew_sample_laser_data[self.next_nrsld_index] = data
    #     self.next_nrsld_index = (self.next_nrsld_index + 1) % self.num_rew_sample_laser_data
    #     print(self.next_nrsld_index)
    # def add_check_laser_data(self):
    #     data = self.sample_laser_data
    #     self.rew_sample_laser_data[self.next_nrsld_index] = data
    #     self.next_nrsld_index = (self.next_nrsld_index + 1) % self.num_rew_sample_laser_data
    #     print("what the hell?")
    #     print(self.rew_sample_laser_data)
    #     print(self.next_nrsld_index)
    #
    # # used to restore laser readings below 0.3
    # def get_rid_of_bad_data(self):
    #     for i in range(self.num_laser):
    #         if self.sample_laser_data[i] < 0.3:
    #             for j in range(self.num_rew_sample_laser_data):
    #                 if self.rew_sample_laser_data[j][i] > 0.3:
    #                     self.sample_laser_data[i] = self.rew_sample_laser_data[j][i]
    #                 else:
    #                     pass
    #         else:
    #             pass

    # used to restore laser readings below 0.3
    def get_rid_of_bad_data(self):
        for i in range(self.num_laser):
            if self.sample_laser_data[i] < 0.3:
                # print("check")
                # print(self.sample_laser_data)
                # print(self.last_laser_data)
                self.sample_laser_data[i] = self.last_laser_data[i]
            else:
                pass

    def int_sample_fuc(self):
        for j in range(0, 10):
            self.int_sample_laser_data[j] = int(self.sample_laser_data[j] *
                                                10)  # convert to an integer

    # found another cause of the problem, so this block of code was abandoned
    # # produce a minimum value that is robust to noise; also needs extra conditions (the inf case), e.g. a sudden drop (noise) is marked False
    # def check_min_laser_data(self):
    #     mean = 0.0
    #     sigle_bool_check_data = [0 for x in range(0, self.num_rew_sample_laser_data)]
    #     sigle_bool_bool = [0 for x in range(0, self.num_rew_sample_laser_data)]
    #     for i in range(self.num_laser):
    #         for j in range(self.num_rew_sample_laser_data):
    #             sigle_bool_check_data[j] = self.rew_sample_laser_data[j][i]
    #             mean = sum(sigle_bool_check_data)/len(sigle_bool_check_data)
    #         for k in range(self.num_rew_sample_laser_data):
    #             if (sigle_bool_check_data[k] - mean) > 0.1:   # (in the current setup a single-step change above 0.1 is unlikely)
    #                 sigle_bool_bool[k] = False  # classified as noise
    #             else:
    #                 sigle_bool_bool[k] = True
    #         if False in sigle_bool_bool:
    #             self.bool_check_laser_data[i] = False  # mark this position as noisy
    #     # iteratively search for the minimum value, unfinished

    # def __del__(self):
    #     # wait until the thread terminates: this blocks the calling thread until the thread's join() returns
    #     # (normal exit, an unhandled exception, or the optional timeout)
    #     self.thread1.join()  # still not right
    #
    def close(self):
        self.thread1.join()  # still not right
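A hedged driver sketch for GazeboPionnerWrapper: it assumes a ROS master and a Gazebo world exposing the topics and services used above (cmd_vel, get/set_model_state, the laser thread); without them the constructor will block or fail.

# hedged sketch: requires a running roscore + Gazebo world matching the topics above
env = GazeboPionnerWrapper()
state = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # random speed/heading adjustment
    state, reward, done, info = env.step(action)
env.close()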
コード例 #15
class Version5(MontageWfEnv):
    """
        @version 5.0:
            - Run with real env
            - No Sampling
    """
    def __init__(self):
        # Montage Experiment Variable
        super(Version5, self).__init__()
        self.action_range = 20
        self.cluster_range = 20
        self.action_space = Discrete(self.action_range)
        self.observation_space = Discrete(self.cluster_range)

        # Episode Conf
        # Best exec_time: None or 1, depends on reward version
        self.best_exec_time = None
        self.last_exec_time = None
        self.last_action = None
        self.last_reward = None
        self.total_reward = 0.0
        self.all_makespan_record = list()
        self.all_benchmark_record = list()
        # 0: Ntg, 1: improve, 2: degrade
        self.is_improve = 0
        self.seed()
        self.reset()

    def step(self, action, training=False):
        assert self.action_space.contains(action)

        action += 1
        done = False
        reward = 0.0

        # res = self.run_static_experiment(self.clusters_size, self.clusters_num)
        res = self.run_demo_cn_gen_experiment(action)

        self.all_makespan_record.append(res)
        self.exec_time = res

        # Rewarding / Penalty Judgement: keep a benchmark record and refresh it
        # whenever the 10th-percentile makespan drifts by more than 10%
        percentile = np.percentile(self.all_makespan_record, 10)
        if len(self.all_benchmark_record) == 0:
            self.all_benchmark_record.append(percentile)
        else:
            benchmark = np.mean(self.all_benchmark_record)
            if abs(percentile - benchmark) / benchmark * 100 > 10:
                self.all_benchmark_record.append(percentile)
        benchmark = np.mean(self.all_benchmark_record)
        # print(benchmark)
        # if self.exec_time < np.percentile(self.all_makespan_record, 10):
        if self.exec_time < benchmark:
            reward = 10
        else:
            reward = -1
        self.total_reward += reward
        if self.total_reward > 200:
            done = True
        return self._get_obs(), reward, done, {
            "exec": self.exec_time,
            "overhead": self.exec_time,
            "makespan": self.exec_time,
            "benchmark": benchmark
        }

    def render(self, mode='human'):
        outfile = StringIO() if mode == 'ansi' else sys.stdout
        init_msg = "Current Experiment Parameters: degree: %d\t cluster.size: %d\t cluster.num: %d\t\n" % (
            self.degree, self.clusters_size, self.clusters_num)
        outfile.write(init_msg)
        # if self.last_action is not None:
        # 	cs, cn = action
        result_str = "Current Execution Time: \t"
        expect_str = "Best Execution Time: \t"
        action_str = "Current Action: \t"
        # Process Outputs
        outfile.write(result_str + (" %s " % self.exec_time) + "\n")
        outfile.write(expect_str + (" %s " % self.best_exec_time) + "\n")
        outfile.write(action_str + (" %s " % self.last_action) + "\n")

        return outfile

    def reset(self):
        # Reset method should always return a new sets of episode settings
        # self.degree = 0.1
        # self.clusters_size = 1
        # self.clusters_num = 1
        # self.band_num = 1
        # self.best_exec_time = None
        if self.exec_time is not None:
            self.last_exec_time = self.exec_time
        self.wall_time = None
        self.cum_wall_time = None
        self.total_reward = 0
        self.clusters_size = np.random.randint(1, 30)
        self.clusters_num = np.random.randint(1, 30)

        # print("Environment had been reset!")
        return np.random.randint(1,
                                 self.cluster_range + 1)  # , self.clusters_num

    def _get_obs(self):
        return np.random.randint(1, self.cluster_range + 1)
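Version5 (and Version11 below) judge each new makespan against a rolling benchmark: the mean of the recorded 10th-percentile makespans, refreshed whenever the current percentile drifts from that mean by more than 10%. A self-contained numpy sketch of this rule, using made-up makespan values:

import numpy as np

makespan_record, benchmark_record = [], []
for makespan in [120.0, 95.0, 110.0, 70.0, 72.0, 68.0]:  # made-up values
    makespan_record.append(makespan)
    percentile = np.percentile(makespan_record, 10)
    if not benchmark_record:
        benchmark_record.append(percentile)
    elif abs(percentile - np.mean(benchmark_record)) / np.mean(benchmark_record) * 100 > 10:
        benchmark_record.append(percentile)
    benchmark = np.mean(benchmark_record)
    reward = 10 if makespan < benchmark else -1
    print(round(float(benchmark), 1), reward)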
コード例 #16
class Version11(MontageWfEnv):
    """
        @version 11.0:

    """
    def __init__(self):
        # Montage Experiment Variable
        super(Version11, self).__init__()

        self.action_range = 10
        self.cluster_range = 10
        self.action_space = Discrete(self.action_range)
        self.observation_space = Discrete(self.cluster_range)
        self.cluster_size = 1

        # Episode Conf
        # Best exec_time: None or 1, depends on reward version
        self.best_makespan = None
        self.last_makespan = None
        self.last_action = None
        self.reward = 0
        self.total_reward = 0.0

        self.exec_records = {}
        self.all_exec_record = list()
        self.all_overhead_record = list()
        self.all_makespan_record = list()
        self.all_benchmark_record = list()
        self.seed()
        self.reset()

    def step(self, action, training=False):
        assert self.action_space.contains(action)
        action += 1
        done = False
        reward = 0.0
        # benchmark must exist for the info dict below even when training=True
        benchmark = None

        # Return all the data collected
        # makespan = self.run_gen_experiment(action)
        makespan = self.run_cn_gen_experiment(action)
        # status, jb, wt, cwt = self.run_experiment(action)

        # Experiment run failed -> High Penalty
        # if not status:
        #     return self._get_obs(), 0, True, {}
        #
        # jb = 0 if jb is None else jb
        # wt = 0 if wt is None else wt
        # cwt = 0 if cwt is None else cwt
        #
        # makespan = jb
        self.all_makespan_record.append(makespan)
        if not training:
            # Setting up best exec
            # if self.best_makespan is None:
            #     self.best_makespan = makespan
            # if self.last_makespan is None:
            #     self.last_makespan = makespan

            # Rewarding / Penalty Judgement: keep a benchmark record and refresh it
            # whenever the 10th-percentile makespan drifts by more than 10%
            percentile = np.percentile(self.all_makespan_record, 10)
            if len(self.all_benchmark_record) == 0:
                self.all_benchmark_record.append(percentile)
            else:
                benchmark = np.mean(self.all_benchmark_record)
                if abs(percentile - benchmark) / benchmark * 100 > 10:
                    self.all_benchmark_record.append(percentile)
            benchmark = np.mean(self.all_benchmark_record)
            # else:
            # print(abs(percentile - benchmark) / benchmark * 100)
            # self.all_benchmark_record.append(percentile)
            # benchmark = percentile

            if makespan < benchmark:
                reward = 1
            else:
                reward = -1
            self.last_makespan = makespan

            self.total_reward += reward
            self.last_action = action
            if self.total_reward > 20 or self.total_reward < -20:
                done = True

        return self._get_obs(), reward, done, {
            "exec": makespan,
            "overhead": makespan,
            "makespan": makespan,
            "benchmark": benchmark
        }

    def reset(self):
        self.total_reward = 0.0
        return np.random.randint(1,
                                 self.cluster_range + 1)  # , self.clusters_num

    def _get_obs(self):
        return np.random.randint(1, self.cluster_range + 1)
コード例 #17
class AdaptiveLearningEnv(gym.Env):

    metadata = { 'render.modes': ['human', 'rgb_array'] }

    def __init__(self, filename='activities.pkl'):
        self.filename = filename
        self.assets_dir = os.path.dirname(os.path.abspath(__file__))
        self.reward_range = (0, 1)
        self.viewer = None
        self.circle_indexs = []
        self.ob = None
        self._configure()
        self._seed()
        self._reset()

    def _configure(self):
        self._load_activities()
        self.action_space = Discrete(len(self.activities))
        self.observation_space = Box(0, 1, len(self.knowledges)) #
        self.simulator = StudentSimulator()

    def _load_activities(self):
        data_file = os.path.join(self.assets_dir, 'assets/%s' % self.filename)
        pkl_file = open(data_file, 'rb')
        self.knowledges = pickle.load(pkl_file)
        self.activities = pickle.load(pkl_file)
        pkl_file.close()

    def _step(self, action):
        assert self.action_space.contains(action)
        a = Activity(self.activities[action], self.knowledges)
        ob, reward, done = self.simulator.progress(self.ob, a)
        self.ob = ob
        return ob, reward, done, {}

    def _reset(self):
        self.ob = Box(0.1, 0.1, len(self.knowledges)).sample()
        return self.ob

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        screen_width = 600
        screen_height = 400
        radius = 10
        init_alpha = 0.1
        margin = radius * 2.5
        max_per_line = (screen_width - margin * 2) / margin
        colors = np.array([[78,191,126], [254,178,45], [175,101,194]])/255.

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)

            for (i, x) in enumerate(sorted(self.knowledges, key=lambda tup:(tup.level(), tup.group), reverse=True)):
                h, w = divmod(i, max_per_line)
                self.circle_indexs.append(x._id)
                w = screen_width - 20  - w * (margin + 10)
                h = screen_height - 20 - h * (margin + 10)
                t = self.viewer.draw_circle(radius)
                t.add_attr(rendering.Transform((w, h)))
                r, g, b = colors[x.level() - 1]
                t.set_color(r, g, b, init_alpha)
                self.viewer.add_geom(t)

        for i, x in enumerate(self.ob):
            if len(self.circle_indexs) != 0:
                t = self.viewer.geoms[self.circle_indexs.index(i)]
                k = self.knowledges[i]
                r, g, b = colors[k.level() - 1]
                t.set_color(r, g, b, x)
        return self.viewer.render(return_rgb_array = mode=='rgb_array')
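A hedged usage sketch for AdaptiveLearningEnv. It assumes assets/activities.pkl and the StudentSimulator/Activity helpers imported by this module are available; the underscore methods follow the older gym API, so they are called directly here.

# hedged sketch: requires assets/activities.pkl plus the simulator helpers
env = AdaptiveLearningEnv(filename='activities.pkl')
ob = env._reset()
for _ in range(5):
    action = env.action_space.sample()      # pick a random activity
    ob, reward, done, info = env._step(action)
    if done:
        break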
コード例 #18
class Simple_Copy_Repeat_ENV(Env):

    ALPHABET = list(string.ascii_uppercase[:26])

    def __init__(self, n_char=5, size=6, repeat=3):
        """
        :param n_char: number of different chars in inputs, e.g. 3 => {A,B,C}
        :param size: the length of input sequence
        :param repeat: the expected repeat times of the target output
        """
        self.n_char = n_char
        self.size = size
        self.repeat = repeat

        # observation (characters)
        self.observation_space = Discrete(
            n_char + 1)  # +1: empty symbol, whose index is n_char
        # action
        self.action_space = Discrete(n_char)

        # states of an episode
        self.position = None
        self.last_action = None
        self.last_reward = None
        self.episode_total_reward = None
        self.input_str = None
        self.target_str = None
        self.output_str = None

        self.np_random = None
        self.seed()
        self.reset()

    @property
    def input_length(self):
        return len(self.input_str)

    @property
    def target_length(self):
        return self.input_length * self.repeat

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.position = 0
        self.last_action = None
        self.last_reward = None
        self.episode_total_reward = 0.0
        self.input_str, self.target_str = self._generate_input_target()
        self.output_str = ''
        obs_char, obs_idx = self._get_observation()
        return obs_idx

    def step(self, action):
        assert self.action_space.contains(action)
        assert 0 <= self.position < self.target_length
        target_act = self.ALPHABET.index(self.target_str[self.position])
        reward = 1.0 if action == target_act else -1.0
        self.last_action = action
        self.last_reward = reward
        self.episode_total_reward += reward
        self.output_str += self.ALPHABET[action]
        self.position += 1
        if self.position < self.target_length:
            done = False
            _, obs = self._get_observation()
        else:
            done = True
            obs = None
        info = {"target_act": target_act}
        return obs, reward, done, info

    def render(self, mode='human'):
        outfile = sys.stdout  # TODO: other mode
        pos = self.position - 1
        o_str = ""
        if pos > -1:
            for i, c in enumerate(self.output_str):
                color = 'green' if self.target_str[i] == c else 'red'
                o_str += colorize(c, color, highlight=True)
        outfile.write("=" * 20 + "\n")
        outfile.write("Length   : " + str(self.input_length) + "\n")
        outfile.write("T-Length : " + str(len(self.target_str)) + "\n")
        outfile.write("Input    : " + self.input_str + "\n")
        outfile.write("Target   : " + self.target_str + "\n")
        outfile.write("Output   : " + o_str + "\n")
        if self.position > 0:
            outfile.write("-" * 20 + "\n")
            outfile.write("Current reward:   %.2f\n" % self.last_reward)
            outfile.write("Cumulative reward:   %.2f\n" %
                          self.episode_total_reward)
        outfile.write("\n")
        return

    def _generate_input_target(self):
        input_str = ""
        for i in range(self.size):
            c = self.np_random.choice(self.ALPHABET[:self.n_char])
            input_str += c
        target_str = ""
        for i in range(self.repeat):
            if i % 2 == 1:
                target_str += input_str[::-1]
            else:
                target_str += input_str
        return input_str, target_str

    def _get_observation(self, pos=None):
        if pos is None:
            pos = self.position
        if pos >= self.input_length:
            obs_char = ''
            obs_idx = self.n_char
        else:
            obs_char = self.input_str[pos]
            obs_idx = self.ALPHABET.index(obs_char)
        return obs_char, obs_idx
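A small sanity-check sketch for Simple_Copy_Repeat_ENV, assuming the class above and its imports are available: an oracle agent reads env.target_str directly, so every step earns +1 and the episode reward equals size * repeat.

env = Simple_Copy_Repeat_ENV(n_char=4, size=5, repeat=2)
obs = env.reset()
done = False
while not done:
    # act with the character the target expects at the current position
    action = Simple_Copy_Repeat_ENV.ALPHABET.index(env.target_str[env.position])
    obs, reward, done, info = env.step(action)
print(env.episode_total_reward)  # expected: 10.0 (= size * repeat)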
コード例 #19
ファイル: tiger.py プロジェクト: solohan22/deep-q-learning
class TigerEnv(gym.Env):
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self, seed=0, correct_prob=.85):
        self.correct_prob = correct_prob
        self.action_space = Discrete(len(Action))
        self.state_space = Discrete(len(State))
        self.observation_space = Discrete(len(Obs))
        self._discount = .95
        self._reward_range = (-float(100), float(10))
        self._query = 0
        self.seed(seed)

    def reset(self):
        self.done = False
        self.t = 0
        self._query = 0
        self.state = self.state_space.sample()
        self.last_action = Action.LISTEN.value
        return Obs.NULL.value

    def seed(self, seed=1234):
        np.random.seed(seed)
        return [seed]

    def step(self, action):

        assert self.action_space.contains(action)
        assert self.done is False
        self.t += 1
        self._query += 1
        self.last_action = action

        rw = TigerEnv._compute_rw(self.state, action)
        if TigerEnv._is_terminal(self.state, action):
            self.done = True
            return self.state, rw, self.done, {'state': self.state}

        self._sample_state(action)
        ob = TigerEnv._sample_ob(action, self.state)
        self.done = False
        return ob, rw, self.done, {"state": self.state}

    def render(self, mode='human', close=False):
        if close:
            return
        if mode == "human":
            if not hasattr(self, "gui"):
                self.gui = TigerGui()
            msg = "A: " + action_to_str(
                self.last_action) + " S: " + state_to_str(self.state)
            self.gui.render(state=(self.last_action, self.state), msg=msg)
        elif mode == "ansi":
            print("Current step: {}, tiger is in state: {}, action took: {}".
                  format(self.t, self.state, self.last_action[0]))
        else:
            raise NotImplementedError()

    def close(self):
        self.render(close=True)

    def _set_state(self, state):
        self.state = state
        self.done = False

    def _generate_legal(self):
        return list(range(self.action_space.n))

    def _generate_preferred(self, history):
        return self._generate_legal()

    def _sample_state(self, action):
        if action == Action.RIGHT.value or action == Action.LEFT.value:
            self.state = self.state_space.sample()

    def _get_init_state(self):
        # fix initial belief to be exact
        return self.state_space.sample()

    @staticmethod
    def _compute_prob(action, next_state, ob, correct_prob=.85):
        p_ob = 0.0
        if action == Action.LISTEN.value and ob != Obs.NULL.value:
            if (next_state == State.LEFT.value and ob == Obs.LEFT.value) or (
                    next_state == State.RIGHT.value and ob == Obs.RIGHT.value):
                p_ob = correct_prob
            else:
                p_ob = 1 - correct_prob
        elif action != Action.LISTEN.value and ob == Obs.NULL.value:
            p_ob = 1.

        assert p_ob >= 0.0 and p_ob <= 1.0
        return p_ob

    @staticmethod
    def _sample_ob(action, next_state, correct_prob=.85):
        ob = Obs.NULL.value
        p = np.random.uniform()
        if action == Action.LISTEN.value:
            if next_state == State.LEFT.value:
                ob = Obs.RIGHT.value if p > correct_prob else Obs.LEFT.value
            else:
                ob = Obs.LEFT.value if p > correct_prob else Obs.RIGHT.value
        return ob

    @staticmethod
    def _local_move(state, last_action, last_ob):
        raise NotImplementedError()

    @staticmethod
    def _is_terminal(state, action):
        is_terminal = False
        if action != Action.LISTEN.value:
            is_terminal = (
                (action == Action.LEFT.value and state == State.LEFT.value) or
                (action == Action.RIGHT.value and state == State.RIGHT.value))
        return is_terminal

    @staticmethod
    def _compute_rw(state, action):
        if action == Action.LISTEN.value:
            reward = -1
        elif not TigerEnv._is_terminal(state, action):
            reward = 10
        else:
            reward = -100
        return reward
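A hedged heuristic sketch for TigerEnv: listen a few times, then open the door on the side heard less often. The Action and Obs enums are referenced by the class but not shown in this excerpt, so their members are an assumption here.

env = TigerEnv(seed=0)
ob = env.reset()
counts = {Obs.LEFT.value: 0, Obs.RIGHT.value: 0}
for _ in range(3):
    ob, reward, done, info = env.step(Action.LISTEN.value)
    counts[ob] += 1
# open the door opposite to the majority observation
open_action = Action.RIGHT.value if counts[Obs.LEFT.value] > counts[Obs.RIGHT.value] else Action.LEFT.value
ob, reward, done, info = env.step(open_action)
print(reward, done)  # +10 if the guess avoided the tiger, -100 (and done) otherwise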
コード例 #20
class TagEnv(Env):
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self,
                 num_opponents=1,
                 move_prob=.8,
                 obs_cells=29,
                 board_size=(10, 5)):
        self.num_opponents = num_opponents
        self.move_prob = move_prob
        self._reward_range = 10 * self.num_opponents
        self._discount = .95
        self.action_space = Discrete(len(Action))
        self.grid = TagGrid(board_size, obs_cells=obs_cells)
        self.observation_space = Discrete(self.grid.n_tiles + 1)
        self.time = 0

        self.done = False
        self.time = 0
        self.last_action = 4
        self.state = None

    def reset(self):
        self.done = False
        self.time = 0
        self.last_action = 4
        self.state = self._get_init_state(False)
        # get agent position
        return self._sample_ob(state=self.state, action=0)

    def seed(self, seed=None):
        np.random.seed(seed)
        return [seed]

    def step(self, action: int):  # state
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg
        assert self.done is False

        reward = 0.
        self.time += 1
        self.last_action = action
        assert self.grid.is_inside(self.state.agent_pos)
        assert self.grid.is_inside(self.state.opponent_pos[0])

        if action == 4:
            tagged = False
            for opp, opp_pos in enumerate(self.state.opponent_pos):
                # check if x==x_agent and y==y_agent
                if opp_pos == self.state.agent_pos:
                    reward = 10.
                    tagged = True
                    # self.state.opponent_pos[opp] = Coord(4, 4)
                    self.state.num_opp -= 1
                    # self.state.opponent_pos.pop(opp)
                elif self.grid.is_inside(self.state.opponent_pos[opp]) and\
                        self.state.num_opp > 0:
                    self.move_opponent(opp)
            if not tagged:
                reward = -10.

        else:
            reward = -1.
            next_pos = self.state.agent_pos + Moves.get_coord(action)
            if self.grid.is_inside(next_pos):
                self.state.agent_pos = next_pos

        ob = self._sample_ob(self.state, action)
        assert ob < self.grid.n_tiles + 1
        # p_ob = self._compute_prob(action, self.state, ob)
        self.done = self.state.num_opp == 0
        # self._encode_state(self.state)}
        return ob, reward, self.done, {"state": self.state}

    def render(self, mode="human", close=False):
        # return
        if close:
            return
        # if mode == "human":
        #     agent_pos = self.grid.get_index(self.state.agent_pos)
        #     opponent_pos = [self.grid.get_index(opp) for
        #                     opp in self.state.opponent_pos]
        #     if self.gui is None:
        #         self.gui = TagGui(board_size=self.grid.get_size,
        #                           start_pos=agent_pos, obj_pos=opponent_pos)
        #     msg = "S: " + str(self.state) + " T: " + str(self.time) +\
        #           " A: " + action_to_str(
        #         self.last_action)
        #     self.gui.render(state=(self.state.agent_pos,
        #                            self.state.opponent_pos), msg=msg)

    def _encode_state(self, state):
        s = np.zeros(self.num_opponents + 1, dtype=np.int32) - 1
        s[0] = self.grid.get_index(state.agent_pos)
        for idx, opp in zip(range(1, len(s)), state.opponent_pos):
            opp_idx = self.grid.get_index(opp)
            s[idx] = opp_idx

        return s

    def _decode_state(self, state):
        agent_idx = state[0]
        tag_state = TagState(self.grid.get_tag_coord(agent_idx))
        for opp_idx in state[1:]:
            if opp_idx > -1:
                tag_state.num_opp += 1
            opp_pos = self.grid.get_tag_coord(opp_idx)
            tag_state.opponent_pos.append(opp_pos)

        return tag_state

    def _get_init_state(self, should_encode=False):

        agent_pos = self.grid.sample()
        assert self.grid.is_inside(agent_pos)
        tag_state = TagState(agent_pos)
        for opp in range(self.num_opponents):
            opp_pos = self.grid.sample()
            assert self.grid.is_inside(opp_pos)
            tag_state.opponent_pos.append(opp_pos)

        tag_state.num_opp = self.num_opponents
        assert len(tag_state.opponent_pos) > 0

        if not should_encode:
            return tag_state
        else:
            return self._encode_state(tag_state)

    def _set_state(self, state):
        # self.reset()
        self.done = False
        # self.state = self._decode_state(state)
        self.state = state

    def move_opponent(self, opp):
        opp_pos = self.state.opponent_pos[opp]
        actions = self._admissable_actions(self.state.agent_pos, opp_pos)
        if np.random.binomial(1, self.move_prob):
            move = np.random.choice(actions).value
            if self.grid.is_inside(opp_pos + move):
                self.state.opponent_pos[opp] += move

    def _compute_prob(self, next_state, ob):
        # next_state = self._decode_state(next_state)

        p_ob = int(ob == self.grid.get_index(next_state.agent_pos))
        if ob == self.grid.n_tiles:
            for opp_pos in next_state.opponent_pos:
                if opp_pos == next_state.agent_pos:
                    return 1.
        return p_ob

    def _sample_ob(self, state, action):
        ob = self.grid.get_index(state.agent_pos)  # agent index
        # ob = self.grid.board[self.state.agent_pos]
        if action < Action.TAG.value:
            for opp_pos in state.opponent_pos:
                if opp_pos == state.agent_pos:
                    ob = self.grid.n_tiles  # 29 observation
        return ob

    def generate_legal(self):
        return list(range(self.action_space.n))

    def _generate_preferred(self, history):
        actions = []
        if history.size == 0:
            return self.generate_legal()
        if history[-1].ob == self.grid.n_tiles and\
                self.grid.is_corner(self.state.agent_pos):

            actions.append(Action.TAG.value)
            return actions
        for d in range(4):
            if history[-1].action != self.grid.opposite(d) and\
                    self.grid.is_inside(
                    self.state.agent_pos + Moves.get_coord(d)):

                actions.append(d)
        assert len(actions) > 0
        return actions

    # def _local_move(self, state, last_action, last_ob):
    #     if len(state.opponent_pos) > 0:
    #         opp = np.random.randint(len(state.opponent_pos))
    #     else:
    #         return False
    #
    #     if state.opponent_pos[opp] == Coord(-1, -1):
    #         return False
    #     state.opponent_pos[opp] = self.grid.sample()
    #     if last_ob != self.grid.get_index(state.agent_pos):
    #         state.agent_pos = self.grid.get_tag_coord(last_ob)
    #
    #     ob = self._sample_ob(state, last_action)
    #     return ob == last_ob

    @staticmethod
    def _admissable_actions(agent_pos, opp_pos):
        actions = []
        if opp_pos.x >= agent_pos.x:
            actions.append(Moves.EAST)
        if opp_pos.y >= agent_pos.y:
            actions.append(Moves.NORTH)
        if opp_pos.x <= agent_pos.x:
            actions.append(Moves.WEST)
        if opp_pos.y <= agent_pos.y:
            actions.append(Moves.SOUTH)
        if opp_pos.x == agent_pos.x and opp_pos.y > agent_pos.y:
            actions.append(Moves.NORTH)
        if opp_pos.y == agent_pos.y and opp_pos.x > agent_pos.x:
            actions.append(Moves.EAST)
        if opp_pos.x == agent_pos.x and opp_pos.y < agent_pos.y:
            actions.append(Moves.SOUTH)
        if opp_pos.y == agent_pos.y and opp_pos.x < agent_pos.x:
            actions.append(Moves.WEST)
        assert len(actions) > 0
        return actions
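A hedged round-trip sketch for TagEnv's state encoding: _encode_state flattens a TagState into grid indices and _decode_state rebuilds it. TagGrid, TagState, Action and Moves are assumed to be importable from the same module as the class.

env = TagEnv(num_opponents=2)
state = env._get_init_state(should_encode=False)
encoded = env._encode_state(state)   # [agent_idx, opp_idx, opp_idx]
decoded = env._decode_state(encoded)
assert env.grid.get_index(decoded.agent_pos) == encoded[0]
print(encoded, decoded.num_opp)      # expect num_opp == 2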
コード例 #21
ファイル: version_2.py プロジェクト: rayson1223/gym-workflow
class Version2(MontageWfEnv):
    """
        @version 2: Greedy Approach --> Proved method correct, reward wrong
        - Known fact that best exec time is 100
        All actions will be given -1, goal to find the lowest 2nd score
    """

    def __init__(self, degree=0.1, band_num=1, db_dir=".pegasus/workflow.db"):
        # Montage Experiment Variable
        super(Version2, self).__init__()
        self.degree = degree
        self.clusters_size = 1
        self.clusters_num = 1
        self.is_clusters_size = True
        self.is_clusters_num = False
        self.band_num = band_num

        # Setting database connection
        # self.db = DatabaseEnv(db_dir)

        self.action_space = Discrete(3)

        self.observation_space = Discrete(10)  # , Discrete(8), Discrete(3)

        # Episode Conf
        # Best exec_time: None or 1, depends on reward version
        self.best_exec_time = None
        self.last_exec_time = None
        self.last_action = None
        self.last_reward = None
        self.total_reward = 0.0
        # 0: Ntg, 1: improve, 2: degrade
        self.is_improve = 0
        self.seed()
        self.reset()

    def step(self, action, training=False):
        assert self.action_space.contains(action)

        def calc_lb_hb(v, p):
            return (v * (100 - p)) / 100, (v * (p + 100)) / 100

        reward = 0.0
        self.last_action = action
        done = False
        if action == 1:
            self.clusters_size += 1
        elif action == 2:
            self.clusters_size -= 1
        # elif action == 3:
        #     self.clusters_num += 1
        # elif action == 4:
        #     self.clusters_num -= 1

        # Range Guarding Function
        if self.clusters_size <= 0:
            reward -= 1.0
            self.clusters_size = 1
        elif self.clusters_size > 30:
            reward -= 1.0
            self.clusters_size = 30
        # elif self.clusters_num <= 0:
        #     reward -= 1.0
        #     self.clusters_num = 1
        # elif self.clusters_num > 10:
        #     reward -= 1.0
        #     self.clusters_num = 10
        else:
            # res = self.run_static_experiment(self.clusters_size, self.clusters_num)
            res = self.run_static_experiment(self.clusters_size)
            self.exec_time = res

            if self.best_exec_time is None:
                self.best_exec_time = res
            if self.last_exec_time is None:
                self.last_exec_time = res
            if self.exec_time < 300:
                reward = 2
            else:
                reward = -1
            self.total_reward += reward
            if self.total_reward > 10:
                done = True
        return self._get_obs(), reward, done, {
            "exec": self.exec_time,
            "overhead": self.exec_time,
            "makespan": self.exec_time,
            "benchmark": 200
        }

    def render(self, mode='human'):
        outfile = StringIO() if mode == 'ansi' else sys.stdout
        init_msg = "Current Experiment Parameters: degree: %d\t cluster.size: %d\t cluster.num: %d\t\n" % (
            self.degree, self.clusters_size, self.clusters_num)
        outfile.write(init_msg)
        # if self.last_action is not None:
        # 	cs, cn = action
        result_str = "Current Execution Time: \t"
        expect_str = "Best Execution Time: \t"
        action_str = "Current Action: \t"
        # Process Outputs
        outfile.write(result_str + (" %s " % self.exec_time) + "\n")
        outfile.write(expect_str + (" %s " % self.best_exec_time) + "\n")
        outfile.write(action_str + (" %s " % self.last_action) + "\n")

        return outfile

    def reset(self):
        # Reset method should always return a new sets of episode settings
        # self.degree = 0.1
        # self.clusters_size = 1
        # self.clusters_num = 1
        # self.band_num = 1
        # self.best_exec_time = None
        if self.exec_time is not None:
            self.last_exec_time = self.exec_time
        self.wall_time = None
        self.cum_wall_time = None
        self.total_reward = 0
        self.clusters_size = np.random.randint(1, 30)
        self.clusters_num = np.random.randint(1, 30)

        # print("Environment had been reset!")
        return self.clusters_size

    def _get_obs(self):
        return self.clusters_size
コード例 #22
class PocEnv(Env):
    def __init__(self):
        self.board = select_maze("micro")
        self._get_init_state()
        self.action_space = Discrete(4)
        self.observation_space = Box(low=0, high=1, shape=(10, ))  # _make_ob returns 10 features
        self._reward_range = 100
        self.step_cnt = 0
        self.done = False

    def seed(self, seed=None):
        np.random.seed(seed)
        return [seed]

    def _passable(self, pos):
        return self.maze[pos.x, pos.y] != Element.WALL

    def _inside(self, pos):
        if 0 <= pos.x < self.maze.shape[0] and 0 <= pos.y < self.maze.shape[1]:
            return True
        return False

    def step(self, action):
        assert self.action_space.contains(action)
        assert self.done is False
        self.step_cnt += 1

        reward = -1
        next_pos = self.next_pos(self.internal_state.agent_pos, action)
        self.internal_state.agent_pos = next_pos

        if self.internal_state.power_duration > 0:
            self.internal_state.power_duration -= 1
        agent_in_power = self.internal_state.power_duration > 0

        hit_ghost = set()
        for g, ghost in enumerate(self.internal_state.ghosts):
            # check if the ghost hits the agent before and after it moves
            if ghost.pos == self.internal_state.agent_pos:
                hit_ghost.add(g)
            else:
                ghost.move(self.internal_state.agent_pos, agent_in_power)
                if ghost.pos == self.internal_state.agent_pos:
                    hit_ghost.add(g)

        hit_ghost = list(hit_ghost)
        for g in hit_ghost:
            if self.internal_state.power_duration > 0:
                reward += 25
                self.internal_state.ghosts[g].reset()
            else:
                reward += -100
                self.done = True
                break

        if self.step_cnt > self.board["_max_step"]:
            self.done = True

        if self._agent_at_food():
            reward += 10
            self.maze[self.internal_state.agent_pos.x,
                      self.internal_state.agent_pos.y] = Element.CLEAR_WALK_WAY
            if self._food_left() == 0:
                self.done = True

        if self._agent_at_power():
            self.internal_state.power_duration = self.board["_power_duration"]
            self.maze[self.internal_state.agent_pos.x,
                      self.internal_state.agent_pos.y] = Element.CLEAR_WALK_WAY
            reward += 10

        ob = self._make_ob(action)

        return ob, reward, self.done, {"state": self.internal_state}

    def _agent_at_food(self):
        agent_pos = self.internal_state.agent_pos
        if self.maze[agent_pos.x, agent_pos.y] == Element.FOOD_PELLET:
            return True
        return False

    def _agent_at_power(self):
        agent_pos = self.internal_state.agent_pos
        if self.maze[agent_pos.x, agent_pos.y] == Element.POWER:
            return True
        return False

    def _make_ob(self, action):
        """
        Return 10 state features of observation:
        4 features indicating whether the agent can see a ghost
            in that direction (UP, RIGHT, DOWN, LEFT)
        4 features indicating whether he can feel a wall in each of the
            cardinal directions, which is set to 1 if he is adjacent to a wall
        1 feature indicating whether he can hear a ghost, which is set to 1
            if he is within Manhattan distance 2 of a ghost
        1 feature indicating whether he can smell food (adjacent or
            diagonally adjacent to any food)
        """
        ob = np.zeros(STATE_DIM)
        for i, action in enumerate(ACTIONS):
            if self._see_ghost(action):
                ob[i] = 1
            next_pos = self.next_pos(self.internal_state.agent_pos, action)
            # If an agent couldn't move from the current position, then there is a wall
            if next_pos == self.internal_state.agent_pos:
                ob[i + len(ACTIONS)] = 1
        if self._hear_ghost():
            ob[2 * len(ACTIONS)] = 1
        if self._smell_food():
            ob[2 * len(ACTIONS) + 1] = 1
        return ob

    def _see_ghost(self, action):
        agent_pos = self.internal_state.agent_pos
        for ghost in self.internal_state.ghosts:
            if agent_pos.x != ghost.pos.x and agent_pos.y != ghost.pos.y:
                continue
            if ((action == Action.UP and ghost.pos.x < agent_pos.x) or
                (action == Action.DOWN and ghost.pos.x > agent_pos.x) or
                (action == Action.LEFT and ghost.pos.y < agent_pos.y) or
                (action == Action.RIGHT and ghost.pos.y > agent_pos.y)
                ) and not self._wall_between(agent_pos, ghost.pos):
                return True
        return False

    def _smell_food(self):
        smell_range = self.board["_smell_range"]
        agent_pos = self.internal_state.agent_pos

        for x in range(-smell_range, smell_range + 1):
            for y in range(-smell_range, smell_range + 1):
                smell_x = agent_pos.x + x
                smell_y = agent_pos.y + y
                if (0 <= smell_x < self.maze.shape[0]
                        and 0 <= smell_y < self.maze.shape[1] and
                        self.maze[smell_x, smell_y] == Element.FOOD_PELLET):
                    return True
        return False

    def _hear_ghost(self):
        for ghost in self.internal_state.ghosts:
            if (manhattan_distance(ghost.pos, self.internal_state.agent_pos) <=
                    self.board["_hear_range"]):
                return True
        return False

    def _wall_between(self, pos1, pos2):
        assert pos1 != pos2
        assert pos1.x == pos2.x or pos1.y == pos2.y
        if pos1.y == pos2.y:
            for i in range(min(pos1.x, pos2.x) + 1, max(pos1.x, pos2.x)):
                if self.maze[i, pos1.y] == Element.WALL:
                    return True
        elif pos1.x == pos2.x:
            for i in range(min(pos1.y, pos2.y) + 1, max(pos1.y, pos2.y)):
                if self.maze[pos1.x, i] == Element.WALL:
                    return True
        return False

    def _food_left(self):
        return np.sum(self.maze == Element.FOOD_PELLET)

    def reset(self):
        self.done = False
        self.step_cnt = 0
        self._get_init_state()
        return 0

    def _get_init_state(self):
        self.maze = self.board["_maze"].copy()
        self.internal_state = InternalState()
        self.internal_state.agent_pos = Position(*self.board["_poc_home"])
        Ghost.max_x = self.maze.shape[0]
        Ghost.max_y = self.maze.shape[1]
        ghost_home = Position(*self.board["_ghost_home"])
        Ghost.home = ghost_home

        for _ in range(self.board["_num_ghosts"]):
            pos = Position(ghost_home.x, ghost_home.y)
            self.internal_state.ghosts.append(
                Ghost(
                    self,
                    pos,
                    direction=random_action(),
                    ghost_range=self.board["_ghost_range"],
                ))

        return self.internal_state

    def next_pos(self, pos, action):
        x_offset, y_offset = 0, 0
        if action == Action.UP:
            x_offset = -1
            y_offset = 0
        elif action == Action.DOWN:
            x_offset = 1
            y_offset = 0
        elif action == Action.RIGHT:
            x_offset = 0
            y_offset = 1
        elif action == Action.LEFT:
            x_offset = 0
            y_offset = -1

        next_pos = Position(pos.x + x_offset, pos.y + y_offset)

        if self._inside(next_pos) and self._passable(next_pos):
            return next_pos
        else:
            return pos

    def print_internal_state(self):
        print("Step", self.step_cnt)
        print_maze = self.maze.astype(str)
        print_maze[self.internal_state.agent_pos.x,
                   self.internal_state.agent_pos.y] = "A"
        ghost_str = ""
        for g, ghost in enumerate(self.internal_state.ghosts):
            print_maze[ghost.pos.x, ghost.pos.y] = "G"
            ghost_str += "Ghost {} at {}, direction={}, type={}\n".format(
                g, ghost.pos, ACTION_DICT[ghost.direction], ghost.move_type)
        np.set_printoptions(formatter={"str_kind": lambda x: x})
        print("Maze: \n{}".format(print_maze))
        print("Agent at {}, power duration {}".format(
            self.internal_state.agent_pos, self.internal_state.power_duration))
        print(ghost_str[:-1])

    def print_ob(self, ob):
        ob_str = ""
        for i, action in enumerate(ACTIONS):
            if ob[i] == 1:
                ob_str += " SEE GHOST {},".format(ACTION_DICT[action])
        for i, action in enumerate(ACTIONS):
            if ob[i + len(ACTIONS)] == 1:
                ob_str += " FEEL WALL {},".format(ACTION_DICT[action])
        if ob[-2]:
            ob_str += " HEAR GHOST,"
        if ob[-1]:
            ob_str += " SMELL FOOD,"
        return ob_str
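A minimal rollout sketch, not part of the source, showing how an environment with the step()/reset() interface above might be driven; the env argument, the episode length, and sampling directly from ACTIONS are assumptions made for illustration.

import random

def random_rollout(env, max_steps=200):
    # drive one episode with uniformly random actions and sum the rewards
    env.reset()
    total_reward = 0
    for _ in range(max_steps):
        action = random.choice(list(ACTIONS))  # assumes step() accepts members of ACTIONS
        ob, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward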
コード例 #23
class SSingleRobGpw(gym.Env):
    def __init__(self):
        rospy.init_node("rl_robot_node"+str(ac_num))
        self.pub, self.reset_robot_state = reset_node_init()
        self.act_pub = rospy.Publisher('/robot'+str(ac_num)+'/pioneer/cmd_vel', Twist, queue_size=1)
        # each step currently yields 16 state values
        if enable_correction:
            self.observation_space = gym.spaces.Box(0, 100, (64,), dtype=np.dtype(np.int32))  # integer version
        else:
            self.observation_space = gym.spaces.Box(-100, 100, (64, ), dtype=np.dtype(np.int32))  # integer version
        # self.observation_space = gym.spaces.Box(-10.0, 10.0, (4, ), dtype=np.dtype(np.float32))
        self.action_space = Discrete(6)  # values 0,1,2,3,4,5; should combined actions be added?
        self.state = None
        # self.laser_data = [0 for i in range(0, 10)]
        self.num_laser = 10  # do not change casually; some related data does not use this parameter
        self.laser_data = None
        self.sample_laser_data = [0 for x in range(0, self.num_laser)]  # sampled laser data, positions from right to left
        self.int_sample_laser_data = [0 for x in range(0, self.num_laser)]  # sampled laser data, right to left, integer version
        # used to correct the laser data
        # self.last_laser_data = [0 for x in range(0, self.num_laser)]
        # self.current_sate = None
        # self.speed = 0.0  # after running for a while the recorded speed/angle drifts noticeably from the actual (gazebo) values
        self.state_speed = 0.0  # speed used as state, discretized to an integer
        self.state_angle = 0.0  # angular speed
        self.s_state = None
        self.min_speed = 0.0  # -0.75  # linear speed m/s; angular speed rad/s (6 rad is roughly a full turn); reversing would need a rear-mounted range sensor
        self.max_speed = 0.5  # speed must not exceed 0.75 (errors already appear); 0.5 m/s because obstacles are dense here
        # self.angle = 0.0
        self.min_angle = - 0.5
        self.max_angle = 0.5
        self.cmd_twist = Twist()
        self.done = False
        self.thread1 = LaserNodeThread()
        self.thread1.setDaemon(True)
        self.thread1.start()
        # service for getting the model state
        rospy.wait_for_service('gazebo/get_model_state')
        self.get_state_srv = rospy.ServiceProxy('/gazebo/get_model_state', GetModelState)
        # service for resetting the model state
        rospy.wait_for_service('gazebo/set_model_state')
        self.set_obstacle_srv = rospy.ServiceProxy('/gazebo/set_model_state', SetModelState)
        # self.obstacle_pose = [[0 for x in range(0, 2)] for x in range(0, 20)]  # positions [x, y] of the 20 obstacles
        # robot data returned by gazebo
        self.gazebo_robot_pose_x = None  # noise could be injected here
        self.gazebo_robot_pose_y = None
        self.gazebo_robot_speed = 0.0
        self.gazebo_robot_angle = 0.0
        self.state_distance = 0.0
        self.state_angle = 0.0
        self.last_robot_pose_x = 0.0
        self.last_robot_pose_y = 0.0
        # reward-related settings
        self.arrive_reward = 100.0
        self.collision_reward = -100.0
        self.road_reward_times = 0.1
        self.last_distance_to_goal = 0.0  # distance to the goal at the previous action
        self.distance_to_goal_threshold = 0.25
        self.collision_threshold = 0.43  # must be at least wider than the body (0.5); with the laser at the center, 0.3 does not cover the body
        self.distance_r_sate = 0
        # final robot goal pose
        self.goal_x = 0.0
        self.goal_y = 10.0
        self.desire_final_speed = None
        self.desire_final_angle = None
        self.eps = 0  # counts the current episode
        self.seed()
        self.steps = 0  # for debug

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    # fixed version
    def reset_goal(self):
        goal_num = self.np_random.randint(0, 2)
        # self.goal_x = self.cc_func(round(goal_sequence[ac_num][goal_num][0], 1))
        # self.goal_y = self.cc_func(round(goal_sequence[ac_num][goal_num][1], 1))
        self.goal_x = round(goal_sequence[ac_num][goal_num][0], 1)
        self.goal_y = round(goal_sequence[ac_num][goal_num][1], 1)

    def compute_distance(self, lhx, lhy, rhx, rhy):
        return math.sqrt((lhx - rhx) ** 2 + (lhy - rhy) ** 2)

    # compute the robot heading from two consecutive moves; with tiny displacements (especially under noise) this can hardly be accurate, an IMU may be needed
    def compute_polar_coordinates(self):
        edge_a = self.compute_distance(self.last_robot_pose_x, self.last_robot_pose_y,
                                       self.gazebo_robot_pose_x, self.gazebo_robot_pose_y)
        edge_b = self.compute_distance(self.last_robot_pose_x, self.last_robot_pose_y, self.goal_x, self.goal_y)
        edge_c = self.compute_distance(self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.goal_x, self.goal_y)
        cos_angle_beta = (edge_a**2 + edge_c**2 - edge_b**2)/(2*edge_a*edge_c)
        self.state_distance = edge_c  # value still needs post-processing
        self.state_angle = math.pi - math.acos(cos_angle_beta)  # value still needs post-processing

    # def distance_to_obstacle(self, num_x, num_y, count):
    #     return math.sqrt((self.obstacle_pose[count][0] - num_x) ** 2 + (self.obstacle_pose[count][1] - num_y) ** 2)

    def state_speed_wrapper(self):
        self.state_speed = round(self.gazebo_robot_speed, 4) * 200
        self.state_speed = int(self.state_speed)  # (0~100)
        self.state_angle = round(self.gazebo_robot_angle, 4) * 100
        self.state_angle = int(self.state_angle) + 50  # (0~100)

    def reset(self):
        self.steps = 0  # for debug
        self.eps += 1
        self.done = False
        # re-randomize every 100 episodes
        if self.eps % 100 == 0 or self.eps == 1:
            self.reset_goal()
            # also randomize the robot's initial orientation
            self.reset_robot_state.pose.orientation.z = self.np_random.uniform(low=-0.9, high=0.9)
            self.reset_robot_state.pose.orientation.w = self.np_random.uniform(low=-0.9, high=0.9)
            print('actor'+str(ac_num)+' --> x: ' + str(self.goal_x) + ' y: ' + str(self.goal_y))
        # reset the robot's speed and angle; mind the order of these steps
        self.cmd_twist.linear.x = 0.0
        self.cmd_twist.angular.z = 0.0
        # move the robot back to its start position
        start_time = time.time()
        while (time.time() - start_time) < 0.5:  # do not reset directly via the srv here
            self.act_pub.publish(self.cmd_twist)  # reset the velocity first
            self.pub.publish(self.reset_robot_state)
            time.sleep(0.05)
        # time.sleep(0.5)  # after resetting, laser and other data need some time to refresh...
        # reset the pose/speed/angle in the state
        self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle = \
            self.get_gazebo_state()
        self.gazebo_robot_speed = 0.0
        self.gazebo_robot_angle = 0.0
        self.laser_data = self.thread1.laser_data
        self.last_robot_pose_x = self.gazebo_robot_pose_x
        self.last_robot_pose_y = self.gazebo_robot_pose_y
        self.sample_fuc()
        # self.get_rid_of_bad_data()
        self.int_sample_fuc()
        # reset() must return the initial state
        self.state = np.array(self.int_sample_laser_data[:])
        # self.state = np.array([])  # used when only the goal point serves as the state
        self.state_speed_wrapper()
        self.state = np.append(self.state, [int(self.goal_x*10), int(self.goal_y*10),
                                            int(self.gazebo_robot_pose_x*10), int(self.gazebo_robot_pose_y*10),
                                            self.state_speed, self.state_angle])
        self.sequence_state_wrap()
        self.last_distance_to_goal = self.distance_to_goal()
        # self.last_laser_data = self.sample_laser_data[:]
        self.s_state = self.s_state.reshape((64,))
        # self.state = self.state.reshape((4,))
        return self.s_state

    # stack 4 consecutive states together
    def sequence_state_wrap(self):
        if self.steps == 0:
            self.s_state = self.state
            for i in range(0, 3):
                self.s_state = np.append(self.s_state, self.state)
        else:
            for j in range(0, 3):
                for k in range(0, 16):
                    self.s_state[j*16+k] = self.s_state[(j+1)*16+k]
            for l in range(0, 16):
                self.s_state[3 * 16 + l] = self.state[l]

    def step(self, action):
        self.steps += 1
        assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))
        # the robot's acceleration is limited, so choose these carefully; either make the increments very small (not ideal, slow to react to the environment)
        # or cap the publish rate below 20 Hz (with +0.1 increments)
        if action == 0:
            self.gazebo_robot_speed += 0.10
        elif action == 1:
            self.gazebo_robot_speed -= 0.10
        elif action == 2:
            self.gazebo_robot_angle = 0.20
        elif action == 3:
            self.gazebo_robot_angle = -0.20
        elif action == 4:
            self.gazebo_robot_angle = 0.0
        elif action == 5:
            pass  # keep the current state
        else:
            gym.logger.warn("Unknown situation !")

        self.gazebo_robot_speed = np.clip(self.gazebo_robot_speed, self.min_speed, self.max_speed)
        self.gazebo_robot_angle = np.clip(self.gazebo_robot_angle, self.min_angle, self.max_angle)

        # unlike many other systems, once cmd_vel is set p2os keeps the robot at that speed,
        # self.cmd_twist.linear.x = 0
        # even with no command at all the robot in gazebo still reports a tiny cmd_twist.linear.x
        # and cmd_twist.angular.z; publishing those values directly only makes the error grow
        if abs(self.gazebo_robot_speed) < 0.025:
            self.gazebo_robot_speed = 0.0
        if abs(self.gazebo_robot_angle) < 0.025:
            self.gazebo_robot_angle = 0.0
        self.cmd_twist.linear.x = self.gazebo_robot_speed
        self.cmd_twist.angular.z = self.gazebo_robot_angle
        # attention! testing!
        # self.cmd_twist.linear.x = 0.5
        # self.cmd_twist.angular.z = 0.0
        time.sleep(0.08)  # 0.075
        self.act_pub.publish(self.cmd_twist)
        # record some data from the previous step
        self.laser_data = self.thread1.laser_data
        self.last_robot_pose_x = self.gazebo_robot_pose_x
        self.last_robot_pose_y = self.gazebo_robot_pose_y
        # after executing the action, read back the state (rigor debatable: this is not strictly one action followed by one returned state)
        self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle = \
            self.get_gazebo_state()
        self.sample_fuc()
        # self.get_rid_of_bad_data()
        self.int_sample_fuc()
        self.state = np.array(self.int_sample_laser_data[:])
        # self.state = np.array([])
        self.state_speed_wrapper()
        self.state = np.append(self.state, [int(self.goal_x*10), int(self.goal_y*10),
                                            int(self.gazebo_robot_pose_x*10), int(self.gazebo_robot_pose_y*10),
                                            self.state_speed, self.state_angle])
        # the final pose should also be a constraint; the min(distance) of 0.25 is debatable because the rangefinder sits at the front of the vehicle
        if self.distance_to_goal() < self.distance_to_goal_threshold \
                or min(self.sample_laser_data) < self.collision_threshold:  # and (self.gazebo_robot_speed < 0.1):
            self.done = True
            if self.steps <= 5:  # for debug
                print("robot: "+str(ac_num)+" : ")
                print(self.sample_laser_data)
                print("x: "+str(self.gazebo_robot_pose_x)+"y: "+str(self.gazebo_robot_pose_y))
        self.sequence_state_wrap()
        self.s_state = self.s_state.reshape((64, ))
        # self.state = self.state.reshape((4, ))
        reward_num = self.reward()  # the reward must be computed before last_distance_to_goal is updated
        self.last_distance_to_goal = self.distance_to_goal()  # update the distance for the next reward computation
        # self.last_laser_data = self.sample_laser_data[:]  # careful here, a classic pitfall.... copy!!!
        return self.s_state, reward_num, self.done, {}

    def distance_to_goal(self):
        return math.sqrt((self.goal_x - self.gazebo_robot_pose_x)**2 + (self.goal_y - self.gazebo_robot_pose_y)**2)

    def reward(self):
        # the final pose should also constrain the reward, consistent with done; should goal distance and obstacle distance get a combined reward?
        if self.distance_to_goal() < self.distance_to_goal_threshold:
            return self.arrive_reward
        elif min(self.sample_laser_data) < self.collision_threshold:  # for the obstacle part, the raw laser data is noisy (or for other reasons cannot be used directly for the check, sometimes reading as low as 0.2 ???)
            return self.collision_reward
        # choose the distance-based reward function carefully to avoid possible reward hacking (ratio to arrive_reward, sign, etc.)
        # the goal-distance term should speed up learning but is tricky: once past the goal, moving farther away should yield a negative reward
        # option 1
        # else:
        #     return self.road_reward_times * (self.last_distance_to_goal-self.distance_to_goal())
        # option 2
        else:
            if self.last_distance_to_goal - self.distance_to_goal() > 0:  # getting closer
                return 1.0*self.road_reward_times
            else:  # as written, standing still also earns a negative reward; >= would remove that penalty
                return -1.0*self.road_reward_times  # the penalty for moving away could be increased
        # else:
        #     return 0.0

    def render(self, mode='human'):
        pass

    # should this be replaced with the robot's own sensor data? a real robot does not have access to this
    # @staticmethod
    def get_gazebo_state(self):
        # first argument is model_name, second is relative_entity_name
        model_state = self.get_state_srv('robot'+str(ac_num), '')  # note: model_state is a GetModelStateResponse
        robot_pose_x = round(model_state.pose.position.x, 1)  # note: pose is (x, y, z); discretized
        robot_pose_y = round(model_state.pose.position.y, 1)
        robot_twist_linear_x = round(model_state.twist.linear.x, 4)  # discretize
        robot_twist_angular_z = round(model_state.twist.angular.z, 4)  # angular.z is the planar robot's angular speed, since z is the rotation axis
        # return self.cc_func(robot_pose_x), self.cc_func(robot_pose_y), robot_twist_linear_x, robot_twist_angular_z
        return robot_pose_x, robot_pose_y, robot_twist_linear_x, robot_twist_angular_z

    # sample 10 float laser readings from the raw data, roughly one every 20 degrees;
    # so the robots can detect each other the laser is mounted at the front of the body, but the center would be more sensible, so shift the measured readings by 0.15
    def sample_fuc(self):
        self.sample_laser_data[0] = self.laser_data[0]
        self.sample_laser_data[-1] = self.laser_data[-1]
        for i in range(1, 9):
            self.sample_laser_data[i] = self.laser_data[i * 72]
        for j in range(0, 10):
            temp_angle = (math.pi / 9) * j  # in radians
            self.sample_laser_data[j] += math.sin(temp_angle) * 0.15  # correct for the off-center mounting
            # discretize to 1 decimal place; note that round() is neither truncation nor simple round-half-up; maybe 2 decimals would be better?
            self.sample_laser_data[j] = round(self.sample_laser_data[j], 1)
            if self.sample_laser_data[j] > 10.0:
                self.sample_laser_data[j] = 10.0


    def int_sample_fuc(self):
        for j in range(0, 10):
            self.int_sample_laser_data[j] = int(self.sample_laser_data[j]*10)  # convert to an integer

    # def __del__(self):
    #     # wait for the thread to terminate; this blocks the calling thread until join() returns (normal exit, an unhandled exception, or the optional timeout)
    #     self.thread1.join()  # still not right
    #
    def close(self):
        pass
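A hedged sketch of the integer discretization performed by state_speed_wrapper above; the standalone helper name discretize_velocity is an assumption, only the scaling constants come from the class.

def discretize_velocity(linear_speed, angular_speed):
    # linear speed in [0, 0.5] m/s maps to roughly 0-100,
    # angular speed in [-0.5, 0.5] rad/s maps to roughly 0-100 around a midpoint of 50
    state_speed = int(round(linear_speed, 4) * 200)
    state_angle = int(round(angular_speed, 4) * 100) + 50
    return state_speed, state_angle

# e.g. discretize_velocity(0.25, -0.2) -> (50, 30)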
コード例 #24
class ThreatDefenseEnv(BaseEnv):
    """
    The class of the Defense environment.

    An OpenAI environment of the toy example given in Optimal Defense
    Policies for Partially Observable Spreading Processes on Bayesian Attack
    Graphs by Miehling, E., Rasouli, M., & Teneketzis, D. (2015). It
    constitutes a 29-state/observation, 4-action POMDP defense problem.
    """

    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self):
        """POMDP environment."""
        self.action_space = Discrete(len(Action))
        self.state_space = Discrete(STATES.shape[0])
        self.observation_space = Discrete(OBSERVATIONS.shape[0])
        self.all_states = STATES
        self.last_obs = None
        self.viewer = None

    def step(self, action):
        """
        Progress the simulation one step.

        Arguments:
        action -- an Action containing a numeric value [0, 3].

        Returns:
        A tuple (o, r, d, i), where

        o -- the observation as a binary vector.
        r -- the reward gained as an integer.
        d -- boolean indicating if the simulation is done or not.
        i -- simulation meta-data as a dictionary.

        """
        assert self.action_space.contains(action)
        assert self.done is not True

        if not isinstance(action, Action):
            action = Action(action)

        self.t += 1
        self.last_action = action

        reward = REWARDS[action.value][self.state.index]

        self.done = self._is_terminal(action, self.t)

        old_state = self.state
        self.state = self._next_state(action, old_state)
        self.info = self._update_info(action, old_state, self.state)

        return self._sample_observation(self.state).as_list(), \
            reward, self.done, self.info

    def reset(self):
        """
        Reset the simulation.

        Returns:
        The initial observation as a binary vector.
        """
        self.done = False
        self.state = State(0)
        self.last_action = Action.NONE
        self.t = 0
        self.info = {}
        return self.state.as_list()

    def _next_state(self, action, state):
        """
        Generate the next state from an action.

        Arguments:
        action -- an Action containing numeric value [0, 3].
        state -- a State containing a binary vector of length 12.

        Returns:
        The next State containing a binary vector of length 12.
        """
        probs = TRANSITIONS[action.value][state.index]
        return State(np.random.choice(probs.shape[0], p=probs))

    def _is_terminal(self, action, time_step):
        """
        Check if the current action terminates the simulation.

        Arguments:
        action -- an Action containing the numeric value [0, 3].
        time_step -- the current time_step as a numeric value
            0 <= time_step < max_time_step.

        Returns:
        A boolean depending on if the simulation is over or not.
        """
        return self.last_action.value == Action.BOTH.value or \
            time_step >= 100

    def _sample_observation(self, state):
        """
        Generate a new observation from the current state.

        Arguments:
        state -- a State containing a binary vector of length 12.

        Returns:
        An Observation containing a binary vector of length 12.
        """
        probs = OBSERVATIONS[state.index]
        self.last_obs = Observation(np.random.choice(probs.shape[0], p=probs))
        return self.last_obs

    def _update_info(self, action, old_state, new_state):
        """
        Update the info meta-data Dict.

        Update the info with respect to the current and past state as well as
        the action taken.

        Arguments:
        action -- an Action containing the numeric value [0, 3].
        old_state -- a State containing a binary vector of length 12,
            representing the state at the past time step.
        new_state -- a State containing a binary vector of length 12,
            representing the state at the current time step.

        Returns:
        A dictionary containing the updated info with respect to the current
            and past state as well as the action taken.

        """
        return {
            'last_transition': TRANSITIONS[action.value][old_state.index],
            'last_observation': OBSERVATIONS[new_state.index],  # noqa
            'state': new_state.as_list()
        }
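A hedged usage sketch for ThreatDefenseEnv: it relies only on the gym API defined in the class above (reset(), step(), action_space); the function name, the random policy, and the step limit of 100 are assumptions for illustration.

def run_random_defense_episode(max_steps=100):
    # play one episode of the defense POMDP with uniformly random actions
    env = ThreatDefenseEnv()
    ob = env.reset()
    total_reward = 0
    for _ in range(max_steps):
        action = env.action_space.sample()        # random defense action in [0, 3]
        ob, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward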
コード例 #25
class Kvazaar_v0(gym.Env):

    metadata = {"render.modes": ["human"]}

    def set_video_selection_mode(self):
        if os.path.isdir(self.vids_path):
            self.vids_list = os.listdir(
                self.vids_path)  #list all vids in vids_path
            if self.mode == "rotating":
                self.vids_list = sorted(self.vids_list)  #sort videos
            elif self.mode is None:
                self.mode = "random"
        elif os.path.isfile(self.vids_path) and self.mode is None:
            self.mode = "file"
            # vids_path points at a single file, so use it as the only entry
            self.vids_list = [os.path.basename(self.vids_path)]
            self.vids_path = os.path.dirname(self.vids_path) + os.sep
            self.vid_selected['name'] = self.vids_list[0]
            self.vid_selected['pos'] = 0

    def random_video_selection(self):
        randomInt = self.np_random.randint(0, len(self.vids_list))
        newVideo = self.vids_list[randomInt]
        self.vid_selected['name'] = newVideo
        self.vid_selected['pos'] = randomInt

    def rotating_video_selection(self):
        self.vid_selected['pos'] = (self.vid_selected['pos'] + 1) % len(
            self.vids_list)
        new_video = self.vids_list[self.vid_selected['pos']]
        self.vid_selected['name'] = new_video

    def log_metrics(self):
        message = ("\n ------------- \n"
                   "Total steps: {}" + "\n ------------- \n" +
                   "VIDEO USAGE \n").format(self.steps)

        self.video_usage = {
            key: round((value / self.steps * 100), 0)
            for key, value in self.video_usage.items()
        }

        for video, usage in self.video_usage.items():
            message += video + " = " + str(usage) + "%\n"

        if self.logger: self.logger.info(message)

    def __init__(self, **kwargs):
        # Collect the arguments:
        # kvazaar_path: path where kvazaar is installed
        # vids_path: path to the videos that the environment will use
        # cores: list of cores for kvazaar
        self.kvazaar_path = kwargs.get("kvazaar_path")
        self.vids_path = kwargs.get("vids_path")
        self.cores = kwargs.get(
            "cores")  # list of the cores that the environment uses
        self.mode = kwargs.get("mode")  # video selection mode
        self.logger = kwargs.get("logger")
        self.max_steps = kwargs.get("num_steps")

        self.vids_list = None  #for random or rotating mode
        self.vid_selected = {'name': None, 'pos': 0}

        self.set_video_selection_mode()

        self.action_space = Discrete(len(self.cores))
        self.observation_space = Discrete(11)
        self.kvazaar = None  ##object for kvazaar subprocess

        #metrics
        self.video_usage = {video: 0 for video in self.vids_list}
        self.steps = 0

    def reset(self):
        self.seed()  #Generate seed for random numbers
        self.reset_kvazaar()
        self.state = np.int64(1)
        self.reward = 0  #Starter reward is 0
        self.done = False
        self.info = {"state": "running", "fps": 0}
        return self.state

    def reset_kvazaar(self):

        #self.mode == file is already managed
        if self.mode == "random": self.random_video_selection()
        if self.mode == "rotating": self.rotating_video_selection()

        #log new video
        if self.logger:
            self.logger.info(self.vid_selected['name'] + " , pos " +
                             str(self.vid_selected['pos']))

        command = [
            self.kvazaar_path, "--input",
            self.vids_path + self.vid_selected['name'], "--output",
            "/dev/null", "--preset=ultrafast", "--qp=22", "--owf=0",
            "--threads=" + str(len(self.cores))
        ]

        #apply taskset using the range of cores set as argument
        command = ["taskset", "-c", ",".join([str(x)
                                              for x in self.cores])] + command

        # kvazaar process generation
        self.kvazaar = subprocess.Popen(command,
                                        stdin=subprocess.PIPE,
                                        stdout=subprocess.PIPE,
                                        universal_newlines=True,
                                        bufsize=1,
                                        env={'NUM_FRAMES_PER_BATCH': '24'})

        #make sure kvazaar process is up
        while not self.kvazaar:
            time.sleep(1)

    def step(self, action):
        assert self.action_space.contains(action)
        action += 1

        #Interaction with the Kvazaar process, passing the new action, i.e. the new number of CPUs used for the next video block.
        s = "nThs:" + str(action)
        self.kvazaar.stdin.write(s + "\n")
        output = self.kvazaar.stdout.readline().strip()
        ########

        ##update metrics
        self.steps += 1
        self.video_usage[self.vid_selected['name']] += 1

        #log this num step and the video used
        if self.logger:
            self.logger.info("Step {} : {} , pos {}".format(
                str(self.steps), self.vid_selected['name'],
                str(self.vid_selected['pos'])))

        self.calculate_state(output=output)

        self.info["state"] = output.strip()

        if self.steps == self.max_steps:
            #reached end of training
            self.kvazaar.kill()
            self.log_metrics()

        try:
            assert self.observation_space.contains(
                self.state)  #check if new state is valid
        except AssertionError:
            print("INVALID STATE", self.state)

        return [self.state, self.calculate_reward(), self.done, self.info]

    def calculate_reward(self):
        if self.info["state"] == 'END':
            self.reward = 0
        else:
            map_rewards = {
                1: 0,  ##[0,10)
                2: 1,  ##[10,16)
                3: 2,  ##[16,20)
                4: 3,  ##[20,24)
                5: 6,  ##[24,27)
                6: 8,  ##[27,30)
                7: 10,  ##[30,35)
                8: 7,  # [35,40)
                9: 4  # [40,inf)
            }
            self.reward = map_rewards.get(self.state)

        return self.reward

    def calculate_state(self, output):
        if (output == "END"):
            self.done = True
            self.info["state"] = 'END'
        else:
            ##Erase "FPS:" from output and save it in the new state.
            output_value = np.float32(output[4:])

            self.info["fps"] = '{:.2f}'.format(output_value)
            if output_value < 10: self.state = np.int64(1)
            elif output_value < 16: self.state = np.int64(2)
            elif output_value < 20: self.state = np.int64(3)
            elif output_value < 24: self.state = np.int64(4)
            elif output_value < 27: self.state = np.int64(5)
            elif output_value < 30: self.state = np.int64(6)
            elif output_value < 35: self.state = np.int64(7)
            elif output_value < 40: self.state = np.int64(8)
            else: self.state = np.int64(9)

    def render(self, mode="human"):
        if self.info["state"] == 'END':
            print(self.info["state"])
        else:
            l = '{:>7}  fps:{:>1}  reward:{:<10}'.format(
                self.state, self.info["fps"], self.reward)
            print(l)

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        pass
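A hedged sketch, not part of the source, of the FPS bucketing used by calculate_state and calculate_reward above: the measured FPS is mapped to a discrete state 1-9 whose reward peaks around 30-35 FPS. The names FPS_BUCKETS and fps_to_state are illustrative only.

FPS_BUCKETS = [10, 16, 20, 24, 27, 30, 35, 40]   # upper bounds of states 1-8

def fps_to_state(fps):
    # return the discrete state for a measured FPS value
    for state, upper in enumerate(FPS_BUCKETS, start=1):
        if fps < upper:
            return state
    return 9  # 40 FPS or more

# e.g. fps_to_state(32.5) -> 7, which map_rewards scores with the maximum of 10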
コード例 #26
class CacheEnv(gym.Env):
    """
    Cache description.

    * STATE *
        The state is represented as a vector:
        [request object size, 
         cache remaining size, 
         time of last request to the same object]

    * ACTIONS *
    TODO: should be fixed here, there should be both 
        Whether the cache accept the incoming request, represented as an
        integer in [0, 1].

    * REWARD * (BHR)
        Cost of previous step (object size) * hit

    * REFERENCE *
    """
    def __init__(self, seed=42):
        self.seed(seed)
        self.cache_size = config.cache_size

        # load trace, attach initial online feature values
        self.src = TraceSrc(trace=config.cache_trace,
                            cache_size=self.cache_size)

        # set up the state and action space
        #TODO: Modify the action space size to C+1
        self.action_space = Discrete(2)
        self.observation_space = Box(self.src.min_values, \
                                      self.src.max_values, \
                                      dtype=np.float32)

        # cache simulator
        self.sim = CacheSim(cache_size=self.cache_size, \
                            policy='lru', \
                            action_space=self.action_space, \
                            state_space=self.observation_space)

        # reset environment (generate new jobs)
        #self.reset(1, 2)

    def reset(self, low=1, high=1001):
        print('env.reset called')
        print('caller name:', inspect.stack()[1][3])
        new_trace = self.np_random.randint(low, high)
        print(new_trace)
        self.src.reset(new_trace)
        self.sim.reset()
        if config.cache_trace == 'test':
            print("New Env Start", new_trace)
        elif config.cache_trace == 'real':
            print("New Env Start Real")
        #print(self.sim.get_state())
        return self.sim.get_state()

    def seed(self, seed):
        self.np_random, seed = seeding.np_random(seed)

    def step(self, action):
        # 0 <= action < num_servers
        global accept
        assert self.action_space.contains(action)
        state, done = self.src.step()
        reward, info = self.sim.step(action, state)
        obj, done = self.src.next()
        while self.sim.next_hit(obj):
            state, done = self.src.step()
            hit_reward, info = self.sim.step(accept, state)
            reward += hit_reward
            if done is True:
                break
            obj, done = self.src.next()

        obs = self.sim.get_state(obj)
        info = {}
        return obs, reward, done, info

    def render(self, mode='human', close=False):
        pass
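A hedged usage sketch for CacheEnv: a random admit/reject policy stepped until the trace ends. It assumes the config, TraceSrc and CacheSim helpers referenced above are importable; the function name and request limit are assumptions, and the reward is accumulated as the docstring's BHR-style hit cost.

def run_random_admission(env, max_requests=10000):
    # admit or reject each incoming request at random and accumulate the hit reward
    ob = env.reset()
    total_hit_reward = 0.0
    for _ in range(max_requests):
        action = env.action_space.sample()      # integer in [0, 1]: whether to accept the request (per the docstring)
        ob, reward, done, info = env.step(action)
        total_hit_reward += reward              # object size is credited on hits
        if done:
            break
    return total_hit_reward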
コード例 #27
class MarketEnvironment(gym.Env):
    def __init__(self, env_config):
        #save the variables
        self.env_config = env_config
        self.db = DataBase(env_config["database_path"])
        self.hp = env_config["hyperparameters"]
        self.episode_length = env_config["episode_length"]
        self.trading_fee_dez = env_config["trading_fee"] / 100

        #specify action and observation spaces
        self.action_space = Discrete(3)
        self.observation_space = Box(low=-np.finfo(np.float32).max,
                                     high=np.finfo(np.float32).max,
                                     shape=(self.hp.window_size,
                                            len(self.hp.features)))

        #handling of the scaler
        if env_config["preloaded_scaler"] is None:
            self.scaler = None
            warnings.warn("You did not specify a preloaded scaler!")
        elif isinstance(env_config["preloaded_scaler"], BaseEstimator):
            #save preloaded scaler instance
            self.scaler = env_config["preloaded_scaler"]
        elif type(env_config["preloaded_scaler"]) == str:
            #load in the scaler
            self.scaler = joblib.load(filename=env_config["preloaded_scaler"])
        else:
            raise Exception(
                "You have to either specify a scaler_location, scaler_instance or None for config_parameter: scaler"
            )

        #prepare the data
        self.data, self.close_data = self._prepare_data()

        #local episode variables
        self.current_step = 0
        self.start_index = 0
        self.done = True
        self.mode = "buy"
        self.specific_profit = 0

    def step(self, action):
        #check that the environment is ready to be stepped through
        if self.done:
            raise Exception(
                "You have to reset the environment before you can step through it"
            )

        #check that action is in actionspace
        if not self.action_space.contains(action):
            raise Exception(f"Your chosen action: {action} is not valid!")
        """
        Caculate the specific_profit (reward)
        """
        #calculate the index
        index = self.start_index + self.current_step + self.hp.window_size - 1
        #get the price
        price = self.close_data.iloc[index, 1]
        #set reward to 0
        reward = 0

        #buy assets
        if self.mode == "buy" and action == 1:
            #save the buyprice
            self.buy_price = price
            #switch mode to sell
            self.mode = "sell"

        #sell assets
        elif self.mode == "sell" and action == 2:
            #calculate profit
            local_specific_profit = price / self.buy_price * (
                1 - self.trading_fee_dez)**2 - 1
            #update episode profit
            self.specific_profit += local_specific_profit
            #set mode to buy
            self.mode = "buy"
            #set reward
            reward = local_specific_profit
        """
        Get new Observation
        """
        #update the current_step
        self.current_step += 1

        #if we are not at the end of this episode yet:
        if self.current_step < self.episode_length:
            #get the new observation
            observation = self.data.iloc[self.start_index +
                                         self.current_step:self.start_index +
                                         self.current_step +
                                         self.hp.window_size].to_numpy()

        #if we reached the end of this episode:
        else:
            #update the doneflag
            self.done = True
            observation = self.data.iloc[self.start_index + self.current_step -
                                         1:self.start_index +
                                         self.current_step - 1 +
                                         self.hp.window_size].to_numpy()

        return observation, reward, self.done, {}

    def reset(self):
        #reset the local run variables
        self.current_step = 0
        self.done = False
        self.mode = "buy"
        self.specific_profit = 0

        #get a random starting point
        self.start_index = random.randrange(
            0,
            self.data.shape[0] - self.episode_length - self.hp.window_size + 1,
            1)

        #get the start_state
        start_state = self.data.iloc[self.start_index +
                                     self.current_step:self.start_index +
                                     self.current_step +
                                     self.hp.window_size].to_numpy()

        return start_state

    def _prepare_data(self):
        #select the features
        data = self.db[self.hp.candlestick_interval, self.hp.features]

        #data operations that can be made on the whole dataset
        data, scaler = TrainDataBase._raw_data_prep(
            data=data,
            derive=self.hp.derivation,
            scaling_method=self.hp.scaling,
            scaler_type=self.hp.scaler_type,
            preloaded_scaler=self.scaler)

        #reset the index
        data.reset_index(inplace=True, drop=True)

        #create the close_data
        close_data = self.db[self.hp.candlestick_interval,
                             ["close_time", "close"]]
        #remove first row
        close_data = close_data.iloc[1:, :]
        #reset index
        close_data.reset_index(inplace=True, drop=True)

        return data, close_data
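A hedged illustration, not from the source, of the round-trip profit used in step() above: buying at buy_price and selling at sell_price with a proportional trading fee charged on both legs gives sell_price / buy_price * (1 - fee)**2 - 1. The helper name round_trip_profit is an assumption.

def round_trip_profit(buy_price, sell_price, trading_fee_dez):
    # fee is applied once when buying and once when selling
    return sell_price / buy_price * (1 - trading_fee_dez) ** 2 - 1

# e.g. buying at 100 and selling at 105 with a 0.1% fee per side:
# round_trip_profit(100, 105, 0.001) ~= 0.0479, i.e. about +4.79%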
コード例 #28
class PocEnv(Env):
    def __init__(self, maze):
        self.board = select_maze(maze)
        self.grid = PocGrid(board=self.board["_maze"])
        self._get_init_state()
        self.action_space = Discrete(4)
        self.observation_space = Discrete(1 << 10)  # 1024
        # self.observation_space = Discrete(14)
        self._reward_range = 100
        self._discount = .95

    def seed(self, seed=None):
        np.random.seed(seed)

    def is_power(self, idx):
        return self.board['_maze'][idx] == 3

    def is_passable(self, idx):
        return self.board['_maze'][idx] != 0

    def _is_valid(self):

        assert self.grid.is_inside(self.state.agent_pos)
        assert self.is_passable(self.state.agent_pos)
        for ghost in self.state.ghosts:
            assert self.grid.is_inside(ghost.pos)
            assert self.is_passable(ghost.pos)

    def _set_state(self, state):
        self.done = False
        self.state = state

    def _generate_legal(self):
        actions = []
        for action in Action:
            if self.grid.is_inside(self.state.agent_pos +
                                   Moves.get_coord(action.value)):
                actions.append(action.value)
        return actions

    def step(self, action):
        assert self.action_space.contains(action)
        assert self.done is False

        reward = -1
        next_pos = self._next_pos(self.state.agent_pos, action)
        if next_pos.is_valid():
            self.state.agent_pos = next_pos
        else:
            reward += -25

        if self.state.power_step > 0:
            self.state.power_step -= 1

        hit_ghost = -1
        for g, ghost in enumerate(self.state.ghosts):
            if ghost.pos == self.state.agent_pos:
                hit_ghost = g
            # move ghost
            self._move_ghost(g, ghost_range=self.board["_ghost_range"])

        if hit_ghost >= 0:
            if self.state.power_step > 0:
                reward += 25
                self.state.ghosts[hit_ghost].reset()
            else:
                reward += -100
                self.done = True

        ob = self._make_ob(action)

        agent_idx = self.grid.get_index(self.state.agent_pos)
        if self.state.food_pos[agent_idx]:
            # consume the pellet, otherwise the same cell keeps paying out
            self.state.food_pos[agent_idx] = 0
            if sum(self.state.food_pos) == 0:
                reward += 1000
                self.done = True
            if self.is_power(self.state.agent_pos):
                self.state.power_step = config["_power_steps"]
            reward += 10

        return ob, reward, self.done, {"state": self.state}

    def _make_ob(self, action):
        # pack the observation as bit flags: see-ghost per direction,
        # passable-neighbour per direction, smell food, hear ghost
        ob = 0
        for d in range(self.action_space.n):
            if self._see_ghost(d) >= 0:
                ob = set_flags(ob, d)
            next_pos = self._next_pos(self.state.agent_pos, direction=d)
            if next_pos.is_valid() and self.is_passable(next_pos):
                ob = set_flags(ob, d + self.action_space.n)
        if self._smell_food():
            ob = set_flags(ob, 8)
        if self._hear_ghost(self.state):
            ob = set_flags(ob, 9)
        return ob

    def _encode_state(self, state):
        poc_idx = self.grid.get_index(state.agent_pos)
        ghosts = [(self.grid.get_index(ghost.pos), ghost.direction)
                  for ghost in state.ghosts]
        return np.concatenate([[poc_idx], *ghosts, state.food_pos,
                               [state.power_step]])

    def _decode_state(self, state):
        poc_state = PocState(Coord(*self.grid.get_coord(state[0])))
        num_ghosts = self.board["_num_ghosts"]
        # mirror _encode_state: [agent_idx, (ghost_idx, ghost_dir) * N, food_pos..., power_step]
        ghosts = np.split(state[1:1 + num_ghosts * 2], num_ghosts)
        for g in ghosts:
            poc_state.ghosts.append(
                Ghost(pos=self.grid.get_coord(g[0]), direction=g[1]))
        poc_state.power_step = state[-1]
        poc_state.food_pos = state[1 + num_ghosts * 2:-1].tolist()
        return poc_state

    def _compute_prob(self, action, next_state, ob):
        return int(ob == self._make_ob(action))

    def _see_ghost(self, action):
        eye_pos = self.state.agent_pos + Moves.get_coord(action)
        while True:
            for g, ghost in enumerate(self.state.ghosts):
                if ghost.pos == eye_pos:
                    return g
            eye_pos += Moves.get_coord(action)
            if not self.grid.is_inside(eye_pos) or not self.is_passable(
                    eye_pos):
                break
        return -1

    def _smell_food(self, smell_range=1):
        for x in range(-smell_range, smell_range + 1):
            for y in range(-smell_range, smell_range + 1):
                smell_pos = self.state.agent_pos + Coord(x, y)
                # only index food_pos once the position is known to be inside the grid
                if self.grid.is_inside(smell_pos) and self.state.food_pos[
                        self.grid.get_index(smell_pos)]:
                    return True
        return False
        return False

    @staticmethod
    def _hear_ghost(poc_state, hear_range=2):
        for ghost in poc_state.ghosts:
            if Grid.manhattan_distance(ghost.pos,
                                       poc_state.agent_pos) <= hear_range:
                return True
        return False

    def render(self, mode='human', close=False):
        pass

    def reset(self):
        self.t = 0
        self.done = False
        self._get_init_state()
        return 0

    def close(self):
        pass

    def _get_init_state(self):
        # create walls
        # for tile in self.grid:
        #     value = config["maze"][tile.key[0]]
        #     self.grid.set_value(value, coord=tile.key)

        self.state = PocState()
        self.state.agent_pos = Coord(*self.board["_poc_home"])
        ghost_home = Coord(*self.board["_ghost_home"])

        for g in range(self.board["_num_ghosts"]):
            pos = Coord(ghost_home.x + g % 2, ghost_home.y + g // 2)
            self.state.ghosts.append(Ghost(pos, direction=-1))

        self.state.food_pos = np.random.binomial(1,
                                                 config["_food_prob"],
                                                 size=self.grid.n_tiles + 1)
        self.state.power_step = 0
        return self.state

    def _next_pos(self, pos, direction):
        direction = Moves.get_coord(direction)
        if pos.x == 0 and pos.y == self.board[
                '_passage_y'] and direction == Moves.EAST:
            next_pos = Coord(self.grid.x_size - 1, pos.y)
        elif pos.x == self.grid.x_size - 1 and pos.y == self.board[
                '_passage_y'] and direction == Moves.WEST:
            next_pos = Coord(0, pos.y)
        else:
            next_pos = pos + direction

        if self.grid.is_inside(next_pos) and self.is_passable(next_pos):
            return next_pos
        else:
            return Coord(-1, -1)

    def _move_ghost(self, g, ghost_range):
        if Grid.manhattan_distance(self.state.agent_pos,
                                   self.state.ghosts[g].pos) < ghost_range:
            if self.state.power_step > 0:
                self._move_defensive(g)
            else:
                self._move_aggressive(g)
        else:
            self._move_random(g)

    def _move_aggressive(self, g, chase_prob=.75):
        if not np.random.binomial(1, p=chase_prob):
            return self._move_random(g)

        best_dist = self.grid.x_size + self.grid.y_size
        best_pos = self.state.ghosts[g].pos
        best_dir = -1
        for d in range(self.action_space.n):
            dist = Grid.directional_distance(self.state.agent_pos,
                                             self.state.ghosts[g].pos, d)
            new_pos = self._next_pos(self.state.ghosts[g].pos, d)
            if dist <= best_dist and new_pos.is_valid() and can_move(
                    self.state.ghosts[g], d):
                best_pos = new_pos
                best_dist = dist
                best_dir = d

        self.state.ghosts[g].update(best_pos, best_dir)

    def _move_defensive(self, g, defensive_prob=.5):
        if np.random.binomial(
                1, defensive_prob) and self.state.ghosts[g].direction >= 0:
            self.state.ghosts[g].direction = -1

        best_dist = self.grid.x_size + self.grid.y_size
        best_pos = self.state.ghosts[g].pos
        best_dir = -1
        for d in range(self.action_space.n):
            dist = Grid.directional_distance(self.state.agent_pos,
                                             self.state.ghosts[g].pos, d)
            new_pos = self._next_pos(self.state.ghosts[g].pos, d)
            if dist >= best_dist and new_pos.is_valid() and can_move(
                    self.state.ghosts[g], d):
                best_pos = new_pos
                best_dist = dist
                best_dir = d

        self.state.ghosts[g].update(best_pos, best_dir)

    def _move_random(self, g):
        # there are no dead ends
        # never switch to opposite direction
        ghost_pos = self.state.ghosts[g].pos
        while True:
            d = self.action_space.sample()
            next_pos = self._next_pos(ghost_pos, d)
            if next_pos.is_valid() and can_move(self.state.ghosts[g], d):
                break

        self.state.ghosts[g].update(next_pos, d)
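A hedged sketch of the 10-bit observation packing used by _make_ob above: bits 0-3 are "see ghost" per direction, bits 4-7 are "passable neighbour" per direction, bit 8 is smell food and bit 9 is hear ghost, hence Discrete(1 << 10). set_flags is not shown in this excerpt, so a plausible definition and a decoder are given purely for illustration.

def set_flags(ob, bit):
    # set a single bit in the packed integer observation
    return ob | (1 << bit)

def decode_ob(ob, n_actions=4):
    # unpack the integer observation into boolean feature groups
    see = [bool(ob & (1 << d)) for d in range(n_actions)]
    passable = [bool(ob & (1 << (d + n_actions))) for d in range(n_actions)]
    smell = bool(ob & (1 << (2 * n_actions)))
    hear = bool(ob & (1 << (2 * n_actions + 1)))
    return see, passable, smell, hear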
コード例 #29
class FourthRobGpw2(gym.Env):
    def __init__(self):
        rospy.init_node("rl_robot_node" + str(ac_num))
        self.pub, self.reset_robot_state = reset_node_init()
        self.act_pub = rospy.Publisher('/robot' + str(ac_num) +
                                       '/pioneer/cmd_vel',
                                       Twist,
                                       queue_size=1)
        self.sub_name = '/robot' + str(ac_num) + '/laser/scan'
        self.laser_sub = rospy.Subscriber(self.sub_name,
                                          LaserScan,
                                          self.laser_callback,
                                          queue_size=1)
        # self.cc_func = correct_coordinate_func()
        # the data is already discretized (no need to normalize?), further converted to integers; linear speed is the 15th state value
        # self.observation_space = gym.spaces.Box(-10.0, 10.0, (15, ), dtype=np.dtype(np.float32))
        # if enable_correction:
        #     self.observation_space = gym.spaces.Box(0, 100, (14,), dtype=np.dtype(np.int32))  # integer version
        # else:
        #     self.observation_space = gym.spaces.Box(-100, 100, (14, ), dtype=np.dtype(np.int32))  # integer version
        # self.observation_space = gym.spaces.Box(0, 1, (14,), dtype=np.dtype(np.float32))  # normalization
        # self.observation_space = gym.spaces.Box(-10.0, 10.0, (4, ), dtype=np.dtype(np.float32))
        self.observation_space = gym.spaces.Box(0,
                                                100, (12, ),
                                                dtype=np.dtype(np.int32))
        self.action_space = Discrete(5)  # values 0,1,2,3,4,5; should combined actions be added?
        self.state = None
        # self.laser_data = [0 for i in range(0, 10)]
        self.num_laser = 10  # do not change casually; some related data does not use this parameter
        self.laser_data = None
        self.sample_laser_data = [0 for _ in range(0, self.num_laser)
                                  ]  # sampled laser data, positions from right to left
        self.int_sample_laser_data = [0 for _ in range(0, self.num_laser)
                                      ]  # sampled laser data, right to left, integer version
        # self.state_sample_laser_data = [0 for x in range(0, self.num_laser)]  # sampled laser data, right to left, normalized
        # used to correct the laser data
        self.last_laser_data = [0 for x in range(0, self.num_laser)]
        # self.current_sate = None
        # self.speed = 0.0  # after running for a while the recorded speed/angle drifts noticeably from the actual (gazebo) values
        self.state_speed = 0.0  # speed used as state, discretized to an integer
        self.state_angle = 0.0  # angular speed
        self.min_speed = 0.0  # -0.75  # linear speed m/s; angular speed rad/s (6 rad is roughly a full turn); reversing would need a rear-mounted range sensor
        self.max_speed = 0.5  # speed must not exceed 0.75 (errors already appear); 0.5 m/s because obstacles are dense here
        # self.angle = 0.0
        self.min_angle = -0.5
        self.max_angle = 0.5
        self.cmd_twist = Twist()
        self.done = False
        # self.thread1 = LaserNodeThread()
        # self.thread1.setDaemon(True)
        # self.thread1.start()
        # service for getting the model state
        rospy.wait_for_service('gazebo/get_model_state')
        self.get_state_srv = rospy.ServiceProxy('/gazebo/get_model_state',
                                                GetModelState)
        # service for resetting the model state
        rospy.wait_for_service('gazebo/set_model_state')
        self.set_obstacle_srv = rospy.ServiceProxy('/gazebo/set_model_state',
                                                   SetModelState)
        # service for resetting the simulation
        rospy.wait_for_service('/gazebo/reset_simulation')
        self.reset_sim_srv = rospy.ServiceProxy('/gazebo/reset_simulation',
                                                Empty)
        # self.obstacle_pose = [[0 for x in range(0, 2)] for x in range(0, 20)]  # positions [x, y] of the 20 obstacles
        # robot data returned by gazebo
        self.gazebo_robot_pose_x = 0.0  # noise could be injected; initialized to 0 here, which makes the very first angle computation below wrong
        self.gazebo_robot_pose_y = 0.0
        self.gazebo_robot_speed = 0.0
        self.gazebo_robot_angle = 0.0
        self.robot_orientation_z = 0
        self.robot_orientation_w = 0
        self.true_state_distance = 0.0  # self.state_distance is post-processed and no longer the true distance, so this one is used to check goal arrival
        self.state_distance = 0.0  # used as state input and to compare per-step rewards
        self.state_angle = 0.0
        self.state_polar_angle = 0.0  # robot heading relative to the x axis
        self.last_robot_pose_x = 0.0
        self.last_robot_pose_y = 0.0
        # reward-related settings
        self.arrive_reward = 100.0
        self.collision_reward = -100.0
        self.road_reward_times = 1  # 0.1
        self.time_cost_reward = -0.1
        self.last_distance_to_goal = 0.0  # distance to the goal at the previous action
        self.distance_to_goal_threshold = 0.10  # 0.20
        self.collision_threshold = 0.35  # at least wider than the body (0.5); 0.43 and 0.35 work well, anything smaller does not
        # self.action_0 = False
        # self.distance_r_sate = 0
        # final robot goal pose
        self.goal_x = 9.0  # 8
        self.goal_y = 6.0  # 8
        # self.desire_final_speed = None
        # self.desire_final_angle = None
        self.eps = 0  # counts the current episode
        # self.get_obstacle_pose()
        self.seed()
        self.steps = 0  # for debug
        self.total_steps = 0
        self.trace_need = 1
        # self.reset()  # initialize once here; the dqn code calls reset() first anyway

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def laser_callback(self, msg):
        self.laser_data = msg.ranges

    # using random goals should speed up learning and improve generalization
    # def random_goal(self):
    #     # print("random_goal")
    #     # RandomState(ac_num)
    #     # not clear what openAI did to random; results were odd, so use the uniform distribution from baselines
    #     self.goal_x = self.np_random.uniform(low=-9.0, high=9.0)
    #     self.goal_y = self.np_random.uniform(low=-9.0, high=9.0)
    #     # self.goal_x = (18 * (np.random.ranf() - 0.5))  # x~(-9,9)
    #     # self.goal_y = (18 * (np.random.ranf() - 0.5))  # y~(-9,9)
    #     # print(str(self.goal_x) +"  " +str(self.goal_y))
    #     # note this design: whenever the goal is invalid, re-randomize it and re-check against the spawn point and "all" obstacle points
    #     i = -1
    #     while i < 19:
    #         i += 1
    #         while self.compute_distance(self.goal_x, self.reset_robot_state.pose.position.x, self.goal_y,
    #                                     self.reset_robot_state.pose.position.y) < 1.0 \
    #                 or (self.compute_distance(self.goal_x, self.obstacle_pose[i][0],
    #                                           self.goal_y, self.obstacle_pose[i][1])) < 1.2:
    #             # self.goal_x = (18 * (np.random.ranf() - 0.5))  # too close to the initial position or to some obstacle
    #             # self.goal_y = (18 * (np.random.ranf() - 0.5))
    #             self.goal_x = self.np_random.uniform(low=-9.0, high=9.0)
    #             self.goal_y = self.np_random.uniform(low=-9.0, high=9.0)
    #             i = -1
    #
    #     self.goal_x = round(self.goal_x, 1)  # discretize
    #     self.goal_y = round(self.goal_y, 1)
    #     # print("end random_goal")

    # fixed version
    def reset_goal(self):
        goal_num = self.np_random.randint(0, 3)
        # self.goal_x = self.cc_func(round(goal_sequence[ac_num][goal_num][0], 1))
        # self.goal_y = self.cc_func(round(goal_sequence[ac_num][goal_num][1], 1))
        self.goal_x = round(goal_sequence[ac_num][goal_num][0], 1)
        self.goal_y = round(goal_sequence[ac_num][goal_num][1], 1)

    def initial_pose(self):
        pose_num = self.np_random.randint(0, 4)
        # self.reset_robot_state.pose.position.x = self.cc_func(round(initial_sequence[ac_num][goal_num][0], 1))
        # self.reset_robot_state.pose.position.y = self.cc_func(round(initial_sequence[ac_num][goal_num][1], 1))
        self.reset_robot_state.pose.position.x = round(
            initial_sequence[ac_num][pose_num][0], 1)
        self.reset_robot_state.pose.position.y = round(
            initial_sequence[ac_num][pose_num][1], 1)
        # print('init_actor' + str(ac_num) + ' --> x: ' + str(self.reset_robot_state.pose.position.x)
        #       + ' y: ' + str(self.reset_robot_state.pose.position.y))

    @staticmethod
    def compute_distance(lhx, lhy, rhx, rhy):
        return math.sqrt((lhx - rhx)**2 + (lhy - rhy)**2)

    # angle of the goal relative to the robot's current position
    def cpose_goal_angle(self):
        return math.atan2(self.goal_y - self.gazebo_robot_pose_y,
                          self.goal_x - self.gazebo_robot_pose_x)  # (-pi,pi]

    # # compute the robot heading from two consecutive moves; with tiny displacements (especially under noise) this can hardly be accurate, an IMU may be needed
    # def compute_polar_coordinates(self):
    #     edge_a = self.compute_distance(self.last_robot_pose_x, self.last_robot_pose_y,
    #                                    self.gazebo_robot_pose_x, self.gazebo_robot_pose_y)
    #     edge_b = self.compute_distance(self.last_robot_pose_x, self.last_robot_pose_y, self.goal_x, self.goal_y)
    #     edge_c = self.compute_distance(self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.goal_x, self.goal_y)
    #     cos_angle_beta = (edge_a**2 + edge_c**2 - edge_b**2)/(2*edge_a*edge_c)
    #     self.state_distance = edge_c  # value still needs post-processing
    #     self.state_angle = math.pi - math.acos(cos_angle_beta)  # value still needs post-processing

    # def distance_to_obstacle(self, num_x, num_y, count):
    #     return math.sqrt((self.obstacle_pose[count][0] - num_x) ** 2 + (self.obstacle_pose[count][1] - num_y) ** 2)

    # Do not use this together with normalization_state_input()
    def state_speed_wrapper(self):
        # print(self.gazebo_robot_speed)
        # print(self.gazebo_robot_angle)
        self.state_speed = round(self.gazebo_robot_speed, 4) * 200
        self.state_speed = int(self.state_speed)  # (0~100)
        self.state_speed = np.clip(self.state_speed, 0, 100)
        self.state_angle = round(self.gazebo_robot_angle, 4) * 100
        self.state_angle = int(self.state_angle) + 50  # (0~100)
        self.state_angle = np.clip(self.state_angle, 0, 100)
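    # Illustrative mapping of the wrapper above (example values, not taken from the source):
    #   gazebo_robot_speed = 0.35  -> round(0.35, 4) * 200 = 70.0  -> state_speed = 70
    #   gazebo_robot_angle = -0.20 -> round(-0.20, 4) * 100 = -20.0 -> int(-20.0) + 50 = 30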

    def reset(self):
        self.total_steps += self.steps
        self.steps = 0  # for debug
        self.eps += 1
        # print('call_reset')
        self.done = False
        # Re-randomize every 10 episodes (and on the very first one)
        if self.eps % 10 == 0 or self.eps == 1:
            self.reset_goal()
            # self.initial_pose()
            # Also randomize the robot's initial heading
            self.reset_robot_state.pose.orientation.z = self.np_random.uniform(
                low=-0.9, high=0.9)
            self.reset_robot_state.pose.orientation.w = self.np_random.uniform(
                low=-0.9, high=0.9)
            print('actor' + str(ac_num) + ' --> x: ' + str(self.goal_x) +
                  ' y: ' + str(self.goal_y))
        # Reset the robot's linear and angular speed; mind the order of these steps
        self.cmd_twist.linear.x = 0.0
        self.cmd_twist.angular.z = 0.0
        # Teleport the robot back to its start pose
        start_time = time.time()
        while (time.time() - start_time) < 0.25:  # reset by publishing instead of calling the srv directly
            self.act_pub.publish(self.cmd_twist)  # zero the velocity first
            self.pub.publish(self.reset_robot_state)
            time.sleep(0.05)
        # time.sleep(0.5)  # the laser data needs some time to refresh after the reset...
        # Reset the pose/speed/angle parts of the state
        # Reset the simulation roughly every 100k steps
        if self.total_steps >= (self.trace_need * 10000):
            # self.reset_sim_srv()
            self.trace_need += 1

        (self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed,
         self.gazebo_robot_angle, self.state_polar_angle) = self.get_gazebo_state()
        self.gazebo_robot_speed = 0.0
        self.gazebo_robot_angle = 0.0
        # self.laser_data = self.thread1.laser_data
        self.laser_sub = rospy.Subscriber(self.sub_name,
                                          LaserScan,
                                          self.laser_callback,
                                          queue_size=1)
        self.last_robot_pose_x = self.gazebo_robot_pose_x
        self.last_robot_pose_y = self.gazebo_robot_pose_y
        self.sample_fuc()
        self.get_rid_of_bad_data()
        self.int_sample_fuc()
        # reset() must return the initial observation
        # self.state = np.array([])  # used when only the goal forms the state
        self.state_speed_wrapper()
        self.true_state_distance = self.distance_to_goal()
        if self.true_state_distance >= 10:
            self.true_state_distance = 10
        # self.normalization_state_input()
        # self.discretization_state_input()
        # print(self.sample_laser_data)
        # print(self.rew_sample_laser_data)
        self.state = np.array(self.int_sample_laser_data[:])
        # self.state = np.array(self.state_sample_laser_data[:])
        # self.state = np.append(self.state, [self.state_distance, self.state_polar_angle,
        #                                     self.state_speed, self.state_angle])
        # self.state = np.append(self.state, [int(self.true_state_distance*6), int(self.state_polar_angle),
        #                                     self.state_speed, self.state_angle])
        self.state_distance = int(round(self.true_state_distance * 6))
        self.state = np.append(
            self.state, [self.state_distance,
                         int(self.state_polar_angle)])
        self.last_distance_to_goal = self.state_distance
        self.last_laser_data = self.sample_laser_data[:]
        # self.state = self.state.reshape((14,))
        self.state = self.state.reshape((12, ))
        return self.state
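    # Note on the observation layout (a reading of reset() above and step() below, stated here for
    # orientation): 10 subsampled laser ranges scaled to integers, followed by the goal-distance bucket
    # int(round(true_state_distance * 6)) with the distance capped at 10, and the goal bearing bucket
    # int(state_polar_angle) -- 12 entries in total, matching the reshape((12,)).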

    def step(self, action):
        self.steps += 1
        assert self.action_space.contains(
            action), "%r (%s) invalid" % (action, type(action))
        # The robot's acceleration is limited, so choose these values carefully. Two options: make the increments
        # very small (not ideal, the robot then reacts slowly), or cap the publish rate below 20 Hz (with +0.1 steps)
        # print("action:"+str(action))
        # print("gazebo: speed" + str(self.gazebo_robot_speed))
        # action = 0  # testing
        # if action == 0:
        #     self.gazebo_robot_speed += 0.05
        # elif action == 1:
        #     self.gazebo_robot_speed -= 0.05
        # # elif action == 2:
        # #     self.gazebo_robot_speed = 0.0
        # elif action == 2:
        #     self.gazebo_robot_angle += 0.05
        # elif action == 3:
        #     self.gazebo_robot_angle -= 0.05
        # # elif action == 5:
        # #     self.gazebo_robot_angle = 0.0
        # elif action == 4:
        #     pass  # keep the current state
        # else:
        #     gym.logger.warn("Unknown situation !")
        if action == 0:
            self.gazebo_robot_speed = 0.10  # the speed must not jump to a large value or the robot loses control; keep it at or below 0.1
            # self.action_0 = True
        elif action == 1:
            self.gazebo_robot_speed = 0.0
            # self.action_0 = False
        # elif action == 2:
        #     self.gazebo_robot_speed = 0.0
        elif action == 2:
            self.gazebo_robot_angle = +0.10
            # self.action_0 = False
        elif action == 3:
            self.gazebo_robot_angle = -0.10
            # self.action_0 = False
        # elif action == 5:
        #     self.gazebo_robot_angle = 0.0
        elif action == 4:
            self.gazebo_robot_angle = 0
            # self.action_0 = False
        else:
            gym.logger.warn("Unknown situation !")

        self.gazebo_robot_speed = np.clip(self.gazebo_robot_speed,
                                          self.min_speed, self.max_speed)
        self.gazebo_robot_angle = np.clip(self.gazebo_robot_angle,
                                          self.min_angle, self.max_angle)

        # Unlike many other stacks, p2os keeps the robot at the last cmd_vel it was given,
        # self.cmd_twist.linear.x = 0
        # In Gazebo the robot reports tiny cmd_twist.linear.x / cmd_twist.angular.z values even with no command;
        # publishing them back unchanged only makes the error grow
        if abs(self.gazebo_robot_speed) < 0.025:
            self.gazebo_robot_speed = 0.0
        if abs(self.gazebo_robot_angle) < 0.025:
            self.gazebo_robot_angle = 0.0
        self.cmd_twist.linear.x = self.gazebo_robot_speed
        self.cmd_twist.angular.z = self.gazebo_robot_angle
        # attention! testing!
        # self.cmd_twist.linear.x = 0.5
        # self.cmd_twist.angular.z = 0.0
        # print("gazebo: speed" + str(self.gazebo_robot_speed) + ",  angle: " + str(self.gazebo_robot_angle))
        # Better design: a dedicated service (or a thread) that receives the latest action and publishes cmd_twist
        time.sleep(0.085)  # 0.075
        self.act_pub.publish(self.cmd_twist)

        # Record part of the previous step's data
        # self.laser_data = self.thread1.laser_data
        self.laser_sub = rospy.Subscriber(self.sub_name,
                                          LaserScan,
                                          self.laser_callback,
                                          queue_size=1)
        self.last_robot_pose_x = self.gazebo_robot_pose_x
        self.last_robot_pose_y = self.gazebo_robot_pose_y
        # After the action, read the state back (the rigor here is debatable: this is not strictly
        # one action -> one returned state)
        (self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed,
         self.gazebo_robot_angle, self.state_polar_angle) = self.get_gazebo_state()
        self.sample_fuc()
        self.get_rid_of_bad_data()
        self.true_state_distance = self.distance_to_goal()
        if self.true_state_distance >= 10:
            self.true_state_distance = 10
        # print(self.state_speed)
        # print(self.state_angle)
        # self.normalization_state_input()
        # self.discretization_state_input()
        self.int_sample_fuc()
        # print(self.sample_laser_data)
        # print(self.rew_sample_laser_data)
        self.state = np.array(self.int_sample_laser_data[:])
        # self.state = np.array(self.state_sample_laser_data[:])
        # self.state = np.array([])
        self.state_speed_wrapper()
        self.state_distance = int(round(self.true_state_distance * 6))
        self.state = np.append(
            self.state, [self.state_distance,
                         int(self.state_polar_angle)])
        # self.state_speed, self.state_angle])
        # self.state = np.append(self.state, [self.state_distance, self.state_polar_angle,
        #                                     self.state_speed, self.state_angle])
        # self.state = np.append(self.state, [int(self.goal_x*10), int(self.goal_y*10),
        #                                     int(self.gazebo_robot_pose_x*10), int(self.gazebo_robot_pose_y*10),
        #                                     int(self.last_robot_pose_x*10), int(self.last_robot_pose_y*10),
        #                                     self.state_speed, self.state_angle])
        # print(self.sample_laser_data)
        # The final pose should probably also be a constraint; min(distance) = 0.25 is debatable since the rangefinder sits at the front of the vehicle
        if self.true_state_distance < self.distance_to_goal_threshold \
                or min(self.sample_laser_data) < self.collision_threshold:  # and (self.gazebo_robot_speed < 0.1):
            self.done = True
            # print("SPACE")
            # print(self.sample_laser_data)
            # print(self.last_laser_data)
            # print("x: " + str(self.gazebo_robot_pose_x) + "y: " + str(self.gazebo_robot_pose_y))
            # print(self.true_state_distance)
            if self.steps <= 10:  # for debug
                print("robot: " + str(ac_num) + " : ")
                print(self.sample_laser_data)
                # print(self.last_laser_data)
                print("goal_x: " + str(self.goal_x) + "goal_y: " +
                      str(self.goal_y))
                print("init_x: " + str(self.gazebo_robot_pose_x) + "init_y: " +
                      str(self.gazebo_robot_pose_y))
                # print(self.int_sample_laser_data)
        # self.state = self.state.reshape((14, ))
        self.state = self.state.reshape((12, ))
        reward_num = self.reward()  # the reward must be computed before last_distance_to_goal is updated
        self.last_distance_to_goal = self.state_distance  # update the distance for the next reward computation
        self.last_laser_data = self.sample_laser_data[:]  # careful, a classic pitfall: take a copy!

        # print("actor"+str(ac_num)+": ")
        # print(self.state)
        return self.state, reward_num, self.done, {}

    def distance_to_goal(self):
        return math.sqrt((self.goal_x - self.gazebo_robot_pose_x)**2 +
                         (self.goal_y - self.gazebo_robot_pose_y)**2)

    def reward(self):
        # The final pose should probably also constrain the reward, consistent with done; should the
        # goal-distance and obstacle-distance terms share a combined reward?
        if self.true_state_distance < self.distance_to_goal_threshold:
            return self.arrive_reward
        # The raw laser data is noisy (readings can even drop to 0.2 ???), so it cannot be trusted
        # directly for this obstacle-distance check
        elif min(self.sample_laser_data) < self.collision_threshold:
            # print(self.sample_laser_data)
            # print("error")
            return self.collision_reward
        # Choose the distance-related reward function carefully to avoid possible reward farming
        # by the robot (ratio to arrive_reward, signs, and so on)
        # The goal-distance term should speed up learning, but it is tricky: once past the goal,
        # running farther away should give a negative reward
        # Option 1
        # else:
        #     return self.road_reward_times * (self.last_distance_ro_goal-self.distance_to_goal())
        # Option 2
        else:  # should be set up so that always choosing action_1 scores negatively
            # if self.action_0 is True:
            #     plus_reward = 0.1
            # else:
            #     plus_reward = 0
            # if self.last_distance_to_goal - self.state_distance > 0:  # getting closer
            #     return 1.0 * self.road_reward_times + plus_reward
            # elif self.last_distance_to_goal - self.state_distance < 0:  # penalty for moving away
            #     return -0.5 * self.road_reward_times + plus_reward  # the penalty could be increased
            # else:
            #     return 0 + plus_reward
            if self.last_distance_to_goal - self.state_distance > 0:  # getting closer
                return 1.0 * self.road_reward_times + self.time_cost_reward
            elif self.last_distance_to_goal - self.state_distance < 0:  # penalty for moving away or standing still
                return -0.5 * self.road_reward_times + self.time_cost_reward  # the penalty for moving away could be increased
            else:
                return 0 + self.time_cost_reward
        # else:
        #     return 0.0
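    # Illustrative outcomes of the reward scheme above (actual values depend on the configured constants):
    #   goal reached                      -> arrive_reward
    #   laser minimum below the threshold -> collision_reward
    #   one distance bucket closer        -> 1.0 * road_reward_times + time_cost_reward
    #   one or more buckets farther away  -> -0.5 * road_reward_times + time_cost_reward
    #   same distance bucket              -> time_cost_reward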

    def render(self, mode='human'):
        pass

    # @staticmethod
    # def normalization_func(max_num, min_num, x):
    #     return (x - min_num)/(max_num - min_num)
    #
    # def normalization_state_input(self):
    #     for i in range(0, 10):
    #         self.state_sample_laser_data[i] = self.normalization_func(10, 0, self.sample_laser_data[i])
    #     self.state_polar_angle = self.normalization_func(2*math.pi, 0, self.state_polar_angle)
    #     self.state_distance = self.normalization_func(14.14, 0, self.true_state_distance)  # sqrt(200)
    #     self.state_speed = self.normalization_func(0.5, 0, self.gazebo_robot_speed)
    #     if self.state_speed < 0:  # why?
    #         self.state_speed = 0.0
    #     self.state_angle = self.normalization_func(0.5, -0.5, self.gazebo_robot_angle)
    #
    # # round() is not very precise about how it rounds; see http://www.runoob.com/w3cnote/python-round-func-note.html
    # def discretization_state_input(self):
    #     for i in range(0, 10):
    #         self.state_sample_laser_data[i] = round(self.state_sample_laser_data[i], 2)
    #     self.state_polar_angle = round(self.state_polar_angle, 2)
    #     self.state_distance = round(self.state_distance, 2)
    #     self.state_speed = round(self.state_speed, 2)
    #     self.state_angle = round(self.state_angle, 2)

    # Maybe this should use the robot's own sensors instead, since a real robot does not have this service
    # @staticmethod
    def get_gazebo_state(self):
        # First argument is model_name, second is relative_entity_name
        model_state = self.get_state_srv(
            'robot' + str(ac_num),
            '')  # note that model_state is a GetModelStateResponse
        robot_pose_x = round(model_state.pose.position.x,
                             1)  # pose is (x, y, z); discretized
        robot_pose_y = round(model_state.pose.position.y, 1)
        robot_twist_linear_x = round(model_state.twist.linear.x, 4)  # discretize
        robot_twist_angular_z = round(model_state.twist.angular.z,
                                      4)  # angular.z is the planar angular velocity (z is the rotation axis)
        if model_state.pose.orientation.z < 0:  # robot_polar_angle ends up in [-2pi, 2pi] -- why?
            robot_polar_angle = -2 * math.acos(model_state.pose.orientation.w)
        else:
            robot_polar_angle = 2 * math.acos(model_state.pose.orientation.w)
        # print(robot_polar_angle)
        # print(model_state.pose.orientation.w)
        # if robot_polar_angle < 0:  # wrap into [0, 2pi]
        #     robot_polar_angle += 2*math.pi  # robot_polar_angle is the heading relative to the default frame
        if robot_polar_angle > math.pi:  # wrap into (-pi, pi]
            robot_polar_angle -= 2 * math.pi  # robot_polar_angle is the heading relative to the default frame
        elif robot_polar_angle <= -math.pi:
            robot_polar_angle += 2 * math.pi
        # print(robot_polar_angle/math.pi * 180)
        robot_to_goal_angle = self.cpose_goal_angle() - robot_polar_angle  # this difference lands in (-2pi, 2pi]
        if robot_to_goal_angle > math.pi:  # wrap into (-pi, pi]
            robot_to_goal_angle -= 2 * math.pi
        elif robot_to_goal_angle <= -math.pi:
            robot_to_goal_angle += 2 * math.pi
        # print("angle of the robot relative to the world axes")
        # print(robot_polar_angle / math.pi * 180)
        # print("angle of the goal relative to the robot-frame axes")
        # print(self.cpose_goal_angle() / math.pi * 180)
        # print("angle of the goal relative to the robot")
        # print(robot_to_goal_angle/math.pi * 180)
        # robot_polar_angle = round(robot_polar_angle*10, 1)  # comment this line out when using normalization
        robot_to_goal_angle += math.pi  # shift into (0, 2pi]
        # robot_to_goal_angle = round(robot_to_goal_angle*10, 1)  # comment this line out when using normalization
        robot_to_goal_angle = round(robot_to_goal_angle *
                                    10)  # comment this line out when using normalization
        # robot_orientation_z = round(50*(model_state.pose.orientation.z+1), 1)  # z and w are both in [-1, 1]
        # robot_orientation_w = round(50*(model_state.pose.orientation.w+1), 1)
        # return self.cc_func(robot_pose_x), self.cc_func(robot_pose_y), robot_twist_linear_x, robot_twist_angular_z
        return robot_pose_x, robot_pose_y, robot_twist_linear_x, robot_twist_angular_z, robot_to_goal_angle

    # The current implementation uses a different approach
    # Fetch the obstacle positions
    # def get_obstacle_pose(self):
    #     # print('call get_obstacle_pose')
    #     for i in range(0, 15):  # 15 boxes
    #         obstacle_pose = self.get_state_srv('unit_box_' + str(i), '')
    #         self.obstacle_pose[i][0] = obstacle_pose.pose.position.x
    #         self.obstacle_pose[i][1] = obstacle_pose.pose.position.y
    #
    #     for i in range(0, 5):  # 5 cylinders; spheres roll around too easily, so they are not used
    #         obstacle_pose = self.get_state_srv('unit_cylinder_' + str(i), '')
    #         self.obstacle_pose[i + 15][0] = obstacle_pose.pose.position.x
    #         self.obstacle_pose[i + 15][1] = obstacle_pose.pose.position.y
    #     # print('end call get_obstacle_pose')
    #
    # # Randomly reset the obstacle position coordinates
    # def reset_obstacle_pose(self):
    #     for i in range(0, 20):
    #         self.obstacle_pose[i][0] = (20 * (np.random.ranf() - 0.5))  # x~(-10,10)
    #         while abs(self.obstacle_pose[i][0]) < 1.0:
    #             self.obstacle_pose[i][0] = (20 * (np.random.ranf() - 0.5))  # too close to the start pose
    #         self.obstacle_pose[i][0] = round(self.obstacle_pose[i][0], 1)  # discretize
    #         self.obstacle_pose[i][1] = (20 * (np.random.ranf() - 0.5))  # y~(-10,10)
    #         while abs(self.obstacle_pose[i][1]) < 1.0:
    #             self.obstacle_pose[i][1] = (20 * (np.random.ranf() - 0.5))  # too close to the start pose
    #         self.obstacle_pose[i][1] = round(self.obstacle_pose[i][1], 1)
    #
    # # Reset the obstacles via the service
    # def reset_obstacle_pose_srv(self):
    #     new_obstacle_state = ModelState()
    #
    #     new_obstacle_state.pose.orientation.x = 0
    #     new_obstacle_state.pose.orientation.y = 0
    #     new_obstacle_state.pose.orientation.z = 0
    #     new_obstacle_state.pose.orientation.w = 0
    #
    #     new_obstacle_state.twist.linear.x = 0
    #     new_obstacle_state.twist.linear.y = 0
    #     new_obstacle_state.twist.linear.z = 0
    #
    #     new_obstacle_state.twist.angular.x = 0
    #     new_obstacle_state.twist.angular.y = 0
    #     new_obstacle_state.twist.angular.z = 0
    #
    #     new_obstacle_state.reference_frame = "world"
    #     for i in range(0, 15):  # 15 boxes
    #         new_obstacle_state.model_name = "unit_box_"+str(i)
    #         new_obstacle_state.pose.position.x = self.obstacle_pose[i][0]
    #         new_obstacle_state.pose.position.y = self.obstacle_pose[i][1]
    #         new_obstacle_state.pose.position.z = 0
    #
    #         self.set_obstacle_srv(new_obstacle_state)
    #
    #     for i in range(0, 5):  # 5 cylinders; spheres roll around too easily, so they are not used
    #         new_obstacle_state.model_name = "unit_cylinder_"+str(i)
    #         new_obstacle_state.pose.position.x = self.obstacle_pose[i+15][0]
    #         new_obstacle_state.pose.position.y = self.obstacle_pose[i+15][1]
    #         new_obstacle_state.pose.position.z = 0
    #
    #         self.set_obstacle_srv(new_obstacle_state)

    # Sample 10 float readings from the raw laser scan, roughly one every 20 degrees;
    # so that the robots can detect each other the laser is mounted at the front of the body, but the middle of
    # the body would be more sensible, so the measured laser data should be shifted by 0.15
    def sample_fuc(self):
        self.sample_laser_data[0] = self.laser_data[0]
        self.sample_laser_data[-1] = self.laser_data[-1]
        for i in range(1, 9):
            self.sample_laser_data[i] = self.laser_data[i * 72]
        for j in range(0, 10):
            # temp_angle = (math.pi / 9) * j  # radians
            # self.sample_laser_data[j] += math.sin(temp_angle) * 0.15  # correct to the body center; meant for a front-mounted laser
            self.sample_laser_data[j] = round(
                self.sample_laser_data[j],
                1)  # discretize to 1 decimal place; note round() is neither truncation nor strict rounding -- maybe 2 decimals would be better?
            # if self.sample_laser_data[i] == float('inf'):
            if self.sample_laser_data[j] > 10.0:
                self.sample_laser_data[j] = 10.0
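    # Illustrative index pattern for the subsampling above, assuming a 720-beam scan (the beam count is
    # an assumption, it is not shown here): indices 0, 72, 144, ..., 576 plus the last beam -> 10 readings.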

    # # This should be called after sample_fuc(); it keeps several laser scans in one array, self.rew_sample_laser_data is 3x10 (for now)
    # def add_check_laser_data(self):
    #     data = self.sample_laser_data
    #     print(data)
    #     if self.next_nrsld_index >= len(self.rew_sample_laser_data):
    #         self.rew_sample_laser_data.append(data)
    #     else:
    #         self.rew_sample_laser_data[self.next_nrsld_index] = data
    #     self.next_nrsld_index = (self.next_nrsld_index + 1) % self.num_rew_sample_laser_data
    #     print(self.next_nrsld_index)
    # def add_check_laser_data(self):
    #     data = self.sample_laser_data
    #     self.rew_sample_laser_data[self.next_nrsld_index] = data
    #     self.next_nrsld_index = (self.next_nrsld_index + 1) % self.num_rew_sample_laser_data
    #     print("what the hell?")
    #     print(self.rew_sample_laser_data)
    #     print(self.next_nrsld_index)
    #
    # # Used to recover laser readings below 0.3
    # def get_rid_of_bad_data(self):
    #     for i in range(self.num_laser):
    #         if self.sample_laser_data[i] < 0.3:
    #             for j in range(self.num_rew_sample_laser_data):
    #                 if self.rew_sample_laser_data[j][i] > 0.3:
    #                     self.sample_laser_data[i] = self.rew_sample_laser_data[j][i]
    #                 else:
    #                     pass
    #         else:
    #             pass

    # Recover laser readings below 0.3; maybe what should be checked is the gap between readings at the same
    # index (keeping several past scans rather than one would make that comparison easier)
    def get_rid_of_bad_data(self):
        for i in range(self.num_laser):
            if self.sample_laser_data[i] < 0.3:
                # print("check")
                # print(self.sample_laser_data)
                # print(self.last_laser_data)
                self.sample_laser_data[i] = self.last_laser_data[i]
            else:
                pass

    # Do not use this together with normalization_state_input()
    def int_sample_fuc(self):
        for j in range(0, 10):
            self.int_sample_laser_data[j] = int(self.sample_laser_data[j] *
                                                10)  # convert to an integer
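    # Example of the integer conversion above: a clipped reading of 10.0 m becomes 100, 2.5 m becomes 25.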

    # Found another cause; this code was abandoned
    # # Produce a minimum value robust to noise; also needs extra constraints (the inf case), e.g. a sudden drop (noise) counts as False
    # def check_min_laser_data(self):
    #     mean = 0.0
    #     sigle_bool_check_data = [0 for x in range(0, self.num_rew_sample_laser_data)]
    #     sigle_bool_bool = [0 for x in range(0, self.num_rew_sample_laser_data)]
    #     for i in range(self.num_laser):
    #         for j in range(self.num_rew_sample_laser_data):
    #             sigle_bool_check_data[j] = self.rew_sample_laser_data[j][i]
    #             mean = sum(sigle_bool_check_data)/len(sigle_bool_check_data)
    #         for k in range(self.num_rew_sample_laser_data):
    #             if (sigle_bool_check_data[k] - mean) > 0.1:   # (a single-step change above 0.1 is unlikely in this setting)
    #                 sigle_bool_bool[k] = False  # classified as a noisy reading
    #             else:
    #                 sigle_bool_bool[k] = True
    #         if False in sigle_bool_bool:
    #             self.bool_check_laser_data[i] = False  # that index is classified as noisy
    #     # Iteratively look for the minimum value -- unfinished

    # def __del__(self):
    #     # Wait for the thread to terminate: this blocks the calling thread until the joined thread
    #     # exits -- normally, via an unhandled exception, or when the optional timeout elapses.
    #     self.thread1.join()  # still not right
    #
    def close(self):
        pass
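The heading/bearing handling in get_gazebo_state() above boils down to wrapping an angle difference into (-pi, pi] and then shifting it into (0, 2pi] before scaling. A minimal standalone sketch of that wrapping step (the function name wrap_to_pi is mine, not from the source):

import math


def wrap_to_pi(angle):
    """Wrap an arbitrary angle in radians into the interval (-pi, pi]."""
    while angle > math.pi:
        angle -= 2 * math.pi
    while angle <= -math.pi:
        angle += 2 * math.pi
    return angle


# Example: a bearing difference of 1.5*pi wraps around to -0.5*pi.
assert abs(wrap_to_pi(1.5 * math.pi) - (-0.5 * math.pi)) < 1e-9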
コード例 #30
0
ファイル: string_game.py プロジェクト: amazyra/ReAgent
class StringGameEnv(Env):
    def __init__(self, max_steps=MAX_STEP):
        np.random.seed(123)
        torch.manual_seed(123)
        self.max_steps = max_steps
        self.reward_map = {}
        self._init_reward()
        logger.debug(self.reward_map)
        self.recent_actions = deque([], maxlen=SEQ_LEN)
        self.recent_states = deque([], maxlen=SEQ_LEN)
        self.cur_state = None
        self.action_space = Discrete(ACTION_DIM)
        self.observation_space = Box(low=0, high=1, shape=(STATE_DIM,))
        self.step_cnt = 0
        self.reset()

    def _init_reward(self):
        for seq_len in range(1, SEQ_LEN + 1):
            for k in itertools.product(CHARACTERS, repeat=seq_len):
                self.reward_map["".join(k)] = 0
        self.reward_map["ABB"] = 5
        self.reward_map["BBB"] = -5

    def seed(self, seed=None):
        np.random.seed(seed)
        torch.manual_seed(seed)

    @staticmethod
    def random_action():
        return np.random.randint(0, ACTION_DIM)

    def get_reward(self):
        """
        The function you can write to customize rewards. In this
        specific environment, the reward only depends on action history
        """
        recent_characters = [CHARACTERS[c] for c in list(self.recent_actions)]
        string = "".join(recent_characters)
        reward = self.reward_map[string]
        return reward, string

    def step(self, action):
        assert self.action_space.contains(action)
        assert self.done is False
        self.step_cnt += 1

        self.recent_states.append(self.cur_state)
        self.recent_actions.append(action)
        reward, info = self.get_reward()
        if self.step_cnt >= self.max_steps:
            self.done = True
        ob = self.get_observation()
        self.cur_state = ob

        return ob, reward, self.done, {"reward_str": info}

    def get_observation(self):
        """
        The function you can write to customize transitions. In this
        specific environment, the next state is exactly the latest action taken.
        The initial observation is all zeros.
        """
        ob = np.zeros(STATE_DIM)
        if len(self.recent_actions) > 0:
            ob[self.recent_actions[-1]] = 1
        return ob

    def reset(self):
        self.done = False
        self.recent_states = deque([], maxlen=SEQ_LEN)
        self.recent_actions = deque([], maxlen=SEQ_LEN)
        self.step_cnt = 0
        ob = self.get_observation()
        self.cur_state = ob
        return ob

    def print_internal_state(self):
        print("Step", self.step_cnt)
        state_str = "".join(
            [CHARACTERS[np.nonzero(c)[0].item()] for c in self.recent_states]
        )
        action_str = "".join([CHARACTERS[c] for c in self.recent_actions])
        print(
            "Internal state: recent states {}, recent actions {}".format(
                state_str, action_str
            )
        )

    @staticmethod
    def print_ob(ob):
        return str(ob)

    @staticmethod
    def print_action(action):
        return CHARACTERS[action]
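A minimal rollout sketch for StringGameEnv, assuming the module constants it relies on (MAX_STEP, SEQ_LEN, CHARACTERS, ACTION_DIM, STATE_DIM) and logger are defined as in the ReAgent source; the loop itself is purely illustrative:

env = StringGameEnv()
ob = env.reset()
done = False
total_reward = 0.0
while not done:
    action = StringGameEnv.random_action()  # uniform random action in [0, ACTION_DIM)
    ob, reward, done, info = env.step(action)
    total_reward += reward
print("episode return:", total_reward, "last reward string:", info["reward_str"])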
コード例 #31
0
ファイル: version_12.py プロジェクト: rayson1223/gym-workflow
class Version12(WorkflowSimEnv):
    """
        @version 12.0:
            - Design based on workflow sim
            - Reward scheme same as v7
            - Focus on system overhead
    """
    def __init__(self):
        # Montage Experiment Variable
        super(Version12, self).__init__()

        self.action_range = 100
        self.cluster_range = 100
        self.action_space = Discrete(self.action_range)
        self.observation_space = Discrete(self.cluster_range)
        self.cluster_size = 1

        # Episode Conf
        # Best exec_time: None or 1, depends on reward version
        self.best_makespan = None
        self.last_makespan = None
        self.last_action = None
        self.reward = 0
        self.total_reward = 0.0

        self.exec_records = {}
        self.all_exec_record = list()
        self.all_overhead_record = list()
        self.all_makespan_record = list()
        self.all_benchmark_record = list()
        self.seed()
        self.reset()

    def step(self, action, training=False):
        assert self.action_space.contains(action)
        action += 1
        done = False
        reward = 0.0
        benchmark = None  # ensure it is defined for the info dict even when training=True

        # Return all the data collected
        makespan, queue, exec, postscript, cluster, wen = self.run_experiment(
            vm_size=self.action_range,
            clustering_choice="CNUM",
            clustering_method="HORIZONTAL",
            cluster_size=action)
        overhead = float(queue) + float(postscript) + float(wen) + float(
            cluster)
        makespan = float(makespan)
        self.all_overhead_record.append(overhead)
        self.all_makespan_record.append(float(makespan))
        self.all_exec_record.append(float(exec))

        if not training:
            # Setting up best exec
            if self.best_makespan is None:
                self.best_makespan = makespan
            if self.last_makespan is None:
                self.last_makespan = makespan

            # Rewarding / Penalty Judgement
            percentile = np.percentile(self.all_makespan_record, 10)
            # Calc improve percentage; avoid taking the mean of an empty benchmark record
            if len(self.all_benchmark_record) == 0:
                self.all_benchmark_record.append(percentile)
                benchmark = np.mean(self.all_benchmark_record)
            else:
                benchmark = np.mean(self.all_benchmark_record)
                if abs(percentile - benchmark) / benchmark * 100 > 10:
                    self.all_benchmark_record.append(percentile)
                    benchmark = np.mean(self.all_benchmark_record)
            # else:
            # print(abs(percentile - benchmark) / benchmark * 100)

            if makespan < benchmark:
                self.best_makespan = makespan
                reward = 1
            else:
                reward = -1
            self.last_makespan = makespan

            self.total_reward += reward
            self.last_action = action
            if self.total_reward > 20 or self.total_reward < -20:
                done = True

        return self._get_obs(), reward, done, {
            "exec": exec,
            "overhead": overhead,
            "makespan": makespan,
            "queue": queue,
            "postscript": postscript,
            "cluster": cluster,
            "wen": wen,
            "benchmark": benchmark
        }

    def reset(self):
        self.total_reward = 0.0
        # for x in self.all_makespan_record:
        #     if x > np.percentile(self.all_makespan_record, 20):
        #         self.all_makespan_record.remove(x)
        return np.random.randint(
            1, (self.cluster_range + 1))  # , self.clusters_num

    def _get_obs(self):
        return np.random.randint(1, (self.cluster_range + 1))
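The reward logic in Version12.step() above maintains a running benchmark: the 10th percentile of all observed makespans, appended to the benchmark record only when it drifts more than 10% from the current benchmark mean, with makespans below the benchmark rewarded +1 and the rest -1. A standalone sketch of just that update rule (function and variable names are mine, not from the source):

import numpy as np


def update_benchmark(makespan_history, benchmark_history):
    """Append the 10th percentile of makespans when it drifts >10% from the benchmark mean; return the mean."""
    percentile = np.percentile(makespan_history, 10)
    if not benchmark_history:
        benchmark_history.append(percentile)
    else:
        benchmark = float(np.mean(benchmark_history))
        if abs(percentile - benchmark) / benchmark * 100 > 10:
            benchmark_history.append(percentile)
    return float(np.mean(benchmark_history))


# Example: with these makespans the first call seeds the benchmark at the 10th percentile (96.0).
makespans, benchmarks = [120.0, 100.0, 95.0], []
print(update_benchmark(makespans, benchmarks))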