class Encoder: """Encode/Decode a multi-discrete space into a discrete one.""" def __init__(self, input_space: MultiDiscrete): """ Initialize the space_encoder. :param input_space: the space to encode. """ self.input_space = input_space self.output_space = Discrete(np.prod(input_space.nvec)) def encode(self, s: Sequence[int]) -> int: """ Do the encoding. :param s: a point in the multi-discrete space. :return: the encoded version of the point. """ if type(s) in {list, tuple}: s = np.array(s) assert_(self.input_space.contains(s), f"{s} is not contained in the input space.") dims = self.input_space.nvec result = 0 for i, dim in zip(s, dims[+1:]): result += i result *= dim result += s[-1] assert_( self.output_space.contains(result), f"{result} is not contained in the output space.", ) return result def decode(self, s: int) -> Sequence[int]: """ Do the decoding. :param s: a point in a discrete space. :return: the decoded version of the point into a multi-discrete space. """ assert_(self.output_space.contains(s), f"{s} is not contained in the output space.") dims = self.input_space.nvec result = [] tmp = s for dim in reversed(dims[+1:]): result.append(tmp % dim) tmp = tmp // dim result.append(tmp) result = np.array(list(reversed(result))) assert_( self.input_space.contains(result), f"{result} is not contained in the input space.", ) return result
class MyEnv(Env): def __init__(self): self.action_space = Discrete(4) self.prev_action = 0 self.state = np.zeros(4).astype(np.int32) def render(self): pass # print('action: ', self.prev_action, 'state:', self.state) def step(self, action): self.prev_action = action assert self.action_space.contains(action) self.state[action] += 1 reward = 0 if self.state[action] == 10: reward = 1 elif self.state[action] > 10: reward = -1 is_done = np.all(self.state > 9) or np.any(self.state > 15) if is_done: reward = 1 if np.all(self.state > 9) else -1 return np.copy(self.state), reward, is_done, {} def reset(self): self.state = np.zeros(4).astype(np.int32)
class DiscreteActionToMarketOrder(Kernel): """ Maps gym.spaces.Discrete actions to executable Market Orders. """ # TODO: speed up by turning off action validation def __init__( self, assets, name='AssetActionMap', task=0, log=None, log_level=INFO, ): super().__init__(name=name, task=task, log=log, log_level=log_level) assets = list(assets) try: assert len(assets) == 1 except AssertionError: e = 'This class currently supports single asset trading only' self.log.error(e) raise ValueError(e) self.assets = assets self.space = Discrete(4) self.action_map = {0: None, 1: 'buy', 2: 'sell', 3: 'close'} def update_state(self, input_state, reset): if reset: self._start(input_state) else: self._update_state(input_state) return self.state def _start(self, action): self.state = [] def _update_state(self, action): try: assert self.space.contains(action) except AssertionError: e = 'Provided action `{}` is not a valid member of defined action space `{}`'.format( action, self.space) self.log.error(e) raise TypeError(e) if action != 0: self.state = [MarketOrder(self.assets[0], self.action_map[action])] else: self.state = []
class MultiStepEnv(Env): def __init__(self, horizon=1): self.horizon = horizon self.action_space = Discrete(2) self.observation_space = Box(low=0, high=1, shape=(3, )) self.state_dict = { 's_1': np.array([1, 0, 0]), 's_2': np.array([0, 1, 0]), 's_3': np.array([0, 0, 1]), 's_4': np.array([1, 1, 1]), 's_5': np.array([0, 0, 0]), } self.seed() self.reset() def seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] def reset(self): self.state = 's_1' self.time_step = 0 return self.state_dict[self.state] def step(self, action): assert self.action_space.contains(action) reward = 0.0 if self.state == 's_1': if action == 0: self.state = 's_2' elif action == 1: self.state = 's_3' elif self.state == 's_2': if action == 1: self.state = 's_4' elif self.state == 's_4': if action == 0: self.state = 's_5' self.time_step += 1 if self.time_step == self.horizon: done = True if self.state == 's_2' and self.time_step == 1: reward = 1.0 elif self.state == 's_4' and self.time_step == 2: reward = 1.0 elif self.state == 's_5' and self.time_step == 3: reward = 1.0 else: done = False return self.state_dict[self.state], reward, done, {'state': self.state}
class Discretizer: def __init__(self, space, n_bins): assert isinstance(space, Box), \ "Discretization is only implemented for Box spaces." assert space.is_bounded() self.space = space self.n_bins = n_bins # initialize bins assert n_bins > 0, "Discretizer requires n_bins > 0" n_elements = 1 tol = 1e-8 self.dim = len(self.space.low) n_elements = n_bins**self.dim self._bins = [] self._open_bins = [] for dd in range(self.dim): range_dd = self.space.high[dd] - self.space.low[dd] epsilon = range_dd / n_bins bins_dd = [] for bb in range(n_bins + 1): val = self.space.low[dd] + epsilon * bb bins_dd.append(val) self._open_bins.append(tuple(bins_dd[1:])) bins_dd[-1] += tol # "close" the last interval self._bins.append(tuple(bins_dd)) # set observation space self.discrete_space = Discrete(n_elements) # List of discretized elements self.discretized_elements = np.zeros((self.dim, n_elements)) for ii in range(n_elements): self.discretized_elements[:, ii] = self.get_coordinates(ii, False) def discretize(self, coordinates): return binary_search_nd(coordinates, self._bins) def get_coordinates(self, flat_index, randomize=False): assert self.discrete_space.contains(flat_index), "invalid flat_index" # get multi-index index = unravel_index_uniform_bin(flat_index, self.dim, self.n_bins) # get coordinates coordinates = np.zeros(self.dim) for dd in range(self.dim): coordinates[dd] = self._bins[dd][index[dd]] if randomize: range_dd = self.space.high[dd] - self.space.low[dd] epsilon = range_dd / self.n_bins coordinates[dd] += epsilon * self.space.rng.uniform() return coordinates
class MyEnv(Env): def __init__(self): self.action_space = Discrete(5) self.prev_action = 0 self.prev_state = np.zeros(4).astype(np.int32) self.state = np.zeros(4).astype(np.int32) def render(self): pass # print('action: ', self.prev_action, 'state:', self.state) def step(self, action): self.prev_action = action assert self.action_space.contains(action) reward = 0 if action < 4: self.prev_state = np.copy(self.state) self.state[action] += 1 if np.sum(self.state == 10) > np.sum(self.prev_state == 10): reward = 1 elif np.sum(self.state > 10) > np.sum(self.prev_state > 10): reward = -1 is_done = np.all(self.state > 9) or np.any(self.state > 15) if is_done: reward = 1 if np.all(self.state > 9) else -1 return np.copy(self.state), reward, is_done, {} def random_effect(self): if random() < 0.2: print('random effect') random_index = randint(0, 3) if self.state[random_index] < 9: self.state[random_index] += 1 def reset(self): self.state = np.zeros(4).astype(np.int32)
class Version9(MontageWfEnv): """ @version 9.0: - Update version of V5.0 - Required update V5 to only take in CS - Required verify it's convergence by simulation before deployment """ def __init__(self): # Montage Experiment Variable super(Version9, self).__init__() # 3: cs, 5: cs & cn self.action_range = 5 # 10 = 10 processors available self.cluster_range = 10 self.action_space = Discrete(self.action_range) self.observation_space = Discrete( self.cluster_range) #, Discrete(self.cluster_range) self.cluster_size = 1 self.cluster_num = 1 # Episode Conf # Best exec_time: None or 1, depends on reward version self.best_overhead = None self.last_overhead = None self.last_action = None self.reward = 0 self.total_reward = 0.0 self.is_training = False self.exec_records = {} self.all_exec_record = list() self.all_overhead_record = list() self.all_makespan_record = list() self.seed() self.reset() def step(self, action, training=False): assert self.action_space.contains(action) self.is_training = training done = False reward = 0.0 if action == 1: self.cluster_size += 1 elif action == 2: self.cluster_size -= 1 elif action == 3: self.cluster_num += 1 elif action == 4: self.cluster_num -= 1 # Range Guarding Function if self.cluster_size <= 0: # reward = -100.0 self.cluster_size = 1 elif self.cluster_size > 10: # reward = -100.0 self.cluster_size = 10 elif self.cluster_num <= 0: # reward = -100.0 self.cluster_num = 1 elif self.cluster_num > 10: # reward = -100.0 self.cluster_num = 10 else: # Return all the data collected # status, jb, wt, cwt = self.run_experiment(self.clusters_size, self.clusters_num) # Experiment run failed -> High Penalty # if not status: # return self._get_obs(), -10, True, {} # jb = 0 if jb is None else jb # wt = 0 if wt is None else wt # cwt = 0 if cwt is None else cwt jb = self.run_gen_experiment(self.cluster_size, self.cluster_num) # jb = self.run_gen_experiment(self.cluster_size) wt = 0 cwt = 0 # self.exec_time = jb self.exec_time = float(jb) overhead = self.exec_time self.all_overhead_record.append(jb) self.all_makespan_record.append(jb) self.all_exec_record.append(jb) if not training: # Setting up best exec time if self.best_overhead is None: self.best_overhead = overhead if self.last_overhead is None: self.last_overhead = overhead # Rewarding / Penalty Judgement if overhead < np.percentile(self.all_overhead_record, 20): self.best_overhead = overhead reward = 200 else: reward = -100 self.last_overhead = overhead self.total_reward += reward self.last_action = action if self.total_reward > 2000 or self.total_reward < -1000: done = True return self._get_obs(), reward, done, { "exec": self.exec_time, "overhead": self.exec_time, "makespan": self.exec_time } def reset(self): if self.last_overhead is not None: self.last_overhead = self.exec_time self.cluster_size = np.random.randint(1, self.cluster_range) self.cluster_num = np.random.randint(1, self.cluster_range) self.total_reward = 0 # For cs only # return self.cluster_size # For cs & cn return self.cluster_size, self.cluster_num def _get_obs(self): # For cs # return self.cluster_size # For cs & cn return self.cluster_size, self.cluster_num
class PocEnv(Env): def __init__(self, maze, obs_array=False): self.board = select_maze(maze) self.grid = PocGrid(board=self.board["_maze"]) self._get_init_state() self.action_space = Discrete(4) self.observation_space = Discrete(1 << 10) # 1024 # self.observation_space = Discrete(14) self._reward_range = 100 self._discount = .95 self.done = False self.gui = None if obs_array: self._set_flags = set_flags_array self._zero = lambda: [0] * 10 else: self._set_flags = set_flags self._zero = lambda: 0 def seed(self, seed=None): np.random.seed(seed) def is_power(self, idx): return self.board['_maze'][idx] == 3 def is_passable(self, idx): return self.board['_maze'][idx] != 0 def _is_valid(self): assert self.grid.is_inside(self.state.agent_pos) assert self.is_passable(self.state.agent_pos) for ghost in self.state.ghosts: assert self.grid.is_inside(ghost.pos) assert self.is_passable(ghost.pos) def _set_state(self, state): self.done = False self.state = state def _generate_legal(self): actions = [] for action in self.action_space.n: if self.grid.is_inside(self.state.agent_pos + Moves.get_coord(action.value)): actions.append(action.value) return actions def step(self, action): err_msg = "%r (%s) invalid" % (action, type(action)) assert self.action_space.contains(action), err_msg assert self.done is False self.state.action = action reward = -1 next_pos = self._next_pos(self.state.agent_pos, action) if next_pos.is_valid(): self.state.agent_pos = next_pos else: reward += -25 if self.state.power_step > 0: self.state.power_step -= 1 hit_ghost = -1 for g, ghost in enumerate(self.state.ghosts): if ghost.pos == self.state.agent_pos: hit_ghost = g else: # move ghost self._move_ghost(g, ghost_range=self.board["_ghost_range"]) if ghost.pos == self.state.agent_pos: hit_ghost = g if hit_ghost >= 0: if self.state.power_step > 0: reward += 25 self.state.ghosts[hit_ghost].reset() else: reward += -100 self.done = True # don't eat power up when hit by a ghost already elif self.is_power(self.state.agent_pos): self.state.power_step = config["_power_steps"] reward += 10 # same for food elif self.state.food_pos[self.grid.get_index(self.state.agent_pos)]: self.state.food_pos[self.grid.get_index(self.state.agent_pos)] = 0 if sum(self.state.food_pos) == 0: reward += 1000 self.done = True obs = self._make_ob() return obs, reward, self.done, {"state": self.state} def _make_ob(self): obs = self._zero() for d in range(self.action_space.n): if self._see_ghost(d) >= 0: obs = self._set_flags(obs, d) next_pos = self._next_pos(self.state.agent_pos, direction=d) if next_pos.is_valid() and self.is_passable(next_pos): obs = self._set_flags(obs, d + self.action_space.n) if self._smell_food(): obs = self._set_flags(obs, 8) if self._hear_ghost(self.state): obs = self._set_flags(obs, 9) return obs def _encode_state(self, state): poc_idx = self.grid.get_index(state.agent_pos) ghosts = [(self.grid.get_index(ghost.pos), ghost.direction) for ghost in state.ghosts] return np.concatenate([[poc_idx], *ghosts, state.food_pos, [state.power_step]]) def _decode_state(self, state): poc_state = PocState(Coord(*self.grid.get_coord(state[0]))) ghosts = np.split(state[1:self.board["_num_ghosts"] * 3], 1) for g in ghosts: poc_state.ghosts.append( Ghost(pos=self.grid.get_coord(g[0]), direction=g[1])) poc_state.power_step = state[-1] poc_state.food_pos = state[self.board["_num_ghosts"] * 3:-1].tolist() return poc_state def _see_ghost(self, action): eye_pos = self.state.agent_pos + Moves.get_coord(action) while True: for g, ghost in enumerate(self.state.ghosts): if ghost.pos == eye_pos: return g eye_pos += Moves.get_coord(action) if not (self.grid.is_inside(eye_pos) and self.is_passable(eye_pos)): break return -1 def _smell_food(self, smell_range=1): for x in range(-smell_range, smell_range + 1): for y in range(-smell_range, smell_range + 1): smell_pos = Coord(x, y) idx = self.grid.get_index(self.state.agent_pos + smell_pos) if self.grid.is_inside(self.state.agent_pos + smell_pos) and\ self.state.food_pos[idx]: return True return False @staticmethod def _hear_ghost(poc_state, hear_range=2): for ghost in poc_state.ghosts: if Grid.manhattan_distance(ghost.pos, poc_state.agent_pos) <= hear_range: return True return False def render(self, mode='human', close=False): if close: return if mode == 'human': if self.gui is None: self.gui = PocGui(board_size=self.grid.get_size, maze=self.board["_maze"], state=self.state) else: self.gui.render(state=self.state) def reset(self): self.done = False self._get_init_state() return self._make_ob() def close(self): pass def _get_init_state(self): self.state = PocState() self.state.agent_pos = Coord(*self.board["_poc_home"]) ghost_home = Coord(*self.board["_ghost_home"]) for g in range(self.board["_num_ghosts"]): pos = Coord(ghost_home.x + g % 2, ghost_home.y + g // 2) self.state.ghosts.append(Ghost(pos, direction=-1)) self.state.food_pos = np.random.binomial(1, config["_food_prob"], size=self.grid.n_tiles) # only make free space food idx = (self.board["_maze"] > 0) &\ (self.state.food_pos.reshape(self.board["_maze"].shape) > 0) self.board["_maze"][idx] = 4 self.state.power_step = 0 return self.state def _next_pos(self, pos, direction): direction = Moves.get_coord(direction) if pos.x == 0 and pos.y == self.board['_passage_y'] and\ direction == Moves.EAST: next_pos = Coord(self.grid.x_size - 1, pos.y) elif pos.x == self.grid.x_size - 1 and\ pos.y == self.board['_passage_y'] and direction == Moves.WEST: next_pos = Coord(0, pos.y) else: next_pos = pos + direction if self.grid.is_inside(next_pos) and self.is_passable(next_pos): return next_pos else: return Coord(-1, -1) def _move_ghost(self, g, ghost_range): if Grid.manhattan_distance(self.state.agent_pos, self.state.ghosts[g].pos) < ghost_range: if self.state.power_step > 0: self._move_defensive(g) else: self._move_aggressive(g) else: self._move_random(g) def _move_aggressive(self, g): if not np.random.binomial(1, p=config["_chase_prob"]): return self._move_random(g) best_dist = self.grid.x_size + self.grid.y_size best_pos = self.state.ghosts[g].pos best_dir = -1 for d in range(self.action_space.n): dist = Grid.directional_distance(self.state.agent_pos, self.state.ghosts[g].pos, d) new_pos = self._next_pos(self.state.ghosts[g].pos, d) if dist <= best_dist and new_pos.is_valid() and\ can_move(self.state.ghosts[g], d): best_pos = new_pos best_dist = dist best_dir = d self.state.ghosts[g].update(best_pos, best_dir) def _move_defensive(self, g, defensive_prob=.5): if np.random.binomial(1, defensive_prob) and\ self.state.ghosts[g].direction >= 0: self.state.ghosts[g].direction = -1 return best_dist = 0 best_pos = self.state.ghosts[g].pos best_dir = -1 for d in range(self.action_space.n): dist = Grid.directional_distance(self.state.agent_pos, self.state.ghosts[g].pos, d) new_pos = self._next_pos(self.state.ghosts[g].pos, d) if dist >= best_dist and new_pos.is_valid() and\ can_move(self.state.ghosts[g], d): best_pos = new_pos best_dist = dist best_dir = d self.state.ghosts[g].update(best_pos, best_dir) def _move_random(self, g): # there are !!! dead ends # only switch to opposite direction when it failed 10 times (hack) ghost_pos = self.state.ghosts[g].pos i = 0 while True: d = self.action_space.sample() next_pos = self._next_pos(ghost_pos, d) # normal map has dead ends: if next_pos.is_valid() and (can_move(self.state.ghosts[g], d) or i > 10): break i += 1 self.state.ghosts[g].update(next_pos, d)
class RockEnv(Env): metadata = {"render.modes": ["human", "ansi"]} def __init__(self, board_size=7, num_rocks=8, use_heuristic=False): assert board_size in list( config.keys()) and num_rocks in config[board_size]['size'] self.num_rocks = num_rocks self._use_heuristic = use_heuristic self._rock_pos = [ Coord(*rock) for rock in config[board_size]['rock_pos'] ] self._agent_pos = Coord(*config[board_size]['init_pos']) self.grid = Grid(board_size, board_size) for idx, rock in enumerate(self._rock_pos): self.grid.board[rock] = idx self.action_space = Discrete(len(Action) + self.num_rocks) self.observation_space = Discrete(len(Obs)) self._discount = .95 self._reward_range = 20 self._penalization = -100 self._query = 0 def seed(self, seed=None): np.random.seed(seed) def step(self, action): assert self.action_space.contains(action) assert self.done is False self.last_action = action self._query += 1 reward = 0 ob = Obs.NULL.value if action < Action.SAMPLE.value: if action == Action.EAST.value: if self.state.agent_pos.x + 1 < self.grid.x_size: self.state.agent_pos += Moves.EAST.value else: reward = 10 self.done = True return ob, reward, self.done, { "state": self._encode_state(self.state) } elif action == Action.NORTH.value: if self.state.agent_pos.y + 1 < self.grid.y_size: self.state.agent_pos += Moves.NORTH.value else: reward = self._penalization elif action == Action.SOUTH.value: if self.state.agent_pos.y - 1 >= 0: self.state.agent_pos += Moves.SOUTH.value else: reward = self._penalization elif action == Action.WEST.value: if self.state.agent_pos.x - 1 >= 0: self.state.agent_pos += Moves.WEST.value else: reward = self._penalization else: raise NotImplementedError() if action == Action.SAMPLE.value: rock = self.grid[self.state.agent_pos] if rock >= 0 and not self.state.rocks[ rock].status == 0: # collected if self.state.rocks[rock].status == 1: reward = 10 else: reward = -10 self.state.rocks[rock].status = 0 else: reward = self._penalization if action > Action.SAMPLE.value: rock = action - Action.SAMPLE.value - 1 assert rock < self.num_rocks ob = self._sample_ob(self.state.agent_pos, self.state.rocks[rock]) self.state.rocks[rock].measured += 1 eff = self._efficiency(self.state.agent_pos, self.state.rocks[rock].pos) if ob == Obs.GOOD.value: self.state.rocks[rock].count += 1 self.state.rocks[rock].lkv *= eff self.state.rocks[rock].lkw *= (1 - eff) else: self.state.rocks[rock].count -= 1 self.state.rocks[rock].lkw *= eff self.state.rocks[rock].lkv *= (1 - eff) denom = (.5 * self.state.rocks[rock].lkv) + ( .5 * self.state.rocks[rock].lkw) self.state.rocks[rock].prob_valuable = ( .5 * self.state.rocks[rock].lkv) / denom self.done = self._penalization == reward return ob, reward, self.done, {"state": self._encode_state(self.state)} def _decode_state(self, state, as_array=False): agent_pos = Coord(*state['agent_pos']) rock_state = RockState(agent_pos) for r in state['rocks']: rock = Rock(pos=0) rock.__dict__.update(r) rock_state.rocks.append(rock) if as_array: rocks = [] for rock in rock_state.rocks: rocks.append(rock.status) return np.concatenate([[self.grid.get_index(agent_pos)], rocks]) return rock_state def _encode_state(self, state): # use dictionary for state encodign return _encode_dict(state) # rocks can take 3 values: -1, 1, 0 if collected def render(self, mode='human', close=False): if close: return if mode == "human": if not hasattr(self, "gui"): start_pos = self.grid.get_index(self.state.agent_pos) obj_pos = [(self.grid.get_index(rock.pos), rock.status) for rock in self.state.rocks] self.gui = RockGui((self.grid.x_size, self.grid.y_size), start_pos=start_pos, obj=obj_pos) if self.last_action > Action.SAMPLE.value: rock = self.last_action - Action.SAMPLE.value - 1 print("Rock S: {} P:{}".format(self.state.rocks[rock].status, self.state.rocks[rock].pos)) # msg = "Action : " + action_to_str(self.last_action) + " Step: " + str(self.t) + " Rw: " + str(self.total_rw) agent_pos = self.grid.get_index(self.state.agent_pos) self.gui.render(agent_pos) def reset(self): self.done = False self._query = 0 self.last_action = Action.SAMPLE.value self.state = self._get_init_state(should_encode=False) return Obs.NULL.value def _set_state(self, state): self.done = False self.state = self._decode_state(state) def close(self): self.render(close=True) def _compute_prob(self, action, next_state, ob): next_state = self._decode_state(next_state) if action <= Action.SAMPLE.value: return int(ob == Obs.NULL.value) eff = self._efficiency( next_state.agent_pos, next_state.rocks[action - Action.SAMPLE.value - 1].pos) if ob == Obs.GOOD.value and next_state.rocks[action - Action.SAMPLE.value - 1].status == 1: return eff elif ob == Obs.BAD.value and next_state.rocks[action - Action.SAMPLE.value - 1].status == -1: return eff else: return 1 - eff def _get_init_state(self, should_encode=True): rock_state = RockState(self._agent_pos) for idx in range(self.num_rocks): rock_state.rocks.append(Rock(self._rock_pos[idx])) return self._encode_state(rock_state) if should_encode else rock_state def _generate_legal(self): legal = [Action.EAST.value] # can always go east if self.state.agent_pos.y + 1 < self.grid.y_size: legal.append(Action.NORTH.value) if self.state.agent_pos.y - 1 >= 0: legal.append(Action.SOUTH.value) if self.state.agent_pos.x - 1 >= 0: legal.append(Action.WEST.value) rock = self.grid[self.state.agent_pos] if rock >= 0 and self.state.rocks[rock].status != 0: legal.append(Action.SAMPLE.value) for rock in self.state.rocks: assert self.grid[rock.pos] != -1 if rock.status != 0: legal.append(self.grid[rock.pos] + 1 + Action.SAMPLE.value) return legal def _generate_preferred(self, history): if not self._use_heuristic: return self._generate_legal() actions = [] # sample rocks with high likelihood of being good rock = self.grid[self.state.agent_pos] if rock >= 0 and self.state.rocks[rock].status != 0 and history.size: total = 0 # history for t in range(history.size): if history[t].action == rock + 1 + Action.SAMPLE.value: if history[t].ob == Obs.GOOD.value: total += 1 elif history[t].ob == Obs.BAD.value: total -= 1 if total > 0: actions.append(Action.SAMPLE.value) return actions # process the rocks all_bad = True direction = { "north": False, "south": False, "west": False, "east": False } for idx in range(self.num_rocks): rock = self.state.rocks[idx] if rock.status != 0: total = 0 for t in range(history.size): if history[t].action == idx + 1 + Action.SAMPLE.value: if history[t].ob == Obs.GOOD.value: total += 1 elif history[t].ob == Obs.BAD.value: total -= 1 if total >= 0: all_bad = False if rock.pos.y > self.state.agent_pos.y: direction['north'] = True elif rock.pos.y < self.state.agent_pos.y: direction['south'] = True elif rock.pos.x < self.state.agent_pos.x: direction['west'] = True elif rock.pos.x > self.state.agent_pos.x: direction['east'] = True if all_bad: actions.append(Action.EAST.value) return actions # generate a random legal move # do not measure a collected rock # do no measure a rock too often # do not measure clearly bad rocks # don't move in a direction that puts you closer to bad rocks # never sample a rock if self.state.agent_pos.y + 1 < self.grid.y_size and direction['north']: actions.append(Action.NORTH.value) if direction['east']: actions.append(Action.EAST.value) if self.state.agent_pos.y - 1 >= 0 and direction['south']: actions.append(Action.SOUTH.value) if self.state.agent_pos.x - 1 >= 0 and direction['west']: actions.append(Action.WEST.value) for idx, rock in enumerate(self.state.rocks): if not rock.status == 0 and rock.measured < 5 and abs( rock.count) < 2 and 0 < rock.prob_valuable < 1: actions.append(idx + 1 + Action.SAMPLE.value) if len(actions) == 0: return self._generate_legal() return actions def __dict2np__(self, state): idx = self.grid.get_index(Coord(*state['agent_pos'])) rocks = [] for rock in state['rocks']: rocks.append(rock['status']) return np.concatenate([[idx], rocks]) @staticmethod def _efficiency(agent_pos, rock_pos, hed=20): # TODO check me d = Grid.euclidean_distance(agent_pos, rock_pos) eff = (1 + pow(2, -d / hed)) * .5 return eff @staticmethod def _select_target(rock_state, x_size): best_dist = x_size * 2 best_rock = -1 # Coord(-1, -1) for idx, rock in enumerate(rock_state.rocks): if rock.status != 0 and rock.count >= 0: d = Grid.manhattan_distance(rock_state.agent_pos, rock.pos) if d < best_dist: best_dist = d best_rock = idx # rock.pos return best_rock @staticmethod def _sample_ob(agent_pos, rock, hed=20): eff = RockEnv._efficiency(agent_pos, rock.pos, hed=hed) if np.random.binomial(1, eff): return Obs.GOOD.value if rock.status == 1 else Obs.BAD.value else: return Obs.BAD.value if rock.status == 1 else Obs.GOOD.value
class MarketEnv(Env): """ The class implements an Env class interface from the Gym (OpenAI) package, to communicate with real market data. """ reward_range = (-np.inf, np.inf) action_space = None observation_space = None actions = {'hold': 0, 'buy': 1, 'sell': 2} metadata = {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second' : 15} viewer = None def __init__(self, market: AbstractMarket, use_deposit=False, use_last_action=False, **kwargs): """MarketEnv constructor. Arguments market (AbstractMarket): Wrapper for access to the market through API, or other solutions. window (int): This is the size of the state of the environment (The number of time intervals). """ self.market = market self.use_deposit = use_deposit self.use_last_action = use_last_action self.last_action = dict() # TODO action space for multy symbols agent self.action_space = Discrete(3 * self.market.symbols_count) self.observation_space = Space(shape=self.market.shape, dtype=np.float) def step(self, action): """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. Accepts an action and returns a tuple (observation, reward, done, info). Arguments action (object): an action provided by the environment Returns observation (object): agent's observation of the current environment reward (float) : amount of reward returned after previous action done (boolean): whether the episode has ended, in which case further step() calls will return undefined results info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning) """ # TODO multy action assert self.action_space.contains(action[0]), "%r (%s) invalid"%(action, type(action)) done = False reward = 0.0 info = dict() observation = self.market.observation() feedback = [] # action is the max index from the model output (from three neurons) idx = 0 for symbol in self.market.symbols: if action[idx] == self.actions['buy']: self.market.buy_order(symbol) elif action[idx] == self.actions['hold']: pass elif action[idx] == self.actions['sell']: self.market.sell_order(symbol) if self.use_deposit: feedback.append(self.market.deposit(symbol)) if self.use_last_action: feedback.append(self.last_action[symbol]) self.last_action[symbol] = action[idx] info[symbol] = { 'action': action[idx], 'reward': self.market.profit, 'deposit': self.market.deposit(symbol) } reward += self.market.profit idx += 1 if self.use_deposit or self.use_last_action: observation.append(np.array(feedback)) if self.market.done or self.market.balance <= 0: done = True info['sum_reward'] = reward info['balance'] = self.market.balance return (observation, reward, done, info) def reset(self): """Resets the state of the environment and returns an initial observation. Returns observation (object): the initial observation of the space. """ self.market.reset() observation = self.market.observation() feedback = [] for symbol in self.market.symbols: self.last_action[symbol] = 0 if self.use_deposit: feedback.append(self.market.deposit(symbol)) if self.use_last_action: feedback.append(self.last_action[symbol]) if self.use_deposit or self.use_last_action: observation.append(np.array(feedback)) return observation @property def feedback_shape(self): """""" return ((len(self.market.symbols) if self.use_deposit else 0) + (len(self.market.symbols) if self.use_last_action else 0)) def render(self, mode='human', close=False): """Renders the environment. The set of supported modes varies per environment. (And some environments do not support rendering at all.) By convention, if mode is: - human: render to the current display or terminal and return nothing. Usually for human consumption. - rgb_array: Return an numpy.ndarray with shape (x, y, 3), representing RGB values for an x-by-y pixel image, suitable for turning into a video. - ansi: Return a string (str) or StringIO.StringIO containing a terminal-style text representation. The text can include newlines and ANSI escape sequences (e.g. for colors). Note Make sure that your class's metadata 'render.modes' key includes the list of supported modes. It's recommended to call super() in implementations to use the functionality of this method. Arguments mode (str): The mode to render with. close (bool): Close all open renderings. """ if mode == 'rgb_array': # data = self.market.get_window(len=20) return np.array([]) # return RGB frame suitable for video elif mode is 'human': pass # pop up a window and render else: super(MarketEnv, self).render(mode=mode) # just raise an exception def configure(self, **kwargs): """Provides runtime configuration to the environment. This configuration should consist of data that tells your environment how to run (such as an address of a remote server, or path to your ImageNet data). It should not affect the semantics of the environment. """ if kwargs['market']: del self.market self.market = kwargs['market'] self.reset() def close(self): """Override _close in your subclass to perform any necessary cleanup. Environments will automatically close() themselves when garbage collected or when the program exits. """ if self.viewer: self.viewer.close() def seed(self, seed=None): """Sets the seed for this env's random number generator(s). Returns list<bigint>: Returns the list of seeds used in this env's random number generators. The first value in the list should be the "main" seed, or the value which a reproducer should pass to 'seed'. Often, the main seed equals the provided 'seed', but this won't be true if seed=None, for example. """ self.np_random, seed = seeding.np_random(seed) return [seed] def __del__(self): self.close() def __str__(self): return '<{} instance>'.format(type(self).__name__)
class Version10(MontageWfEnv): """ @version 10.0: - Design based on pegasus - Reward scheme same as v8 - Action scheu """ def __init__(self): # Montage Experiment Variable super(Version10, self).__init__() self.action_range = 30 self.cluster_range = 30 self.action_space = Discrete(self.action_range) self.observation_space = Discrete(self.cluster_range) self.cluster_size = 1 # Episode Conf # Best exec_time: None or 1, depends on reward version self.best_makespan = None self.last_makespan = None self.last_action = None self.reward = 0 self.total_reward = 0.0 self.exec_records = {} self.all_exec_record = list() self.all_overhead_record = list() self.all_makespan_record = list() self.seed() self.reset() def step(self, action, training=False): assert self.action_space.contains(action) action += 1 done = False reward = 0.0 # Return all the data collected makespan = self.run_gen_experiment(action) # status, jb, wt, cwt = self.run_experiment(action) # Experiment run failed -> High Penalty # if not status: # return self._get_obs(), 0, True, {} # # jb = 0 if jb is None else jb # wt = 0 if wt is None else wt # cwt = 0 if cwt is None else cwt # # makespan = jb self.all_makespan_record.append(makespan) if not training: # Setting up best exec if self.best_makespan is None: self.best_makespan = makespan if self.last_makespan is None: self.last_makespan = makespan # Rewarding / Penalty Judgement if makespan < np.percentile(self.all_makespan_record, 20): self.best_makespan = makespan reward = 10 # elif makespan < np.percentile(self.all_makespan_record, 40): # reward = 1 else: reward = -5 self.last_makespan = makespan self.total_reward += reward self.last_action = action if self.total_reward > 50 or self.total_reward < -100: done = True return self._get_obs(), reward, done, { "exec": makespan, "overhead": makespan, "makespan": makespan, "benchmark": np.percentile(self.all_makespan_record, 20) } def reset(self): self.total_reward = 0.0 # # This part is crucial, as it determine what data going to be stay and what's going to be remove to # # maintain the data stability # self.all_makespan_record.sort() # for x in self.all_makespan_record: # # if wanna get reward while mantain the data, must determine a suitable range of data # # if x < np.percentile(self.all_makespan_record, 20) or x > np.percentile(self.all_makespan_record, 80): # # Unable to obtain proper reward if without remove the extra datas # if x > np.percentile(self.all_makespan_record, 40): # self.all_makespan_record.remove(x) # if len(self.all_makespan_record) > 20: # while len(self.all_makespan_record) > 20: # # try to remove the median data # # self.all_makespan_record.remove(self.all_makespan_record[int(len(self.all_makespan_record) / 2) + 1]) # # self.all_makespan_record.remove(self.all_makespan_record[np.random.randint(1, len(self.all_makespan_record))]) # self.all_makespan_record.remove(self.all_makespan_record[0]) # # self.all_makespan_record.remove(self.all_makespan_record[len(self.all_makespan_record)-1]) return np.random.randint(1, self.cluster_range + 1) # , self.clusters_num def _get_obs(self): return np.random.randint(1, self.cluster_range + 1)
class BattleShipEnv(Env): metadata = {"render.modes": ["human", "ansi"]} def __init__(self, board_size=(5, 5), ship_sizes=[5, 4, 3, 3, 2]): self.grid = BattleGrid(board_size) self.action_space = Discrete(self.grid.n_tiles) self.observation_space = Discrete(len(Obs)) self.num_obs = 2 self._reward_range = self.action_space.n / 4. self._discount = 1. self.ship_sizes = ship_sizes self.total_remaining = sum(self.ship_sizes) def seed(self, seed=None): np.random.seed(seed) def _compute_prob(self, action, next_state, ob): action_pos = self.grid.get_coord(action) cell = self.grid[action_pos] if ob == Obs.NULL.value and cell.visited: return 1 elif ob == Obs.HIT.value and cell.occupied: return 1 else: return int(ob == Obs.NULL.value) def step(self, action): assert self.done is False assert self.action_space.contains(action) assert self.total_remaining > 0 self.last_action = action self.t += 1 action_pos = self.grid.get_coord(action) # cell = self.grid.get_value(action_pos) cell = self.grid[action_pos] reward = 0 if cell.visited: reward -= 10 obs = Obs.NULL.value else: if cell.occupied: reward -= 1 obs = 1 self.state.total_remaining -= 1 for d in range(4, 8): if self.grid[action_pos + Compass.get_coord(d)]: self.grid[action_pos + Compass.get_coord(d)].diagonal = False else: reward -= 1 obs = Obs.NULL.value cell.visited = True if self.state.total_remaining == 0: reward += self.grid.n_tiles self.done = True self.tot_rw += reward return obs, reward, self.done, {"state": self.state} def _set_state(self, state): self.reset() self.state = state def close(self): return def reset(self): self.done = False self.tot_rw = 0 self.t = 0 self.last_action = -1 self.state = self._get_init_state() return Obs.NULL.value def render(self, mode='human', close=False): if close: return if mode == 'human': if not hasattr(self, "gui"): obj_pos = [] for ship in self.state.ships: pos = ship.pos obj_pos.append(self.grid.get_index(pos)) for i in range(ship.length): pos += Compass.get_coord(ship.direction) obj_pos.append(self.grid.get_index(pos)) self.gui = ShipGui(board_size=self.grid.get_size, obj_pos=obj_pos) if self.t > 0: msg = "A: " + str(self.grid.get_coord( self.last_action)) + "T: " + str(self.t) + "Rw :" + str( self.tot_rw) self.gui.render(state=self.last_action, msg=msg) def _generate_legal(self): # assert self.state.total_remaining > 0 actions = [] for action in range(self.action_space.n): action_pos = self.grid.get_coord(action) if not self.grid[action_pos].visited: actions.append(action) # assert len(actions) > 0 return actions def _get_init_state(self): bsstate = ShipState() self.grid.build_board() for length in self.ship_sizes: # num_ships = 1 # for idx in range(num_ships): while True: # add one ship of each kind ship = Ship(coord=self.grid.sample(), length=length) if not self.collision(ship, self.grid, bsstate): break self.mark_ship(ship, self.grid, bsstate) bsstate.ships.append(ship) return bsstate @staticmethod def mark_ship(ship, grid, state): pos = ship.pos # .copy() for i in range(ship.length + 1): cell = grid[pos] assert not cell.occupied cell.occupied = True if not cell.visited: state.total_remaining += 1 pos += Compass.get_coord(ship.direction) @staticmethod def collision(ship, grid, state): pos = ship.pos # .copy() for i in range(ship.length): if not grid.is_inside(pos + Compass.get_coord(ship.direction)): return True # cell = grid.get_value(pos) cell = grid[pos] if cell.occupied: return True for adj in range(8): coord = pos + Compass.get_coord(adj) if grid.is_inside(coord) and grid[coord].occupied: return True pos += Compass.get_coord(ship.direction) return False
class GraphSamplingEnv(Env): metadata = { 'render.modes': ('human', ), } def __init__(self, max_samples=3, render_depth=3, graph_args=None): self.graph_args = {**DEFAULT_GRAPH_ARGS, **(graph_args or {})} self._generate_new_graph() self.sampling_set = set() # actions: 0: sample 1: next edge 2: move self.action_space = Discrete(3) self._current_node = 0 self._current_edge_idx = 0 self._clustering_coefficients = None self._max_samples = max_samples self._render_depth = render_depth self._screen = None self.reset() observation_length = self.num_nodes**2 + 5 * self.num_nodes self.observation_space = Box(0, 1, observation_length) def _generate_new_graph(self): graph = generate_appm.main(self.graph_args) self.graph = graph def _randomize_position(self): self._current_node = random.sample(self.graph.nodes(), 1)[0] self._current_edge_idx = np.random.randint( len(self.graph.neighbors(self._current_node))) def _reset(self): self._generate_new_graph() self.num_nodes = self.graph.number_of_nodes() self.sampling_set = set() self._clustering_coefficients = nx.clustering(self.graph) self._current_node = np.random.randint(self.graph.number_of_nodes()) self._current_edge_idx = 0 self._num_actions = 0 self.error = 0 self._adjacency_matrix = np.array( nx.adjacency_matrix(self.graph, range(self.num_nodes)).todense()) self._degree_vector = np.reshape( np.array([self.graph.degree(i) for i in range(self.num_nodes)], dtype=np.float32), (-1, 1)) self._clustering_coefficient_vector = np.reshape( np.array([ self._clustering_coefficients[i] for i in range(self.num_nodes) ]), (-1, 1)) x = [self.graph.node[idx]['value'] for idx in range(self.num_nodes)] return self._get_observation() def _validate_action(self, action): if not self.action_space.contains(action): raise ValueError("{} ({}) invalid".format(action, type(action))) def _get_next_node(self): neighbors = list(self.graph.neighbors(self._current_node)) return neighbors[self._current_edge_idx] def _get_observation(self): state_descriptor = np.zeros((self.num_nodes, 3)) # current node indicator state_descriptor[self._current_node, 0] = 1. # sampling set indicator for node in self.sampling_set: state_descriptor[node, 1] = 1. # next node indicator current_neighbor = self._get_next_node() state_descriptor[current_neighbor, 2] = 1. observation = np.reshape( np.concatenate( (self._adjacency_matrix, self._degree_vector, self._clustering_coefficient_vector, state_descriptor), 1), (-1)) return observation def _do_action(self, action): # actions: { 0: sample, 1: next edge 2: next node } if action == 0: self.sampling_set.add(self._current_node) elif action == 1: self._current_edge_idx = ((self._current_edge_idx + 1) % self.graph.degree(self._current_node)) elif action == 2: self._current_node = self._get_next_node() self._current_edge_idx = 0 self._num_actions += 1 def _reward(self): x_hat = sparse_label_propagation(self.graph, list(self.sampling_set)) x = [ self.graph.node[idx]['value'] for idx in range(self.graph.number_of_nodes()) ] error = nmse(x, x_hat) tv = total_variation(self.graph.edges(), x) self.error = error # this + all nodes in action seems best reward = 1.0 - error return reward def _step(self, action): self._validate_action(action) self._do_action(action) observation = self._get_observation() num_samples = len(self.sampling_set) reward = 0.0 done = (num_samples >= self._max_samples or num_samples >= self.graph.number_of_nodes()) if done: reward = self._reward() return observation, reward, done, {} def get_current_nmse(self): graph = self.graph sampling_set = self.sampling_set x = [ self.graph.node[idx]['value'] for idx in range(self.graph.number_of_nodes()) ] x_hat = sparse_label_propagation(graph, list(sampling_set)) return nmse(x, x_hat) def _render(self, mode='human', close=False): if close: if self._screen is not None: pygame.display.quit() pygame.quit() return if self._screen is None: pygame.init() self._screen = pygame.display.set_mode((800, 800)) subgraph_paths = nx.single_source_shortest_path( self.graph, self._current_node, cutoff=self._render_depth) subgraph = self.graph.subgraph(subgraph_paths) nodelist = [[] for _ in range(self._render_depth + 1)] nodelist_1d = [] node_color = [] for key, path in subgraph_paths.items(): level = len(path) - 1 nodelist[level].append(key) nodelist_1d.append(key) if key in self.sampling_set: node_color.append('r') else: node_color.append('g') for level in range(self._render_depth + 1): if len(nodelist[level]) == 0: del nodelist[level] local_labels = { key: str(self._clustering_coefficients[key])[:4] for key in subgraph.node } edge_color = [] edge_list = [] neighbors = list(self.graph.neighbors(self._current_node)) edge_to = neighbors[self._current_edge_idx] for edge in subgraph.edges(): edge_list.append(edge) if (edge_to in edge and self._current_node in edge): edge_color.append('r') elif (self._current_node in edge): edge_color.append('k') else: edge_color.append('b') fig = plt.figure(figsize=[4, 4], dpi=200) layout = nx.shell_layout(subgraph, nodelist) nx.draw(subgraph, edgelist=edge_list, edge_color=edge_color, style='dashed', nodelist=nodelist_1d, node_color=node_color, width=0.5, with_labels=True, center=self._current_node, pos=layout, labels=local_labels, font_size=8) canvas = agg.FigureCanvasAgg(fig) canvas.draw() size = canvas.get_width_height() raw_data = fig.canvas.tostring_rgb() plt.close(fig) surface = pygame.image.fromstring(raw_data, size, "RGB") self._screen.blit(surface, (0, 0)) pygame.display.flip() # TODO: the event loop should be continuously pumped somewhere to avoid UI # freezing after every call to _render() pygame.event.get()
class GazeboPionnerWrapper(gym.Env): def __init__(self): rospy.init_node("rl_robot_node") self.pub, self.reset_robot_state = reset_node_init() self.act_pub = rospy.Publisher('/pioneer/cmd_vel', Twist, queue_size=1) # self.observation_space = Discrete(100) # 这样写表示100个相互独立的离散状态 # 暂时去掉了两个速度,数据已经离散化(不需要归一化吧?),进一步转换成整型? # 速度的形式与其它状态相去甚远,需要进一步考虑是否将其作为状态 # self.observation_space = gym.spaces.Box(-10.0, 10.0, (14, ), dtype=np.dtype(np.float32)) self.observation_space = gym.spaces.Box(-100, 100, (14, ), dtype=np.dtype( np.int32)) # 整数版 # self.observation_space = gym.spaces.Box(-10.0, 10.0, (4, ), dtype=np.dtype(np.float32)) self.action_space = Discrete(5) # 取值0,1,2,3,4,5 , 动作是否应该加上组合? self.state = None # self.laser_data = [0 for i in range(0, 10)] self.num_laser = 10 # 别乱改,有些相关数据没用这个参数 self.laser_data = None self.sample_laser_data = [0 for x in range(0, self.num_laser) ] # 特定抽取的data, 位置从右向左 self.int_sample_laser_data = [0 for x in range(0, self.num_laser) ] # 特定抽取的data, 位置从右向左,整数版 # 下面5个用于校验激光数据(for reward_fuc) self.last_laser_data = [0 for x in range(0, self.num_laser)] # self.num_rew_sample_laser_data = 3 # self.rew_sample_laser_data = [[] for x in range(0, self.num_rew_sample_laser_data)] # self.next_nrsld_index = 0 # self.check_threshold = 0.3 # self.bool_check_laser_data = [0 for x in range(0, self.num_laser)] self.current_sate = None # self.speed = 0.0 # 运行一段时间之后,记录的speed、angle就会与实际(gazebo)有较大差距 self.min_speed = 0.0 # -0.75 # 线速度 m/s ;角速度 rad/s(6弧度相当于一个整圆),要想倒车必须在车身后部安装测距手段 self.max_speed = 0.75 # 速度不能再高了,已经开始出现误差了 # self.angle = 0.0 self.min_angle = -0.5 self.max_angle = 0.5 self.cmd_twist = Twist() self.done = False self.thread1 = LaserNodeThread() self.thread1.start() # 获取位置服务 rospy.wait_for_service('gazebo/get_model_state') self.get_state_srv = rospy.ServiceProxy('/gazebo/get_model_state', GetModelState) # 重置位置服务 rospy.wait_for_service('gazebo/set_model_state') self.set_obstacle_srv = rospy.ServiceProxy('/gazebo/set_model_state', SetModelState) self.obstacle_pose = [[0 for x in range(0, 2)] for x in range(0, 20)] # 20个障碍物的位置[x, y] # gazebo返回的robot数据 self.gazebo_robot_pose_x = None self.gazebo_robot_pose_y = None self.gazebo_robot_speed = 0.0 self.gazebo_robot_angle = 0.0 # laser_data_node() # reward 相关 self.arrive_reward = 100.0 self.collision_reward = -100.0 self.road_reward_times = 0.1 self.last_distance_to_goal = 0.0 # 上一次动作时距离目标的距离 # 设置最终robot位姿 self.goal_x = 0.0 self.goal_y = 10.0 self.desire_final_speed = None self.desire_final_angle = None self.eps = 0 # 记录当前迭代轮数 # self.seed() # self.reset() # 先初始化一下 , 在dqn中会先调用一下的 # def seed(self, seed=None): # self.np_random, seed = seeding.np_random(seed) # return [seed] # 使用随机目标应该可以加快学习速度,和泛化能力 def random_goal(self): # print('call') self.goal_x = (20 * (np.random.ranf() - 0.5)) # x~(-10,10) # 注意这里的设计,比较巧妙, 每当goal_x 不满足条件时,就需要重新random一下,然后与出生点、“所有”障碍物点重新对比一下 i = -1 while i < 19: i += 1 while abs(self.goal_x) < 1.0 or ( abs(self.obstacle_pose[i][0] - self.goal_x) < 1.2): # 1.2 有待商议 self.goal_x = (20 * (np.random.ranf() - 0.5)) # 距离初始位置或任意障碍物过近 i = -1 i = -1 self.goal_y = (20 * (np.random.ranf() - 0.5)) # y~(-10,10) while i < 19: i += 1 while abs(self.goal_y) < 1.0 or ( abs(self.obstacle_pose[i][1] - self.goal_y) < 1.2): # 1.2 有待商议 self.goal_y = (20 * (np.random.ranf() - 0.5)) # 距离初始位置或任意障碍物过近 i = -1 self.goal_x = round(self.goal_x, 1) # 离散化 self.goal_y = round(self.goal_y, 1) # print('finish') # def distance_to_obstacle(self, num_x, num_y, count): # return math.sqrt((self.obstacle_pose[count][0] - num_x) ** 2 + (self.obstacle_pose[count][1] - num_y) ** 2) def reset(self): self.eps += 1 # print('call_reset') self.done = False # 重置robot的 speed angle 注意各步骤执行顺序 self.cmd_twist.linear.x = 0.0 self.cmd_twist.angular.z = 0.0 # 将robot重置回原位置 start_time = time.time() while (time.time() - start_time) < 0.5: # self.act_pub.publish(self.cmd_twist) # 先重置速度 self.pub.publish(self.reset_robot_state) # 重置state中的pose speed angle self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle = \ self.get_gazebo_state() self.gazebo_robot_speed = 0.0 self.gazebo_robot_angle = 0.0 self.laser_data = self.thread1.laser_data self.sample_fuc() self.get_rid_of_bad_data() self.int_sample_fuc() self.state = np.array(self.int_sample_laser_data[:]) # self.state = np.array([]) # 用来只有目标点作为状态时 self.state = np.append(self.state, [ int(self.goal_x * 10), int(self.goal_y * 10), int(self.gazebo_robot_pose_x * 10), int(self.gazebo_robot_pose_y * 10) ]) # self.gazebo_robot_speed, self.gazebo_robot_angle]) # reset 函数需要提供初始状态 if self.eps % 30 == 0 or self.eps == 1: # 每30轮random一次 # self.reset_obstacle_pose() # 3步顺序不可乱 # self.reset_obstacle_pose_srv() self.random_goal() print('x: ' + str(self.goal_x) + ' y: ' + str(self.goal_y)) # print(self.goal_x, self.goal_y) self.last_distance_to_goal = self.distance_to_goal() self.last_laser_data = self.sample_laser_data[:] self.state = self.state.reshape((14, )) # self.state = self.state.reshape((4,)) return self.state def step(self, action): assert self.action_space.contains( action), "%r (%s) invalid" % (action, type(action)) # robot 的加速度是有限制的,这里的选取一定要慎重,有两种方案,要么把变化数值都设成很小(不宜,难以快速响应环境) # 要么将publish的频率设限,20hz(+0.1情况下)以下 if action == 0: self.gazebo_robot_speed += 0.05 elif action == 1: self.gazebo_robot_speed -= 0.05 elif action == 2: self.gazebo_robot_angle += 0.05 elif action == 3: self.gazebo_robot_angle -= 0.05 elif action == 4: pass # 保持当前状态 # elif action == 5: # test_action ,later remove # pass else: gym.logger.warn("Unknown situation !") self.gazebo_robot_speed = np.clip(self.gazebo_robot_speed, self.min_speed, self.max_speed) self.gazebo_robot_angle = np.clip(self.gazebo_robot_angle, self.min_angle, self.max_angle) # 与许多其它os不同之处,p2os设置好cmd_vel后,robot会一直保持该速度, # self.cmd_twist.linear.x = 0 # gazebo中robot就算什么指令都没有,还是会有非常小的cmd_twist.linear.x与cmd_twist.angular.z,如果直接publish这些数据,只会造成误差越来越大 if abs(self.gazebo_robot_speed) < 0.025: self.gazebo_robot_speed = 0.0 if abs(self.gazebo_robot_angle) < 0.025: self.gazebo_robot_angle = 0.0 self.cmd_twist.linear.x = self.gazebo_robot_speed self.cmd_twist.angular.z = self.gazebo_robot_angle # print("gazebo: speed" + str(self.gazebo_robot_speed) + ", angle: " + str(self.gazebo_robot_angle)) # 应该新建一个服务,将最新的动作信息同步给该服务,在该服务中负责发布cmd_twist,或者使用线程 time.sleep(0.075) self.act_pub.publish(self.cmd_twist) # 执行完动作,检查一下状态值(这里会不会不太对?因为这里并不是严格的action一下,返回一个动作) self.laser_data = self.thread1.laser_data self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle = \ self.get_gazebo_state() self.sample_fuc() self.get_rid_of_bad_data() self.int_sample_fuc() # print(self.sample_laser_data) # print("first") # print(self.rew_sample_laser_data) self.state = np.array(self.int_sample_laser_data[:]) # self.state = np.array([]) self.state = np.append(self.state, [ int(self.goal_x * 10), int(self.goal_y * 10), int(self.gazebo_robot_pose_x * 10), int(self.gazebo_robot_pose_y * 10) ]) # self.gazebo_robot_speed, self.gazebo_robot_angle]) # self.state = (self.laser_data[0:10], self.gazebo_robot_pose.x, self.gazebo_robot_pose.y, # self.gazebo_robot_speed, self.gazebo_robot_angle) # self.state.append(self.gazebo_robot_pose.x) # self.state.append(self.gazebo_robot_pose.y) # self.state.append(self.gazebo_robot_speed) # self.state.append(self.gazebo_robot_angle) # print(self.sample_laser_data) # 应该把最后的位姿信息也作为限制 if self.distance_to_goal() < 0.5 or min( self.sample_laser_data ) < 0.5: # and (self.gazebo_robot_speed < 0.1): self.done = True self.state = self.state.reshape((14, )) # self.state = self.state.reshape((4, )) reward_num = self.reward() # reward值必须在更新last_distance_ro_goal之前计算 self.last_distance_to_goal = self.distance_to_goal( ) # 更新距离,为下次计算reward准备 self.last_laser_data = self.sample_laser_data[:] # 注意这里,坑.... copy!!! return self.state, reward_num, self.done, {} def distance_to_goal(self): return math.sqrt((self.goal_x - self.gazebo_robot_pose_x)**2 + (self.goal_y - self.gazebo_robot_pose_y)**2) def reward(self): # print(self.sample_laser_data) # print(min(self.sample_laser_data) < 0.5) # 应该把最后的位姿信息也作为reward的限制,与done保持一致,目标距离与障碍物距离之间应该有交叉reward? if self.distance_to_goal() < 0.5: return self.arrive_reward elif min(self.sample_laser_data ) < 0.5: # 有关障碍物距离部分,原始激光数据有噪声(或其它原因,不能直接做判定,甚至达到0.2 ???) # print(self.sample_laser_data) # print("error") return self.collision_reward # 谨慎选择有关距离的reward_fuc, 避免robot有可能出现的刷分行为(与arrive_reward的比例,正负等等) # 有关距离目标距离的部分,应该可以加快学习速度,但是比较难处理,超过目标点之后,跑远了就给它负的reward # 方案一 # else: # return self.road_reward_times * (self.last_distance_ro_goal-self.distance_to_goal()) # 方案二 else: if self.last_distance_to_goal - self.distance_to_goal( ) > 0: # 越来越近 return 1.0 * self.road_reward_times else: # 现在这样原地不动,也是负reward return -1.0 * self.road_reward_times # 加大远离的惩罚,后期去掉? # else: # return 0.0 def render(self, mode='human'): pass # 是否应该换成机器人本身的数据,毕竟真实机器人没有这些 # @staticmethod def get_gazebo_state(self): # 第一个参数model_name,第二个参数为relative_entity_name model_state = self.get_state_srv( 'robot_description', '') # 注意model_state 的类型为GetModelStateResponse robot_pose_x = round(model_state.pose.position.x, 1) # 注意pose为(x,y,z) , 离散, robot_pose_y = round(model_state.pose.position.y, 1) robot_twist_linear_x = round(model_state.twist.linear.x, 4) # 离散化 robot_twist_angular_z = round(model_state.twist.angular.z, 4) # angular.z代表平面机器人的角速度,因为此时z轴为旋转轴 return robot_pose_x, robot_pose_y, robot_twist_linear_x, robot_twist_angular_z def get_obstacle_pose(self): pass # 用来随机重置障碍物的位置坐标 def reset_obstacle_pose(self): for i in range(0, 20): self.obstacle_pose[i][0] = (20 * (np.random.ranf() - 0.5) ) # x~(-10,10) while abs(self.obstacle_pose[i][0]) < 1.0: self.obstacle_pose[i][0] = (20 * (np.random.ranf() - 0.5) ) # 距离初始位置过近 self.obstacle_pose[i][0] = round(self.obstacle_pose[i][0], 1) # 离散化 self.obstacle_pose[i][1] = (20 * (np.random.ranf() - 0.5) ) # y~(-10,10) while abs(self.obstacle_pose[i][1]) < 1.0: self.obstacle_pose[i][1] = (20 * (np.random.ranf() - 0.5) ) # 距离初始位置过近 self.obstacle_pose[i][1] = round(self.obstacle_pose[i][1], 1) # 使用服务,重置障碍物 def reset_obstacle_pose_srv(self): new_obstacle_state = ModelState() new_obstacle_state.pose.orientation.x = 0 new_obstacle_state.pose.orientation.y = 0 new_obstacle_state.pose.orientation.z = 0 new_obstacle_state.pose.orientation.w = 0 new_obstacle_state.twist.linear.x = 0 new_obstacle_state.twist.linear.y = 0 new_obstacle_state.twist.linear.z = 0 new_obstacle_state.twist.angular.x = 0 new_obstacle_state.twist.angular.y = 0 new_obstacle_state.twist.angular.z = 0 new_obstacle_state.reference_frame = "world" for i in range(0, 15): # 10个方块 new_obstacle_state.model_name = "unit_box_" + str(i) new_obstacle_state.pose.position.x = self.obstacle_pose[i][0] new_obstacle_state.pose.position.y = self.obstacle_pose[i][1] new_obstacle_state.pose.position.z = 0 self.set_obstacle_srv(new_obstacle_state) for i in range(0, 5): # 5个圆柱体,不使用球体,容易到处乱滚 new_obstacle_state.model_name = "unit_cylinder_" + str(i) new_obstacle_state.pose.position.x = self.obstacle_pose[i + 15][0] new_obstacle_state.pose.position.y = self.obstacle_pose[i + 15][1] new_obstacle_state.pose.position.z = 0 self.set_obstacle_srv(new_obstacle_state) # 从原激光数据中抽取10组0.0型的激光数据 def sample_fuc(self): self.sample_laser_data[0] = round(self.laser_data[0], 1) # 也许2位更好? self.sample_laser_data[-1] = round(self.laser_data[-1], 1) for i in range(1, 9): self.sample_laser_data[i] = round( self.laser_data[i * 72], 1) # 离散化,保留1位小数,注意round()既非截断,也非四舍五入 for j in range(0, 10): # if self.sample_laser_data[i] == float('inf'): if self.sample_laser_data[j] > 10.0: self.sample_laser_data[j] = 10.0 # # 该函数应该在sample_fuc()之后调用,用于记录一个数组中的几组激光数据,self.rew_sample_laser_data为3×10(暂时) # def add_check_laser_data(self): # data = self.sample_laser_data # print(data) # if self.next_nrsld_index >= len(self.rew_sample_laser_data): # self.rew_sample_laser_data.append(data) # else: # self.rew_sample_laser_data[self.next_nrsld_index] = data # self.next_nrsld_index = (self.next_nrsld_index + 1) % self.num_rew_sample_laser_data # print(self.next_nrsld_index) # def add_check_laser_data(self): # data = self.sample_laser_data # self.rew_sample_laser_data[self.next_nrsld_index] = data # self.next_nrsld_index = (self.next_nrsld_index + 1) % self.num_rew_sample_laser_data # print("what the hell?") # print(self.rew_sample_laser_data) # print(self.next_nrsld_index) # # # 用于恢复<0.3的激光数据 # def get_rid_of_bad_data(self): # for i in range(self.num_laser): # if self.sample_laser_data[i] < 0.3: # for j in range(self.num_rew_sample_laser_data): # if self.rew_sample_laser_data[j][i] > 0.3: # self.sample_laser_data[i] = self.rew_sample_laser_data[j][i] # else: # pass # else: # pass # 用于恢复<0.3的激光数据 def get_rid_of_bad_data(self): for i in range(self.num_laser): if self.sample_laser_data[i] < 0.3: # print("check") # print(self.sample_laser_data) # print(self.last_laser_data) self.sample_laser_data[i] = self.last_laser_data[i] else: pass def int_sample_fuc(self): for j in range(0, 10): self.int_sample_laser_data[j] = int(self.sample_laser_data[j] * 10) # 变成整数 # 找到其它原因,放弃该部分代码 # # 给出一个避免噪声影响的最小值, 还要加上限制条件(inf情况下),如安然变小(噪声),为false # def check_min_laser_data(self): # mean = 0.0 # sigle_bool_check_data = [0 for x in range(0, self.num_rew_sample_laser_data)] # sigle_bool_bool = [0 for x in range(0, self.num_rew_sample_laser_data)] # for i in range(self.num_laser): # for j in range(self.num_rew_sample_laser_data): # sigle_bool_check_data[j] = self.rew_sample_laser_data[j][i] # mean = sum(sigle_bool_check_data)/len(sigle_bool_check_data) # for k in range(self.num_rew_sample_laser_data): # if (sigle_bool_check_data[k] - mean) > 0.1: # (当前状态下不太可能单次变化超过0.1) # sigle_bool_bool[k] = False # 判定为噪声数据 # else: # sigle_bool_bool[k] = True # if False in sigle_bool_bool: # self.bool_check_laser_data[i] = False # 判定该位置为噪声数据 # # 迭代寻找最小数据,未完成 # def __del__(self): # # 等待至线程中止。这阻塞调用线程直至线程的join()方法被调用中止-正常退出或者抛出未处理的异常-或者是可选的超时发生。 # self.thread1.join() # 还是不对 # def close(self): self.thread1.join() # 还是不对
class Version5(MontageWfEnv): """ @version 5.0: - Run with real env - No Sampling """ def __init__(self): # Montage Experiment Variable super(Version5, self).__init__() self.action_range = 20 self.cluster_range = 20 self.action_space = Discrete(self.action_range) self.observation_space = Discrete(self.cluster_range) # Episode Conf # Best exec_time: None or 1, depends on reward version self.best_exec_time = None self.last_exec_time = None self.last_action = None self.last_reward = None self.total_reward = 0.0 self.all_makespan_record = list() self.all_benchmark_record = list() # 0: Ntg, 1: improve, 2: degrade self.is_improve = 0 self.seed() self.reset() def step(self, action, training=False): assert self.action_space.contains(action) action += 1 done = False reward = 0.0 # res = self.run_static_experiment(self.clusters_size, self.clusters_num) res = self.run_demo_cn_gen_experiment(action) self.all_makespan_record.append(res) self.exec_time = res # Rewarding / Penalty Judgement percentile = np.percentile(self.all_makespan_record, 10) benchmark = np.mean(self.all_benchmark_record) # Calc improve percentage if len(self.all_benchmark_record) == 0: self.all_benchmark_record.append(percentile) elif abs(percentile - benchmark) / benchmark * 100 > 10: self.all_benchmark_record.append(percentile) benchmark = np.mean(self.all_benchmark_record) # print(benchmark) # if self.exec_time < np.percentile(self.all_makespan_record, 10): if self.exec_time < benchmark: reward = 10 else: reward = -1 self.total_reward += reward if self.total_reward > 200: done = True return self._get_obs(), reward, done, { "exec": self.exec_time, "overhead": self.exec_time, "makespan": self.exec_time, "benchmark": benchmark } def render(self, mode='human'): outfile = StringIO() if mode == 'ansi' else sys.stdout init_msg = "Current Experiment Parameters: degree: %d\t cluster.size: %d\t cluster.num: %d\t\n" % ( self.degree, self.clusters_size, self.clusters_num) outfile.write(init_msg) # if self.last_action is not None: # cs, cn = action result_str = "Current Execution Time: \t" expect_str = "Best Execution Time: \t" action_str = "Current Action: \t" # Process Outputs outfile.write(result_str + (" %s " % self.exec_time) + "\n") outfile.write(expect_str + (" %s " % self.best_exec_time) + "\n") outfile.write(action_str + (" %s " % self.last_action) + "\n") return outfile def reset(self): # Reset method should always return a new sets of episode settings # self.degree = 0.1 # self.clusters_size = 1 # self.clusters_num = 1 # self.band_num = 1 # self.best_exec_time = None if self.exec_time is not None: self.last_exec_time = self.exec_time self.wall_time = None self.cum_wall_time = None self.total_reward = 0 self.clusters_size = np.random.randint(1, 30) self.clusters_num = np.random.randint(1, 30) # print("Environment had been reset!") return np.random.randint(1, self.cluster_range + 1) # , self.clusters_num def _get_obs(self): return np.random.randint(1, self.cluster_range + 1)
class Version11(MontageWfEnv): """ @version 11.0: """ def __init__(self): # Montage Experiment Variable super(Version11, self).__init__() self.action_range = 10 self.cluster_range = 10 self.action_space = Discrete(self.action_range) self.observation_space = Discrete(self.cluster_range) self.cluster_size = 1 # Episode Conf # Best exec_time: None or 1, depends on reward version self.best_makespan = None self.last_makespan = None self.last_action = None self.reward = 0 self.total_reward = 0.0 self.exec_records = {} self.all_exec_record = list() self.all_overhead_record = list() self.all_makespan_record = list() self.all_benchmark_record = list() self.seed() self.reset() def step(self, action, training=False): assert self.action_space.contains(action) action += 1 done = False reward = 0.0 # Return all the data collected # makespan = self.run_gen_experiment(action) makespan = self.run_cn_gen_experiment(action) # status, jb, wt, cwt = self.run_experiment(action) # Experiment run failed -> High Penalty # if not status: # return self._get_obs(), 0, True, {} # # jb = 0 if jb is None else jb # wt = 0 if wt is None else wt # cwt = 0 if cwt is None else cwt # # makespan = jb self.all_makespan_record.append(makespan) if not training: # Setting up best exec # if self.best_makespan is None: # self.best_makespan = makespan # if self.last_makespan is None: # self.last_makespan = makespan # Rewarding / Penalty Judgement percentile = np.percentile(self.all_makespan_record, 10) benchmark = np.mean(self.all_benchmark_record) # Calc improve percentage if len(self.all_benchmark_record) == 0: self.all_benchmark_record.append(percentile) elif abs(percentile - benchmark) / benchmark * 100 > 10: self.all_benchmark_record.append(percentile) benchmark = np.mean(self.all_benchmark_record) # else: # print(abs(percentile - benchmark) / benchmark * 100) # self.all_benchmark_record.append(percentile) # benchmark = percentile if makespan < benchmark: reward = 1 else: reward = -1 self.last_makespan = makespan self.total_reward += reward self.last_action = action if self.total_reward > 20 or self.total_reward < -20: done = True return self._get_obs(), reward, done, { "exec": makespan, "overhead": makespan, "makespan": makespan, "benchmark": benchmark } def reset(self): self.total_reward = 0.0 return np.random.randint(1, self.cluster_range + 1) # , self.clusters_num def _get_obs(self): return np.random.randint(1, self.cluster_range + 1)
class AdaptiveLearningEnv(gym.Env): metadata = { 'render.modes': ['human', 'rgb_array'] } def __init__(self, filename='activities.pkl'): self.filename = filename self.assets_dir = os.path.dirname(os.path.abspath(__file__)) self.reward_range = (0, 1) self.viewer = None self.circle_indexs = [] self.ob = None self._configure() self._seed() self._reset() def _configure(self): self._load_activities() self.action_space = Discrete(len(self.activities)) self.observation_space = Box(0, 1, len(self.knowledges)) # self.simulator = StudentSimulator() def _load_activities(self): data_file = os.path.join(self.assets_dir, 'assets/%s' % self.filename) pkl_file = open(data_file, 'rb') self.knowledges = pickle.load(pkl_file) self.activities = pickle.load(pkl_file) pkl_file.close() def _step(self, action): assert self.action_space.contains(action) a = Activity(self.activities[action], self.knowledges) ob, reward, done = self.simulator.progress(self.ob, a) self.ob = ob return ob, reward, done, {} def _reset(self): self.ob = Box(0.1, 0.1, len(self.knowledges)).sample() return self.ob def _seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] def _render(self, mode='human', close=False): if close: if self.viewer is not None: self.viewer.close() self.viewer = None return screen_width = 600 screen_height = 400 radius = 10 init_alpha = 0.1 margin = radius * 2.5 max_per_line = (screen_width - margin * 2) / margin colors = np.array([[78,191,126], [254,178,45], [175,101,194]])/255. if self.viewer is None: from gym.envs.classic_control import rendering self.viewer = rendering.Viewer(screen_width, screen_height) for (i, x) in enumerate(sorted(self.knowledges, key=lambda tup:(tup.level(), tup.group), reverse=True)): h, w = divmod(i, max_per_line) self.circle_indexs.append(x._id) w = screen_width - 20 - w * (margin + 10) h = screen_height - 20 - h * (margin + 10) t = self.viewer.draw_circle(radius) t.add_attr(rendering.Transform((w, h))) r, g, b = colors[x.level() - 1] t.set_color(r, g, b, init_alpha) self.viewer.add_geom(t) for i, x in enumerate(self.ob): if len(self.circle_indexs) != 0: t = self.viewer.geoms[self.circle_indexs.index(i)] k = self.knowledges[i] r, g, b = colors[k.level() - 1] t.set_color(r, g, b, x) return self.viewer.render(return_rgb_array = mode=='rgb_array')
class Simple_Copy_Repeat_ENV(Env): ALPHABET = list(string.ascii_uppercase[:26]) def __init__(self, n_char=5, size=6, repeat=3): """ :param n_char: number of different chars in inputs, e.g. 3 => {A,B,C} :param size: the length of input sequence :param repeat: the expected repeat times of the target output """ self.n_char = n_char self.size = size self.repeat = repeat # observation (characters) self.observation_space = Discrete( n_char + 1) # +1: empty symbol, whose index is n_char # action self.action_space = Discrete(n_char) # states of an episode self.position = None self.last_action = None self.last_reward = None self.episode_total_reward = None self.input_str = None self.target_str = None self.output_str = None self.np_random = None self.seed() self.reset() @property def input_length(self): return len(self.input_str) @property def target_length(self): return self.input_length * self.repeat def seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] def reset(self): self.position = 0 self.last_action = None self.last_reward = None self.episode_total_reward = 0.0 self.input_str, self.target_str = self._generate_input_target() self.output_str = '' obs_char, obs_idx = self._get_observation() return obs_idx def step(self, action): assert self.action_space.contains(action) assert 0 <= self.position < self.target_length target_act = self.ALPHABET.index(self.target_str[self.position]) reward = 1.0 if action == target_act else -1.0 self.last_action = action self.last_reward = reward self.episode_total_reward += reward self.output_str += self.ALPHABET[action] self.position += 1 if self.position < self.target_length: done = False _, obs = self._get_observation() else: done = True obs = None info = {"target_act": target_act} return obs, reward, done, info def render(self, mode='human'): outfile = sys.stdout # TODO: other mode pos = self.position - 1 o_str = "" if pos > -1: for i, c in enumerate(self.output_str): color = 'green' if self.target_str[i] == c else 'red' o_str += colorize(c, color, highlight=True) outfile.write("=" * 20 + "\n") outfile.write("Length : " + str(self.input_length) + "\n") outfile.write("T-Length : " + str(len(self.target_str)) + "\n") outfile.write("Input : " + self.input_str + "\n") outfile.write("Target : " + self.target_str + "\n") outfile.write("Output : " + o_str + "\n") if self.position > 0: outfile.write("-" * 20 + "\n") outfile.write("Current reward: %.2f\n" % self.last_reward) outfile.write("Cumulative reward: %.2f\n" % self.episode_total_reward) outfile.write("\n") return def _generate_input_target(self): input_str = "" for i in range(self.size): c = self.np_random.choice(self.ALPHABET[:self.n_char]) input_str += c target_str = "" for i in range(self.repeat): if i % 2 == 1: target_str += input_str[::-1] else: target_str += input_str return input_str, target_str def _get_observation(self, pos=None): if pos is None: pos = self.position if pos >= self.input_length: obs_char = '' obs_idx = self.n_char else: obs_char = self.input_str[pos] obs_idx = self.ALPHABET.index(obs_char) return obs_char, obs_idx
class TigerEnv(gym.Env): metadata = {"render.modes": ["human", "ansi"]} def __init__(self, seed=0, correct_prob=.85): self.correct_prob = correct_prob self.action_space = Discrete(len(Action)) self.state_space = Discrete(len(State)) self.observation_space = Discrete(len(Obs)) self._discount = .95 self._reward_range = (-float(100), float(10)) self._query = 0 self.seed(seed) def reset(self): self.done = False self.t = 0 self._query = 0 self.state = self.state_space.sample() self.last_action = Action.LISTEN.value return Obs.NULL.value def seed(self, seed=1234): np.random.seed(seed) return [seed] def step(self, action): assert self.action_space.contains(action) assert self.done is False self.t += 1 self._query += 1 self.last_action = action rw = TigerEnv._compute_rw(self.state, action) if TigerEnv._is_terminal(self.state, action): self.done = True return self.state, rw, self.done, {'state': self.state} self._sample_state(action) ob = TigerEnv._sample_ob(action, self.state) self.done = False return ob, rw, self.done, {"state": self.state} def render(self, mode='human', close=False): if close: return if mode == "human": if not hasattr(self, "gui"): self.gui = TigerGui() msg = "A: " + action_to_str( self.last_action) + " S: " + state_to_str(self.state) self.gui.render(state=(self.last_action, self.state), msg=msg) elif mode == "ansi": print("Current step: {}, tiger is in state: {}, action took: {}". format(self.t, self.state, self.last_action[0])) else: raise NotImplementedError() def close(self): self._render(close=True) def _set_state(self, state): self.state = state self.done = False def _generate_legal(self): return list(range(self.action_space.n)) def _generate_preferred(self, history): return self._generate_legal() def _sample_state(self, action): if action == Action.RIGHT.value or action == Action.LEFT.value: self.state = self.state_space.sample() def _get_init_state(self): # fix initial belief to be exact return self.state_space.sample() @staticmethod def _compute_prob(action, next_state, ob, correct_prob=.85): p_ob = 0.0 if action == Action.LISTEN.value and ob != Obs.NULL.value: if (next_state == State.LEFT.value and ob == Obs.LEFT.value) or ( next_state == State.RIGHT.value and ob == Obs.RIGHT.value): p_ob = correct_prob else: p_ob = 1 - correct_prob elif action != Action.LISTEN.value and ob == Obs.NULL.value: p_ob = 1. assert p_ob >= 0.0 and p_ob <= 1.0 return p_ob @staticmethod def _sample_ob(action, next_state, correct_prob=.85): ob = Obs.NULL.value p = np.random.uniform() if action == Action.LISTEN.value: if next_state == State.LEFT.value: ob = Obs.RIGHT.value if p > correct_prob else Obs.LEFT.value else: ob = Obs.LEFT.value if p > correct_prob else Obs.RIGHT.value return ob @staticmethod def _local_move(state, last_action, last_ob): raise NotImplementedError() @staticmethod def _is_terminal(state, action): is_terminal = False if action != Action.LISTEN.value: is_terminal = ( (action == Action.LEFT.value and state == State.LEFT.value) or (action == Action.RIGHT.value and state == State.RIGHT.value)) return is_terminal @staticmethod def _compute_rw(state, action): if action == Action.LISTEN.value: reward = -1 elif not TigerEnv._is_terminal(state, action): reward = 10 else: reward = -100 return reward
class TagEnv(Env): metadata = {"render.modes": ["human", "ansi"]} def __init__(self, num_opponents=1, move_prob=.8, obs_cells=29, board_size=(10, 5)): self.num_opponents = num_opponents self.move_prob = move_prob self._reward_range = 10 * self.num_opponents self._discount = .95 self.action_space = Discrete(len(Action)) self.grid = TagGrid(board_size, obs_cells=obs_cells) self.observation_space = Discrete(self.grid.n_tiles + 1) self.time = 0 self.done = False self.time = 0 self.last_action = 4 self.state = None def reset(self): self.done = False self.time = 0 self.last_action = 4 self.state = self._get_init_state(False) # get agent position return self._sample_ob(state=self.state, action=0) def seed(self, seed=None): np.random.seed(seed) return [seed] def step(self, action: int): # state err_msg = "%r (%s) invalid" % (action, type(action)) assert self.action_space.contains(action), err_msg assert self.done is False reward = 0. self.time += 1 self.last_action = action assert self.grid.is_inside(self.state.agent_pos) assert self.grid.is_inside(self.state.opponent_pos[0]) if action == 4: tagged = False for opp, opp_pos in enumerate(self.state.opponent_pos): # check if x==x_agent and y==y_agent if opp_pos == self.state.agent_pos: reward = 10. tagged = True # self.state.opponent_pos[opp] = Coord(4, 4) self.state.num_opp -= 1 # self.state.opponent_pos.pop(opp) elif self.grid.is_inside(self.state.opponent_pos[opp]) and\ self.state.num_opp > 0: self.move_opponent(opp) if not tagged: reward = -10. else: reward = -1. next_pos = self.state.agent_pos + Moves.get_coord(action) if self.grid.is_inside(next_pos): self.state.agent_pos = next_pos ob = self._sample_ob(self.state, action) assert ob < self.grid.n_tiles + 1 # p_ob = self._compute_prob(action, self.state, ob) self.done = self.state.num_opp == 0 # self._encode_state(self.state)} return ob, reward, self.done, {"state": self.state} def render(self, mode="human", close=False): # return if close: return # if mode == "human": # agent_pos = self.grid.get_index(self.state.agent_pos) # opponent_pos = [self.grid.get_index(opp) for # opp in self.state.opponent_pos] # if self.gui is None: # self.gui = TagGui(board_size=self.grid.get_size, # start_pos=agent_pos, obj_pos=opponent_pos) # msg = "S: " + str(self.state) + " T: " + str(self.time) +\ # " A: " + action_to_str( # self.last_action) # self.gui.render(state=(self.state.agent_pos, # self.state.opponent_pos), msg=msg) def _encode_state(self, state): s = np.zeros(self.num_opponents + 1, dtype=np.int32) - 1 s[0] = self.grid.get_index(state.agent_pos) for idx, opp in zip(range(1, len(s)), state.opponent_pos): opp_idx = self.grid.get_index(opp) s[idx] = opp_idx return s def _decode_state(self, state): agent_idx = state[0] tag_state = TagState(self.grid.get_tag_coord(agent_idx)) for opp_idx in state[1:]: if opp_idx > -1: tag_state.num_opp += 1 opp_pos = self.grid.get_tag_coord(opp_idx) tag_state.opponent_pos.append(opp_pos) return tag_state def _get_init_state(self, should_encode=False): agent_pos = self.grid.sample() assert self.grid.is_inside(agent_pos) tag_state = TagState(agent_pos) for opp in range(self.num_opponents): opp_pos = self.grid.sample() assert self.grid.is_inside(opp_pos) tag_state.opponent_pos.append(opp_pos) tag_state.num_opp = self.num_opponents assert len(tag_state.opponent_pos) > 0 if not should_encode: return tag_state else: return self._encode_state(tag_state) def _set_state(self, state): # self.reset() self.done = False # self.state = self._decode_state(state) self.state = state def move_opponent(self, opp): opp_pos = self.state.opponent_pos[opp] actions = self._admissable_actions(self.state.agent_pos, opp_pos) if np.random.binomial(1, self.move_prob): move = np.random.choice(actions).value if self.grid.is_inside(opp_pos + move): self.state.opponent_pos[opp] += move def _compute_prob(self, next_state, ob): # next_state = self._decode_state(next_state) p_ob = int(ob == self.grid.get_index(next_state.agent_pos)) if ob == self.grid.n_tiles: for opp_pos in next_state.opponent_pos: if opp_pos == next_state.agent_pos: return 1. return p_ob def _sample_ob(self, state, action): ob = self.grid.get_index(state.agent_pos) # agent index # ob = self.grid.board[self.state.agent_pos] if action < Action.TAG.value: for opp_pos in state.opponent_pos: if opp_pos == state.agent_pos: ob = self.grid.n_tiles # 29 observation return ob def generate_legal(self): return list(range(self.action_space.n)) def _generate_preferred(self, history): actions = [] if history.size == 0: return self.generate_legal() if history[-1].ob == self.grid.n_tiles and\ self.grid.is_corner(self.state.agent_pos): actions.append(Action.TAG.value) return actions for d in range(4): if history[-1].action != self.grid.opposite(d) and\ self.grid.is_inside( self.state.agent_pos + Moves.get_coord(d)): actions.append(d) assert len(actions) > 0 return actions # def _local_move(self, state, last_action, last_ob): # if len(state.opponent_pos) > 0: # opp = np.random.randint(len(state.opponent_pos)) # else: # return False # # if state.opponent_pos[opp] == Coord(-1, -1): # return False # state.opponent_pos[opp] = self.grid.sample() # if last_ob != self.grid.get_index(state.agent_pos): # state.agent_pos = self.grid.get_tag_coord(last_ob) # # ob = self._sample_ob(state, last_action) # return ob == last_ob @staticmethod def _admissable_actions(agent_pos, opp_pos): actions = [] if opp_pos.x >= agent_pos.x: actions.append(Moves.EAST) if opp_pos.y >= agent_pos.y: actions.append(Moves.NORTH) if opp_pos.x <= agent_pos.x: actions.append(Moves.WEST) if opp_pos.y <= agent_pos.y: actions.append(Moves.SOUTH) if opp_pos.x == agent_pos.x and opp_pos.y > agent_pos.y: actions.append(Moves.NORTH) if opp_pos.y == agent_pos.y and opp_pos.x > agent_pos.x: actions.append(Moves.EAST) if opp_pos.x == agent_pos.x and opp_pos.y < agent_pos.y: actions.append(Moves.SOUTH) if opp_pos.y == agent_pos.y and opp_pos.x < agent_pos.x: actions.append(Moves.WEST) assert len(actions) > 0 return actions
class Version2(MontageWfEnv): """ @version 2: Greedy Approach --> Proved method correct, reward wrong - Known fact that best exec time is 100 All actions will be given -1, goal to find the lowest 2nd score """ def __init__(self, degree=0.1, band_num=1, db_dir=".pegasus/workflow.db"): # Montage Experiment Variable super(Version2, self).__init__() self.degree = degree self.clusters_size = 1 self.clusters_num = 1 self.is_clusters_size = True self.is_clusters_num = False self.band_num = band_num # Setting database connection # self.db = DatabaseEnv(db_dir) self.action_space = Discrete(3) self.observation_space = Discrete(10) # , Discrete(8), Discrete(3) # Episode Conf # Best exec_time: None or 1, depends on reward version self.best_exec_time = None self.last_exec_time = None self.last_action = None self.last_reward = None self.total_reward = 0.0 # 0: Ntg, 1: improve, 2: degrade self.is_improve = 0 self.seed() self.reset() def step(self, action, training=False): assert self.action_space.contains(action) def calc_lb_hb(v, p): return (v * (100 - p)) / 100, (v * (p + 100)) / 100 reward = 0.0 self.last_action = action done = False if action == 1: self.clusters_size += 1 elif action == 2: self.clusters_size -= 1 # elif action == 3: # self.clusters_num += 1 # elif action == 4: # self.clusters_num -= 1 # Range Guarding Function if self.clusters_size <= 0: reward -= 1.0 self.clusters_size = 1 elif self.clusters_size > 30: reward -= 1.0 self.clusters_size = 30 # elif self.clusters_num <= 0: # reward -= 1.0 # self.clusters_num = 1 # elif self.clusters_num > 10: # reward -= 1.0 # self.clusters_num = 10 else: # res = self.run_static_experiment(self.clusters_size, self.clusters_num) res = self.run_static_experiment(self.clusters_size) self.exec_time = res if self.best_exec_time is None: self.best_exec_time = res if self.last_exec_time is None: self.last_exec_time = res if self.exec_time < 300: reward = 2 else: reward = -1 self.total_reward += reward if self.total_reward > 10: done = True return self._get_obs(), reward, done, { "exec": self.exec_time, "overhead": self.exec_time, "makespan": self.exec_time, "benchmark": 200 } def render(self, mode='human'): outfile = StringIO() if mode == 'ansi' else sys.stdout init_msg = "Current Experiment Parameters: degree: %d\t cluster.size: %d\t cluster.num: %d\t\n" % ( self.degree, self.clusters_size, self.clusters_num) outfile.write(init_msg) # if self.last_action is not None: # cs, cn = action result_str = "Current Execution Time: \t" expect_str = "Best Execution Time: \t" action_str = "Current Action: \t" # Process Outputs outfile.write(result_str + (" %s " % self.exec_time) + "\n") outfile.write(expect_str + (" %s " % self.best_exec_time) + "\n") outfile.write(action_str + (" %s " % self.last_action) + "\n") return outfile def reset(self): # Reset method should always return a new sets of episode settings # self.degree = 0.1 # self.clusters_size = 1 # self.clusters_num = 1 # self.band_num = 1 # self.best_exec_time = None if self.exec_time is not None: self.last_exec_time = self.exec_time self.wall_time = None self.cum_wall_time = None self.total_reward = 0 self.clusters_size = np.random.randint(1, 30) self.clusters_num = np.random.randint(1, 30) # print("Environment had been reset!") return self.clusters_size def _get_obs(self): return self.clusters_size
class PocEnv(Env): def __init__(self): self.board = select_maze("micro") self._get_init_state() self.action_space = Discrete(4) self.observation_space = Box(low=0, high=1, shape=(8, )) self._reward_range = 100 self.step_cnt = 0 def seed(self, seed=None): np.random.seed(seed) def _passable(self, pos): return self.maze[pos.x, pos.y] != Element.WALL def _inside(self, pos): if 0 <= pos.x < self.maze.shape[0] and 0 <= pos.y < self.maze.shape[1]: return True return False def step(self, action): assert self.action_space.contains(action) assert self.done is False self.step_cnt += 1 reward = -1 next_pos = self.next_pos(self.internal_state.agent_pos, action) self.internal_state.agent_pos = next_pos if self.internal_state.power_duration > 0: self.internal_state.power_duration -= 1 agent_in_power = self.internal_state.power_duration > 0 hit_ghost = set() for g, ghost in enumerate(self.internal_state.ghosts): # check if the ghost hits the agent before and after it moves if ghost.pos == self.internal_state.agent_pos: hit_ghost.add(g) else: ghost.move(self.internal_state.agent_pos, agent_in_power) if ghost.pos == self.internal_state.agent_pos: hit_ghost.add(g) hit_ghost = list(hit_ghost) for g in hit_ghost: if self.internal_state.power_duration > 0: reward += 25 self.internal_state.ghosts[g].reset() else: reward += -100 self.done = True break if self.step_cnt > self.board["_max_step"]: self.done = True if self._agent_at_food(): reward += 10 self.maze[self.internal_state.agent_pos.x, self.internal_state.agent_pos.y] = Element.CLEAR_WALK_WAY if self._food_left() == 0: self.done = True if self._agent_at_power(): self.internal_state.power_duration = self.board["_power_duration"] self.maze[self.internal_state.agent_pos.x, self.internal_state.agent_pos.y] = Element.CLEAR_WALK_WAY reward += 10 ob = self._make_ob(action) return ob, reward, self.done, {"state": self.internal_state} def _agent_at_food(self): agent_pos = self.internal_state.agent_pos if self.maze[agent_pos.x, agent_pos.y] == Element.FOOD_PELLET: return True return False def _agent_at_power(self): agent_pos = self.internal_state.agent_pos if self.maze[agent_pos.x, agent_pos.y] == Element.POWER: return True return False def _make_ob(self, action): """ Return 10 state features of observation: 4 features indicating whether the agent can see a ghost in that direction (UP, RIGHT, DOWN, LEFT) 4 features indicating whether he can feel a wall in each of the cardinal directions, which is set to 1 if he is adjacent to a wall 1 feature indicating whether he can hear a ghost, which is set to 1 if he is within Manhattan distance 2 of a ghost 1 feature indicating whether he can smell food (adjacent or diagonally adjacent to any food) """ ob = np.zeros(STATE_DIM) for i, action in enumerate(ACTIONS): if self._see_ghost(action): ob[i] = 1 next_pos = self.next_pos(self.internal_state.agent_pos, action) # If an agent couldn't move from the current position, then there is a wall if next_pos == self.internal_state.agent_pos: ob[i + len(ACTIONS)] = 1 if self._hear_ghost(): ob[2 * len(ACTIONS)] = 1 if self._smell_food(): ob[2 * len(ACTIONS) + 1] = 1 return ob def _see_ghost(self, action): agent_pos = self.internal_state.agent_pos for ghost in self.internal_state.ghosts: if agent_pos.x != ghost.pos.x and agent_pos.y != ghost.pos.y: continue if ((action == Action.UP and ghost.pos.x < agent_pos.x) or (action == Action.DOWN and ghost.pos.x > agent_pos.x) or (action == Action.LEFT and ghost.pos.y < agent_pos.y) or (action == Action.RIGHT and ghost.pos.y > agent_pos.y) ) and not self._wall_between(agent_pos, ghost.pos): return True return False def _smell_food(self): smell_range = self.board["_smell_range"] agent_pos = self.internal_state.agent_pos for x in range(-smell_range, smell_range + 1): for y in range(-smell_range, smell_range + 1): smell_x = agent_pos.x + x smell_y = agent_pos.y + y if (0 <= smell_x < self.maze.shape[0] and 0 <= smell_y < self.maze.shape[1] and self.maze[smell_x, smell_y] == Element.FOOD_PELLET): return True return False def _hear_ghost(self): for ghost in self.internal_state.ghosts: if (manhattan_distance(ghost.pos, self.internal_state.agent_pos) <= self.board["_hear_range"]): return True return False def _wall_between(self, pos1, pos2): assert pos1 != pos2 assert pos1.x == pos2.x or pos1.y == pos2.y if pos1.y == pos2.y: for i in range(min(pos1.x, pos2.x) + 1, max(pos1.x, pos2.x)): if self.maze[i, pos1.y] == Element.WALL: return True elif pos1.x == pos2.x: for i in range(min(pos1.y, pos2.y), max(pos1.y, pos2.y)): if self.maze[pos1.x, i] == Element.WALL: return True return False def _food_left(self): return np.sum(self.maze == Element.FOOD_PELLET) def reset(self): self.done = False self.step_cnt = 0 self._get_init_state() return 0 def _get_init_state(self): self.maze = self.board["_maze"].copy() self.internal_state = InternalState() self.internal_state.agent_pos = Position(*self.board["_poc_home"]) Ghost.max_x = self.maze.shape[0] Ghost.max_y = self.maze.shape[1] ghost_home = Position(*self.board["_ghost_home"]) Ghost.home = ghost_home for _ in range(self.board["_num_ghosts"]): pos = Position(ghost_home.x, ghost_home.y) self.internal_state.ghosts.append( Ghost( self, pos, direction=random_action(), ghost_range=self.board["_ghost_range"], )) return self.internal_state def next_pos(self, pos, action): x_offset, y_offset = 0, 0 if action == Action.UP: x_offset = -1 y_offset = 0 elif action == Action.DOWN: x_offset = 1 y_offset = 0 elif action == Action.RIGHT: x_offset = 0 y_offset = 1 elif action == Action.LEFT: x_offset = 0 y_offset = -1 next_pos = Position(pos.x + x_offset, pos.y + y_offset) if self._inside(next_pos) and self._passable(next_pos): return next_pos else: return pos def print_internal_state(self): print("Step", self.step_cnt) print_maze = self.maze.astype(str) print_maze[self.internal_state.agent_pos.x, self.internal_state.agent_pos.y] = "A" ghost_str = "" for g, ghost in enumerate(self.internal_state.ghosts): print_maze[ghost.pos.x, ghost.pos.y] = "G" ghost_str += "Ghost {} at {}, direction={}, type={}\n".format( g, ghost.pos, ACTION_DICT[ghost.direction], ghost.move_type) np.set_printoptions(formatter={"str_kind": lambda x: x}) print("Maze: \n{}".format(print_maze)) print("Agent at {}, power duration {}".format( self.internal_state.agent_pos, self.internal_state.power_duration)) print(ghost_str[:-1]) def print_ob(self, ob): ob_str = "" for i, action in enumerate(ACTIONS): if ob[i] == 1: ob_str += " SEE GHOST {},".format(ACTION_DICT[action]) for i, action in enumerate(ACTIONS): if ob[i + len(ACTIONS)] == 1: ob_str += " FEEL WALL {},".format(ACTION_DICT[action]) if ob[-2]: ob_str += " HEAR GHOST," if ob[-1]: ob_str += " SMELL FOOD," return ob_str
class SSingleRobGpw(gym.Env): def __init__(self): rospy.init_node("rl_robot_node"+str(ac_num)) self.pub, self.reset_robot_state = reset_node_init() self.act_pub = rospy.Publisher('/robot'+str(ac_num)+'/pioneer/cmd_vel', Twist, queue_size=1) # 当前每步为16个状态 if enable_correction: self.observation_space = gym.spaces.Box(0, 100, (64,), dtype=np.dtype(np.int32)) # 整数版 else: self.observation_space = gym.spaces.Box(-100, 100, (64, ), dtype=np.dtype(np.int32)) # 整数版 # self.observation_space = gym.spaces.Box(-10.0, 10.0, (4, ), dtype=np.dtype(np.float32)) self.action_space = Discrete(6) # 取值0,1,2,3,4,5 , 动作是否应该加上组合? self.state = None # self.laser_data = [0 for i in range(0, 10)] self.num_laser = 10 # 别乱改,有些相关数据没用这个参数 self.laser_data = None self.sample_laser_data = [0 for x in range(0, self.num_laser)] # 特定抽取的data, 位置从右向左 self.int_sample_laser_data = [0 for x in range(0, self.num_laser)] # 特定抽取的data, 位置从右向左,整数版 # 用于修正激光数据 # self.last_laser_data = [0 for x in range(0, self.num_laser)] # self.current_sate = None # self.speed = 0.0 # 运行一段时间之后,记录的speed、angle就会与实际(gazebo)有较大差距 self.state_speed = 0.0 # 作为状态用的speed 离散整数化 self.state_angle = 0.0 # angular speed self.s_state = None self.min_speed = 0.0 # -0.75 # 线速度 m/s ;角速度 rad/s(6弧度相当于一个整圆),要想倒车必须在车身后部安装测距手段 self.max_speed = 0.5 # 速度不能高于0.75,已经开始出现误差 0.5m/s因为环境障碍物比较密集 # self.angle = 0.0 self.min_angle = - 0.5 self.max_angle = 0.5 self.cmd_twist = Twist() self.done = False self.thread1 = LaserNodeThread() self.thread1.setDaemon(True) self.thread1.start() # 获取位置服务 rospy.wait_for_service('gazebo/get_model_state') self.get_state_srv = rospy.ServiceProxy('/gazebo/get_model_state', GetModelState) # 重置位置服务 rospy.wait_for_service('gazebo/set_model_state') self.set_obstacle_srv = rospy.ServiceProxy('/gazebo/set_model_state', SetModelState) # self.obstacle_pose = [[0 for x in range(0, 2)] for x in range(0, 20)] # 20个障碍物的位置[x, y] # gazebo返回的robot数据 self.gazebo_robot_pose_x = None # 可以考虑为其注入噪声 self.gazebo_robot_pose_y = None self.gazebo_robot_speed = 0.0 self.gazebo_robot_angle = 0.0 self.state_distance = 0.0 self.state_angle = 0.0 self.last_robot_pose_x = 0.0 self.last_robot_pose_y = 0.0 # reward 相关 self.arrive_reward = 100.0 self.collision_reward = -100.0 self.road_reward_times = 0.1 self.last_distance_to_goal = 0.0 # 上一次动作时距离目标的距离 self.distance_to_goal_threshold = 0.25 self.collision_threshold = 0.43 # 至少比车身宽 0.5 如果laser位于中央,0.3不足以做到覆盖车身距离 self.distance_r_sate = 0 # 设置最终robot位姿 self.goal_x = 0.0 self.goal_y = 10.0 self.desire_final_speed = None self.desire_final_angle = None self.eps = 0 # 记录当前迭代轮数 self.seed() self.steps = 0 # for debug def seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] # fixed version def reset_goal(self): goal_num = self.np_random.randint(0, 2) # self.goal_x = self.cc_func(round(goal_sequence[ac_num][goal_num][0], 1)) # self.goal_y = self.cc_func(round(goal_sequence[ac_num][goal_num][1], 1)) self.goal_x = round(goal_sequence[ac_num][goal_num][0], 1) self.goal_y = round(goal_sequence[ac_num][goal_num][1], 1) def compute_distance(self, lhx, lhy, rhx, rhy): return math.sqrt((lhx - rhx) ** 2 + (lhy - rhy) ** 2) # 使用相邻两次移动计算robot朝向,移动距离很小的话(特别有噪声的情况下,怎么可能计算的准确?需要借助IMU?) def compute_polar_coordinates(self): edge_a = self.compute_distance(self.last_robot_pose_x, self.last_robot_pose_y, self.gazebo_robot_pose_x, self.gazebo_robot_pose_y) edge_b = self.compute_distance(self.last_robot_pose_x, self.last_robot_pose_y, self.goal_x, self.goal_y) edge_c = self.compute_distance(self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.goal_x, self.goal_y) cos_angle_beta = (edge_a**2 + edge_c**2 - edge_b**2)/(2*edge_a*edge_c) self.state_distance = edge_c # 数值需进行处理 self.state_angle = math.pi - math.acos(cos_angle_beta) # 数值需进行处理 # def distance_to_obstacle(self, num_x, num_y, count): # return math.sqrt((self.obstacle_pose[count][0] - num_x) ** 2 + (self.obstacle_pose[count][1] - num_y) ** 2) def state_speed_wrapper(self): self.state_speed = round(self.gazebo_robot_speed, 4) * 200 self.state_speed = int(self.state_speed) # (0~100) self.state_angle = round(self.gazebo_robot_angle, 4) * 100 self.state_angle = int(self.state_angle) + 50 # (0~100) def reset(self): self.steps = 0 # for debug self.eps += 1 self.done = False # 每100轮random一次 if self.eps % 100 == 0 or self.eps == 1: self.reset_goal() # robot 的初始朝向也随机化 self.reset_robot_state.pose.orientation.z = self.np_random.uniform(low=-0.9, high=0.9) self.reset_robot_state.pose.orientation.w = self.np_random.uniform(low=-0.9, high=0.9) print('actor'+str(ac_num)+' --> x: ' + str(self.goal_x) + ' y: ' + str(self.goal_y)) # 重置robot的 speed angle 注意各步骤执行顺序 self.cmd_twist.linear.x = 0.0 self.cmd_twist.angular.z = 0.0 # 将robot重置回原位置 start_time = time.time() while (time.time() - start_time) < 0.5: # 这里不使用srv直接重置, self.act_pub.publish(self.cmd_twist) # 先重置速度 self.pub.publish(self.reset_robot_state) time.sleep(0.05) # time.sleep(0.5) # 重置后需要一定时间等待激光等数据更新... # 重置state中的pose speed angle self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle = \ self.get_gazebo_state() self.gazebo_robot_speed = 0.0 self.gazebo_robot_angle = 0.0 self.laser_data = self.thread1.laser_data self.last_robot_pose_x = self.gazebo_robot_pose_x self.last_robot_pose_y = self.gazebo_robot_pose_y self.sample_fuc() # self.get_rid_of_bad_data() self.int_sample_fuc() # reset 函数需要提供初始状态 self.state = np.array(self.int_sample_laser_data[:]) # self.state = np.array([]) # 用来只有目标点作为状态时 self.state_speed_wrapper() self.state = np.append(self.state, [int(self.goal_x*10), int(self.goal_y*10), int(self.gazebo_robot_pose_x*10), int(self.gazebo_robot_pose_y*10), self.state_speed, self.state_angle]) self.sequence_state_wrap() self.last_distance_to_goal = self.distance_to_goal() # self.last_laser_data = self.sample_laser_data[:] self.s_state = self.s_state.reshape((64,)) # self.state = self.state.reshape((4,)) return self.s_state # 连续4个state合在一起 def sequence_state_wrap(self): if self.steps is 0: self.s_state = self.state for i in range(0, 3): self.s_state = np.append(self.s_state, self.state) else: for j in range(0, 3): for k in range(0, 16): self.s_state[j*16+k] = self.s_state[(j+1)*16+k] for l in range(0, 16): self.s_state[3 * 16 + l] = self.state[l] def step(self, action): self.steps += 1 assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action)) # robot 的加速度是有限制的,这里的选取一定要慎重,有两种方案,要么把变化数值都设成很小(不宜,难以快速响应环境) # 要么将publish的频率设限,20hz(+0.1情况下)以下 if action == 0: self.gazebo_robot_speed += 0.10 elif action == 1: self.gazebo_robot_speed -= 0.10 elif action == 2: self.gazebo_robot_angle = 0.20 elif action == 3: self.gazebo_robot_angle = -0.20 elif action == 4: self.gazebo_robot_angle = 0.0 elif action == 5: pass # 保持当前状态 else: gym.logger.warn("Unknown situation !") self.gazebo_robot_speed = np.clip(self.gazebo_robot_speed, self.min_speed, self.max_speed) self.gazebo_robot_angle = np.clip(self.gazebo_robot_angle, self.min_angle, self.max_angle) # 与许多其它os不同之处,p2os设置好cmd_vel后,robot会一直保持该速度, # self.cmd_twist.linear.x = 0 # gazebo中robot就算什么指令都没有,还是会有非常小的cmd_twist.linear.x # 与cmd_twist.angular.z,如果直接publish这些数据,只会造成误差越来越大 if abs(self.gazebo_robot_speed) < 0.025: self.gazebo_robot_speed = 0.0 if abs(self.gazebo_robot_angle) < 0.025: self.gazebo_robot_angle = 0.0 self.cmd_twist.linear.x = self.gazebo_robot_speed self.cmd_twist.angular.z = self.gazebo_robot_angle # attention! testing! # self.cmd_twist.linear.x = 0.5 # self.cmd_twist.angular.z = 0.0 time.sleep(0.08) # 0.075 self.act_pub.publish(self.cmd_twist) # 记录部分上一轮数据 self.laser_data = self.thread1.laser_data self.last_robot_pose_x = self.gazebo_robot_pose_x self.last_robot_pose_y = self.gazebo_robot_pose_y # 执行完动作,检查一下状态值(这里的严谨性有待考察,因为这里并不是严格的action一下,返回一个状态) self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle = \ self.get_gazebo_state() self.sample_fuc() # self.get_rid_of_bad_data() self.int_sample_fuc() self.state = np.array(self.int_sample_laser_data[:]) # self.state = np.array([]) self.state_speed_wrapper() self.state = np.append(self.state, [int(self.goal_x*10), int(self.goal_y*10), int(self.gazebo_robot_pose_x*10), int(self.gazebo_robot_pose_y*10), self.state_speed, self.state_angle]) # 应该把最后的位姿信息也作为限制, min(distance) 选择0.25,有待考察,因为将测距仪放在车头前部 if self.distance_to_goal() < self.distance_to_goal_threshold \ or min(self.sample_laser_data) < self.collision_threshold: # and (self.gazebo_robot_speed < 0.1): self.done = True if self.steps <= 5: # for debug print("robot: "+str(ac_num)+" : ") print(self.sample_laser_data) print("x: "+str(self.gazebo_robot_pose_x)+"y: "+str(self.gazebo_robot_pose_y)) self.sequence_state_wrap() self.s_state = self.s_state.reshape((64, )) # self.state = self.state.reshape((4, )) reward_num = self.reward() # reward值必须在更新last_distance_to_goal之前计算 self.last_distance_to_goal = self.distance_to_goal() # 更新距离,为下次计算reward准备 # self.last_laser_data = self.sample_laser_data[:] # 注意这里,坑.... copy!!! return self.s_state, reward_num, self.done, {} def distance_to_goal(self): return math.sqrt((self.goal_x - self.gazebo_robot_pose_x)**2 + (self.goal_y - self.gazebo_robot_pose_y)**2) def reward(self): # 应该把最后的位姿信息也作为reward的限制,与done保持一致,目标距离与障碍物距离之间应该有交叉reward? if self.distance_to_goal() < self.distance_to_goal_threshold: return self.arrive_reward elif min(self.sample_laser_data) < self.collision_threshold: # 有关障碍物距离部分,原始激光数据有噪声(或其它原因,不能直接做判定,甚至达到0.2 ???) return self.collision_reward # 谨慎选择有关距离的reward_fuc, 避免robot有可能出现的刷分行为(与arrive_reward的比例,正负等等) # 有关距离目标距离的部分,应该可以加快学习速度,但是比较难处理,超过目标点之后,跑远了就给它负的reward # 方案一 # else: # return self.road_reward_times * (self.last_distance_ro_goal-self.distance_to_goal()) # 方案二 else: if self.last_distance_to_goal - self.distance_to_goal() > 0: # 越来越近 return 1.0*self.road_reward_times else: # 现在这样原地不动,也是负reward, >=则无惩罚 return -1.0*self.road_reward_times # 可以加大远离的惩罚 # else: # return 0.0 def render(self, mode='human'): pass # 是否应该换成机器人本身的数据,毕竟真实机器人没有这些 # @staticmethod def get_gazebo_state(self): # 第一个参数model_name,第二个参数为relative_entity_name model_state = self.get_state_srv('robot'+str(ac_num), '') # 注意model_state 的类型为GetModelStateResponse robot_pose_x = round(model_state.pose.position.x, 1) # 注意pose为(x,y,z) , 离散, robot_pose_y = round(model_state.pose.position.y, 1) robot_twist_linear_x = round(model_state.twist.linear.x, 4) # 离散化 robot_twist_angular_z = round(model_state.twist.angular.z, 4) # angular.z代表平面机器人的角速度,因为此时z轴为旋转轴 # return self.cc_func(robot_pose_x), self.cc_func(robot_pose_y), robot_twist_linear_x, robot_twist_angular_z return robot_pose_x, robot_pose_y, robot_twist_linear_x, robot_twist_angular_z # 从原激光数据中抽取10组0.0型的激光数据,相当于每20°取一个; # 为了各robot可以互相探测,激光安装在车身前部,但更合理的做法应当是车身中央,所以要将测量到的laser数据进行一下变换0.15 def sample_fuc(self): self.sample_laser_data[0] = self.laser_data[0] self.sample_laser_data[-1] = self.laser_data[-1] for i in range(1, 9): self.sample_laser_data[i] = self.laser_data[i * 72] for j in range(0, 10): temp_angle = (math.pi / 9) * j # 弧度制 self.sample_laser_data[j] += math.sin(temp_angle) * 0.15 # 修正中心位置 # 离散化,保留1位小数,注意round()既非截断,也非四舍五入,也许2位更好? self.sample_laser_data[j] = round(self.sample_laser_data[j], 1) if self.sample_laser_data[j] > 10.0: self.sample_laser_data[j] = 10.0 def int_sample_fuc(self): for j in range(0, 10): self.int_sample_laser_data[j] = int(self.sample_laser_data[j]*10) # 变成整数 # def __del__(self): # # 等待至线程中止。这阻塞调用线程直至线程的join()方法被调用中止-正常退出或者抛出未处理的异常-或者是可选的超时发生。 # self.thread1.join() # 还是不对 # def close(self): pass
class ThreatDefenseEnv(BaseEnv): """ The class of the Defense environment. An OpenAI environment of the toy example given in Optimal Defense Policies for Partially Observable Spreading Processes on Bayesian Attack Graphs by Miehling, E., Rasouli, M., & Teneketzis, D. (2015). It constitutes a 29-state/observation, 4-action POMDP defense problem. """ metadata = {'render.modes': ['human', 'rgb_array']} def __init__(self): """POMDP environment.""" self.action_space = Discrete(len(Action)) self.state_space = Discrete(STATES.shape[0]) self.observation_space = Discrete(OBSERVATIONS.shape[0]) self.all_states = STATES self.last_obs = None self.viewer = None def step(self, action): """ Progress the simulation one step. Arguments: action -- an Action containing a numeric value [0, 3]. Returns: A tuple (o, r, d, i), where o -- the observation as a binary vector. r -- the reward gained as an integer. d -- boolean indicating if the simulation is done or not. i -- simulation meta-data as a dictionary. """ assert self.action_space.contains(action) assert self.done is not True if not type(action) is Action: action = Action(action) self.t += 1 self.last_action = action reward = REWARDS[action.value][self.state.index] self.done = self._is_terminal(action, self.t) old_state = self.state self.state = self._next_state(action, old_state) self.info = self._update_info(action, old_state, self.state) return self._sample_observation(self.state).as_list(), \ reward, self.done, self.info def reset(self): """ Reset the simulation. Returns: The beginner observation as a binary vector. """ self.done = False self.state = State(0) self.last_action = Action.NONE self.t = 0 self.info = {} return self.state.as_list() def _next_state(self, action, state): """ Generate the next state from an action. Arguments: action -- an Action containing numeric value [0, 3]. state -- a State containing a binary vector of length 12. Returns: The next State containing a binary vector of length 12. """ probs = TRANSITIONS[action.value][state.index] return State(np.random.choice(probs.shape[0], p=probs)) def _is_terminal(self, action, time_step): """ Check if the current action terminates the simulation. Arguments: action -- an Action containing the numeric value [0, 3]. time_step -- the current time_step as a numeric value 0 <= time_step < max_time_step. Returns: A boolean depending on if the simulation is over or not. """ return self.last_action.value is Action.BOTH.value or \ time_step >= 100 def _sample_observation(self, state): """ Generate a new observation from the current state and action taken. Arguments: action -- an Action containing the numeric value [0, 3]. state -- a State containing a binary vector of length 12. Returns: An Observation containing a binary vector of length 12. """ probs = OBSERVATIONS[state.index] self.last_obs = Observation(np.random.choice(probs.shape[0], p=probs)) return self.last_obs def _update_info(self, action, old_state, new_state): """ Update the info meta-data Dict. Update the info with respect to the current and past state as well as the action taken. Arguments: action -- an Action containing the numeric value [0, 3]. old_state -- a State containing a binary vector of length 12, representing the state at the past time step. new_state -- a State containing a binary vector of length 12, representing the state at the current time step. Returns: A dictionary containing the updated info with respect to the current and past state as well as the action taken. """ return { 'last_transition': TRANSITIONS[action.value][old_state.index], 'last_observation': OBSERVATIONS[new_state.index], # noqa 'state': new_state.as_list() }
class Kvazaar_v0(gym.Env): metadata = {"render.modes": ["human"]} def set_video_selection_mode(self): if os.path.isdir(self.vids_path): self.vids_list = os.listdir( self.vids_path) #list all vids in vids_path if self.mode == "rotating": self.vids_list = sorted(self.vids_list) #sort videos elif self.mode is None: self.mode = "random" elif os.path.isfile(self.vids_path) and self.mode is None: self.mode = "file" self.vids_list = os.listdir(self.vids_path) self.vid_selected['name'] = self.vids_list[0] self.vid_selected['dir_pos'] = 0 def random_video_selection(self): randomInt = self.np_random.randint(0, len(self.vids_list)) newVideo = self.vids_list[randomInt] self.vid_selected['name'] = newVideo self.vid_selected['pos'] = randomInt def rotating_video_selection(self): self.vid_selected['pos'] = (self.vid_selected['pos'] + 1) % len( self.vids_list) new_video = self.vids_list[self.vid_selected['pos']] self.vid_selected['name'] = new_video def log_metrics(self): message = ("\n ------------- \n" "Total steps: {}" + "\n ------------- \n" + "VIDEO USAGE \n").format(self.steps) self.video_usage = { key: round((value / self.steps * 100), 0) for key, value in self.video_usage.items() } for video, usage in self.video_usage.items(): message += video + " = " + str(usage) + "%\n" if self.logger: self.logger.info(message) def __init__(self, **kwargs): # Recogemos los argumentos: # kvazaar_path: ruta donde está instalado kvazaar # vids_path: ruta de los vídeos que utilizará el entorno # cores: lista de cores para kvazaar self.kvazaar_path = kwargs.get("kvazaar_path") self.vids_path = kwargs.get("vids_path") self.cores = kwargs.get( "cores") #Lista de los cores que utiliza el entorno self.mode = kwargs.get("mode") #Modo de seleccion de videos self.logger = kwargs.get("logger") self.max_steps = kwargs.get("num_steps") self.vids_list = None #for random or rotating mode self.vid_selected = {'name': None, 'pos': 0} self.set_video_selection_mode() self.action_space = Discrete(len(self.cores)) self.observation_space = Discrete(11) self.kvazaar = None ##object for kvazaar subprocess #metrics self.video_usage = {video: 0 for video in self.vids_list} self.steps = 0 def reset(self): self.seed() #Generate seed for random numbers self.reset_kvazaar() self.state = np.int64(1) self.reward = 0 #Starter reward is 0 self.done = False self.info = {"state": "running", "fps": 0} return self.state def reset_kvazaar(self): #self.mode == file is already managed if self.mode == "random": self.random_video_selection() if self.mode == "rotating": self.rotating_video_selection() #log new video if self.logger: self.logger.info(self.vid_selected['name'] + " , pos " + str(self.vid_selected['pos'])) command = [ self.kvazaar_path, "--input", self.vids_path + self.vid_selected['name'], "--output", "/dev/null", "--preset=ultrafast", "--qp=22", "--owf=0", "--threads=" + str(len(self.cores)) ] #apply taskset using the range of cores setted as argument command = ["taskset", "-c", ",".join([str(x) for x in self.cores])] + command # kvazaar process generation self.kvazaar = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True, bufsize=1, env={'NUM_FRAMES_PER_BATCH': '24'}) #make sure kvazaar process is up while not self.kvazaar: time.sleep(1) def step(self, action): assert self.action_space.contains(action) action += 1 #Interaction with Kvazaar process passing the new action, i.e. the new numbers of cpus used for the next video block. s = "nThs:" + str(action) self.kvazaar.stdin.write(s + "\n") output = self.kvazaar.stdout.readline().strip() ######## ##update metrics self.steps += 1 self.video_usage[self.vid_selected['name']] += 1 #log this num step and the video used if self.logger: self.logger.info("Step {} : {} , pos {}".format( str(self.steps), self.vid_selected['name'], str(self.vid_selected['pos']))) self.calculate_state(output=output) self.info["state"] = output.strip() if self.steps == self.max_steps: #reached end of training self.kvazaar.kill() self.log_metrics() try: assert self.observation_space.contains( self.state) #check if new state is valid except AssertionError: print("INVALID STATE", self.state) return [self.state, self.calculate_reward(), self.done, self.info] def calculate_reward(self): if self.info["state"] == 'END': self.reward = 0 else: map_rewards = { 1: 0, ##[0,10) 2: 1, ##[10,16) 3: 2, ##[16,24) 4: 3, ##[20,24) 5: 6, ##[24,27) 6: 8, ##[27,30) 7: 10, ##[30,35) 8: 7, # [35,40) 9: 4 # [40,inf) } self.reward = map_rewards.get(self.state) return self.reward def calculate_state(self, output): if (output == "END"): self.done = True self.info["state"] = 'END' else: ##Erase "FPS:" from output and save it in the new state. output_value = np.float32(output[4:]) self.info["fps"] = '{:.2f}'.format(output_value) if output_value < 10: self.state = np.int64(1) elif output_value < 16: self.state = np.int(2) elif output_value < 20: self.state = np.int(3) elif output_value < 24: self.state = np.int(4) elif output_value < 27: self.state = np.int64(5) elif output_value < 30: self.state = np.int64(6) elif output_value < 35: self.state = np.int64(7) elif output_value < 40: self.state = np.int64(8) else: self.state = np.int64(9) def render(self, mode="human"): if self.info["state"] == 'END': print(self.info["state"]) else: l = '{:>7} fps:{:>1} reward:{:<10}'.format( self.state, self.info["fps"], self.reward) print(l) def seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] def close(self): pass
class CacheEnv(gym.Env): """ Cache description. * STATE * The state is represented as a vector: [request object size, cache remaining size, time of last request to the same object] * ACTIONS * TODO: should be fixed here, there should be both Whether the cache accept the incoming request, represented as an integer in [0, 1]. * REWARD * (BHR) Cost of previous step (object size) * hit * REFERENCE * """ def __init__(self, seed=42): self.seed(seed) self.cache_size = config.cache_size # load trace, attach initial online feature values self.src = TraceSrc(trace=config.cache_trace, cache_size=self.cache_size) # set up the state and action space #TODO: Modify the action space size to C+1 self.action_space = Discrete(2) self.observation_space = Box(self.src.min_values, \ self.src.max_values, \ dtype=np.float32) # cache simulator self.sim = CacheSim(cache_size=self.cache_size, \ policy='lru', \ action_space=self.action_space, \ state_space=self.observation_space) # reset environment (generate new jobs) #self.reset(1, 2) def reset(self, low=1, high=1001): print('env.reset called') print('caller name:', inspect.stack()[1][3]) new_trace = self.np_random.randint(low, high) print(new_trace) self.src.reset(new_trace) self.sim.reset() if config.cache_trace == 'test': print("New Env Start", new_trace) elif config.cache_trace == 'real': print("New Env Start Real") #print(self.sim.get_state()) return self.sim.get_state() def seed(self, seed): self.np_random = seeding.np_random(seed) def step(self, action): # 0 <= action < num_servers global accept assert self.action_space.contains(action) state, done = self.src.step() reward, info = self.sim.step(action, state) obj, done = self.src.next() while self.sim.next_hit(obj): state, done = self.src.step() hit_reward, info = self.sim.step(accept, state) reward += hit_reward if done is True: break obj, done = self.src.next() obs = self.sim.get_state(obj) info = {} return obs, reward, done, info def render(self, mode='human', close=False): pass
class MarketEnvironment(gym.Env): def __init__(self, env_config): #save the variables self.env_config = env_config self.db = DataBase(env_config["database_path"]) self.hp = env_config["hyperparameters"] self.episode_length = env_config["episode_length"] self.trading_fee_dez = env_config["trading_fee"] / 100 #specify action and observation spaces self.action_space = Discrete(3) self.observation_space = Box(low=-np.finfo(np.float32).max, high=np.finfo(np.float32).max, shape=(self.hp.window_size, len(self.hp.features))) #handling of the scaler if env_config["preloaded_scaler"] is None: self.scaler = None warnings.warn("You did not specify a preloaded scaler!") elif isinstance(env_config["preloaded_scaler"], BaseEstimator): #save preloaded scaler instance self.scaler = env_config["preloaded_scaler"] elif type(env_config["preloaded_scaler"]) == str: #load in the scaler self.scaler = joblib.load(filename=env_config["preloaded_scaler"]) else: raise Exception( "You have to either specify a scaler_location, scaler_instance or None for config_parameter: scaler" ) #prepare the data self.data, self.close_data = self._prepare_data() #local episode variables self.current_step = 0 self.start_index = 0 self.done = True self.mode = "buy" self.specific_profit = 0 def step(self, action): #check that environment is ready so get stepped in if self.done: raise Exception( "You have to reset the environment before you can step through it" ) #check that action is in actionspace if not self.action_space.contains(action): raise Exception(f"Your chosen action: {action} is not valid!") """ Caculate the specific_profit (reward) """ #calculate the index index = self.start_index + self.current_step + self.hp.window_size - 1 #get the price price = self.close_data.iloc[index, 1] #set reward to 0 reward = 0 #buy assets if self.mode == "buy" and action == 1: #save the buyprice self.buy_price = price #set mode to buy self.mode = "sell" #sell assets elif self.mode == "sell" and action == 2: #calculate profit local_specific_profit = price / self.buy_price * ( 1 - self.trading_fee_dez)**2 - 1 #update episode profit self.specific_profit += local_specific_profit #set mode to buy self.mode = "buy" #set reward reward = local_specific_profit """ Get new Observation """ #update the current_step self.current_step += 1 #if we are not at the end of this episode yet: if self.current_step < self.episode_length: #get the new observation observation = self.data.iloc[self.start_index + self.current_step:self.start_index + self.current_step + self.hp.window_size].to_numpy() #if we reached the end of this episode: else: #update the doneflag self.done = True observation = self.data.iloc[self.start_index + self.current_step - 1:self.start_index + self.current_step - 1 + self.hp.window_size].to_numpy() return observation, reward, self.done, {} def reset(self): #reset the local run variables self.current_step = 0 self.done = False self.mode = "buy" self.specific_profit = 0 #get random startingpoint self.start_index = random.randrange( 0, self.data.shape[0] - self.episode_length - self.hp.window_size + 1, 1) #get the start_state start_state = self.data.iloc[self.start_index + self.current_step:self.start_index + self.current_step + self.hp.window_size].to_numpy() return start_state def _prepare_data(self): #select the features data = self.db[self.hp.candlestick_interval, self.hp.features] #data operations that can be made on the whole dataset data, scaler = TrainDataBase._raw_data_prep( data=data, derive=self.hp.derivation, scaling_method=self.hp.scaling, scaler_type=self.hp.scaler_type, preloaded_scaler=self.scaler) #reset the index data.reset_index(inplace=True, drop=True) #create the close_data close_data = self.db[self.hp.candlestick_interval, ["close_time", "close"]] #remove first row close_data = close_data.iloc[1:, :] #reset index close_data.reset_index(inplace=True, drop=True) return data, close_data
class PocEnv(Env): def __init__(self, maze): self.board = select_maze(maze) self.grid = PocGrid(board=self.board["_maze"]) self._get_init_state() self.action_space = Discrete(4) self.observation_space = Discrete(1 << 10) # 1024 # self.observation_space = Discrete(14) self._reward_range = 100 self._discount = .95 def seed(self, seed=None): np.random.seed(seed) def is_power(self, idx): return self.board['_maze'][idx] == 3 def is_passable(self, idx): return self.board['_maze'][idx] != 0 def _is_valid(self): assert self.grid.is_inside(self.state.agent_pos) assert self.is_passable(self.state.agent_pos) for ghost in self.state.ghosts: assert self.grid.is_inside(ghost.pos) assert self.is_passable(ghost.pos) def _set_state(self, state): self.done = False self.state = state def _generate_legal(self): actions = [] for action in Action: if self.grid.is_inside(self.state.agent_pos + Moves.get_coord(action.value)): actions.append(action.value) return actions def step(self, action): assert self.action_space.contains(action) assert self.done is False reward = -1 next_pos = self._next_pos(self.state.agent_pos, action) if next_pos.is_valid(): self.state.agent_pos = next_pos else: reward += -25 if self.state.power_step > 0: self.state.power_step -= 1 hit_ghost = -1 for g, ghost in enumerate(self.state.ghosts): if ghost.pos == self.state.agent_pos: hit_ghost = g # move ghost self._move_ghost(g, ghost_range=self.board["_ghost_range"]) if hit_ghost >= 0: if self.state.power_step > 0: reward += 25 self.state.ghosts[hit_ghost].reset() else: reward += -100 self.done = True ob = self._make_ob(action) if self.state.food_pos[self.grid.get_index(self.state.agent_pos)]: if sum(self.state.food_pos) == 0: reward += 1000 self.done = True if self.is_power(self.state.agent_pos): self.state.power_step = config["_power_steps"] reward += 10 return ob, reward, self.done, {"state": self.state} def _make_ob(self, action): # TODO fix me ob = 0 for d in range(self.action_space.n): if self._see_ghost(action) > 0: ob = set_flags(ob, d) next_pos = self._next_pos(self.state.agent_pos, direction=d) if next_pos.is_valid() and self.is_passable(next_pos): ob = set_flags(ob, d + self.action_space.n) if self._smell_food(): ob = set_flags(ob, 8) if self._hear_ghost(self.state): ob = set_flags(ob, 9) return ob def _encode_state(self, state): poc_idx = self.grid.get_index(state.agent_pos) ghosts = [(self.grid.get_index(ghost.pos), ghost.direction) for ghost in state.ghosts] return np.concatenate([[poc_idx], *ghosts, state.food_pos, [state.power_step]]) def _decode_state(self, state): poc_state = PocState(Coord(*self.grid.get_coord(state[0]))) ghosts = np.split(state[1:self.board["_num_ghosts"] * 3], 1) for g in ghosts: poc_state.ghosts.append( Ghost(pos=self.grid.get_coord(g[0]), direction=g[1])) poc_state.power_step = state[-1] poc_state.food_pos = state[self.board["_num_ghosts"] * 3:-1].tolist() return poc_state def _compute_prob(self, action, next_state, ob): return int(ob == self._make_ob(action)) def _see_ghost(self, action): eye_pos = self.state.agent_pos + Moves.get_coord(action) while True: for g, ghost in enumerate(self.state.ghosts): if ghost.pos == eye_pos: return g eye_pos += Moves.get_coord(action) if not self.grid.is_inside(eye_pos) or not self.is_passable( eye_pos): break return -1 def _smell_food(self, smell_range=1): for x in range(-smell_range, smell_range + 1): for y in range(-smell_range, smell_range + 1): smell_pos = Coord(x, y) idx = self.grid.get_index(self.state.agent_pos + smell_pos) if self.grid.is_inside(self.state.agent_pos + smell_pos) and self.state.food_pos[idx]: return True return False @staticmethod def _hear_ghost(poc_state, hear_range=2): for ghost in poc_state.ghosts: if Grid.manhattan_distance(ghost.pos, poc_state.agent_pos) <= hear_range: return True return False def render(self, mode='human', close=False): pass def reset(self): self.t = 0 self.done = False self._get_init_state() return 0 def close(self): pass def _get_init_state(self): # create walls # for tile in self.grid: # value = config["maze"][tile.key[0]] # self.grid.set_value(value, coord=tile.key) self.state = PocState() self.state.agent_pos = Coord(*self.board["_poc_home"]) ghost_home = Coord(*self.board["_ghost_home"]) for g in range(self.board["_num_ghosts"]): pos = Coord(ghost_home.x + g % 2, ghost_home.y + g // 2) self.state.ghosts.append(Ghost(pos, direction=-1)) self.state.food_pos = np.random.binomial(1, config["_food_prob"], size=self.grid.n_tiles + 1) self.state.power_step = 0 return self.state def _next_pos(self, pos, direction): direction = Moves.get_coord(direction) if pos.x == 0 and pos.y == self.board[ '_passage_y'] and direction == Moves.EAST: next_pos = Coord(self.grid.x_size - 1, pos.y) elif pos.x == self.grid.x_size - 1 and pos.y == self.board[ '_passage_y'] and direction == Moves.WEST: next_pos = Coord(0, pos.y) else: next_pos = pos + direction if self.grid.is_inside(next_pos) and self.is_passable(next_pos): return next_pos else: return Coord(-1, -1) def _move_ghost(self, g, ghost_range): if Grid.manhattan_distance(self.state.agent_pos, self.state.ghosts[g].pos) < ghost_range: if self.state.power_step > 0: self._move_defensive(g) else: self._move_aggressive(g) else: self._move_random(g) def _move_aggressive(self, g, chase_prob=.75): if not np.random.binomial(1, p=chase_prob): return self._move_random(g) best_dist = self.grid.x_size + self.grid.y_size best_pos = self.state.ghosts[g].pos best_dir = -1 for d in range(self.action_space.n): dist = Grid.directional_distance(self.state.agent_pos, self.state.ghosts[g].pos, d) new_pos = self._next_pos(self.state.ghosts[g].pos, d) if dist <= best_dist and new_pos.is_valid() and can_move( self.state.ghosts[g], d): best_pos = new_pos best_dist = dist best_dir = d self.state.ghosts[g].update(best_pos, best_dir) def _move_defensive(self, g, defensive_prob=.5): if np.random.binomial( 1, defensive_prob) and self.state.ghosts[g].direction >= 0: self.state.ghosts[g].direction = -1 best_dist = self.grid.x_size + self.grid.y_size best_pos = self.state.ghosts[g].pos best_dir = -1 for d in range(self.action_space.n): dist = Grid.directional_distance(self.state.agent_pos, self.state.ghosts[g].pos, d) new_pos = self._next_pos(self.state.ghosts[g].pos, d) if dist >= best_dist and new_pos.is_valid() and can_move( self.state.ghosts[g], d): best_pos = new_pos best_dist = dist best_dir = d self.state.ghosts[g].update(best_pos, best_dir) def _move_random(self, g): # there are no dead ends # never switch to opposite direction ghost_pos = self.state.ghosts[g].pos while True: d = self.action_space.sample() next_pos = self._next_pos(ghost_pos, d) if next_pos.is_valid() and can_move(self.state.ghosts[g], d): break self.state.ghosts[g].update(next_pos, d)
class FourthRobGpw2(gym.Env): def __init__(self): rospy.init_node("rl_robot_node" + str(ac_num)) self.pub, self.reset_robot_state = reset_node_init() self.act_pub = rospy.Publisher('/robot' + str(ac_num) + '/pioneer/cmd_vel', Twist, queue_size=1) self.sub_name = '/robot' + str(ac_num) + '/laser/scan' self.laser_sub = rospy.Subscriber(self.sub_name, LaserScan, self.laser_callback, queue_size=1) # self.cc_func = correct_coordinate_func() # 数据已经离散化(不需要归一化吧?),进一步转换成整数, 线速度为第15个状态 # self.observation_space = gym.spaces.Box(-10.0, 10.0, (15, ), dtype=np.dtype(np.float32)) # if enable_correction: # self.observation_space = gym.spaces.Box(0, 100, (14,), dtype=np.dtype(np.int32)) # 整数版 # else: # self.observation_space = gym.spaces.Box(-100, 100, (14, ), dtype=np.dtype(np.int32)) # 整数版 # self.observation_space = gym.spaces.Box(0, 1, (14,), dtype=np.dtype(np.float32)) # normalization # self.observation_space = gym.spaces.Box(-10.0, 10.0, (4, ), dtype=np.dtype(np.float32)) self.observation_space = gym.spaces.Box(0, 100, (12, ), dtype=np.dtype(np.int32)) self.action_space = Discrete(5) # 取值0,1,2,3,4,5 , 动作是否应该加上组合? self.state = None # self.laser_data = [0 for i in range(0, 10)] self.num_laser = 10 # 别乱改,有些相关数据没用这个参数 self.laser_data = None self.sample_laser_data = [0 for _ in range(0, self.num_laser) ] # 特定抽取的data, 位置从右向左 self.int_sample_laser_data = [0 for _ in range(0, self.num_laser) ] # 特定抽取的data, 位置从右向左,整数版 # self.state_sample_laser_data = [0 for x in range(0, self.num_laser)] # 特定抽取的data, 位置从右向左,归一化 # 用于修正激光数据 self.last_laser_data = [0 for x in range(0, self.num_laser)] # self.current_sate = None # self.speed = 0.0 # 运行一段时间之后,记录的speed、angle就会与实际(gazebo)有较大差距 self.state_speed = 0.0 # 作为状态用的speed 离散整数化 self.state_angle = 0.0 # angular speed self.min_speed = 0.0 # -0.75 # 线速度 m/s ;角速度 rad/s(6弧度相当于一个整圆),要想倒车必须在车身后部安装测距手段 self.max_speed = 0.5 # 速度不能高于0.75,已经开始出现误差 0.5m/s因为环境障碍物比较密集 # self.angle = 0.0 self.min_angle = -0.5 self.max_angle = 0.5 self.cmd_twist = Twist() self.done = False # self.thread1 = LaserNodeThread() # self.thread1.setDaemon(True) # self.thread1.start() # 获取状态服务 rospy.wait_for_service('gazebo/get_model_state') self.get_state_srv = rospy.ServiceProxy('/gazebo/get_model_state', GetModelState) # 重置位置服务 rospy.wait_for_service('gazebo/set_model_state') self.set_obstacle_srv = rospy.ServiceProxy('/gazebo/set_model_state', SetModelState) # 重置simulation rospy.wait_for_service('/gazebo/reset_simulation') self.reset_sim_srv = rospy.ServiceProxy('/gazebo/reset_simulation', Empty) # self.obstacle_pose = [[0 for x in range(0, 2)] for x in range(0, 20)] # 20个障碍物的位置[x, y] # gazebo返回的robot数据 self.gazebo_robot_pose_x = 0.0 # 可以考虑为其注入噪声,这里在初始化时为0,导致下面一开始角度计算有一个错误 self.gazebo_robot_pose_y = 0.0 self.gazebo_robot_speed = 0.0 self.gazebo_robot_angle = 0.0 self.robot_orientation_z = 0 self.robot_orientation_w = 0 self.true_state_distance = 0.0 # self.state_distance 经过处理,不再是真实距离,用于检查是否到达目标 self.state_distance = 0.0 # 用于作为状态输入,对比每步的reward self.state_angle = 0.0 self.state_polar_angle = 0.0 # 这个是robot相对x轴朝向 self.last_robot_pose_x = 0.0 self.last_robot_pose_y = 0.0 # reward 相关 self.arrive_reward = 100.0 self.collision_reward = -100.0 self.road_reward_times = 1 # 0.1 self.time_cost_reward = -0.1 self.last_distance_to_goal = 0.0 # 上一次动作时距离目标的距离 self.distance_to_goal_threshold = 0.10 # 0.20 self.collision_threshold = 0.35 # 至少比车身宽 0.5 , 0.43, 0.35是比较好的数值,再小就不行了 # self.action_0 = False # self.distance_r_sate = 0 # 设置最终robot位姿 self.goal_x = 9.0 # 8 self.goal_y = 6.0 # 8 # self.desire_final_speed = None # self.desire_final_angle = None self.eps = 0 # 记录当前迭代轮数 # self.get_obstacle_pose() self.seed() self.steps = 0 # for debug self.total_steps = 0 self.trace_need = 1 # self.reset() # 先初始化一下 , 在dqn中会先调用一下的 def seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] def laser_callback(self, msg): self.laser_data = msg.ranges # 使用随机目标应该可以加快学习速度,和泛化能力 # def random_goal(self): # # print("random_goal") # # RandomState(ac_num) # # 搞不懂random被openAI怎么动了,运行结果诡异,使用baselines中的uniform分布 # self.goal_x = self.np_random.uniform(low=-9.0, high=9.0) # self.goal_y = self.np_random.uniform(low=-9.0, high=9.0) # # self.goal_x = (18 * (np.random.ranf() - 0.5)) # x~(-9,9) # # self.goal_y = (18 * (np.random.ranf() - 0.5)) # y~(-9,9) # # print(str(self.goal_x) +" " +str(self.goal_y)) # # 注意这里的设计, 每当goal 不满足条件时,就需要重新random一下,然后与出生点、“所有”障碍物点重新对比一下 # i = -1 # while i < 19: # i += 1 # while self.compute_distance(self.goal_x, self.reset_robot_state.pose.position.x, self.goal_y, # self.reset_robot_state.pose.position.y) < 1.0 \ # or (self.compute_distance(self.goal_x, self.obstacle_pose[i][0], # self.goal_y, self.obstacle_pose[i][1])) < 1.2: # # self.goal_x = (18 * (np.random.ranf() - 0.5)) # 距离初始位置或任意障碍物过近 # # self.goal_y = (18 * (np.random.ranf() - 0.5)) # self.goal_x = self.np_random.uniform(low=-9.0, high=9.0) # self.goal_y = self.np_random.uniform(low=-9.0, high=9.0) # i = -1 # # self.goal_x = round(self.goal_x, 1) # 离散化 # self.goal_y = round(self.goal_y, 1) # # print("end random_goal") # fixed version def reset_goal(self): goal_num = self.np_random.randint(0, 3) # self.goal_x = self.cc_func(round(goal_sequence[ac_num][goal_num][0], 1)) # self.goal_y = self.cc_func(round(goal_sequence[ac_num][goal_num][1], 1)) self.goal_x = round(goal_sequence[ac_num][goal_num][0], 1) self.goal_y = round(goal_sequence[ac_num][goal_num][1], 1) def initial_pose(self): pose_num = self.np_random.randint(0, 4) # self.reset_robot_state.pose.position.x = self.cc_func(round(initial_sequence[ac_num][goal_num][0], 1)) # self.reset_robot_state.pose.position.y = self.cc_func(round(initial_sequence[ac_num][goal_num][1], 1)) self.reset_robot_state.pose.position.x = round( initial_sequence[ac_num][pose_num][0], 1) self.reset_robot_state.pose.position.y = round( initial_sequence[ac_num][pose_num][1], 1) # print('init_actor' + str(ac_num) + ' --> x: ' + str(self.reset_robot_state.pose.position.x) # + ' y: ' + str(self.reset_robot_state.pose.position.y)) @staticmethod def compute_distance(lhx, lhy, rhx, rhy): return math.sqrt((lhx - rhx)**2 + (lhy - rhy)**2) # goal 在robot当前位置为基准的角度 def cpose_goal_angle(self): return math.atan2(self.goal_y - self.gazebo_robot_pose_y, self.goal_x - self.gazebo_robot_pose_x) # (-pi,pi] # # 使用相邻两次移动计算robot朝向,移动距离很小的话(特别有噪声的情况下,怎么可能计算的准确?需要借助IMU?) # def compute_polar_coordinates(self): # edge_a = self.compute_distance(self.last_robot_pose_x, self.last_robot_pose_y, # self.gazebo_robot_pose_x, self.gazebo_robot_pose_y) # edge_b = self.compute_distance(self.last_robot_pose_x, self.last_robot_pose_y, self.goal_x, self.goal_y) # edge_c = self.compute_distance(self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.goal_x, self.goal_y) # cos_angle_beta = (edge_a**2 + edge_c**2 - edge_b**2)/(2*edge_a*edge_c) # self.state_distance = edge_c # 数值需进行处理 # self.state_angle = math.pi - math.acos(cos_angle_beta) # 数值需进行处理 # def distance_to_obstacle(self, num_x, num_y, count): # return math.sqrt((self.obstacle_pose[count][0] - num_x) ** 2 + (self.obstacle_pose[count][1] - num_y) ** 2) # 如果使用normalization_state_input(),则不要使用这个 def state_speed_wrapper(self): # print(self.gazebo_robot_speed) # print(self.gazebo_robot_angle) self.state_speed = round(self.gazebo_robot_speed, 4) * 200 self.state_speed = int(self.state_speed) # (0~100) self.state_speed = np.clip(self.state_speed, 0, 100) self.state_angle = round(self.gazebo_robot_angle, 4) * 100 self.state_angle = int(self.state_angle) + 50 # (0~100) self.state_angle = np.clip(self.state_angle, 0, 100) def reset(self): self.total_steps += self.steps self.steps = 0 # for debug self.eps += 1 # print('call_reset') self.done = False # 每10轮random一次 if self.eps % 10 == 0 or self.eps == 1: self.reset_goal() # self.initial_pose() # robot 的初始朝向也随机化 self.reset_robot_state.pose.orientation.z = self.np_random.uniform( low=-0.9, high=0.9) self.reset_robot_state.pose.orientation.w = self.np_random.uniform( low=-0.9, high=0.9) print('actor' + str(ac_num) + ' --> x: ' + str(self.goal_x) + ' y: ' + str(self.goal_y)) # 重置robot的 speed angle 注意各步骤执行顺序 self.cmd_twist.linear.x = 0.0 self.cmd_twist.angular.z = 0.0 # 将robot重置回原位置 start_time = time.time() while (time.time() - start_time) < 0.25: # 这里不使用srv直接重置, self.act_pub.publish(self.cmd_twist) # 先重置速度 self.pub.publish(self.reset_robot_state) time.sleep(0.05) # time.sleep(0.5) # 重置后需要一定时间等待激光等数据更新... # 重置state中的pose speed angle # 每过大约10万步reset simulation if self.total_steps >= (self.trace_need * 10000): # self.reset_sim_srv() self.trace_need += 1 self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle,\ self.state_polar_angle = self.get_gazebo_state() self.gazebo_robot_speed = 0.0 self.gazebo_robot_angle = 0.0 # self.laser_data = self.thread1.laser_data self.laser_sub = rospy.Subscriber(self.sub_name, LaserScan, self.laser_callback, queue_size=1) self.last_robot_pose_x = self.gazebo_robot_pose_x self.last_robot_pose_y = self.gazebo_robot_pose_y self.sample_fuc() self.get_rid_of_bad_data() self.int_sample_fuc() # reset 函数需要提供初始状态 # self.state = np.array([]) # 用来只有目标点作为状态时 self.state_speed_wrapper() self.true_state_distance = self.distance_to_goal() if self.true_state_distance >= 10: self.true_state_distance = 10 # self.normalization_state_input() # self.discretization_state_input() # print(self.sample_laser_data) # print(self.rew_sample_laser_data) self.state = np.array(self.int_sample_laser_data[:]) # self.state = np.array(self.state_sample_laser_data[:]) # self.state = np.append(self.state, [self.state_distance, self.state_polar_angle, # self.state_speed, self.state_angle]) # self.state = np.append(self.state, [int(self.true_state_distance*6), int(self.state_polar_angle), # self.state_speed, self.state_angle]) self.state_distance = int(round(self.true_state_distance * 6)) self.state = np.append( self.state, [self.state_distance, int(self.state_polar_angle)]) self.last_distance_to_goal = self.state_distance self.last_laser_data = self.sample_laser_data[:] # self.state = self.state.reshape((14,)) self.state = self.state.reshape((12, )) return self.state def step(self, action): self.steps += 1 assert self.action_space.contains( action), "%r (%s) invalid" % (action, type(action)) # robot 的加速度是有限制的,这里的选取一定要慎重,有两种方案,要么把变化数值都设成很小(不宜,难以快速响应环境) # 要么将publish的频率设限,20hz(+0.1情况下)以下 # print("action:"+str(action)) # print("gazebo: speed" + str(self.gazebo_robot_speed)) # action = 0 # testing # if action == 0: # self.gazebo_robot_speed += 0.05 # elif action == 1: # self.gazebo_robot_speed -= 0.05 # # elif action == 2: # # self.gazebo_robot_speed = 0.0 # elif action == 2: # self.gazebo_robot_angle += 0.05 # elif action == 3: # self.gazebo_robot_angle -= 0.05 # # elif action == 5: # # self.gazebo_robot_angle = 0.0 # elif action == 4: # pass # 保持当前状态 # else: # gym.logger.warn("Unknown situation !") if action == 0: self.gazebo_robot_speed = 0.10 # 速度不能突然设置为特别大的数,否则会失控,不能超过0.1了 # self.action_0 = True elif action == 1: self.gazebo_robot_speed = 0.0 # self.action_0 = False # elif action == 2: # self.gazebo_robot_speed = 0.0 elif action == 2: self.gazebo_robot_angle = +0.10 # self.action_0 = False elif action == 3: self.gazebo_robot_angle = -0.10 # self.action_0 = False # elif action == 5: # self.gazebo_robot_angle = 0.0 elif action == 4: self.gazebo_robot_angle = 0 # self.action_0 = False else: gym.logger.warn("Unknown situation !") self.gazebo_robot_speed = np.clip(self.gazebo_robot_speed, self.min_speed, self.max_speed) self.gazebo_robot_angle = np.clip(self.gazebo_robot_angle, self.min_angle, self.max_angle) # 与许多其它os不同之处,p2os设置好cmd_vel后,robot会一直保持该速度, # self.cmd_twist.linear.x = 0 # gazebo中robot就算什么指令都没有,还是会有非常小的cmd_twist.linear.x与cmd_twist.angular.z,如果直接publish这些数据,只会造成误差越来越大 if abs(self.gazebo_robot_speed) < 0.025: self.gazebo_robot_speed = 0.0 if abs(self.gazebo_robot_angle) < 0.025: self.gazebo_robot_angle = 0.0 self.cmd_twist.linear.x = self.gazebo_robot_speed self.cmd_twist.angular.z = self.gazebo_robot_angle # attention! testing! # self.cmd_twist.linear.x = 0.5 # self.cmd_twist.angular.z = 0.0 # print("gazebo: speed" + str(self.gazebo_robot_speed) + ", angle: " + str(self.gazebo_robot_angle)) # 应该新建一个服务,将最新的动作信息同步给该服务,在该服务中负责发布cmd_twist,或者使用线程 time.sleep(0.085) # 0.075 self.act_pub.publish(self.cmd_twist) # 记录部分上一轮数据 # self.laser_data = self.thread1.laser_data self.laser_sub = rospy.Subscriber(self.sub_name, LaserScan, self.laser_callback, queue_size=1) self.last_robot_pose_x = self.gazebo_robot_pose_x self.last_robot_pose_y = self.gazebo_robot_pose_y # 执行完动作,检查一下状态值(这里的严谨性有待考察,因为这里并不是严格的action一下,返回一个状态) self.gazebo_robot_pose_x, self.gazebo_robot_pose_y, self.gazebo_robot_speed, self.gazebo_robot_angle,\ self.state_polar_angle = self.get_gazebo_state() self.sample_fuc() self.get_rid_of_bad_data() self.true_state_distance = self.distance_to_goal() if self.true_state_distance >= 10: self.true_state_distance = 10 # print(self.state_speed) # print(self.state_angle) # self.normalization_state_input() # self.discretization_state_input() self.int_sample_fuc() # print(self.sample_laser_data) # print(self.rew_sample_laser_data) self.state = np.array(self.int_sample_laser_data[:]) # self.state = np.array(self.state_sample_laser_data[:]) # self.state = np.array([]) self.state_speed_wrapper() self.state_distance = int(round(self.true_state_distance * 6)) self.state = np.append( self.state, [self.state_distance, int(self.state_polar_angle)]) # self.state_speed, self.state_angle]) # self.state = np.append(self.state, [self.state_distance, self.state_polar_angle, # self.state_speed, self.state_angle]) # self.state = np.append(self.state, [int(self.goal_x*10), int(self.goal_y*10), # int(self.gazebo_robot_pose_x*10), int(self.gazebo_robot_pose_y*10), # int(self.last_robot_pose_x*10), int(self.last_robot_pose_y*10), # self.state_speed, self.state_angle]) # print(self.sample_laser_data) # 应该把最后的位姿信息也作为限制, min(distance) 选择0.25,有待考察,因为将测距仪放在车头前部 if self.true_state_distance < self.distance_to_goal_threshold \ or min(self.sample_laser_data) < self.collision_threshold: # and (self.gazebo_robot_speed < 0.1): self.done = True # print("SPACE") # print(self.sample_laser_data) # print(self.last_laser_data) # print("x: " + str(self.gazebo_robot_pose_x) + "y: " + str(self.gazebo_robot_pose_y)) # print(self.true_state_distance) if self.steps <= 10: # for debug print("robot: " + str(ac_num) + " : ") print(self.sample_laser_data) # print(self.last_laser_data) print("goal_x: " + str(self.goal_x) + "goal_y: " + str(self.goal_y)) print("init_x: " + str(self.gazebo_robot_pose_x) + "init_y: " + str(self.gazebo_robot_pose_y)) # print(self.int_sample_laser_data) # self.state = self.state.reshape((14, )) self.state = self.state.reshape((12, )) reward_num = self.reward() # reward值必须在更新last_distance_to_goal之前计算 self.last_distance_to_goal = self.state_distance # 更新距离,为下次计算reward准备 self.last_laser_data = self.sample_laser_data[:] # 注意这里,坑.... copy!!! # print("actor"+str(ac_num)+": ") # print(self.state) return self.state, reward_num, self.done, {} def distance_to_goal(self): return math.sqrt((self.goal_x - self.gazebo_robot_pose_x)**2 + (self.goal_y - self.gazebo_robot_pose_y)**2) def reward(self): # 应该把最后的位姿信息也作为reward的限制,与done保持一致,目标距离与障碍物距离之间应该有交叉reward? if self.true_state_distance < self.distance_to_goal_threshold: return self.arrive_reward elif min( self.sample_laser_data ) < self.collision_threshold: # 有关障碍物距离部分,原始激光数据有噪声(或其它原因,不能直接做判定,甚至达到0.2 ???) # print(self.sample_laser_data) # print("error") return self.collision_reward # 谨慎选择有关距离的reward_fuc, 避免robot有可能出现的刷分行为(与arrive_reward的比例,正负等等) # 有关距离目标距离的部分,应该可以加快学习速度,但是比较难处理,超过目标点之后,跑远了就给它负的reward # 方案一 # else: # return self.road_reward_times * (self.last_distance_ro_goal-self.distance_to_goal()) # 方案二 else: # 应该设置成总选action_1给负分 # if self.action_0 is True: # plus_reward = 0.1 # else: # plus_reward = 0 # if self.last_distance_to_goal - self.state_distance > 0: # 越来越近 # return 1.0 * self.road_reward_times + plus_reward # elif self.last_distance_to_goal - self.state_distance < 0: # 远离惩罚 # return -0.5 * self.road_reward_times + plus_reward # 可以加大远离的惩罚 # else: # return 0 + plus_reward if self.last_distance_to_goal - self.state_distance > 0: # 越来越近 return 1.0 * self.road_reward_times + self.time_cost_reward elif self.last_distance_to_goal - self.state_distance < 0: # 远离惩罚,不动惩罚 return -0.5 * self.road_reward_times + self.time_cost_reward # 可以加大远离的惩罚 else: return 0 + self.time_cost_reward # else: # return 0.0 def render(self, mode='human'): pass # @staticmethod # def normalization_func(max_num, min_num, x): # return (x - min_num)/(max_num - min_num) # # def normalization_state_input(self): # for i in range(0, 10): # self.state_sample_laser_data[i] = self.normalization_func(10, 0, self.sample_laser_data[i]) # self.state_polar_angle = self.normalization_func(2*math.pi, 0, self.state_polar_angle) # self.state_distance = self.normalization_func(14.14, 0, self.true_state_distance) # sqrt(200) # self.state_speed = self.normalization_func(0.5, 0, self.gazebo_robot_speed) # if self.state_speed < 0: # why? # self.state_speed = 0.0 # self.state_angle = self.normalization_func(0.5, -0.5, self.gazebo_robot_angle) # # # round()取值方式精度不高,具体查看 http://www.runoob.com/w3cnote/python-round-func-note.html # def discretization_state_input(self): # for i in range(0, 10): # self.state_sample_laser_data[i] = round(self.state_sample_laser_data[i], 2) # self.state_polar_angle = round(self.state_polar_angle, 2) # self.state_distance = round(self.state_distance, 2) # self.state_speed = round(self.state_speed, 2) # self.state_angle = round(self.state_angle, 2) # 是否应该换成机器人本身的数据,毕竟真实机器人没有这些 # @staticmethod def get_gazebo_state(self): # 第一个参数model_name,第二个参数为relative_entity_name model_state = self.get_state_srv( 'robot' + str(ac_num), '') # 注意model_state 的类型为GetModelStateResponse robot_pose_x = round(model_state.pose.position.x, 1) # 注意pose为(x,y,z) , 离散, robot_pose_y = round(model_state.pose.position.y, 1) robot_twist_linear_x = round(model_state.twist.linear.x, 4) # 离散化 robot_twist_angular_z = round(model_state.twist.angular.z, 4) # angular.z代表平面机器人的角速度,因为此时z轴为旋转轴 if model_state.pose.orientation.z < 0: # robot_polar_angle位于[-2pi,2pi],what the hell? robot_polar_angle = -2 * math.acos(model_state.pose.orientation.w) else: robot_polar_angle = 2 * math.acos(model_state.pose.orientation.w) # print(robot_polar_angle) # print(model_state.pose.orientation.w) # if robot_polar_angle < 0: # 整合为[0,2pi] # robot_polar_angle += 2*math.pi # robot_polar_angle是robot相对默认坐标系的转向角 if robot_polar_angle > math.pi: # 整合为(-pi,pi] robot_polar_angle -= 2 * math.pi # robot_polar_angle是robot相对默认坐标系的转向角 elif robot_polar_angle <= -math.pi: robot_polar_angle += 2 * math.pi # print(robot_polar_angle/math.pi * 180) robot_to_goal_angle = self.cpose_goal_angle( ) - robot_polar_angle # 这里会被计算成(-2pi,2pi] if robot_to_goal_angle > math.pi: # 整合为(-pi,pi] robot_to_goal_angle -= 2 * math.pi # robot_polar_angle是robot相对默认坐标系的转向角 elif robot_to_goal_angle <= -math.pi: robot_to_goal_angle += 2 * math.pi # print("robot 当前与坐标轴的角度") # print(robot_polar_angle / math.pi * 180) # print("goal 与当前robot坐标轴的角度") # print(self.cpose_goal_angle() / math.pi * 180) # print("goal 与当前robot的角度") # print(robot_to_goal_angle/math.pi * 180) # robot_polar_angle = round(robot_polar_angle*10, 1) # normalization 注释掉这句话 robot_to_goal_angle += math.pi # 变为(0,2pi] # robot_to_goal_angle = round(robot_to_goal_angle*10, 1) # normalization 注释掉这句话 robot_to_goal_angle = round(robot_to_goal_angle * 10) # normalization 注释掉这句话 # robot_orientation_z = round(50*(model_state.pose.orientation.z+1), 1) # z与w 在[-1,1]之间 # robot_orientation_w = round(50*(model_state.pose.orientation.w+1), 1) # return self.cc_func(robot_pose_x), self.cc_func(robot_pose_y), robot_twist_linear_x, robot_twist_angular_z return robot_pose_x, robot_pose_y, robot_twist_linear_x, robot_twist_angular_z, robot_to_goal_angle # 当前实现使用其它方法 # 获取障碍物的位置 # def get_obstacle_pose(self): # # print('call get_obstacle_pose') # for i in range(0, 15): # 15个方块 # obstacle_pose = self.get_state_srv('unit_box_' + str(i), '') # self.obstacle_pose[i][0] = obstacle_pose.pose.position.x # self.obstacle_pose[i][1] = obstacle_pose.pose.position.y # # for i in range(0, 5): # 5个圆柱体,不使用球体,容易到处乱滚 # obstacle_pose = self.get_state_srv('unit_cylinder_' + str(i), '') # self.obstacle_pose[i + 15][0] = obstacle_pose.pose.position.x # self.obstacle_pose[i + 15][1] = obstacle_pose.pose.position.y # # print('end call get_obstacle_pose') # # # 用来随机重置障碍物的位置坐标 # def reset_obstacle_pose(self): # for i in range(0, 20): # self.obstacle_pose[i][0] = (20 * (np.random.ranf() - 0.5)) # x~(-10,10) # while abs(self.obstacle_pose[i][0]) < 1.0: # self.obstacle_pose[i][0] = (20 * (np.random.ranf() - 0.5)) # 距离初始位置过近 # self.obstacle_pose[i][0] = round(self.obstacle_pose[i][0], 1) # 离散化 # self.obstacle_pose[i][1] = (20 * (np.random.ranf() - 0.5)) # y~(-10,10) # while abs(self.obstacle_pose[i][1]) < 1.0: # self.obstacle_pose[i][1] = (20 * (np.random.ranf() - 0.5)) # 距离初始位置过近 # self.obstacle_pose[i][1] = round(self.obstacle_pose[i][1], 1) # # # 使用服务,重置障碍物 # def reset_obstacle_pose_srv(self): # new_obstacle_state = ModelState() # # new_obstacle_state.pose.orientation.x = 0 # new_obstacle_state.pose.orientation.y = 0 # new_obstacle_state.pose.orientation.z = 0 # new_obstacle_state.pose.orientation.w = 0 # # new_obstacle_state.twist.linear.x = 0 # new_obstacle_state.twist.linear.y = 0 # new_obstacle_state.twist.linear.z = 0 # # new_obstacle_state.twist.angular.x = 0 # new_obstacle_state.twist.angular.y = 0 # new_obstacle_state.twist.angular.z = 0 # # new_obstacle_state.reference_frame = "world" # for i in range(0, 15): # 10个方块 # new_obstacle_state.model_name = "unit_box_"+str(i) # new_obstacle_state.pose.position.x = self.obstacle_pose[i][0] # new_obstacle_state.pose.position.y = self.obstacle_pose[i][1] # new_obstacle_state.pose.position.z = 0 # # self.set_obstacle_srv(new_obstacle_state) # # for i in range(0, 5): # 5个圆柱体,不使用球体,容易到处乱滚 # new_obstacle_state.model_name = "unit_cylinder_"+str(i) # new_obstacle_state.pose.position.x = self.obstacle_pose[i+15][0] # new_obstacle_state.pose.position.y = self.obstacle_pose[i+15][1] # new_obstacle_state.pose.position.z = 0 # # self.set_obstacle_srv(new_obstacle_state) # 从原激光数据中抽取10组0.0型的激光数据,相当于每20°取一个; # 为了各robot可以互相探测,激光安装在车身前部,但更合理的做法应当是车身中央,所以要将测量到的laser数据进行一下变换0.15 def sample_fuc(self): self.sample_laser_data[0] = self.laser_data[0] self.sample_laser_data[-1] = self.laser_data[-1] for i in range(1, 9): self.sample_laser_data[i] = self.laser_data[i * 72] for j in range(0, 10): # temp_angle = (math.pi / 9) * j # 弧度制 # self.sample_laser_data[j] += math.sin(temp_angle) * 0.15 # 修正中心位置,这个应该用于安装于车头的情况 self.sample_laser_data[j] = round( self.sample_laser_data[j], 1) # 离散化,保留1位小数,注意round()既非截断,也非四舍五入,也许2位更好? # if self.sample_laser_data[i] == float('inf'): if self.sample_laser_data[j] > 10.0: self.sample_laser_data[j] = 10.0 # # 该函数应该在sample_fuc()之后调用,用于记录一个数组中的几组激光数据,self.rew_sample_laser_data为3×10(暂时) # def add_check_laser_data(self): # data = self.sample_laser_data # print(data) # if self.next_nrsld_index >= len(self.rew_sample_laser_data): # self.rew_sample_laser_data.append(data) # else: # self.rew_sample_laser_data[self.next_nrsld_index] = data # self.next_nrsld_index = (self.next_nrsld_index + 1) % self.num_rew_sample_laser_data # print(self.next_nrsld_index) # def add_check_laser_data(self): # data = self.sample_laser_data # self.rew_sample_laser_data[self.next_nrsld_index] = data # self.next_nrsld_index = (self.next_nrsld_index + 1) % self.num_rew_sample_laser_data # print("what the hell?") # print(self.rew_sample_laser_data) # print(self.next_nrsld_index) # # # 用于恢复<0.3的激光数据 # def get_rid_of_bad_data(self): # for i in range(self.num_laser): # if self.sample_laser_data[i] < 0.3: # for j in range(self.num_rew_sample_laser_data): # if self.rew_sample_laser_data[j][i] > 0.3: # self.sample_laser_data[i] = self.rew_sample_laser_data[j][i] # else: # pass # else: # pass # 用于恢复<0.3的激光数据,或许应该检查的是相同位置激光读数差距(应该不只保存一组,而是多组,方便对比) def get_rid_of_bad_data(self): for i in range(self.num_laser): if self.sample_laser_data[i] < 0.3: # print("check") # print(self.sample_laser_data) # print(self.last_laser_data) self.sample_laser_data[i] = self.last_laser_data[i] else: pass # 如果使用normalization_state_input(),则不要使用这个 def int_sample_fuc(self): for j in range(0, 10): self.int_sample_laser_data[j] = int(self.sample_laser_data[j] * 10) # 变成整数 # 找到其它原因,放弃该部分代码 # # 给出一个避免噪声影响的最小值, 还要加上限制条件(inf情况下),如安然变小(噪声),为false # def check_min_laser_data(self): # mean = 0.0 # sigle_bool_check_data = [0 for x in range(0, self.num_rew_sample_laser_data)] # sigle_bool_bool = [0 for x in range(0, self.num_rew_sample_laser_data)] # for i in range(self.num_laser): # for j in range(self.num_rew_sample_laser_data): # sigle_bool_check_data[j] = self.rew_sample_laser_data[j][i] # mean = sum(sigle_bool_check_data)/len(sigle_bool_check_data) # for k in range(self.num_rew_sample_laser_data): # if (sigle_bool_check_data[k] - mean) > 0.1: # (当前状态下不太可能单次变化超过0.1) # sigle_bool_bool[k] = False # 判定为噪声数据 # else: # sigle_bool_bool[k] = True # if False in sigle_bool_bool: # self.bool_check_laser_data[i] = False # 判定该位置为噪声数据 # # 迭代寻找最小数据,未完成 # def __del__(self): # # 等待至线程中止。这阻塞调用线程直至线程的join()方法被调用中止-正常退出或者抛出未处理的异常-或者是可选的超时发生。 # self.thread1.join() # 还是不对 # def close(self): pass
class StringGameEnv(Env): def __init__(self, max_steps=MAX_STEP): np.random.seed(123) torch.manual_seed(123) self.max_steps = max_steps self.reward_map = {} self._init_reward() logger.debug(self.reward_map) self.recent_actions = deque([], maxlen=SEQ_LEN) self.recent_states = deque([], maxlen=SEQ_LEN) self.cur_state = None self.action_space = Discrete(ACTION_DIM) self.observation_space = Box(low=0, high=1, shape=(STATE_DIM,)) self.step_cnt = 0 self.reset() def _init_reward(self): for seq_len in range(1, SEQ_LEN + 1): for k in itertools.product(CHARACTERS, repeat=seq_len): self.reward_map["".join(k)] = 0 self.reward_map["ABB"] = 5 self.reward_map["BBB"] = -5 def seed(self, seed=None): np.random.seed(seed) torch.manual_seed(seed) @staticmethod def random_action(): return np.random.randint(0, ACTION_DIM) def get_reward(self): """ The function you can write to customize rewards. In this specific environment, the reward only depends on action history """ recent_characters = [CHARACTERS[c] for c in list(self.recent_actions)] string = "".join(recent_characters) reward = self.reward_map[string] return reward, string def step(self, action): assert self.action_space.contains(action) assert self.done is False self.step_cnt += 1 self.recent_states.append(self.cur_state) self.recent_actions.append(action) reward, info = self.get_reward() if self.step_cnt >= self.max_steps: self.done = True ob = self.get_observation() self.cur_state = ob return ob, reward, self.done, {"reward_str": info} def get_observation(self): """ The function you can write to customize transitions. In this specific environment, the next state is exactly the latest action taken. The initial observation is all zeros. """ ob = np.zeros(STATE_DIM) if len(self.recent_actions) > 0: ob[self.recent_actions[-1]] = 1 return ob def reset(self): self.done = False self.recent_states = deque([], maxlen=SEQ_LEN) self.recent_actions = deque([], maxlen=SEQ_LEN) self.step_cnt = 0 ob = self.get_observation() self.cur_state = ob return ob def print_internal_state(self): print("Step", self.step_cnt) state_str = "".join( [CHARACTERS[np.nonzero(c)[0].item()] for c in self.recent_states] ) action_str = "".join([CHARACTERS[c] for c in self.recent_actions]) print( "Internal state: recent states {}, recent actions {}".format( state_str, action_str ) ) @staticmethod def print_ob(ob): return str(ob) @staticmethod def print_action(action): return CHARACTERS[action]
class Version12(WorkflowSimEnv): """ @version 12.0: - Design based on workflow sim - Reward scheme same as v7 - Focus on system overhead """ def __init__(self): # Montage Experiment Variable super(Version12, self).__init__() self.action_range = 100 self.cluster_range = 100 self.action_space = Discrete(self.action_range) self.observation_space = Discrete(self.cluster_range) self.cluster_size = 1 # Episode Conf # Best exec_time: None or 1, depends on reward version self.best_makespan = None self.last_makespan = None self.last_action = None self.reward = 0 self.total_reward = 0.0 self.exec_records = {} self.all_exec_record = list() self.all_overhead_record = list() self.all_makespan_record = list() self.all_benchmark_record = list() self.seed() self.reset() def step(self, action, training=False): assert self.action_space.contains(action) action += 1 done = False reward = 0.0 # Return all the data collected makespan, queue, exec, postscript, cluster, wen = self.run_experiment( vm_size=self.action_range, clustering_choice="CNUM", clustering_method="HORIZONTAL", cluster_size=action) overhead = float(queue) + float(postscript) + float(wen) + float( cluster) makespan = float(makespan) self.all_overhead_record.append(overhead) self.all_makespan_record.append(float(makespan)) self.all_exec_record.append(float(exec)) if not training: # Setting up best exec if self.best_makespan is None: self.best_makespan = makespan if self.last_makespan is None: self.last_makespan = makespan # Rewarding / Penalty Judgement percentile = np.percentile(self.all_makespan_record, 10) benchmark = np.mean(self.all_benchmark_record) # Calc improve percentage if len(self.all_benchmark_record) == 0: self.all_benchmark_record.append(percentile) elif abs(percentile - benchmark) / benchmark * 100 > 10: self.all_benchmark_record.append(percentile) benchmark = np.mean(self.all_benchmark_record) # else: # print(abs(percentile - benchmark) / benchmark * 100) if makespan < benchmark: self.best_makespan = makespan reward = 1 else: reward = -1 self.last_makespan = makespan self.total_reward += reward self.last_action = action if self.total_reward > 20 or self.total_reward < -20: done = True return self._get_obs(), reward, done, { "exec": exec, "overhead": overhead, "makespan": makespan, "queue": queue, "postscript": postscript, "cluster": cluster, "wen": wen, "benchmark": benchmark } def reset(self): self.total_reward = 0.0 # for x in self.all_makespan_record: # if x > np.percentile(self.all_makespan_record, 20): # self.all_makespan_record.remove(x) return np.random.randint( 1, (self.cluster_range + 1)) # , self.clusters_num def _get_obs(self): return np.random.randint(1, (self.cluster_range + 1))