def before_step(self, action):
    """Check that the monitored environment is in a steppable state.

    Raises ``error.ResetNeeded`` when the current episode is already done,
    or when no reset has happened yet (``self.steps`` is ``None``).
    ``action`` is accepted but not inspected by this check.
    """
    assert not self.closed

    if self.done:
        # The episode has ended; the caller has to reset first.
        finished_msg = (
            "Trying to step environment which is currently done. "
            "While the monitor is active for {}, you cannot step beyond "
            "the end of an episode. Call 'env.reset()' to start the next "
            "episode."
        ).format(self.env_id)
        raise error.ResetNeeded(finished_msg)

    if self.steps is None:
        # No step counter yet, i.e. reset() has not been called.
        unstarted_msg = (
            "Trying to step an environment before reset. "
            "While the monitor is active for {}, you must call "
            "'env.reset()' before taking an initial step."
        ).format(self.env_id)
        raise error.ResetNeeded(unstarted_msg)
def before_step(self, action):
    """Record ``action`` ahead of a step, refusing if the episode is over.

    Appends ``action`` to ``self.actions`` when the episode is still
    running. Raises ``error.ResetNeeded`` when ``self.done`` is set.
    """
    assert not self.closed

    if not self.done:
        self.actions.append(action)
        return

    # Episode already finished: stepping again requires a reset.
    raise error.ResetNeeded(
        "Trying to step environment which is currently done. "
        "While the monitor is active, you cannot step beyond the end of "
        "an episode. Call 'env.reset()' to start the next episode."
    )
def step(
    self, action: Tuple[int, int], player: Optional[int] = None
) -> Tuple[Any, float, bool, Dict]:
    """Place the current player's mark on the board and advance the game.

    Args:
        action: board index to mark; it is converted to a tuple and used
            to index ``self.board``.
        player: optional identifier of the player making the move. When
            given (and truthy) it must match ``self.curr_turn``; this
            guards against moves being submitted out of turn.

    Returns:
        ``(observation, reward, done, info)``. ``reward`` is the float
        value of ``check_win`` when that value is truthy, otherwise 0.0.
        ``info`` is always an empty dict.

    Raises:
        error.InvalidAction: the chosen cell is not 0, or ``player`` does
            not match the player whose turn it is.
        error.ResetNeeded: the game is already over.
    """
    # Validation: the cell must be free, the game running, the mover right.
    action = tuple(action)
    if self.board[action] != 0:
        raise error.InvalidAction(f"action {action} is not a vaid choice")
    if self.done:
        raise error.ResetNeeded("Call reset as game is over")
    if player:
        if player != self.curr_turn:
            raise error.InvalidAction(
                f"Player {self.curr_turn}'s turn. Move request from {player}")

    logger.debug("Selected action: %s on turn %d", action,
                 self.turns_played + 1)

    # The turn marker doubles as the value written to the board.
    self.board[action] = self.curr_turn

    # A truthy result from check_win ends the game; otherwise the game
    # ends without a reward once turns_played reaches 9.
    outcome = check_win(self.board)
    if outcome or self.turns_played == 9:
        self.done = True
        payoff = float(outcome) if outcome else 0.0
    else:
        # Game continues: hand the turn to the next player.
        self.curr_turn = next(self.turn_iterator)
        payoff = 0.0

    return self._get_obs(), payoff, self.done, {}
def step(self, action):
    """Apply ``action`` for one timestep of the current episode.

    The state is read via ``_get_new_state()`` before the action is
    applied, then the reward is computed and the timestep counter is
    advanced.

    Args:
        action: key into ``BaseEnv.action_space.lookup`` (used for the
            debug log) that is also passed to ``_take_action``.

    Returns:
        ``(state, reward, done, remaining)`` where ``done`` becomes True
        once ``self.timesteps`` equals ``self.horizon`` and ``remaining``
        is the number of timesteps left as a float (0.0 when done).

    Raises:
        error.ResetNeeded: no episode has been started
            (``self.episode_number`` is falsy) or the horizon has already
            been reached.
    """
    # Compare by value: `is` on integers only coincides with equality for
    # CPython's small cached ints, so larger horizons would never match.
    if not self.episode_number or self.timesteps == self.horizon:
        raise error.ResetNeeded()

    state = self._get_new_state()
    self._take_action(action)
    reward = self._get_reward()

    message = "Timestep {}:==: Action: {} ; Reward: {}".format(
        self.timesteps, BaseEnv.action_space.lookup[action], reward)
    self.logger.debug(message)

    self.timesteps = self.timesteps + 1
    if self.timesteps != self.horizon:
        # Episode continues: move the cursor forward.
        self.current = self.current + 1
        return state, reward, False, float(self.horizon - self.timesteps)

    return state, reward, True, 0.0
def step(self, action):
    """Place a stone on the board and report the resulting status.

    Args:
        action: a ``(row, col, stone)`` triple; ``stone`` is an index
            into ``self.STONES``.

    Returns:
        ``(board_copy, reward, done, info)`` where ``board_copy`` is a
        deep copy of the board, ``reward`` and ``done`` come from
        ``_check_status()``, and ``info`` is an empty dict.

    Raises:
        error.ResetNeeded: the episode is already done.
        error.InvalidAction: the target cell is not empty, the stone
            index is not below ``STONE_TYPE_COUNT``, or the stone equals
            ``self.last_stone``.
    """
    if self.done:
        raise error.ResetNeeded("")

    row, col, stone_idx = action

    occupant = self.board[row][col]
    if occupant != self.EMPTY:
        raise error.InvalidAction(
            "Stone '{}' already exists in row: {}, col: {}".format(
                occupant, row, col))
    if stone_idx >= self.STONE_TYPE_COUNT:
        raise error.InvalidAction(
            "Unknown stone type '{}'".format(stone_idx))
    if stone_idx == self.last_stone:
        raise error.InvalidAction("Need to change stone.")

    # Commit the move and remember which stone was played.
    placed = self.STONES[stone_idx]
    self.board[row][col] = placed
    self.last_stone = placed
    self.remaining_place -= 1

    reward, self.done = self._check_status()
    board_snapshot = copy.deepcopy(self.board)
    return board_snapshot, reward, self.done, {}
def step(self, action: list):
    """Advance the simulation by one turn using each agent's order.

    Args:
        action: one order quantity per agent; its length must equal
            ``self.n_agents`` and no entry may be negative.

    Returns:
        ``(state, rewards, done, info)`` where ``state`` comes from
        ``_get_observations()``, ``rewards`` from ``_get_rewards()`` and
        ``info`` is an empty dict.

    Raises:
        error.ResetNeeded: the environment is already done.
        error.InvalidAction: ``action`` has the wrong length or contains
            a negative value.
    """
    # sanity checks
    if self.done:
        raise error.ResetNeeded(
            "Environment is finished, please run env.reset() before taking actions"
        )
    if get_init_len(action) != self.n_agents:
        raise error.InvalidAction(
            f"Length of action array must be same as n_agents({self.n_agents})"
        )
    if any(np.array(action) < 0):
        raise error.InvalidAction(
            f"You can't order negative amount. You agents actions are: {action}"
        )

    # concatenate previous states, self.prev_states in an queue of previous states
    self.prev_states.popleft()
    self.prev_states.append(self._get_observations())

    # make incoming step
    demand = self._get_demand()
    orders_inc = [order.popleft() for order in self.orders]
    # what's the demand for each agent
    # (this value is overwritten further down, after the new orders are queued)
    self.next_incoming_orders = [demand] + orders_inc[:-1]
    ship_inc = [shipment.popleft() for shipment in self.inbound_shipments]

    # calculate inbound shipments respecting orders and stock levels
    for i in range(self.n_agents - 1):
        # manufacturer is assumed to have no constraints
        # stock + incoming shipment
        max_possible_shipment = max(0, self.stocks[i + 1]) + ship_inc[i + 1]
        # incoming order + stockout (backorder)
        order = orders_inc[i] + max(0, -self.stocks[i + 1])
        shipment = min(order, max_possible_shipment)
        self.inbound_shipments[i].append(shipment)
    # the last agent receives exactly what it ordered
    self.inbound_shipments[-1].append(orders_inc[-1])

    # update stocks
    self.stocks = [(stock + inc) for stock, inc in zip(self.stocks, ship_inc)]
    for i in range(1, self.n_agents):
        self.stocks[i] -= orders_inc[i - 1]
    self.stocks[0] -= demand  # for the retailer

    # update orders
    for i in range(self.n_agents):
        self.orders[i].append(action[i])
    self.next_incoming_orders = [self._get_demand()] + [
        x[0] for x in self.orders[:-1]
    ]

    # calculate costs
    # Use the builtin ``float`` as dtype: the ``np.float`` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    self.holding_cost = np.zeros(self.n_agents, dtype=float)
    self.stockout_cost = np.zeros(self.n_agents, dtype=float)
    for i in range(self.n_agents):
        if self.stocks[i] >= 0:
            # only applicable when stocks > 0
            self.holding_cost[i] = max(0, self.stocks[i]) * self.score_weight[0][i]
        else:
            # only applicable when stocks < 0
            self.stockout_cost[i] = -min(0, self.stocks[i]) * self.score_weight[1][i]
    self.cum_holding_cost += self.holding_cost
    self.cum_stockout_cost += self.stockout_cost

    # calculate reward
    rewards = self._get_rewards()

    # check if done
    if self.turn == self.n_turns - 1:
        print(
            f"\nTotal cost is: EUR {sum(self.cum_holding_cost + self.cum_stockout_cost)}"
        )
        self.done = True
    else:
        self.turn += 1

    state = self._get_observations()
    # todo flatten observation dict
    return state, rewards, self.done, {}