def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[D.T_state, D.T_agent[TransitionValue[D.T_value]], D.T_agent[D.T_info]]:
    # Get players' moves
    move1, move2 = action['player1'], action['player2']
    # Compute rewards
    r1, r2 = {
        (Move.rock, Move.rock): (0, 0),
        (Move.rock, Move.paper): (-1, 1),
        (Move.rock, Move.scissors): (1, -1),
        (Move.paper, Move.rock): (1, -1),
        (Move.paper, Move.paper): (0, 0),
        (Move.paper, Move.scissors): (-1, 1),
        (Move.scissors, Move.rock): (-1, 1),
        (Move.scissors, Move.paper): (1, -1),
        (Move.scissors, Move.scissors): (0, 0)
    }[move1, move2]
    # Compute num_move increment
    last_state = self._memory
    num_move = last_state.num_move + 1
    return TransitionOutcome(
        state=State(num_move=num_move),
        value={
            'player1': TransitionValue(reward=r1),
            'player2': TransitionValue(reward=r2)
        },
        termination=(num_move >= self._max_moves))
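# Illustrative sketch (not from the original source): the rock-paper-scissors step
# above assumes a `Move` enum and a `State` record. Only the names used above
# (Move.rock/paper/scissors, State.num_move) come from the snippet itself; the
# member values and the NamedTuple choice below are assumptions.
from enum import Enum
from typing import NamedTuple


class Move(Enum):
    rock = 0
    paper = 1
    scissors = 2


class State(NamedTuple):
    num_move: int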
def _get_transition_value(self, state: TrafficLightState, action: TrafficLightAction,
                          next_state: Optional[TrafficLightState] = None) -> TransitionValue:
    '''Returns the value of the transition.'''
    if next_state is not None:
        # Penalize queueing cars: east queue after the transition plus north queue before it
        queueing_cars = next_state.cars_queueing_east + state.cars_queueing_north
        return TransitionValue(cost=queueing_cars, reward=-queueing_cars)
    else:
        return TransitionValue(cost=0)
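# Illustrative sketch (assumed types): the traffic-light snippets reference a state
# record and two enums roughly like the following. Only the attribute and member names
# that appear in the snippets (cars_queueing_north/east, north_light, east_light,
# RECENT_RED, RED, GREEN, DO_NOT_SWITCH) are taken from the source; everything else
# is an assumption.
from dataclasses import dataclass
from enum import Enum


class SingleLightState(Enum):
    GREEN = 0
    RECENT_RED = 1
    RED = 2


class TrafficLightAction(Enum):
    SWITCH = 0          # assumed complementary action, not shown in the snippets
    DO_NOT_SWITCH = 1


@dataclass(frozen=True)
class TrafficLightState:
    cars_queueing_north: int
    cars_queueing_east: int
    north_light: SingleLightState
    east_light: SingleLightState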
def _get_transition_value(
        self, state: SMState, action: SMAction, next_state: Optional[SMState] = None) -> TransitionValue:
    (value, _distrib) = self._state_to_action_to_output[state][action]
    return TransitionValue(cost=value)
def _get_transition_value(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
    next_state: Optional[D.T_state] = None
) -> D.T_agent[TransitionValue[D.T_value]]:
    # Shift the parent domain's reward by -1 for every transition
    v = super()._get_transition_value(memory, action, next_state)
    return TransitionValue(reward=v.reward - 1)
def _get_transition_value(self, memory: D.T_memory[D.T_state], action: D.T_agent[D.T_concurrency[D.T_event]],
                          next_state: Optional[D.T_state] = None) -> D.T_agent[TransitionValue[D.T_value]]:
    if next_state.x == memory.x and next_state.y == memory.y:
        cost = 2  # big penalty when hitting a wall
    else:
        cost = abs(next_state.x - memory.x) + abs(next_state.y - memory.y)  # every move costs 1
    return TransitionValue(cost=cost)
def _get_transition_value(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
    next_state: Optional[D.T_state] = None
) -> D.T_agent[TransitionValue[D.T_value]]:
    # every move costs 1
    return TransitionValue(cost=abs(next_state.x - memory.x) + abs(next_state.y - memory.y))
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[D.T_state, D.T_agent[TransitionValue[D.T_value]], D.T_agent[D.T_info]]:
    obs, reward, done, info = self._gym_env.step(action)
    return TransitionOutcome(state=obs, value=TransitionValue(reward=reward), termination=done, info=info)
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[D.T_state, D.T_agent[TransitionValue[D.T_value]], D.T_agent[D.T_info]]:
    # Apply a -1 reward shift on top of the parent domain's step
    o = super()._state_step(action)
    return TransitionOutcome(state=o.state, value=TransitionValue(reward=o.value.reward - 1),
                             termination=o.termination, info=o.info)
def _sample(
    self, memory: D.T_memory[D.T_state], action: D.T_agent[D.T_concurrency[D.T_event]]
) -> EnvironmentOutcome[D.T_agent[D.T_observation], D.T_agent[TransitionValue[D.T_value]],
                        D.T_agent[D.T_info]]:
    # Normalize and round the sampled observation, and shift the reward by -1
    o = super()._sample(memory, action)
    return EnvironmentOutcome(
        observation=GymDomainStateProxy(state=normalize_and_round(o.observation._state),
                                        context=o.observation._context),
        value=TransitionValue(reward=o.value.reward - 1),
        termination=o.termination,
        info=o.info)
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[D.T_state, D.T_agent[TransitionValue[D.T_value]], D.T_agent[D.T_info]]:
    # Normalize and round the resulting state, and shift the reward by -1
    o = super()._state_step(action)
    return TransitionOutcome(
        state=GymDomainStateProxy(state=normalize_and_round(o.state._state), context=o.state._context),
        value=TransitionValue(reward=o.value.reward - 1),
        termination=o.termination,
        info=o.info)
def test(self):
    dom = TrafficLightDomain()
    state = TrafficLightState(cars_queueing_north=3, cars_queueing_east=2,
                              north_light=SingleLightState.RECENT_RED, east_light=SingleLightState.RED)
    next_state = TrafficLightState(cars_queueing_north=3, cars_queueing_east=3,
                                   north_light=SingleLightState.RED, east_light=SingleLightState.GREEN)
    action = TrafficLightAction.DO_NOT_SWITCH
    self.assertEqual(dom.get_transition_value(state, action, next_state), TransitionValue(cost=6))
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[D.T_state, D.T_agent[TransitionValue[D.T_value]], D.T_agent[D.T_info]]:
    obs, reward, done, info = self._gym_env.step(action)
    # Attach the appropriate context to the observation, depending on whether
    # the underlying Gym environment exposes get/set state accessors
    if self._set_state is not None and self._get_state is not None:
        state = GymDomainStateProxy(state=obs, context=self._initial_env_state)
    else:
        state = GymDomainStateProxy(state=obs, context=self._init_env)
    return TransitionOutcome(state=state, value=TransitionValue(reward=reward), termination=done, info=info)
def _get_next_state(
        self, memory: D.T_memory[D.T_state],
        action: D.T_agent[D.T_concurrency[D.T_event]]) -> D.T_state:
    env = memory._context[0]
    if self._set_state is None or self._get_state is None:
        env = deepcopy(env)
    elif memory._context[4] != self._get_state(env):
        self._set_state(env, memory._context[4])
    self._gym_env = env  # Just in case the simulation environment would be different from the planner's environment...
    obs, reward, done, info = env.step(action)
    outcome = TransitionOutcome(state=obs, value=TransitionValue(reward=reward), termination=done, info=info)
    # print('Transition:', str(memory._state), ' -> ', str(action), ' -> ', str(outcome.state))
    return GymDomainStateProxy(
        state=outcome.state,
        context=[
            env, memory._state, action, outcome,
            self._get_state(env) if (self._get_state is not None and self._set_state is not None) else None
        ])
def decode(value):
    # value[1] flags whether the encoded number should be read as a reward (truthy)
    # or as a cost (falsy); value[0] holds the number itself
    if value[1].value:
        return TransitionValue(reward=value[0].value)
    else:
        return TransitionValue(cost=value[0].value)
def _get_transition_value(self, memory: D.T_memory[D.T_state], action: D.T_agent[D.T_concurrency[D.T_event]],
                          next_state: Optional[D.T_state] = None) -> D.T_agent[TransitionValue[D.T_value]]:
    # Uniform unit cost for every transition
    return TransitionValue(cost=1)
def _get_transition_value(self, state: GridState, action: GridAction,
                          next_state: Optional[GridState] = None) -> TransitionValue:
    # Each action carries its own cost
    return TransitionValue(cost=action._cost)
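# Illustrative sketch (assumed type): `GridAction` above is expected to carry its own
# cost in a `_cost` attribute; a minimal version could look like this. The movement
# fields and default cost are assumptions, only `_cost` comes from the snippet itself.
from dataclasses import dataclass


@dataclass(frozen=True)
class GridAction:
    dx: int
    dy: int
    _cost: float = 1.0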