def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state, D.T_agent[TransitionValue[D.T_value]], D.T_agent[D.T_info]
]:
    # Get players' moves
    move1, move2 = action['player1'], action['player2']
    # Compute rewards
    r1, r2 = {
        (Move.rock, Move.rock): (0, 0),
        (Move.rock, Move.paper): (-1, 1),
        (Move.rock, Move.scissors): (1, -1),
        (Move.paper, Move.rock): (1, -1),
        (Move.paper, Move.paper): (0, 0),
        (Move.paper, Move.scissors): (-1, 1),
        (Move.scissors, Move.rock): (-1, 1),
        (Move.scissors, Move.paper): (1, -1),
        (Move.scissors, Move.scissors): (0, 0),
    }[move1, move2]
    # Compute num_move increment
    last_state = self._memory
    num_move = last_state.num_move + 1
    return TransitionOutcome(
        state=State(num_move=num_move),
        value={
            'player1': TransitionValue(reward=r1),
            'player2': TransitionValue(reward=r2),
        },
        termination=(num_move >= self._max_moves),
    )
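# A minimal standalone sketch of the zero-sum payoff lookup used above, assuming
# `Move` is a plain Enum with rock/paper/scissors members (the member names match
# the domain code; everything else here is illustrative, not the library's code).
from enum import Enum


class Move(Enum):
    rock = 0
    paper = 1
    scissors = 2


PAYOFF = {
    (Move.rock, Move.rock): (0, 0),
    (Move.rock, Move.paper): (-1, 1),
    (Move.rock, Move.scissors): (1, -1),
    (Move.paper, Move.rock): (1, -1),
    (Move.paper, Move.paper): (0, 0),
    (Move.paper, Move.scissors): (-1, 1),
    (Move.scissors, Move.rock): (-1, 1),
    (Move.scissors, Move.paper): (1, -1),
    (Move.scissors, Move.scissors): (0, 0),
}

# Player 1 plays paper against player 2's rock: player 1 wins, player 2 loses.
assert PAYOFF[(Move.paper, Move.rock)] == (1, -1)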
def _get_next_state(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
) -> D.T_state:
    env = memory._context[0]
    if self._set_state is None or self._get_state is None:
        env = deepcopy(env)
    elif memory._context[4] != self._get_state(env):
        self._set_state(env, memory._context[4])
    self._gym_env = env  # Just in case the simulation environment differs from the planner's environment...
    obs, reward, done, info = env.step(action)
    outcome = TransitionOutcome(
        state=obs, value=Value(reward=reward), termination=done, info=info
    )
    # print('Transition:', str(memory._state), ' -> ', str(action), ' -> ', str(outcome.state))
    # context layout: [env, previous state, action, outcome, saved env state (or None)]
    return GymDomainStateProxy(
        state=outcome.state,
        context=[
            env,
            memory._state,
            action,
            outcome,
            self._get_state(env)
            if (self._get_state is not None and self._set_state is not None)
            else None,
        ],
    )
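# Illustrative sketch (not the library's code) of the copy-or-restore branch above:
# without a get_state/set_state pair, the only safe way to simulate a step without
# disturbing the stored environment is to deep-copy it; with state accessors, the
# environment is restored in place only when its internal state has drifted from
# the one recorded in the memory proxy. The helper name `prepare_env` is hypothetical.
from copy import deepcopy


def prepare_env(env, saved_state, get_state=None, set_state=None):
    if get_state is None or set_state is None:
        return deepcopy(env)  # no state accessors: work on a private copy
    if get_state(env) != saved_state:  # env has drifted: restore the saved state
        set_state(env, saved_state)
    return env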
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state,
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    o = super()._state_step(action)
    self._current_depth += 1
    self._cumulated_reward += o.value.reward
    # self._cumulated_dist_to_start += math.exp(-math.fabs(self._gym_env.sim.get_property_value(prp.position_distance_from_start_mag_mt)))
    self._cumulated_dist_to_start = self._gym_env.sim.get_property_value(
        prp.position_distance_from_start_mag_mt
    )
    self._cumulated_dist_to_line += math.exp(
        -math.fabs(self._gym_env.sim.get_property_value(prp.shortest_dist))
    )
    return TransitionOutcome(
        state=GymDomainStateProxy(
            state=o.state._state,
            context=(
                self._current_depth,
                self._cumulated_reward,
                self._cumulated_dist_to_start,
                self._cumulated_dist_to_line,
            ),
        ),
        value=o.value,
        termination=o.termination,
        info=o.info,
    )
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state,
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    obs, reward, done, info = self._gym_env.step(action)
    return TransitionOutcome(
        state=obs, value=Value(reward=reward), termination=done, info=info
    )
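# Hedged compatibility note: the step calls in these variants unpack the classic
# 4-tuple Gym API (obs, reward, done, info). If the wrapped environment follows the
# newer Gymnasium 5-tuple API (obs, reward, terminated, truncated, info), a thin
# adapter like this illustrative `step_compat` helper restores the expected shape.
def step_compat(env, action):
    result = env.step(action)
    if len(result) == 5:
        obs, reward, terminated, truncated, info = result
        return obs, reward, terminated or truncated, info
    return result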
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state, D.T_agent[TransitionValue[D.T_value]], D.T_agent[D.T_info]
]:
    o = super()._state_step(action)
    # Shift the reward by -1, i.e. charge a unit cost for every step taken
    return TransitionOutcome(
        state=o.state,
        value=TransitionValue(reward=o.value.reward - 1),
        termination=o.termination,
        info=o.info,
    )
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state, D.T_agent[TransitionValue[D.T_value]], D.T_agent[D.T_info]
]:
    o = super()._state_step(action)
    return TransitionOutcome(
        state=GymDomainStateProxy(
            state=normalize_and_round(o.state._state),
            context=o.state._context,
        ),
        value=TransitionValue(reward=o.value.reward - 1),
        termination=o.termination,
        info=o.info,
    )
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state,
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    # Restore the simulator to the domain's current state before stepping,
    # then snapshot the resulting state afterwards
    self._gym_env.set_state(self._current_state)
    o = super()._state_step(action)
    self._current_state = self._gym_env.get_state()
    return TransitionOutcome(
        state=o.state,
        value=Value(reward=o.value.reward - 1),
        termination=o.termination,
        info=o.info,
    )
def _state_step(
    self, action: D.T_agent[D.T_concurrency[D.T_event]]
) -> TransitionOutcome[
    D.T_state,
    D.T_agent[Value[D.T_value]],
    D.T_agent[D.T_predicate],
    D.T_agent[D.T_info],
]:
    obs, reward, done, info = self._gym_env.step(action)
    if self._set_state is not None and self._get_state is not None:
        state = GymDomainStateProxy(state=obs, context=self._initial_env_state)
    else:
        state = GymDomainStateProxy(state=obs, context=self._init_env)
    return TransitionOutcome(
        state=state, value=Value(reward=reward), termination=done, info=info
    )
def decode(outcome):
    # Flat buffer layout: [0] state, [1:3] transition value, [3] termination flag
    return TransitionOutcome(
        state=MyShmProxy.StateProxy.decode(outcome[0]),
        value=MyShmProxy.TransitionValueProxy.decode(outcome[1:3]),
        termination=MyShmProxy.BoolProxy.decode(outcome[3]),
    )