def __init__(self, environment: Gym):
    """
    Initialize the feature extractor.

    :param environment: Environment.
    """

    if not isinstance(environment.gym_native.action_space, Discrete):  # pragma no cover
        raise ValueError('Expected a discrete action space, but did not get one.')

    if environment.gym_native.action_space.n != 2:  # pragma no cover
        raise ValueError('Expected two actions: left and right')

    super().__init__(
        environment=environment,
        actions=[
            Action(i=0, name='left'),
            Action(i=1, name='right')
        ]
    )

    # create interacter over cartesian product of state categories
    self.state_category_interacter = OneHotCategoricalFeatureInteracter([
        OneHotCategory(*args)
        for args in product(*([[True, False]] * 4))
    ])

    self.feature_scaler = NonstationaryFeatureScaler(
        num_observations_refit_feature_scaler=2000,
        refit_history_length=100000,
        refit_weight_decay=0.99999
    )
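# A minimal sketch of what the interacter construction above enumerates: the Cartesian product of four
# boolean state categories yields 2**4 = 16 one-hot categories, one per sign pattern of the four state
# dimensions. This is illustrative only; OneHotCategory is assumed to simply wrap the boolean tuple.
from itertools import product

sign_patterns = list(product(*([[True, False]] * 4)))
assert len(sign_patterns) == 16
assert sign_patterns[0] == (True, True, True, True)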
def test_action_eq_ne():
    a1 = Action(1)
    a2 = Action(1)
    a3 = Action(2)
    assert a1 == a2 and a2 == a1 and a1 != a3 and a3 != a1
def test_human_agent():
    agent = Human()
    a1 = Action(0, 'Foo')
    a2 = Action(1, 'Bar')
    state = MdpState(1, [a1, a2], False)
    agent.sense(state, 0)

    call_num = 0

    def mock_input(prompt: str) -> str:
        nonlocal call_num
        if call_num == 0:
            call_num += 1
            return 'asdf'
        else:
            return 'Bar'

    agent.get_input = mock_input  # MagicMock(return_value='Bar')

    assert agent.act(0) == a2

    with pytest.raises(NotImplementedError):
        rng = RandomState(12345)
        Human.init_from_arguments([], rng, None)
def test_ne():
    s1 = State(1, [Action(1), Action(2)])
    s2 = State(2, [Action(1), Action(2)])
    assert s1 != s2

    # state equality is based on the state identifier, not the action list
    s3 = State(1, [Action(3)])
    assert s1 == s3
def reset_for_new_run(self, agent: Agent) -> State:
    """
    Reset the bandit, initializing arms to new expected values.

    :param agent: Agent.
    :return: New State.
    """

    super().reset_for_new_run(agent)

    # get new arm reward means and initialize new arms
    q_star_means = self.random_state.normal(
        loc=self.q_star_mean,
        scale=self.q_star_variance,
        size=self.k
    )

    self.arms = [
        Arm(
            i=i,
            mean=mean,
            variance=self.reward_variance,
            random_state=self.random_state
        )
        for i, mean in enumerate(q_star_means)
    ]

    self.best_arm = max(self.arms, key=lambda arm: arm.mean)

    return State(i=0, AA=[Action(i) for i in range(self.k)])
def __init__(self, name: str, random_state: RandomState, T: Optional[int], p_h: float):
    """
    Initialize the MDP environment.

    :param name: Name.
    :param random_state: Random state.
    :param T: Maximum number of steps to run, or None for no limit.
    :param p_h: Probability of the coin toss coming up heads.
    """

    self.p_h = p_h
    self.p_t = 1 - p_h

    # the range of possible actions: stake 0 (no play) through 50 (at capital=50). beyond a capital of 50 the
    # agent is only allowed to stake an amount that would take them to 100 on a win.
    AA = [Action(i=stake, name=f'Stake {stake}') for stake in range(0, 51)]

    # two possible rewards: 0.0 and 1.0
    self.r_not_win = Reward(0, 0.0)
    self.r_win = Reward(1, 1.0)
    RR = [self.r_not_win, self.r_win]

    # range of possible states (capital levels), including the terminal capital levels of 0 and 100
    SS = [
        MdpState(
            i=capital,
            # the range of permissible actions is state dependent
            AA=[a for a in AA if a.i <= min(capital, 100 - capital)],
            terminal=capital == 0 or capital == 100
        )
        for capital in range(0, 101)
    ]

    super().__init__(
        name=name,
        random_state=random_state,
        T=T,
        SS=SS,
        RR=RR
    )

    for s in self.SS:
        for a in self.p_S_prime_R_given_S_A[s]:

            # next state and reward if heads
            s_prime_h = self.SS[s.i + a.i]
            r_h = self.r_win if not s.terminal and s_prime_h.i == 100 else self.r_not_win
            self.p_S_prime_R_given_S_A[s][a][s_prime_h][r_h] = self.p_h

            # next state and reward if tails. add the probability, in case the results of heads and tails
            # are the same (e.g., with a stake of zero).
            s_prime_t = self.SS[s.i - a.i]
            r_t = self.r_win if not s.terminal and s_prime_t.i == 100 else self.r_not_win
            self.p_S_prime_R_given_S_A[s][a][s_prime_t][r_t] += self.p_t

    self.check_marginal_probabilities()
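# A quick sanity check of the transition logic above (a sketch, not part of the environment): for any
# stake in a given state, the heads branch contributes p_h and the tails branch contributes p_t = 1 - p_h,
# so each (state, action) pair's outgoing probabilities sum to 1 -- presumably what
# check_marginal_probabilities() verifies. For a stake of 0, both branches land on the same state and
# reward, and the += accumulates p_h + p_t = 1.
p_h = 0.4
capital = 50
for stake in (10, 0):
    outcomes = {}
    outcomes[capital + stake] = p_h
    outcomes[capital - stake] = outcomes.get(capital - stake, 0.0) + (1 - p_h)
    assert abs(sum(outcomes.values()) - 1.0) < 1e-12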
def test_prioritized_planning_environment():
    rng = RandomState(12345)

    planning_environment = PrioritizedSweepingMdpPlanningEnvironment(
        'test',
        rng,
        StochasticEnvironmentModel(),
        1,
        0.3,
        10
    )

    planning_environment.add_state_action_priority(MdpState(1, [], False), Action(1), 0.2)
    planning_environment.add_state_action_priority(MdpState(2, [], False), Action(2), 0.1)
    planning_environment.add_state_action_priority(MdpState(3, [], False), Action(3), 0.3)

    s, a = planning_environment.get_state_action_with_highest_priority()
    assert s.i == 2 and a.i == 2
    s, a = planning_environment.get_state_action_with_highest_priority()
    assert s.i == 1 and a.i == 1
    s, a = planning_environment.get_state_action_with_highest_priority()
    assert s is None and a is None
def __init__(self, random_state: RandomState, T: Optional[int], initial_count: int, player_2: Agent):
    """
    Initialize the game.

    :param random_state: Random state.
    :param T: Maximum number of steps to run, or None for no limit.
    :param initial_count: Initial count for each pit.
    :param player_2: Agent for player 2.
    """

    super().__init__(
        name='mancala',
        random_state=random_state,
        T=T
    )

    self.initial_count = initial_count
    self.player_2 = player_2

    self.r_win = Reward(0, 1.0)
    self.r_lose = Reward(1, -1.0)
    self.r_none = Reward(2, 0.0)

    self.player_1_pockets = [
        Pit(True, self.initial_count, False)
        for _ in range(6)
    ]
    self.player_1_store = Pit(True, 0, True)

    self.player_2_pockets = [
        Pit(False, self.initial_count, False)
        for _ in range(6)
    ]
    self.player_2_store = Pit(False, 0, True)

    self.board = self.player_1_pockets + [self.player_1_store] + self.player_2_pockets + [self.player_2_store]

    for i, pit in enumerate(self.board):

        pit.i = i

        # non-store pits (i.e., pockets) have actions associated with them. Action.i indexes the particular
        # pit within the board.
        if not pit.store:
            pit.action = Action(pit.i)

    # Action.name indicates the i-th pit from the player's perspective
    for i, pit in enumerate(self.player_1_pockets):
        pit.action.name = str(i)

    for i, pit in enumerate(self.player_2_pockets):
        pit.action.name = str(i)

    # pair each player-1 pocket with the opposing player-2 pocket across the board
    for player_1_pocket, opposing_player_2_pocket in zip(self.player_1_pockets, reversed(self.player_2_pockets)):
        player_1_pocket.opposing_pocket = opposing_player_2_pocket
        opposing_player_2_pocket.opposing_pocket = player_1_pocket
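# A small illustration of the opposing-pocket pairing above (hypothetical integer indices stand in for the
# Pit objects): zipping player 1's pockets with the reverse of player 2's pairs pocket 0 with pocket 5,
# 1 with 4, and so on, matching a physical mancala board where opposing pockets face each other.
player_1_indices = [0, 1, 2, 3, 4, 5]
player_2_indices = [0, 1, 2, 3, 4, 5]
pairs = list(zip(player_1_indices, reversed(player_2_indices)))
assert pairs == [(0, 5), (1, 4), (2, 3), (3, 2), (4, 1), (5, 0)]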
def __init__(
        self,
        name: str,
        random_state: RandomState,
        T: Optional[int],
        n_rows: int,
        n_columns: int,
        terminal_states: List[Tuple[int, int]],
        RR: List[Reward]
):
    """
    Initialize the gridworld.

    :param name: Name.
    :param random_state: Random state.
    :param T: Maximum number of steps to run, or None for no limit.
    :param n_rows: Number of rows.
    :param n_columns: Number of columns.
    :param terminal_states: List of terminal-state locations.
    :param RR: List of all possible rewards.
    """

    AA = [
        Action(
            i=i,
            name=direction
        )
        for i, direction in enumerate(['u', 'd', 'l', 'r'])
    ]

    self.a_up, self.a_down, self.a_left, self.a_right = AA

    # states are indexed in row-major order
    SS = [
        MdpState(
            i=row_i * n_columns + col_j,
            AA=AA,
            terminal=False
        )
        for row_i in range(n_rows)
        for col_j in range(n_columns)
    ]

    for row, col in terminal_states:
        SS[row * n_columns + col].terminal = True

    super().__init__(
        name=name,
        random_state=random_state,
        T=T,
        SS=SS,
        RR=RR
    )

    self.grid = np.array(self.SS).reshape(n_rows, n_columns)
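# A minimal check of the row-major indexing used above: state i for (row, col) is row * n_columns + col,
# so reshaping the flat state list to (n_rows, n_columns) recovers the grid layout. Plain integers stand
# in for MdpState here.
import numpy as np

n_rows, n_columns = 3, 4
flat = [row * n_columns + col for row in range(n_rows) for col in range(n_columns)]
grid = np.array(flat).reshape(n_rows, n_columns)
assert grid[2, 3] == 2 * n_columns + 3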
def test_check_state_and_action_lists():
    random = RandomState(12345)
    gw = Gridworld.example_4_1(random, T=None)
    fex = GridworldFeatureExtractor(gw)

    states = [MdpState(i=None, AA=[], terminal=False)]
    actions = [Action(0)]
    fex.check_state_and_action_lists(states, actions)

    # clearing the action list makes the list lengths unequal, which should raise an error
    with pytest.raises(ValueError, match='Expected '):
        actions.clear()
        fex.check_state_and_action_lists(states, actions)
def test_agent_invalid_action():
    random = RandomState()
    agent = StochasticMdpAgent('foo', random, TabularPolicy(None, None), 1.0)

    # test None action
    agent.__act__ = lambda t: None
    with pytest.raises(ValueError, match='Agent returned action of None'):
        agent.act(0)

    # test infeasible action
    action = Action(1, 'foo')
    agent.__act__ = lambda t: action
    state = MdpState(1, [], False)
    agent.sense(state, 0)
    with pytest.raises(ValueError, match=f'Action {action} is not feasible in state {state}'):
        agent.act(0)
def test_stochastic_environment_model():
    random_state = RandomState(12345)

    model = StochasticEnvironmentModel()

    actions = [
        Action(i)
        for i in range(5)
    ]

    states = [
        State(i, actions)
        for i in range(5)
    ]

    for t in range(1000):
        state = sample_list_item(states, None, random_state)
        action = sample_list_item(state.AA, None, random_state)
        next_state = sample_list_item(states, None, random_state)
        reward = Reward(None, random_state.randint(10))
        model.update(state, action, next_state, reward)

    environment_sequence = []
    for i in range(1000):
        state = model.sample_state(random_state)
        action = model.sample_action(state, random_state)
        next_state, reward = model.sample_next_state_and_reward(state, action, random_state)
        environment_sequence.append((next_state, reward))

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_stochastic_environment_model.pickle', 'wb') as file:
    #     pickle.dump(environment_sequence, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_stochastic_environment_model.pickle', 'rb') as file:
        environment_sequence_fixture = pickle.load(file)

    assert environment_sequence == environment_sequence_fixture
def test_agent_invalid_action():
    random = RandomState()
    agent = ActionValueMdpAgent(
        'foo',
        random,
        1.0,
        TabularStateActionValueEstimator(Gridworld.example_4_1(random, None), None, None)
    )

    # test None action
    agent.__act__ = lambda t: None
    with pytest.raises(ValueError, match='Agent returned action of None'):
        agent.act(0)

    # test infeasible action
    action = Action(1, 'foo')
    agent.__act__ = lambda t: action
    state = MdpState(1, [], False)
    agent.sense(state, 0)
    with pytest.raises(
            ValueError,
            match=f'Action {action} is not feasible in state {state}'
    ):
        agent.act(0)
def run_step(self, t: int, agent: Agent, monitor: Monitor) -> bool:
    """
    Run a step of the environment with an agent.

    :param t: Step.
    :param agent: Agent.
    :param monitor: Monitor.
    :return: True if a terminal state was entered and the run should terminate, and False otherwise.
    """

    if self.random_state.random_sample() < self.reset_probability:
        self.reset_for_new_run(agent)

    action = agent.act(t=t)
    monitor.report(t=t, agent_action=action, optimal_action=Action(self.best_arm.i))

    reward = self.pull(action.i)
    monitor.report(t=t, action_reward=reward)

    agent.reward(reward)

    return False
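# A tiny illustration of the random-restart logic above (a sketch; the reset_probability value here is
# assumed for illustration): comparing a uniform draw to the probability on each step yields resets at
# geometrically distributed intervals, with mean 1 / reset_probability steps between resets.
from numpy.random import RandomState

rs = RandomState(12345)
reset_probability = 0.005
resets = sum(rs.random_sample() < reset_probability for _ in range(100000))
assert abs(resets / 100000 - reset_probability) < 0.002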
def __init__(
        self,
        random_state: RandomState,
        T: Optional[int],
        gym_id: str,
        continuous_action_discretization_resolution: Optional[float] = None,
        render_every_nth_episode: Optional[int] = None,
        video_directory: Optional[str] = None,
        steps_per_second: Optional[int] = None,
        plot_environment: bool = False,
        progressive_reward: bool = False
):
    """
    Initialize the environment.

    :param random_state: Random state.
    :param T: Maximum number of steps to run, or None for no limit.
    :param gym_id: Gym identifier. See https://gym.openai.com/envs for a list.
    :param continuous_action_discretization_resolution: A discretization resolution for continuous-action
    environments. Providing this value allows the environment to be used with discrete-action methods via
    discretization of the continuous-action dimensions.
    :param render_every_nth_episode: If passed, the environment will render an episode video per this value.
    :param video_directory: Directory in which to store rendered videos.
    :param steps_per_second: Number of steps per second when displaying videos.
    :param plot_environment: Whether or not to plot the environment.
    :param progressive_reward: Use progressive reward.
    """

    super().__init__(
        name=f'gym ({gym_id})',
        random_state=random_state,
        T=T
    )

    self.gym_id = gym_id
    self.progressive_reward = progressive_reward
    self.continuous_action_discretization_resolution = continuous_action_discretization_resolution
    self.render_every_nth_episode = render_every_nth_episode
    if self.render_every_nth_episode is not None and self.render_every_nth_episode <= 0:
        raise ValueError('render_every_nth_episode must be > 0 if provided.')

    self.video_directory = video_directory
    self.steps_per_second = steps_per_second
    self.gym_native = self.init_gym_native()
    self.previous_observation = None
    self.plot_environment = plot_environment
    self.state_reward_scatter_plot = None
    if self.plot_environment:
        self.state_reward_scatter_plot = ScatterPlot(
            f'{self.gym_id}: State and Reward',
            self.get_state_dimension_names() + ['reward'],
            None
        )

    if self.continuous_action_discretization_resolution is not None and not isinstance(self.gym_native.action_space, Box):
        raise ValueError('Continuous-action discretization is only valid for Box action-space environments.')

    # action space is already discrete: initialize n actions from it.
    if isinstance(self.gym_native.action_space, Discrete):
        self.actions = [
            Action(
                i=i
            )
            for i in range(self.gym_native.action_space.n)
        ]

    # action space is continuous, and we lack a discretization resolution: initialize a single,
    # multi-dimensional action including the min and max values of the dimensions. a policy gradient
    # approach will be required.
    elif isinstance(self.gym_native.action_space, Box) and self.continuous_action_discretization_resolution is None:
        self.actions = [
            ContinuousMultiDimensionalAction(
                value=None,
                min_values=self.gym_native.action_space.low,
                max_values=self.gym_native.action_space.high
            )
        ]

    # action space is continuous, and we have a discretization resolution: discretize it. this is generally
    # not a great approach, as it results in high-dimensional action spaces. but here goes.
    elif isinstance(self.gym_native.action_space, Box) and self.continuous_action_discretization_resolution is not None:

        box = self.gym_native.action_space

        # continuous n-dimensional action space with identical bounds on each dimension
        if len(box.shape) == 1:
            action_discretizations = [
                np.linspace(low, high, math.ceil((high - low) / self.continuous_action_discretization_resolution))
                for low, high in zip(box.low, box.high)
            ]
        else:  # pragma no cover
            raise ValueError(f'Unknown format of continuous action space: {box}')

        self.actions = [
            DiscretizedAction(
                i=i,
                continuous_value=np.array(n_dim_action)
            )
            for i, n_dim_action in enumerate(product(*action_discretizations))
        ]

    else:  # pragma no cover
        raise ValueError(f'Unknown Gym action space type: {type(self.gym_native.action_space)}')

    # set progressive goal for certain environments
    if self.gym_id == Gym.MCC_V0:
        if self.progressive_reward:
            self.mcc_curr_goal_x_pos = Gym.MCC_V0_TROUGH_X_POS + 0.1
        else:
            self.mcc_curr_goal_x_pos = Gym.MCC_V0_GOAL_X_POS
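# A standalone sketch of the Box discretization above, under assumed bounds and resolution: each dimension
# of a continuous action space is discretized with np.linspace at the given resolution, and the
# per-dimension grids are crossed with itertools.product to form the discrete action set. Note how quickly
# the action count grows with dimensionality.
import math
from itertools import product

import numpy as np

low, high = np.array([-1.0, -2.0]), np.array([1.0, 2.0])
resolution = 0.5
grids = [
    np.linspace(lo, hi, math.ceil((hi - lo) / resolution))
    for lo, hi in zip(low, high)
]
discrete_actions = [np.array(a) for a in product(*grids)]
assert len(discrete_actions) == len(grids[0]) * len(grids[1])  # 4 * 8 = 32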
def test_action_str():
    action = Action(1, 'foo')
    assert str(action) == '1: foo'