Example #1
    def __init__(self, environment: Gym):
        """
        Initialize the feature extractor.

        :param environment: Environment.
        """

        if not isinstance(environment.gym_native.action_space,
                          Discrete):  # pragma no cover
            raise ValueError(
                'Expected a discrete action space, but did not get one.')

        if environment.gym_native.action_space.n != 2:  # pragma no cover
            raise ValueError('Expected two actions:  left and right')

        super().__init__(
            environment=environment,
            actions=[Action(i=0, name='left'),
                     Action(i=1, name='right')])

        # create interacter over the Cartesian product of state categories
        self.state_category_interacter = OneHotCategoricalFeatureInteracter([
            OneHotCategory(*args) for args in product(*([[True, False]] * 4))
        ])

        self.feature_scaler = NonstationaryFeatureScaler(
            num_observations_refit_feature_scaler=2000,
            refit_history_length=100000,
            refit_weight_decay=0.99999)
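
The Cartesian product above enumerates every combination of four boolean state-category flags, 16 in total, and a one-hot category is built for each. A minimal standard-library sketch of that enumeration (illustrative only, no rlai dependencies):

from itertools import product

# every combination of four boolean flags: 2**4 = 16 tuples,
# from (True, True, True, True) down to (False, False, False, False)
category_args = list(product(*([[True, False]] * 4)))

assert len(category_args) == 16
print(category_args[0])   # (True, True, True, True)
print(category_args[-1])  # (False, False, False, False)
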
Example #2
def test_action_eq_ne():

    a1 = Action(1)
    a2 = Action(1)
    a3 = Action(2)

    assert a1 == a2 and a2 == a1 and a1 != a3 and a3 != a1
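
The test above implies that Action equality hinges on the integer identifier i alone, and Example #16 below implies that str() formats as 'i:  name'. A minimal stand-in class with those semantics, written as an assumption for illustration rather than the library's actual implementation:

class SketchAction:
    """Illustrative stand-in for Action; equality is assumed to depend on i only."""

    def __init__(self, i, name=None):
        self.i = i
        self.name = name

    def __eq__(self, other):
        return self.i == other.i

    def __ne__(self, other):
        return not self == other

    def __str__(self):
        return f'{self.i}:  {self.name}'

assert SketchAction(1) == SketchAction(1)
assert SketchAction(1) != SketchAction(2)
assert str(SketchAction(1, 'foo')) == '1:  foo'
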
Example #3
def test_human_agent():

    agent = Human()

    a1 = Action(0, 'Foo')
    a2 = Action(1, 'Bar')

    state = MdpState(1, [a1, a2], False)
    agent.sense(state, 0)

    call_num = 0

    def mock_input(
            prompt: str
    ) -> str:

        nonlocal call_num
        if call_num == 0:
            call_num += 1
            return 'asdf'
        else:
            return 'Bar'

    agent.get_input = mock_input  # MagicMock(return_value='Bar')

    assert agent.act(0) == a2

    with pytest.raises(NotImplementedError):
        rng = RandomState(12345)
        Human.init_from_arguments([], rng, None)
Example #4
def test_ne():

    s1 = State(1, [Action(1), Action(2)])
    s2 = State(2, [Action(1), Action(2)])
    assert s1 != s2

    s3 = State(1, [Action(3)])
    assert s1 == s3
Example #5
    def reset_for_new_run(self, agent: Agent) -> State:
        """
        Reset the bandit, initializing arms to new expected values.

        :param agent: Agent.
        :return: New State.
        """

        super().reset_for_new_run(agent)

        # get new arm reward means and initialize new arms
        q_star_means = self.random_state.normal(loc=self.q_star_mean,
                                                scale=self.q_star_variance,
                                                size=self.k)

        self.arms = [
            Arm(i=i,
                mean=mean,
                variance=self.reward_variance,
                random_state=self.random_state)
            for i, mean in enumerate(q_star_means)
        ]

        self.best_arm = max(self.arms, key=lambda arm: arm.mean)

        return State(i=0, AA=[Action(i) for i in range(self.k)])
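
The arm means are drawn i.i.d. from a normal distribution centered at q_star_mean, and the best arm is the one with the highest sampled mean. A minimal numpy-only sketch of that sampling and selection step, with illustrative parameter values (assumptions, not library defaults):

import numpy as np
from numpy.random import RandomState

random_state = RandomState(12345)

# draw k arm-reward means from a normal distribution
k = 10
q_star_means = random_state.normal(loc=0.0, scale=1.0, size=k)

# the best arm is the one with the highest mean reward
best_arm_i = int(np.argmax(q_star_means))
print(best_arm_i, q_star_means[best_arm_i])
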
Example #6
    def __init__(self, name: str, random_state: RandomState, T: Optional[int],
                 p_h: float):
        """
        Initialize the MDP environment.

        :param name: Name.
        :param random_state: Random state.
        :param T: Maximum number of steps to run, or None for no limit.
        :param p_h: Probability of coin toss coming up heads.
        """

        self.p_h = p_h
        self.p_t = 1 - p_h

        # the range of possible actions:  stake 0 (no play) through 50 (at capital=50). beyond a capital of 50 the
        # agent is only allowed to stake an amount that would take them to 100 on a win.
        AA = [Action(i=stake, name=f'Stake {stake}') for stake in range(0, 51)]

        # two possible rewards:  0.0 and 1.0
        self.r_not_win = Reward(0, 0.0)
        self.r_win = Reward(1, 1.0)
        RR = [self.r_not_win, self.r_win]

        # range of possible states (capital levels)
        SS = [
            MdpState(
                i=capital,

                # the range of permissible actions is state dependent
                AA=[a for a in AA if a.i <= min(capital, 100 - capital)],
                terminal=capital == 0 or capital == 100)

            # include terminal capital levels of 0 and 100
            for capital in range(0, 101)
        ]

        super().__init__(name=name,
                         random_state=random_state,
                         T=T,
                         SS=SS,
                         RR=RR)

        for s in self.SS:
            for a in self.p_S_prime_R_given_S_A[s]:

                # next state and reward if heads
                s_prime_h = self.SS[s.i + a.i]
                r_h = self.r_win if not s.terminal and s_prime_h.i == 100 else self.r_not_win
                self.p_S_prime_R_given_S_A[s][a][s_prime_h][r_h] = self.p_h

                # next state and reward if tails
                s_prime_t = self.SS[s.i - a.i]
                r_t = self.r_win if not s.terminal and s_prime_t.i == 100 else self.r_not_win
                self.p_S_prime_R_given_S_A[s][a][s_prime_t][
                    r_t] += self.p_t  # add the probability, in case the results of head and tail are the same.

        self.check_marginal_probabilities()
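
The permissible stakes are state dependent: a stake can never exceed the current capital and can never overshoot a capital of 100 on a win, which is what min(capital, 100 - capital) enforces. A quick standalone check of that constraint:

# permissible stakes never exceed the current capital and never overshoot 100
for capital, expected_max_stake in [(1, 1), (30, 30), (50, 50), (70, 30), (99, 1)]:
    assert min(capital, 100 - capital) == expected_max_stake
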
Example #7
def test_prioritized_planning_environment():

    rng = RandomState(12345)

    planning_environment = PrioritizedSweepingMdpPlanningEnvironment(
        'test', rng, StochasticEnvironmentModel(), 1, 0.3, 10)

    planning_environment.add_state_action_priority(MdpState(1, [], False),
                                                   Action(1), 0.2)
    planning_environment.add_state_action_priority(MdpState(2, [], False),
                                                   Action(2), 0.1)
    planning_environment.add_state_action_priority(MdpState(3, [], False),
                                                   Action(3), 0.3)

    s, a = planning_environment.get_state_action_with_highest_priority()
    assert s.i == 2 and a.i == 2
    s, a = planning_environment.get_state_action_with_highest_priority()
    assert s.i == 1 and a.i == 1
    s, a = planning_environment.get_state_action_with_highest_priority()
    assert s is None and a is None
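
The assertions show that lower numeric priority values are dequeued first (0.1 before 0.2), and the 0.3 entry is never returned, presumably because it does not fall below the threshold passed to the constructor; that last point is an inference from the test, not something the snippet states. A minimal heapq sketch of the min-priority ordering the test relies on (illustrative, not the library's implementation):

import heapq

queue = []
heapq.heappush(queue, (0.2, 'state 1 / action 1'))
heapq.heappush(queue, (0.1, 'state 2 / action 2'))

# the lowest priority value comes out first
assert heapq.heappop(queue)[1] == 'state 2 / action 2'
assert heapq.heappop(queue)[1] == 'state 1 / action 1'
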
Example #8
    def __init__(self, random_state: RandomState, T: Optional[int],
                 initial_count: int, player_2: Agent):
        """
        Initialize the game.

        :param random_state: Random state.
        :param T: Maximum number of steps to run, or None for no limit.
        :param initial_count: Initial count for each pit.
        :param player_2: Agent for player 2.
        """

        super().__init__(name='mancala', random_state=random_state, T=T)

        self.initial_count = initial_count
        self.player_2 = player_2

        self.r_win = Reward(0, 1.0)
        self.r_lose = Reward(1, -1.0)
        self.r_none = Reward(2, 0.0)

        self.player_1_pockets = [
            Pit(True, self.initial_count, False) for _ in range(6)
        ]
        self.player_1_store = Pit(True, 0, True)

        self.player_2_pockets = [
            Pit(False, self.initial_count, False) for _ in range(6)
        ]
        self.player_2_store = Pit(False, 0, True)

        self.board = self.player_1_pockets + [
            self.player_1_store
        ] + self.player_2_pockets + [self.player_2_store]

        for i, pit in enumerate(self.board):

            pit.i = i

            # non-store pits (i.e., pockets) have actions associated with them. Action.i indexes the particular pit
            # within the board.
            if not pit.store:
                pit.action = Action(pit.i)

        # Action.name indicates the i-th pit from the player's perspective
        for i, pit in enumerate(self.player_1_pockets):
            pit.action.name = str(i)

        for i, pit in enumerate(self.player_2_pockets):
            pit.action.name = str(i)

        for player_1_pocket, opposing_player_2_pocket in zip(
                self.player_1_pockets, reversed(self.player_2_pockets)):
            player_1_pocket.opposing_pocket = opposing_player_2_pocket
            opposing_player_2_pocket.opposing_pocket = player_1_pocket
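
The final loop pairs each of player 1's pockets with the mirror-image pocket on player 2's side of the board. A minimal sketch of how zip with reversed produces that mirrored pairing of indices:

player_1_indices = list(range(6))
player_2_indices = list(range(6))

# pocket 0 faces pocket 5, 1 faces 4, 2 faces 3, and so on
pairs = list(zip(player_1_indices, reversed(player_2_indices)))
assert pairs == [(0, 5), (1, 4), (2, 3), (3, 2), (4, 1), (5, 0)]
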
Example #9
    def __init__(
            self,
            name: str,
            random_state: RandomState,
            T: Optional[int],
            n_rows: int,
            n_columns: int,
            terminal_states: List[Tuple[int, int]],
            RR: List[Reward]
    ):
        """
        Initialize the gridworld.

        :param name: Name.
        :param random_state: Random state.
        :param T: Maximum number of steps to run, or None for no limit.
        :param n_rows: Number of rows.
        :param n_columns: Number of columns.
        :param terminal_states: List of terminal-state locations.
        :param RR: List of all possible rewards.
        """

        AA = [
            Action(
                i=i,
                name=direction
            )
            for i, direction in enumerate(['u', 'd', 'l', 'r'])
        ]

        self.a_up, self.a_down, self.a_left, self.a_right = AA

        SS = [
            MdpState(
                i=row_i * n_columns + col_j,
                AA=AA,
                terminal=False
            )
            for row_i in range(n_rows)
            for col_j in range(n_columns)
        ]

        for row, col in terminal_states:
            SS[row * n_columns + col].terminal = True

        super().__init__(
            name=name,
            random_state=random_state,
            T=T,
            SS=SS,
            RR=RR
        )

        self.grid = np.array(self.SS).reshape(n_rows, n_columns)
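
States are laid out in row-major order (i = row * n_columns + column), so reshaping the flat state list recovers the grid. A minimal numpy sketch of that indexing scheme, using an illustrative 4x4 grid:

import numpy as np

n_rows, n_columns = 4, 4

# flat row-major state indices, reshaped back into the grid
grid = np.arange(n_rows * n_columns).reshape(n_rows, n_columns)

# the state at (row 2, column 3) has flat index 2 * 4 + 3 = 11
assert grid[2, 3] == 2 * n_columns + 3
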
Example #10
def test_check_state_and_action_lists():

    random = RandomState(12345)
    gw = Gridworld.example_4_1(random, T=None)
    fex = GridworldFeatureExtractor(gw)

    states = [MdpState(i=None, AA=[], terminal=False)]
    actions = [Action(0)]
    fex.check_state_and_action_lists(states, actions)

    with pytest.raises(ValueError, match='Expected '):
        actions.clear()
        fex.check_state_and_action_lists(states, actions)
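
The test implies that check_state_and_action_lists rejects state and action lists of unequal length, raising a ValueError whose message starts with 'Expected '. A minimal sketch of that kind of check, offered as an assumption about what the method verifies rather than its actual implementation:

def check_lists_same_length(states, actions):
    # illustrative stand-in; length equality is the assumed criterion
    if len(states) != len(actions):
        raise ValueError(f'Expected {len(states)} actions but got {len(actions)}.')

try:
    check_lists_same_length([object()], [])
except ValueError as e:
    assert str(e).startswith('Expected ')
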
Example #11
def test_agent_invalid_action():

    random = RandomState()
    agent = StochasticMdpAgent('foo', random, TabularPolicy(None, None), 1.0)

    # test None action
    agent.__act__ = lambda t: None

    with pytest.raises(ValueError, match='Agent returned action of None'):
        agent.act(0)

    # test infeasible action
    action = Action(1, 'foo')
    agent.__act__ = lambda t: action
    state = MdpState(1, [], False)
    agent.sense(state, 0)
    with pytest.raises(ValueError, match=f'Action {action} is not feasible in state {state}'):
        agent.act(0)
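
The test monkeypatches __act__ and expects act to validate the returned action: None is rejected, and an action not present in the most recently sensed state's action set is rejected as infeasible. A minimal sketch of that wrapper pattern, assuming act delegates to __act__ and checks feasibility against the sensed state (an illustration of the pattern, not the library's code):

class SketchState:

    def __init__(self, AA):
        self.AA = AA

class SketchAgent:

    def __init__(self):
        self.most_recent_state = None
        self.__act__ = lambda t: None

    def sense(self, state, t):
        # remember the most recently sensed state
        self.most_recent_state = state

    def act(self, t):
        # delegate to __act__ and validate the result (assumed pattern)
        action = self.__act__(t)
        if action is None:
            raise ValueError('Agent returned action of None.')
        if action not in self.most_recent_state.AA:
            raise ValueError(f'Action {action} is not feasible in state {self.most_recent_state}')
        return action

agent = SketchAgent()
agent.sense(SketchState(AA=[]), 0)
try:
    agent.act(0)
except ValueError as e:
    assert str(e) == 'Agent returned action of None.'
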
Example #12
def test_stochastic_environment_model():

    random_state = RandomState(12345)

    model = StochasticEnvironmentModel()

    actions = [
        Action(i)
        for i in range(5)
    ]

    states = [
        State(i, actions)
        for i in range(5)
    ]

    for t in range(1000):
        state = sample_list_item(states, None, random_state)
        action = sample_list_item(state.AA, None, random_state)
        next_state = sample_list_item(states, None, random_state)
        reward = Reward(None, random_state.randint(10))
        model.update(state, action, next_state, reward)

    environment_sequence = []
    for i in range(1000):
        state = model.sample_state(random_state)
        action = model.sample_action(state, random_state)
        next_state, reward = model.sample_next_state_and_reward(state, action, random_state)
        environment_sequence.append((next_state, reward))

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_stochastic_environment_model.pickle', 'wb') as file:
    #     pickle.dump(environment_sequence, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_stochastic_environment_model.pickle', 'rb') as file:
        environment_sequence_fixture = pickle.load(file)

    assert environment_sequence == environment_sequence_fixture
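
The test passes None as the second argument to sample_list_item, which suggests an optional weighting that defaults to uniform sampling; that reading is an assumption. A minimal standalone sketch of uniform sampling from a list with a numpy RandomState:

from numpy.random import RandomState

def sample_uniformly(items, random_state):
    # uniform draw over the list; a stand-in for sample_list_item(items, None, random_state)
    return items[random_state.randint(len(items))]

random_state = RandomState(12345)
print(sample_uniformly(['a', 'b', 'c'], random_state))
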
Example #13
def test_agent_invalid_action():

    random = RandomState()
    agent = ActionValueMdpAgent(
        'foo', random, 1.0,
        TabularStateActionValueEstimator(Gridworld.example_4_1(random, None),
                                         None, None))

    # test None action
    agent.__act__ = lambda t: None

    with pytest.raises(ValueError, match='Agent returned action of None'):
        agent.act(0)

    # test infeasible action
    action = Action(1, 'foo')
    agent.__act__ = lambda t: action
    state = MdpState(1, [], False)
    agent.sense(state, 0)
    with pytest.raises(
            ValueError,
            match=f'Action {action} is not feasible in state {state}'):
        agent.act(0)
Example #14
    def run_step(self, t: int, agent: Agent, monitor: Monitor) -> bool:
        """
        Run a step of the environment with an agent.

        :param t: Step.
        :param agent: Agent.
        :param monitor: Monitor.
        :return: True if a terminal state was entered and the run should terminate, and False otherwise.
        """

        if self.random_state.random_sample() < self.reset_probability:
            self.reset_for_new_run(agent)

        action = agent.act(t=t)
        monitor.report(t=t,
                       agent_action=action,
                       optimal_action=Action(self.best_arm.i))

        reward = self.pull(action.i)
        monitor.report(t=t, action_reward=reward)

        agent.reward(reward)

        return False
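
Each step resets the bandit with probability reset_probability, so the number of steps between resets follows a geometric distribution with mean 1 / reset_probability. A quick numpy check of that expectation (the 0.01 value is purely illustrative):

import numpy as np
from numpy.random import RandomState

random_state = RandomState(12345)
reset_probability = 0.01

# simulate many steps and compare the average gap between resets to 1 / p
resets = random_state.random_sample(1_000_000) < reset_probability
mean_gap = 1.0 / resets.mean()
assert abs(mean_gap - 1.0 / reset_probability) < 5.0  # close to 100
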
Example #15
    def __init__(
            self,
            random_state: RandomState,
            T: Optional[int],
            gym_id: str,
            continuous_action_discretization_resolution: Optional[float] = None,
            render_every_nth_episode: Optional[int] = None,
            video_directory: Optional[str] = None,
            steps_per_second: Optional[int] = None,
            plot_environment: bool = False,
            progressive_reward: bool = False
    ):
        """
        Initialize the environment.

        :param random_state: Random state.
        :param T: Maximum number of steps to run, or None for no limit.
        :param gym_id: Gym identifier. See https://gym.openai.com/envs for a list.
        :param continuous_action_discretization_resolution: A discretization resolution for continuous-action
        environments. Providing this value allows the environment to be used with discrete-action methods via
        discretization of the continuous-action dimensions.
        :param render_every_nth_episode: If passed, the environment will render an episode video per this value.
        :param video_directory: Directory in which to store rendered videos.
        :param steps_per_second: Number of steps per second when displaying videos.
        :param plot_environment: Whether or not to plot the environment.
        :param progressive_reward: Use progressive reward.
        """

        super().__init__(
            name=f'gym ({gym_id})',
            random_state=random_state,
            T=T
        )

        self.gym_id = gym_id
        self.progressive_reward = progressive_reward
        self.continuous_action_discretization_resolution = continuous_action_discretization_resolution
        self.render_every_nth_episode = render_every_nth_episode
        if self.render_every_nth_episode is not None and self.render_every_nth_episode <= 0:
            raise ValueError('render_every_nth_episode must be > 0 if provided.')

        self.video_directory = video_directory
        self.steps_per_second = steps_per_second
        self.gym_native = self.init_gym_native()
        self.previous_observation = None
        self.plot_environment = plot_environment
        self.state_reward_scatter_plot = None
        if self.plot_environment:
            self.state_reward_scatter_plot = ScatterPlot(
                f'{self.gym_id}:  State and Reward',
                self.get_state_dimension_names() + ['reward'],
                None
            )

        if self.continuous_action_discretization_resolution is not None and not isinstance(self.gym_native.action_space, Box):
            raise ValueError('Continuous-action discretization is only valid for Box action-space environments.')

        # action space is already discrete:  initialize n actions from it.
        if isinstance(self.gym_native.action_space, Discrete):
            self.actions = [
                Action(
                    i=i
                )
                for i in range(self.gym_native.action_space.n)
            ]

        # action space is continuous and we lack a discretization resolution:  initialize a single, multi-dimensional
        # action including the min and max values of the dimensions. a policy gradient approach will be required.
        elif isinstance(self.gym_native.action_space, Box) and self.continuous_action_discretization_resolution is None:
            self.actions = [
                ContinuousMultiDimensionalAction(
                    value=None,
                    min_values=self.gym_native.action_space.low,
                    max_values=self.gym_native.action_space.high
                )
            ]

        # action space is continuous and we have a discretization resolution:  discretize it. this is generally not a
        # great approach, as it results in high-dimensional action spaces. but here goes.
        elif isinstance(self.gym_native.action_space, Box) and self.continuous_action_discretization_resolution is not None:

            box = self.gym_native.action_space

            # continuous n-dimensional action space with identical bounds on each dimension
            if len(box.shape) == 1:
                action_discretizations = [
                    np.linspace(low, high, math.ceil((high - low) / self.continuous_action_discretization_resolution))
                    for low, high in zip(box.low, box.high)
                ]
            else:  # pragma no cover
                raise ValueError(f'Unknown format of continuous action space:  {box}')

            self.actions = [
                DiscretizedAction(
                    i=i,
                    continuous_value=np.array(n_dim_action)
                )
                for i, n_dim_action in enumerate(product(*action_discretizations))
            ]

        else:  # pragma no cover
            raise ValueError(f'Unknown Gym action space type:  {type(self.gym_native.action_space)}')

        # set progressive goal for certain environments
        if self.gym_id == Gym.MCC_V0:
            if self.progressive_reward:
                self.mcc_curr_goal_x_pos = Gym.MCC_V0_TROUGH_X_POS + 0.1
            else:
                self.mcc_curr_goal_x_pos = Gym.MCC_V0_GOAL_X_POS
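
For Box action spaces with a discretization resolution, each dimension is gridded with np.linspace and the per-dimension grids are combined via a Cartesian product. A minimal numpy sketch of that gridding for a single dimension, using illustrative bounds and resolution rather than values from any particular Gym environment:

import math
import numpy as np

low, high, resolution = -2.0, 2.0, 0.5

# number of grid points for the dimension, as computed in the constructor above
n_points = math.ceil((high - low) / resolution)  # ceil(4.0 / 0.5) = 8
grid = np.linspace(low, high, n_points)

assert n_points == 8 and grid[0] == low and grid[-1] == high
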
Example #16
def test_action_str():

    action = Action(1, 'foo')

    assert str(action) == '1:  foo'