Example 1
def main():

    runner = ExperimentRunner()

    # run 5 experiments with an increasing number of trajectories
    for N in range(5):
        # basic configuration; num_traj doubles on every iteration
        configuration = {
            "gamma": 0.98,
            "horizon": 5,
            "base_policy": .8,
            "eval_policy": .2,
            "stochastic_env": True,
            "stochastic_rewards": False,
            "sparse_rewards": False,
            "num_traj": 8 * 2**N,
            "is_pomdp": False,
            "pomdp_horizon": 2,
            "seed": 1000,
            "experiment_number": 0,
            "access": 0,
            "secret": 0,
            "to_regress_pi_b": False,
            "frameskip": 1,
            "frameheight": 1,
            "modeltype": 'conv',
            "Qmodel": 'conv1',
        }

        # store the configuration
        cfg = Config(configuration)

        # initialize the Gridworld environment; slippage makes transitions stochastic
        env = Gridworld(slippage=.2 * cfg.stochastic_env)

        # set seed for the experiment
        np.random.seed(cfg.seed)
        eval_policy = cfg.eval_policy
        base_policy = cfg.base_policy

        # to_grid and from_grid are specific to Gridworld:
        # they convert an integer grid index to a one-hot 'image' and back
        def to_grid(x, gridsize=[8, 8]):
            x = x.reshape(-1)
            x = x[0]
            out = np.zeros(gridsize)
            # indices at or beyond 64 (= 8 * 8 cells) denote the absorbing state
            # and map to an all-zero grid
            if x >= 64:
                return out
            else:
                out[x // gridsize[0], x % gridsize[1]] = 1.
            return out

        # This function takes an 'image' and returns the position in the grid
        def from_grid(x, gridsize=[8, 8]):
            if len(x.shape) == 3:
                # an all-zero grid maps back to the absorbing index (64)
                if np.sum(x) == 0:
                    x = np.array([gridsize[0] * gridsize[1]])
                else:
                    x = np.array([np.argmax(x.reshape(-1))])
            return x

        # processor processes the state for storage
        processor = lambda x: x
        # the environment's best (greedy) tabular policy
        policy = env.best_policy()
        # absorbing state for padding if an episode ends before the horizon is reached
        absorbing_state = processor(np.array([len(policy)]))

        # Setup policies: epsilon-greedy around the best tabular policy, deviating
        # with probability eval_policy (pi_e) and base_policy (pi_b) respectively
        pi_e = EGreedyPolicy(model=TabularPolicy(policy,
                                                 absorbing=absorbing_state),
                             processor=from_grid,
                             prob_deviation=eval_policy,
                             action_space_dim=env.n_actions)
        pi_b = EGreedyPolicy(model=TabularPolicy(policy,
                                                 absorbing=absorbing_state),
                             processor=from_grid,
                             prob_deviation=base_policy,
                             action_space_dim=env.n_actions)

        # add env, policies, absorbing state, processor, and the index-to-image converter
        cfg.add({
            'env': env,
            'pi_e': pi_e,
            'pi_b': pi_b,
            'processor': processor,
            'absorbing_state': absorbing_state,
            'convert_from_int_to_img': to_grid,
        })
        # Decide which OPE methods to run (currently only 'all' is available)
        cfg.add({'models': 'all'})

        # Add the configuration to the runner
        runner.add(cfg)

    # Run the configurations
    results = runner.run()

    # print results
    for result in results:
        analysis(result)
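The to_grid / from_grid pair above round-trips between an integer grid index and a one-hot 8x8 'image'. A minimal sanity check of that round trip, assuming the two functions from the example are copied to module scope (the leading axis in the from_grid call reflects its len(x.shape) == 3 test, presumably because states are stored as stacked frames):

import numpy as np

state = np.array([19])                       # grid index 19 -> row 2, column 3
img = to_grid(state)                         # shape (8, 8), one-hot at (2, 3)
assert img[2, 3] == 1.0 and img.sum() == 1.0

assert from_grid(img[None, ...])[0] == 19    # round trip recovers the index

# the absorbing state (index 64) maps to an all-zero grid and back to 64
assert to_grid(np.array([64])).sum() == 0.0
assert from_grid(np.zeros((1, 8, 8)))[0] == 64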
Example 2
def main():

    runner = ExperimentRunner()

    # run 5 experiments
    for N in range(5):

        # basic configuration with varying number of trajectories
        configuration = {
            "gamma": 0.98,
            "horizon": 4,
            "base_policy": .8,
            "eval_policy": .2,
            "stochastic_env": True,
            "stochastic_rewards": False,
            "sparse_rewards": False,
            "num_traj": 8 * 2**N,
            "is_pomdp": False,
            "pomdp_horizon": 2,
            "seed": 1000,
            "experiment_number": 0,
            "access": 0,
            "secret": 0,
            "modeltype": "tabular",
            "to_regress_pi_b": False,
        }

        # store them
        cfg = Config(configuration)

        # initialize environment with this configuration
        env = Graph(make_pomdp=cfg.is_pomdp,
                    number_of_pomdp_states=cfg.pomdp_horizon,
                    transitions_deterministic=not cfg.stochastic_env,
                    max_length=cfg.horizon,
                    sparse_rewards=cfg.sparse_rewards,
                    stochastic_rewards=cfg.stochastic_rewards)

        # set seed for the experiment
        np.random.seed(cfg.seed)

        # processor processes the state for storage
        processor = lambda x: x

        # absorbing state for padding if episode ends before horizon is reached
        absorbing_state = processor(np.array([env.n_dim - 1]))

        # Setup policies
        actions = [0, 1]
        pi_e = BasicPolicy(
            actions,
            [max(.001, cfg.eval_policy), 1 - max(.001, cfg.eval_policy)])
        pi_b = BasicPolicy(
            actions,
            [max(.001, cfg.base_policy), 1 - max(.001, cfg.base_policy)])

        # add env, policies, absorbing state and processor
        cfg.add({
            'env': env,
            'pi_e': pi_e,
            'pi_b': pi_b,
            'processor': processor,
            'absorbing_state': absorbing_state
        })

        # Decide which OPE methods to run;
        # currently only 'all' is available
        cfg.add({'models': 'all'})

        # Add the configuration
        runner.add(cfg)

    # Run the configurations
    results = runner.run()

    # print results
    for result in results:
        analysis(result)
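Throughout these examples Config behaves as a thin attribute wrapper around the configuration dict: entries passed to the constructor or to add() become readable as attributes (cfg.horizon, cfg.seed, and later cfg.pi_e). A minimal stand-in with that behaviour, useful for tracing the examples in isolation (an illustrative sketch only, not the library's actual implementation):

class Config:
    """Illustrative stand-in: exposes configuration entries as attributes."""

    def __init__(self, configuration):
        self.add(configuration)

    def add(self, entries):
        # merge new entries; later calls (env, pi_e, pi_b, ...) extend the config
        for key, value in entries.items():
            setattr(self, key, value)

cfg = Config({"gamma": 0.98, "horizon": 4, "num_traj": 8})
cfg.add({"models": "all"})
print(cfg.horizon, cfg.num_traj, cfg.models)   # -> 4 8 all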
Example 3
def main(param):

    # Replace each model-name string in the configuration with the model class itself.
    for method, parameters in param['models'].items():
        if parameters['model'] != 'tabular':
            param['models'][method]['model'] = get_model_from_name(
                parameters['model'])

    runner = ExperimentRunner()

    for N in range(5):
        configuration = deepcopy(
            param['experiment']
        )  # deepcopy so the original configuration is never modified
        configuration['num_traj'] = 8 * 2**N  # Increase dataset size

        cfg = Config(configuration)

        # initialize environment with this configuration
        env = Graph(make_pomdp=cfg.is_pomdp,
                    number_of_pomdp_states=cfg.pomdp_horizon,
                    transitions_deterministic=not cfg.stochastic_env,
                    max_length=cfg.horizon,
                    sparse_rewards=cfg.sparse_rewards,
                    stochastic_rewards=cfg.stochastic_rewards)

        # set seed for the experiment
        np.random.seed(cfg.seed)

        # processor processes the state for storage
        processor = lambda x: x

        # absorbing state for padding if episode ends before horizon is reached
        absorbing_state = processor(np.array([env.n_dim - 1]))

        # Setup policies
        actions = [0, 1]
        pi_e = BasicPolicy(
            actions,
            [max(.001, cfg.eval_policy), 1 - max(.001, cfg.eval_policy)])
        pi_b = BasicPolicy(
            actions,
            [max(.001, cfg.base_policy), 1 - max(.001, cfg.base_policy)])

        # add env, policies, absorbing state and processor
        cfg.add({
            'env': env,
            'pi_e': pi_e,
            'pi_b': pi_b,
            'processor': processor,
            'absorbing_state': absorbing_state
        })
        cfg.add({'models': param['models']})

        runner.add(cfg)

    results = runner.run()

    # print results
    for result in results:
        analysis(result)
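Example 3's main(param) indexes param['experiment'] for the base configuration and param['models'][method]['model'] for each method's model choice. A plausible shape for that dictionary, inferred from this usage (the method name and anything beyond the 'model' key are placeholders, not a documented schema):

param = {
    "experiment": {
        # same keys as the inline configuration in Example 2
        "gamma": 0.98,
        "horizon": 4,
        "base_policy": 0.8,
        "eval_policy": 0.2,
        "stochastic_env": True,
        "stochastic_rewards": False,
        "sparse_rewards": False,
        "num_traj": 8,            # overwritten with 8 * 2**N inside the loop
        "is_pomdp": False,
        "pomdp_horizon": 2,
        "seed": 1000,
        "experiment_number": 0,
        "access": 0,
        "secret": 0,
        "modeltype": "tabular",
        "to_regress_pi_b": False,
    },
    "models": {
        # one entry per OPE method; 'model' is either 'tabular' or a name
        # that get_model_from_name() resolves to an actual model class
        "some_method": {"model": "tabular"},
    },
}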
Example 4
def main(param):

    param = setup_params(param)
    runner = ExperimentRunner()

    for N in range(5):
        configuration = deepcopy(
            param['experiment']
        )  # deepcopy so the original configuration is never modified
        configuration['num_traj'] = 8 * 2**N  # Increase dataset size

        cfg = Config(configuration)

        # initialize the Gridworld environment; slippage makes transitions stochastic
        env = Gridworld(slippage=.2 * cfg.stochastic_env)

        # set seed for the experiment
        np.random.seed(cfg.seed)
        eval_policy = cfg.eval_policy
        base_policy = cfg.base_policy

        # to_grid and from_grid are specific to Gridworld:
        # they convert an integer grid index to a one-hot 'image' and back
        def to_grid(x, gridsize=[8, 8]):
            x = x.reshape(-1)
            x = x[0]
            out = np.zeros(gridsize)
            # indices at or beyond 64 (= 8 * 8 cells) denote the absorbing state
            # and map to an all-zero grid
            if x >= 64:
                return out
            else:
                out[x // gridsize[0], x % gridsize[1]] = 1.
            return out

        # This function takes an 'image' and returns the position in the grid
        def from_grid(x, gridsize=[8, 8]):
            if len(x.shape) == 3:
                # an all-zero grid maps back to the absorbing index (64)
                if np.sum(x) == 0:
                    x = np.array([gridsize[0] * gridsize[1]])
                else:
                    x = np.array([np.argmax(x.reshape(-1))])
            return x

        # processor processes the state for storage
        processor = lambda x: x
        # the environment's best (greedy) tabular policy
        policy = env.best_policy()
        # absorbing state for padding if an episode ends before the horizon is reached
        absorbing_state = processor(np.array([len(policy)]))

        # Setup policies: epsilon-greedy around the best tabular policy, deviating
        # with probability eval_policy (pi_e) and base_policy (pi_b) respectively
        pi_e = EGreedyPolicy(model=TabularPolicy(policy,
                                                 absorbing=absorbing_state),
                             processor=from_grid,
                             prob_deviation=eval_policy,
                             action_space_dim=env.n_actions)
        pi_b = EGreedyPolicy(model=TabularPolicy(policy,
                                                 absorbing=absorbing_state),
                             processor=from_grid,
                             prob_deviation=base_policy,
                             action_space_dim=env.n_actions)

        # add env, policies, absorbing state, processor, and the index-to-image converter
        cfg.add({
            'env': env,
            'pi_e': pi_e,
            'pi_b': pi_b,
            'processor': processor,
            'absorbing_state': absorbing_state,
            'convert_from_int_to_img': to_grid,
        })
        cfg.add({'models': param['models']})

        runner.add(cfg)

    results = runner.run()

    # print results
    for result in results:
        analysis(result)
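In both Gridworld examples pi_e and pi_b differ only in prob_deviation (0.2 vs 0.8): with that probability the policy deviates from the tabular best policy, otherwise it follows it. A minimal sketch of an epsilon-greedy rule under that reading (illustrative only, not the library's EGreedyPolicy; the action count of 4 is an assumption for the 8x8 Gridworld):

import numpy as np

def egreedy_action(state_index, greedy_policy, prob_deviation, n_actions, rng):
    # with probability prob_deviation pick a uniformly random action,
    # otherwise follow the greedy (tabular) action for this state
    if rng.random() < prob_deviation:
        return int(rng.integers(n_actions))
    return int(greedy_policy[state_index])

rng = np.random.default_rng(1000)
greedy = np.zeros(64, dtype=int)                  # placeholder tabular policy
a_e = egreedy_action(19, greedy, 0.2, 4, rng)     # pi_e deviates 20% of the time
a_b = egreedy_action(19, greedy, 0.8, 4, rng)     # pi_b deviates 80% of the time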