def experiment(variant):
    expl_env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    target_qf = Mlp(hidden_sizes=[32, 32],
                    input_size=obs_dim,
                    output_size=action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
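This example assumes rlkit-style module-level imports and a variant dict that supplies trainer_kwargs, replay_buffer_size, and algorithm_kwargs. Below is a minimal sketch of how it might be invoked; the import paths mirror those listed inside Example #10 further down, and every hyperparameter value is an illustrative assumption rather than the original configuration.

# Sketch only: module-level imports assumed by the example above; Mlp is assumed
# to live in rlkit.torch.networks, and all hyperparameter values are assumptions.
import gym
from torch import nn
import rlkit.torch.pytorch_util as ptu
from rlkit.torch.networks import Mlp
from rlkit.policies.argmax import ArgmaxDiscretePolicy
from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
from rlkit.samplers.data_collector import MdpPathCollector
from rlkit.torch.dqn.dqn import DQNTrainer
from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

variant = dict(
    trainer_kwargs=dict(
        discount=0.99,       # assumed
        learning_rate=3e-4,  # assumed
    ),
    replay_buffer_size=int(1e6),
    algorithm_kwargs=dict(
        batch_size=128,
        max_path_length=200,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
experiment(variant)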
Example #2
def experiment(variant):
    expl_env = gym.make('GoalGridworld-v0')
    eval_env = gym.make('GoalGridworld-v0')

    obs_dim = expl_env.observation_space.spaces['observation'].low.size
    goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size
    action_dim = expl_env.action_space.n
    qf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    target_qf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    eval_policy = ArgmaxDiscretePolicy(qf)
    exploration_strategy = EpsilonGreedy(action_space=expl_env.action_space)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=exploration_strategy,
        policy=eval_policy,
    )

    replay_buffer = ObsDictRelabelingBuffer(env=eval_env,
                                            **variant['replay_buffer_kwargs'])
    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         **variant['trainer_kwargs'])
    trainer = HERTrainer(trainer)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
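The goal-conditioned example reads replay_buffer_kwargs (forwarded to ObsDictRelabelingBuffer), trainer_kwargs, and algo_kwargs. A minimal sketch, assuming the buffer accepts rlkit's usual HER relabeling fractions:

# Sketch only: nested keys and values are assumptions based on rlkit's
# ObsDictRelabelingBuffer and TorchBatchRLAlgorithm signatures.
variant = dict(
    replay_buffer_kwargs=dict(
        max_size=int(1e6),
        fraction_goals_rollout_goals=0.2,  # HER-style relabeling (assumed)
        fraction_goals_env_goals=0.0,
    ),
    trainer_kwargs=dict(
        discount=0.99,
        learning_rate=3e-4,
    ),
    algo_kwargs=dict(
        batch_size=128,
        max_path_length=50,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
experiment(variant)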
Example #3
def experiment(variant):
    # NOTE: this snippet assumes expl_env, eval_env, obs_dim, channels, action_dim,
    # symbolic_action_space, symb_env and hierarchical_rollout are defined by the
    # surrounding module; the environment setup is not shown here.
    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    target_qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf_criterion = nn.MSELoss()
    eval_learner_policy = ArgmaxDiscretePolicy(qf)
    expl_learner_policy = PolicyWrappedWithExplorationStrategy(
        AnnealedEpsilonGreedy(symbolic_action_space,
                              anneal_rate=variant["anneal_rate"]),
        eval_learner_policy,
    )
    eval_policy = LearnPlanPolicy(eval_learner_policy)
    expl_policy = LearnPlanPolicy(expl_learner_policy)
    eval_path_collector = MdpPathCollector(eval_env,
                                           eval_policy,
                                           rollout=hierarchical_rollout)
    expl_path_collector = MdpPathCollector(expl_env,
                                           expl_policy,
                                           rollout=hierarchical_rollout)
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #4
def experiment(variant):
    # Select a different success_function for different tasks.
    expl_env = GymCraftingEnv(state_obs=True,
                              few_obj=True,
                              success_function=eval_eatbread)
    eval_env = GymCraftingEnv(state_obs=True,
                              few_obj=True,
                              success_function=eval_eatbread)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #5
def experiment(variant):
    """Run the experiment."""
    eval_env = gym.make('CartPole-v0')
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # Collect data.
    print('Collecting data...')
    data = []
    while len(data) < variant['offline_data_size']:
        done = False
        s = eval_env.reset()
        while not done:
            a = np.random.randint(action_dim)
            next_s, r, done, _ = eval_env.step(a)
            one_hot_a = np.zeros(action_dim)
            one_hot_a[a] = 1
            data.append((s, one_hot_a, r, next_s, done))
            s = next_s
            if len(data) == variant['offline_data_size']:
                break

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    offline_data = OfflineDataStore(data=data)
    algorithm = TorchOfflineRLAlgorithm(
        trainer=trainer,
        evaluation_env=eval_env,
        evaluation_data_collector=eval_path_collector,
        offline_data=offline_data,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
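The only data source here is the random-policy rollout loop at the top of the function, so the variant mainly controls how much data to collect and how to train on it. OfflineDataStore and TorchOfflineRLAlgorithm are project-specific classes, so the nested kwargs in this sketch are assumptions:

# Sketch only: offline_data_size is the key read by the collection loop above;
# the nested kwargs are assumptions about the project-specific classes.
variant = dict(
    offline_data_size=10000,
    trainer_kwargs=dict(
        discount=0.99,
        learning_rate=3e-4,
    ),
    algorithm_kwargs=dict(
        batch_size=128,
        max_path_length=200,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
    ),
)
experiment(variant)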
Example #6
def experiment(variant):
    args = getArgs()
    # expl_env = NormalizedBoxEnv(environment(args))

    expl_env = environment(args, 'dqn')
    eval_env = environment(args, 'dqn')
    # expl_env.render()
    obs_dim = expl_env.get_obsdim()
    action_dim = expl_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #7
def experiment(variant):
    # common.initialise(variant)

    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)

    expl_env = gym.make(variant["env_name"], seed=5)
    eval_env = gym.make(variant["env_name"], seed=5)

    ANCILLARY_GOAL_SIZE = 16
    SYMBOLIC_ACTION_SIZE = 12  # size of the embedding (UVFA/multihead) for the goal-space direction to the controller
    GRID_SIZE = 31

    action_dim = ANCILLARY_GOAL_SIZE
    symbolic_action_space = gym.spaces.Discrete(ANCILLARY_GOAL_SIZE)
    symb_env = gym.make(variant["env_name"])
    symb_env.action_space = symbolic_action_space

    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_env)

    qf = Mlp(
        input_size=n,
        output_size=action_dim,
        hidden_sizes=[256, 256],
        init_w=variant["init_w"],
        b_init_value=variant["b_init_value"],
    )
    target_qf = Mlp(
        input_size=n,
        output_size=action_dim,
        hidden_sizes=[256, 256],
        init_w=variant["init_w"],
        b_init_value=variant["b_init_value"],
    )

    planner = ENHSPPlanner()

    # collect
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/e77c75eed02e4b38a0a308789fbfcbd8/data/params.pkl"  # collect
    with open(filepath, "rb") as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break

    loaded_collect_policy = policies["exploration/policy"]
    loaded_collect_policy.rnn_hxs = loaded_collect_policy.rnn_hxs[0].unsqueeze(0)
    eval_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)
    expl_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)

    # other
    # filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/cf5c31afe0724acd8f6398d77a80443e/data/params.pkl"  # other (RC 28)
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/4989f4bcbadb4ac58c3668c068d63225/data/params.pkl"  # other (RC 55)
    with (open(filepath, "rb")) as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break

    loaded_other_policy = policies["exploration/policy"]
    loaded_other_policy.rnn_hxs = loaded_other_policy.rnn_hxs[0].unsqueeze(0)
    eval_other = CraftController(loaded_other_policy, n=GRID_SIZE)
    expl_other = CraftController(loaded_other_policy, n=GRID_SIZE)

    eval_controller = PretrainedController([eval_collect, eval_other])
    expl_controller = PretrainedController([expl_collect, expl_other])

    function_env = gym.make(variant["env_name"])

    qf_criterion = nn.MSELoss()
    if variant["softmax"]:
        eval_learner = SoftmaxDiscretePolicy(qf, variant["temperature"])
    else:
        eval_learner = ArgmaxDiscretePolicy(qf)

    expl_learner = PolicyWrappedWithExplorationStrategy(
        LinearEpsilonGreedy(symbolic_action_space,
                            anneal_schedule=variant["anneal_schedule"]),
        eval_learner,
    )

    eval_policy = LearnPlanPolicy(
        eval_learner,
        planner,
        eval_controller,
        num_processes=1,
        vectorised=False,
        env=function_env,
    )

    expl_policy = LearnPlanPolicy(
        expl_learner,
        planner,
        expl_controller,
        num_processes=1,
        vectorised=False,
        env=function_env,
    )

    eval_path_collector = IntermediatePathCollector(
        eval_env,
        eval_policy,
        rollout=intermediate_rollout,
        gamma=1,
        render=variant["render"],
        single_plan_discounting=variant["trainer_kwargs"]
        ["single_plan_discounting"],
        experience_interval=variant["experience_interval"],
    )
    expl_path_collector = IntermediatePathCollector(
        expl_env,
        expl_policy,
        rollout=intermediate_rollout,
        gamma=variant["trainer_kwargs"]["discount"],
        render=variant["render"],
        single_plan_discounting=variant["trainer_kwargs"]
        ["single_plan_discounting"],
        experience_interval=variant["experience_interval"],
    )

    if variant["double_dqn"]:
        trainer = DoubleDQNTrainer(qf=qf,
                                   target_qf=target_qf,
                                   qf_criterion=qf_criterion,
                                   **variant["trainer_kwargs"])
    else:
        trainer = DQNTrainer(qf=qf,
                             target_qf=target_qf,
                             qf_criterion=qf_criterion,
                             **variant["trainer_kwargs"])
    replay_buffer = PlanReplayBuffer(variant["replay_buffer_size"], symb_env)

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])

    algorithm.to(ptu.device)

    algorithm.train()
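Beyond the usual DQN keys, this function reads network-initialisation, exploration-annealing, and path-collection options from the variant, and it loads pretrained controller policies from hard-coded pickle paths, so it only runs inside its original project layout. The sketch below lists the keys the function accesses; every value is an illustrative assumption and the env id is a hypothetical placeholder.

# Sketch only: keys mirror the variant lookups in the function above; all values
# are assumptions, and the environment id is a hypothetical placeholder.
variant = dict(
    env_name="<custom-env-id>",
    render=False,
    softmax=False,
    temperature=1.0,        # only read when softmax is True
    init_w=3e-3,
    b_init_value=0.1,
    anneal_schedule=10000,  # passed to LinearEpsilonGreedy (assumed schedule length)
    double_dqn=True,
    experience_interval=1,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        discount=0.99,
        learning_rate=3e-4,
        single_plan_discounting=False,  # also read by the IntermediatePathCollectors
    ),
    algorithm_kwargs=dict(
        batch_size=128,
        max_path_length=200,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)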
Example #8
def experiment(variant):
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)

    expl_env = gym.make(variant["env_name"])
    eval_env = gym.make(variant["env_name"])
    obs_dim = expl_env.observation_space.image.shape[1]
    channels = expl_env.observation_space.image.shape[0]
    action_dim = SYMBOLIC_ACTION_COUNT  # assumed to be defined at module level
    symbolic_action_space = gym.spaces.Discrete(SYMBOLIC_ACTION_COUNT)
    symb_env = gym.make(variant["env_name"])
    symb_env.action_space = symbolic_action_space

    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    target_qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf_criterion = nn.MSELoss()

    eval_policy = LearnPlanPolicy(None)
    expl_policy = LearnPlanPolicy(None)
    eval_path_collector = MdpPathCollector(eval_env,
                                           eval_policy,
                                           rollout=hierarchical_rollout,
                                           render=variant["render"])
    expl_path_collector = MdpPathCollector(expl_env,
                                           expl_policy,
                                           rollout=hierarchical_rollout,
                                           render=variant["render"])
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #9
def experiment(variant):
    fov, delta, num_ch = 13, 3, 3
    expl_env = EnvBrainbow('0:data/brainbow/training_sample.tif',
                           coord_interval=2, img_mean=128, img_stddev=33,
                           num_ch=3, fov=fov, delta=delta, seed=0)
    eval_env = EnvBrainbow('0:data/brainbow/training_sample.tif',
                           coord_interval=2, img_mean=128, img_stddev=33,
                           num_ch=3, fov=fov, delta=delta, seed=0)
    obs_dim = expl_env.observation_space.low.shape  # 13, 13, 3
    action_dim = eval_env.action_space.n  # 2
    kernel_sizes = [3, 3, 3]
    n_channels = [32, 64, 64]
    strides = [1, 1, 1]
    paddings = [0, 0, 0]
    hidden_sizes = [512]

    qf = CNN(
        input_width=fov,
        input_height=fov,
        input_channels=num_ch,
        output_size=action_dim,
        kernel_sizes=kernel_sizes,
        n_channels=n_channels,
        strides=strides,
        paddings=paddings,
        hidden_sizes=hidden_sizes,
        batch_norm_conv=True,
        batch_norm_fc=False
    )
    target_qf = CNN(
        input_width=fov,
        input_height=fov,
        input_channels=num_ch,
        output_size=action_dim,
        kernel_sizes=kernel_sizes,
        n_channels=n_channels,
        strides=strides,
        paddings=paddings,
        hidden_sizes=hidden_sizes,
        batch_norm_conv=True,
        batch_norm_fc=False
    )

    print(qf)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #10
def experiment(doodad_config, variant):
    from rlkit.core import logger
    from rlkit.launchers.launcher_util import setup_logger
    print ("doodad_config.base_log_dir: ", doodad_config.base_log_dir)
    from datetime import datetime
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f')
    setup_logger('wrapped_' + variant['env'], variant=variant,
                 log_dir=doodad_config.base_log_dir + "/smirl/" +
                 variant['exp_name'] + "/" + timestamp + "/")
    if (variant["log_comet"]):
        try:
            comet_logger = Experiment(api_key=launchers.config.COMET_API_KEY,
                                         project_name=launchers.config.COMET_PROJECT_NAME, 
                                         workspace=launchers.config.COMET_WORKSPACE)
            logger.set_comet_logger(comet_logger)
            comet_logger.set_name(str(variant['env'])+"_"+str(variant['exp_name']))
            print("variant: ", variant)
            variant['comet_key'] = comet_logger.get_key()
            comet_logger.log_parameters(variant)
            print(comet_logger)
        except Exception as inst:
            print("Not tracking training via comet.ml")
            print("Error: ", inst)

    import gym
    from torch import nn as nn
    
    import rlkit.torch.pytorch_util as ptu
    import torch
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.base import \
        PolicyWrappedWithExplorationStrategy
    from rlkit.policies.argmax import ArgmaxDiscretePolicy
    from rlkit.torch.dqn.dqn import DQNTrainer
    from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
    from rlkit.samplers.data_collector import MdpPathCollector
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from surprise.utils.rendering_algorithm import TorchBatchRLRenderAlgorithm
    from surprise.envs.tetris.tetris import TetrisEnv
    from surprise.wrappers.obsresize import ResizeObservationWrapper, RenderingObservationWrapper, SoftResetWrapper
    import pdb
    
    base_env = get_env(variant)
    base_env2 = get_env(variant)
    
    print ("GPU_BUS_Index", variant["GPU_BUS_Index"])
    if torch.cuda.is_available() and doodad_config.use_gpu:
        print ("Using the GPU for learning")
#         ptu.set_gpu_mode(True, gpu_id=doodad_config.gpu_id)
        ptu.set_gpu_mode(True, gpu_id=variant["GPU_BUS_Index"])
    else:
        print ("NOT Using the GPU for learning")
    
#     base_env2 = RenderingObservationWrapper(base_env2)
    expl_env, network = add_wrappers(base_env, variant, device=ptu.device)
    eval_env, _ = add_wrappers(base_env2, variant, device=ptu.device, eval=True, network=network)
    if ("vae_wrapper" in variant["wrappers"]):
        eval_env._network = base_env._network
    
    obs_dim = expl_env.observation_space.low.shape
    print("Final obs dim", obs_dim)
    action_dim = eval_env.action_space.n
    print("Action dimension: ", action_dim)
    qf, target_qf = get_network(variant["network_args"], obs_dim, action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    if "prob_random_action" in variant:
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space, prob_random_action=variant["prob_random_action"], 
                          prob_end=variant["prob_end"],
                          steps=variant["steps"]),
            eval_policy,
        )
    else:  
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space, prob_random_action=0.8, prob_end=0.05),
            eval_policy,
        )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        render_kwargs=variant['render_kwargs']
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLRenderAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
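This entry point is launched through doodad, so it receives a doodad_config alongside the variant, and get_env, add_wrappers, and get_network are project helpers not shown here. A sketch of the variant keys the function reads, with all values being assumptions:

# Sketch only: keys mirror the lookups above; values, the env id, and the
# wrapper/network args are assumptions (get_env/add_wrappers/get_network are
# project-specific helpers).
variant = dict(
    env="<custom-env-id>",
    exp_name="<experiment-name>",
    log_comet=False,
    GPU_BUS_Index=0,
    wrappers={},              # consumed by add_wrappers
    network_args={},          # consumed by get_network
    prob_random_action=0.8,   # optional epsilon-greedy schedule
    prob_end=0.05,
    steps=10000,
    render_kwargs={},
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99, learning_rate=3e-4),
    algorithm_kwargs=dict(
        batch_size=128,
        max_path_length=200,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)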
Example #11
def experiment(variant):
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)

    expl_env = gym.make(variant["env_name"])
    eval_env = gym.make(variant["env_name"])

    # OLD - Taxi image env
    # if isinstance(expl_env.observation_space, Json):
    #     expl_env = BoxWrapper(expl_env)
    #     eval_env = BoxWrapper(eval_env)
    #     # obs_shape = expl_env.observation_space.image.shape

    # obs_shape = expl_env.observation_space.shape
    # if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:  # convert WxHxC into CxWxH
    #     expl_env = TransposeImage(expl_env, op=[2, 0, 1])
    #     eval_env = TransposeImage(eval_env, op=[2, 0, 1])

    # obs_shape = expl_env.observation_space.shape
    # channels, obs_width, obs_height = obs_shape
    # action_dim = eval_env.action_space.n

    # qf = CNN(
    #     input_width=obs_width,
    #     input_height=obs_height,
    #     input_channels=channels,
    #     output_size=action_dim,
    #     kernel_sizes=[8, 4],
    #     n_channels=[16, 32],
    #     strides=[4, 2],
    #     paddings=[0, 0],
    #     hidden_sizes=[256],
    # )
    # target_qf = CNN(
    #     input_width=obs_width,
    #     input_height=obs_height,
    #     input_channels=channels,
    #     output_size=action_dim,
    #     kernel_sizes=[8, 4],
    #     n_channels=[16, 32],
    #     strides=[4, 2],
    #     paddings=[0, 0],
    #     hidden_sizes=[256],
    # )

    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_env)

    qf = Mlp(
        input_size=n,
        output_size=action_space.n,
        hidden_sizes=[256, 256],
        init_w=variant["init_w"],
        b_init_value=variant["b_init_value"],
    )
    target_qf = Mlp(
        input_size=n,
        output_size=action_space.n,
        hidden_sizes=[256, 256],
        init_w=variant["init_w"],
        b_init_value=variant["b_init_value"],
    )

    qf_criterion = nn.MSELoss()

    if variant["softmax"]:
        eval_policy = SoftmaxDiscretePolicy(qf, variant["temperature"])
    else:
        eval_policy = ArgmaxDiscretePolicy(qf)

    expl_policy = PolicyWrappedWithExplorationStrategy(
        LinearEpsilonGreedy(action_space,
                            anneal_schedule=variant["anneal_schedule"]),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(eval_env,
                                           eval_policy,
                                           render=variant["render"])
    expl_path_collector = MdpPathCollector(expl_env,
                                           expl_policy,
                                           render=variant["render"])
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()