def __init__(self, envs):
        self.envs = [EnvContainer(env) for env in envs]

        self.n_envs = len(self.envs)
        self.n_abstract_mdps = 2
        self.abstract_dim = 4
        self.state_dim = 4
        self.state_shape = (self.envs[0].width, self.envs[0].height)
        self.states = []
        self.state_to_idx = None

        all_encoder_lst = nn.ModuleList()

        for j in range(self.n_abstract_mdps):
            # encoder = Mlp((128, 128, 128), output_size=self.abstract_dim, input_size=self.state_dim,
            #            output_activation=F.softmax, layer_norm=True)
            encoder = CNN(self.state_shape[0],
                          self.state_shape[1],
                          3,
                          self.abstract_dim, [3, 3], [32, 32], [1, 1], [0, 0],
                          hidden_sizes=(64, 64),
                          output_activation=nn.Softmax(dim=1))
            encoder.apply(init_weights)
            all_encoder_lst.append(encoder)

        self.all_encoder_lst = all_encoder_lst
        self.all_encoder_lst.cuda()
        self.optimizer = optim.Adam(self.all_encoder_lst.parameters(), lr=1e-5)
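
For reference, the spatial size the encoder above produces, assuming the standard (in - k + 2p) // s + 1 convolution formula and a hypothetical grid size (the real state_shape comes from the wrapped envs):

def conv_out_size(size, kernel, stride=1, padding=0):
    # standard convolution output-size formula
    return (size - kernel + 2 * padding) // stride + 1

# two 3x3, stride-1, padding-0 layers (as configured above) shrink each
# spatial dimension by 4; e.g. a hypothetical 10x10 grid -> 8x8 -> 6x6
w = h = 10
for k, s, p in zip([3, 3], [1, 1], [0, 0]):
    w, h = conv_out_size(w, k, s, p), conv_out_size(h, k, s, p)
print(w, h, 32 * w * h)  # 6 6 1152 flattened conv features fed to the (64, 64) head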
Example #2
def gen_network(variant, action_dim, layer_size, policy=False):
    return FoodNetworkMedium(
        img_network=CNN(**variant['img_conv_kwargs']),
        full_img_network=CNN(**variant['full_img_conv_kwargs']),
        inventory_network=FlattenMlp(**variant['inventory_network_kwargs']),
        final_network=FlattenMlp(
            input_size=variant['img_conv_kwargs']['output_size'] +
            variant['full_img_conv_kwargs']['output_size'] +
            variant['inventory_network_kwargs']['output_size'],
            output_size=action_dim,
            hidden_sizes=[layer_size, layer_size],
            output_activation=F.softmax if policy else identity),
        sizes=[
            variant['img_conv_kwargs']['input_width'] *
            variant['img_conv_kwargs']['input_height'] *
            variant['img_conv_kwargs']['input_channels'],
            variant['full_img_conv_kwargs']['input_width'] *
            variant['full_img_conv_kwargs']['input_height'] *
            variant['full_img_conv_kwargs']['input_channels'],
            # health dim
            1,
            # pantry dim
            400,
            # shelf dim
            40
        ])
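
A minimal sketch of the variant layout this helper expects; the key names come from the code above, while every concrete number is a placeholder assumption rather than a value from the original project:

variant = dict(
    img_conv_kwargs=dict(           # local egocentric view
        input_width=5, input_height=5, input_channels=3,
        output_size=64, kernel_sizes=[3], n_channels=[16],
        strides=[1], paddings=[0], hidden_sizes=[64],
    ),
    full_img_conv_kwargs=dict(      # full map view
        input_width=16, input_height=16, input_channels=3,
        output_size=64, kernel_sizes=[3, 3], n_channels=[16, 32],
        strides=[1, 1], paddings=[0, 0], hidden_sizes=[64],
    ),
    inventory_network_kwargs=dict(
        input_size=441,             # e.g. health + pantry + shelf = 1 + 400 + 40; must
                                    # match how FoodNetworkMedium splits the observation
        output_size=64, hidden_sizes=[64, 64],
    ),
)
q_network = gen_network(variant, action_dim=8, layer_size=128)  # action_dim/layer_size are placeholders
policy_network = gen_network(variant, action_dim=8, layer_size=128, policy=True)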
Example #3
def experiment(variant):
    # NOTE: obs_dim, channels, action_dim, symbolic_action_space, symb_env,
    # eval_env and expl_env are assumed to be set up exactly as in the fuller
    # version of this experiment shown in Example #7 below.
    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    target_qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf_criterion = nn.MSELoss()
    eval_learner_policy = ArgmaxDiscretePolicy(qf)
    expl_learner_policy = PolicyWrappedWithExplorationStrategy(
        AnnealedEpsilonGreedy(symbolic_action_space,
                              anneal_rate=variant["anneal_rate"]),
        eval_learner_policy,
    )
    eval_policy = LearnPlanPolicy(eval_learner_policy)
    expl_policy = LearnPlanPolicy(expl_learner_policy)
    eval_path_collector = MdpPathCollector(eval_env,
                                           eval_policy,
                                           rollout=hierarchical_rollout)
    expl_path_collector = MdpPathCollector(expl_env,
                                           expl_policy,
                                           rollout=hierarchical_rollout)
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #4
def gen_network(variant, action_dim, layer_size, policy=False):
    final_network_kwargs = dict(
        # +1 for health
        input_size=variant['img_conv_kwargs']['output_size'] +
        variant['full_img_conv_kwargs']['output_size'] + 1,
        output_size=action_dim,
        hidden_sizes=[layer_size, layer_size],
    )
    if policy:
        final_network_kwargs.update(output_activation=F.softmax)
    return FoodNetworkEasy(
        img_network=CNN(**variant['img_conv_kwargs']),
        full_img_network=CNN(**variant['full_img_conv_kwargs']),
        final_network=FlattenMlp(**final_network_kwargs),
        sizes=[
            variant['img_conv_kwargs']['input_width'] *
            variant['img_conv_kwargs']['input_height'] *
            variant['img_conv_kwargs']['input_channels'],
            variant['full_img_conv_kwargs']['input_width'] *
            variant['full_img_conv_kwargs']['input_height'] *
            variant['full_img_conv_kwargs']['input_channels'],
            # health dim
            1
        ])
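
Both gen_network helpers above pass F.softmax directly as output_activation; recent PyTorch versions warn about the implicit dim in that call. A minimal sketch of one way to pin it, assuming the activation is applied to a (batch, action_dim) tensor, is functools.partial:

from functools import partial

import torch.nn.functional as F

softmax_over_actions = partial(F.softmax, dim=-1)  # softmax over the last (action) dimension

if policy:
    final_network_kwargs.update(output_activation=softmax_over_actions)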
Example #5
def her_dqn_experiment_minigrid(variant):
    env = gym.make(variant['env_id'])

    observation_key = variant['observation_key']
    desired_goal_key = variant['desired_goal_key']
    variant['algo_kwargs']['her_kwargs']['observation_key'] = observation_key
    variant['algo_kwargs']['her_kwargs']['desired_goal_key'] = desired_goal_key
    # if variant.get('normalize', False):
    #     raise NotImplementedError()

    replay_buffer = ObsDictRelabelingBuffer(env=env,
                                            observation_key=observation_key,
                                            desired_goal_key=desired_goal_key,
                                            internal_keys=['agent_pos'],
                                            **variant['replay_buffer_kwargs'])
    obs_shape = env.obs_shape
    action_dim = env.action_space.n
    #goal_shape = env.observation_space.spaces['desired_goal'].shape

    qf1 = CNN(
        obs_shape[0],
        obs_shape[1],
        obs_shape[2],
        output_size=action_dim,
        kernel_sizes=[2, 2],
        n_channels=[16, 32],
        strides=[1, 1],
        paddings=np.zeros(2, dtype=np.int64),
        added_fc_input_size=env.add_input_dim * 2,
        hidden_sizes=(128, 128),
    )
    algorithm = HerDQN(
        env,
        training_env=env,
        qf=qf1,
        #qf2=qf2,
        #policy=policy,
        #exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        qf_criterion=nn.MSELoss(),
        **variant['algo_kwargs'])

    algorithm.to(ptu.device)
    algorithm.train()
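
A placeholder sketch of the variant this entry point reads; the key names come from the code above, while the env id and all values are assumptions (the env is also expected to expose the custom obs_shape and add_input_dim attributes used in this example):

variant = dict(
    env_id='MiniGrid-Empty-8x8-v0',  # placeholder; the original project wraps its own goal-conditioned env
    observation_key='observation',
    desired_goal_key='desired_goal',
    algo_kwargs=dict(
        her_kwargs=dict(),  # observation_key / desired_goal_key are filled in above
        # remaining entries depend on the HerDQN implementation
    ),
    replay_buffer_kwargs=dict(
        max_size=int(1e5),  # plus whatever relabeling options ObsDictRelabelingBuffer supports
    ),
)
her_dqn_experiment_minigrid(variant)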
Example #6
def dqn_experiment_mincraft(variant):
    if 'env_id' in variant:
        env = gym.make(variant['env_id'])
    else:
        env = variant['env_class'](**variant['env_kwargs'])
    env.init(start_minecraft=False,
             client_pool=[('127.0.0.1', 10000)],
             step_sleep=0.01,
             skip_steps=100,
             retry_sleep=2)
    # env = malmoenv.make()
    # xml = Path(variant['mission']).read_text()
    # env.init(xml, variant['port'], server='127.0.0.1',
    #          resync=0, role=0)
    #env = WallBuilder(variant['mission'])

    #env.reset()
    observation_key = 'state_observation'
    desired_goal_key = 'desired_goal'
    # if variant.get('normalize', False):
    #     raise NotImplementedError()

    # replay_buffer = ObsDictRelabelingBuffer(
    #     env=env,
    #     observation_key=observation_key,
    #     desired_goal_key=desired_goal_key,
    #     internal_keys=['agent_pos'],
    #     **variant['replay_buffer_kwargs']
    # )
    obs_shape = env.obs_shape
    action_dim = env.action_space.n
    #goal_shape = env.observation_space.spaces['desired_goal'].shape

    qf1 = CNN(
        obs_shape[1],
        obs_shape[2],
        obs_shape[0],  # + env.voxel_shape[0],
        output_size=action_dim,
        kernel_sizes=[3, 3],
        n_channels=[16, 32],
        strides=[1, 1],
        paddings=np.zeros(2, dtype=np.int64),
        hidden_sizes=(128, 128),
    )
    # qf1 = FlattenMlp(
    #     input_size=obs_dim + goal_dim,
    #     output_size=action_dim,
    #     **variant['qf_kwargs']
    # )
    # qf2 = FlattenMlp(
    #     input_size=obs_dim + action_dim + goal_dim,
    #     output_size=1,
    #     **variant['qf_kwargs']
    # )
    # policy = MlpPolicy(
    #     input_size=obs_dim + goal_dim,
    #     output_size=action_dim,
    #     **variant['policy_kwargs']
    # )
    # exploration_policy = PolicyWrappedWithExplorationStrategy(
    #     exploration_strategy=es,
    #     policy=policy,
    # )
    algorithm = DQN(
        env,
        training_env=env,
        qf=qf1,
        #qf2=qf2,
        #policy=policy,
        #exploration_policy=exploration_policy,
        #replay_buffer=replay_buffer,
        qf_criterion=nn.MSELoss(),
        **variant['algo_kwargs'])

    algorithm.to(ptu.device)
    algorithm.train()
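
A placeholder sketch of the variant shape this Minecraft entry point expects; only the key names come from the code above, everything else is an assumption:

variant = dict(
    # either a registered gym id ...
    # env_id='MyMalmoEnv-v0',
    # ... or an env class plus its constructor kwargs
    env_class=WallBuilder,  # hypothetical choice, mirroring the commented-out line above
    env_kwargs=dict(),      # whatever the chosen env class needs (e.g. a mission file)
    algo_kwargs=dict(),     # contents depend on the DQN class used here
)
dqn_experiment_mincraft(variant)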
Example #7
def experiment(variant):
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)

    expl_env = gym.make(variant["env_name"])
    eval_env = gym.make(variant["env_name"])
    obs_dim = expl_env.observation_space.image.shape[1]
    channels = expl_env.observation_space.image.shape[0]
    action_dim = SYMBOLIC_ACTION_COUNT
    symbolic_action_space = gym.spaces.Discrete(SYMBOLIC_ACTION_COUNT)
    symb_env = gym.make(variant["env_name"])
    symb_env.action_space = symbolic_action_space

    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    target_qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf_criterion = nn.MSELoss()

    eval_policy = LearnPlanPolicy(None)
    expl_policy = LearnPlanPolicy(None)
    eval_path_collector = MdpPathCollector(eval_env,
                                           eval_policy,
                                           rollout=hierarchical_rollout,
                                           render=variant["render"])
    expl_path_collector = MdpPathCollector(expl_env,
                                           expl_policy,
                                           rollout=hierarchical_rollout,
                                           render=variant["render"])
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
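
A placeholder sketch of the variant this entry point reads; the env id and every number below are assumptions (SYMBOLIC_ACTION_COUNT, LearnPlanPolicy and hierarchical_rollout come from the surrounding project), and the algorithm_kwargs names mirror the TorchBatchRLAlgorithm arguments listed in the comments of Example #10 below:

variant = dict(
    env_name='MiniGrid-Empty-8x8-v0',  # placeholder id; the original project's envs expose observation_space.image
    render=False,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        discount=0.99,
        learning_rate=3e-4,
    ),
    algorithm_kwargs=dict(
        num_epochs=100,
        batch_size=128,
        max_path_length=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
experiment(variant)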
Example #8
def experiment(variant):
    fov, delta, num_ch = 13, 3, 3
    expl_env = EnvBrainbow('0:data/brainbow/training_sample.tif',
                           coord_interval=2, img_mean=128, img_stddev=33,
                           num_ch=3, fov=fov, delta=delta, seed=0)
    eval_env = EnvBrainbow('0:data/brainbow/training_sample.tif',
                           coord_interval=2, img_mean=128, img_stddev=33,
                           num_ch=3, fov=fov, delta=delta, seed=0)
    obs_dim = expl_env.observation_space.low.shape  # 13, 13, 3
    action_dim = eval_env.action_space.n  # 2
    kernel_sizes = [3, 3, 3]
    n_channels = [32, 64, 64]
    strides = [1, 1, 1]
    paddings = [0, 0, 0]
    hidden_sizes = [512]

    qf = CNN(
        input_width=fov,
        input_height=fov,
        input_channels=num_ch,
        output_size=action_dim,
        kernel_sizes=kernel_sizes,
        n_channels=n_channels,
        strides=strides,
        paddings=paddings,
        hidden_sizes=hidden_sizes,
        batch_norm_conv=True,
        batch_norm_fc=False
    )
    target_qf = CNN(
        input_width=fov,
        input_height=fov,
        input_channels=num_ch,
        output_size=action_dim,
        kernel_sizes=kernel_sizes,
        n_channels=n_channels,
        strides=strides,
        paddings=paddings,
        hidden_sizes=hidden_sizes,
        batch_norm_conv=True,
        batch_norm_fc=False
    )

    print(qf)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
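
A quick check of the flattened feature size these Q-networks hand to the 512-unit hidden layer, assuming standard valid-convolution arithmetic:

# fov = 13 and three 3x3, stride-1, padding-0 convolutions: 13 -> 11 -> 9 -> 7,
# so the last conv block is 7 x 7 x 64 = 3136 features before the fc head
size = 13
for k in [3, 3, 3]:
    size = size - k + 1
print(size, size * size * 64)  # 7 3136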
Example #9
def experiment(variant):
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
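    # NOTE: output_size and added_fc_input_size set above are overridden by the
    # next update before any network is built here, so only the input sizes
    # survive; contrast with the CQL example further down, which constructs the
    # ConcatCNN Q-functions in between the two updates.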

    cnn_params.update(
        output_size=256,
        added_fc_input_size=0,
        hidden_sizes=[1024, 512],
    )

    policy_obs_processor = CNN(**cnn_params)
    policy = TanhGaussianPolicy(
        obs_dim=cnn_params['output_size'],
        action_dim=action_dim,
        hidden_sizes=[256, 256, 256],
        obs_processor=policy_obs_processor,
    )

    if variant['stoch_eval_policy']:
        eval_policy = policy
    else:
        eval_policy = MakeDeterministic(policy)

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(
        eval_env,
    )

    observation_key = 'image'
    replay_buffer = load_data_from_npy_chaining(
        variant, expl_env, observation_key)

    trainer = BCTrainer(
        env=eval_env,
        policy=policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=True,
        **variant['algorithm_kwargs']
    )
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
Example #10
def experiment(variant):
    common.initialise(variant)

    expl_envs, eval_envs = common.create_environments(variant)

    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_envs)

    obs_dim = obs_shape[1]

    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=8,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    # CHANGE TO ORDINAL ACTION SPACE
    action_space = gym.spaces.Box(-np.inf, np.inf, (8, ))
    expl_envs.action_space = action_space
    eval_envs.action_space = action_space

    base = common.create_networks(variant, n, mlp, channels, fc_input)

    bernoulli_dist = distributions.Bernoulli(base.output_size, 4)
    passenger_dist = distributions.Categorical(base.output_size, 5)
    delivered_dist = distributions.Categorical(base.output_size, 5)
    continuous_dist = distributions.DiagGaussian(base.output_size, 2)
    dist = distributions.DistributionGeneratorTuple(
        (bernoulli_dist, continuous_dist, passenger_dist, delivered_dist))

    eval_policy = LearnPlanPolicy(
        ScriptedPolicy(qf, variant["always_return"]),
        num_processes=variant["num_processes"],
        vectorised=True,
        json_to_screen=expl_envs.observation_space.converter,
    )
    expl_policy = LearnPlanPolicy(
        ScriptedPolicy(qf, variant["always_return"]),
        num_processes=variant["num_processes"],
        vectorised=True,
        json_to_screen=expl_envs.observation_space.converter,
    )

    eval_path_collector = HierarchicalStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"]
        ["num_eval_steps_per_epoch"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=1,
        no_plan_penalty=variant.get("no_plan_penalty", False),
    )
    expl_path_collector = HierarchicalStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=variant["trainer_kwargs"]["gamma"],
        no_plan_penalty=variant.get("no_plan_penalty", False),
    )
    # added: created rollout(5,1,(4,84,84),Discrete(6),1), reset env and added obs to rollout[step]

    trainer = PPOTrainer(actor_critic=expl_policy.learner,
                         **variant["trainer_kwargs"])
    # missing: by this point, rollout back in sync.
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)
    # added: replay buffer is new
    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )

    algorithm.to(ptu.device)
    # missing: device back in sync
    algorithm.evaluate()
Example #11
def experiment(variant):
    imsize = 48
    expl_env = mujoco_image_env.ImageMujocoEnv(sawyer_door.SawyerDoorEnv(),
                                               imsize=imsize)
    eval_env = mujoco_image_env.ImageMujocoEnv(sawyer_door.SawyerDoorEnv(),
                                               imsize=imsize)

    expl_env.reset()
    eval_env.reset()
    print(expl_env)
    #expl_env = ImageEnv(sawyer_door_hook_env())
    #eval_env = ImageEnv(sawyer_door_hook_env())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    M = variant['layer_size']

    cnn_args = cnn_specs[1]
    print(cnn_args)

    qf1 = CNN(input_width=imsize, input_height=imsize, input_channels=3,
              output_size=1, added_fc_input_size=action_dim,
              kernel_sizes=cnn_args["kernel_sizes"], strides=cnn_args["strides"],
              paddings=cnn_args["paddings"], n_channels=cnn_args["n_channels"],
              hidden_sizes=cnn_args["hidden_sizes"])

    qf2 = CNN(input_width=imsize, input_height=imsize, input_channels=3,
              output_size=1, added_fc_input_size=action_dim,
              kernel_sizes=cnn_args["kernel_sizes"], strides=cnn_args["strides"],
              paddings=cnn_args["paddings"], n_channels=cnn_args["n_channels"],
              hidden_sizes=cnn_args["hidden_sizes"])

    target_qf1 = CNN(input_width=imsize, input_height=imsize, input_channels=3,
                     output_size=1, added_fc_input_size=action_dim,
                     kernel_sizes=cnn_args["kernel_sizes"], strides=cnn_args["strides"],
                     paddings=cnn_args["paddings"], n_channels=cnn_args["n_channels"],
                     hidden_sizes=cnn_args["hidden_sizes"])

    target_qf2 = CNN(input_width=imsize, input_height=imsize, input_channels=3,
                     output_size=1, added_fc_input_size=action_dim,
                     kernel_sizes=cnn_args["kernel_sizes"], strides=cnn_args["strides"],
                     paddings=cnn_args["paddings"], n_channels=cnn_args["n_channels"],
                     hidden_sizes=cnn_args["hidden_sizes"])

    policy = CNNTanhGaussianPolicy(input_width=imsize, input_height=imsize,
                                   input_channels=3, action_dim=action_dim,
                                   kernel_sizes=cnn_args["kernel_sizes"],
                                   strides=cnn_args["strides"],
                                   paddings=cnn_args["paddings"],
                                   n_channels=cnn_args["n_channels"],
                                   hidden_sizes=cnn_args["hidden_sizes"])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )

    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
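
cnn_specs is defined elsewhere in the original project; a placeholder sketch of one entry, containing exactly the keys this example reads (all values here are assumptions):

example_spec = dict(
    kernel_sizes=[3, 3, 3],
    n_channels=[16, 32, 32],
    strides=[2, 2, 1],
    paddings=[0, 0, 0],
    hidden_sizes=[256, 256],
)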
Example #12
    def __init__(self, variant, other_config):
        expl_env = other_config['env']
        eval_env = other_config['test_env']
        if variant['env_name'] in [
                'DTC_GT_Reward'
        ] and variant['test_env_name'] in ['duckietown_cam']:
            self.net_type = 'CNN'
        else:
            self.net_type = 'FlattenMlp'
        obs_dim = expl_env.observation_space.low.size
        action_dim = eval_env.action_space.low.size

        if self.net_type == 'FlattenMlp':
            M = variant['layer_size']
            qf1 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                hidden_sizes=[M, M],
            )
            qf2 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                hidden_sizes=[M, M],
            )
            target_qf1 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                hidden_sizes=[M, M],
            )
            target_qf2 = FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=1,
                hidden_sizes=[M, M],
            )

            policy = TanhGaussianPolicy(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_sizes=[M, M],
            )

        elif self.net_type == 'CNN':
            M = variant['layer_size']
            qf1 = CNN(input_width=40,
                      input_height=30,
                      input_channels=3,
                      output_size=1,
                      kernel_sizes=[3, 3, 3],
                      n_channels=[5, 5, 5],
                      strides=[1, 1, 1],
                      paddings=[1, 1, 1],
                      hidden_sizes=[M, M],
                      added_fc_input_size=action_dim)
            qf2 = CNN(input_width=40,
                      input_height=30,
                      input_channels=3,
                      output_size=1,
                      kernel_sizes=[3, 3, 3],
                      n_channels=[5, 5, 5],
                      strides=[1, 1, 1],
                      paddings=[1, 1, 1],
                      hidden_sizes=[M, M],
                      added_fc_input_size=action_dim)
            target_qf1 = CNN(input_width=40,
                             input_height=30,
                             input_channels=3,
                             output_size=1,
                             kernel_sizes=[3, 3, 3],
                             n_channels=[5, 5, 5],
                             strides=[1, 1, 1],
                             paddings=[1, 1, 1],
                             hidden_sizes=[M, M],
                             added_fc_input_size=action_dim)
            target_qf2 = CNN(input_width=40,
                             input_height=30,
                             input_channels=3,
                             output_size=1,
                             kernel_sizes=[3, 3, 3],
                             n_channels=[5, 5, 5],
                             strides=[1, 1, 1],
                             paddings=[1, 1, 1],
                             hidden_sizes=[M, M],
                             added_fc_input_size=action_dim)

            policy = CNNTanhGaussianPolicy(input_width=40,
                                           input_height=30,
                                           input_channels=3,
                                           output_size=action_dim,
                                           kernel_sizes=[3, 3, 3],
                                           n_channels=[5, 5, 5],
                                           strides=[1, 1, 1],
                                           paddings=[1, 1, 1],
                                           hidden_sizes=[M, M])

        eval_policy = MakeDeterministic(policy)
        eval_path_collector = MdpPathCollector(
            eval_env,
            eval_policy,
        )
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
        )
        replay_buffer = EnvReplayBuffer(
            variant['replay_buffer_size'],
            expl_env,
        )
        trainer = SACTrainer(env=eval_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **variant['trainer_kwargs'])
        self.algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algorithm_kwargs'])
        self.algorithm.to(ptu.device)
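
For reference, the spatial arithmetic behind the 40x30 image networks above, assuming the standard (in - k + 2p) // s + 1 convolution formula:

# 3x3 kernels with stride 1 and padding 1 preserve width and height:
# (40 - 3 + 2) // 1 + 1 == 40 and (30 - 3 + 2) // 1 + 1 == 30,
# so the conv stack flattens to 40 * 30 * 5 = 6000 features before the [M, M] head
print(40 * 30 * 5)  # 6000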
Example #13
def experiment(variant):
    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env
    action_dim = eval_env.action_space.low.size

    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=48,
        input_height=48,
        input_channels=3,
        output_size=1,
        added_fc_input_size=action_dim,
    )
    qf1 = ConcatCNN(**cnn_params)
    qf2 = ConcatCNN(**cnn_params)
    target_qf1 = ConcatCNN(**cnn_params)
    target_qf2 = ConcatCNN(**cnn_params)

    cnn_params.update(
        output_size=256,
        added_fc_input_size=0,
        hidden_sizes=[1024, 512],
    )

    policy_obs_processor = CNN(**cnn_params)
    policy = TanhGaussianPolicy(
        obs_dim=cnn_params['output_size'],
        action_dim=action_dim,
        hidden_sizes=[256, 256, 256],
        obs_processor=policy_obs_processor,
    )

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(eval_env, )

    observation_key = 'image'
    replay_buffer = load_data_from_npy_chaining(variant, expl_env,
                                                observation_key)

    # Translate 0/1 rewards to +4/+10 rewards.
    if variant['use_positive_rew']:
        if set(np.unique(replay_buffer._rewards)).issubset({0, 1}):
            replay_buffer._rewards = replay_buffer._rewards * 6.0
            replay_buffer._rewards = replay_buffer._rewards + 4.0
        assert set(np.unique(replay_buffer._rewards)).issubset(
            set(6.0 * np.array([0, 1]) + 4.0))

    trainer = CQLTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=True,
        **variant['algorithm_kwargs'])
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
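
The base cnn_params also come from variant; a placeholder sketch of the conv settings that the two update calls above do not touch (all values here are assumptions):

variant['cnn_params'] = dict(
    kernel_sizes=[3, 3, 3],
    n_channels=[16, 16, 16],
    strides=[1, 1, 1],
    paddings=[1, 1, 1],
    hidden_sizes=[512, 512],  # used by the ConcatCNN Q-functions; the second
                              # update swaps in [1024, 512] for the policy obs processor
)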