def experiment(variant):
    expl_env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim)
    target_qf = Mlp(hidden_sizes=[32, 32],
                    input_size=obs_dim,
                    output_size=action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space), eval_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #2
0
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwars'])
    eval_env = SimpleSupEnv(**variant['env_kwars'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    encoder = nn.Sequential(
        nn.Linear(obs_dim, 16),
        nn.ReLU(),
    )
    decoder = nn.Linear(16, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(16, action_dim),
        ReshapeLayer(shape=(1, action_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)

    vf = Mlp(
        hidden_sizes=[32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    from sup_replay_buffer import SupReplayBuffer
    replay_buffer = SupReplayBuffer(
        observation_dim=obs_dim,
        label_dim=1,
        max_replay_buffer_size=int(1e6),
    )

    from rlkit.torch.vpg.trpo_sup import TRPOSupTrainer
    trainer = TRPOSupTrainer(policy=policy,
                             value_function=vf,
                             vf_criterion=vf_criterion,
                             replay_buffer=replay_buffer,
                             **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #3
0
def experiment(variant):
    from simple_sup import SimpleSupEnv
    expl_env = SimpleSupEnv(**variant['env_kwars'])
    eval_env = SimpleSupEnv(**variant['env_kwars'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    hidden_dim = variant['hidden_dim']
    encoder = nn.Sequential(
             nn.Linear(obs_dim,hidden_dim),
             nn.ReLU(),
             nn.Linear(hidden_dim,hidden_dim),
             nn.ReLU(),
            )
    decoder = nn.Linear(hidden_dim, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
            nn.Linear(hidden_dim, action_dim),
            ReshapeLayer(shape=(1, action_dim)),
        )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ',np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

    vf = Mlp(
        hidden_sizes=[32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy,use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )

    from rlkit.torch.vpg.ppo_sup_online import PPOSupOnlineTrainer
    trainer = PPOSupOnlineTrainer(
        policy=policy,
        value_function=vf,
        vf_criterion=vf_criterion,
        **variant['trainer_kwargs']
    )
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #4
0
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    module = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    policy = SoftmaxPolicy(module, **variant['policy_kwargs'])
    qf1 = Mlp(input_size=obs_dim,
              output_size=action_dim,
              **variant['qf_kwargs'])
    target_qf1 = copy.deepcopy(qf1)
    qf2 = Mlp(input_size=obs_dim,
              output_size=action_dim,
              **variant['qf_kwargs'])
    target_qf2 = copy.deepcopy(qf2)

    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = SACDiscreteTrainer(env=eval_env,
                                 policy=policy,
                                 qf1=qf1,
                                 qf2=qf2,
                                 target_qf1=target_qf1,
                                 target_qf2=target_qf2,
                                 qf_criterion=qf_criterion,
                                 **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #5
0
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    gb = TrafficGraphBuilder(input_dim=4,
                            ego_init=torch.tensor([0.,1.]),
                            other_init=torch.tensor([1.,0.]),
                            edge_index=torch.tensor([[0,0,1,2],
                                                [1,2,0,0]]))
    qf = GNNNet( 
                pre_graph_builder = gb, 
                node_dim = 16,
                output_dim = action_dim,
                post_mlp_kwargs = variant['qf_kwargs'],
                num_conv_layers=3)
    
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    replay_buffer = PrioritizedReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    qf_criterion = nn.MSELoss()
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        replay_buffer=replay_buffer,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #6
0
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim

    encoder = nn.Sequential(
        nn.Linear(obs_dim, 32),
        nn.ReLU(),
        nn.Linear(32, 32),
        nn.ReLU(),
    )
    decoder = nn.Linear(32, action_dim)
    from layers import ReshapeLayer
    sup_learner = nn.Sequential(
        nn.Linear(32, int(label_num * label_dim)),
        ReshapeLayer(shape=(label_num, label_dim)),
    )
    from sup_softmax_policy import SupSoftmaxPolicy
    policy = SupSoftmaxPolicy(encoder, decoder, sup_learner)
    print('parameters: ',
          np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = TRPOTrainer(policy=policy,
                          value_function=vf,
                          vf_criterion=vf_criterion,
                          **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #7
0
def experiment(variant):
    checkpoint_filepath = os.path.join(variant['checkpoint_dir'],
                                       'itr_{}.pkl'.format(
                                           variant['checkpoint_epoch']))
    checkpoint = torch.load(checkpoint_filepath)

    # the following does not work for Bullet envs yet
    # eval_env = checkpoint['evaluation/env']
    # expl_env = checkpoint['exploration/env']

    eval_env = roboverse.make(variant['env'], transpose_image=True)
    expl_env = eval_env

    policy = checkpoint['trainer/trainer'].policy
    eval_policy = checkpoint['evaluation/policy']
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )

    observation_key = 'image'
    online_buffer_size = 500 * 10 * variant['algorithm_kwargs'][
        'max_path_length']

    if variant['online_data_only']:
        replay_buffer = ObsDictReplayBuffer(online_buffer_size, expl_env,
                                            observation_key=observation_key)
    else:
        replay_buffer = load_data_from_npy_chaining(
            variant, expl_env, observation_key,
            extra_buffer_size=online_buffer_size)

    trainer_kwargs = variant['trainer_kwargs']
    assert trainer_kwargs['min_q_weight'] > 0.
    trainer = checkpoint['trainer/trainer']
    trainer.min_q_weight = trainer_kwargs['min_q_weight']

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=False,
        batch_rl=False,
        **variant['algorithm_kwargs']
    )
    video_func = VideoSaveFunction(variant)
    algorithm.post_epoch_funcs.append(video_func)

    algorithm.to(ptu.device)
    algorithm.train()
Example #8
0
def experiment(variant):
    '''
    1. 建立实验环境(eval, expl)
    2. 确立输入,输出维度,建立qf函数,policy函数
    3. 复制target qf和 target policy 函数
    4. 对于评估构建path collector
    5. 对于训练实验,构建探索策略、path collector、replay buffer
    6. 构建 DDPGTrainer (qf, policy)
    7. algorithm (包括trainer, env, replay buffer, path collector.以及用于评价部分)
    8. 开始训练
    :param variant: config parameter
    :return:
    '''
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    # 利用copy
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # 评估
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # 实验 (探索策略、path收集、replay buffer)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # 转化变量格式
    algorithm.to(ptu.device)

    algorithm.train()
Example #9
0
def experiment(variant):
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name, **variant['env_kwargs'])
    eval_env = make_env(args.exp_name, **variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    label_num = expl_env.label_num
    label_dim = expl_env.label_dim

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_data = torch.load(load_dir + '/params.pkl', map_location='cpu')
        policy = load_data['trainer/policy']
        vf = load_data['trainer/value_function']
    else:
        hidden_dim = variant['mlp_kwargs']['hidden']
        policy = nn.Sequential(nn.Linear(obs_dim, hidden_dim), nn.ReLU(),
                               nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
                               nn.Linear(hidden_dim, action_dim))
        policy = SoftmaxPolicy(policy)
        print('parameters: ',
              np.sum([p.view(-1).shape[0] for p in policy.parameters()]))

        vf = Mlp(
            hidden_sizes=[32, 32],
            input_size=obs_dim,
            output_size=1,
        )

    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = PPOTrainer(policy=policy,
                         value_function=vf,
                         vf_criterion=vf_criterion,
                         **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        log_path_function=get_traffic_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #10
0
def experiment(variant):
    env = Point2DEnv(**variant['env_kwargs'])
    env = FlatGoalEnv(env)
    env = NormalizedBoxEnv(env)

    action_dim = int(np.prod(env.action_space.shape))
    obs_dim = int(np.prod(env.observation_space.shape))

    qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    target_qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                           output_size=1,
                           **variant['qf_kwargs'])
    target_qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                           output_size=1,
                           **variant['qf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    eval_env = expl_env = env

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TwinSACTrainer(env=eval_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        data_buffer=replay_buffer,
        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #11
0
def experiment(variant):

    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    target_qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf_criterion = nn.MSELoss()
    eval_learner_policy = ArgmaxDiscretePolicy(qf)
    expl_learner_policy = PolicyWrappedWithExplorationStrategy(
        AnnealedEpsilonGreedy(symbolic_action_space,
                              anneal_rate=variant["anneal_rate"]),
        eval_learner_policy,
    )
    eval_policy = LearnPlanPolicy(eval_learner_policy)
    expl_policy = LearnPlanPolicy(expl_learner_policy)
    eval_path_collector = MdpPathCollector(eval_env,
                                           eval_policy,
                                           rollout=hierarchical_rollout)
    expl_path_collector = MdpPathCollector(expl_env,
                                           expl_policy,
                                           rollout=hierarchical_rollout)
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant["trainer_kwargs"])
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # Select a different success_function for different tasks.
    expl_env = GymCraftingEnv(state_obs=True,
                              few_obj=True,
                              success_function=eval_eatbread)
    eval_env = GymCraftingEnv(state_obs=True,
                              few_obj=True,
                              success_function=eval_eatbread)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #13
0
def experiment(variant):

    expl_env = get_env()
    eval_env = get_env()

    post_epoch_funcs = []
    M = variant['layer_size']
    trainer = get_sac_model(env=eval_env, hidden_sizes=[M, M])
    policy = trainer.policy
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    columns = ['Epoch', 'mean', 'std']
    eval_result = pd.DataFrame(columns=columns)
    eval_output_csv = os.path.join(variant['log_dir'], 'eval_result.csv')

    def post_epoch_func(self, epoch):
        nonlocal eval_result
        nonlocal policy
        print(f'-------------post_epoch_func start-------------')
        eval_result = my_eval_policy(
            env=get_env(),
            algorithm=self,
            epoch=epoch,
            eval_result=eval_result,
            output_csv=eval_output_csv,
        )
        print(f'-------------post_epoch_func done-------------')

    algorithm.post_epoch_funcs = [
        post_epoch_func,
    ]
    algorithm.to(ptu.device)
    algorithm.train()
Example #14
0
def run_sac(base_expl_env, base_eval_env, variant):
    expl_env = FlatGoalEnv(base_expl_env, append_goal_to_obs=True)
    eval_env = FlatGoalEnv(base_eval_env, append_goal_to_obs=True)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    num_hidden = variant["num_hidden_layers"]
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M] * num_hidden)
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M] * num_hidden)
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M] * num_hidden)
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M] * num_hidden)
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                hidden_sizes=[M] * num_hidden)
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant["replay_buffer_size"],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.train()
Example #15
0
def experiment(variant):
    args = getArgs()
    # expl_env = NormalizedBoxEnv(environment(args))

    expl_env = environment(args, 'dqn')
    eval_env = environment(args, 'dqn')
    # expl_env.render()
    obs_dim = expl_env.get_obsdim()
    action_dim = expl_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #16
0
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant["policy_kwargs"])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant["policy_kwargs"])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es, policy=policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #17
0
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['qf_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = DoubleDQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function = get_traffic_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #18
0
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    gb = TrafficGraphBuilder(input_dim=4,
                             ego_init=torch.tensor([0., 1.]),
                             other_init=torch.tensor([1., 0.]),
                             edge_index=torch.tensor([[0, 0, 1, 2],
                                                      [1, 2, 0, 0]]))
    module = GNNNet(pre_graph_builder=gb,
                    node_dim=16,
                    output_dim=action_dim,
                    post_mlp_kwargs=dict(hidden_sizes=[32]),
                    num_conv_layers=3)
    policy = SoftmaxPolicy(module, **variant['policy_kwargs'])

    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = PPOTrainer(policy=policy,
                         value_function=vf,
                         vf_criterion=vf_criterion,
                         **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #19
0
def experiment(variant):
    from cartpole import CartPoleEnv
    from rlkit.envs.wrappers import ProbDiscreteEnv
    expl_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    eval_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    # import gym
    # from rlkit.envs.wrappers import ProbDiscreteEnv
    # expl_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # eval_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # obs_dim = eval_env.observation_space.low.size
    # action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # remove this since need action to be a prob
    # exploration_policy = PolicyWrappedWithExplorationStrategy(
    #     exploration_strategy=OUStrategy(action_space=expl_env.action_space),
    #     policy=policy,
    # )
    exploration_policy = policy
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #20
0
File: ddpg.py Project: jesbu1/rlkit
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #21
0
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # import gym
    # expl_env = gym.make('CartPole-v0')
    # eval_env = gym.make('CartPole-v0')
    # obs_dim = eval_env.observation_space.low.size
    # action_dim = eval_env.action_space.n

    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = VPGTrainer(policy=policy,
                         value_function=vf,
                         vf_criterion=vf_criterion,
                         **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #22
0
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    eval_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #23
0
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(input_size=obs_dim,
             output_size=action_dim,
             **variant['qf_kwargs'])
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    replay_buffer = PrioritizedReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    qf_criterion = nn.MSELoss()
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         replay_buffer=replay_buffer,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #24
0
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M, M])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     hidden_sizes=[M, M])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M, M])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            hidden_sizes=[M, M])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                hidden_sizes=[M, M])
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #25
0
def experiment(variant):
    import sys
    from traffic.make_env import make_env
    expl_env = make_env(args.exp_name)
    eval_env = make_env(args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )
    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = MakeDeterministic(policy)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = PPOTrainer(policy=policy,
                         value_function=vf,
                         vf_criterion=vf_criterion,
                         **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = gym.make('RLkitUR-v0')._start_ros_services()
    eval_env = gym.make('RLkitUR-v0')
    expl_env = gym.make('RLkitUR-v0')
    eval_env = NormalizedBoxEnv(eval_env)
    expl_env = NormalizedBoxEnv(expl_env)

    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    print("obs_dim: ", obs_dim)
    print("action_dim: ", action_dim)
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #27
0
def simulate_policy(fpath,
                    env_name,
                    seed,
                    max_path_length,
                    num_eval_steps,
                    headless,
                    max_eps,
                    verbose=True,
                    pause=False):
    data = torch.load(fpath, map_location=ptu.device)
    policy = data['evaluation/policy']
    policy.to(ptu.device)
    # make new env, reloading with data['evaluation/env'] seems to make bug
    env = gym.make(env_name, **{"headless": headless, "verbose": False})
    env.seed(seed)
    if pause:
        input("Waiting to start.")
    path_collector = MdpPathCollector(env, policy)
    paths = path_collector.collect_new_paths(
        max_path_length,
        num_eval_steps,
        discard_incomplete_paths=True,
    )

    if max_eps:
        paths = paths[:max_eps]
    if verbose:
        completions = sum([
            info["completed"] for path in paths for info in path["env_infos"]
        ])
        print("Completed {} out of {}".format(completions, len(paths)))
        # plt.plot(paths[0]["actions"])
        # plt.show()
        # plt.plot(paths[2]["observations"])
        # plt.show()
        logger.record_dict(
            eval_util.get_generic_path_information(paths),
            prefix="evaluation/",
        )
        logger.dump_tabular()
    return paths
Example #28
0
def experiment(variant):
    """Run the experiment."""
    eval_env = gym.make('CartPole-v0')
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # Collect data.
    print('Collecting data...')
    data = []
    while len(data) < variant['offline_data_size']:
        done = False
        s = eval_env.reset()
        while not done:
            a = np.random.randint(action_dim)
            n, r, done, _ = eval_env.step(a)
            one_hot_a = np.zeros(action_dim)
            one_hot_a[a] = 1
            data.append((s, one_hot_a, r, n, done))
            s = n
            if len(data) == variant['offline_data_size']:
                break

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    offline_data = OfflineDataStore(data=data,)
    algorithm = TorchOfflineRLAlgorithm(
        trainer=trainer,
        evaluation_env=eval_env,
        evaluation_data_collector=eval_path_collector,
        offline_data=offline_data,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #29
0
def experiment(variant, data):
    # make new env, reloading with data['evaluation/env'] seems to make bug
    eval_env = gym.make("panda-v0", **{"headless": variant["headless"]})
    eval_env.seed(variant['seed'])
    expl_env = eval_env

    qf1 = data['trainer/qf1']
    qf2 = data['trainer/qf2']
    target_qf1 = data['trainer/target_qf1']
    target_qf2 = data['trainer/target_qf2']
    policy = data['trainer/policy']
    eval_policy = data["evaluation/policy"]
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = CustomMDPPathCollector(eval_env, )
    buffer_filename = None
    if variant['buffer_filename'] is not None:
        buffer_filename = variant['buffer_filename']

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    if variant['load_buffer'] and buffer_filename is not None:
        replay_buffer.load_buffer(buffer_filename)
    else:
        dataset = get_dataset(variant["h5path"], eval_env)
        load_hdf5(d4rl.qlearning_dataset(eval_env, dataset), replay_buffer)

    trainer = CQLTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        eval_both=True,
        batch_rl=variant['load_buffer'],
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train(start_epoch=variant["start_epoch"])
Example #30
0
def get_path_collector(variant, expl_env, eval_env, policy, eval_policy):
    """
    Define path collector
    """
    mode = variant["mode"]
    if mode == "vanilla":
        expl_path_collector = MdpPathCollector(expl_env, policy)
        eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    elif mode == "her":
        expl_path_collector = GoalConditionedPathCollector(
            expl_env,
            policy,
            observation_key=variant["her"]["observation_key"],
            desired_goal_key=variant["her"]["desired_goal_key"],
            representation_goal_key=variant["her"]["representation_goal_key"],
        )
        eval_path_collector = GoalConditionedPathCollector(
            eval_env,
            eval_policy,
            observation_key=variant["her"]["observation_key"],
            desired_goal_key=variant["her"]["desired_goal_key"],
            representation_goal_key=variant["her"]["representation_goal_key"],
        )
    return expl_path_collector, eval_path_collector