Beispiel #1
0
def get_ddpg(evaluation_environment, parameters):
    obs_dim = evaluation_environment.observation_space.low.size
    action_dim = evaluation_environment.action_space.low.size
    hidden_sizes_qf = parameters['hidden_sizes_qf']
    hidden_sizes_policy = parameters['hidden_sizes_policy']

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=hidden_sizes_qf,
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=hidden_sizes_policy,
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(
            action_space=evaluation_environment.action_space),
        policy=policy,
    )

    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **parameters['trainer_params'])
    return exploration_policy, policy, trainer
Beispiel #2
0
def experiment(variant):
    '''
    1. 建立实验环境(eval, expl)
    2. 确立输入,输出维度,建立qf函数,policy函数
    3. 复制target qf和 target policy 函数
    4. 对于评估构建path collector
    5. 对于训练实验,构建探索策略、path collector、replay buffer
    6. 构建 DDPGTrainer (qf, policy)
    7. algorithm (包括trainer, env, replay buffer, path collector.以及用于评价部分)
    8. 开始训练
    :param variant: config parameter
    :return:
    '''
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    # 利用copy
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # 评估
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # 实验 (探索策略、path收集、replay buffer)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # 转化变量格式
    algorithm.to(ptu.device)

    algorithm.train()
Beispiel #3
0
def experiment(variant):
    from cartpole import CartPoleEnv
    from rlkit.envs.wrappers import ProbDiscreteEnv
    expl_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    eval_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    # import gym
    # from rlkit.envs.wrappers import ProbDiscreteEnv
    # expl_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # eval_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # obs_dim = eval_env.observation_space.low.size
    # action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # remove this since need action to be a prob
    # exploration_policy = PolicyWrappedWithExplorationStrategy(
    #     exploration_strategy=OUStrategy(action_space=expl_env.action_space),
    #     policy=policy,
    # )
    exploration_policy = policy
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Beispiel #4
0
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Beispiel #5
0
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    eval_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = gym.make('RLkitUR-v0')._start_ros_services()
    eval_env = gym.make('RLkitUR-v0')
    expl_env = gym.make('RLkitUR-v0')
    eval_env = NormalizedBoxEnv(eval_env)
    expl_env = NormalizedBoxEnv(expl_env)

    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    print("obs_dim: ", obs_dim)
    print("action_dim: ", action_dim)
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def get_ddpg_trainer(env, hidden_sizes=[256, 256]):
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = ConcatMlp(input_size=obs_dim + action_dim,
                   output_size=1,
                   hidden_sizes=hidden_sizes)
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           hidden_sizes=hidden_sizes)
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)

    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          use_soft_update=True,
                          tau=1e-2,
                          discount=0.99,
                          qf_learning_rate=1e-3,
                          policy_learning_rate=1e-4)
    return trainer
Beispiel #8
0
def experiment(variant):
    # Or for a specific version (Daniel: doesn't work):
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    if 'Ant' in args.env:
        expl_env = NormalizedBoxEnv(AntEnv())
        eval_env = NormalizedBoxEnv(AntEnv())
    elif 'InvertedPendulum' in args.env:
        expl_env = NormalizedBoxEnv(InvertedPendulumEnv())
        eval_env = NormalizedBoxEnv(InvertedPendulumEnv())
    elif 'HalfCheetah' in args.env:
        expl_env = NormalizedBoxEnv(HalfCheetahEnv())
        eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    elif 'Hopper' in args.env:
        expl_env = NormalizedBoxEnv(HopperEnv())
        eval_env = NormalizedBoxEnv(HopperEnv())
    elif 'Reacher' in args.env:
        expl_env = NormalizedBoxEnv(ReacherEnv())
        eval_env = NormalizedBoxEnv(ReacherEnv())
    elif 'Swimmer' in args.env:
        expl_env = NormalizedBoxEnv(SwimmerEnv())
        eval_env = NormalizedBoxEnv(SwimmerEnv())
    elif 'Walker2d' in args.env:
        expl_env = NormalizedBoxEnv(Walker2dEnv())
        eval_env = NormalizedBoxEnv(Walker2dEnv())
    else:
        raise ValueError(args.env)

    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()