Example #1
def create_env(env_id, env_kwargs, num_skills=0):
    if env_id == 'ManipulationEnv':
        env = NormalizedBoxEnv(ManipulationEnv(**env_kwargs))
        training_env = NormalizedBoxEnv(ManipulationEnv(**env_kwargs))
    elif env_id == 'StarEnv':
        env = NormalizedBoxEnv(StarEnv(**env_kwargs))
        training_env = NormalizedBoxEnv(StarEnv(**env_kwargs))
    elif env_id == 'PointEnv':
        env = NormalizedBoxEnv(PointEnv_SMM(**env_kwargs))
        training_env = NormalizedBoxEnv(PointEnv_SMM(**env_kwargs))
    elif env_id == 'PointEnv_evolve':
        env = NormalizedBoxEnv(PointEnv_SMM_evolution(**env_kwargs))
        training_env = NormalizedBoxEnv(PointEnv_SMM_evolution(**env_kwargs))
    elif env_id == 'ant_goal':
        env = NormalizedBoxEnv(AntGoalEnv_SMM(**env_kwargs))
        training_env = NormalizedBoxEnv(AntGoalEnv_SMM(**env_kwargs))
    else:
        raise NotImplementedError('Unrecognized environment: {}'.format(env_id))

    # Append skill to observation vector.
    if num_skills > 0:
        env = AugmentedBoxObservationShapeEnv(env, num_skills)
        training_env = AugmentedBoxObservationShapeEnv(training_env, num_skills)

    return env, training_env
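
For reference, a call to create_env might look like the sketch below; the env_kwargs dict and the skill count are illustrative values, not taken from the original repository.

env, training_env = create_env(
    env_id='PointEnv',
    env_kwargs={},    # forwarded verbatim to the environment constructor
    num_skills=4,     # > 0 appends a one-hot skill encoding to the observation space
)
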
Example #2
def simulate_policy(args):
    data = torch.load(str(args.file))
    #data = joblib.load(str(args.file))
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(HalfCheetahEnv())
    #env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    if args.collect:
        data = []
    for trial in tqdm(range(100)):
        path = rollout(
            env,
            policy,
            max_path_length=args.H + 1,
            render=not args.collect,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        if args.collect:
            data.append([path['actions'], path['next_observations']])

    if args.collect:
        import pickle
        with open("data/expert.pkl", mode='wb') as f:
            pickle.dump(data, f)
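
simulate_policy above expects an args namespace with at least file, H, gpu, and collect. A minimal argument parser along these lines would supply it (the flag names match the attributes used above; the defaults are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('file', type=str, help='path to the snapshot saved by the trainer')
parser.add_argument('--H', type=int, default=300, help='max path length per rollout')
parser.add_argument('--gpu', action='store_true', help='run the policy on GPU')
parser.add_argument('--collect', action='store_true',
                    help='store (actions, next_observations) pairs instead of rendering')
args = parser.parse_args()
simulate_policy(args)
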
Example #3
def simulate_policy(args):
 #   data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(Mani2dEnv())
    # env.reset()
    # print(env.step(env.action_space.sample()))
    # sys.exit()
 #   env = env.wrapped_env.unwrapped
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        # policy.cuda()
    # import cv2
    # video = cv2.VideoWriter('diayn_bipedal_walker_hardcore.avi', cv2.VideoWriter_fourcc('M','J','P','G'), 30, (1200, 800))
    index = 0
    for skill in range(policy.stochastic_policy.skill_dim):
        print(skill)
        for _ in range(3):
            path = rollout(
                env,
                policy,
                skill,
                max_path_length=args.H,
                render=True,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
Example #4
def simulate_policy(args):
    #   data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    import cv2
    video = cv2.VideoWriter('ppo_test.avi',
                            cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 30,
                            (640, 480))
    index = 0
    path = rollout(
        env,
        policy,
        max_path_length=args.H,
        render=True,
    )
    if hasattr(env, "log_diagnostics"):
        env.log_diagnostics([path])
    logger.dump_tabular()

    for i, img in enumerate(path['images']):
        print(i)
        video.write(img[:, :, ::-1].astype(np.uint8))
        cv2.imwrite("frames/ppo_test/%06d.png" % index, img[:, :, ::-1])
        index += 1

    video.release()
    print("wrote video")
Example #5
def simulate_policy(args):
    data = torch.load(args.file)
    policy = data['evaluation/policy']

    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.cuda()
        print("set gpu")
    print(ptu.device)

    config_file = get_config_file(args.config_file)
    env = NormalizedBoxEnv(
        load_env(args, config_file, args.env_mode, ptu.device.index))

    print("Policy loaded")

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
Example #6
def get_meta_env(env_specs):
    base_env_name = env_specs['base_env_name']
    env_dict = meta_envs[base_env_name]
    meta_train_env = env_dict['meta_train']()
    meta_test_env = env_dict['meta_test']()
    if env_specs['need_pixels']:
        if env_dict['info']['is_dmcs_env']:
            meta_train_env = pixels.Wrapper(
                meta_train_env,
                pixels_only=False,
                render_kwargs=env_specs['render_kwargs'])
            meta_test_env = pixels.Wrapper(
                meta_test_env,
                pixels_only=False,
                render_kwargs=env_specs['render_kwargs'])
        else:
            raise NotImplementedError()
    # If it's a dm_control suite (dmcs) env, wrap it to look like a gym env
    if env_dict['info']['is_dmcs_env']:
        meta_train_env = DmControlWrapper(meta_train_env)
        meta_test_env = DmControlWrapper(meta_test_env)
    if env_specs['normalized']:
        meta_train_env = NormalizedBoxEnv(meta_train_env)
        meta_test_env = NormalizedBoxEnv(meta_test_env)
    return meta_train_env, meta_test_env
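
get_meta_env is driven entirely by the env_specs dict; a configuration of the shape it reads would look like this (the base_env_name value is a placeholder for whatever keys the meta_envs registry actually contains):

env_specs = {
    'base_env_name': 'cheetah_run',  # illustrative key into the meta_envs registry
    'need_pixels': False,
    'render_kwargs': {},             # only consulted when need_pixels is True
    'normalized': True,
}
meta_train_env, meta_test_env = get_meta_env(env_specs)
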
Example #7
def experiment(variant):
    torch.autograd.set_detect_anomaly(True)
    #expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    #eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    # expl_env = NormalizedBoxEnv(PendulumEnv())
    # eval_env = NormalizedBoxEnv(PendulumEnv())
    expl_env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    eval_env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    #expl_env = NormalizedBoxEnv(gym.make("LunarLanderContinuous-v2"))
    #eval_env = NormalizedBoxEnv(gym.make("LunarLanderContinuous-v2"))
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_step_collector = PPOMdpPathCollector(
        eval_env,
        eval_policy,
        calculate_advantages=False
    )
    expl_step_collector = PPOMdpPathCollector(
        expl_env,
        policy,
        calculate_advantages=True,
        vf=vf,
        gae_lambda=0.97,
        discount=0.995,
    )
    replay_buffer = PPOEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = PPOTrainer(
        env=eval_env,
        policy=policy,
        vf=vf,
        **variant['trainer_kwargs']
    )
    algorithm = PPOTorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_step_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #8
def experiment(variant):
    env = NormalizedBoxEnv(PointEnv(**variant['task_params']))
    ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id'])

    tasks = env.get_all_task_idx()

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = 5
    task_enc_output_dim = latent_dim * 2 if variant['algo_params']['use_information_bottleneck'] else latent_dim
    reward_dim = 1

    net_size = variant['net_size']
    # start with linear task encoding
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    task_enc = encoder_model(
            hidden_sizes=[200, 200, 200], # deeper net + higher dim space generalize better
            input_size=obs_dim + action_dim + reward_dim,
            output_size=task_enc_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = ProtoAgent(
        latent_dim,
        [task_enc, policy, qf1, qf2, vf],
        **variant['algo_params']
    )

    algorithm = ProtoSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:-20]),
        eval_tasks=list(tasks[-20:]),
        nets=[agent, task_enc, policy, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.to()
    algorithm.train()
Example #9
def experiment(variant):
    '''
    1. Build the experiment environments (eval, expl).
    2. Determine the input/output dimensions and build the qf and policy networks.
    3. Deep-copy them to create the target qf and target policy networks.
    4. Build a path collector for evaluation.
    5. For training, build the exploration strategy, path collector, and replay buffer.
    6. Build the DDPGTrainer (qf, policy).
    7. Build the algorithm (trainer, envs, replay buffer, path collectors, plus the evaluation pieces).
    8. Start training.
    :param variant: config parameters
    :return:
    '''
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    # Create the target networks as deep copies
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    # Evaluation
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # Exploration (exploration strategy, path collection, replay buffer)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)

    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    # Move the algorithm's networks to the configured device
    algorithm.to(ptu.device)

    algorithm.train()
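
All hyperparameters for this DDPG experiment come from variant. A configuration in the spirit of rlkit's DDPG example script might look like the sketch below; the individual values and inner keys are assumptions, not taken from the original run.

variant = dict(
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        use_soft_update=True,
        tau=1e-2,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    ),
    algorithm_kwargs=dict(
        num_epochs=1000,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=128,
    ),
)
experiment(variant)
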
Example #10
def main(goal_idx=0, args=args):
    variant = default_config
    if args.config:
        with open(os.path.join(args.config)) as f:
            exp_params = json.load(f)
        variant = deep_update_dict(exp_params, variant)
    # variant['util_params']['gpu_id'] = gpu
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    env.reset_task(goal_idx)
    experiment(env=env, goal_idx=goal_idx)
Example #11
def experiment(variant):
    torch.autograd.set_detect_anomaly(True)
    # expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(gym.make(str(args.env)))
    eval_env = NormalizedBoxEnv(gym.make(str(args.env)))
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    skill_dim = 10

    M = variant['layer_size']
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    worker = RandomSkillTanhGaussianPolicy(obs_dim=obs_dim + skill_dim,
                                           action_dim=action_dim,
                                           hidden_sizes=[M, M])
    torch.save(worker, "data/random_policy_params.pkl")
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=skill_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_step_collector = DirichletManagerPPOMdpPathCollector(
        eval_env, eval_policy, worker, calculate_advantages=False)
    expl_step_collector = DirichletManagerPPOMdpPathCollector(
        expl_env,
        policy,
        worker,
        calculate_advantages=True,
        vf=vf,
        gae_lambda=0.97,
        discount=0.995,
    )
    replay_buffer = ManagerPPOEnvReplayBuffer(variant['replay_buffer_size'],
                                              expl_env,
                                              skill_dim=skill_dim)
    trainer = PPOTrainer(env=eval_env,
                         policy=policy,
                         vf=vf,
                         **variant['trainer_kwargs'])
    algorithm = PPOTorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_step_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #12
def create_env(env_id, env_kwargs, num_skills=0):
    if env_id == 'PointEnv':
        env = NormalizedBoxEnv(PointEnv_SMM(**env_kwargs))
        training_env = NormalizedBoxEnv(PointEnv_SMM(**env_kwargs))
    else:
        raise NotImplementedError('Unrecognized environment: {}'.format(env_id))

    # Append skill to observation vector.
    if num_skills > 0:
        env = AugmentedBoxObservationShapeEnv(env, num_skills)
        training_env = AugmentedBoxObservationShapeEnv(training_env, num_skills)

    return env, training_env
Example #13
def experiment(variant):
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant["qf_kwargs"])
    target_qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    target_qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                            output_size=1,
                            **variant["qf_kwargs"])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant["policy_kwargs"])
    target_policy = TanhMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant["policy_kwargs"])
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es, policy=policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
Example #14
def experiment(variant):
    torch.autograd.set_detect_anomaly(True)
    # expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(Mani2dEnv())
    eval_env = NormalizedBoxEnv(Mani2dEnv())
    obs_dim = expl_env.observation_space.low.size
    worker = torch.load(str(args.worker))['trainer/policy']
    skill_dim = worker.skill_dim

    M = variant['layer_size']
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = DiscretePolicy(
        obs_dim=obs_dim,
        action_dim=skill_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_step_collector = ManagerPPOMdpPathCollector(
        eval_env, eval_policy, worker, calculate_advantages=False)
    expl_step_collector = ManagerPPOMdpPathCollector(
        expl_env,
        policy,
        worker,
        calculate_advantages=True,
        vf=vf,
        gae_lambda=0.97,
        discount=0.995,
    )
    replay_buffer = ManagerPPOEnvReplayBuffer(variant['replay_buffer_size'],
                                              expl_env,
                                              skill_dim=skill_dim)
    trainer = DiscretePPOTrainer(env=eval_env,
                                 policy=policy,
                                 vf=vf,
                                 **variant['trainer_kwargs'])
    algorithm = PPOTorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_step_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #15
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    es = GaussianStrategy(action_space=env.action_space,
                          **variant['es_kwargs'])
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #16
def experiment(variant):
    env = gym.make('replab-v0')._start_rospy(goal_oriented=False)
    #SIM
    #env = gym.make('replab-v0')._start_sim(goal_oriented=False, render=False)
    env = NormalizedBoxEnv(env)
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #17
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #18
def experiment(variant):
    env = NormalizedBoxEnv(CartpoleSwingupSparseEnv())

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    heads = 5

    net_size = variant['net_size']
    qf1 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    pqf1 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    pqf2 = EnsembleFlattenMlp(
        heads,
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[1],
        input_size=obs_dim,
        output_size=1,
    )
    policy = MultiTanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
        heads=heads,
    )

    algorithm = BigThompsonSoftActorCritic(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        pqf1=pqf1,
        pqf2=pqf2,
        prior_coef=10,
        vf=vf,
        #disc=disc,
        #skill_dim=skill_dim,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #19
def experiment(variant):
    farmlist_base = [('123.123.123.123', 4)]

    farmer = Farmer(farmlist_base)
    environment = acq_remote_env(farmer)
    env = NormalizedBoxEnv(environment)

    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #20
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #21
def experiment(variant):
    env = NormalizedBoxEnv(gym.make(variant['env']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = BetaVirel(env=env,
                          policy=policy,
                          qf=qf,
                          vf=vf,
                          **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #22
def experiment(variant):
    #env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    env = NormalizedBoxEnv(gym.make('Pointmass-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #23
def example(variant):
    env = CartpoleEnv()
    env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        **variant['qf_params'],
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #24
def build_PEARL_envs(seed, env_name, params=None):
    '''
      Build env from PEARL
    '''
    from rlkit.envs import ENVS
    from rlkit.envs.wrappers import NormalizedBoxEnv

    if env_name == 'ant-dir':
        env_params = {
            'n_tasks': params.n_tasks,
            'randomize_tasks': params.randomize_tasks,
            #"low_gear": params.low_gear,
            "forward_backward": params.forward_backward,
        }

    elif env_name == 'ant-goal':
        env_params = {
            'n_tasks': params.n_tasks,
            'randomize_tasks': params.randomize_tasks,
            #"low_gear": params.low_gear,
        }

    else:
        env_params = {
            'n_tasks': params.n_tasks,
            'randomize_tasks': params.randomize_tasks
        }

    env = ENVS[env_name](**env_params)
    env.seed(seed)
    env = NormalizedBoxEnv(env)
    env.action_space.np_random.seed(seed)

    return env
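
A call to build_PEARL_envs only needs a params object exposing the fields read above; a minimal stand-in (the values here are illustrative) could be:

from types import SimpleNamespace

params = SimpleNamespace(n_tasks=2, randomize_tasks=False, forward_backward=True)
env = build_PEARL_envs(seed=0, env_name='ant-dir', params=params)
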
Example #25
def experiment(variant):
    #env = NormalizedBoxEnv(HalfCheetahEnv())
    env = NormalizedBoxEnv(create_swingup())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #26
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                   output_size=1,
                   **variant['qf_kwargs'])
    vf = ConcatMlp(input_size=obs_dim + goal_dim,
                   output_size=1,
                   **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    replay_buffer = SimpleHerReplayBuffer(env=env,
                                          **variant['replay_buffer_kwargs'])
    algorithm = HerSac(env=env,
                       policy=policy,
                       qf=qf,
                       vf=vf,
                       replay_buffer=replay_buffer,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #27
def example(variant):
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['vf_params'])
    vf = FlattenMlp(input_size=obs_dim, output_size=1, **variant['vf_params'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_params'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = N3DPG(env,
                      qf=qf,
                      vf=vf,
                      policy=policy,
                      exploration_policy=exploration_policy,
                      **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #28
def make_env(name):
    env = gym.make(name)
    # Remove TimeLimit Wrapper
    if isinstance(env, TimeLimit):
        env = env.unwrapped
    env = CustomInfoEnv(env)
    env = NormalizedBoxEnv(env)
    return env
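
make_env takes any gym id whose action space is a Box; a call would look like this (the environment id is just an example):

env = make_env('HalfCheetah-v2')  # TimeLimit is stripped, then CustomInfoEnv and NormalizedBoxEnv are applied
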
Example #29
def experiment(variant):
    eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    expl_env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #30
def experiment(variant):
    env = NormalizedBoxEnv(gym.make(args.env_name))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    net_size = variant['net_size']

    algo_map = dict(
        sac=dict(algo=SoftActorCritic,
                 network=dict(
                     policy=TanhGaussianPolicy(
                         hidden_sizes=[net_size, net_size],
                         obs_dim=obs_dim,
                         action_dim=action_dim,
                     ),
                     qf=FlattenMlp(
                         hidden_sizes=[net_size, net_size],
                         input_size=obs_dim + action_dim,
                         output_size=1,
                     ),
                     vf=FlattenMlp(
                         hidden_sizes=[net_size, net_size],
                         input_size=obs_dim,
                         output_size=1,
                     ),
                 )),
        tsac=dict(algo=TwinSAC,
                  network=dict(
                      policy=TanhGaussianPolicy(
                          hidden_sizes=[net_size, net_size],
                          obs_dim=obs_dim,
                          action_dim=action_dim,
                      ),
                      qf1=FlattenMlp(
                          hidden_sizes=[net_size, net_size],
                          input_size=obs_dim + action_dim,
                          output_size=1,
                      ),
                      qf2=FlattenMlp(
                          hidden_sizes=[net_size, net_size],
                          input_size=obs_dim + action_dim,
                          output_size=1,
                      ),
                      vf=FlattenMlp(
                          hidden_sizes=[net_size, net_size],
                          input_size=obs_dim,
                          output_size=1,
                      ),
                  )),
    )

    algo_type = algo_map[args.algo]
    algorithm = algo_type['algo'](env=env,
                                  **algo_type['network'],
                                  **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()