Example #1
 def __init__(self):
     self.max_iters = 1000
     self.horizon = 100000  # Batch size: time steps per batch
     self.episode_long = 1000
     self.l2_reg = 1E-4  # L2 regularization lambda for value loss function
     self.max_KL = 0.01  # Max KL divergence threshold for TRPO update
     # Environment
     self.env = Walker2dEnv()  # ant_v3.AntEnv(ctrl_cost_weight=1E-6, contact_cost_weight=1E-3, healthy_reward=0.05)
     self.env.seed(seed)
     self.agent = Ant(self.env, self.horizon, self.episode_long)  # create agent
     self.pi_net = Policy_Net(self.agent.ob_dim, self.agent.ac_dim)  # Create Policy Network
     self.value_net = Value_Net(self.agent.ob_dim, 1)  # Create Value Network
     self.value_net_lr = 1  # Value net learning rate
     self.LBFGS_iters = 20  # Number of value_net parameter updates per TRPO update
     self.cg_iters = 500  # Number of iterations for the conjugate gradient algorithm
     self.cg_threshold = 1e-2  # Early stopping threshold for conjugate gradient
     self.line_search_alpha = 0.5  # Line search decay rate for TRPO
     self.search_iters = 10  # Line search iterations
     self.gamma = 1  # Discount factor
     self.Lambda = 0.95  # GAE lambda
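
Note: Walker2dEnv in these examples is the Gym MuJoCo walker environment, so it can also be exercised on its own with the usual reset/step loop. The following is a minimal sketch, assuming the classic Gym API (reset() returning an observation, step() returning (obs, reward, done, info), and seed(), as used by the snippets on this page); it is not part of Example #1 itself.

from gym.envs.mujoco import Walker2dEnv

env = Walker2dEnv()
env.seed(0)  # older Gym seeding API, matching env.seed(seed) above
obs = env.reset()
total_reward = 0.0
for t in range(1000):
    action = env.action_space.sample()  # random actions, just to exercise the interface
    obs, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        break
print('random-policy return:', total_reward)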
Example #2
 def __init__(self, **kwargs):
     Walker2dEnv.__init__(self)
     offline_env.OfflineEnv.__init__(self, **kwargs)
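
Note: Example #2 combines Walker2dEnv with offline_env.OfflineEnv, the pattern D4RL uses to attach a logged dataset to a MuJoCo task. A minimal usage sketch, assuming the D4RL OfflineEnv interface (get_dataset() and a dataset_url keyword) and a hypothetical OfflineWalker2dEnv class defined as in the example:

# OfflineWalker2dEnv is a hypothetical name for the class in Example #2
env = OfflineWalker2dEnv(dataset_url='...')  # kwargs are forwarded to OfflineEnv (assumption)
dataset = env.get_dataset()                  # D4RL-style dict of logged transitions
observations = dataset['observations']
actions = dataset['actions']
rewards = dataset['rewards']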
Example #3
 def __init__(self):
     self._base_pos = 1.
     Walker2dEnv.__init__(self)
Example #4
def experiment(variant):
    # Or for a specific version (Daniel: doesn't work):
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    if 'Ant' in args.env:
        expl_env = NormalizedBoxEnv(AntEnv())
        eval_env = NormalizedBoxEnv(AntEnv())
    elif 'InvertedPendulum' in args.env:
        expl_env = NormalizedBoxEnv(InvertedPendulumEnv())
        eval_env = NormalizedBoxEnv(InvertedPendulumEnv())
    elif 'HalfCheetah' in args.env:
        expl_env = NormalizedBoxEnv(HalfCheetahEnv())
        eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    elif 'Hopper' in args.env:
        expl_env = NormalizedBoxEnv(HopperEnv())
        eval_env = NormalizedBoxEnv(HopperEnv())
    elif 'Reacher' in args.env:
        expl_env = NormalizedBoxEnv(ReacherEnv())
        eval_env = NormalizedBoxEnv(ReacherEnv())
    elif 'Swimmer' in args.env:
        expl_env = NormalizedBoxEnv(SwimmerEnv())
        eval_env = NormalizedBoxEnv(SwimmerEnv())
    elif 'Walker2d' in args.env:
        expl_env = NormalizedBoxEnv(Walker2dEnv())
        eval_env = NormalizedBoxEnv(Walker2dEnv())
    else:
        raise ValueError(args.env)

    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
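
Note: Example #4 pulls its network, trainer, and algorithm settings out of variant. The top-level keys below are exactly the ones the function reads ('qf_kwargs', 'policy_kwargs', 'trainer_kwargs', 'algorithm_kwargs', 'replay_buffer_size'); the inner keys and values are illustrative assumptions in the style of rlkit defaults, not the author's actual configuration.

variant = dict(
    qf_kwargs=dict(hidden_sizes=[400, 300]),      # hidden layers for FlattenMlp
    policy_kwargs=dict(hidden_sizes=[400, 300]),  # hidden layers for TanhMlpPolicy
    trainer_kwargs=dict(discount=0.99),           # DDPGTrainer options (illustrative)
    algorithm_kwargs=dict(                        # TorchBatchRLAlgorithm options (illustrative)
        num_epochs=100,
        num_eval_steps_per_epoch=5000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=1000,
        batch_size=256,
    ),
    replay_buffer_size=int(1e6),
)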
Example #5
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.
    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters
    """

    th = 1.8
    g_max = 0.1
    #delta = 1e-7
    if args.env == 'CartPole':
        #CartPole

        env = TfEnv(normalize(CartPoleEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        grad_factor = 5
        th = 1.2
        #batchsize: 1
        # lr = 0.1
        # w = 2
        # c = 50

        #batchsize: 50
        lr = 0.75
        c = 3
        w = 2

        discount = 0.995
        path = './init/CartPole_policy.pth'

    if args.env == 'Walker':
        #Walker_2d
        env = TfEnv(normalize(Walker2dEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 2
        c = 12
        grad_factor = 6

        discount = 0.999

        name = 'Walk'
        path = './init/Walk_policy.pth'

    if args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))
        runner = LocalRunner(snapshot_config)

        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.6
        w = 1
        c = 4
        grad_factor = 5
        th = 1.2
        g_max = 0.06

        discount = 0.999

        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'

    if args.env == 'Hopper':
        #Hopper
        env = TfEnv(normalize(HopperEnv()))
        runner = LocalRunner(snapshot_config)

        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 1
        c = 3
        grad_factor = 6
        g_max = 0.15
        discount = 0.999

        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    for i in range(n_counts):
        # print(env.spec)
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)


        policy.load_state_dict(torch.load(path))
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = MBPG_HA(env_spec=env.spec,
                       env=env,
                       env_name=name,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_length,
                       discount=discount,
                       grad_factor=grad_factor,
                       policy_lr=lr,
                       c=c,
                       w=w,
                       th=th,
                       g_max=g_max,
                       n_timestep=n_timestep,
                       batch_size=batch_size,
                       center_adv=True,
                       # delta=delta
                       # decay_learning_rate=d_lr,
                       )

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
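
Note: run_task depends on a module-level args object with an env attribute ('CartPole', 'Walker', 'HalfCheetah', or 'Hopper'). The excerpt does not show how it is built; a minimal, hypothetical argparse setup that would provide it:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='Walker',
                    choices=['CartPole', 'Walker', 'HalfCheetah', 'Hopper'])
args = parser.parse_args()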
Example #6
def experiment(variant, args):
    # Doesn't work :(
    #import gym
    #expl_env = NormalizedBoxEnv( gym.make(args.env) )
    #eval_env = NormalizedBoxEnv( gym.make(args.env) )

    if 'Ant' in args.env:
        expl_env = NormalizedBoxEnv(AntEnv())
        eval_env = NormalizedBoxEnv(AntEnv())
    elif 'InvertedPendulum' in args.env:
        expl_env = NormalizedBoxEnv(InvertedPendulumEnv())
        eval_env = NormalizedBoxEnv(InvertedPendulumEnv())
    elif 'HalfCheetah' in args.env:
        expl_env = NormalizedBoxEnv(HalfCheetahEnv())
        eval_env = NormalizedBoxEnv(HalfCheetahEnv())
    elif 'Hopper' in args.env:
        expl_env = NormalizedBoxEnv(HopperEnv())
        eval_env = NormalizedBoxEnv(HopperEnv())
    elif 'Reacher' in args.env:
        expl_env = NormalizedBoxEnv(ReacherEnv())
        eval_env = NormalizedBoxEnv(ReacherEnv())
    elif 'Swimmer' in args.env:
        expl_env = NormalizedBoxEnv(SwimmerEnv())
        eval_env = NormalizedBoxEnv(SwimmerEnv())
    elif 'Walker2d' in args.env:
        expl_env = NormalizedBoxEnv(Walker2dEnv())
        eval_env = NormalizedBoxEnv(Walker2dEnv())
    else:
        raise ValueError(args.env)

    # Back to normal.
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    es = GaussianStrategy(
        action_space=expl_env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TD3Trainer(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
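
Note: both experiment() functions end with algorithm.to(ptu.device), so the device (and logging) has to be configured before they are called. A minimal entry-point sketch, assuming rlkit's pytorch_util and setup_logger utilities; parse_args() and make_variant() are hypothetical helpers standing in for whatever builds args and variant:

import rlkit.torch.pytorch_util as ptu
from rlkit.launchers.launcher_util import setup_logger

if __name__ == '__main__':
    args = parse_args()        # hypothetical: builds args.env as in the sketch after Example #5
    variant = make_variant()   # hypothetical: builds the variant dict read above
    setup_logger('td3-' + args.env, variant=variant)
    ptu.set_gpu_mode(True)     # point ptu.device at the GPU if one is available
    experiment(variant, args)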