Example #1
def run_task(*_):
    # CSV files opened up front for logging and closed after training.
    f = open('/home/qingkai/verina.csv', "w+")
    ff = open('/home/qingkai/cpo_dual.csv', "w+")
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = AntGatherEnv(apple_reward=10,
                       bomb_cost=1,
                       n_apples=2,
                       activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    # Value-function baseline for the reward advantages.
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor),
        })

    # Separate baseline fitted to the safety (cost) returns.
    safety_baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor),
        },
        target_key='safety_returns')

    safety_constraint = GatherSafetyConstraint(max_value=0.2,
                                               baseline=safety_baseline)

    algo = CPO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        safety_gae_lambda=0.5,
        batch_size=100000,
        max_path_length=500,
        n_itr=2000,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        # plot=True,
    )

    algo.train()

    f.close()
    ff.close()
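
In rllab, a run_task function like this is normally handed to the experiment launcher rather than called directly. A minimal launch sketch, assuming the standard rllab helper run_experiment_lite; the exp_prefix and seed values are illustrative, not from the source:

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,          # sampler workers
    snapshot_mode="last",  # keep only the latest snapshot
    seed=1,
    exp_prefix="cpo_ant_gather",  # illustrative experiment name
)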
Example #2
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 safety_constraint=None,
                 pdo_vf_mode=1,
                 **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            optimizer = ConjugateGradientOptimizer(**optimizer_args)

        # These options are fixed by this class (see the super() call below),
        # so drop any caller-supplied values before forwarding kwargs.
        pop_keys = [
            'safety_constrained_optimizer', 'safety_tradeoff',
            'learn_safety_tradeoff_coeff', 'safety_key'
        ]

        for key in pop_keys:
            kwargs.pop(key, None)

        if pdo_vf_mode == 1:
            # One-VF PDO does not use a safety baseline, so the safety signal
            # is taken from raw returns rather than advantages.
            safety_key = 'returns'
        else:
            safety_key = 'advantages'

        # Two-VF PDO needs a value function attached to the safety constraint;
        # if none is provided, fall back to one-VF PDO.
        if pdo_vf_mode == 2 and not hasattr(safety_constraint, 'baseline'):
            logger.log(
                "Warning: selected two-VF PDO, without providing VF for safety constraint."
            )
            logger.log("Defaulting to one-VF PDO.")
            pdo_vf_mode = 1
            safety_key = 'returns'

        super(PDO_OFF, self).__init__(optimizer=optimizer,
                                      safety_constrained_optimizer=False,
                                      safety_constraint=safety_constraint,
                                      safety_tradeoff=True,
                                      learn_safety_tradeoff_coeff=True,
                                      safety_key=safety_key,
                                      pdo_vf_mode=pdo_vf_mode,
                                      **kwargs)
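
PDO_OFF stays in two-VF mode only when the safety constraint carries its own value function (the hasattr check above). A sketch of how a caller could attach one, mirroring the safety_baseline pattern from Example #1; names and hyperparameters are illustrative:

safety_baseline = GaussianMLPBaseline(
    env_spec=env.spec,
    regressor_args={
        'hidden_sizes': (64, 32),
        'hidden_nonlinearity': NL.tanh,
        'learn_std': False,
        'step_size': trpo_stepsize,
        'optimizer': ConjugateGradientOptimizer(
            subsample_factor=trpo_subsample_factor),
    },
    target_key='safety_returns')

safety_constraint = GatherSafetyConstraint(max_value=0.2,
                                           baseline=safety_baseline)
# Then pass safety_constraint together with pdo_vf_mode=2 to PDO_OFF,
# as in Example #5.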
Example #3
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 safety_constrained_optimizer=True,
                 safety_constraint=None,
                 **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict()
            # Use the constrained CG optimizer only when a safety constraint
            # is given and constrained optimization is requested.
            if safety_constraint is not None and safety_constrained_optimizer:
                optimizer = ConjugateConstraintOptimizer(**optimizer_args)
            else:
                optimizer = ConjugateGradientOptimizer(**optimizer_args)
        super(TRPO, self).__init__(
            optimizer=optimizer,
            safety_constrained_optimizer=safety_constrained_optimizer,
            safety_constraint=safety_constraint,
            **kwargs)
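
The optimizer selection above gives two construction paths: with a safety constraint and the default safety_constrained_optimizer=True, the constrained conjugate-gradient optimizer is used; without one, the class degenerates to plain TRPO. A rough sketch, assuming the surrounding class is the TRPO subclass named in the super() call and that env, policy, baseline, and safety_constraint are set up as in the other examples:

# Constrained path: ConjugateConstraintOptimizer is selected internally.
algo_safe = TRPO(env=env, policy=policy, baseline=baseline,
                 safety_constraint=safety_constraint)

# Unconstrained path: plain ConjugateGradientOptimizer, i.e. vanilla TRPO.
algo_plain = TRPO(env=env, policy=policy, baseline=baseline,
                  safety_constraint=None)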
Example #4
def run_task(*_):
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor),
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.1)

    algo = PDO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=50000,
        max_path_length=15,
        n_itr=100,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        #plot=True,
    )

    algo.train()
Example #5
def run_task(*_):
    f = open('/home/qingkai/verina.csv', "w+")
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor),
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    # Off-policy DDPG components: deterministic policy, OU exploration noise,
    # and separate Q-functions for reward and cost.
    ddpg_policy = DeterministicMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=(64, 32))

    ddpg_es = OUStrategy(env_spec=env.spec)

    ddpg_qf = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=(100, 100))
    ddpg_qf_cost = ContinuousMLPQFunction(env_spec=env.spec,
                                          hidden_sizes=(100, 100))

    offline_itr_n = 100000

    algo = PDO_OFF(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=20000,
        max_path_length=15,
        n_itr=200,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        ddpg_policy=ddpg_policy,
        ddpg_qf=ddpg_qf,
        ddpg_qf_cost=ddpg_qf_cost,
        ddpg_es=ddpg_es,
        ddpg_dual_var=0,
        ddpg_batch_size=64,
        ddpg_qf_learning_rate=1e-4,
        ddpg_qf_cost_learning_rate=1e-4,
        ddpg_dual_learning_rate=1e-3,
        ddpg_policy_learning_rate=1e-3,
        ddpg_scale_reward=1,
        ddpg_scale_cost=1,
        offline_itr_n=offline_itr_n,
        balance=0,
        safety_tradeoff_coeff_lr=1e-2,
        ddpg_avg_horizon=offline_itr_n,
        adjust_epoch=5,
        ddpg_qf_weight_decay=0.,
        #plot=True,
    )

    algo.train()
    f.close()
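
The rllab building blocks used across these snippets live at rllab's usual module paths; the CPO, PDO, PDO_OFF, and GatherSafetyConstraint classes come from the safe-RL extension and their import paths are not shown in these examples. A sketch of the rllab-side imports, assuming the Theano/Lasagne rllab distribution (note that the GaussianMLPBaseline in Example #1 accepts target_key, which suggests the extension ships its own variant of that class):

import lasagne.nonlinearities as NL
from rllab.envs.mujoco.gather.ant_gather_env import AntGatherEnv
from rllab.envs.mujoco.gather.point_gather_env import PointGatherEnv
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer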