Code Example #1
File: trpo_point.py  Project: ferric123/robotarm
def run_task(*_):
    env = normalize(PointEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=500,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
Code Example #2
File: trpo_runner.py  Project: pcmoritz/flow
def run_task(*_):
    """Implement the ``run_task`` method needed to run experiments with rllab.

    Note that the flow-specific parameters are imported at the start of this
    script and unzipped and processed here.
    """
    env_name = flow_params["env_name"]
    exp_tag = flow_params["exp_tag"]
    sumo_params = flow_params["sumo"]
    vehicles = flow_params["veh"]
    env_params = flow_params["env"]
    net_params = flow_params["net"]
    initial_config = flow_params.get("initial", InitialConfig())
    traffic_lights = flow_params.get("tls", TrafficLights())

    # import the scenario and generator classes
    module = __import__("flow.scenarios", fromlist=[flow_params["scenario"]])
    scenario_class = getattr(module, flow_params["scenario"])
    module = __import__("flow.scenarios", fromlist=[flow_params["generator"]])
    generator_class = getattr(module, flow_params["generator"])

    # create the scenario object
    scenario = scenario_class(name=exp_tag,
                              generator_class=generator_class,
                              vehicles=vehicles,
                              net_params=net_params,
                              initial_config=initial_config,
                              traffic_lights=traffic_lights)

    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    horizon = flow_params["env"].horizon

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=horizon * (N_ROLLOUTS - PARALLEL_ROLLOUTS + 1),
        max_path_length=horizon,
        n_itr=500,
        discount=0.999,
        step_size=0.01,
    )
    algo.train()
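The docstring above notes that run_task is the entry point rllab expects. For context, here is a minimal sketch of how such a run_task function is typically handed to rllab's run_experiment_lite launcher; the n_parallel, snapshot_mode, seed, and exp_prefix values below are illustrative assumptions, not taken from the project.

# Minimal launch sketch (assumed settings, not part of the original example)
from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    # number of parallel workers used for sampling rollouts
    n_parallel=1,
    # keep only the snapshot from the last iteration
    snapshot_mode="last",
    # fix the seed so the run is reproducible
    seed=1,
    # prefix for the experiment's log directory (illustrative name)
    exp_prefix="trpo_flow_example",
)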
Code Example #3
def test_trpo_deterministic_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=10,
                batch_size=1000,
                max_path_length=100,
                step_size=0.01)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
Code Example #4
def test_trpo_relu_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_nonlinearity=naive_relu,
                               hidden_sizes=(1, ))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=1,
                batch_size=1000,
                max_path_length=100,
                step_size=0.001)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
Code Example #5
def run_task(v):

    print("_________________________________")
    print("#################################")
    print("_________________________________")
    print("_________________________________")
    print("#################################")
    print("###    agents_number : " + str(agents_number) + "    ####")
    print("###                          ####")
    print("### participation_rate : " + str(participation_rate) + " ####")
    print("###                          ####")
    print("###    average_period : " + str(average_period) + "   ####")
    print("###                          ####")
    print("### quantization_tuning : " + str(quantization_tuning) + " ####")
    print("###                          ####")
    print("###     discount : " + str(discount) + "      ####")
    print("#################################")
    print("_________________________________")
    print("_________________________________")
    print("#################################")
    print("_________________________________")

    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = Server(
        participation_rate=participation_rate,
        agents_number=agents_number,
        average_period=average_period,
        env=env,
        policy=policy,
        baseline=baseline,
        difference_params=True,
        quantize=True,
        quantization_tuning=quantization_tuning,
        batch_size=400,
        max_path_length=100,
        n_itr=50,
        discount=discount,
        step_size=0.01,
        # Uncomment the line below to enable plotting
        # plot=True,
    )

    algo.train()
Code Example #6
def run_task(*_):
    tot_cars = 6
    auton_cars = 6

    sumo_params = SumoParams(time_step=0.1,  rl_speed_mode="no_collide", sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add_vehicles("rl", (RLController, {}), (StaticLaneChanger, {}), (ContinuousRouter, {}), 0, auton_cars)

    env_params = EnvParams(additional_params={"target_velocity": 25, "num_steps": 1000})

    additional_net_params = {"length": 220, "lanes": 1, "speed_limit": 30, "resolution": 40}
    net_params = NetParams(additional_params=additional_net_params)

    initial_config = InitialConfig()

    scenario = LoopScenario("rl-test", CircleGenerator, vehicles, net_params, initial_config)

    env_name = "SimpleAccelerationEnvironment"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    logging.info("Experiment Set Up complete")

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16,)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=2,  # 1000
        # discount=0.99,
        # step_size=0.01,
    )
    algo.train()
Code Example #7
File: PDO_point_gather.py  Project: victor856/cpo
def run_task(*_):
    f = open('/home/qingkai/verina.csv', "w+")
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor),
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    algo = PDO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=50000,
        max_path_length=15,
        n_itr=100,
        gae_lambda=0.95,
        discount=0.995,
        safety_tradeoff_coeff_lr=1e-1,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        #plot=True,
    )

    algo.train()
    f.close()
Code Example #8
def experiment_scratch_baseline():
    k = 100  # number of points passed to StandardControllerEnv below (num_points=k)

    for seed in [10, 30, 50, 100]:
        for _ in range(4):
            env = StandardControllerEnv(k=4,
                                        noise=0.05,
                                        num_dynamics=4,
                                        num_points=k)
            now = datetime.datetime.now()
            timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(
                    32,
                    32,
                ),
            )
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=1000,
                max_path_length=env.horizon,
                n_itr=100,
                discount=0.995,
                step_size=0.001,
                plot=False,
            )
            run_experiment_lite(
                algo.train(),
                # Number of parallel workers for sampling
                n_parallel=4,
                # Only keep the snapshot parameters for the last iteration
                snapshot_mode="last",
                # script="scripts/run_experiment_lite_rl.py",
                script="scripts/run_experiment_lite.py",
                exp_name=os.path.join("Baseline %d" % k, timestamp),
                log_dir=os.path.join(
                    "Results/Controls/Increasing_Points/Baseline", timestamp),
                # Specifies the seed for the experiment. If this is not
                # provided, a random seed will be used.
                seed=seed,
                # plot=True,
            )
Code Example #9
File: test_walker.py  Project: parthchadha/metarl
def test(num=1, path="./Results/Tmp", save=False):
    # env = normalize(GymEnv("BipedalWalkerPit-v2"))
    env = normalize(GymEnv("BipedalWalker-v2", record_video=False))
    # env = DoublePendulumEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    rollout(env, policy)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        max_path_length=env.horizon,
        n_itr=100,
        discount=0.995,
        step_size=0.01,
        # plot=True,
    )
Code Example #10
File: trpo_gym_sym.py  Project: VincentYu68/rllab
def run_task(*_):
    env = normalize(
        GymEnv("DartWalker3d-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, with 128 and 64 hidden units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    #policy = joblib.load('data/local/experiment/walker3d_symmetry1_sd13_2alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_2d_hardvelenforce_contsupport/policy.pkl')

    # increase policy std a bit for exploration
    #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5)

    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)


    algo = TRPO_Symmetry(
        env=env,
        policy=policy,
        baseline=baseline,

        batch_size=60000,

        max_path_length=env.horizon,
        n_itr=500,

        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        observation_permutation=np.array([0.0001,-1, 2,-3,-4, -5,-6,7, 14,-15,-16, 17, 18,-19, 8,-9,-10, 11, 12,-13,\
                                          20,21,-22, 23,-24,-25, -26,-27,28, 35,-36,-37, 38, 39,-40, 29,-30,-31, 32, 33,-34, 42, 41]),
        #observation_permutation=np.array([0.0001, 1, 5,6,7, 2,3,4, 8,9,10, 14,15,16, 11,12,13]),
        #action_permutation=np.array([3,4,5, 0.00001,1,2]),
        action_permutation=np.array([-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]),

        sym_loss_weight=2.0,
        whole_paths=False,
    )
    algo.train()
Code Example #11
def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
Code Example #12
def run_task(*_):
    env = normalize(
        GymEnv("DartHumanWalker-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, with 128 and 64 hidden units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    #policy = joblib.load('data/local/experiment/humanwalker_symmetry1_sd11_1alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_spd20002000/policy.pkl')

    # increase policy std a bit for exploration
    #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5)

    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)


    algo = TRPO_Symmetry(
        env=env,
        policy=policy,
        baseline=baseline,

        batch_size=50000,

        max_path_length=env.horizon,
        n_itr=1000,

        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        observation_permutation=np.array([0.0001,-1,2,-3,-4, -11,12,-13,14,15,16, -5,6,-7,8,9,10, -17,18, -19, -24,25,-26,27, -20,21,-22,23,\
                                          28,29,-30,31,-32,-33, -40,41,-42,43,44,45, -34,35,-36,37,38,39, -46,47, -48, -53,54,-55,56, -49,50,-51,52, 58,57]),
        action_permutation=np.array([-6,7,-8, 9, 10,11,  -0.001,1,-2, 3, 4,5, -12,13, -14, -19,20,-21,22, -15,16,-17,18]),

        sym_loss_weight=1.0,
        action_reg_weight=0.0,
        whole_paths=False,
    )
    algo.train()
Code Example #13
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'Acrobot-v2'
    env = GymEnv(env_name)
    policy = GaussianMLPPolicy(env_spec=env, hidden_sizes=(64, 64))
    algo = PPO(env=env,
               policy=policy,
               n_itr=1500,
               batch_size=8000,
               max_path_length=1000,
               discount=0.95,
               store_paths=True,
               entropy_weight=ent_wt,
               baseline=LinearFeatureBaseline(env_spec=env))
    data_path = 'data/acrobat_data_rllab_ppo/%s/' % exp_name
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
Code Example #14
def run_vime(vv):
    setup_rllab_logging(vv)
    seed = vv['seed']
    eta = 0.0001
    path_len = vv['path_len']
    mdp = get_env(vv)
    policy = GaussianMLPPolicy(
        env_spec=mdp.spec,
        hidden_sizes=(300, 200, 100),
        init_std=1.0,
    )

    baseline = LinearFeatureBaseline(
        mdp.spec,
    )

    batch_size = path_len * 100
    algo = TRPOVIME(
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=path_len,
        n_itr=1000,
        step_size=0.01,
        eta=eta,
        snn_n_samples=10,
        subsample_factor=1.0,
        use_replay_pool=True,
        use_kl_ratio=True,
        use_kl_ratio_q=True,
        n_itr_update=1,
        kl_batch_size=1,
        normalize_reward=False,
        replay_pool_size=1000000,
        n_updates_per_sample=5000,
        second_order_update=True,
        unn_n_hidden=[32],
        unn_layers_type=[1, 1],
        unn_learning_rate=0.0001
    )
    algo.train()
Code Example #15
def run_task(v):

    which_agent = v["which_agent"]
    env, _ = create_env(which_agent)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    optimizer_params = dict(base_eps=1e-5)

    #how many iters
    num_trpo_iters = 2500
    if (which_agent == 1):
        num_trpo_iters = 2500
    if (which_agent == 2):
        steps_per_rollout = 333
        num_trpo_iters = 200
    if (which_agent == 4):
        num_trpo_iters = 2000
    if (which_agent == 6):
        num_trpo_iters = 2000

    #recreate the policy
    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(v["depth_fc_layers"],
                                             v["depth_fc_layers"]),
                               init_std=v["std_on_mlp_policy"])
    all_params = np.concatenate(
        (v["policy_values"], policy._l_log_std.get_params()[0].get_value()))
    policy.set_param_values(all_params)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v["trpo_batchsize"],
                max_path_length=v["steps_per_rollout"],
                n_itr=num_trpo_iters,
                discount=0.995,
                optimizer=v["ConjugateGradientOptimizer"](
                    hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)),
                step_size=0.05,
                plot_true=True)

    #train the policy
    algo.train()
Code Example #16
        def run_task(*_):
            env = normalize(GymEnv(models[k]))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            learn_std = True
            init_std = 1

            # hidden_sizes = NN_sizes[i]
            # hidden_sizes=(8,)
            # hidden_sizes=(32, 32)
            hidden_sizes = (100, 50, 25)

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=hidden_sizes,
                learn_std=learn_std,
                init_std=init_std
            )

            # =======================
            # Defining the algorithm
            # =======================
            batch_size = 5000
            n_itr = 200
            gamma = .99
            step_size = 0.01
            # max_path_length = 96,

            # algo = VPG(
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                # max_path_length=max_path_length,
                n_itr=n_itr,
                discount=gamma,
                step_size=step_size
            )
            algo.train()
Code Example #17
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = GaussianMLPPolicy(env_spec=env, hidden_sizes=(64, 64))
    baseline = GaussianMLPBaseline(env_spec=env)
    algo = PPO(env=env,
               policy=policy,
               n_itr=1500,
               batch_size=8000,
               max_path_length=1000,
               discount=0.99,
               store_paths=True,
               entropy_weight=ent_wt,
               baseline=baseline)
    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace(
        '-', '_'), str(algo.__class__.__name__), exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
Code Example #18
def run_task(v):

    env, _ = create_env(v["which_agent"])
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    optimizer_params = dict(base_eps=1e-5)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v["batch_size"],
                max_path_length=v["steps_per_rollout"],
                n_itr=v["num_trpo_iters"],
                discount=0.995,
                optimizer=v["ConjugateGradientOptimizer"](
                    hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)),
                step_size=0.05,
                plot_true=True)

    #train the policy
    algo.train()
Code Example #19
        def run_task(*_):
            env = normalize(GymEnv('HovorkaInterval-v0'))
            # env.wrapped_env.env.env.env.reward_flag = 'absolute'
            env.wrapped_env.env.env.reward_flag = reward_functions[k]

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            learn_std = True
            init_std = 1

            hidden_sizes = NN_sizes[i]
            # hidden_sizes=(8,)
            # hidden_sizes=(32, 32)
            # hidden_sizes=(100, 50, 25)

            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=hidden_sizes,
                                       learn_std=learn_std,
                                       init_std=init_std)

            # =======================
            # Defining the algorithm
            # =======================
            batch_size = 5000
            n_itr = 200
            gamma = .99
            step_size = 0.01
            # max_path_length = 96,

            algo = VPG(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                # max_path_length=max_path_length,
                n_itr=n_itr,
                discount=gamma,
                step_size=step_size)
            algo.train()
Code Example #20
File: main_benchmark.py  Project: Xingchen-Yu/S2VGD
def run_vpg_baseline_large_batch_size_no_critic(*_):
    env = normalize(env_name())
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(50, 25),
        adaptive_std=False,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size * num_of_agents,
        max_path_length=500,
        n_itr=n_itr,
        discount=0.99,
        optimizer_args={'learning_rate': learning_rate},
        sampler_cls=BatchSampler_no_critic,
    )
    algo.train()
Code Example #21
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        # plot=True enables live plotting of a rollout during training
        plot=True)
    algo.train()
Code Example #22
def run_task(*_):
    # This custom environment is not registered automatically because of an rllab bug.
    # See https://github.com/openai/rllab/issues/68
    # For now the problem is bypassed by adding the import statement
    # in gym_env.py.
    import gym_follower_2d
    import lasagne.nonlinearities as NL

    gymenv = GymEnv(args.env,
                    force_reset=True,
                    record_video=False,
                    record_log=True)
    env = normalize(gymenv)

    logger.log("Training Policy on %s" % args.env)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=(100, 50, 25),
                               hidden_nonlinearity=NL.tanh)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=100,
        n_itr=args.num_epochs,
        discount=0.99,
        step_size=args.step_size,
        optimizer=ConjugateGradientOptimizer(
            reg_coeff=args.reg_coeff,
            hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff)),
        plot=False,
    )

    algo.train()
Code Example #23
def run_trpo(vv):
    setup_rllab_logging(vv)
    path_len = vv['path_len']
    env = get_env(vv)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(300, 200, 100),
        init_std=1.0,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100 * path_len,
        max_path_length=path_len,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
Code Example #24
def run_task(*_):
    """TRY OUT normalized environment"""
    env = normalize(TendonOneSegmentEnv())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # output_nonlinearity=NL.tanh
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=np.inf,
        n_itr=20001,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
Code Example #25
def run_task(*_):
    # env = normalize(SwimmerWrapperGym('Swimmer-v1'))
    env = normalize(GymEnv('Swimmer-v1'))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        learn_std=True)

    print('horizon {}'.format(env.horizon))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
Code Example #26
def rllab_vpg_launcher(variant):
	from rllab.algos.trpo import TRPO
	from rllab.algos.vpg import VPG
	from railrl.launchers.launcher_util import get_env_settings
	from railrl.algos.icm_trpo import ICM
	from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
	from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
	env_settings = get_env_settings(**variant['env_params'])
	env = TfEnv(env_settings['env'])
	policy = GaussianMLPPolicy(
		name="policy",
		env_spec=env.spec,
		hidden_sizes=(32, 32)
	)

	baseline = LinearFeatureBaseline(env_spec=env.spec)

	algorithm = VPG(
		env=env,
		policy=policy,
		baseline=baseline,
		**variant['algo_params']
	)
	algorithm.train()
Code Example #27
def run_task(v):

    env = ServerEnv(agents_number, -10, 10)

    policy = GaussianMLPPolicy(env_spec=env.agents_envs[0].spec, )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = Server(
        agents_number=agents_number,
        average_period=average_period,
        server_env=env,
        policy=policy,
        baseline=baseline,
        batch_size=400,
        max_path_length=100,
        n_itr=20,
        discount=0.99,
        step_size=0.01,
        # Uncomment the line below to enable plotting
        # plot=True,
    )

    algo.train()
Code Example #28
def run_task(*_):
    env = normalize(GymEnv(args.env))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = args.learn_std
    init_std = args.init_std

    if args.hidden_sizes == 0:
        hidden_sizes = (8, )
    elif args.hidden_sizes == 1:
        hidden_sizes = (32, 32)
    elif args.hidden_sizes == 2:
        hidden_sizes = (100, 50, 25)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = args.batch_size
    n_itr = args.n_itr
    gamma = args.gamma
    step_size = args.step_size

    algo = TNPG(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=batch_size,
                n_itr=n_itr,
                discount=gamma,
                step_size=step_size)
    algo.train()
Code Example #29
File: trpo_gym.py  Project: kpeeters14/rl_gcg
def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0", record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 8 hidden units.
        hidden_sizes=(8, 8)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment the line below to enable plotting
        # plot=True,
    )
    algo.train()
Code Example #30
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import NormalizedEnv

from rllab.algos.trpo import TRPO
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.bradly.third_person.envs.reacher import ReacherEnv
from rllab.envs.gym_env import GymEnv
stub(globals())
env = GymEnv("Reacher3DOF-v1", mode='oracle', force_reset=True)#, imsize=(48,48))   

# env = TfEnv(normalize(ReacherEnv()))
policy = GaussianMLPPolicy(
    # name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32),
    init_std=10
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=25000,
    max_path_length=50,
    n_itr=1000,
    discount=0.99,
    step_size=0.01,