def run_sql_experiment(main, mode, include_folders=None, log_dir=None,
                       exp_prefix="experiment", exp_name=None, **kwargs):
    if exp_name is None:
        exp_name = timestamp()

    if log_dir is None:
        log_dir = os.path.join(
            DEFAULT_LOG_DIR,
            "local",
            exp_prefix.replace("_", "-"),
            exp_name)

    if include_folders is None:
        include_folders = list()

    if mode == 'ec2':
        include_folders.append('softqlearning')
        all_symlinks = list()

        for folder in include_folders:
            all_symlinks.append(_create_symlink(folder))

        kwargs.update(added_project_directories=all_symlinks)

    run_experiment_lite(
        stub_method_call=main,
        mode=mode,
        exp_prefix=exp_prefix,
        exp_name=exp_name,
        log_dir=log_dir,
        **kwargs,
    )
Example #2
def _launch_ec2(func, exp_prefix, exp_name, params, run_experiment_kwargs):
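    """Launch a single EC2 task, merging default rllab settings with any
    caller-supplied run_experiment_lite keyword arguments."""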
    print("Launching task", exp_name)
    kwargs = dict(
        n_parallel=1,
        snapshot_mode="last",
        seed=params.get("seed", None),
        mode="ec2",
    )
    kwargs.update(run_experiment_kwargs)
    kwargs.update(dict(
        exp_prefix=exp_prefix,
        exp_name=exp_name,
        variant=params,
        confirm_remote=False))

    run_experiment_lite(func, **kwargs)
Example #3
def run_experiment(**params):
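    """Merge params into DEFAULTS, then train a recurrent TRPO agent on the
    grid-world conversation task via run_experiment_lite."""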
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params
    pprint(params)

    grid_world = SlaveGridWorldEnv("walled_chain",
                                   max_traj_length=DEFAULTS["max_path_length"],
                                   goal_reward=params["goal_reward"])
    agent = GridWorldMasterAgent(grid_world, match_reward=params["match_reward"])
    env = normalize(SituatedConversationEnvironment(env=grid_world, b_agent=agent))
    baseline = LinearFeatureBaseline(env)

    policy = RecurrentCategoricalPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_dims=params["policy_hidden_dims"],
            feature_network=MLPNetworkWithEmbeddings(
                "feature_network", env.observation_space.flat_dim,
                params["feature_dim"], params["feature_hidden_dims"],
                tf.tanh, tf.tanh, agent.vocab_size, params["embedding_dim"]),
            state_include_action=False,
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=params["batch_size"],
            max_path_length=params["max_path_length"],
            n_itr=params["n_itr"],
            discount=0.99,
            step_size=params["step_size"],
            optimizer=optimizer,
    )

    run_experiment_lite(
            algo.train(),
            n_parallel=15,
            snapshot_mode="last",
            exp_prefix="grid_world_sweep3",
            variant=params,
    )
Example #4
def run_experiment(params):
    params_base = copy.copy(DEFAULTS)
    params_base.update(params)
    params = params_base

    policy = RecurrentCategoricalPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_dims=params["policy_hidden_dims"],
            feature_network=MLPNetworkWithEmbeddings(
                "embeddings", len(VOCAB), params["feature_dim"],
                params["feature_hidden_dims"], tf.tanh, tf.tanh, len(VOCAB),
                params["embedding_dim"], has_other_input=False),
            state_include_action=False,
    )

    baseline = LinearFeatureBaseline(env.spec)

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=params["batch_size"],
            max_path_length=LENGTH,
            n_itr=params["n_itr"],
            discount=0.99,
            step_size=params["step_size"],
            optimizer=optimizer,
    )

    run_experiment_lite(
            algo.train(),
            n_parallel=5,
            snapshot_mode="last",
            exp_prefix="autoenc_unnorm_reward",
            variant=params,
    )
Example #5
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=params["batch_size"],
            max_path_length=5,
            n_itr=params["n_itr"],
            discount=0.99,
            step_size=params["step_size"],
            optimizer=optimizer,
    )

    run_experiment_lite(
            algo.train(),
            n_parallel=5,
            snapshot_mode="last",
            exp_prefix="grid_world_silent",
            variant=params,
    )
Example #6
from examples.point_env_randgoal import PointEnvRandGoal
from rllab.baselines.zero_baseline import ZeroBaseline
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.rocky.tf.algos.vpg import VPG
from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())

#env = TfEnv(normalize(PointEnv()))
env = TfEnv(normalize(PointEnvRandGoal()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
)
#baseline = LinearFeatureBaseline(env_spec=env.spec)
baseline = ZeroBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    #batch_size=20,
    max_path_length=5,
    n_itr=100,
    #plot=True,
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    exp_prefix='deleteme',
    exp_name='deleteme',
    #plot=True,
)
    batch_size=64,
    max_path_length=env.horizon,
    epoch_length=1000,
    min_pool_size=10000,
    n_epochs=args.num_epochs,
    discount=0.99,
    scale_reward=args.reward_scale,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    plot=False
)


run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_prefix="DDPG_" + args.env,
    seed=1,
    mode="ec2" if args.use_ec2 else "local",
    plot=False,
    # dry=True,
    terminate_machine=args.dont_terminate_machine,
    added_project_directories=[osp.abspath(osp.join(osp.dirname(__file__), '.'))]
)
                        )
                    else:
                        policy = GaussianMLPPolicy(env_spec=mdp.spec,
                                                   hidden_sizes=(32, 32),
                                                   init_std=10)

                        baseline = LinearFeatureBaseline(mdp.spec, )

                        batch_size = 50 * 250
                        algo = TRPO(env=mdp,
                                    policy=policy,
                                    baseline=baseline,
                                    batch_size=batch_size,
                                    whole_paths=True,
                                    max_path_length=50,
                                    n_itr=200,
                                    step_size=0.01,
                                    subsample_factor=1.0,
                                    **copyparams)

                    run_experiment_lite(
                        algo.train(),
                        exp_prefix="r_push_new_ours-quad1",
                        n_parallel=4,
                        # dry=True,
                        snapshot_mode="all",
                        seed=seed,
                        mode="ec2_mujoco",
                        # terminate_machine=False
                    )
Example #9
                    env=env,
                    policy=policy,
                    es=es,
                    qf=qf,
                    batch_size=size_of_batch,
                    max_path_length=100,
                    epoch_length=1000,
                    min_pool_size=10000,
                    n_epochs=number_of_episodes,
                    discount=discount_factor,
                    scale_reward=reward_scaling[r],
                    qf_learning_rate=critic_learning_rate[c],
                    policy_learning_rate=actor_learning_rate[c],
                    # Uncomment both lines (this and the plot parameter below) to enable plotting
                    # plot=True,
                )
                algo.train()

            run_experiment_lite(
                run_task,
                # Number of parallel workers for sampling
                n_parallel=1,
                # Only keep the snapshot parameters for the last iteration
                snapshot_mode="last",
                # Specifies the seed for the experiment. If this is not provided, a random seed
                # will be used
                exp_name="DDPG_HalfCheetah/" + "HalfCheetah",
                seed=1,
                # plot=True,
            )
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif 'linear' in bas:
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            algo = MAMLTRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=fast_batch_size, # number of trajs for grad update
                max_path_length=max_path_length,
                meta_batch_size=meta_batch_size,
                num_grad_updates=num_grad_updates,
                n_itr=100,
                use_maml=use_maml,
                step_size=meta_step_size,
                plot=False,
            )
            run_experiment_lite(
                algo.train(),
                n_parallel=1,
                snapshot_mode="last",
                python_command='python3',
                seed=1,
                exp_prefix='vpg_maml_point100',
                exp_name='trpomaml'+str(int(use_maml))+'_fbs'+str(fast_batch_size)+'_mbs'+str(meta_batch_size)+'_flr_' + str(fast_learning_rate) + 'metalr_' + str(meta_step_size) +'_step1'+str(num_grad_updates),
                plot=False,
            )
Example #11
            run_experiment_lite(
                # use_cloudpickle=False,
                stub_method_call=run_task,
                variant=vv,
                mode=mode,
                # Number of parallel workers for sampling
                n_parallel=n_parallel,
                # Only keep the snapshot parameters for the last iteration
                snapshot_mode="last",
                seed=vv['seed'],
                # plot=True,
                exp_prefix=exp_prefix,
                # exp_name=exp_name,
                sync_s3_pkl=True,
                # for sync the pkl file also during the training
                sync_s3_png=True,
                sync_s3_html=True,
                # # use this ONLY with ec2 or local_docker!!!
                pre_commands=[
                    'export MPLBACKEND=Agg',
                    'pip install --upgrade pip',
                    'pip install --upgrade -I tensorflow',
                    'pip install git+https://github.com/tflearn/tflearn.git',
                    'pip install dominate',
                    'pip install multiprocessing_on_dill',
                    'pip install scikit-image',
                    'conda install numpy -n rllab3 -y',
                ],
            )
            if mode == 'local_docker':
Example #12
            policy=policy,
            es=es,
            qf=qf,
            batch_size=batch_size_values[b],
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
            discount=0.99,
            scale_reward=1.0,
            qf_learning_rate=1e-3,
            policy_learning_rate=1e-4,
            # Uncomment both lines (this and the plot parameter below) to enable plotting
            plot=args.plot,
        )


        run_experiment_lite(
            algo.train(),
            # log_dir=args.data_dir,
            # Number of parallel workers for sampling
            n_parallel=1,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Batch_Size_Tune/" +  "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e),
            seed=1,
            plot=args.plot,
        )
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = normalize(CartpoleEnv())

policy = GaussianGRUPolicy(
    env_spec=env.spec,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=10,
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    seed=1,
)
Example #14
                if param.name == hrl_param.name:
                    param.set_value(hrl_param.get_value(borrow=True))

        for param in hrl_pol_param:
            for hrl_param in llc_param:
                if param.name == hrl_param.name:
                    param.set_value(hrl_param.get_value(borrow=True))

    for i in range(100):
        algo1.current_itr = 0
        algo2.current_itr = 0
        algo2.train(continue_learning=(i > 0))
        sep2int()

        algo1.train(continue_learning=(i > 0))
        int2sep()


run_experiment_lite(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=0,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    exp_prefix='Walker3d_async_hrl'
    # plot=True
)
Example #15
            baseline=baseline,
            batch_size=4000,  # 2x
            max_path_length=200,
            n_itr=n_itr,
            reset_arg=goal,
            optimizer_args={'init_learning_rate': step_sizes[step_i], 'tf_optimizer_args': {'learning_rate': 0.5*step_sizes[step_i]}, 'tf_optimizer_cls': tf.train.GradientDescentOptimizer}
        )


        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="all",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            seed=1,
            exp_prefix='ant_test_posticml',
            exp_name='test' + str(run_id),
            #plot=True,
        )



        # get return from the experiment
        with open('data/local/ant-test-posticml/test'+str(run_id)+'/progress.csv', 'r') as f:
            reader = csv.reader(f, delimiter=',')
            i = 0
            row = None
            returns = []

quantization_tunings = [1, 5, 15, 20]
discounts = [0.99]
participation_rates = [1]
agents_numbers = [5]
average_periods = [10]

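# Grid-search over every parameter combination below, launching one local run per setting.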
for quantization_tuning in quantization_tunings:
    for discount in discounts:
        for participation_rate in participation_rates:
            for agents_number in agents_numbers:
                for average_period in average_periods:
                    run_experiment_lite(
                        run_task,
                        exp_prefix="test_quantized",
                        # Number of parallel workers for sampling
                        n_parallel=1,
                        # Only keep the snapshot parameters for the last iteration
                        snapshot_mode="last",
                        # Specifies the seed for the experiment. If this is not provided, a random seed
                        # will be used
                        mode="local",
                        variant=dict(quantization_tuning=quantization_tuning,
                                     discount=discount,
                                     participation_rate=participation_rate,
                                     agents_number=agents_number,
                                     average_period=average_period)
                        # plot=True,
                        # terminate_machine=False,
                    )
Example #17
    assert 'render_every' in params['rollout_params']
    params['rollout_params']['render_every'] = None
    count = 1
    for i in range(options.n):
        l_bfgs_exception(params)
        aws_config = get_aws_config(count)
        run_experiment_lite(train,
                            exp_prefix=exp_prefix,
                            mode=mode,
                            variant=dict(mode=mode,
                                         params=params,
                                         use_gpu=True,
                                         seed=i),
                            dry=False,
                            aws_config=aws_config,
                            sync_s3_pkl=True,
                            sync_s3_png=True,
                            sync_s3_log=True,
                            pre_commands=[
                                "pip install --upgrade pip",
                                "pip install mpi4py", "pip install plotly",
                                "pip install pandas", "pip install seaborn"
                            ],
                            use_gpu=True
                            # terminate_machine=False
                            )
        print(count)
        count += 1
else:
    mode = "local"
    l_bfgs_exception(params)
    import colored_traceback.always
        # exp_prefix = "test"
        now = datetime.datetime.now(dateutil.tz.tzlocal())
        timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
        exp_name = 'TRPO_scratch__{}batch_{}length_{}id_{}_parallel{}'.format(
            # time_step_agg,
            int(batch_size),
            int(max_path_length),
            maze_id,
            timestamp,
            n_parallel)

        run_experiment_lite(
            stub_method_call=algo.train(),
            mode=mode,
            use_cloudpickle=False,
            pre_commands=[
                'pip install --upgrade pip',
                'pip install --upgrade theano',
            ],
            # Number of parallel workers for sampling
            n_parallel=n_parallel,

            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            seed=s,
            # Save to data/local/exp_prefix/exp_name/
            exp_prefix=exp_prefix,
            exp_name=exp_name,
            use_gpu=False,
        )
Example #19
        whole_paths=True,
        max_path_length=500,
        n_itr=10000,
        step_size=0.01,
        eta=eta,
        snn_n_samples=10,
        subsample_factor=1.0,
        use_replay_pool=True,
        use_kl_ratio=True,
        use_kl_ratio_q=True,
        n_itr_update=1,
        kl_batch_size=1,
        normalize_reward=False,
        replay_pool_size=1000000,
        n_updates_per_sample=5000,
        second_order_update=True,
        unn_n_hidden=[32],
        unn_layers_type=[1, 1],
        unn_learning_rate=0.0001
    )

    run_experiment_lite(
        algo.train(),
        exp_prefix="trpo-expl",
        n_parallel=4,
        snapshot_mode="last",
        seed=seed,
        mode="local",
        script="sandbox/vime/experiments/run_experiment_lite.py",
    )
Example #20
        max_path_length=1500,
        n_itr=4500,
        step_size=0.01,
        eta=eta,
        snn_n_samples=10,
        subsample_factor=1.0,
        use_replay_pool=True,
        use_kl_ratio=True,
        use_kl_ratio_q=True,
        n_itr_update=1,
        kl_batch_size=1,
        normalize_reward=False,
        replay_pool_size=1000000,
        n_updates_per_sample=5000,
        second_order_update=True,
        unn_n_hidden=[64],
        unn_layers_type=[1, 1],
        unn_learning_rate=0.0001
    )

    run_experiment_lite(
        algo.train(),
        exp_prefix="trpo-expl",
        n_parallel=1,
        snapshot_mode="last",
        seed=seed,
	args_data="/home/ubuntu/work/rllab/data/local/trpo-expl/trpo-expl_2016_06_29_01_56_06_0001/params.pkl",
        mode="local",
        script="sandbox/vime/experiments/run_experiment_lite.py",
    )
Example #21
stub(globals())

# Param ranges
seeds = range(2)
# SwimmerGather hierarchical task
mdp_classes = [SwimmerGatherEnv]
mdps = [NormalizedEnv(env=mdp_class()) for mdp_class in mdp_classes]
param_cart_product = itertools.product(mdps, seeds)

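# One TRPO run per (mdp, seed) pair in the cartesian product above.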
for mdp, seed in param_cart_product:

    policy = GaussianMLPPolicy(env_spec=mdp.spec, hidden_sizes=(64, 32))

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = 50000
    algo = TRPO(
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=500,
        n_itr=10000,
        step_size=0.01,
        subsample_factor=1.0,
    )

    run_experiment_lite(algo.train(), exp_prefix="trpo", n_parallel=4, snapshot_mode="last", seed=seed, mode="local")
Example #22
from examples.point_env import PointEnv  # assumed to live beside point_env_randgoal
from examples.point_env_randgoal import PointEnvRandGoal
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
#from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
#from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.algos.vpg import VPG
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())

env = TfEnv(normalize(PointEnv()))
#env = TfEnv(normalize(PointEnvRandGoal()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    #plot=True,
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    #plot=True,
)
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.categorical_mlp_policy import CategoricalMLPPolicy

# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(normalize(GymEnv("CartPole-v0", force_reset=True)))

policy = CategoricalMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    step_size=0.01,
    # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1
)
def run_evaluation(argv):
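    """Prepare and launch evaluation runs for every trained variant found
    under exp_prefix_dir, either locally or on EC2."""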

    # -------------------- Parse Arguments -----------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'exp_prefix_dir',
        type=str,
        help='path to dump dir which contains folders with '
        'the train results i.e. params.pkl and variant.json file')
    parser.add_argument(
        '--mode',
        type=str,
        default='local',
        help='Mode for running the experiments - local: runs on local machine, '
        'ec2: runs on AWS ec2 cluster (requires a proper configuration file)')
    parser.add_argument(
        '--n_parallel',
        type=int,
        default=1,
        help=
        'Number of parallel workers to perform rollouts. 0 => don\'t start any workers'
    )
    parser.add_argument('--num_sampled_envs',
                        type=int,
                        default=5,
                        help='number or environments with samples parameters')

    args = parser.parse_args(argv[1:])

    # ----------------------- EVALUATION ---------------------------------------

    exp_prefix = os.path.basename(args.exp_prefix_dir)
    eval_exp_prefix = exp_prefix + '-eval'
    evaluation_runs = eval.prepare_evaluation_runs(
        args.exp_prefix_dir,
        EXP_PREFIX,
        num_sampled_envs=args.num_sampled_envs)

    # ----------------------- AWS configuration ---------------------------------
    if args.mode == 'ec2':
        subnets = cheapest_subnets(ec2_instance, num_subnets=3)
        info = config.INSTANCE_TYPE_INFO[ec2_instance]
        config.AWS_INSTANCE_TYPE = ec2_instance
        config.AWS_SPOT_PRICE = str(info["price"])

        print("\n" + "**********" * 10 +
              "\nexp_prefix: {}\nvariants: {}".format('TRPO',
                                                      len(evaluation_runs)))
        print(
            'Running on type {}, with price {}, on the subnets: '.format(
                config.AWS_INSTANCE_TYPE,
                config.AWS_SPOT_PRICE,
            ), str(subnets))

    for eval_exp_name, v in evaluation_runs:

        if args.mode == 'ec2':
            subnet = random.choice(subnets)
            config.AWS_REGION_NAME = subnet[:-1]
            config.AWS_KEY_NAME = config.ALL_REGION_AWS_KEY_NAMES[
                config.AWS_REGION_NAME]
            config.AWS_IMAGE_ID = config.ALL_REGION_AWS_IMAGE_IDS[
                config.AWS_REGION_NAME]
            config.AWS_SECURITY_GROUP_IDS = \
                config.ALL_REGION_AWS_SECURITY_GROUP_IDS[
                    config.AWS_REGION_NAME]

        run_experiment_lite(
            run_eval_task,
            exp_prefix=eval_exp_prefix,
            exp_name=eval_exp_name,
            # Number of parallel workers for sampling
            n_parallel=args.n_parallel,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            seed=v["seed"],
            python_command='python3',
            pre_commands=[
                "yes | pip install --upgrade pip",
                "yes | pip install tensorflow=='1.6.0'",
                "yes | pip install --upgrade cloudpickle"
            ],
            mode=args.mode,
            use_cloudpickle=True,
            periodic_sync=True,
            variant=v,
            # plot=True,
            # terminate_machine=False,
        )
Example #25
policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            whole_paths=True,
            max_path_length=100,
            n_itr=20,
            discount=0.99,
            step_size=0.01,
            plot=False)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    log_dir="./results",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    plot=True,
)
Example #26
policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dim=128,
        state_include_action=False,
        #temperature=2,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)


algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        max_path_length=5,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)


run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        log_dir="./log",
)
Example #27
        INPUT_FEED)
    run_experiment_lite(
        algo.train(),
        n_parallel=1,
        snapshot_mode="all",
        python_command='python3',
        seed=seed,
        exp_prefix=str('PU_IL_' + time.strftime("%D").replace("/", "")[0:4]),
        exp_name=exp_name,
        plot=False,
        sync_s3_pkl=True,
        mode=mode,
        terminate_machine=True,
    )
Example #28
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()


run_experiment_lite(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_name="TRPO_Trial_Results/" + "Trial_GridWorld/",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    # plot=True,
)
Example #29
def run_experiment_old(
        task,
        exp_prefix='default',
        seed=None,
        variant=None,
        time_it=True,
        save_profile=False,
        profile_file='time_log.prof',
        mode='here',
        exp_id=0,
        unique_id=None,
        prepend_date_to_exp_prefix=True,
        use_gpu=False,
        snapshot_mode='last',
        snapshot_gap=1,
        n_parallel=0,
        base_log_dir=None,
        **run_experiment_lite_kwargs
):
    """
    Run a task via the rllab interface, i.e. serialize it and then run it via
    the run_experiment_lite script.

    This will soon be deprecated.

    :param task:
    :param exp_prefix:
    :param seed:
    :param variant:
    :param time_it: Add a "time" command to the python command?
    :param save_profile: Create a cProfile log?
    :param profile_file: Where to save the cProfile log.
    :param mode: 'here' will run the code in line, without any serialization
    Other options include 'local', 'local_docker', and 'ec2'. See
    run_experiment_lite documentation to learn what those modes do.
    :param exp_id: Experiment ID. Should be unique across all
    experiments. Note that one experiment may correspond to multiple seeds.
    :param unique_id: Unique ID should be unique across all runs--even different
    seeds!
    :param prepend_date_to_exp_prefix: If True, prefix "month-day_" to
    exp_prefix
    :param run_experiment_lite_kwargs: kwargs to be passed to
    `run_experiment_lite`
    :return:
    """
    if seed is None:
        seed = random.randint(0, 100000)
    if variant is None:
        variant = {}
    if unique_id is None:
        unique_id = str(uuid.uuid4())
    if prepend_date_to_exp_prefix:
        exp_prefix = time.strftime("%m-%d") + "_" + exp_prefix
    variant['seed'] = str(seed)
    variant['exp_id'] = str(exp_id)
    variant['unique_id'] = str(unique_id)
    logger.log("Variant:")
    logger.log(json.dumps(ppp.dict_to_safe_json(variant), indent=2))
    command_words = []
    if time_it:
        command_words.append('time')
    command_words.append('python')
    if save_profile:
        command_words += ['-m cProfile -o', profile_file]
    repo = git.Repo(os.getcwd())
    diff_string = repo.git.diff(None)
    commit_hash = repo.head.commit.hexsha
    script_name = "tmp"
    if mode == 'here':
        log_dir, exp_name = create_log_dir(exp_prefix, exp_id, seed,
                                           base_log_dir)
        data = dict(
            log_dir=log_dir,
            exp_name=exp_name,
            mode=mode,
            variant=variant,
            exp_id=exp_id,
            exp_prefix=exp_prefix,
            seed=seed,
            use_gpu=use_gpu,
            snapshot_mode=snapshot_mode,
            snapshot_gap=snapshot_gap,
            diff_string=diff_string,
            commit_hash=commit_hash,
            n_parallel=n_parallel,
            base_log_dir=base_log_dir,
            script_name=script_name,
        )
        save_experiment_data(data, log_dir)
    if mode == 'here':
        run_experiment_here(
            task,
            exp_prefix=exp_prefix,
            variant=variant,
            exp_id=exp_id,
            seed=seed,
            use_gpu=use_gpu,
            snapshot_mode=snapshot_mode,
            snapshot_gap=snapshot_gap,
            code_diff=diff_string,
            commit_hash=commit_hash,
            script_name=script_name,
            n_parallel=n_parallel,
            base_log_dir=base_log_dir,
        )
    else:
        if mode == "ec2" and use_gpu:
            if not query_yes_no(
                    "EC2 is more expensive with GPUs. Confirm?"
            ):
                sys.exit(1)
        code_diff = (
            base64.b64encode(cloudpickle.dumps(diff_string)).decode("utf-8")
        )
        run_experiment_lite(
            task,
            snapshot_mode=snapshot_mode,
            snapshot_gap=snapshot_gap,
            exp_prefix=exp_prefix,
            variant=variant,
            seed=seed,
            use_cloudpickle=True,
            python_command=' '.join(command_words),
            mode=mode,
            use_gpu=use_gpu,
            script="railrl/scripts/run_experiment_lite.py",
            code_diff=code_diff,
            commit_hash=commit_hash,
            script_name=script_name,
            n_parallel=n_parallel,
            **run_experiment_lite_kwargs
        )
Example #30
def run_trpo_vase(env, nRuns=20, seed_base=0, sigma_c=0.5, ablation_mode=False):
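    """Run TRPO with VASE exploration on the chosen environment, launching one
    run_experiment_lite job per seed in [seed_base, nRuns)."""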

    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

    for seed in range(seed_base,nRuns):

        if env == 'mountaincar':
            mdp = MountainCarEnvX()
            n_itr = 50
            max_path_length = 500
            type = 'classic'
        elif env == 'cartpole':
            mdp = NormalizedEnv(env=CartpoleSwingupEnvX())
            n_itr = 400
            max_path_length = 500
            type = 'classic'
        elif env == 'doublependulum':
            mdp = NormalizedEnv(env=DoublePendulumEnvX())
            n_itr = 400
            max_path_length = 500
            type = 'classic'
        elif env == 'halfcheetah':
            mdp = NormalizedEnv(env=HalfCheetahEnvX())
            n_itr = 600
            max_path_length = 500
            type = 'locomotion'
        elif env == 'ant':
            mdp = NormalizedEnv(env=AntEnv())
            n_itr = 600
            max_path_length = 500
            type = 'locomotion'
        elif env == 'lunarlander':
            mdp = NormalizedEnv(env=LunarLanderContinuous())
            n_itr = 100
            max_path_length = 1000
            type = 'classic'
        else:
            sys.stderr.write("Error! Environment '%s' not recognised\n" % env)
            sys.exit(-1)

        if type == 'classic':
            step_size = 0.01
            replay_pool_size = 100000
            policy_hidden_sizes = (32,)
            unn_n_hidden = [32]
            unn_layers_type=[1, 1]

            baseline = GaussianMLPBaseline(
                env_spec=mdp.spec,
                regressor_args={
                    'hidden_sizes': (32,),
                    'learn_std': False,
                    'hidden_nonlinearity': NL.rectify,
                    'optimizer': ConjugateGradientOptimizer(subsample_factor=1.0)
                }
            )
        else:
            step_size = 0.05
            replay_pool_size = 5000000
            policy_hidden_sizes = (64, 32)
            unn_n_hidden = [64, 64]
            unn_layers_type=[1, 1, 1]

            baseline = LinearFeatureBaseline(
                mdp.spec,
            )

        policy = GaussianMLPPolicy(
            env_spec=mdp.spec,
            hidden_sizes=policy_hidden_sizes,
            hidden_nonlinearity=NL.tanh
        )


        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            n_itr=n_itr,
            batch_size=5000,
            max_path_length=max_path_length,
            discount=0.995,
            gae_lambda=0.95,
            whole_paths=True,
            step_size=step_size,
            eta=1e-4,
            snn_n_samples=10,
            prior_sd=0.5,
            likelihood_sd=sigma_c,
            subsample_factor=1.0,
            use_replay_pool=True,
            replay_pool_size=replay_pool_size,
            n_updates_per_sample=500,
            unn_n_hidden=unn_n_hidden,
            unn_layers_type=unn_layers_type,
            unn_learning_rate=0.001
        )

        exp_name = "trpo-vase_%s_%04d" % (timestamp, seed + 1)
        if ablation_mode:
            cwd = os.getcwd()
            log_dir = cwd + "/data/local/sigmas/" + env + ("/%.3f/" % sigma_c) + exp_name
        else:
            log_dir = config.LOG_DIR + "/local/" + env +  "/" + exp_name

        run_experiment_lite(
            algo.train(),
            exp_name=exp_name,
            log_dir=log_dir,
            n_parallel=0,
            snapshot_mode="last",
            seed=seed,
            mode="local",
            script="sandbox/vase/experiments/run_experiment_lite.py"
        )
Example #31
    es=es,
    qf=qf,
    batch_size=35,
    max_path_length=100,
    epoch_length=5000,
    min_pool_size=10000,
    n_epochs=100,
    discount=0.99,
    scale_reward=variant["scale_reward"],
    soft_target_tau=1e-3,
    qf_learning_rate=variant["qf_learning_rate"],
    policy_learning_rate=variant["policy_learning_rate"],
    #Uncomment both lines (this and the plot parameter below) to enable plotting
    plot=True,
    eval_samples=5000,
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_prefix="dpg_search",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_name=str(num),
    seed=2,
    plot=True,
)
Example #32
                                                AssociatePublicIpAddress=True,
                                            )
                                        ]

                                        run_experiment_lite(
                                            stub_method_call=algo.train(),
                                            mode='ec2',
                                            use_cloudpickle=False,
                                            # Number of parallel workers for sampling
                                            n_parallel=n_parallel,
                                            # Only keep the snapshot parameters for the last iteration
                                            snapshot_mode="last",
                                            seed=s,
                                            # plot=True,
                                            exp_prefix=exp_prefix,
                                            exp_name=exp_name,
                                            sync_s3_pkl=True,
                                            # for sync the pkl file also during the training
                                            sync_s3_png=True,
                                            # # use this ONLY with ec2 or local_docker!!!
                                            pre_commands=[
                                                "which conda",
                                                "which python",
                                                "conda list -n rllab3",
                                                "conda install -f numpy -n rllab3 -y",
                                            ],
                                        )
                                    else:
                                        run_experiment_lite(
                                            stub_method_call=algo.train(),
                                            mode='local',
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy


def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()


run_experiment_lite(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    plot=True,
)
Example #34
    normalize(GymEnv("Walker2d-v1", record_video=False, force_reset=True)))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 64 hidden units.
    hidden_sizes=(64, 64),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=50000,
    max_path_length=env.horizon,
    n_itr=10,
    discount=0.995,
    step_size=0.01,
)

run_experiment_lite(
    algo.train(),
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    n_parallel=4,
    seed=0,
    # plot=True,
)
Example #35
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=HORIZON * N_ROLLOUTS,
        max_path_length=HORIZON,
        n_itr=1000,
        # whole_paths=True,
        discount=0.999,
    )
    algo.train()


exp_tag = "stabilizing_highway_%.3f" % RL_PENETRATION

for seed in [5]:  # , 20, 68, 72, 125]:
    run_experiment_lite(
        run_task,
        # Number of parallel workers for sampling
        n_parallel=N_CPUS,
        # Keeps the snapshot parameters for all iterations
        snapshot_mode="all",
        # Specifies the seed for the experiment. If this is not provided, a
        # random seed will be used
        seed=seed,
        mode="local",
        exp_prefix=exp_tag,
        # plot=True,
        sync_s3_pkl=True,
    )
Example #36
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    #
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=100,  #000,
        discount=0.99,
        step_size=0.0075,  # 0.01
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()


run_experiment_lite(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=5,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_name="testing",  # relu_small_network_ppo_capped_action_simpler_dense_layer_xW_learn_std_smaller_learning_rate
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=0,
    plot=True,
)
Example #37
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=500,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )


run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    use_gpu=True,
    # plot=True,
)

Example #38
        init_lr=0.001,
        n_itr=5,
        train_feature_network=True,
    )

    batch_size = 10000
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=1000,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
        sampler_cls=BatchSampler,
        optimizer_args={
            'num_slices' : 10,
        }
    )

    run_experiment_lite(
        algo.train(),
        exp_prefix='trpo_box3d_pixel_v11_tf',
        n_parallel=12,
        snapshot_mode="gap",
        snapshot_gap=200,
        seed=seed,
        mode="local"
    )
Example #39
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    #algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=1000,  # was 4k; 500 for path length of 5, 1000 for path length of 100
    max_path_length=100,
    n_itr=100,
    discount=0.99,
    step_size=0.01,
    #plot=True,
)
#algo.train()

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=4,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    exp_prefix='vpg_sensitive_point100',
    exp_name='oracleenv2',
    #plot=True,
)
            algo = SensitiveTRPO(
                #algo = SensitiveVPG(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=fast_batch_size,  # number of trajs for grad update
                max_path_length=max_path_length,
                meta_batch_size=meta_batch_size,
                num_grad_updates=num_grad_updates,
                n_itr=400,
                use_sensitive=use_sensitive,
                #optimizer_args={'tf_optimizer_args':{'learning_rate': learning_rate}},
                plot=False,
            )
            run_experiment_lite(
                algo.train(),
                n_parallel=0,
                snapshot_mode="last",
                seed=1,
                #exp_prefix='deleteme',
                #exp_name='deleteme'
                #exp_prefix='sensitive1dT5_2017_01_19',
                #exp_prefix='bugfix_sensitive0d_8tasks_T'+str(max_path_length)+'_2017_02_05',
                exp_prefix='trpo_sensitive_cheetah' + str(max_path_length),
                exp_name='sens' + str(int(use_sensitive)) + '_fbs' +
                str(fast_batch_size) + '_mbs' + str(meta_batch_size) +
                '_flr_' + str(fast_learning_rate) + '_lr_' +
                str(learning_rate) + '_step1' + str(num_grad_updates),
                plot=False,
            )
            hidden_sizes=(100,100),
        )
        if bas == 'zero':
            baseline = ZeroBaseline(env_spec=env.spec)
        elif 'linear' in bas:
            baseline = LinearFeatureBaseline(env_spec=env.spec)
        else:
            baseline = GaussianMLPBaseline(env_spec=env.spec)
        algo = MAMLTRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=fast_batch_size, # number of trajs for grad update
            max_path_length=max_path_length,
            meta_batch_size=meta_batch_size,
            num_grad_updates=num_grad_updates,
            n_itr=800,
            use_maml=use_maml,
            step_size=meta_step_size,
            plot=False,
        )
        run_experiment_lite(
            algo.train(),
            n_parallel=4,
            snapshot_mode="last",
            seed=1,
            exp_prefix='trpo_maml_4state',
            exp_name='trpo_maml'+str(int(use_maml))+'_fbs'+str(fast_batch_size)+'_mbs'+str(meta_batch_size)+'_flr_' + str(fast_learning_rate) + 'metalr_' + str(meta_step_size) +'_step1'+str(num_grad_updates),
            plot=False,
        )
Example #42
    parser.add_argument("env_name", type=str, help='available env_name:')
    parser.add_argument("random_seed", type=int)
    parser.add_argument("num_of_agents", type=int)
    parser.add_argument("temperature", type=float)
    parser.add_argument("batch_size", type=int, default=5000)

    args = parser.parse_args()
    env_name = env_map[args.env_name]
    prefix = prefix_map[args.env_name]
    n_epochs = n_epochs_map[args.env_name]
    random_seed = int(args.random_seed)
    run_function = function_map[args.algo]
    n_itr = n_epochs_map[args.env_name]
    num_of_agents = int(args.num_of_agents)
    temperature = float(args.temperature)
    learning_rate = learning_rate_map[args.env_name]
    batch_size = int(args.batch_size)

    if args.algo in (
            "multi_REINFORCE_stein",
            "multi_REINFORCE_stein_anneal",
            "multi_REINOFRCE_stein_reg",
            "multi_REINFORCE_stein_no_critic",
            "multi_REINFORCE_baseline_no_critic",
            "multi_REINFORCE_stein_evolution",
    ):
        args.algo = "{:}#{:}_temp={:}".format(args.algo, num_of_agents,
                                              args.temperature)

    run_experiment_lite(
        run_function,
        n_parallel=4,
        snapshot_mode="last",
        seed=random_seed,
        log_dir="./../exp_log/{:}_seed={:}_iter=500_env={:}_{:}".format(
            args.algo, random_seed, prefix, get_date()),
    )
        max_path_length=max_path_length,
        meta_batch_size=v['meta_batch_size'],
        num_grad_updates=num_grad_updates,
        n_itr=800,
        use_maml=use_maml,
        step_size=v['meta_step_size'],
        plot=False,
    )
    direc = 'direc' if direc else ''

    run_experiment_lite(
        algo.train(),
        exp_prefix='trpo_maml_cheetah' + direc + str(max_path_length),
        exp_name='maml'+str(int(use_maml))+'_fbs'+str(v['fast_batch_size'])+'_mbs'+str(v['meta_batch_size'])+'_flr_' + str(v['fast_lr'])  + '_mlr' + str(v['meta_step_size']),
        # Number of parallel workers for sampling
        n_parallel=8,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="gap",
        snapshot_gap=25,
        sync_s3_pkl=True,
        python_command='python3',
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        seed=v["seed"],
        mode="local",
        #mode="ec2",
        variant=v,
        # plot=True,
        # terminate_machine=False,
    )
Example #44
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=64 * 3 * horizon,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=1000,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()


exp_tag = "cooperative_merge_example"  # experiment prefix

for seed in [1]:  # , 5, 10, 56, 73]:
    run_experiment_lite(
        run_task,
        # Number of parallel workers for sampling
        n_parallel=1,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="all",
        # Specifies the seed for the experiment. If this is not provided, a
        # random seed will be used
        seed=seed,
        mode="local",  # "ec2"
        exp_prefix=exp_tag,
        # plot=True,
    )
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )

    run_experiment_lite(
        algo.train(),
        exp_prefix="first_exp",
        # Number of parallel workers for sampling
        n_parallel=1,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        seed=v["seed"],
        # mode="local",
        mode="ec2",
        variant=v,
        # plot=True,
        # terminate_machine=False,
    )
    sys.exit()
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=200,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
        optimizer_args={'num_slices': 10},
        sampler_cls=BatchSampler,
    )

    algorithm = ICM(
        mdp,
        algo,
        "/home/fred/box3d/trpo_box3d_state_v11_tf_icm_cos_ext0.9_%d" % seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.1,
        external_reward_weight=0.9,
        inverse_tanh=True,
        init_learning_rate=1e-4,
        n_updates_per_iter=500)

    run_experiment_lite(algorithm.train(),
                        exp_prefix='trpo_box3d_state_v11_tf_icm_cos_ext0.9',
                        n_parallel=8,
                        snapshot_mode="gap",
                        snapshot_gap=200,
                        seed=seed,
                        mode="local")
            load_policy=initial_params_file,
            baseline=baseline,
            batch_size=8000,
            max_path_length=200,
            n_itr=n_itr,
            reset_arg=goal,
            optimizer_args={'init_learning_rate': step_sizes[step_i], 'tf_optimizer_args': {'learning_rate': 0.01*step_sizes[step_i]}, 'tf_optimizer_cls': tf.train.GradientDescentOptimizer}
        )

        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="all",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            seed=goal_i,
            exp_prefix='antdirec_test',
            exp_name='test' + str(run_id),
            plot=True,
        )

        # get return from the experiment
        with open('data/local/antdirec-test/test'+str(run_id)+'/progress.csv', 'r') as f:
            reader = csv.reader(f, delimiter=',')
            i = 0
            row = None
            returns = []
            for row in reader:
                i+=1
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    # max_path_length = env.horizon
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=300,
        n_itr=10000,
        discount=0.99,
        step_size=0.02,
        # truncate_local_is_ratio = 0.2
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()


run_experiment_lite(
    run_task,
    log_dir='./log/trpo_mntcar_cont',
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    # plot=True,
)
def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()


run_experiment_lite(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    plot=True,
)
Example #50
        step_size=v['meta_step_size'],
        plot=False,
    )
    exp_name = 'Cellrobot_BigDog2trpo_maml' + task_var + '_' + str(
        max_path_length) + '_EXP' + str(exp_id)
    run_experiment_lite(
        algo.train(),
        exp_prefix=exp_name,
        exp_name='maml' + str(int(use_maml)) + '_fbs' +
        str(v['fast_batch_size']) + '_mbs' + str(v['meta_batch_size']) +
        '_flr_' + str(v['fast_lr']) + '_mlr' + str(v['meta_step_size']),
        # Number of parallel workers for sampling
        n_parallel=16,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="gap",
        snapshot_gap=2,
        sync_s3_pkl=True,
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        seed=v["seed"],
        mode="local",
        # mode="ec2",
        variant=v,
        # plot=True,
        # terminate_machine=False,
    )

    if ssh_FLAG:
        local_dir = os.path.abspath('data/local/' + exp_name + '/')
        remote_dir = '/home/drl/PycharmProjects/maml_rl-master/data/AWS_data/' + exp_name + '/'
        ssh.upload(local_dir,
            load_policy=initial_params_file,
            baseline=baseline,
            batch_size=4000,  # 2x
            max_path_length=100,
            n_itr=n_itr,
            optimizer_args={'init_learning_rate': step_sizes[step_i], 'tf_optimizer_args': {'learning_rate': 0.5*step_sizes[step_i]}, 'tf_optimizer_cls': tf.train.GradientDescentOptimizer}
        )


        run_experiment_lite(
            algo.train(),
            # Number of parallel workers for sampling
            n_parallel=4,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            seed=4,
            exp_prefix='trpopoint2d_test',
            exp_name='test',
            #plot=True,
        )
        import pdb; pdb.set_trace()
        # get return from the experiment
        with open('data/local/trpopoint2d-test/test/progress.csv', 'r') as f:
            reader = csv.reader(f, delimiter=',')
            i = 0
            row = None
            returns = []
            for row in reader:
                i+=1
else:
    env = TfEnv(normalize(SwimmerEnv()))
    batch_size = 20
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(100,100),
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
#baseline = ZeroBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=500*batch_size,
    max_path_length=500,
    n_itr=500,
    #plot=True,
    optimizer_args={'tf_optimizer_args':{'learning_rate': 1e-3}},
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,  # try increasing this to make it faster??? (Maybe need to modify code for this)
    snapshot_mode="last",
    seed=1,
    exp_prefix='vpgswimmer',
    #exp_name='basic',
    exp_name='randomenv',
    #plot=True,
)