def run_sql_experiment(main, mode, include_folders=None, log_dir=None,
                       exp_prefix="experiment", exp_name=None, **kwargs):
    if exp_name is None:
        exp_name = timestamp()

    if log_dir is None:
        log_dir = os.path.join(
            DEFAULT_LOG_DIR,
            "local",
            exp_prefix.replace("_", "-"),
            exp_name)

    if include_folders is None:
        include_folders = list()

    if mode == 'ec2':
        include_folders.append('softqlearning')

    all_symlinks = list()
    for folder in include_folders:
        all_symlinks.append(_create_symlink(folder))

    kwargs.update(added_project_directories=all_symlinks)

    run_experiment_lite(
        stub_method_call=main,
        mode=mode,
        exp_prefix=exp_prefix,
        exp_name=exp_name,
        log_dir=log_dir,
        **kwargs,
    )
def _launch_ec2(func, exp_prefix, exp_name, params, run_experiment_kwargs):
    print("Launching task", exp_name)
    kwargs = dict(
        n_parallel=1,
        snapshot_mode="last",
        seed=params.get("seed", None),
        mode="ec2",
    )
    kwargs.update(run_experiment_kwargs)
    kwargs.update(dict(
        exp_prefix=exp_prefix,
        exp_name=exp_name,
        variant=params,
        confirm_remote=False))

    run_experiment_lite(func, **kwargs)
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params
    pprint(params)

    grid_world = SlaveGridWorldEnv(
        "walled_chain",
        max_traj_length=DEFAULTS["max_path_length"],
        goal_reward=params["goal_reward"])
    agent = GridWorldMasterAgent(grid_world, match_reward=params["match_reward"])
    env = normalize(SituatedConversationEnvironment(env=grid_world, b_agent=agent))
    baseline = LinearFeatureBaseline(env)

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings(
            "feature_network", env.observation_space.flat_dim,
            params["feature_dim"], params["feature_hidden_dims"],
            tf.tanh, tf.tanh, agent.vocab_size, params["embedding_dim"]),
        state_include_action=False,
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=params["max_path_length"],
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=15,
        snapshot_mode="last",
        exp_prefix="grid_world_sweep3",
        variant=params,
    )
def run_experiment(params):
    params_base = copy.copy(DEFAULTS)
    params_base.update(params)
    params = params_base

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings(
            "embeddings", len(VOCAB), params["feature_dim"],
            params["feature_hidden_dims"], tf.tanh, tf.tanh, len(VOCAB),
            params["embedding_dim"], has_other_input=False),
        state_include_action=False,
    )

    baseline = LinearFeatureBaseline(env.spec)

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=LENGTH,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="autoenc_unnorm_reward",
        variant=params,
    )
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=5,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="grid_world_silent",
        variant=params,
    )
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())

# env = TfEnv(normalize(PointEnv()))
env = TfEnv(normalize(PointEnvRandGoal()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
)
# baseline = LinearFeatureBaseline(env_spec=env.spec)
baseline = ZeroBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    # batch_size=20,
    max_path_length=5,
    n_itr=100,
    # plot=True,
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    exp_prefix='deleteme',
    exp_name='deleteme',
    # plot=True,
)
    batch_size=64,
    max_path_length=env.horizon,
    epoch_length=1000,
    min_pool_size=10000,
    n_epochs=args.num_epochs,
    discount=0.99,
    scale_reward=args.reward_scale,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    plot=False,
)

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_prefix="DDPG_" + args.env,
    seed=1,
    mode="ec2" if args.use_ec2 else "local",
    plot=False,
    # dry=True,
    terminate_machine=args.dont_terminate_machine,
    added_project_directories=[osp.abspath(osp.join(osp.dirname(__file__), '.'))],
)
    )
else:
    policy = GaussianMLPPolicy(env_spec=mdp.spec, hidden_sizes=(32, 32), init_std=10)

baseline = LinearFeatureBaseline(mdp.spec)

batch_size = 50 * 250
algo = TRPO(
    env=mdp,
    policy=policy,
    baseline=baseline,
    batch_size=batch_size,
    whole_paths=True,
    max_path_length=50,
    n_itr=200,
    step_size=0.01,
    subsample_factor=1.0,
    **copyparams
)

run_experiment_lite(
    algo.train(),
    exp_prefix="r_push_new_ours-quad1",
    n_parallel=4,
    # dry=True,
    snapshot_mode="all",
    seed=seed,
    mode="ec2_mujoco",
    # terminate_machine=False
)
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=size_of_batch,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=number_of_episodes,
        discount=discount_factor,
        scale_reward=reward_scaling[r],
        qf_learning_rate=critic_learning_rate[c],
        policy_learning_rate=actor_learning_rate[c],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()


run_experiment_lite(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_name="DDPG_HalfCheetah/" + "HalfCheetah",
    seed=1,
    # plot=True,
)
)

if bas == 'zero':
    baseline = ZeroBaseline(env_spec=env.spec)
elif 'linear' in bas:
    baseline = LinearFeatureBaseline(env_spec=env.spec)
else:
    baseline = GaussianMLPBaseline(env_spec=env.spec)

algo = MAMLTRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=fast_batch_size,  # number of trajs for grad update
    max_path_length=max_path_length,
    meta_batch_size=meta_batch_size,
    num_grad_updates=num_grad_updates,
    n_itr=100,
    use_maml=use_maml,
    step_size=meta_step_size,
    plot=False,
)

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    python_command='python3',
    seed=1,
    exp_prefix='vpg_maml_point100',
    exp_name='trpomaml' + str(int(use_maml)) + '_fbs' + str(fast_batch_size) + '_mbs' + str(meta_batch_size) + '_flr_' + str(fast_learning_rate) + 'metalr_' + str(meta_step_size) + '_step1' + str(num_grad_updates),
    plot=False,
)
run_experiment_lite(
    # use_cloudpickle=False,
    stub_method_call=run_task,
    variant=vv,
    mode=mode,
    # Number of parallel workers for sampling
    n_parallel=n_parallel,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    seed=vv['seed'],
    # plot=True,
    exp_prefix=exp_prefix,
    # exp_name=exp_name,
    sync_s3_pkl=True,  # for sync the pkl file also during the training
    sync_s3_png=True,
    sync_s3_html=True,
    # # use this ONLY with ec2 or local_docker!!!
    pre_commands=[
        'export MPLBACKEND=Agg',
        'pip install --upgrade pip',
        'pip install --upgrade -I tensorflow',
        'pip install git+https://github.com/tflearn/tflearn.git',
        'pip install dominate',
        'pip install multiprocessing_on_dill',
        'pip install scikit-image',
        'conda install numpy -n rllab3 -y',
    ],
)
if mode == 'local_docker':
    policy=policy,
    es=es,
    qf=qf,
    batch_size=batch_size_values[b],
    max_path_length=env.horizon,
    epoch_length=1000,
    min_pool_size=10000,
    n_epochs=args.num_epochs,
    discount=0.99,
    scale_reward=1.0,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    plot=args.plot,
)

run_experiment_lite(
    algo.train(),
    # log_dir=args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Batch_Size_Tune/"
             + "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e),
    seed=1,
    plot=args.plot,
)
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = normalize(CartpoleEnv())

policy = GaussianGRUPolicy(
    env_spec=env.spec,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=10,
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    seed=1,
)
            if param.name == hrl_param.name:
                param.set_value(hrl_param.get_value(borrow=True))
    for param in hrl_pol_param:
        for hrl_param in llc_param:
            if param.name == hrl_param.name:
                param.set_value(hrl_param.get_value(borrow=True))

    for i in range(100):
        algo1.current_itr = 0
        algo2.current_itr = 0
        algo2.train(continue_learning=(i > 0))
        sep2int()
        algo1.train(continue_learning=(i > 0))
        int2sep()


run_experiment_lite(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=0,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    exp_prefix='Walker3d_async_hrl',
    # plot=True
)
    baseline=baseline,
    batch_size=4000,  # 2x
    max_path_length=200,
    n_itr=n_itr,
    reset_arg=goal,
    optimizer_args={'init_learning_rate': step_sizes[step_i],
                    'tf_optimizer_args': {'learning_rate': 0.5 * step_sizes[step_i]},
                    'tf_optimizer_cls': tf.train.GradientDescentOptimizer}
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=4,
    # Keep the snapshot parameters for all iterations
    snapshot_mode="all",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    exp_prefix='ant_test_posticml',
    exp_name='test' + str(run_id),
    # plot=True,
)

# get return from the experiment
with open('data/local/ant-test-posticml/test' + str(run_id) + '/progress.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    i = 0
    row = None
    returns = []
quantization_tunings = [1, 5, 15, 20]
discounts = [0.99]
participation_rates = [1]
agents_numbers = [5]
average_periods = [10]

for quantization_tuning in quantization_tunings:
    for discount in discounts:
        for participation_rate in participation_rates:
            for agents_number in agents_numbers:
                for average_period in average_periods:
                    run_experiment_lite(
                        run_task,
                        exp_prefix="test_quantized",
                        # Number of parallel workers for sampling
                        n_parallel=1,
                        # Only keep the snapshot parameters for the last iteration
                        snapshot_mode="last",
                        # Specifies the seed for the experiment. If this is not provided, a random seed
                        # will be used
                        mode="local",
                        variant=dict(quantization_tuning=quantization_tuning,
                                     discount=discount,
                                     participation_rate=participation_rate,
                                     agents_number=agents_number,
                                     average_period=average_period),
                        # plot=True,
                        # terminate_machine=False,
                    )
    assert 'render_every' in params['rollout_params']
    params['rollout_params']['render_every'] = None

    count = 1
    for i in range(options.n):
        l_bfgs_exception(params)
        aws_config = get_aws_config(count)
        run_experiment_lite(
            train,
            exp_prefix=exp_prefix,
            mode=mode,
            variant=dict(mode=mode, params=params, use_gpu=True, seed=i),
            dry=False,
            aws_config=aws_config,
            sync_s3_pkl=True,
            sync_s3_png=True,
            sync_s3_log=True,
            pre_commands=[
                "pip install --upgrade pip",
                "pip install mpi4py",
                "pip install plotly",
                "pip install pandas",
                "pip install seaborn"
            ],
            use_gpu=True,
            # terminate_machine=False
        )
        print(count)
        count += 1
else:
    mode = "local"
    l_bfgs_exception(params)

import colored_traceback.always
# exp_prefix = "test"
now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
exp_name = 'TRPO_scratch__{}batch_{}length_{}id_{}_parallel{}'.format(
    # time_step_agg,
    int(batch_size), int(max_path_length), maze_id, timestamp, n_parallel)

run_experiment_lite(
    stub_method_call=algo.train(),
    mode=mode,
    use_cloudpickle=False,
    pre_commands=[
        'pip install --upgrade pip',
        'pip install --upgrade theano',
    ],
    # Number of parallel workers for sampling
    n_parallel=n_parallel,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    seed=s,
    # Save to data/local/exp_prefix/exp_name/
    exp_prefix=exp_prefix,
    exp_name=exp_name,
    use_gpu=False,
)
    whole_paths=True,
    max_path_length=500,
    n_itr=10000,
    step_size=0.01,
    eta=eta,
    snn_n_samples=10,
    subsample_factor=1.0,
    use_replay_pool=True,
    use_kl_ratio=True,
    use_kl_ratio_q=True,
    n_itr_update=1,
    kl_batch_size=1,
    normalize_reward=False,
    replay_pool_size=1000000,
    n_updates_per_sample=5000,
    second_order_update=True,
    unn_n_hidden=[32],
    unn_layers_type=[1, 1],
    unn_learning_rate=0.0001
)

run_experiment_lite(
    algo.train(),
    exp_prefix="trpo-expl",
    n_parallel=4,
    snapshot_mode="last",
    seed=seed,
    mode="local",
    script="sandbox/vime/experiments/run_experiment_lite.py",
)
    max_path_length=1500,
    n_itr=4500,
    step_size=0.01,
    eta=eta,
    snn_n_samples=10,
    subsample_factor=1.0,
    use_replay_pool=True,
    use_kl_ratio=True,
    use_kl_ratio_q=True,
    n_itr_update=1,
    kl_batch_size=1,
    normalize_reward=False,
    replay_pool_size=1000000,
    n_updates_per_sample=5000,
    second_order_update=True,
    unn_n_hidden=[64],
    unn_layers_type=[1, 1],
    unn_learning_rate=0.0001
)

run_experiment_lite(
    algo.train(),
    exp_prefix="trpo-expl",
    n_parallel=1,
    snapshot_mode="last",
    seed=seed,
    args_data="/home/ubuntu/work/rllab/data/local/trpo-expl/trpo-expl_2016_06_29_01_56_06_0001/params.pkl",
    mode="local",
    script="sandbox/vime/experiments/run_experiment_lite.py",
)
stub(globals())

# Param ranges
seeds = range(2)
# SwimmerGather hierarchical task
mdp_classes = [SwimmerGatherEnv]
mdps = [NormalizedEnv(env=mdp_class()) for mdp_class in mdp_classes]
param_cart_product = itertools.product(mdps, seeds)

for mdp, seed in param_cart_product:
    policy = GaussianMLPPolicy(env_spec=mdp.spec, hidden_sizes=(64, 32))

    baseline = LinearFeatureBaseline(mdp.spec)

    batch_size = 50000
    algo = TRPO(
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=500,
        n_itr=10000,
        step_size=0.01,
        subsample_factor=1.0,
    )

    run_experiment_lite(
        algo.train(),
        exp_prefix="trpo",
        n_parallel=4,
        snapshot_mode="last",
        seed=seed,
        mode="local",
    )
from examples.point_env_randgoal import PointEnvRandGoal
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
# from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
# from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv

stub(globals())

env = TfEnv(normalize(PointEnv()))
# env = TfEnv(normalize(PointEnvRandGoal()))
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    # plot=True,
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    # plot=True,
)
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(normalize(GymEnv("CartPole-v0", force_reset=True)))

policy = CategoricalMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    step_size=0.01,
    # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
)
def run_evaluation(argv):
    # -------------------- Parse Arguments -----------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'exp_prefix_dir',
        type=str,
        help='path to dump dir which contains folders with '
             'the train results i.e. params.pkl and variant.json file')
    parser.add_argument(
        '--mode',
        type=str,
        default='local',
        help='Mode for running the experiments - local: runs on local machine, '
             'ec2: runs on AWS ec2 cluster (requires a proper configuration file)')
    parser.add_argument(
        '--n_parallel',
        type=int,
        default=1,
        help='Number of parallel workers to perform rollouts. '
             '0 => don\'t start any workers')
    parser.add_argument(
        '--num_sampled_envs',
        type=int,
        default=5,
        help='number of environments with sampled parameters')
    args = parser.parse_args(argv[1:])

    # ----------------------- EVALUATION ---------------------------------------
    exp_prefix = os.path.basename(args.exp_prefix_dir)
    eval_exp_prefix = exp_prefix + '-eval'
    evaluation_runs = eval.prepare_evaluation_runs(
        args.exp_prefix_dir, EXP_PREFIX, num_sampled_envs=args.num_sampled_envs)

    # ----------------------- AWS configuration ---------------------------------
    if args.mode == 'ec2':
        subnets = cheapest_subnets(ec2_instance, num_subnets=3)
        info = config.INSTANCE_TYPE_INFO[ec2_instance]
        config.AWS_INSTANCE_TYPE = ec2_instance
        config.AWS_SPOT_PRICE = str(info["price"])

        print("\n" + "**********" * 10 +
              "\nexp_prefix: {}\nvariants: {}".format('TRPO', len(evaluation_runs)))
        print('Running on type {}, with price {}, on the subnets: '.format(
            config.AWS_INSTANCE_TYPE,
            config.AWS_SPOT_PRICE,
        ), str(subnets))

    for eval_exp_name, v in evaluation_runs:
        if args.mode == 'ec2':
            subnet = random.choice(subnets)
            config.AWS_REGION_NAME = subnet[:-1]
            config.AWS_KEY_NAME = config.ALL_REGION_AWS_KEY_NAMES[
                config.AWS_REGION_NAME]
            config.AWS_IMAGE_ID = config.ALL_REGION_AWS_IMAGE_IDS[
                config.AWS_REGION_NAME]
            config.AWS_SECURITY_GROUP_IDS = \
                config.ALL_REGION_AWS_SECURITY_GROUP_IDS[
                    config.AWS_REGION_NAME]

        run_experiment_lite(
            run_eval_task,
            exp_prefix=eval_exp_prefix,
            exp_name=eval_exp_name,
            # Number of parallel workers for sampling
            n_parallel=args.n_parallel,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # Specifies the seed for the experiment. If this is not provided, a random seed
            # will be used
            seed=v["seed"],
            python_command='python3',
            pre_commands=[
                "yes | pip install --upgrade pip",
                "yes | pip install tensorflow=='1.6.0'",
                "yes | pip install --upgrade cloudpickle"
            ],
            mode=args.mode,
            use_cloudpickle=True,
            periodic_sync=True,
            variant=v,
            # plot=True,
            # terminate_machine=False,
        )
policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            whole_paths=True,
            max_path_length=100,
            n_itr=20,
            discount=0.99,
            step_size=0.01,
            plot=False)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    log_dir="./results",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    plot=True,
)
policy = RecurrentCategoricalPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_dim=128,
    state_include_action=False,
    # temperature=2,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=50000,
    max_path_length=5,
    n_itr=50,
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
)

run_experiment_lite(
    algo.train(),
    n_parallel=5,
    snapshot_mode="last",
    log_dir="./log",
)
    INPUT_FEED)

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="all",
    python_command='python3',
    seed=seed,
    exp_prefix=str('PU_IL_' + time.strftime("%D").replace("/", "")[0:4]),
    exp_name=exp_name,
    plot=False,
    sync_s3_pkl=True,
    mode=mode,
    terminate_machine=True,
)
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()


run_experiment_lite(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_name="TRPO_Trial_Results/" + "Trial_GridWorld/",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    # plot=True,
)
def run_experiment_old(
        task,
        exp_prefix='default',
        seed=None,
        variant=None,
        time_it=True,
        save_profile=False,
        profile_file='time_log.prof',
        mode='here',
        exp_id=0,
        unique_id=None,
        prepend_date_to_exp_prefix=True,
        use_gpu=False,
        snapshot_mode='last',
        snapshot_gap=1,
        n_parallel=0,
        base_log_dir=None,
        **run_experiment_lite_kwargs
):
    """
    Run a task via the rllab interface, i.e. serialize it and then run it via
    the run_experiment_lite script.

    This will soon be deprecated.

    :param task:
    :param exp_prefix:
    :param seed:
    :param variant:
    :param time_it: Add a "time" command to the python command?
    :param save_profile: Create a cProfile log?
    :param profile_file: Where to save the cProfile log.
    :param mode: 'here' will run the code in line, without any serialization.
        Other options include 'local', 'local_docker', and 'ec2'. See
        run_experiment_lite documentation to learn what those modes do.
    :param exp_id: Experiment ID. Should be unique across all experiments. Note
        that one experiment may correspond to multiple seeds.
    :param unique_id: Unique ID should be unique across all runs--even
        different seeds!
    :param prepend_date_to_exp_prefix: If True, prefix "month-day_" to
        exp_prefix
    :param run_experiment_lite_kwargs: kwargs to be passed to
        `run_experiment_lite`
    :return:
    """
    if seed is None:
        seed = random.randint(0, 100000)
    if variant is None:
        variant = {}
    if unique_id is None:
        unique_id = str(uuid.uuid4())
    if prepend_date_to_exp_prefix:
        exp_prefix = time.strftime("%m-%d") + "_" + exp_prefix
    variant['seed'] = str(seed)
    variant['exp_id'] = str(exp_id)
    variant['unique_id'] = str(unique_id)
    logger.log("Variant:")
    logger.log(json.dumps(ppp.dict_to_safe_json(variant), indent=2))

    command_words = []
    if time_it:
        command_words.append('time')
    command_words.append('python')
    if save_profile:
        command_words += ['-m cProfile -o', profile_file]

    repo = git.Repo(os.getcwd())
    diff_string = repo.git.diff(None)
    commit_hash = repo.head.commit.hexsha
    script_name = "tmp"

    if mode == 'here':
        log_dir, exp_name = create_log_dir(exp_prefix, exp_id, seed,
                                           base_log_dir)
        data = dict(
            log_dir=log_dir,
            exp_name=exp_name,
            mode=mode,
            variant=variant,
            exp_id=exp_id,
            exp_prefix=exp_prefix,
            seed=seed,
            use_gpu=use_gpu,
            snapshot_mode=snapshot_mode,
            snapshot_gap=snapshot_gap,
            diff_string=diff_string,
            commit_hash=commit_hash,
            n_parallel=n_parallel,
            base_log_dir=base_log_dir,
            script_name=script_name,
        )
        save_experiment_data(data, log_dir)

    if mode == 'here':
        run_experiment_here(
            task,
            exp_prefix=exp_prefix,
            variant=variant,
            exp_id=exp_id,
            seed=seed,
            use_gpu=use_gpu,
            snapshot_mode=snapshot_mode,
            snapshot_gap=snapshot_gap,
            code_diff=diff_string,
            commit_hash=commit_hash,
            script_name=script_name,
            n_parallel=n_parallel,
            base_log_dir=base_log_dir,
        )
    else:
        if mode == "ec2" and use_gpu:
            if not query_yes_no(
                    "EC2 is more expensive with GPUs. Confirm?"
            ):
                sys.exit(1)
        code_diff = (
            base64.b64encode(cloudpickle.dumps(diff_string)).decode("utf-8")
        )
        run_experiment_lite(
            task,
            snapshot_mode=snapshot_mode,
            snapshot_gap=snapshot_gap,
            exp_prefix=exp_prefix,
            variant=variant,
            seed=seed,
            use_cloudpickle=True,
            python_command=' '.join(command_words),
            mode=mode,
            use_gpu=use_gpu,
            script="railrl/scripts/run_experiment_lite.py",
            code_diff=code_diff,
            commit_hash=commit_hash,
            script_name=script_name,
            n_parallel=n_parallel,
            **run_experiment_lite_kwargs
        )
def run_trpo_vase(env, nRuns=20, seed_base=0, sigma_c=0.5, ablation_mode=False):
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

    for seed in range(seed_base, nRuns):
        if env == 'mountaincar':
            mdp = MountainCarEnvX()
            n_itr = 50
            max_path_length = 500
            type = 'classic'
        elif env == 'cartpole':
            mdp = NormalizedEnv(env=CartpoleSwingupEnvX())
            n_itr = 400
            max_path_length = 500
            type = 'classic'
        elif env == 'doublependulum':
            mdp = NormalizedEnv(env=DoublePendulumEnvX())
            n_itr = 400
            max_path_length = 500
            type = 'classic'
        elif env == 'halfcheetah':
            mdp = NormalizedEnv(env=HalfCheetahEnvX())
            n_itr = 600
            max_path_length = 500
            type = 'locomotion'
        elif env == 'ant':
            mdp = NormalizedEnv(env=AntEnv())
            n_itr = 600
            max_path_length = 500
            type = 'locomotion'
        elif env == 'lunarlander':
            mdp = NormalizedEnv(env=LunarLanderContinuous())
            n_itr = 100
            max_path_length = 1000
            type = 'classic'
        else:
            sys.stderr.write("Error! Environment '%s' not recognised\n" % env)
            sys.exit(-1)

        if type == 'classic':
            step_size = 0.01
            replay_pool_size = 100000
            policy_hidden_sizes = (32,)
            unn_n_hidden = [32]
            unn_layers_type = [1, 1]
            baseline = GaussianMLPBaseline(
                env_spec=mdp.spec,
                regressor_args={
                    'hidden_sizes': (32,),
                    'learn_std': False,
                    'hidden_nonlinearity': NL.rectify,
                    'optimizer': ConjugateGradientOptimizer(subsample_factor=1.0)
                }
            )
        else:
            step_size = 0.05
            replay_pool_size = 5000000
            policy_hidden_sizes = (64, 32)
            unn_n_hidden = [64, 64]
            unn_layers_type = [1, 1, 1]
            baseline = LinearFeatureBaseline(mdp.spec)

        policy = GaussianMLPPolicy(
            env_spec=mdp.spec,
            hidden_sizes=policy_hidden_sizes,
            hidden_nonlinearity=NL.tanh
        )

        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            n_itr=n_itr,
            batch_size=5000,
            max_path_length=max_path_length,
            discount=0.995,
            gae_lambda=0.95,
            whole_paths=True,
            step_size=step_size,
            eta=1e-4,
            snn_n_samples=10,
            prior_sd=0.5,
            likelihood_sd=sigma_c,
            subsample_factor=1.0,
            use_replay_pool=True,
            replay_pool_size=replay_pool_size,
            n_updates_per_sample=500,
            unn_n_hidden=unn_n_hidden,
            unn_layers_type=unn_layers_type,
            unn_learning_rate=0.001
        )

        exp_name = "trpo-vase_%s_%04d" % (timestamp, seed + 1)
        if ablation_mode:
            cwd = os.getcwd()
            log_dir = cwd + "/data/local/sigmas/" + env + ("/%.3f/" % sigma_c) + exp_name
        else:
            log_dir = config.LOG_DIR + "/local/" + env + "/" + exp_name

        run_experiment_lite(
            algo.train(),
            exp_name=exp_name,
            log_dir=log_dir,
            n_parallel=0,
            snapshot_mode="last",
            seed=seed,
            mode="local",
            script="sandbox/vase/experiments/run_experiment_lite.py"
        )
    es=es,
    qf=qf,
    batch_size=35,
    max_path_length=100,
    epoch_length=5000,
    min_pool_size=10000,
    n_epochs=100,
    discount=0.99,
    scale_reward=variant["scale_reward"],
    soft_target_tau=1e-3,
    qf_learning_rate=variant["qf_learning_rate"],
    policy_learning_rate=variant["policy_learning_rate"],
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    plot=True,
    eval_samples=5000,
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_prefix="dpg_search",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_name=str(num),
    seed=2,
    plot=True,
)
                AssociatePublicIpAddress=True,
            )
        ]

        run_experiment_lite(
            stub_method_call=algo.train(),
            mode='ec2',
            use_cloudpickle=False,
            # Number of parallel workers for sampling
            n_parallel=n_parallel,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            seed=s,
            # plot=True,
            exp_prefix=exp_prefix,
            exp_name=exp_name,
            sync_s3_pkl=True,  # for sync the pkl file also during the training
            sync_s3_png=True,
            # # use this ONLY with ec2 or local_docker!!!
            pre_commands=[
                "which conda",
                "which python",
                "conda list -n rllab3",
                "conda install -f numpy -n rllab3 -y",
            ],
        )
    else:
        run_experiment_lite(
            stub_method_call=algo.train(),
            mode='local',
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy


def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()


run_experiment_lite(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    plot=True,
)
    normalize(GymEnv("Walker2d-v1", record_video=False, force_reset=True)))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 64 hidden units.
    hidden_sizes=(64, 64),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=50000,
    max_path_length=env.horizon,
    n_itr=10,
    discount=0.995,
    step_size=0.01,
)

run_experiment_lite(
    algo.train(),
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    n_parallel=4,
    seed=0,
    # plot=True,
)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=HORIZON * N_ROLLOUTS,
        max_path_length=HORIZON,
        n_itr=1000,
        # whole_paths=True,
        discount=0.999,
    )
    algo.train()


exp_tag = "stabilizing_highway_%.3f" % RL_PENETRATION

for seed in [5]:  # , 20, 68, 72, 125]:
    run_experiment_lite(
        run_task,
        # Number of parallel workers for sampling
        n_parallel=N_CPUS,
        # Keeps the snapshot parameters for all iterations
        snapshot_mode="all",
        # Specifies the seed for the experiment. If this is not provided, a
        # random seed will be used
        seed=seed,
        mode="local",
        exp_prefix=exp_tag,
        # plot=True,
        sync_s3_pkl=True,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    #
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=100,  # 000,
        discount=0.99,
        step_size=0.0075,  # 0.01
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()


run_experiment_lite(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=5,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_name="testing",  # relu_small_network_ppo_capped_action_simpler_dense_layer_xW_learn_std_smaller_learning_rate
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=0,
    plot=True,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,
    n_itr=500,
    discount=0.99,
    step_size=0.01,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    use_gpu=True,
    # plot=True,
)
    init_lr=0.001,
    n_itr=5,
    train_feature_network=True,
)

batch_size = 10000
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=batch_size,
    whole_paths=True,
    max_path_length=1000,
    n_itr=1000,
    step_size=0.01,
    subsample_factor=1.0,
    sampler_cls=BatchSampler,
    optimizer_args={
        'num_slices': 10,
    }
)

run_experiment_lite(
    algo.train(),
    exp_prefix='trpo_box3d_pixel_v11_tf',
    n_parallel=12,
    snapshot_mode="gap",
    snapshot_gap=200,
    seed=seed,
    mode="local"
)
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
# algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=1000,  # was 4k; 500 for path length of 5, 1000 for path length of 100
    max_path_length=100,
    n_itr=100,
    discount=0.99,
    step_size=0.01,
    # plot=True,
)
# algo.train()

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=4,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    exp_prefix='vpg_sensitive_point100',
    exp_name='oracleenv2',
    # plot=True,
)
algo = SensitiveTRPO(
# algo = SensitiveVPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=fast_batch_size,  # number of trajs for grad update
    max_path_length=max_path_length,
    meta_batch_size=meta_batch_size,
    num_grad_updates=num_grad_updates,
    n_itr=400,
    use_sensitive=use_sensitive,
    # optimizer_args={'tf_optimizer_args': {'learning_rate': learning_rate}},
    plot=False,
)

run_experiment_lite(
    algo.train(),
    n_parallel=0,
    snapshot_mode="last",
    seed=1,
    # exp_prefix='deleteme',
    # exp_name='deleteme',
    # exp_prefix='sensitive1dT5_2017_01_19',
    # exp_prefix='bugfix_sensitive0d_8tasks_T'+str(max_path_length)+'_2017_02_05',
    exp_prefix='trpo_sensitive_cheetah' + str(max_path_length),
    exp_name='sens' + str(int(use_sensitive)) + '_fbs' + str(fast_batch_size) + '_mbs' + str(meta_batch_size) + '_flr_' + str(fast_learning_rate) + '_lr_' + str(learning_rate) + '_step1' + str(num_grad_updates),
    plot=False,
)
    hidden_sizes=(100, 100),
)
if bas == 'zero':
    baseline = ZeroBaseline(env_spec=env.spec)
elif 'linear' in bas:
    baseline = LinearFeatureBaseline(env_spec=env.spec)
else:
    baseline = GaussianMLPBaseline(env_spec=env.spec)
algo = MAMLTRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=fast_batch_size,  # number of trajs for grad update
    max_path_length=max_path_length,
    meta_batch_size=meta_batch_size,
    num_grad_updates=num_grad_updates,
    n_itr=800,
    use_maml=use_maml,
    step_size=meta_step_size,
    plot=False,
)
run_experiment_lite(
    algo.train(),
    n_parallel=4,
    snapshot_mode="last",
    seed=1,
    exp_prefix='trpo_maml_4state',
    exp_name='trpo_maml' + str(int(use_maml)) + '_fbs' + str(fast_batch_size) + '_mbs' + str(meta_batch_size) + '_flr_' + str(fast_learning_rate) + 'metalr_' + str(meta_step_size) + '_step1' + str(num_grad_updates),
    plot=False,
)
parser.add_argument("env_name", type=str, help='available env_name:') parser.add_argument("random_seed", type=int) parser.add_argument("num_of_agents", type=int) parser.add_argument("temperature", type=float) parser.add_argument("batch_size", type=int, default=5000) args = parser.parse_args() env_name = env_map[args.env_name] prefix = prefix_map[args.env_name] n_epochs = n_epochs_map[args.env_name] random_seed = int(args.random_seed) run_function = function_map[args.algo] n_itr = n_epochs_map[args.env_name] num_of_agents = int(args.num_of_agents) temperature = float(args.temperature) learning_rate = learning_rate_map[args.env_name] batch_size = int(args.batch_size) if args.algo == "multi_REINFORCE_stein" or args.algo == "multi_REINFORCE_stein_anneal" or args.algo == 'multi_REINOFRCE_stein_reg' or args.algo == "multi_REINFORCE_stein_no_critic" or args.algo == 'multi_REINFORCE_baseline_no_critic' or args.algo == 'multi_REINFORCE_stein_evolution': args.algo = "{:}#{:}_temp={:}".format(args.algo, num_of_agents, args.temperature) run_experiment_lite( run_function, n_parallel=4, snapshot_mode="last", seed=random_seed, log_dir="./../exp_log/{:}_seed={:}_iter=500_env={:}_{:}".format( args.algo, random_seed, prefix, get_date()), )
    max_path_length=max_path_length,
    meta_batch_size=v['meta_batch_size'],
    num_grad_updates=num_grad_updates,
    n_itr=800,
    use_maml=use_maml,
    step_size=v['meta_step_size'],
    plot=False,
)

direc = 'direc' if direc else ''
run_experiment_lite(
    algo.train(),
    exp_prefix='trpo_maml_cheetah' + direc + str(max_path_length),
    exp_name='maml' + str(int(use_maml)) + '_fbs' + str(v['fast_batch_size']) + '_mbs' + str(v['meta_batch_size']) + '_flr_' + str(v['fast_lr']) + '_mlr' + str(v['meta_step_size']),
    # Number of parallel workers for sampling
    n_parallel=8,
    # Keep snapshot parameters every `snapshot_gap` iterations
    snapshot_mode="gap",
    snapshot_gap=25,
    sync_s3_pkl=True,
    python_command='python3',
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=v["seed"],
    mode="local",
    # mode="ec2",
    variant=v,
    # plot=True,
    # terminate_machine=False,
)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=64 * 3 * horizon,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=1000,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()


exp_tag = "cooperative_merge_example"  # experiment prefix

for seed in [1]:  # , 5, 10, 56, 73]:
    run_experiment_lite(
        run_task,
        # Number of parallel workers for sampling
        n_parallel=1,
        # Keeps the snapshot parameters for all iterations
        snapshot_mode="all",
        # Specifies the seed for the experiment. If this is not provided, a
        # random seed will be used
        seed=seed,
        mode="local",  # "ec2"
        exp_prefix=exp_tag,
        # plot=True,
    )
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=v["step_size"],
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)

run_experiment_lite(
    algo.train(),
    exp_prefix="first_exp",
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=v["seed"],
    # mode="local",
    mode="ec2",
    variant=v,
    # plot=True,
    # terminate_machine=False,
)
sys.exit()
    policy=policy,
    baseline=baseline,
    batch_size=batch_size,
    whole_paths=True,
    max_path_length=200,
    n_itr=1000,
    step_size=0.01,
    subsample_factor=1.0,
    optimizer_args={'num_slices': 10},
    sampler_cls=BatchSampler,
)

algorithm = ICM(
    mdp,
    algo,
    "/home/fred/box3d/trpo_box3d_state_v11_tf_icm_cos_ext0.9_%d" % seed,
    feature_dim=mdp.spec.observation_space.flat_dim,
    forward_weight=0.1,
    external_reward_weight=0.9,
    inverse_tanh=True,
    init_learning_rate=1e-4,
    n_updates_per_iter=500)

run_experiment_lite(
    algorithm.train(),
    exp_prefix='trpo_box3d_state_v11_tf_icm_cos_ext0.9',
    n_parallel=8,
    snapshot_mode="gap",
    snapshot_gap=200,
    seed=seed,
    mode="local")
    load_policy=initial_params_file,
    baseline=baseline,
    batch_size=8000,
    max_path_length=200,
    n_itr=n_itr,
    reset_arg=goal,
    optimizer_args={'init_learning_rate': step_sizes[step_i],
                    'tf_optimizer_args': {'learning_rate': 0.01 * step_sizes[step_i]},
                    'tf_optimizer_cls': tf.train.GradientDescentOptimizer}
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=4,
    # Keep the snapshot parameters for all iterations
    snapshot_mode="all",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=goal_i,
    exp_prefix='antdirec_test',
    exp_name='test' + str(run_id),
    plot=True,
)

# get return from the experiment
with open('data/local/antdirec-test/test' + str(run_id) + '/progress.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    i = 0
    row = None
    returns = []
    for row in reader:
        i += 1
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    # max_path_length = env.horizon
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=300,
        n_itr=10000,
        discount=0.99,
        step_size=0.02,
        # truncate_local_is_ratio=0.2,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()


run_experiment_lite(
    run_task,
    log_dir='./log/trpo_mntcar_cont',
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    # plot=True,
)
def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()


run_experiment_lite(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    plot=True,
)
    step_size=v['meta_step_size'],
    plot=False,
)

exp_name = 'Cellrobot_BigDog2trpo_maml' + task_var + '_' + str(
    max_path_length) + '_EXP' + str(exp_id)

run_experiment_lite(
    algo.train(),
    exp_prefix=exp_name,
    exp_name='maml' + str(int(use_maml)) + '_fbs' + str(v['fast_batch_size'])
             + '_mbs' + str(v['meta_batch_size']) + '_flr_' + str(v['fast_lr'])
             + '_mlr' + str(v['meta_step_size']),
    # Number of parallel workers for sampling
    n_parallel=16,
    # Keep snapshot parameters every `snapshot_gap` iterations
    snapshot_mode="gap",
    snapshot_gap=2,
    sync_s3_pkl=True,
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=v["seed"],
    mode="local",
    # mode="ec2",
    variant=v,
    # plot=True,
    # terminate_machine=False,
)

if ssh_FLAG:
    local_dir = os.path.abspath('data/local/' + exp_name + '/')
    remote_dir = '/home/drl/PycharmProjects/maml_rl-master/data/AWS_data/' + exp_name + '/'
    ssh.upload(local_dir,
    load_policy=initial_params_file,
    baseline=baseline,
    batch_size=4000,  # 2x
    max_path_length=100,
    n_itr=n_itr,
    optimizer_args={'init_learning_rate': step_sizes[step_i],
                    'tf_optimizer_args': {'learning_rate': 0.5 * step_sizes[step_i]},
                    'tf_optimizer_cls': tf.train.GradientDescentOptimizer}
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=4,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=4,
    exp_prefix='trpopoint2d_test',
    exp_name='test',
    # plot=True,
)

import pdb; pdb.set_trace()

# get return from the experiment
with open('data/local/trpopoint2d-test/test/progress.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    i = 0
    row = None
    returns = []
    for row in reader:
        i += 1
else:
    env = TfEnv(normalize(SwimmerEnv()))

batch_size = 20
policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(100, 100),
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
# baseline = ZeroBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=500 * batch_size,
    max_path_length=500,
    n_itr=500,
    # plot=True,
    optimizer_args={'tf_optimizer_args': {'learning_rate': 1e-3}},
)
run_experiment_lite(
    algo.train(),
    n_parallel=1,  # try increasing this to make it faster??? (Maybe need to modify code for this)
    snapshot_mode="last",
    seed=1,
    exp_prefix='vpgswimmer',
    # exp_name='basic',
    exp_name='randomenv',
    # plot=True,
)