def generate_expert_dp():
    env = TfEnv(normalize(InvertedPendulumEnv()))
    policy = GaussianMLPPolicy(
        name="expert_policy",
        env_spec=env.spec,
        # The expert policy has two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64),
        std_hidden_sizes=(64, 64),
        adaptive_std=True,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=64,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        gae_lambda=0.97,
    )
    with tf.Session() as sess:
        algo.train(sess=sess)
        t = rollout(env=env, agent=policy, max_path_length=100, animated=False)
        print(sum(t['rewards']))
        with open('expert_dp.pickle', 'wb') as handle:
            pickle.dump(policy, handle)
        while True:
            rollout(env=env, agent=policy, max_path_length=100, animated=False)

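# A minimal sketch (not from the original source) of reloading the expert
# pickled by generate_expert_dp() above. It assumes rllab's `rollout` helper
# and that an identically wrapped `env` is in scope; rllab policies restore
# their parameters on unpickling, which requires an active TF session.
import pickle

import tensorflow as tf
from rllab.sampler.utils import rollout

with tf.Session() as sess:
    with open('expert_dp.pickle', 'rb') as handle:
        expert_policy = pickle.load(handle)
    path = rollout(env=env, agent=expert_policy, max_path_length=100)
    print('expert return:', sum(path['rewards']))
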
def run_linear_ocm_exp(variant):
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
    import sandbox.rocky.tf.core.layers as L
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from railrl.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly,
    )
    from railrl.envs.memory.high_low import HighLow
    from railrl.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']

    set_seed(seed)
    onehot_dim = num_values + 1

    """
    Code for running the experiment.
    """
    # env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = HighLow(num_steps=H)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    env = FlattenedProductBox(env)

    policy = GaussianLSTMPolicy(
        name="policy",
        env_spec=env.spec,
        lstm_layer_cls=L.LSTMLayer,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    trpo_params = variant['trpo_params']
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
        **trpo_params
    )
    algo.train()

def run_linear_ocm_exp(variant):
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from rlkit.envs.flattened_product_box import FlattenedProductBox
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from rlkit.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly,
    )
    from rlkit.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']

    set_seed(seed)
    onehot_dim = num_values + 1

    """
    Code for running the experiment.
    """
    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    env = FlattenedProductBox(env)

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 32
        # hidden units.
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    trpo_params = variant['trpo_params']
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
        **trpo_params
    )
    algo.train()

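# A minimal sketch (assumed values, not from the original source) of a
# `variant` dict that satisfies run_linear_ocm_exp above: `H`, `seed`, and
# `num_values` configure the memory task, `optimizer_params` is forwarded to
# FiniteDifferenceHvp, and `trpo_params` is forwarded to TRPO.
variant = dict(
    H=16,
    seed=0,
    num_values=2,
    optimizer_params=dict(
        base_eps=1e-5,
    ),
    trpo_params=dict(
        batch_size=4000,
        max_path_length=16,  # typically tied to the horizon H
        n_itr=100,
        discount=0.99,
        step_size=0.01,
    ),
)
run_linear_ocm_exp(variant)
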
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params
    pprint(params)

    grid_world = SlaveGridWorldEnv("walled_chain",
                                   max_traj_length=DEFAULTS["max_path_length"],
                                   goal_reward=params["goal_reward"])
    agent = GridWorldMasterAgent(grid_world,
                                 match_reward=params["match_reward"])
    env = normalize(
        SituatedConversationEnvironment(env=grid_world, b_agent=agent))
    baseline = LinearFeatureBaseline(env)

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings(
            "feature_network", env.observation_space.flat_dim,
            params["feature_dim"], params["feature_hidden_dims"],
            tf.tanh, tf.tanh, agent.vocab_size, params["embedding_dim"]),
        state_include_action=False,
    )

    optimizer = ConjugateGradientOptimizer(
        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=params["max_path_length"],
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=15,
        snapshot_mode="last",
        exp_prefix="grid_world_sweep3",
        variant=params,
    )

def run_experiment(params):
    params_base = copy.copy(DEFAULTS)
    params_base.update(params)
    params = params_base

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings(
            "embeddings", len(VOCAB), params["feature_dim"],
            params["feature_hidden_dims"], tf.tanh, tf.tanh, len(VOCAB),
            params["embedding_dim"], has_other_input=False),
        state_include_action=False,
    )

    baseline = LinearFeatureBaseline(env.spec)

    optimizer = ConjugateGradientOptimizer(
        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=LENGTH,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="autoenc_unnorm_reward",
        variant=params,
    )

def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    env = NormalizedBoxEnv(env)
    env = ConvertEnvToTf(env)

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        **variant['policy_params']
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    algo_kwargs = variant['algo_kwargs']
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
        **algo_kwargs
    )
    algo.train()

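# A minimal sketch (all values are assumptions; `HalfCheetahEnv` is a
# placeholder for whatever env class the caller passes) of a `variant` dict
# matching the keys that experiment() above reads.
variant = dict(
    env_class=HalfCheetahEnv,
    env_kwargs=dict(),
    multitask=False,
    policy_params=dict(
        hidden_sizes=(32, 32),
    ),
    optimizer_params=dict(
        base_eps=1e-5,
    ),
    algo_kwargs=dict(
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
    ),
)
experiment(variant)
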
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(
        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=5,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="grid_world_silent",
        variant=params,
    )

def _init_bnn_trpo(self, bnn_model, training_policy, time_step):
    if hasattr(self.env._wrapped_env, '_wrapped_env'):
        inner_env = self.env._wrapped_env._wrapped_env
    else:
        inner_env = self.env._wrapped_env.env.unwrapped
    cost_np_vec = inner_env.cost_np_vec
    batch_size = self.policy_opt_params["trpo"]["batch_size"]

    if bnn_model is not None:
        bnn_env = TfEnv(
            BayesNeuralNetEnv(env=self.env,
                              inner_env=inner_env,
                              cost_np=cost_np_vec,
                              bnn_model=bnn_model,
                              sam_mode=None))
    else:
        bnn_env = self.env

    baseline = LinearFeatureBaseline(env_spec=self.env.spec)
    algo = TRPO(
        env=bnn_env,
        policy=training_policy,
        baseline=baseline,
        batch_size=batch_size,
        max_path_length=time_step,
        discount=self.policy_opt_params["trpo"]["discount"],
        step_size=self.policy_opt_params["trpo"]["step_size"],
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        # sampler_args=sampler_args,  # params for VectorizedSampler
    )
    return algo, cost_np_vec

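# A hedged usage sketch (names and values are assumptions): inside the owning
# class, _init_bnn_trpo wires TRPO to either the BNN dynamics model env or
# the real env, and also hands back the vectorized cost function.
algo, cost_np_vec = self._init_bnn_trpo(bnn_model, training_policy,
                                        time_step=100)
algo.train()  # optimize training_policy against the (possibly model-based) env
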
regularisation_coefficient = 1e-5

with tf.Session() as sess:
    oracle_algo = Oracle_TRPO(
        env=env,
        policy=oracle_policy,
        baseline=oracle_baseline,
        batch_size=batch_size_value,  # use batch sizes up to 25000
        # Alternatively use env.horizon here, which would suit different
        # environments (though it may not be defined for all envs).
        max_path_length=max_path_length_horizon,
        n_itr=args.num_epochs,
        discount=0.99,
        step_size=step_size_value,
        optimizer=ConjugateGradientOptimizer(
            reg_coeff=regularisation_coefficient,
            hvp_approach=FiniteDifferenceHvp(
                base_eps=regularisation_coefficient)))

    oracle_train(oracle_algo, sess=sess)

    # rollouts = oracle_algo.obtain_samples(num_epochs + 1)
    # logger.log("Average reward for training rollouts on (%s): %f +- %f " %
    #            (env_name,
    #             np.mean([np.sum(p['rewards']) for p in rollouts]),
    #             np.std([np.sum(p['rewards']) for p in rollouts])))

    """
    Evaluating the learnt policy below, using the "obtain_samples" rollouts
    collected in batch_polopt.py above.
    """
    # Final evaluation on all environments using the learned policy
    # total_rollouts = []
    #
    # for env_name, env in envs:
    #     rollouts = []

        # The neural network policy should have two hidden layers, each with
        # 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=expert_env.spec)

    algo = TRPO(
        env=novice_env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=50,
        n_itr=40,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    )

    with tf.Session() as sess:
        algo.n_itr = 0
        algo.start_itr = 0
        algo.train(sess=sess)

        im_size = 50
        im_channels = 3
        dim_input = [im_size, im_size, im_channels]

        disc = DomainConfusionVelocityDiscriminator(input_dim=dim_input,
                                                    output_dim_class=2,
                                                    output_dim_dom=2,

def __init__(self, env, args):
    self.args = args

    # Parallel setup
    parallel_sampler.initialize(n_parallel=args.n_parallel)
    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    env, policy = rllab_envpolicy_parser(env, args)

    if not args.algo == 'thddpg':
        # Baseline
        if args.baseline_type == 'linear':
            baseline = LinearFeatureBaseline(env_spec=env.spec)
        elif args.baseline_type == 'zero':
            baseline = ZeroBaseline(env_spec=env.spec)
        else:
            raise NotImplementedError(args.baseline_type)

    # Logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir

    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    if args.algo == 'tftrpo':
        self.algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=args.batch_size,
            max_path_length=args.max_path_length,
            n_itr=args.n_iter,
            discount=args.discount,
            gae_lambda=args.gae_lambda,
            step_size=args.step_size,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(
                    base_eps=1e-5)) if args.recurrent else None,
            mode=args.control)
    elif args.algo == 'thddpg':
        qfunc = thContinuousMLPQFunction(env_spec=env.spec)
        if args.exp_strategy == 'ou':
            es = OUStrategy(env_spec=env.spec)
        elif args.exp_strategy == 'gauss':
            es = GaussianStrategy(env_spec=env.spec)
        else:
            raise NotImplementedError()
        self.algo = thDDPG(env=env,
                           policy=policy,
                           qf=qfunc,
                           es=es,
                           batch_size=args.batch_size,
                           max_path_length=args.max_path_length,
                           epoch_length=args.epoch_length,
                           min_pool_size=args.min_pool_size,
                           replay_pool_size=args.replay_pool_size,
                           n_epochs=args.n_iter,
                           discount=args.discount,
                           scale_reward=0.01,
                           qf_learning_rate=args.qfunc_lr,
                           policy_learning_rate=args.policy_lr,
                           eval_samples=args.eval_samples,
                           mode=args.control)

    recognition_model = utils.build_recognition_model(args, env,
                                                      summary_writer)
    baseline = utils.build_baseline(args, env)
    reward_handler = utils.build_reward_handler(args, summary_writer)
    validator = auto_validator.AutoValidator(
        summary_writer,
        data['obs_mean'],
        data['obs_std'],
        render=args.validator_render,
        render_every=args.render_every,
        flat_recurrent=args.policy_recurrent)

    # build algo
    saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=.5)
    sampler_args = dict(n_envs=args.n_envs) if args.vectorize else None
    if args.policy_recurrent:
        optimizer = ConjugateGradientOptimizer(
            max_backtracks=50,
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    else:
        optimizer = None
    algo = GAIL(critic=critic,
                recognition=recognition_model,
                reward_handler=reward_handler,
                env=env,
                policy=policy,
                baseline=baseline,
                validator=validator,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_itr,
                discount=args.discount,
                step_size=args.trpo_step_size,
                saver=saver,

def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--enable_obsnorm', action='store_true', default=False)
    parser.add_argument('--chunked', action='store_true', default=False)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true',
                        default=False)
    parser.add_argument('--anneal_step_size', type=int, default=0)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--radius', type=float, default=0.015)
    parser.add_argument('--n_evaders', type=int, default=10)
    parser.add_argument('--n_pursuers', type=int, default=8)
    parser.add_argument('--n_poison', type=int, default=10)
    parser.add_argument('--n_coop', type=int, default=4)
    parser.add_argument('--n_sensors', type=int, default=30)
    parser.add_argument('--sensor_range', type=str, default='0.2')
    parser.add_argument('--food_reward', type=float, default=5)
    parser.add_argument('--poison_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.05)
    parser.add_argument('--reward_mech', type=str, default='local')
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only', type=ast.literal_eval, default=False,
        help='Whether to only print the tabular log information (in a '
             'horizontal format)')

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)
    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    centralized = True if args.control == 'centralized' else False

    # Note: list(...) is needed under Python 3, where map() returns an
    # iterator rather than a sequence.
    sensor_range = np.array(list(map(float, args.sensor_range.split(','))))
    if len(sensor_range) == 1:
        sensor_range = sensor_range[0]
    else:
        assert sensor_range.shape == (args.n_pursuers, )

    env = MAWaterWorld(args.n_pursuers,
                       args.n_evaders,
                       args.n_coop,
                       args.n_poison,
                       radius=args.radius,
                       n_sensors=args.n_sensors,
                       food_reward=args.food_reward,
                       poison_reward=args.poison_reward,
                       encounter_reward=args.encounter_reward,
                       reward_mech=args.reward_mech,
                       sensor_range=sensor_range,
                       obstacle_loc=None)

    env = TfEnv(
        RLLabEnv(StandardizedEnv(env,
                                 scale_reward=args.reward_scale,
                                 enable_obsnorm=args.enable_obsnorm),
                 mode=args.control))

    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        feature_network = MLP(
            name='feature_net',
            input_shape=(env.spec.observation_space.flat_dim +
                         env.spec.action_space.flat_dim, ),
            output_dim=16,
            hidden_sizes=(128, 64, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None)
        if args.recurrent == 'gru':
            policy = GaussianGRUPolicy(env_spec=env.spec,
                                       feature_network=feature_network,
                                       hidden_dim=int(args.policy_hidden_sizes),
                                       name='policy')
        elif args.recurrent == 'lstm':
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        feature_network=feature_network,
                                        hidden_dim=int(args.policy_hidden_sizes),
                                        name='policy')
    else:
        policy = GaussianMLPPolicy(
            name='policy',
            env_spec=env.spec,
            hidden_sizes=tuple(map(int, args.policy_hidden_sizes.split(','))),
            min_std=10e-5)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif args.baseline_type == 'mlp':
        raise NotImplementedError()
        # baseline = GaussianMLPBaseline(
        #     env_spec=env.spec,
        #     hidden_sizes=tuple(map(int,
        #                            args.baseline_hidden_sizes.split(','))))
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        # max_path_length_limit=args.max_path_length_limit,
        update_max_path_length=args.update_curriculum,
        anneal_step_size=args.anneal_step_size,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        if args.recurrent else None,
        mode=args.control if not args.chunked
        else 'chunk_{}'.format(args.control),
    )

    algo.train()

def get_algo(env, policy, es, qf, baseline, max_path_length, batch_size,
             replay_pool_size, discount, scale_reward, learning_rate,
             replacement_prob, policy_updates_ratio, step_size, gae_lambda,
             sample_backups, kl_sample_backups, qprop_eta_option,
             qprop_unbias, qprop_nu, algo_name, n_itr, recurrent,
             updates_ratio, policy_use_target, policy_batch_size,
             policy_sample_last, ac_delta, ac_sample_backups, save_freq,
             restore_auto, qf_learning_rate, qf_use_target, qf_mc_ratio,
             qf_batch_size, qf_residual_phi, **kwargs):
    algo = None
    algo_class = None
    min_pool_size = 1000
    qf_baseline = None
    extra_kwargs = dict()

    print('Creating algo=%s with n_itr=%d, max_path_length=%d...' %
          (algo_name, n_itr, max_path_length))

    if algo_name in ['ddpg', 'dspg', 'dspgoff', 'dqn', 'dsqn',
                     'trpg', 'trpgoff']:
        if algo_name in ['trpg']:
            extra_kwargs['policy_update_method'] = 'cg'
        algo = DDPG(
            env=env,
            policy=policy,
            policy_use_target=policy_use_target,
            es=es,
            qf=qf,
            qf_use_target=qf_use_target,
            policy_batch_size=policy_batch_size,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            max_path_length=max_path_length,
            epoch_length=batch_size,  # make comparable to batchopt methods
            min_pool_size=min_pool_size,
            replay_pool_size=replay_pool_size,
            n_epochs=n_itr,
            discount=discount,
            scale_reward=scale_reward,
            qf_learning_rate=qf_learning_rate,
            policy_learning_rate=learning_rate,
            policy_step_size=step_size,
            policy_sample_last=policy_sample_last,
            replacement_prob=replacement_prob,
            policy_updates_ratio=policy_updates_ratio,
            updates_ratio=updates_ratio,
            save_freq=save_freq,
            restore_auto=restore_auto,
            **extra_kwargs)
        algo_class = 'DDPG'
    elif algo_name in ['trpo', 'nuqprop', 'nuqfqprop', 'actrpo', 'acqftrpo',
                       'qprop', 'mqprop', 'qfqprop', 'nafqprop']:
        if recurrent:
            extra_kwargs['optimizer'] = ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        if algo_name in ['actrpo', 'acqftrpo']:
            extra_kwargs['ac_delta'] = ac_delta
            extra_kwargs['qprop'] = False  # disable qprop
            if ac_delta == 0:
                qf = None
        if algo_name in ['mqprop']:
            extra_kwargs['mqprop'] = True
        if algo_name in ['nuqprop', 'nuqfqprop']:
            extra_kwargs['qprop_nu'] = qprop_nu
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                                            policy=policy, qf=qf)
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    max_path_length=max_path_length,
                    n_itr=n_itr,
                    discount=discount,
                    step_size=step_size,
                    gae_lambda=gae_lambda,
                    sample_backups=sample_backups,
                    kl_sample_backups=kl_sample_backups,
                    qf=qf,
                    qf_use_target=qf_use_target,
                    qf_batch_size=qf_batch_size,
                    qf_mc_ratio=qf_mc_ratio,
                    qf_residual_phi=qf_residual_phi,
                    min_pool_size=min_pool_size,
                    scale_reward=scale_reward,
                    qf_updates_ratio=updates_ratio,
                    qprop_eta_option=qprop_eta_option,
                    qprop_unbias=qprop_unbias,
                    replay_pool_size=replay_pool_size,
                    replacement_prob=replacement_prob,
                    qf_baseline=qf_baseline,
                    qf_learning_rate=qf_learning_rate,
                    ac_sample_backups=ac_sample_backups,
                    policy_sample_last=policy_sample_last,
                    save_freq=save_freq,
                    restore_auto=restore_auto,
                    **extra_kwargs)
        algo_class = 'TRPO'
    elif algo_name in ['vpg', 'qvpg']:
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                                            policy=policy, qf=qf)
        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            discount=discount,
            gae_lambda=gae_lambda,
            optimizer_args=dict(
                tf_optimizer_args=dict(learning_rate=learning_rate, )),
            qf=qf,
            qf_use_target=qf_use_target,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            min_pool_size=min_pool_size,
            scale_reward=scale_reward,
            qf_updates_ratio=updates_ratio,
            qprop_eta_option=qprop_eta_option,
            qprop_unbias=qprop_unbias,
            replay_pool_size=replay_pool_size,
            qf_baseline=qf_baseline,
            qf_learning_rate=qf_learning_rate,
            save_freq=save_freq,
            restore_auto=restore_auto,
        )
        algo_class = 'VPG'

    print('[get_algo] Instantiating %s.' % algo_class)
    return algo

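# A hedged sketch of calling get_algo above for the plain-TRPO path (qf=None,
# so the Q-Prop machinery stays inactive). All values are illustrative
# assumptions; env, policy, and baseline are built elsewhere.
algo = get_algo(
    env=env, policy=policy, es=None, qf=None, baseline=baseline,
    max_path_length=1000, batch_size=5000, replay_pool_size=1000000,
    discount=0.99, scale_reward=1.0, learning_rate=1e-3,
    replacement_prob=1.0, policy_updates_ratio=1.0, step_size=0.01,
    gae_lambda=0.97, sample_backups=0, kl_sample_backups=0,
    qprop_eta_option='ones', qprop_unbias=False, qprop_nu=0.0,
    algo_name='trpo', n_itr=500, recurrent=False, updates_ratio=1.0,
    policy_use_target=False, policy_batch_size=64, policy_sample_last=True,
    ac_delta=0, ac_sample_backups=0, save_freq=0, restore_auto=False,
    qf_learning_rate=1e-3, qf_use_target=False, qf_mc_ratio=0.0,
    qf_batch_size=64, qf_residual_phi=0.0)
algo.train()
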
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true',
                        default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--conv', action='store_true', default=False)
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--checkpoint', type=str, default=None)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only', type=ast.literal_eval, default=False,
        help='Whether to only print the tabular log information (in a '
             'horizontal format)')

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)
    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.checkpoint:
        with tf.Session() as sess:
            data = joblib.load(args.checkpoint)
            policy = data['policy']
            env = data['env']
    else:
        if args.sample_maps:
            map_pool = np.load(args.map_file)
        else:
            if args.map_type == 'rectangle':
                env_map = TwoDMaps.rectangle_map(
                    *map(int, args.rectangle.split(',')))
            elif args.map_type == 'complex':
                env_map = TwoDMaps.complex_map(
                    *map(int, args.rectangle.split(',')))
            else:
                raise NotImplementedError()
            map_pool = [env_map]

        env = PursuitEvade(map_pool,
                           n_evaders=args.n_evaders,
                           n_pursuers=args.n_pursuers,
                           obs_range=args.obs_range,
                           n_catch=args.n_catch,
                           train_pursuit=args.train_pursuit,
                           urgency_reward=args.urgency,
                           surround=args.surround,
                           sample_maps=args.sample_maps,
                           constraint_window=args.constraint_window,
                           flatten=args.flatten,
                           reward_mech=args.reward_mech,
                           catchr=args.catchr,
                           term_pursuit=args.term_pursuit)

        env = TfEnv(
            RLLabEnv(StandardizedEnv(env,
                                     scale_reward=args.reward_scale,
                                     enable_obsnorm=False),
                     mode=args.control))

        if args.recurrent:
            if args.conv:
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=5,
                    conv_filters=(16, 32, 32),
                    conv_filter_sizes=(3, 3, 3),
                    conv_strides=(1, 1, 1),
                    conv_pads=('VALID', 'VALID', 'VALID'),
                    hidden_sizes=(64, ),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=tf.nn.softmax)
            else:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=5,
                    hidden_sizes=(256, 128, 64),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            if args.recurrent == 'gru':
                policy = CategoricalGRUPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden_sizes),
                    name='policy')
            elif args.recurrent == 'lstm':
                policy = CategoricalLSTMPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden_sizes),
                    name='policy')
        elif args.conv:
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16),
                conv_filter_sizes=(3, 3),
                conv_strides=(2, 1),
                conv_pads=('VALID', 'VALID'),
                hidden_sizes=(32, ),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        if args.recurrent else None,
        mode=args.control,
    )

    algo.train()

def train(self):
    expert_env = TfEnv(self.expert_env)
    # e.g. TfEnv(GymEnv("Pusher3DOF-v1", force_reset=True, record_video=False))
    # expert_env = TfEnv(normalize(ReacherEnv()))
    novice_env = TfEnv(self.novice_env)
    # e.g. TfEnv(GymEnv("Pusher3DOFNoChange-v1", force_reset=True,
    #                   record_video=True))
    # novice_env = TfEnv(normalize(ReacherTwoEnv(), normalize_obs=True))
    expert_fail_pol = RandomPolicy(expert_env.spec)

    policy = GaussianMLPPolicy(
        name="novice_policy",
        env_spec=novice_env.spec,
        init_std=10,
        # The neural network policy should have two hidden layers, each with
        # 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=expert_env.spec)

    algo = TRPO(env=novice_env,
                policy=policy,
                baseline=baseline,
                batch_size=50 * 500,
                max_path_length=self.horizon,
                n_itr=self.itrs,
                discount=0.99,
                step_size=0.01,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Run zero iterations here: train() is called only to build the graph
        # and set up the sampler. n_itr is restored below and the trainer
        # loop drives the actual optimization.
        algo.n_itr = 0
        algo.start_itr = 0
        algo.train(sess=sess)

        im_height = self.imsize[0]
        im_width = self.imsize[1]
        im_channels = 3
        dim_input = [im_height, im_width, im_channels]

        disc = DomainConfusionVelocityDiscriminator(input_dim=dim_input,
                                                    output_dim_class=2,
                                                    output_dim_dom=2,
                                                    tf_sess=sess)

        # Load the expert policy. TODO: Need to train the expert.
        # data = joblib.load(self.expert_pkl)
        # expert_policy = data['policy']
        with open(self.expert_pkl, 'rb') as pfile:
            expert_policy = pickle.load(pfile)
        # expert_policy = load_expert_reacher(expert_env, sess)

        # from rllab.sampler.utils import rollout
        # while True:
        #     t = rollout(env=expert_env, agent=expert_policy,
        #                 max_path_length=50, animated=True)

        algo.n_itr = self.itrs
        trainer = CyberPunkTrainer(disc=disc,
                                   novice_policy_env=novice_env,
                                   expert_fail_pol=expert_fail_pol,
                                   expert_env=expert_env,
                                   novice_policy=policy,
                                   novice_policy_opt_algo=algo,
                                   expert_success_pol=expert_policy,
                                   im_width=im_width,
                                   im_height=im_height,
                                   im_channels=im_channels,
                                   tf_sess=sess,
                                   horizon=self.horizon)

        iterations = self.itrs
        for iter_step in range(0, iterations):
            logger.record_tabular('Iteration', iter_step)
            trainer.take_iteration(n_trajs_cost=self.trajs,
                                   n_trajs_policy=self.trajs)
            logger.dump_tabular(with_prefix=False)

        trainer.log_and_finish()

def run_experiment(expert_rollout_pickle_path, trained_policy_pickle_path,
                   env, cost_trainer_type, iterations=30, num_frames=1,
                   traj_len=200, config={}):
    # Load the expert rollouts into memory
    expert_rollouts = load_expert_rollouts(expert_rollout_pickle_path)

    # In the case that we only have one expert rollout in the file
    if type(expert_rollouts) is dict:
        expert_rollouts = [expert_rollouts]

    # TODO: make this configurable
    expert_rollouts = [shorten_tensor_dict(x, traj_len)
                       for x in expert_rollouts]

    # Sanity check. TODO: should we prune any "expert" rollouts with
    # suboptimal reward?
    print("Average reward for expert rollouts: %f" %
          np.mean([np.sum(p['rewards']) for p in expert_rollouts]))

    if "transformers" in config and len(config["transformers"]) > 0:
        print("Transforming expert rollouts...")
        for rollout in tqdm(expert_rollouts):
            transformed_observations = []
            for ob in tqdm(rollout["observations"]):
                for transformer in config["transformers"]:
                    ob = transformer.transform(ob)
                transformed_observations.append(ob)
            rollout["observations"] = np.array(transformed_observations)

    # Handle both flattened state input and image input.
    # TODO: this could be done better by looking at just the shape and
    # determining the input type from that.
    if config["img_input"]:
        obs_dims = expert_rollouts[0]['observations'][0].shape
    else:
        obs_dims = len(expert_rollouts[0]['observations'][0])

    if "num_novice_rollouts" in config:
        number_of_sample_trajectories = config["num_novice_rollouts"]
    else:
        number_of_sample_trajectories = len(expert_rollouts)
    print(number_of_sample_trajectories)

    # Choose a policy (conv-based on images, MLP-based on states).
    # TODO: may also have to switch out categorical for something else in
    # continuous action spaces? Let's just avoid that for now.
    if config["img_input"]:
        # TODO: unclear right now if this even works ok; we get poor results
        # early on.
        policy = CategoricalConvPolicy(
            name="policy",
            env_spec=env.spec,
            conv_filters=[32, 64, 64],
            conv_filter_sizes=[3, 3, 3],
            conv_strides=[1, 1, 1],
            conv_pads=['SAME', 'SAME', 'SAME'],
            # Two hidden layers with 200 hidden units each (cf. the RLGAN
            # paper's two-hidden-layer setup).
            hidden_sizes=[200, 200])
    elif type(env.spec.action_space) == Discrete:
        policy = CategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            # Two hidden layers with 400 and 300 hidden units respectively.
            hidden_sizes=(400, 300))
    else:
        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(100, 50, 25))

    if config["img_input"]:
        # TODO: right now the linear feature baseline is too computationally
        # expensive to use with full image inputs, so just use a zero
        # baseline for now.
        baseline = ZeroBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        # batch_size and max_path_length are used internally by the sampler;
        # we reuse this sampler to generate our samples, hence we pass them
        # here. A cleaner way would be to create our own sampler.
        batch_size=number_of_sample_trajectories * traj_len,
        max_path_length=traj_len,
        n_itr=40,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5),
            max_backtracks=40))

    # Prune the number of rollouts if that option is enabled
    if "num_expert_rollouts" in config:
        rollouts_to_use = min(config["num_expert_rollouts"],
                              len(expert_rollouts))
        expert_rollouts = expert_rollouts[:rollouts_to_use]
        print("Only using %d expert rollouts" % rollouts_to_use)

    true_rewards = []
    actual_rewards = []

    # Extract observations to a tensor
    expert_rollouts_tensor = tensor_utils.stack_tensor_list(
        [path["observations"] for path in expert_rollouts])

    if "oversample" in config and config["oversample"]:
        oversample_rate = max(
            int(number_of_sample_trajectories / len(expert_rollouts_tensor)),
            1.)
        expert_rollouts_tensor = expert_rollouts_tensor.repeat(
            oversample_rate, axis=0)
        print("oversampling %d times to %d" %
              (oversample_rate, len(expert_rollouts_tensor)))

    with tf.Session() as sess:
        algo.start_worker()

        cost_trainer = cost_trainer_type([num_frames, obs_dims],
                                         config=config)

        trainer = Trainer(env=env,
                          sess=sess,
                          cost_approximator=cost_trainer,
                          cost_trainer=cost_trainer,
                          novice_policy=policy,
                          novice_policy_optimizer=algo,
                          num_frames=num_frames)
        sess.run(tf.global_variables_initializer())

        for iter_step in range(0, iterations):
            # dump data only on the last iteration
            dump_data = (iter_step == (iterations - 1)) and \
                config["generate_option_graphs"]
            true_reward, actual_reward = trainer.step(
                dump_datapoints=dump_data,
                config=config,
                expert_horizon=traj_len,
                number_of_sample_trajectories=number_of_sample_trajectories)
            true_rewards.append(true_reward)
            actual_rewards.append(actual_reward)

            # run a rollout for the video
            if "recording_env" in config:
                novice_rollouts = rollout_policy(
                    policy,
                    config["recording_env"],
                    get_image_observations=False,
                    max_path_length=200)

        novice_rollouts = algo.obtain_samples(iter_step)
        rollout_rewards = [np.sum(x['rewards']) for x in novice_rollouts]
        print("Reward stats for final policy: %f +/- %f " %
              (np.mean(rollout_rewards), np.std(rollout_rewards)))

        # save the novice policy learned
        with open(trained_policy_pickle_path, "wb") as output_file:
            pickle.dump(policy, output_file)
        # TODO: also save the reward function?

        algo.shutdown_worker()

        second_true_rewards = []
        second_actual_rewards = []

        # Do our transfer learning task here.
        # TODO: move this to a separate script and save the learned weights
        if config['second_env'] is not None:
            with tf.variable_scope("second_policy"):
                # TODO: remove gross copypasta
                if not config["reset_second_policy"]:
                    second_policy = Serializable.clone(policy)
                else:
                    # start with a fresh policy
                    if config["img_input"]:
                        # TODO: unclear right now if this even works ok; we
                        # get poor results early on.
                        second_policy = CategoricalConvPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            conv_filters=[32, 64, 64],
                            conv_filter_sizes=[3, 3, 3],
                            conv_strides=[1, 1, 1],
                            conv_pads=['SAME', 'SAME', 'SAME'],
                            # Two hidden layers with 200 hidden units each.
                            hidden_sizes=[200, 200])
                    elif type(env.spec.action_space) == Discrete:
                        second_policy = CategoricalMLPPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            # Two hidden layers with 400 and 300 hidden units.
                            hidden_sizes=(400, 300))
                    else:
                        second_policy = GaussianMLPPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            hidden_sizes=(100, 50, 25))

                if config["img_input"]:
                    # The linear feature baseline is too computationally
                    # expensive with full image inputs, so use a zero
                    # baseline.
                    baseline = ZeroBaseline(
                        env_spec=config["second_env"].spec)
                else:
                    baseline = LinearFeatureBaseline(
                        env_spec=config["second_env"].spec)

                algo = TRPO(
                    env=config["second_env"],
                    policy=second_policy,
                    baseline=baseline,
                    batch_size=number_of_sample_trajectories * traj_len,
                    max_path_length=traj_len,
                    n_itr=40,
                    discount=0.995,
                    step_size=0.01,
                    optimizer=ConjugateGradientOptimizer(
                        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5),
                        max_backtracks=40))

                if not config["stop_disc_training_on_second_run"] and \
                        config["use_prev_options_relearn_mixing_func"]:
                    # If we're not retraining the discriminator at all in the
                    # transfer learning step, just keep the old network.
                    options = cost_trainer.disc.discriminator_options
                    cost_trainer.disc._remake_network_from_disc_options(
                        options,
                        stop_gradients=(not config["retrain_options"]),
                        num_extra_options=config["num_extra_options_on_transfer"])

                trainer = Trainer(
                    env=config['second_env'],
                    sess=sess,
                    cost_approximator=cost_trainer,
                    cost_trainer=cost_trainer,
                    novice_policy=second_policy,
                    novice_policy_optimizer=algo,
                    num_frames=num_frames,
                    train_disc=(not config["stop_disc_training_on_second_run"]))
                algo.start_worker()

                initialize_uninitialized(sess)
                for iter_step in range(0, iterations):
                    # dump data only on the last iteration
                    dump_data = (iter_step == (iterations - 1)) and \
                        config["generate_option_graphs"]
                    true_reward, actual_reward = trainer.step(
                        expert_rollouts_tensor=expert_rollouts_tensor,
                        dump_datapoints=dump_data,
                        config=config,
                        expert_horizon=traj_len,
                        number_of_sample_trajectories=number_of_sample_trajectories)
                    second_true_rewards.append(true_reward)
                    second_actual_rewards.append(actual_reward)

                    # run a rollout for the video
                    if "recording_env" in config:
                        novice_rollouts = rollout_policy(
                            second_policy,
                            config["recording_env"],
                            get_image_observations=False,
                            max_path_length=traj_len)

                novice_rollouts = algo.obtain_samples(iter_step)
                rollout_rewards = [np.sum(x['rewards'])
                                   for x in novice_rollouts]
                print("Reward stats for final policy: %f +/- %f " %
                      (np.mean(rollout_rewards), np.std(rollout_rewards)))

                # save the novice policy learned
                with open(trained_policy_pickle_path, "wb") as output_file:
                    pickle.dump(second_policy, output_file)

                algo.shutdown_worker()

    return true_rewards, actual_rewards, second_true_rewards, second_actual_rewards

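# A hedged sketch (all values are assumptions) of a `config` dict covering
# the keys run_experiment above actually reads; optional keys such as
# `recording_env` are simply omitted here.
config = dict(
    img_input=False,
    transformers=[],
    num_novice_rollouts=20,
    num_expert_rollouts=10,
    oversample=False,
    generate_option_graphs=False,
    second_env=None,  # set to an env instance to run the transfer stage
    reset_second_policy=True,
    stop_disc_training_on_second_run=False,
    use_prev_options_relearn_mixing_func=False,
    retrain_options=True,
    num_extra_options_on_transfer=0,
)
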
def setup(self, env, policy, start_itr):
    if not self.args.algo == 'thddpg':
        # Baseline
        if self.args.baseline_type == 'linear':
            baseline = LinearFeatureBaseline(env_spec=env.spec)
        elif self.args.baseline_type == 'zero':
            baseline = ZeroBaseline(env_spec=env.spec)
        else:
            raise NotImplementedError(self.args.baseline_type)

        if self.args.control == 'concurrent':
            baseline = [baseline for _ in range(len(env.agents))]

    # Logger
    default_log_dir = config.LOG_DIR
    if self.args.log_dir is None:
        log_dir = osp.join(default_log_dir, self.args.exp_name)
    else:
        log_dir = self.args.log_dir

    tabular_log_file = osp.join(log_dir, self.args.tabular_log_file)
    text_log_file = osp.join(log_dir, self.args.text_log_file)
    params_log_file = osp.join(log_dir, self.args.params_log_file)

    logger.log_parameters_lite(params_log_file, self.args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(self.args.snapshot_mode)
    logger.set_log_tabular_only(self.args.log_tabular_only)
    logger.push_prefix("[%s] " % self.args.exp_name)

    if self.args.algo == 'tftrpo':
        algo = MATRPO(
            env=env,
            policy_or_policies=policy,
            baseline_or_baselines=baseline,
            batch_size=self.args.batch_size,
            start_itr=start_itr,
            max_path_length=self.args.max_path_length,
            n_itr=self.args.n_iter,
            discount=self.args.discount,
            gae_lambda=self.args.gae_lambda,
            step_size=self.args.step_size,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(
                    base_eps=1e-5)) if self.args.recurrent else None,
            ma_mode=self.args.control)
    elif self.args.algo == 'thddpg':
        qfunc = thContinuousMLPQFunction(env_spec=env.spec)
        if self.args.exp_strategy == 'ou':
            es = OUStrategy(env_spec=env.spec)
        elif self.args.exp_strategy == 'gauss':
            es = GaussianStrategy(env_spec=env.spec)
        else:
            raise NotImplementedError()
        algo = thDDPG(env=env,
                      policy=policy,
                      qf=qfunc,
                      es=es,
                      batch_size=self.args.batch_size,
                      max_path_length=self.args.max_path_length,
                      epoch_length=self.args.epoch_length,
                      min_pool_size=self.args.min_pool_size,
                      replay_pool_size=self.args.replay_pool_size,
                      n_epochs=self.args.n_iter,
                      discount=self.args.discount,
                      scale_reward=0.01,
                      qf_learning_rate=self.args.qfunc_lr,
                      policy_learning_rate=self.args.policy_lr,
                      eval_samples=self.args.eval_samples,
                      mode=self.args.control)
    return algo

    hidden_sizes=(100, 50, 25),
    hidden_nonlinearity=tf.nn.relu,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=5000,
            max_path_length=env.horizon,
            n_itr=args.num_epochs,
            discount=0.99,
            step_size=0.01,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    exp_prefix="UnifiedDDPG_" + args.env + "_trpo",
    # Specifies the seed for the experiment. If this is not provided, a
    # random seed will be used.
    seed=1,
    mode="ec2" if args.use_ec2 else "local",
    plot=False,
    # dry=True,

def run_task(vv, log_dir=None, exp_name=None):
    global policy
    global baseline
    policy = None
    baseline = None

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    # Check if variant is available
    if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']:
        raise ValueError('Unrecognized model type for simulating robot')
    if vv['robot_type'] not in ['MRZR', 'RCCar']:
        raise ValueError('Unrecognized robot type')

    # Load environment
    if not vv['use_ros']:
        env = StraightEnv(
            target_velocity=vv['target_velocity'],
            dt=vv['dt'],
            model_type=vv['model_type'],
            robot_type=vv['robot_type'],
            mu_s=vv['mu_s'],
            mu_k=vv['mu_k']
        )
        env = TfEnv(env)
    else:
        from aa_simulation.envs.straight.straight_env_ros import StraightEnvROS
        env = StraightEnvROS(
            target_velocity=vv['target_velocity'],
            dt=vv['dt'],
            model_type=vv['model_type'],
            robot_type=vv['robot_type']
        )

    # Save variant information for comparison plots
    # variant_file = logger.get_snapshot_dir() + '/variant.json'
    # logger.log_variant(variant_file, vv)

    # Set variance for each action component separately for exploration.
    # Note: We set the variance manually because we are not scaling our
    # action space during training.
    init_std_speed = vv['target_velocity'] / 4
    init_std_steer = np.pi / 6
    init_std = [init_std_speed, init_std_steer]

    # Build policy and baseline networks.
    # Note: Mean of policy network set to analytically computed values for
    # faster training (rough estimates for RL to fine-tune).
    if policy is None or baseline is None:
        target_velocity = vv['target_velocity']
        target_steering = 0
        output_mean = np.array([target_velocity, target_steering])
        hidden_sizes = (32, 32)

        # In the mean network, allow output b values to dominate the final
        # output value by constraining the magnitude of the output W matrix.
        # This is to allow faster learning. These numbers are arbitrarily
        # chosen.
        W_gain = min(vv['target_velocity'] / 5, np.pi / 15)

        policy = GaussianLSTMPolicy(
            name="policy",
            env_spec=env.spec,
            # input_shape=(env.spec.observation_space.flat_dim,),
            # output_dim=env.spec.action_space.flat_dim,
            # gru_layer_cls=L.GRULayer,
        )
        # mean_network = MLP(
        #     input_shape=(env.spec.observation_space.flat_dim,),
        #     output_dim=env.spec.action_space.flat_dim,
        #     hidden_sizes=hidden_sizes,
        #     hidden_nonlinearity=LN.rectify,
        #     output_nonlinearity=None,
        #     output_W_init=LI.GlorotUniform(gain=W_gain),
        #     output_b_init=output_mean
        # )
        # policy = GaussianMLPPolicy(
        #     env_spec=env.spec,
        #     hidden_sizes=(32, 32),
        #     init_std=init_std,
        #     mean_network=mean_network
        # )

        baseline = LinearFeatureBaseline(
            env_spec=env.spec,
            target_key='returns'
        )

    # Reset variance to re-enable exploration when using pre-trained networks
    else:
        policy._l_log_std = ParamLayer(
            policy._mean_network.input_layer,
            num_units=env.spec.action_space.flat_dim,
            param=LI.Constant(np.log(init_std)),
            name='output_log_std',
            trainable=True
        )
        obs_var = policy._mean_network.input_layer.input_var
        mean_var, log_std_var = L.get_output(
            [policy._l_mean, policy._l_log_std])
        policy._log_std_var = log_std_var
        LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std])
        policy._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var]
        )

    safety_baseline = LinearFeatureBaseline(
        env_spec=env.spec,
        target_key='safety_returns'
    )

    safety_constraint = StraightSafetyConstraint(
        max_value=1.0,
        baseline=safety_baseline
    )

    if vv['algo'] == 'TRPO':
        algo = Trpo(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=2000,
            discount=0.99,
            step_size=trpo_stepsize,
            plot=False,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        )
    else:
        algo = CPO(
            env=env,
            policy=policy,
            baseline=baseline,
            safety_constraint=safety_constraint,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=2000,
            discount=0.99,
            step_size=trpo_stepsize,
            gae_lambda=0.95,
            safety_gae_lambda=1,
            optimizer_args={'subsample_factor': trpo_subsample_factor},
            plot=False
        )
    algo.train()

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=2000,
    # max_path_length=env.horizon,
    n_itr=1000,
    discount=0.99,
    step_size=0.01,
    gae_lambda=1.0,
    optimizer=ConjugateGradientOptimizer(
        reg_coeff=1e-5,
        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

name = "TRPO_Trial_Results/" + "Trial_GridWorld/"
run_experiment_lite(
    algo.train(),
    # log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="none",
    exp_name=name,
    # Specifies the seed for the experiment. If this is not provided, a
    # random seed will be used.
    seed=1,
    # mode="ec2" if args.use_ec2 else "local",

for env_name, env in envs:
    logger.log("Training Policy on %s" % env_name)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=env.horizon,
        n_itr=args.num_epochs,
        discount=0.99,
        step_size=args.step_size,
        optimizer=ConjugateGradientOptimizer(
            reg_coeff=args.reg_coeff,
            hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff)))
    custom_train(algo, sess=sess)
    rollouts = algo.obtain_samples(args.num_epochs + 1)
    logger.log("Average reward for training rollouts on (%s): %f +- %f " %
               (env_name,
                np.mean([np.sum(p['rewards']) for p in rollouts]),
                np.std([np.sum(p['rewards']) for p in rollouts])))

# Final evaluation on all environments using the learned policy
total_rollouts = []
for env_name, env in envs:
    rollouts = []

def run(args):
    print("loading from:", args.params_filepath)
    print("saving to:", args.exp_name)

    exp_dir = utils.set_up_experiment(exp_name=args.exp_name, phase='imitate')
    saver_dir = os.path.join(exp_dir, 'imitate', 'log')
    saver_filepath = os.path.join(saver_dir, 'checkpoint')
    np.savez(os.path.join(saver_dir, 'args'), args=args)
    summary_writer = tf.summary.FileWriter(
        os.path.join(exp_dir, 'imitate', 'summaries'))

    # build components
    env, act_low, act_high = utils.build_ngsim_env(args, exp_dir,
                                                   vectorize=args.vectorize)
    data = utils.load_data(args.expert_filepath,
                           act_low=act_low,
                           act_high=act_high,
                           min_length=args.env_H + args.env_primesteps,
                           clip_std_multiple=args.normalize_clip_std_multiple,
                           ngsim_filename=args.ngsim_filename)
    critic = utils.build_critic(args, data, env, summary_writer)
    policy = utils.build_policy(args, env)
    recognition_model = utils.build_recognition_model(args, env,
                                                      summary_writer)
    baseline = utils.build_baseline(args, env)
    reward_handler = utils.build_reward_handler(args, summary_writer)
    validator = auto_validator.AutoValidator(
        summary_writer,
        data['obs_mean'],
        data['obs_std'],
        render=args.validator_render,
        render_every=args.render_every,
        flat_recurrent=args.policy_recurrent)

    # build algo
    saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=.5)
    sampler_args = dict(n_envs=args.n_envs) if args.vectorize else None
    if args.policy_recurrent:
        optimizer = ConjugateGradientOptimizer(
            max_backtracks=50,
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    else:
        optimizer = None
    algo = GAIL(critic=critic,
                recognition=recognition_model,
                reward_handler=reward_handler,
                env=env,
                policy=policy,
                baseline=baseline,
                validator=validator,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_itr,
                discount=args.discount,
                step_size=args.trpo_step_size,
                saver=saver,
                saver_filepath=saver_filepath,
                force_batch_sampler=False if args.vectorize else True,
                sampler_args=sampler_args,
                snapshot_env=False,
                plot=False,
                optimizer=optimizer,
                optimizer_args=dict(max_backtracks=50, debug_nan=True))

    # run it
    with tf.Session() as session:
        # Run the initialization here to allow for later loading.
        # NOTE: rllab's batchpolopt runs this before training as well, which
        # means any loading done after that point is nullified; you have to
        # comment out that initialization for loading to work.
        session.run(tf.global_variables_initializer())

        # loading
        if args.params_filepath != '':
            algo.load(args.params_filepath)

        # run training
        algo.train(sess=session)