env=mdp, policy=policy, baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=200, n_itr=2000, step_size=0.01, subsample_factor=1.0, optimizer_args={'num_slices': 10}, ) algorithm = ICM( mdp, algo, "/home/fred/box3d/trpo_box3d_state_v10_tf_icm_cos_%d" % seed, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.1, external_reward_weight=0.0, inverse_tanh=True, init_learning_rate=1e-4, ) run_experiment_lite(algorithm.train(), exp_prefix='trpo_box3d_state_v10_tf_icm_cos', n_parallel=1, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local")
def main():
    """Launch TRPO + ICM runs on ``Box3dReach-v17``, one run per seed.

    Command-line flags:
        --fw_ratio      forwarded to ICM as ``forward_weight`` (default 0.1).
        --init_lr       forwarded to ICM as ``init_learning_rate``
                        (default 5e-4).
        --tfboard_path  prefix of the per-run path handed to ICM
                        (default ``/tmp/tfboard``).
        --gpu_ratio     parsed, but not read anywhere in this function
                        (default 0.99).
    """
    cli = argparse.ArgumentParser()

    # Hyperparameters
    cli.add_argument('--fw_ratio', type=float, default=0.1)
    cli.add_argument('--init_lr', type=float, default=5e-4)
    cli.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    cli.add_argument('--gpu_ratio', type=float, default=0.99)
    args = cli.parse_args()

    # The run name depends only on the CLI flags, so it is the same for
    # every seed; it doubles as the experiment prefix below.
    name = 'trpo-state-v17-tf-icm-fw{}-initlr-{}-norm'.format(
        args.fw_ratio,
        args.init_lr,
    )

    # Param ranges
    for seed in range(2):
        gym_env = GymEnv(
            'Box3dReach-v17',
            record_video=False,
            log_dir='/tmp/gym_test',
            record_log=False,
        )
        mdp = TfEnv(normalize(env=gym_env, normalize_obs=True))

        policy = GaussianMLPPolicy(
            "mlp_policy",
            env_spec=mdp.spec,
            hidden_sizes=(64, 64, 32),
            output_nonlinearity=tf.nn.tanh,
            clip_action=False,
        )
        baseline = LinearFeatureBaseline(mdp.spec)

        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            batch_size=50000,
            whole_paths=True,
            max_path_length=1000,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
            sampler_cls=BatchSampler,
        )

        # Per-seed path: "<tfboard_path>/<name>_<seed>".
        run_path = "%s/%s_%d" % (args.tfboard_path, name, seed)
        algorithm = ICM(
            mdp,
            algo,
            run_path,
            feature_dim=mdp.spec.observation_space.flat_dim,
            forward_weight=args.fw_ratio,
            external_reward_weight=0.0,
            replay_pool_size=1000000,
            init_learning_rate=args.init_lr,
            n_updates_per_iter=1000,
        )

        run_experiment_lite(
            algorithm.train(),
            exp_prefix=name,
            n_parallel=8,
            snapshot_mode="gap",
            snapshot_gap=200,
            seed=seed,
            mode="local",
        )
# Build the Idle algorithm, wrap it in ICM and launch the run locally.
# `env`, `policy`, `baseline` and `seed` are defined outside this view.
idle = Idle(
    env=env,
    policy=policy,
    baseline=baseline,
)

# NOTE(review): presumably the log/TensorBoard output directory for this
# seed -- confirm against ICM's signature.
icm_run_path = (
    '/home/fshentu/box3d/trpo_box3d_pixel_v8_tf_idle_normalize_fw_0.2_%d'
    % seed
)
icm = ICM(
    env,
    idle,
    icm_run_path,
    feature_dim=256,
    forward_weight=0.2,
    inverse_tanh=True,
    init_learning_rate=5e-4,
    icm_batch_size=128,
    replay_pool_size=10000,
    n_updates_per_iter=200,
    obs_dtype='uint8',
    normalize_input=True,
)

icm_exp_prefix = 'trpo_box3d_pixel_v8_tf_idle_normalize_fw0.2'
run_experiment_lite(
    icm.train(),
    exp_prefix=icm_exp_prefix,
    n_parallel=1,
    snapshot_mode="gap",
    snapshot_gap=200,
    seed=seed,
    mode="local",
)
# Build the Idle algorithm, wrap it in ICM and launch the run locally.
# `env`, `policy`, `baseline` and `seed` are defined outside this view.

# Not referenced by the statements below; kept because code outside this
# view may read it.
batch_size = 2400

idle = Idle(
    env=env,
    policy=policy,
    baseline=baseline,
)

# NOTE(review): presumably the log/TensorBoard output directory for this
# seed -- confirm against ICM's signature.
icm_run_path = (
    '/home/fshentu/box3d/trpo_box3d_pixel_v8_tf_idle_normalize_fw_%d' % seed
)
icm = ICM(
    env,
    idle,
    icm_run_path,
    feature_dim=256,
    forward_weight=0.2,
    inverse_tanh=True,
    init_learning_rate=1e-4,
    icm_batch_size=128,
    replay_pool_size=1000000,
    n_updates_per_iter=200,
    obs_dtype='uint8',
    normalize_input=True,
)

icm_exp_prefix = 'trpo_box3d_pixel_v8_tf_idle_normalize'
run_experiment_lite(
    icm.train(),
    exp_prefix=icm_exp_prefix,
    n_parallel=1,
    snapshot_mode="gap",
    snapshot_gap=200,
    seed=seed,
    mode="local",
)
batch_size=batch_size, whole_paths=True, max_path_length=200, n_itr=1000, step_size=0.01, subsample_factor=1.0, optimizer_args={'num_slices': 10}, sampler_cls=BatchSampler, ) algorithm = ICM( mdp, algo, "/home/fred/box3d/trpo_box3d_state_v10_tf_icm_cos_new_ext0.95_%d" % seed, feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=0.1, external_reward_weight=0.95, inverse_tanh=True, init_learning_rate=1e-4, n_updates_per_iter=500) run_experiment_lite( algorithm.train(), exp_prefix='trpo_box3d_state_v10_tf_icm_cos_new_ext0.95', n_parallel=8, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local")