env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=200,
        n_itr=2000,
        step_size=0.01,
        subsample_factor=1.0,
        optimizer_args={'num_slices': 10},
    )

    # Wrap the environment (`mdp`) and the already-constructed `algo` in ICM.
    # The third positional argument is a per-seed path string.
    # NOTE(review): presumably this is ICM's TensorBoard/log directory -- confirm
    # against ICM's signature, which is not visible here.
    algorithm = ICM(
        mdp,
        algo,
        "/home/fred/box3d/trpo_box3d_state_v10_tf_icm_cos_%d" % seed,
        # Feature size is taken from the flattened observation dimension.
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.1,
        # NOTE(review): 0.0 presumably means the environment reward is ignored
        # and only the intrinsic signal is used -- confirm in ICM.
        external_reward_weight=0.0,
        inverse_tanh=True,
        init_learning_rate=1e-4,
    )

    # NOTE(review): `algorithm.train()` is evaluated right here and its return
    # value is what gets handed to run_experiment_lite; this only defers the
    # training run if the objects above are rllab stubs -- confirm.
    run_experiment_lite(algorithm.train(),
                        exp_prefix='trpo_box3d_state_v10_tf_icm_cos',
                        n_parallel=1,
                        snapshot_mode="gap",
                        snapshot_gap=200,
                        seed=seed,
                        mode="local")
コード例 #2
0
def main():
    """Launch TRPO wrapped in ICM on 'Box3dReach-v17' for seeds 0 and 1.

    Command-line flags:
        --fw_ratio: forwarded to ICM as ``forward_weight`` (default 0.1).
        --init_lr: forwarded to ICM as ``init_learning_rate`` (default 5e-4).
        --tfboard_path: prefix of the per-run path handed to ICM
            (default '/tmp/tfboard').
        --gpu_ratio: accepted on the command line, but not read anywhere in
            this function (default 0.99).

    For every seed a fresh environment, policy, baseline, TRPO instance and
    ICM wrapper are built, and the result of ``train()`` on the ICM wrapper
    is passed to ``run_experiment_lite`` in local mode.
    """
    cli = argparse.ArgumentParser()
    # Hyperparameters
    cli.add_argument('--fw_ratio', type=float, default=0.1)
    cli.add_argument('--init_lr', type=float, default=5e-4)

    cli.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    cli.add_argument('--gpu_ratio', type=float, default=0.99)

    args = cli.parse_args()

    # The experiment name depends only on the flags, so it is built once
    # rather than on every pass through the seed loop.
    exp_name = 'trpo-state-v17-tf-icm-fw{}-initlr-{}-norm'.format(
        args.fw_ratio, args.init_lr)

    # Param ranges
    for seed in range(2):
        gym_env = GymEnv(
            'Box3dReach-v17',
            record_video=False,
            log_dir='/tmp/gym_test',
            record_log=False,
        )
        wrapped_env = TfEnv(normalize(env=gym_env, normalize_obs=True))

        policy = GaussianMLPPolicy(
            "mlp_policy",
            env_spec=wrapped_env.spec,
            hidden_sizes=(64, 64, 32),
            output_nonlinearity=tf.nn.tanh,
            clip_action=False,
        )

        baseline = LinearFeatureBaseline(wrapped_env.spec)

        trpo = TRPO(
            env=wrapped_env,
            policy=policy,
            baseline=baseline,
            batch_size=50000,
            whole_paths=True,
            max_path_length=1000,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
            sampler_cls=BatchSampler,
        )

        # One output path per (experiment name, seed) pair.
        run_path = args.tfboard_path + "/%s_%d" % (exp_name, seed)
        icm_trainer = ICM(
            wrapped_env,
            trpo,
            run_path,
            feature_dim=wrapped_env.spec.observation_space.flat_dim,
            forward_weight=args.fw_ratio,
            external_reward_weight=0.0,
            replay_pool_size=1000000,
            init_learning_rate=args.init_lr,
            n_updates_per_iter=1000,
        )

        run_experiment_lite(
            icm_trainer.train(),
            exp_prefix=exp_name,
            n_parallel=8,
            snapshot_mode="gap",
            snapshot_gap=200,
            seed=seed,
            mode="local",
        )
    # Build the Idle algorithm object from the env, policy and baseline that
    # were set up earlier in this snippet (outside the visible lines).
    idle = Idle(
        env=env,
        policy=policy,
        baseline=baseline,
    )

    # Wrap `env` and `idle` in ICM. The third positional argument is a
    # per-seed path string whose "fw_0.2" part matches forward_weight below.
    # NOTE(review): presumably this path is ICM's TensorBoard/log directory --
    # confirm against ICM's signature, which is not visible here.
    icm = ICM(
        env,
        idle,
        '/home/fshentu/box3d/trpo_box3d_pixel_v8_tf_idle_normalize_fw_0.2_%d' %
        seed,
        feature_dim=256,
        forward_weight=0.2,
        inverse_tanh=True,
        init_learning_rate=5e-4,
        icm_batch_size=128,
        replay_pool_size=10000,
        n_updates_per_iter=200,
        # NOTE(review): 'uint8' together with normalize_input=True suggests raw
        # pixel observations that ICM rescales itself -- confirm in ICM.
        obs_dtype='uint8',
        normalize_input=True,
    )

    # NOTE(review): `icm.train()` is evaluated right here and its return value
    # is what gets handed to run_experiment_lite; this only defers the training
    # run if the objects above are rllab stubs -- confirm.
    run_experiment_lite(
        icm.train(),
        exp_prefix='trpo_box3d_pixel_v8_tf_idle_normalize_fw0.2',
        n_parallel=1,
        snapshot_mode="gap",
        snapshot_gap=200,
        seed=seed,
        mode="local")
コード例 #4
0
    # NOTE(review): `batch_size` is assigned here but is not referenced by any
    # of the visible statements below -- check whether Idle was meant to get it.
    batch_size = 2400
    # Build the Idle algorithm object from the env, policy and baseline that
    # were set up earlier in this snippet (outside the visible lines).
    idle = Idle(
        env=env,
        policy=policy,
        baseline=baseline,
    )

    # Wrap `env` and `idle` in ICM. The third positional argument is a
    # per-seed path string; unlike forward_weight below, the "fw" part of the
    # path carries no value.
    # NOTE(review): presumably this path is ICM's TensorBoard/log directory --
    # confirm against ICM's signature, which is not visible here.
    icm = ICM(
        env,
        idle,
        '/home/fshentu/box3d/trpo_box3d_pixel_v8_tf_idle_normalize_fw_%d' %
        seed,
        feature_dim=256,
        forward_weight=0.2,
        inverse_tanh=True,
        init_learning_rate=1e-4,
        icm_batch_size=128,
        replay_pool_size=1000000,
        n_updates_per_iter=200,
        # NOTE(review): 'uint8' together with normalize_input=True suggests raw
        # pixel observations that ICM rescales itself -- confirm in ICM.
        obs_dtype='uint8',
        normalize_input=True,
    )

    # NOTE(review): `icm.train()` is evaluated right here and its return value
    # is what gets handed to run_experiment_lite; this only defers the training
    # run if the objects above are rllab stubs -- confirm.
    run_experiment_lite(icm.train(),
                        exp_prefix='trpo_box3d_pixel_v8_tf_idle_normalize',
                        n_parallel=1,
                        snapshot_mode="gap",
                        snapshot_gap=200,
                        seed=seed,
                        mode="local")
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=200,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
        optimizer_args={'num_slices': 10},
        sampler_cls=BatchSampler,
    )

    # Wrap the environment (`mdp`) and the already-constructed `algo` in ICM.
    # The third positional argument is a per-seed path string whose "ext0.95"
    # part matches external_reward_weight below.
    # NOTE(review): presumably this path is ICM's TensorBoard/log directory --
    # confirm against ICM's signature, which is not visible here.
    algorithm = ICM(
        mdp,
        algo,
        "/home/fred/box3d/trpo_box3d_state_v10_tf_icm_cos_new_ext0.95_%d" %
        seed,
        # Feature size is taken from the flattened observation dimension.
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.1,
        # NOTE(review): presumably the mixing weight given to the environment
        # reward relative to the intrinsic reward -- confirm in ICM.
        external_reward_weight=0.95,
        inverse_tanh=True,
        init_learning_rate=1e-4,
        n_updates_per_iter=500)

    # NOTE(review): `algorithm.train()` is evaluated right here and its return
    # value is what gets handed to run_experiment_lite; this only defers the
    # training run if the objects above are rllab stubs -- confirm.
    run_experiment_lite(
        algorithm.train(),
        exp_prefix='trpo_box3d_state_v10_tf_icm_cos_new_ext0.95',
        n_parallel=8,
        snapshot_mode="gap",
        snapshot_gap=200,
        seed=seed,
        mode="local")