env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=200,
        n_itr=2000,
        step_size=0.01,
        subsample_factor=1.0,
        optimizer_args={'num_slices': 10},
    )

    # Build the ICM object from the environment `mdp`, the `algo` object
    # defined earlier in this function (its construction is only partly
    # visible here), a seed-suffixed path string and keyword hyperparameters.
    # NOTE(review): the path is a hard-coded absolute home-directory path;
    # presumably a TensorBoard/log directory — confirm against ICM's signature.
    algorithm = ICM(
        mdp,
        algo,
        "/home/fred/box3d/trpo_box3d_state_v10_tf_icm_cos_%d" % seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.1,
        external_reward_weight=0.0,
        inverse_tanh=True,
        init_learning_rate=1e-4,
    )

    # NOTE(review): `algorithm.train()` is written as a call expression in the
    # argument list; presumably the file runs under rllab's `stub(globals())`
    # so this yields a deferred call rather than training inline — confirm.
    run_experiment_lite(algorithm.train(),
                        exp_prefix='trpo_box3d_state_v10_tf_icm_cos',
                        n_parallel=1,
                        snapshot_mode="gap",
                        snapshot_gap=200,
                        seed=seed,
                        mode="local")
Esempio n. 2
0
    # NOTE(review): `batch_size` is assigned here but not referenced by any of
    # the statements visible below — confirm whether it is still needed.
    batch_size = 2400
    # `Idle` receives only env/policy/baseline (no iteration or step settings).
    idle = Idle(
        env=env,
        policy=policy,
        baseline=baseline,
    )

    # Build the ICM object around `idle`, with a seed-suffixed path string.
    # `obs_dtype='uint8'` and `normalize_input=True` are passed through as-is;
    # presumably observations are raw pixels — confirm against the env.
    icm = ICM(
        env,
        idle,
        '/home/fshentu/box3d/trpo_box3d_pixel_v8_tf_idle_normalize_fw_%d' %
        seed,
        feature_dim=256,
        forward_weight=0.2,
        inverse_tanh=True,
        init_learning_rate=1e-4,
        icm_batch_size=128,
        replay_pool_size=1000000,
        n_updates_per_iter=200,
        obs_dtype='uint8',
        normalize_input=True,
    )

    # NOTE(review): `icm.train()` is evaluated as an argument; presumably this
    # relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(icm.train(),
                        exp_prefix='trpo_box3d_pixel_v8_tf_idle_normalize',
                        n_parallel=1,
                        snapshot_mode="gap",
                        snapshot_gap=200,
                        seed=seed,
                        mode="local")
Esempio n. 3
0
def main(argv=None):
    """Configure and submit one TRPO + ICM experiment per seed.

    Command-line flags:
        --fw_ratio      float, forwarded to ``ICM`` as ``forward_weight``.
        --init_lr       float, forwarded to ``ICM`` as ``init_learning_rate``.
        --tfboard_path  str, directory prefix of the per-run path string that
                        is passed to ``ICM`` as its third positional argument.
        --gpu_ratio     float, accepted for command-line compatibility but
                        not read anywhere in this function.

    For each seed in ``range(2)`` the function builds a normalized
    ``Box3dReach-v17`` environment, a Gaussian MLP policy, a linear feature
    baseline and a ``TRPO`` instance, wraps them in ``ICM`` and hands
    ``algorithm.train()`` to ``run_experiment_lite``.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes ``argparse`` read ``sys.argv[1:]``, which is what
            the previous zero-argument signature always did.
    """
    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--fw_ratio', type=float, default=0.1)
    parser.add_argument('--init_lr', type=float, default=5e-4)

    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    # NOTE(review): parsed but never consumed below; kept so existing command
    # lines that pass --gpu_ratio keep working.
    parser.add_argument('--gpu_ratio', type=float, default=0.99)

    args = parser.parse_args(argv)

    # Param ranges
    seeds = range(2)

    # Loop invariants: neither depends on the seed, so compute them once.
    name = 'trpo-state-v17-tf-icm-fw{}-initlr-{}-norm'.format(
        args.fw_ratio, args.init_lr)
    batch_size = 50000

    for seed in seeds:
        # A fresh environment/policy/baseline/algorithm is built per seed.
        mdp = TfEnv(
            normalize(
                env=GymEnv(
                    'Box3dReach-v17',
                    record_video=False,
                    log_dir='/tmp/gym_test',
                    record_log=False,
                ),
                normalize_obs=True,
            )
        )

        policy = GaussianMLPPolicy(
            "mlp_policy",
            env_spec=mdp.spec,
            hidden_sizes=(64, 64, 32),
            output_nonlinearity=tf.nn.tanh,
            clip_action=False,
        )

        baseline = LinearFeatureBaseline(mdp.spec)

        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=1000,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
            sampler_cls=BatchSampler,
        )

        # Same string as the original `tfboard_path + "/%s_%d" % (name, seed)`.
        tfboard_dir = '{}/{}_{}'.format(args.tfboard_path, name, seed)

        algorithm = ICM(
            mdp,
            algo,
            tfboard_dir,
            feature_dim=mdp.spec.observation_space.flat_dim,
            forward_weight=args.fw_ratio,
            external_reward_weight=0.0,
            replay_pool_size=1000000,
            init_learning_rate=args.init_lr,
            n_updates_per_iter=1000,
        )

        # NOTE(review): `algorithm.train()` is evaluated as an argument here;
        # presumably the module runs under rllab's `stub(globals())` so this
        # is a deferred call rather than an inline training run — confirm.
        run_experiment_lite(
            algorithm.train(),
            exp_prefix=name,
            n_parallel=8,
            snapshot_mode="gap",
            snapshot_gap=200,
            seed=seed,
            mode="local",
        )
Esempio n. 4
0
    )

    # NOTE(review): `batch_size` is not referenced by the statements visible
    # in this fragment — confirm whether it is used further down.
    batch_size = 2400
    # `Idle` is constructed from env/policy/baseline only.
    idle = Idle(
        env=env, 
        policy=policy, 
        baseline=baseline,
    )

    # Build the ICM object around `idle`, with a seed-suffixed path string.
    # NOTE(review): hard-coded absolute home-directory path.
    icm = ICM(
        env,
        idle,
        '/home/fred/box3d/trpo_box3d_pixel_v8_tf_idle_%d'%seed,
        feature_dim=256,
        forward_weight=0.2,
        inverse_tanh=True,
        init_learning_rate=1e-4,
        icm_batch_size=128,
        replay_pool_size=1000000,
        n_updates_per_iter=200,
        obs_dtype='uint8',
    )

    run_experiment_lite(
        icm.train(),
        exp_prefix='trpo_box3d_pixel_v8_tf_idle',
        n_parallel=1,
        snapshot_mode="gap",
        snapshot_gap=200,
        seed=seed,
        mode="local"
Esempio n. 5
0
    )

    baseline = LinearFeatureBaseline(mdp.spec, )

    # NOTE(review): `batch_size` is assigned but not passed to `Idle` or `ICM`
    # in the statements visible below — confirm whether it is still needed.
    batch_size = 50000
    idle = Idle(
        env=mdp,
        policy=policy,
        baseline=baseline,
        n_itr=2000,
    )

    # Build the ICM object around `idle`; the feature dimension is taken from
    # the environment's flattened observation size.
    algorithm = ICM(
        mdp,
        idle,
        "/x/mujoco/tfboard_box3d/trpo_box3d_state_v10_tf_icm_idle_%d" % seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.1,
        external_reward_weight=0.0,
        init_learning_rate=1e-4,
        n_updates_per_iter=500,
    )

    # NOTE(review): `algorithm.train()` is evaluated as an argument; presumably
    # this relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(algorithm.train(),
                        exp_prefix='trpo_box3d_state_v10_tf_icm_idle',
                        n_parallel=8,
                        snapshot_mode="gap",
                        snapshot_gap=200,
                        seed=seed,
                        mode="local")
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=200,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
        optimizer_args={'num_slices': 10},
        sampler_cls=BatchSampler,
    )

    # Build the ICM object around `algo` (constructed earlier in this
    # function; only the tail of that call is visible here).
    # `external_reward_weight=0.95` matches the "ext0.95" tag used in both the
    # path string and `exp_prefix` below.
    algorithm = ICM(
        mdp,
        algo,
        "/home/fred/box3d/trpo_box3d_state_v10_tf_icm_cos_new_ext0.95_%d" %
        seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.1,
        external_reward_weight=0.95,
        inverse_tanh=True,
        init_learning_rate=1e-4,
        n_updates_per_iter=500)

    # NOTE(review): `algorithm.train()` is evaluated as an argument; presumably
    # this relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(
        algorithm.train(),
        exp_prefix='trpo_box3d_state_v10_tf_icm_cos_new_ext0.95',
        n_parallel=8,
        snapshot_mode="gap",
        snapshot_gap=200,
        seed=seed,
        mode="local")
Esempio n. 7
0
                step_size=0.01,
                subsample_factor=0.2,
                sampler_cls=BatchSampler,
                optimizer_args={
                    'num_slices': 4,
                })

    # Build the ICM object around `algo` with a seed-suffixed path string.
    # `pretrained_icm=True` is combined with a hard-coded pickle path;
    # presumably ICM weights are initialised from that file — confirm against
    # ICM's implementation.
    # NOTE(review): unlike the other call sites of this kind, no `feature_dim`
    # is passed here — confirm the default is intended.
    icm = ICM(
        env,
        algo,
        '/z/dianchen/box3d/trpo_box3d_pixel_v11_tf_icm_pretrained_cnn_norew_fw0.01_%d'
        % seed,
        forward_weight=0.01,
        external_reward_weight=0.0,
        init_learning_rate=1e-4,
        forward_cos=True,
        replay_pool_size=100000,
        n_updates_per_iter=1000,
        normalize_input=True,
        obs_dtype='uint8',
        pretrained_icm=True,
        pretrained_icm_path=
        '/z/dianchen/tfmodel_box3d/icm_supervised_box3dpixel_v11_box_dense_2e3_fw_0.01_lr_5e-4.pkl',
    )

    run_experiment_lite(
        icm.train(),
        exp_prefix='trpo_box3d_pixel_v11_tf_icm_pretrained_cnn_norew_fw0.01',
        n_parallel=12,
        snapshot_mode="gap",
        snapshot_gap=200,
Esempio n. 8
0
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=1000,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
        sampler_cls=BatchSampler,
    )

    # Build the ICM object around `algo` with `pretrained_icm=True`, a
    # hard-coded snapshot path (itr_1000.pkl) and `freeze_icm=True`.
    # NOTE(review): presumably the ICM weights are loaded and then kept fixed
    # during training — confirm against ICM's implementation. No
    # `forward_weight` or `init_learning_rate` is passed here.
    algorithm = ICM(
        mdp,
        algo,
        "/media/4tb/box3d/trpo_box3d_state_v12_tf_icm_frozen_fw0.1_frozen_%d" %
        seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        pretrained_icm=True,
        pretrained_icm_path=
        "/home/fred/rllab/data/local/trpo-state-v12-tf-icm-fw0.1-initlr-0.001/trpo-state-v12-tf-icm-fw0.1-initlr-0.001_2017_07_16_22_12_20_0001/itr_1000.pkl",
        freeze_icm=True,
        external_reward_weight=0.0,
    )

    # NOTE(review): `algorithm.train()` is evaluated as an argument; presumably
    # this relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(algorithm.train(),
                        exp_prefix='trpo_box3d_state_v12_tf_icm_frozen_fw0.1',
                        n_parallel=8,
                        snapshot_mode="gap",
                        snapshot_gap=200,
                        seed=seed,
                        mode="local")
Esempio n. 9
0
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=1000,
        n_itr=10000,
        step_size=0.01,
        subsample_factor=1.0,
        sampler_cls=BatchSampler,
    )

    # Build the ICM object around `algo`; `forward_weight=0.3` and
    # `init_learning_rate=5e-4` match the "fw0.3_5e-4" tag in the path string
    # and in `exp_prefix` below.
    # NOTE(review): `replay_pool_size` is 100,000,000 here, far larger than at
    # the other call sites of this kind — confirm the memory cost is intended.
    algorithm = ICM(
        mdp,
        algo,
        "/x/mujoco/tfboard_box3d/trpo_box3d_state_v12_tf_icm_fw0.3_5e-4_%d"%seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.3,
        external_reward_weight=0.0,
        replay_pool_size=100000000,
        init_learning_rate=5e-4,
        n_updates_per_iter=2000,
    )

    # NOTE(review): `algorithm.train()` is evaluated as an argument; presumably
    # this relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(
        algorithm.train(),
        exp_prefix='trpo_box3d_state_v12_tf_icm_fw0.3_5e-4',
        n_parallel=8,
        snapshot_mode="gap",
        snapshot_gap=200,
        seed=seed,
        mode="local"
    )
Esempio n. 10
0
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=200,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
        sampler_cls=BatchSampler,
    )

    # Build the ICM object around `algo`; `external_reward_weight=0.9995`
    # matches the "ext0.9995" tag in the path string and in `exp_prefix`.
    algorithm = ICM(
        mdp,
        algo,
        "/z/dianchen/box3d/trpo_box3d_state_v4_tf_icm_ext0.9995_%d" % seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.1,
        external_reward_weight=0.9995,
        replay_pool_size=500000,
        init_learning_rate=5e-4,
        n_updates_per_iter=1000)

    # NOTE(review): `algorithm.train()` is evaluated as an argument; presumably
    # this relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(algorithm.train(),
                        exp_prefix='trpo_box3d_state_v4_tf_icm_ext0.9995',
                        n_parallel=6,
                        snapshot_mode="gap",
                        snapshot_gap=100,
                        seed=seed,
                        mode="local")
Esempio n. 11
0
        env=mdp,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=500,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
    )

    # Build the ICM object around `algo`.
    # NOTE(review): the path string has no seed suffix, although `seed` is
    # passed to run_experiment_lite below; runs with different seeds would
    # share this path — confirm that is intended.
    algorithm = ICM(
        mdp,
        algo,
        "/data0/dianchen/box3d/trpo_box3d_state_v4_tf_icm",
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.2,
        external_reward_weight=0.0,
        inverse_tanh=True,
        init_learning_rate=1e-4,
    )

    # NOTE(review): `algorithm.train()` is evaluated as an argument; presumably
    # this relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(
        algorithm.train(),
        exp_prefix='trpo_box3d_state_v4_tf_icm',
        n_parallel=1,
        snapshot_mode="gap",
        snapshot_gap=100,
        seed=seed,
        mode="local"
    )
Esempio n. 12
0
        mdp.spec,
    )

    # NOTE(review): `batch_size` is assigned but not passed to `Idle` or `ICM`
    # in the statements visible below — confirm whether it is still needed.
    batch_size = 5000
    # Here `algo` is an `Idle` instance built from env/policy/baseline only.
    algo = Idle(
        env=mdp, 
        policy=policy, 
        baseline=baseline,
    )

    # Build the ICM object around `algo`; `no_encoder=False` is passed
    # explicitly.
    # NOTE(review): the path string has no seed suffix, although `seed` is
    # passed to run_experiment_lite below — confirm that is intended.
    algorithm = ICM(
        mdp,
        algo,
        "/home/dianchen/box3d/trpo_box3d_state_v4_tf_icm_idle",
        no_encoder=False,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.2,
        external_reward_weight=0.99,
        inverse_tanh=True,
        init_learning_rate=1e-3,
    )

    # NOTE(review): `algorithm.train()` is evaluated as an argument; presumably
    # this relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(
        algorithm.train(),
        exp_prefix='trpo_box3d_state_v4_tf_icm_idle',
        n_parallel=1,
        snapshot_mode="gap",
        snapshot_gap=100,
        seed=seed,
        mode="local"
    )
Esempio n. 13
0
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=200,
        n_itr=2000,
        step_size=0.01,
        subsample_factor=1.0,
    )

    # Build the ICM object around `trpo` (constructed earlier in this
    # function; only the tail of that call is visible here), with a fixed
    # feature dimension of 256 and a seed-suffixed path string.
    icm = ICM(
        env,
        trpo,
        '/home/fred/box3d/trpo_box3d_pixel_v7_tf_icm_%d' % seed,
        feature_dim=256,
        forward_weight=0.2,
        inverse_tanh=True,
        init_learning_rate=1e-4,
        icm_batch_size=128,
        replay_pool_size=1000000,
        n_updates_per_iter=200,
    )

    # NOTE(review): `icm.train()` is evaluated as an argument; presumably this
    # relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(icm.train(),
                        exp_prefix='trpo_box3d_pixel_v7_tf_icm',
                        n_parallel=1,
                        snapshot_mode="gap",
                        snapshot_gap=200,
                        seed=seed,
                        mode="local")
        baseline=baseline,
        batch_size=batch_size,
        whole_paths=True,
        max_path_length=200,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
        sampler_cls=BatchSampler,
    )

    # Build the ICM object around `algo`; `external_reward_weight=0.995`
    # matches the "ext0.995" tag in the path string and in `exp_prefix`.
    # NOTE(review): the path and prefix contain "cos", but no `forward_cos` or
    # `inverse_tanh` argument is passed here — confirm the naming is accurate.
    algorithm = ICM(
        mdp,
        algo,
        "/home/fred/box3d/trpo_box3d_state_v11_tf_icm_cos_ext0.995_%d"%seed,
        feature_dim=mdp.spec.observation_space.flat_dim,
        forward_weight=0.1,
        external_reward_weight=0.995,
        init_learning_rate=1e-4,
        replay_pool_size=500000,
        n_updates_per_iter=500
    )

    # NOTE(review): `algorithm.train()` is evaluated as an argument; presumably
    # this relies on rllab's stub mode to defer execution — confirm.
    run_experiment_lite(
        algorithm.train(),
        exp_prefix='trpo_box3d_state_v11_tf_icm_cos_ext0.995',
        n_parallel=8,
        snapshot_mode="gap",
        snapshot_gap=200,
        seed=seed,
        mode="local"
    )