Example #1
def main():
    # Imports assumed by this snippet:
    import random
    import tensorflow as tf
    from railrl.envs.memory.high_low import HighLow
    from railrl.launchers.launcher_util import run_experiment, set_seed
    # SeparateLstmLinearCell and bptt_launcher also come from the railrl
    # codebase; their exact module paths are not shown in this snippet.
    n_seeds = 1
    mode = "here"
    exp_prefix = "dev-sl"

    # n_seeds = 10
    # mode = "ec2"
    exp_prefix = "paper-6-14-HL-sl-H25"

    H = 25
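    # Supervised-learning (BPTT) run on the HighLow memory env with horizon H.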
    # noinspection PyTypeChecker
    variant = dict(
        H=H,
        exp_prefix=exp_prefix,
        algo_params=dict(
            num_batches_per_epoch=100,
            num_epochs=30,
            learning_rate=1e-3,
            batch_size=1000,
            eval_num_episodes=64,
            lstm_state_size=10,
            # rnn_cell_class=LSTMCell,
            # rnn_cell_params=dict(
            #     use_peepholes=True,
            # ),
            rnn_cell_class=SeparateLstmLinearCell,
            rnn_cell_params=dict(
                use_peepholes=True,
                env_noise_std=0,
                memory_noise_std=0,
                output_nonlinearity=tf.nn.tanh,
                # output_nonlinearity=tf.nn.softmax,
                env_hidden_sizes=[],
                output_dim=1,
            ),
            softmax=False,
        ),
        version='Supervised Learning',
        env_class=HighLow,
        env_params=dict(horizon=H),
        # env_class=OneCharMemory,
    )

    exp_id = -1
    for _ in range(n_seeds):
        seed = random.randint(0, 999999)
        exp_id += 1
        set_seed(seed)
        variant['seed'] = seed
        variant['exp_id'] = exp_id

        run_experiment(
            bptt_launcher,
            exp_prefix=exp_prefix,
            seed=seed,
            mode=mode,
            variant=variant,
            exp_id=exp_id,
        )
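These launcher snippets are typically run as standalone scripts; a minimal entry point, assuming main() is defined at module level as above, would be:

if __name__ == "__main__":
    main()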
Example #2
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg import DDPG
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.tf.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from railrl.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    memory_dim = variant['memory_dim']
    ou_params = variant['ou_params']

    set_seed(seed)

    """
    Code for running the experiment.
    """

    env = env_class(**env_params)
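    # Augment the base env with a continuous external memory, then flatten the
    # resulting product observation/action spaces so DDPG sees ordinary flat
    # Box spaces.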
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=memory_dim,
    )
    env = FlattenedProductBox(env)

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    es = OUStrategy(
        env_spec=env.spec,
        **ou_params
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **algo_params
    )

    algorithm.train()
Example #3
def run_linear_ocm_exp(variant):
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
    import sandbox.rocky.tf.core.layers as L
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from railrl.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly, )
    from railrl.envs.memory.high_low import HighLow
    from railrl.launchers.launcher_util import (
        set_seed, )
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']

    set_seed(seed)
    onehot_dim = num_values + 1
    """
    Code for running the experiment.
    """

    # env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = HighLow(num_steps=H)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    env = FlattenedProductBox(env)
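    # The policy below is a Gaussian LSTM trained with TRPO (conjugate gradient
    # optimizer with finite-difference Hessian-vector products).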

    policy = GaussianLSTMPolicy(
        name="policy",
        env_spec=env.spec,
        lstm_layer_cls=L.LSTMLayer,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    trpo_params = variant['trpo_params']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **trpo_params)

    algo.train()
Example #4
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg import DDPG
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.qfunctions.memory.mlp_memory_qfunction import MlpMemoryQFunction
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from railrl.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly, )
    from railrl.launchers.launcher_util import (
        set_seed, )
    # ActionAwareMemoryPolicy (used below) also comes from the railrl codebase;
    # its module path is not shown in this snippet.
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']
    algo_params = variant['algo_params']
    # `memory_dim` is used by the policy below but was never defined in the
    # original snippet; it is assumed to be supplied in the variant, as in the
    # DDPG example above.
    memory_dim = variant['memory_dim']

    set_seed(seed)
    onehot_dim = num_values + 1

    env_action_dim = num_values + 1
    """
    Code for running the experiment.
    """

    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    # env = FlattenedProductBox(env)

    # qf = FeedForwardCritic(
    #     name_or_scope="critic",
    #     env_spec=env.spec,
    # )
    qf = MlpMemoryQFunction(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = ActionAwareMemoryPolicy(
        name_or_scope="noisy_policy",
        action_dim=env_action_dim,
        memory_dim=memory_dim,
        env_spec=env.spec,
    )
    es = OUStrategy(env_spec=env.spec)
    algorithm = DDPG(env, es, policy, qf, **algo_params)

    algorithm.train()
Example #5
def experiment(variant):
    from railrl.torch.ddpg import DDPG
    from railrl.launchers.launcher_util import (
        set_seed, )
    # TwoDPoint (the environment instantiated below) also needs to be imported;
    # its module path is not shown in this snippet.
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_params = variant['env_params']
    es_class = variant['es_class']
    es_params = variant['es_params']

    set_seed(seed)
    env = TwoDPoint(**env_params)
    es = es_class(env_spec=env.spec, **es_params)
    algorithm = DDPG(env, es, **algo_params)
    algorithm.train()
Example #6
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg_ocm import DdpgOcm
    from railrl.qfunctions.memory.mlp_memory_qfunction import MlpMemoryQFunction
    from railrl.exploration_strategies.noop import NoopStrategy
    from railrl.exploration_strategies.onehot_sampler import OneHotSampler
    from railrl.exploration_strategies.product_strategy import ProductStrategy
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from railrl.envs.memory.one_char_memory import OneCharMemoryEndOnly
    from railrl.tf.policies.memory.linear_ocm_policy import LinearOcmPolicy
    from railrl.launchers.launcher_util import (
        set_seed, )
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']
    ddpg_params = variant['ddpg_params']

    onehot_dim = num_values + 1
    set_seed(seed)
    """
    Code for running the experiment.
    """

    env = OneCharMemoryEndOnly(n=num_values, num_steps=H)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )

    policy = LinearOcmPolicy(
        name_or_scope="policy",
        memory_and_action_dim=onehot_dim,
        env_spec=env.spec,
    )
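    # The exploration strategy below covers the (env action, memory write)
    # product space: one-hot samples for the env action, no added noise on the
    # memory write.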

    es = ProductStrategy([OneHotSampler(), NoopStrategy()])
    qf = MlpMemoryQFunction(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    algorithm = DdpgOcm(env, es, policy, qf, **ddpg_params)

    algorithm.train()
Example #7
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg import DDPG
    from railrl.launchers.launcher_util import (
        set_seed, )
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.tf.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    ou_params = variant['ou_params']

    set_seed(seed)
    """
    Code for running the experiment.
    """

    env = env_class(**env_params)
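    # Below: vanilla DDPG with a feed-forward critic and policy and OU
    # exploration noise.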

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    es = OUStrategy(env_spec=env.spec, **ou_params)
    algorithm = DDPG(env, es, policy, qf, **algo_params)

    algorithm.train()
Example #8
    for launcher in [
            # trpo_launcher,
            # mem_trpo_launcher,
            # rtrpo_launcher,
            ddpg_launcher,
            mem_ddpg_launcher,
            rdpg_launcher,
    ]:
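        # Run every combination in search_space (merged onto the default
        # variant) with n_seeds seeds each.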
        search_space = {
            # 'env_class': [WaterMaze1D, WaterMazeEasy1D, WaterMazeMemory1D],
        }
        sweeper = DeterministicHyperparameterSweeper(
            search_space, default_parameters=variant)
        for exp_id, variant in enumerate(sweeper.iterate_hyperparameters()):
            for seed in range(n_seeds):
                # Note: exp_id is bumped once per seed here, then reset by
                # enumerate() on the next hyperparameter setting.
                exp_id += 1
                set_seed(seed)
                variant['seed'] = seed
                variant['exp_id'] = exp_id

                run_experiment(
                    launcher,
                    exp_prefix=exp_prefix,
                    seed=seed,
                    mode=mode,
                    variant=variant,
                    exp_id=exp_id,
                    snapshot_mode='last',
                    use_gpu=use_gpu,
                )
Example #9
def main():
    # Imports assumed by this snippet:
    from itertools import product
    from railrl.envs.memory.high_low import HighLow
    from railrl.launchers.launcher_util import run_experiment, set_seed
    # SeparateRWALinearCell and bptt_launcher also come from the railrl
    # codebase (as do the commented-out RWACell, LSTMCell, GRUCell,
    # SeparateLstmLinearCell and tf references); their module paths are not
    # shown in this snippet.
    n_seeds = 1
    mode = "here"
    exp_prefix = "dev-sl"

    # n_seeds = 10
    # mode = "ec2"
    # exp_prefix = "6-2-sl-rwa-vs-lstm"

    env_noise_std = 0
    memory_noise_std = 0
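    # Sweep over (RNN cell class, horizon) combinations; each setting gets its
    # own variant and n_seeds seeded runs.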
    for rnn_cell_class, H in product(
        [SeparateRWALinearCell],
        [512],
        # [RWACell, LSTMCell, GRUCell],
        # [512, 256, 128, 64],
    ):
        # noinspection PyTypeChecker
        variant = dict(
            H=H,
            exp_prefix=exp_prefix,
            algo_params=dict(
                num_batches_per_epoch=10000 // 32,
                num_epochs=100,
                learning_rate=1e-3,
                batch_size=32,
                eval_num_episodes=64,
                lstm_state_size=10,
                rnn_cell_class=rnn_cell_class,
                rnn_cell_params=dict(
                    # use_peepholes=True,
                    state_is_flat_externally=False,
                    output_dim=1,
                ),
                # rnn_cell_class=SeparateLstmLinearCell,
                # rnn_cell_params=dict(
                #     use_peepholes=True,
                #     env_noise_std=env_noise_std,
                #     memory_noise_std=memory_noise_std,
                #     output_nonlinearity=tf.nn.tanh,
                #     # output_nonlinearity=tf.nn.softmax,
                #     env_hidden_sizes=[],
                # ),
                softmax=False,
            ),
            version='Supervised Learning',
            env_class=HighLow,
            # env_class=OneCharMemory,
        )

        exp_id = -1
        for seed in range(n_seeds):
            exp_id += 1
            set_seed(seed)
            variant['seed'] = seed
            variant['exp_id'] = exp_id

            run_experiment(
                bptt_launcher,
                exp_prefix=exp_prefix,
                seed=seed,
                mode=mode,
                variant=variant,
                exp_id=exp_id,
            )