def pool_runner(config, group_dir, seed):
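    """Run one seeded experiment under group_dir, skipping seeds whose
    output directory already exists.
    """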
    print("***************************************************")
    exp_dir = os.path.join(group_dir, 'exp_' + str(seed))
    if os.path.exists(exp_dir):
        print("Skipping experiment dir: ", exp_dir)
        # TODO: resume if necessary?
        return

    print("Starting experiment in dir: ", exp_dir)
    run_experiment(
        #run_task,
        lambda sc, *args: run_task(
            sc, *args,
            env_params=config['env_params'],
            algo=config['algo'],
            algo_params=config['algo_params'],
            epochs=config['epochs'],
            batch_size=config['batch_size'],
            policy_hidden_sizes=config['policy_hidden_sizes'],
            embed_state=config['use_state_embedding'],
            model_dir=config['model_dir'],
            augment_embedded_state=config['augment_embedded_state']),
        exp_prefix='trpo_reacher_embedded',
        log_dir=exp_dir,
        snapshot_mode='last',
        seed=seed,
    )
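# A minimal usage sketch (not part of the scraped example above): drive
# pool_runner from a multiprocessing worker pool. The config dict, group
# directory, and seed list here are illustrative assumptions.
from functools import partial
from multiprocessing import Pool

if __name__ == '__main__':
    config = {}                  # hypothetical: fill with the keys pool_runner expects
    group_dir = 'data/group'     # hypothetical output directory
    seeds = [1, 11, 21, 31, 41]
    with Pool(processes=len(seeds)) as pool:
        pool.map(partial(pool_runner, config, group_dir), seeds)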
Example #2
def main(latent_policy_pkl):
    def run_task(*_):
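        # Load a pre-trained latent-conditioned policy from latent_policy_pkl and
        # train a DDPG "composer" policy that acts through it on SimpleReacherEnv.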

        sess = tf.Session()
        sess.__enter__()
        with LocalRunner(sess=sess) as runner:
            inner_env = SimpleReacherEnv(
                goal_position=(0.5, 0, 0.15),
                control_method="position_control",
                completion_bonus=2.,
                action_scale=0.04,
            )
            latent_policy = joblib.load(latent_policy_pkl)["policy"]

            env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

            action_noise = OUStrategy(env, sigma=0.2)

            policy = ContinuousMLPPolicy(
                env_spec=env.spec,
                name="Actor",
                hidden_sizes=[64, 32],
                hidden_nonlinearity=tf.nn.relu,
            )

            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        name="Critic",
                                        hidden_sizes=[64, 32],
                                        hidden_nonlinearity=tf.nn.relu)

            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)

            algo = DDPG(env,
                        policy=policy,
                        policy_lr=1e-4,
                        qf_lr=1e-3,
                        qf=qf,
                        plot=True,
                        target_update_tau=1e-2,
                        n_epochs=500,
                        n_train_steps=50,
                        discount=0.9,
                        replay_buffer=replay_buffer,
                        min_buffer_size=int(1e3),
                        exploration_strategy=action_noise,
                        policy_optimizer=tf.train.AdamOptimizer,
                        qf_optimizer=tf.train.AdamOptimizer)
            runner.setup(algo, env)
            runner.train(n_epochs=500, plot=False, n_epoch_cycles=10)

    run_experiment(
        run_task,
        exp_prefix='ddpg_sawyer_compose',
        n_parallel=1,
        seed=1,
        plot=True,
    )
Example #3
def main(latent_policy_pkl):
    def run_task(*_):
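        # Same composition setup on PointEnv: DDPG learns to drive the
        # pre-trained latent-conditioned policy loaded from latent_policy_pkl.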
        sess = tf.Session()
        sess.__enter__()
        latent_policy = joblib.load(latent_policy_pkl)["policy"]
        with LocalRunner(sess=sess) as runner:
            inner_env = PointEnv(goal=(1.4, 1.4), completion_bonus=100)
            env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

            action_noise = OUStrategy(env, sigma=0.2)

            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         name="Actor",
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)

            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        name="Critic",
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)

            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)

            algo = DDPG(env,
                        policy=policy,
                        policy_lr=1e-4,
                        qf_lr=1e-3,
                        qf=qf,
                        plot=False,
                        target_update_tau=1e-2,
                        replay_buffer=replay_buffer,
                        n_train_steps=50,
                        discount=0.9,
                        min_buffer_size=int(1e3),
                        exploration_strategy=action_noise,
                        policy_optimizer=tf.train.AdamOptimizer,
                        qf_optimizer=tf.train.AdamOptimizer)

            runner.setup(algo, env)
            runner.train(n_epochs=500, plot=False, n_epoch_cycles=10)

    run_experiment(
        run_task,
        n_parallel=1,
        exp_prefix="ddpg_point_compose",
        seed=1,
        plot=False,
    )
Example #4
def _launch_ec2(func, exp_prefix, exp_name, params, run_experiment_kwargs):
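    """Launch a single task on EC2, merging default run_experiment arguments
    with any caller-supplied overrides.
    """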
    print("Launching task", exp_name)
    kwargs = dict(n_parallel=1,
                  snapshot_mode="last",
                  seed=params.get("seed", None),
                  mode="ec2")
    kwargs.update(run_experiment_kwargs)
    kwargs.update(
        dict(exp_prefix=exp_prefix,
             exp_name=exp_name,
             variant=params,
             confirm_remote=False))

    run_experiment(func, **kwargs)
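# A minimal usage sketch (not part of the scraped example above); the task
# function, prefix, and parameter values are illustrative assumptions.
if __name__ == '__main__':
    _launch_ec2(run_task,
                exp_prefix='my_experiment',
                exp_name='trial_0',
                params={'seed': 1, 'step_size': 0.01},
                run_experiment_kwargs={'snapshot_mode': 'all'})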
Example #5
def pool_runner(run_task, group_dir, i):
    print("***************************************************")
    exp_dir = os.path.join(group_dir, 'exp_' + str(i))
    if os.path.exists(exp_dir):
        print("Skipping experiment dir: ", exp_dir)
        # TODO: resume if necessary?
        return

    print("Starting experiment in dir: ", exp_dir)
    run_experiment(
        run_task,
        log_dir=exp_dir,
        snapshot_mode='last',
        seed=i,
    )
Example #6
    def test_benchmark_gaussian_mlp_baseline(self):
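        """Benchmark GaussianMLPBaseline on several MuJoCo tasks, using three
        random seeds per environment.
        """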
        bench_envs = [
            'HalfCheetah-v2', 'Reacher-v2', 'Walker2d-v2', 'Hopper-v2',
            'Swimmer-v2', 'InvertedPendulum-v2', 'InvertedDoublePendulum-v2'
        ]

        seeds = np.random.choice(100, size=(len(bench_envs), 3))

        for env_num in range(len(bench_envs)):
            self._env = bench_envs[env_num]
            for seed in seeds[env_num]:
                self._seed = seed
                deterministic.set_seed(self._seed)
                name = '{}_seed_{}_garage'.format(self._env, self._seed)
                run_experiment(self.run_task,
                               snapshot_mode='last',
                               seed=self._seed,
                               n_parallel=12,
                               exp_name=name)
Example #7
def main(latent_policy_pkl):
    def run_task(*_):
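        # Train a PPO "composer" policy on top of the pre-trained
        # latent-conditioned policy wrapping PointEnv.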
        sess = tf.Session()
        sess.__enter__()
        latent_policy = joblib.load(latent_policy_pkl)["policy"]
        with LocalRunner(sess=sess) as runner:
            inner_env = PointEnv(goal=(1.4, 1.4), completion_bonus=100)
            env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

            policy = GaussianMLPPolicy(name="composer",
                                       env_spec=env.spec,
                                       hidden_sizes=(64, 64),
                                       init_std=20,
                                       std_share_network=False,
                                       adaptive_std=True)

            baseline = GaussianMLPBaseline(env_spec=env.spec)

            algo = PPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=1024,  # 4096
                max_path_length=50,
                n_itr=1500,
                discount=0.99,
                step_size=0.2,
                policy_ent_coeff=1e-6,
                plot=True,
                use_mpc_es=True,
            )
            runner.setup(algo, env)
            runner.train(n_epochs=600, plot=False, batch_size=1024)

    run_experiment(
        run_task,
        n_parallel=1,
        exp_prefix="ppo_point_compose",
        seed=2,
        plot=False,
    )
Example #8
def cmaes_obj_fcn(l_init, exp_prefix):
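    """CMA-ES objective: run one inner training via run_experiment and return
    the negated average discounted return over the final window of progress.csv.
    """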
    global l_pre_init
    l_pre_init = params.inv_sigmoid(l_init, params.l_lb, params.l_ub)

    now = datetime.now()
    exp_name = now.strftime("%Y_%m_%d_%H_%M_%S")

    run_experiment(run_task,
                   exp_prefix=exp_prefix,
                   exp_name=exp_name,
                   snapshot_mode='last',
                   seed=args.seed,
                   force_cpu=True)

    csv_path = os.path.join(os.environ['PROJECTDIR'], 'data/local',
                            exp_prefix.replace('_', '-'), exp_name,
                            'progress.csv')
    csv_df = pd.read_csv(csv_path)
    final_avg_discounted_return = np.mean(
        csv_df['AverageDiscountedReturn']
        [-params.ppo_inner_final_average_discounted_return_window_size:])
    return -final_avg_discounted_return
Example #9
from garage.tf.policies import CategoricalMLPPolicy


def run_task(*_):
    env = TfEnv(normalize(gym.make('MountainCar-v0')))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.max_episode_steps,
        n_itr=150,
        discount=0.99,
        step_size=0.1,
        plot=True,
    )
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    snapshot_mode='last',
    plot=True,
)
Example #10
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # plot=True is enabled here; also uncomment the plot parameter in
        # run_experiment below to enable plotting
        plot=True,
    )
    algo.train()


for step_size in [0.01, 0.05, 0.1]:
    for seed in [1, 11, 21, 31, 41]:
        run_experiment(
            run_task,
            exp_prefix="first_exp",
            # Number of parallel workers for sampling
            n_parallel=1,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # Specifies the seed for the experiment. If this is not provided, a
            # random seed will be used
            seed=seed,
            # mode="local",
            mode="ec2",
            variant=dict(step_size=step_size, seed=seed)
            # plot=True,
            # terminate_machine=False,
        )
        sys.exit()
Example #11
#!/usr/bin/env python3
"""This is an example to resume training programmatically."""
from garage.experiment import run_experiment
from garage.tf.experiment import LocalTFRunner


def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        runner.restore(from_dir='dir/', from_epoch=2)
        runner.resume()


run_experiment(
    run_task,
    log_dir='new_dir/',
    snapshot_mode='last',
    seed=1,
)
Example #12
        algo = JoleDQN(env_spec=env.spec,
                       policy=policy,
                       qf=qf,
                       obs_model=obs_model,
                       reward_model=reward_model,
                       terminal_model=terminal_model,
                       exploration_strategy=epilson_greedy_strategy,
                       replay_buffer=replay_buffer,
                       qf_lr=1e-3,
                       discount=0.99,
                       min_buffer_size=int(1e3),
                       double_q=False,
                       n_train_steps=50,
                       n_epoch_cycles=n_epoch_cycles,
                       target_network_update_freq=100,
                       buffer_batch_size=64,
                       env_name=env_name)

        runner.setup(algo, env)
        runner.train(n_epochs=n_epochs,
                     n_epoch_cycles=n_epoch_cycles,
                     batch_size=sampler_batch_size)


for i in range(1, 2):
    run_experiment(run_task,
                   snapshot_mode='none',
                   seed=i,
                   log_dir="data/type_dqn/{}/jole_dqn/{}/".format(env_name, i))
Example #13
        # baseline = GaussianMLPBaseline(
        #     env_spec=env.spec,
        #     regressor_args=dict(
        #         hidden_sizes=params.baseline_network_size,
        #         hidden_nonlinearity=tf.nn.tanh,
        #         use_trust_region=True,
        #     ),
        # )
        
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = CMAES(env_spec=env.spec, policy=policy, baseline=baseline, **params.cmaes_algo_kwargs)

        runner.setup(algo, env)

        runner.train(**params.cmaes_train_kwargs)

    
if __name__ == '__main__':

    now = datetime.now()

    parser = argparse.ArgumentParser()
    parser.add_argument('--seed',
                        default=int(now.timestamp()),
                        type=int,
                        help='seed')
    parser.add_argument('--exp_id',
                        default=now.strftime("%Y_%m_%d_%H_%M_%S"),
                        help='experiment id (suffix to data directory name)')

    args = parser.parse_args()

    run_experiment(
        run_task,
        exp_prefix='cmaes_opt_l_hw_as_action_{}_'.format(args.exp_id) +
        str(params.n_segments) + '_params',
        snapshot_mode='last',
        seed=args.seed,
        force_cpu=True)
Example #14
import gym

from garage.baselines import LinearFeatureBaseline
from garage.experiment import run_experiment
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import CategoricalMLPPolicy

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(gym.make("CartPole-v0"))

policy = CategoricalMLPPolicy(name="policy",
                              env_spec=env.spec,
                              hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    max_kl_step=0.01,
)

# Pass a callable rather than the result of algo.train(), so that
# run_experiment can invoke training itself.
run_experiment(lambda *_: algo.train(), n_parallel=1, snapshot_mode="last", seed=1)
Example #15
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='Pusher3DOF-v1')

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32),
                                   init_std=10)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=200, batch_size=50*250)


run_experiment(
    run_task,
    exp_prefix="trpo_pusher_200_0524",
    snapshot_mode='last',
    seed=1,
)
Example #16
        runner.train(
            n_epochs=40,
            batch_size=4000,
            # Uncomment to enable plotting
            # plot=True
        )


variants = VG().variants()

for v in variants:

    run_experiment(
        run_task,
        exp_prefix='first_exp',
        # Number of parallel workers for sampling
        n_parallel=1,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode='last',
        # Specifies the seed for the experiment. If this is not provided, a
        # random seed will be used
        seed=v['seed'],
        # mode="local",
        mode='ec2',
        variant=v,
        # plot=True,
        # terminate_machine=False,
    )
    sys.exit()
Example #17
        )
        runner.setup(algo,
                     env,
                     batch_size=v.batch_size,
                     max_path_length=v.max_path_length)
        runner.train(n_epochs=2000, plot=False)


config = dict(
    tasks=TASKS,
    latent_length=3,
    inference_window=6,
    batch_size=4096 * len(TASKS),
    policy_ent_coeff=5e-3,  # 1e-2
    embedding_ent_coeff=1e-3,  # 1e-3
    inference_ce_coeff=5e-3,  # 1e-4
    max_path_length=200,
    embedding_init_std=1.0,
    embedding_max_std=2.0,
    policy_init_std=1.0,
)

run_experiment(
    run_task,
    exp_prefix='sawyer_reach_embed_8goal',
    n_parallel=1,
    seed=1,
    variant=config,
    plot=False,
)
Example #18
def run_task(*_):
    env = normalize(gym.make("Pendulum-v0"))

    policy = DummyPolicy(env_spec=env)

    baseline = LinearFeatureBaseline(env_spec=env)
    algo = InstrumentedNOP(env=env,
                           policy=policy,
                           baseline=baseline,
                           batch_size=4000,
                           max_path_length=100,
                           n_itr=4,
                           discount=0.99,
                           step_size=0.01,
                           plot=True)
    algo.train()
    env.close()


run_experiment(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=6,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random
    # seed will be used
    seed=1,
    plot=True,
)
Example #19
from garage.tf.policies import CategoricalMLPPolicy


def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name="policy",
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            optimizer_args=dict(tf_optimizer_args=dict(learning_rate=0.01, )))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=10000)


run_experiment(
    run_task,
    snapshot_mode="last",
    seed=1,
)
Example #20
    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[32, 32],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=49,
                discount=0.99,
                center_adv=False,
                max_kl_step=0.005,
                **copyparams)

    #runner.setup(algo, env)
    #runner.train(n_epochs=100, batch_size=50*250)
    runner.restore(
        "/home/dell/garage/data/local/pusher/pusher_2020_06_01_23_45_24_0001")
    runner.resume(n_epochs=800)


run_experiment(
    run_task,
    exp_prefix="pusher",
    snapshot_mode='last',
    seed=1,
)
Example #21
import joblib

from garage.experiment import run_experiment
from garage.envs import normalize
from garage.experiment.deterministic import set_seed
from garage.tf.algos import PPO
from garage.tf.baselines import GaussianMLPBaseline
from garage.tf.envs import TfEnv
from garage.tf.experiment import LocalTFRunner
from garage.tf.policies import GaussianMLPPolicy


def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config, max_cpus=16) as runner:
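        # Resume from a saved snapshot: the pickled algo and env are loaded and
        # handed straight back to the runner instead of being built from scratch.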
        data = joblib.load('/data/yxjin/robot/rotation/data/local/experiment/experiment_2020_05_07_06_54_13_0001/itr_2100.pkl')
        algo = data['algo']
        env = data['env']

        runner.setup(algo, env)

        runner.train(n_epochs=40000, batch_size=2048)


run_experiment(
    run_task,
    snapshot_mode='gap',
    snapshot_gap=1,
    seed=1,
)
Example #22
    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-4,
        critic_lr=1e-3,
        critic=critic_net,
        plot=False,
        target_update_tau=1e-2,
        n_epochs=500,
        n_epoch_cycles=100,
        n_rollout_steps=50,
        n_train_steps=50,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e4),
        exploration_strategy=es,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer,
    )

    ddpg.train(sess=sess)


run_experiment(
    run_task,
    n_parallel=2,
    exp_prefix="ddpg_point_compose_seq",
    seed=1,
    plot=True,
)
Example #23
        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     **params.cmaes_algo_kwargs)

        runner.setup(algo, env)

        runner.train(**params.cmaes_train_kwargs)


if __name__ == '__main__':
    now = datetime.now()
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed',
                        default=int(now.timestamp()),
                        type=int,
                        help='seed')
    parser.add_argument('--exp_id',
                        default=now.strftime("%Y_%m_%d_%H_%M_%S"),
                        help='experiment id (suffix to data directory name)')

    args = parser.parse_args()

    run_experiment(
        run_task,
        exp_prefix='cmaes_opt_k_hw_as_policy_{}_'.format(args.exp_id) +
        str(params.n_springs) + '_params',
        snapshot_mode='last',
        seed=args.seed,
        force_cpu=True)
Example #24
            decay_ratio=0.1)

        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_strategy=epilson_greedy_strategy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=0.99,
                   min_buffer_size=int(1e4),
                   double_q=False,
                   n_train_steps=500,
                   n_epoch_cycles=n_epoch_cycles,
                   target_network_update_freq=2,
                   buffer_batch_size=32)

        runner.setup(algo, env)
        runner.train(n_epochs=n_epochs,
                     n_epoch_cycles=n_epoch_cycles,
                     batch_size=sampler_batch_size)


env_name = "PongNoFrameskip-v4"
run_experiment(run_task,
               n_parallel=1,
               snapshot_mode='none',
               seed=1,
               plot=False,
               log_dir="data/type_ddpg/{}/normal_ddpg_test/{}".format(
                   env_name, 1))
Example #25

def run_task(*_):
    """Wrap ERWR training task in the run_task function."""
    env = TfEnv(env_name="CartPole-v1")

    policy = CategoricalMLPPolicy(name="policy",
                                  env_spec=env.spec,
                                  hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ERWR(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=10000,
                max_path_length=100,
                n_itr=100,
                plot=True,
                discount=0.99)
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    plot=True,
)
Example #26
            inference_ce_coeff=v.inference_ce_coeff,
            #optimizer_args=dict(max_grad_norm=0.5)
        )
        runner.setup(algo, env, batch_size=v.batch_size,
                     max_path_length=v.max_path_length)
        runner.train(n_epochs=1000, plot=False)

config = dict(
    tasks=TASKS,
    latent_length=3,  # 3
    inference_window=6,  # 6
    batch_size=4096 * len(TASKS),  # 4096 * len(TASKS)
    policy_ent_coeff=1e-5,  # 1e-2 #
    embedding_ent_coeff=3e-4,  # 1e-3
    inference_ce_coeff=2e-5,  # 1e-4
    max_path_length=100,  # 100
    embedding_init_std=1.0,  # 1.0
    embedding_max_std=2.0,  # 2.0
    policy_init_std=0.1,  # 1.0
    policy_max_std=0.2,  # 2.0
)

run_experiment(
    run_task,
    exp_prefix='sawyer_reach_multiworld_torque',
    n_parallel=2,
    seed=1,
    variant=config,
    plot=False,
)
Example #27
        algo = DQN(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            exploration_strategy=epilson_greedy_strategy,
            replay_buffer=replay_buffer,
            qf_lr=1e-4,
            discount=0.99,
            min_buffer_size=int(1e4),
            double_q=False,
            n_train_steps=500,
            n_epoch_cycles=n_epoch_cycles,
            target_network_update_freq=2,
            buffer_batch_size=32)

        runner.setup(algo, env)
        runner.train(
            n_epochs=n_epochs,
            n_epoch_cycles=n_epoch_cycles,
            batch_size=sampler_batch_size)


run_experiment(
    run_task,
    n_parallel=1,
    snapshot_mode='last',
    seed=1,
    plot=False,
)
Example #28
                    plot=False,
                )
                runner.setup(algo, env)

            runner.train(n_epochs=1200, batch_size=2048, plot=False)


experiment_dir = os.path.abspath("train_ctrler_experiment")
experiment_path = os.path.join(
    experiment_dir, "b{}_{}_{}_s{}_{}".format(
        args.beta, args.int, args.name if args.loadrnn else "nornn", args.seed,
        "{}-{}-{}_{}:{}:{}".format(now.year, now.month, now.day, now.hour,
                                   now.minute, now.second, now.microsecond)))

run_experiment(run_task,
               snapshot_mode="last",
               log_dir=experiment_path,
               seed=args.seed)

#from env import WallAvoidingAgent
#total_frames = 0
#env = ControllerEnv(vae_load=vision_load_path, rnn_load=memory_load_path)
#try:
#    recording_obs = []
#    recording_action = []
#
#    pixel_obs = env.reset()
#    pixel_obs = env.obs
#    agent = WallAvoidingAgent(env.model.env)
#
#    # random policy
#    # more diverse random policy, works slightly better:
        policy = CategoricalLSTMPolicy(
            name='policy',
            env_spec=env.spec,
            lstm_layer_cls=L.TfBasicLSTMLayer,
            # gru_layer_cls=L.GRULayer,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)


run_experiment(
    run_task,
    snapshot_mode='last',
    seed=1,
)
def runner(
    ga_type=None,
    env_args=None,
    run_experiment_args=None,
    sim_args=None,
    reward_args=None,
    spaces_args=None,
    policy_args=None,
    algo_args=None,
    runner_args=None,
    bpq_args=None,
    # log_dir='.',
):
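    # Substitute an empty dict for each argument group the caller left as None.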
    if ga_type is None:
        pass

    if env_args is None:
        env_args = {}

    if run_experiment_args is None:
        run_experiment_args = {}

    if sim_args is None:
        sim_args = {}

    if reward_args is None:
        reward_args = {}

    if spaces_args is None:
        spaces_args = {}

    if policy_args is None:
        policy_args = {}

    if algo_args is None:
        algo_args = {}

    if runner_args is None:
        runner_args = {}

    if bpq_args is None:
        bpq_args = {}

    if 'n_parallel' in run_experiment_args:
        n_parallel = run_experiment_args['n_parallel']
    else:
        n_parallel = 1
        run_experiment_args['n_parallel'] = n_parallel

    if 'max_path_length' in sim_args:
        max_path_length = sim_args['max_path_length']
    else:
        max_path_length = 50
        sim_args['max_path_length'] = max_path_length

    if 'batch_size' in runner_args:
        batch_size = runner_args['batch_size']
    else:
        batch_size = max_path_length * n_parallel
        runner_args['batch_size'] = batch_size

    def run_task(snapshot_config, *_):
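        # Build the AST simulator, reward function, and spaces, wrap them in an
        # ASTEnv, and search it with GA or GASM, keeping the best paths in a
        # bounded priority queue.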

        seed = 0
        # top_k = 10
        np.random.seed(seed)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):

                with LocalTFRunner(snapshot_config=snapshot_config,
                                   max_cpus=4,
                                   sess=sess) as local_runner:

                    # Instantiate the example classes
                    sim = ExampleAVSimulator(**sim_args)
                    reward_function = ExampleAVReward(**reward_args)
                    spaces = ExampleAVSpaces(**spaces_args)

                    # Create the environment
                    if 'id' in env_args:
                        env_args.pop('id')
                    env = ASTEnv(simulator=sim,
                                 reward_function=reward_function,
                                 spaces=spaces,
                                 **env_args)
                    env = TfEnv(env)

                    policy = ContinuousMLPPolicy(name='ast_agent',
                                                 env_spec=env.spec,
                                                 **policy_args)

                    params = policy.get_params()
                    sess.run(tf.variables_initializer(params))

                    # Instantiate the garage objects
                    baseline = ZeroBaseline(env_spec=env.spec)

                    top_paths = BPQ.BoundedPriorityQueue(**bpq_args)

                    sampler_cls = ASTVectorizedSampler
                    sampler_args = {
                        "open_loop": False,
                        "sim": sim,
                        "reward_function": reward_function,
                        "n_envs": n_parallel
                    }

                    if ga_type == 'ga':
                        print('ga')
                        algo = GA(env_spec=env.spec,
                                  policy=policy,
                                  baseline=baseline,
                                  top_paths=top_paths,
                                  **algo_args)
                    elif ga_type == 'gasm':
                        print('gasm')
                        algo = GASM(env_spec=env.spec,
                                    policy=policy,
                                    baseline=baseline,
                                    top_paths=top_paths,
                                    **algo_args)
                    else:
                        raise NotImplementedError

                    local_runner.setup(algo=algo,
                                       env=env,
                                       sampler_cls=sampler_cls,
                                       sampler_args=sampler_args)

                    # Run the experiment
                    local_runner.train(**runner_args)

    # from garage.experiment.experiment import AttrDict
    # import os
    # tabular_log_file = os.path.join("./", "test.csv")
    # from dowel import logger, tabular
    # import dowel
    # logger.add_output(dowel.CsvOutput(tabular_log_file))
    # run_task(AttrDict(run_experiment_args))
    run_experiment(
        run_task,
        **run_experiment_args,
    )