Example #1
    def test_copy(self):
        action_dim = 5
        obs_dim = 7
        critic1 = FeedForwardCritic(name_or_scope="qf_a",
                                    observation_dim=obs_dim,
                                    action_dim=action_dim)
        critic2 = FeedForwardCritic(name_or_scope="qf_b",
                                    observation_dim=obs_dim,
                                    action_dim=action_dim)
        critic1.sess = self.sess
        critic2.sess = self.sess

        a = np.random.rand(1, action_dim)
        o = np.random.rand(1, obs_dim)

        feed_1 = {
            critic1.action_input: a,
            critic1.observation_input: o,
        }
        feed_2 = {
            critic2.action_input: a,
            critic2.observation_input: o,
        }

        self.sess.run(tf.global_variables_initializer())

        out1 = self.sess.run(critic1.output, feed_1)
        out2 = self.sess.run(critic2.output, feed_2)
        self.assertFalse((out1 == out2).all())

        critic2.set_param_values(critic1.get_param_values())
        out1 = self.sess.run(critic1.output, feed_1)
        out2 = self.sess.run(critic2.output, feed_2)
        self.assertTrue((out1 == out2).all())
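The test above exercises FeedForwardCritic's parameter-copying API. Below is a minimal standalone sketch of the same pattern outside the test harness, using only calls that appear on this page; the import path is the one used in the later examples, and the qf_source/qf_target scope names are placeholders of ours.

import tensorflow as tf
from railrl.qfunctions.nn_qfunction import FeedForwardCritic

# Build two critics with identical dimensions under different variable scopes.
sess = tf.Session()
source = FeedForwardCritic(name_or_scope="qf_source",
                           observation_dim=7,
                           action_dim=5)
target = FeedForwardCritic(name_or_scope="qf_target",
                           observation_dim=7,
                           action_dim=5)
source.sess = sess
target.sess = sess
sess.run(tf.global_variables_initializer())

# Copy every parameter from source into target, exactly as the test does,
# e.g. to start a target network in sync with the live critic.
target.set_param_values(source.get_param_values())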
Example #2
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )

    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
Example #3
def lstm_launcher(variant):
    """
    Run DDPG with a feed-forward policy and critic on an environment.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(name_or_scope="critic",
                           env_spec=env.spec,
                           **variant.get('qf_params', {}))
    policy = FeedForwardPolicy(name_or_scope="actor",
                               env_spec=env.spec,
                               **variant.get('policy_params', {}))
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
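The launcher expects a nested variant dictionary with the four keys listed in the docstring. A hedged sketch of such a dictionary follows; the top-level keys come from the code above, while every inner value (including the env_id argument to get_env_settings) is an illustrative assumption.

# Hypothetical variant for lstm_launcher; inner values are placeholders.
variant = dict(
    env_params=dict(env_id='cartpole'),              # forwarded to get_env_settings(**...)
    algo_params=dict(batch_size=128, n_epochs=100),  # forwarded to MyDDPG
    qf_params=dict(),                                # extra kwargs for FeedForwardCritic
    policy_params=dict(),                            # extra kwargs for FeedForwardPolicy
)
lstm_launcher(variant)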
Example #4
    def test_serialize_feedforward_critic(self):
        f = FeedForwardCritic(
            name_or_scope="a",
            action_dim=self.action_dim,
            observation_dim=self.observation_dim,
        )
        self.sess.run(tf.global_variables_initializer())
        pickle.dumps(f)
Example #5
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(env, vitchyr_es, vitchyr_policy, vitchyr_qf,
                            **ddpg_params)

        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(name="qf",
                                          env_spec=env.spec,
                                          hidden_sizes=(100, 100))
        shane_ddpg = ShaneDDPG(env, shane_policy, shane_qf, shane_es,
                               **ddpg_params)

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
Example #6
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg import DDPG
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.tf.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from railrl.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    memory_dim = variant['memory_dim']
    ou_params = variant['ou_params']

    set_seed(seed)

    """
    Code for running the experiment.
    """

    env = env_class(**env_params)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=memory_dim,
    )
    env = FlattenedProductBox(env)

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    es = OUStrategy(
        env_spec=env.spec,
        **ou_params
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **algo_params
    )

    algorithm.train()
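This launcher reads six keys from variant. Below is a hedged sketch of a matching dictionary; the key names mirror the code above, MyEnv is a hypothetical environment class, and all values are placeholders.

# Hypothetical variant for run_linear_ocm_exp; MyEnv stands in for any
# environment class instantiable as env_class(**env_params).
variant = dict(
    seed=1,
    env_class=MyEnv,
    env_params=dict(),
    memory_dim=20,                                   # size of the continuous memory augmentation
    ou_params=dict(theta=0.15, sigma=0.3),           # forwarded to OUStrategy
    algo_params=dict(batch_size=128, n_epochs=100),  # forwarded to DDPG
)
run_linear_ocm_exp(variant)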
Example #7
    def test_output_len(self):
        action_dim = 5
        obs_dim = 7
        critic = FeedForwardCritic(name_or_scope="1",
                                   observation_dim=obs_dim,
                                   action_dim=action_dim)
        critic.sess = self.sess

        a = np.random.rand(1, action_dim)
        o = np.random.rand(1, obs_dim)
        feed = {
            critic.action_input: a,
            critic.observation_input: o,
        }

        self.sess.run(tf.global_variables_initializer())

        out = self.sess.run(critic.output, feed)
        self.assertEqual(1, out.size)
Example #8
def run_task(variant):
    import tensorflow as tf
    from railrl.railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.box2d.cartpole_env import CartpoleEnv

    env = TfEnv(CartpoleEnv())
    algo_name = variant['Algorithm']
    if algo_name == 'Quadratic-DDPG':
        qf = QuadraticNAF(
            name_or_scope="quadratic_qf",
            env_spec=env.spec,
        )
    elif algo_name == 'DDPG':
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
            embedded_hidden_sizes=(100, ),
            observation_hidden_sizes=(100, ),
            hidden_nonlinearity=tf.nn.relu,
        )
    else:
        raise Exception('Algo name not recognized: {0}'.format(algo_name))

    es = OUStrategy(env_spec=env.spec)
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )

    ddpg_params = dict(
        batch_size=128,
        n_epochs=100,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)
    algorithm.train()
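The only key this run_task reads is 'Algorithm'. A minimal usage sketch follows; calling run_task directly is illustrative, since in the surrounding codebase such a function is presumably handed to a launcher like run_experiment_lite together with its variant.

# Hypothetical invocations; 'Algorithm' selects the Q-function as shown above.
run_task({'Algorithm': 'DDPG'})            # FeedForwardCritic
run_task({'Algorithm': 'Quadratic-DDPG'})  # QuadraticNAF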
Example #9
def my_ddpg_launcher(variant):
	"""
	Run DDPG
	:param variant: Dictionary of dictionaries with the following keys:
		- algo_params
		- env_params
		- qf_params
		- policy_params
	:return:
	"""
	from railrl.algos.ddpg import DDPG as MyDDPG
	from railrl.policies.nn_policy import FeedForwardPolicy
	from railrl.qfunctions.nn_qfunction import FeedForwardCritic
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from railrl.launchers.launcher_util import get_env_settings
	from railrl.core.tf_util import BatchNormConfig
	if ('batch_norm_params' in variant
		and variant['batch_norm_params'] is not None):
		bn_config = BatchNormConfig(**variant['batch_norm_params'])
	else:
		bn_config = None
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	es = OUStrategy(env_spec=env.spec)
	qf = FeedForwardCritic(
		name_or_scope="critic",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('qf_params', {})
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('policy_params', {})
	)

	algorithm = MyDDPG(
		env,
		es,
		policy,
		qf,
		variant['tensorboard'],
		batch_norm_config=bn_config,
		**variant['algo_params'],
	)
	algorithm.train()
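A hedged sketch of a variant accepted by my_ddpg_launcher; the keys are the ones the function reads above, and every inner value is a placeholder.

# Hypothetical variant for my_ddpg_launcher; inner values are placeholders.
variant = dict(
    env_params=dict(env_id='cartpole'),              # forwarded to get_env_settings(**...)
    algo_params=dict(batch_size=128, n_epochs=100),
    qf_params=dict(),
    policy_params=dict(),
    batch_norm_params=None,                          # or a dict of BatchNormConfig kwargs
    tensorboard='/tmp/tfboard',                      # passed positionally to MyDDPG
)
my_ddpg_launcher(variant)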
Example #10
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        with tf.Session():
            data = joblib.load(load_policy_file)
            print(data)
            policy = data['policy']
            qf = data['qf']
            replay_buffer = data['pool']
        env = HalfCheetahEnv()
        es = OUStrategy(action_space=env.action_space)
        use_new_version = variant['use_new_version']
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            n_epochs=2,
            batch_size=1024,
            replay_pool=replay_buffer,
            use_new_version=use_new_version,
        )
        algorithm.train()
    else:
        env = HalfCheetahEnv()
        es = OUStrategy(action_space=env.action_space)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        use_new_version = variant['use_new_version']
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            n_epochs=2,
            batch_size=1024,
            use_new_version=use_new_version,
        )
        algorithm.train()
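This example reads two keys from variant. A minimal sketch of both code paths; the snapshot path is a placeholder, not a real file.

# Hypothetical variants for example(); both keys are read above.
example({'use_new_version': True})                    # build everything from scratch
example({'use_new_version': True,
         'load_policy_file': '/path/to/params.pkl'})  # resume policy/qf/pool from a joblib snapshot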
Example #11
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]},
    )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )

        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
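For reference, a small sketch of how the sweep above expands, assuming (as the loop and the ddpg_params['scale_reward'] lookup imply) that iterate_hyperparameters() yields one dictionary per listed value.

sweeper = DeterministicHyperparameterSweeper(
    {'scale_reward': [1e-4, 1e-3, 1e-2]},
)
# Expected to print 0.0001, 0.001, 0.01 -- one dict per hyperparameter setting.
for ddpg_params in sweeper.iterate_hyperparameters():
    print(ddpg_params['scale_reward'])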
Example #12
def example(*_):
    env = DoublePendulumEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=30,
        batch_size=1024,
    )
    algorithm.train()
Example #13
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
Example #14
def example(variant):
    env_settings = get_env_settings(
        **variant['env_params']
    )
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **variant['ddpg_params']
    )
    algorithm.train()
Example #15
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)

        algorithm.train()
Example #16
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg import DDPG
    from railrl.launchers.launcher_util import (
        set_seed, )
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.tf.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    ou_params = variant['ou_params']

    set_seed(seed)
    """
    Code for running the experiment.
    """

    env = env_class(**env_params)

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    es = OUStrategy(env_spec=env.spec, **ou_params)
    algorithm = DDPG(env, es, policy, qf, **algo_params)

    algorithm.train()
Example #17
# Param ranges
seed = 3
policy_lrs = [1e-5, 1e-4, 1e-3]
qf_lrs = [1e-5, 1e-4, 1e-3]
gammas = [0.9, 0.99, 0.995]
taus = [1e-3, 1e-2]

for policy_lr, qf_lr, gamma, tau in itertools.product(policy_lrs, qf_lrs, gammas, taus):
	env = TfEnv(normalize(env=GymEnv(
		'Box3dReach-v4', record_video=False,
		log_dir='/tmp/gym_test', record_log=False,
	)))
	
	es = OUStrategy(env_spec=env.spec)
	qf = FeedForwardCritic(
		name_or_scope="critic",
		env_spec=env.spec,
		hidden_nonlinearity=tf.nn.tanh,
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		hidden_nonlinearity=tf.nn.tanh,
	)

	algo = DDPG(
		env,
		es,
		policy,
		qf,
		"/data0/dianchen/box3d/ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
			policy_lr,
Example #18
def main():

    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)

    parser.add_argument('--qf_decay', type=float, default=.0)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)

    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)

    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=1.0)

    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v11', record_video=False,
        log_dir='/tmp/gym_test', record_log=False,
    )))

    name = 'ddpg-state-v11-plr{0}-qlr{1}-tau{2}-qfdecay{3}-ou_theta{4}-ou_sigma{5}'.format(
        args.policy_initlr, args.qf_initlr, args.qf_soft_tau, args.qf_decay,
        args.ou_theta, args.ou_sigma)

    es = OUStrategy(env_spec=env.spec,
                    theta=args.ou_theta,
                    sigma=args.ou_sigma)

    policy = FeedForwardPolicy(
        name_or_scope="actor",
        observation_hidden_sizes=(400, 300),
        env_spec=env.spec,
    )

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        embedded_hidden_sizes=(100, ),
        observation_hidden_sizes=(100, ),
    )

    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(args.tfboard_path, name,
                                      '_%d' % args.seed),
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )

    run_experiment_lite(algo.train(),
                        exp_prefix=name,
                        n_parallel=1,
                        snapshot_mode="last",
                        seed=args.seed,
                        mode="local")
Example #19
def icm_launcher(variant):

	if variant["Algorithm"] == "DDPG":
		from railrl.algos.ddpg import DDPG as MyDDPG
		from railrl.policies.nn_policy import FeedForwardPolicy
		from railrl.qfunctions.nn_qfunction import FeedForwardCritic
		from rllab.exploration_strategies.ou_strategy import OUStrategy
		from railrl.exploration_strategies.simple_gaussian_strategy import SimpleGaussianStrategy
		from railrl.launchers.launcher_util import get_env_settings
		from railrl.core.tf_util import BatchNormConfig
		from railrl.algos.icm import ICM

		if ('batch_norm_params' in variant
			and variant['batch_norm_params'] is not None):
			bn_config = BatchNormConfig(**variant['batch_norm_params'])
		else:
			bn_config = None
		env_settings = get_env_settings(**variant['env_params'])
		env = env_settings['env']
		es = OUStrategy(env_spec=env.spec)
		# es = SimpleGaussianStrategy(env_spec=env.spec, sigma=0.5)
		qf = FeedForwardCritic(
			name_or_scope="critic",
			env_spec=env.spec,
			batch_norm_config=bn_config,
			**variant.get('qf_params', {})
		)
		policy = FeedForwardPolicy(
			name_or_scope="actor",
			env_spec=env.spec,
			batch_norm_config=bn_config,
			**variant.get('policy_params', {})
		)

		algo = MyDDPG(
			env,
			es,
			policy,
			qf,
			variant['tensorboard'],
			batch_norm_config=bn_config,
			**variant['algo_params'],
		)
		algorithm = ICM(
			env, 
			algo,
			no_encoder=False,
			feature_dim=env.spec.observation_space.flat_dim, 
			forward_weight=0.9,
			external_reward_weight=0.95,
			inverse_tanh=True,
			init_learning_rate=1e-3
		)
		algorithm.train()
	elif variant["Algorithm"] == "Idle":
		from railrl.algos.idle import IdleAlgo
		from railrl.launchers.launcher_util import get_env_settings
		from railrl.algos.icm import ICM
		env_settings = get_env_settings(**variant['env_params'])
		env = env_settings['env']
		algo = IdleAlgo(env, variant['tensorboard'])
		algorithm = ICM(
			env, 
			algo,
			no_encoder=False,
			feature_dim=env.spec.observation_space.flat_dim,
			forward_weight=0.9,
			external_reward_weight=0.0,
			inverse_tanh=True,
			init_learning_rate=1e-3,
		)
		algorithm.train()
	elif variant["Algorithm"] == "rllab-TRPO":
		from rllab.algos.trpo import TRPO
		from railrl.launchers.launcher_util import get_env_settings
		from railrl.algos.icm_trpo import ICM
		from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
		from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
		from railrl.algos.icm_trpo import ICM
		import lasagne.nonlinearities as NL

		env_settings = get_env_settings(**variant['env_params'])
		env = env_settings['env']
		policy = GaussianMLPPolicy(
			env_spec=env.spec,
			hidden_sizes=(64, 32),
			output_nonlinearity=NL.tanh,
		)

		baseline = LinearFeatureBaseline(
			env.spec,
		)

		batch_size = 5000
		algo = TRPO(
			env=env,
			policy=policy,
			baseline=baseline,
			batch_size=batch_size,
			whole_paths=True,
			max_path_length=1000,
			n_itr=1000,
			step_size=0.01,
			subsample_factor=1.0,
		)
		algorithm = ICM(
			env, 
			algo,
			variant['tensorboard'],
			no_encoder=False,
			feature_dim=env.spec.observation_space.flat_dim,
			forward_weight=0.2,
			external_reward_weight=0.99,
			inverse_tanh=True,
			init_learning_rate=1e-4,
		)
		algorithm.train()

	elif variant["Algorithm"] == 'tf-TRPO':
		from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
		from sandbox.rocky.tf.baselines.gaussian_conv_baseline import GaussianConvBaseline
		from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
		from sandbox.rocky.tf.policies.gaussian_conv_policy import GaussianConvPolicy
		from sandbox.rocky.tf.algos.trpo import TRPO
		from sandbox.rocky.tf.envs.base import TfEnv

		from railrl.launchers.launcher_util import get_env_settings
		# from railrl.algos.icm_trpo_tf import ICM
		from railrl.algos.icm_trpo_tf_box3d import ICM
		import tensorflow as tf

		env_settings = get_env_settings(**variant['env_params'])
		env = TfEnv(env_settings['env'])
		if len(env.observation_space.shape) == 1:
			policy = GaussianMLPPolicy(
				"mlp_policy",
				env_spec=env.spec,
				hidden_sizes=(64, 32),
				output_nonlinearity=tf.nn.tanh,
			)
			baseline = LinearFeatureBaseline(
				env.spec,
			)
		elif len(env.observation_space.shape) == 2:
			policy = GaussianConvPolicy(
				"conv_policy",
				env_spec=env.spec,
				conv_filters=(32, 32, 32, 32),
				conv_filter_sizes=((3,3),(3,3),(3,3),(3,3)),
				conv_strides=(2, 2, 2, 2),
				conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
				hidden_sizes=(256,),
			)

			baseline = GaussianConvBaseline(
				env.spec,
				regressor_args={
					'conv_filters':(32, 32, 32, 32),
					'conv_filter_sizes':((3,3),(3,3),(3,3),(3,3)),
					'conv_strides':(2, 2, 2, 2),
					'conv_pads':('SAME', 'SAME', 'SAME', 'SAME'),
					'hidden_sizes':(256,),
				}
			)
		else:
			raise NotImplementedError("Sorry, no support for observatin space: {}".format(env.observation_space.shape))

		batch_size = 5000
		algo = TRPO(
			env=env,
			policy=policy,
			baseline=baseline,
			batch_size=batch_size,
			whole_paths=True,
			max_path_length=500,
			n_itr=1000,
			step_size=0.01,
			subsample_factor=1.0,
		)

		algorithm = ICM(
			env, 
			algo,
			variant['tensorboard'],
			no_encoder=False,
			feature_dim=env.spec.observation_space.flat_dim,
			forward_weight=0.2,
			external_reward_weight=0.99,
			inverse_tanh=True,
			init_learning_rate=1e-4
		)
		algorithm.train()

	else:
		raise NotImplementedError("Currently only supports DDPG!")