def generate_expert_dp():
    env = TfEnv(normalize(InvertedPendulumEnv()))
    policy = GaussianMLPPolicy(
        name="expert_policy",
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64),
        std_hidden_sizes=(64, 64),
        adaptive_std=True,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=64,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)),
        gae_lambda=0.97,
    )

    with tf.Session() as sess:
        algo.train(sess=sess)
        t = rollout(env=env, agent=policy, max_path_length=100, animated=False)
        print(sum(t['rewards']))
        with open('expert_dp.pickle', 'wb') as handle:
            pickle.dump(policy, handle)
        while True:
            rollout(env=env, agent=policy, max_path_length=100, animated=False)
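
The snippet above omits its module-level imports. Below is a minimal sketch of the imports it appears to assume, following the rllab / sandbox.rocky.tf layout used by the other examples on this page; the module providing InvertedPendulumEnv is project-specific and is left as an assumption.

# Hedged sketch of the imports assumed by generate_expert_dp() above.
# Paths follow the sandbox.rocky.tf layout seen elsewhere on this page.
import pickle
import tensorflow as tf
from rllab.envs.normalized_env import normalize
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.sampler.utils import rollout
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
    ConjugateGradientOptimizer,
    FiniteDifferenceHvp,
)
# InvertedPendulumEnv import omitted: its module path is project-specific.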
Example #2

def run_linear_ocm_exp(variant):
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
    import sandbox.rocky.tf.core.layers as L
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from railrl.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly, )
    from railrl.envs.memory.high_low import HighLow
    from railrl.launchers.launcher_util import (
        set_seed, )
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']

    set_seed(seed)
    onehot_dim = num_values + 1
    """
    Code for running the experiment.
    """

    # env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = HighLow(num_steps=H)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    env = FlattenedProductBox(env)

    policy = GaussianLSTMPolicy(
        name="policy",
        env_spec=env.spec,
        lstm_layer_cls=L.LSTMLayer,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    trpo_params = variant['trpo_params']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **trpo_params)

    algo.train()
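
For reference, a minimal variant dictionary matching the keys this runner reads might look like the sketch below; the numeric values are placeholders rather than settings from the original experiment.

# Illustrative variant for run_linear_ocm_exp(); keys mirror what the function
# reads above, values are placeholders.
variant = dict(
    H=16,                # episode horizon of the memory task
    seed=0,
    num_values=2,        # onehot_dim becomes num_values + 1
    optimizer_params=dict(base_eps=1e-5),   # forwarded to FiniteDifferenceHvp
    trpo_params=dict(
        batch_size=4000,
        max_path_length=16,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
    ),
)
run_linear_ocm_exp(variant)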
Example #3
def run_linear_ocm_exp(variant):
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from rlkit.envs.flattened_product_box import FlattenedProductBox
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from rlkit.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly, )
    from rlkit.launchers.launcher_util import (
        set_seed, )
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']

    set_seed(seed)
    onehot_dim = num_values + 1
    """
    Code for running the experiment.
    """

    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    env = FlattenedProductBox(env)

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    trpo_params = variant['trpo_params']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **trpo_params)

    algo.train()
Example #4
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params
    pprint(params)

    grid_world = SlaveGridWorldEnv("walled_chain",
                                   max_traj_length=DEFAULTS["max_path_length"],
                                   goal_reward=params["goal_reward"])
    agent = GridWorldMasterAgent(grid_world,
                                 match_reward=params["match_reward"])
    env = normalize(
        SituatedConversationEnvironment(env=grid_world, b_agent=agent))
    baseline = LinearFeatureBaseline(env)

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings(
            "feature_network", env.observation_space.flat_dim,
            params["feature_dim"], params["feature_hidden_dims"], tf.tanh,
            tf.tanh, agent.vocab_size, params["embedding_dim"]),
        state_include_action=False,
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=params["max_path_length"],
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=15,
        snapshot_mode="last",
        exp_prefix="grid_world_sweep3",
        variant=params,
    )
Example #5
def run_experiment(params):
    params_base = copy.copy(DEFAULTS)
    params_base.update(params)
    params = params_base

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings("embeddings",
                                                 len(VOCAB),
                                                 params["feature_dim"],
                                                 params["feature_hidden_dims"],
                                                 tf.tanh,
                                                 tf.tanh,
                                                 len(VOCAB),
                                                 params["embedding_dim"],
                                                 has_other_input=False),
        state_include_action=False,
    )

    baseline = LinearFeatureBaseline(env.spec)

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=LENGTH,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="autoenc_unnorm_reward",
        variant=params,
    )
Example #6
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    env = NormalizedBoxEnv(env)
    env = ConvertEnvToTf(env)

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               **variant['policy_params'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    algo_kwargs = variant['algo_kwargs']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **algo_kwargs)
    algo.train()
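
As with the previous runners, this function expects a variant dictionary; a hypothetical sketch with the keys it reads is shown below (the env class and all values are placeholders).

# Hypothetical variant for experiment(); key names mirror what the function
# reads, the env class and numeric values are placeholders.
variant = dict(
    env_class=MyEnv,                 # placeholder: any env class accepted by the wrappers above
    env_kwargs=dict(),
    multitask=False,
    policy_params=dict(hidden_sizes=(32, 32)),
    optimizer_params=dict(base_eps=1e-5),
    algo_kwargs=dict(
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
    ),
)
experiment(variant)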
Example #7
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=5,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="grid_world_silent",
        variant=params,
    )
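
A hypothetical invocation of this runner is sketched below; DEFAULTS supplies any parameter not overridden, and the values shown are placeholders.

# Hypothetical call; DEFAULTS fills in anything not passed explicitly.
run_experiment(
    goal_reward=10.0,
    policy_hidden_dims=(32, 32),
    batch_size=4000,
    n_itr=100,
    step_size=0.01,
)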
Example #8
    def _init_bnn_trpo(self, bnn_model, training_policy, time_step):

        if hasattr(self.env._wrapped_env, '_wrapped_env'):
            inner_env = self.env._wrapped_env._wrapped_env
        else:
            inner_env = self.env._wrapped_env.env.unwrapped

        cost_np_vec = inner_env.cost_np_vec

        batch_size = self.policy_opt_params["trpo"]["batch_size"]
        if bnn_model is not None:
            bnn_env = TfEnv(
                BayesNeuralNetEnv(env=self.env,
                                  inner_env=inner_env,
                                  cost_np=cost_np_vec,
                                  bnn_model=bnn_model,
                                  sam_mode=None))
        else:
            bnn_env = self.env

        baseline = LinearFeatureBaseline(env_spec=self.env.spec)

        algo = TRPO(
            env=bnn_env,
            policy=training_policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=time_step,
            discount=self.policy_opt_params["trpo"]["discount"],
            step_size=self.policy_opt_params["trpo"]["step_size"],
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
            # sampler_args=sampler_args,  # params for VectorizedSampler
        )

        return algo, cost_np_vec
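
The method above only reads a few entries from self.policy_opt_params; a minimal sketch of that structure is shown below, with placeholder values.

# Sketch of the policy_opt_params structure _init_bnn_trpo() expects;
# only the keys read above are shown, values are placeholders.
policy_opt_params = {
    "trpo": {
        "batch_size": 4000,
        "discount": 0.99,
        "step_size": 0.01,
    },
}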
Example #9
regularisation_coefficient = 1e-5

with tf.Session() as sess:
    oracle_algo = Oracle_TRPO(
        env=env,
        policy=oracle_policy,
        baseline=oracle_baseline,
        batch_size=batch_size_value,  # use a batch size of up to 25000
        max_path_length=max_path_length_horizon,  # or use env.horizon here, which suits different environments (though it may not be defined for all envs)
        n_itr=args.num_epochs,
        discount=0.99,
        step_size=step_size_value,
        optimizer=ConjugateGradientOptimizer(
            reg_coeff=regularisation_coefficient,
            hvp_approach=FiniteDifferenceHvp(
                base_eps=regularisation_coefficient)))

    oracle_train(oracle_algo, sess=sess)

    # rollouts = oracle_algo.obtain_samples(num_epochs + 1)
    #logger.log("Average reward for training rollouts on (%s): %f +- %f " % (env_name, np.mean([np.sum(p['rewards']) for p in rollouts]),  np.std([np.sum(p['rewards']) for p in rollouts])))
    """
    Evaluating the learned policy below,
    using the samples collected via "obtain_samples" above.

    See batch_polopt.py.
    """
    # Final evaluation on all environments using the learned policy
    # total_rollouts = []
    # # for env_name, env in envs:
    # rollouts = []
Example #10
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=expert_env.spec)

algo = TRPO(
    env=novice_env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=50,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

)

with tf.Session() as sess:

    algo.n_itr = 0
    algo.start_itr = 0
    algo.train(sess=sess)

    im_size = 50
    im_channels = 3

    dim_input = [im_size, im_size, im_channels]

    disc = DomainConfusionVelocityDiscriminator(input_dim=dim_input, output_dim_class=2, output_dim_dom=2,
Example #11
    def __init__(self, env, args):
        self.args = args
        # Parallel setup
        parallel_sampler.initialize(n_parallel=args.n_parallel)
        if args.seed is not None:
            set_seed(args.seed)
            parallel_sampler.set_seed(args.seed)

        env, policy = rllab_envpolicy_parser(env, args)

        if args.algo != 'thddpg':
            # Baseline
            if args.baseline_type == 'linear':
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            elif args.baseline_type == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            else:
                raise NotImplementedError(args.baseline_type)

        # Logger
        default_log_dir = config.LOG_DIR
        if args.log_dir is None:
            log_dir = osp.join(default_log_dir, args.exp_name)
        else:
            log_dir = args.log_dir

        tabular_log_file = osp.join(log_dir, args.tabular_log_file)
        text_log_file = osp.join(log_dir, args.text_log_file)
        params_log_file = osp.join(log_dir, args.params_log_file)

        logger.log_parameters_lite(params_log_file, args)
        logger.add_text_output(text_log_file)
        logger.add_tabular_output(tabular_log_file)
        prev_snapshot_dir = logger.get_snapshot_dir()
        prev_mode = logger.get_snapshot_mode()
        logger.set_snapshot_dir(log_dir)
        logger.set_snapshot_mode(args.snapshot_mode)
        logger.set_log_tabular_only(args.log_tabular_only)
        logger.push_prefix("[%s] " % args.exp_name)

        if args.algo == 'tftrpo':
            self.algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_iter,
                discount=args.discount,
                gae_lambda=args.gae_lambda,
                step_size=args.step_size,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)) if args.recurrent else None,
                mode=args.control)
        elif args.algo == 'thddpg':
            qfunc = thContinuousMLPQFunction(env_spec=env.spec)
            if args.exp_strategy == 'ou':
                es = OUStrategy(env_spec=env.spec)
            elif args.exp_strategy == 'gauss':
                es = GaussianStrategy(env_spec=env.spec)
            else:
                raise NotImplementedError()

            self.algo = thDDPG(env=env,
                               policy=policy,
                               qf=qfunc,
                               es=es,
                               batch_size=args.batch_size,
                               max_path_length=args.max_path_length,
                               epoch_length=args.epoch_length,
                               min_pool_size=args.min_pool_size,
                               replay_pool_size=args.replay_pool_size,
                               n_epochs=args.n_iter,
                               discount=args.discount,
                               scale_reward=0.01,
                               qf_learning_rate=args.qfunc_lr,
                               policy_learning_rate=args.policy_lr,
                               eval_samples=args.eval_samples,
                               mode=args.control)
Example #12
recognition_model = utils.build_recognition_model(args, env, summary_writer)
baseline = utils.build_baseline(args, env)
reward_handler = utils.build_reward_handler(args, summary_writer)
validator = auto_validator.AutoValidator(summary_writer,
                                         data['obs_mean'],
                                         data['obs_std'],
                                         render=args.validator_render,
                                         render_every=args.render_every,
                                         flat_recurrent=args.policy_recurrent)

# build algo
saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=.5)
sampler_args = dict(n_envs=args.n_envs) if args.vectorize else None
if args.policy_recurrent:
    optimizer = ConjugateGradientOptimizer(
        max_backtracks=50, hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
else:
    optimizer = None
algo = GAIL(critic=critic,
            recognition=recognition_model,
            reward_handler=reward_handler,
            env=env,
            policy=policy,
            baseline=baseline,
            validator=validator,
            batch_size=args.batch_size,
            max_path_length=args.max_path_length,
            n_itr=args.n_itr,
            discount=args.discount,
            step_size=args.trpo_step_size,
            saver=saver,
Example #13

def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--enable_obsnorm', action='store_true', default=False)
    parser.add_argument('--chunked', action='store_true', default=False)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--anneal_step_size', type=int, default=0)

    parser.add_argument('--n_timesteps', type=int, default=8000)

    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--radius', type=float, default=0.015)
    parser.add_argument('--n_evaders', type=int, default=10)
    parser.add_argument('--n_pursuers', type=int, default=8)
    parser.add_argument('--n_poison', type=int, default=10)
    parser.add_argument('--n_coop', type=int, default=4)
    parser.add_argument('--n_sensors', type=int, default=30)
    parser.add_argument('--sensor_range', type=str, default='0.2')
    parser.add_argument('--food_reward', type=float, default=5)
    parser.add_argument('--poison_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.05)
    parser.add_argument('--reward_mech', type=str, default='local')

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    centralized = args.control == 'centralized'

    sensor_range = np.array(list(map(float, args.sensor_range.split(','))))
    if len(sensor_range) == 1:
        sensor_range = sensor_range[0]
    else:
        assert sensor_range.shape == (args.n_pursuers, )

    env = MAWaterWorld(args.n_pursuers,
                       args.n_evaders,
                       args.n_coop,
                       args.n_poison,
                       radius=args.radius,
                       n_sensors=args.n_sensors,
                       food_reward=args.food_reward,
                       poison_reward=args.poison_reward,
                       encounter_reward=args.encounter_reward,
                       reward_mech=args.reward_mech,
                       sensor_range=sensor_range,
                       obstacle_loc=None)

    env = TfEnv(
        RLLabEnv(StandardizedEnv(env,
                                 scale_reward=args.reward_scale,
                                 enable_obsnorm=args.enable_obsnorm),
                 mode=args.control))

    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        feature_network = MLP(
            name='feature_net',
            input_shape=(env.spec.observation_space.flat_dim +
                         env.spec.action_space.flat_dim, ),
            output_dim=16,
            hidden_sizes=(128, 64, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None)
        if args.recurrent == 'gru':
            policy = GaussianGRUPolicy(env_spec=env.spec,
                                       feature_network=feature_network,
                                       hidden_dim=int(
                                           args.policy_hidden_sizes),
                                       name='policy')
        elif args.recurrent == 'lstm':
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        feature_network=feature_network,
                                        hidden_dim=int(
                                            args.policy_hidden_sizes),
                                        name='policy')
    else:
        policy = GaussianMLPPolicy(
            name='policy',
            env_spec=env.spec,
            hidden_sizes=tuple(map(int, args.policy_hidden_sizes.split(','))),
            min_std=10e-5)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif args.baseline_type == 'mlp':
        raise NotImplementedError()
        # baseline = GaussianMLPBaseline(
        #     env_spec=env.spec, hidden_sizes=tuple(map(int, args.baseline_hidden_sizes.split(','))))
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        #max_path_length_limit=args.max_path_length_limit,
        update_max_path_length=args.update_curriculum,
        anneal_step_size=args.anneal_step_size,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control
        if not args.chunked else 'chunk_{}'.format(args.control),
    )

    algo.train()
Example #14
def get_algo(env, policy, es, qf, baseline, max_path_length, batch_size,
             replay_pool_size, discount, scale_reward, learning_rate,
             replacement_prob, policy_updates_ratio, step_size, gae_lambda,
             sample_backups, kl_sample_backups, qprop_eta_option, qprop_unbias,
             qprop_nu, algo_name, n_itr, recurrent, updates_ratio,
             policy_use_target, policy_batch_size, policy_sample_last,
             ac_delta, ac_sample_backups, save_freq, restore_auto,
             qf_learning_rate, qf_use_target, qf_mc_ratio, qf_batch_size,
             qf_residual_phi, **kwargs):
    algo = None
    algo_class = None
    min_pool_size = 1000
    qf_baseline = None
    extra_kwargs = dict()

    print('Creating algo=%s with n_itr=%d, max_path_length=%d...' %
          (algo_name, n_itr, max_path_length))
    if algo_name in [
            'ddpg',
            'dspg',
            'dspgoff',
            'dqn',
            'dsqn',
            'trpg',
            'trpgoff',
    ]:
        if algo_name in [
                'trpg',
        ]:
            extra_kwargs['policy_update_method'] = 'cg'
        algo = DDPG(
            env=env,
            policy=policy,
            policy_use_target=policy_use_target,
            es=es,
            qf=qf,
            qf_use_target=qf_use_target,
            policy_batch_size=policy_batch_size,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            max_path_length=max_path_length,
            epoch_length=batch_size,  # make comparable to batchopt methods
            min_pool_size=min_pool_size,
            replay_pool_size=replay_pool_size,
            n_epochs=n_itr,
            discount=discount,
            scale_reward=scale_reward,
            qf_learning_rate=qf_learning_rate,
            policy_learning_rate=learning_rate,
            policy_step_size=step_size,
            policy_sample_last=policy_sample_last,
            replacement_prob=replacement_prob,
            policy_updates_ratio=policy_updates_ratio,
            updates_ratio=updates_ratio,
            save_freq=save_freq,
            restore_auto=restore_auto,
            **extra_kwargs,
        )
        algo_class = 'DDPG'
    elif algo_name in [
            'trpo',
            'nuqprop',
            'nuqfqprop',
            'actrpo',
            'acqftrpo',
            'qprop',
            'mqprop',
            'qfqprop',
            'nafqprop',
    ]:
        if recurrent:
            extra_kwargs['optimizer'] = \
                ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        if algo_name in [
                'actrpo',
                'acqftrpo',
        ]:
            extra_kwargs['ac_delta'] = ac_delta
            extra_kwargs['qprop'] = False  # disable qprop
            if ac_delta == 0: qf = None
        if algo_name in [
                'mqprop',
        ]:
            extra_kwargs['mqprop'] = True
        if algo_name in [
                'nuqprop',
                'nuqfqprop',
        ]:
            extra_kwargs['qprop_nu'] = qprop_nu
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                                            policy=policy,
                                            qf=qf)
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    max_path_length=max_path_length,
                    n_itr=n_itr,
                    discount=discount,
                    step_size=step_size,
                    gae_lambda=gae_lambda,
                    sample_backups=sample_backups,
                    kl_sample_backups=kl_sample_backups,
                    qf=qf,
                    qf_use_target=qf_use_target,
                    qf_batch_size=qf_batch_size,
                    qf_mc_ratio=qf_mc_ratio,
                    qf_residual_phi=qf_residual_phi,
                    min_pool_size=min_pool_size,
                    scale_reward=scale_reward,
                    qf_updates_ratio=updates_ratio,
                    qprop_eta_option=qprop_eta_option,
                    qprop_unbias=qprop_unbias,
                    replay_pool_size=replay_pool_size,
                    replacement_prob=replacement_prob,
                    qf_baseline=qf_baseline,
                    qf_learning_rate=qf_learning_rate,
                    ac_sample_backups=ac_sample_backups,
                    policy_sample_last=policy_sample_last,
                    save_freq=save_freq,
                    restore_auto=restore_auto,
                    **extra_kwargs)
        algo_class = 'TRPO'
    elif algo_name in [
            'vpg',
            'qvpg',
    ]:
        if qf is not None:
            qf_baseline = QfunctionBaseline(env_spec=env.spec,
                                            policy=policy,
                                            qf=qf)
        algo = VPG(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            discount=discount,
            gae_lambda=gae_lambda,
            optimizer_args=dict(
                tf_optimizer_args=dict(learning_rate=learning_rate, )),
            qf=qf,
            qf_use_target=qf_use_target,
            qf_batch_size=qf_batch_size,
            qf_mc_ratio=qf_mc_ratio,
            qf_residual_phi=qf_residual_phi,
            min_pool_size=min_pool_size,
            scale_reward=scale_reward,
            qf_updates_ratio=updates_ratio,
            qprop_eta_option=qprop_eta_option,
            qprop_unbias=qprop_unbias,
            replay_pool_size=replay_pool_size,
            qf_baseline=qf_baseline,
            qf_learning_rate=qf_learning_rate,
            save_freq=save_freq,
            restore_auto=restore_auto,
        )
        algo_class = 'VPG'
    print('[get_algo] Instantiating %s.' % algo_class)
    return algo
Example #15
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')

    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')

    parser.add_argument('--conv', action='store_true', default=False)

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--checkpoint', type=str, default=None)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.checkpoint:
        with tf.Session() as sess:
            data = joblib.load(args.checkpoint)
            policy = data['policy']
            env = data['env']
    else:
        if args.sample_maps:
            map_pool = np.load(args.map_file)
        else:
            if args.map_type == 'rectangle':
                env_map = TwoDMaps.rectangle_map(
                    *map(int, args.rectangle.split(',')))
            elif args.map_type == 'complex':
                env_map = TwoDMaps.complex_map(
                    *map(int, args.rectangle.split(',')))
            else:
                raise NotImplementedError()
            map_pool = [env_map]

        env = PursuitEvade(map_pool,
                           n_evaders=args.n_evaders,
                           n_pursuers=args.n_pursuers,
                           obs_range=args.obs_range,
                           n_catch=args.n_catch,
                           train_pursuit=args.train_pursuit,
                           urgency_reward=args.urgency,
                           surround=args.surround,
                           sample_maps=args.sample_maps,
                           constraint_window=args.constraint_window,
                           flatten=args.flatten,
                           reward_mech=args.reward_mech,
                           catchr=args.catchr,
                           term_pursuit=args.term_pursuit)

        env = TfEnv(
            RLLabEnv(StandardizedEnv(env,
                                     scale_reward=args.reward_scale,
                                     enable_obsnorm=False),
                     mode=args.control))

        if args.recurrent:
            if args.conv:
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=5,
                    conv_filters=(16, 32, 32),
                    conv_filter_sizes=(3, 3, 3),
                    conv_strides=(1, 1, 1),
                    conv_pads=('VALID', 'VALID', 'VALID'),
                    hidden_sizes=(64, ),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=tf.nn.softmax)
            else:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=5,
                    hidden_sizes=(256, 128, 64),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            if args.recurrent == 'gru':
                policy = CategoricalGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=int(
                                                  args.policy_hidden_sizes),
                                              name='policy')
            elif args.recurrent == 'lstm':
                policy = CategoricalLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden_sizes),
                                               name='policy')
        elif args.conv:
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16),
                conv_filter_sizes=(3, 3),
                conv_strides=(2, 1),
                conv_pads=('VALID', 'VALID'),
                hidden_sizes=(32, ),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control,
    )

    algo.train()
Example #16

    def train(self):

        expert_env = TfEnv(
            self.expert_env
        )  #TfEnv(GymEnv("Pusher3DOF-v1", force_reset=True, record_video=False))
        # expert_env = TfEnv(normalize(ReacherEnv()))
        novice_env = TfEnv(
            self.novice_env
        )  #TfEnv(GymEnv("Pusher3DOFNoChange-v1", force_reset=True, record_video=True))

        # novice_env = TfEnv(normalize(ReacherTwoEnv(), normalize_obs=True))
        expert_fail_pol = RandomPolicy(expert_env.spec)

        policy = GaussianMLPPolicy(
            name="novice_policy",
            env_spec=novice_env.spec,
            init_std=10,
            # The neural network policy should have two hidden layers, each with 32 hidden units.
            hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=expert_env.spec)

        algo = TRPO(env=novice_env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=50 * 500,
                    max_path_length=self.horizon,
                    n_itr=self.itrs,
                    discount=0.99,
                    step_size=0.01,
                    optimizer=ConjugateGradientOptimizer(
                        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:

            #What do the n_itr and start_itr mean?
            algo.n_itr = 0
            algo.start_itr = 0
            algo.train(sess=sess)  #TODO: What is happening here?

            im_height = self.imsize[0]
            im_width = self.imsize[1]
            im_channels = 3

            dim_input = [im_height, im_width, im_channels]

            disc = DomainConfusionVelocityDiscriminator(input_dim=dim_input,
                                                        output_dim_class=2,
                                                        output_dim_dom=2,
                                                        tf_sess=sess)

            #data = joblib.load(self.expert_pkl)#"/home/andrewliu/research/viewpoint/rllab-tpil/third_person_im/data/local/experiment/experiment_2017_05_07_20_58_39_0001/itr_123.pkl")#"/home/abhigupta/abhishek_sandbox/viewpoint/third_person_im/data/local/experiment/experiment_2017_05_06_18_07_38_0001/itr_900.pkl")
            #expert_policy = data['policy']
            with open(self.expert_pkl, 'rb') as pfile:
                expert_policy = pickle.load(pfile)
            # expert_policy = load_expert_reacher(expert_env, sess) #Load the expert #TODO: Need to train the expert

            #from rllab.sampler.utils import rollout
            #while True:
            #        t = rollout(env=expert_env, agent=expert_policy, max_path_length=50, animated=True)

            algo.n_itr = self.itrs
            trainer = CyberPunkTrainer(disc=disc,
                                       novice_policy_env=novice_env,
                                       expert_fail_pol=expert_fail_pol,
                                       expert_env=expert_env,
                                       novice_policy=policy,
                                       novice_policy_opt_algo=algo,
                                       expert_success_pol=expert_policy,
                                       im_width=im_width,
                                       im_height=im_height,
                                       im_channels=im_channels,
                                       tf_sess=sess,
                                       horizon=self.horizon)

            iterations = self.itrs
            for iter_step in range(0, iterations):
                logger.record_tabular('Iteration', iter_step)
                trainer.take_iteration(n_trajs_cost=self.trajs,
                                       n_trajs_policy=self.trajs)
                logger.dump_tabular(with_prefix=False)

            trainer.log_and_finish()
Example #17

def run_experiment(expert_rollout_pickle_path,
                   trained_policy_pickle_path,
                   env,
                   cost_trainer_type,
                   iterations=30,
                   num_frames=1,
                   traj_len=200,
                   config={}):

    # Load the expert rollouts into memory
    expert_rollouts = load_expert_rollouts(expert_rollout_pickle_path)

    # In the case that we only have one expert rollout in the file
    if type(expert_rollouts) is dict:
        expert_rollouts = [expert_rollouts]

    #TODO: make this configurable
    expert_rollouts = [
        shorten_tensor_dict(x, traj_len) for x in expert_rollouts
    ]

    # import pdb; pdb.set_trace()

    # Sanity check, TODO: should prune any "expert" rollouts with suboptimal reward?
    print("Average reward for expert rollouts: %f" %
          np.mean([np.sum(p['rewards']) for p in expert_rollouts]))

    if "transformers" in config and len(config["transformers"]) > 0:
        print("Transforming expert rollouts...")
        for rollout in tqdm(expert_rollouts):
            transformed_observations = []
            for ob in tqdm(rollout["observations"]):
                for transformer in config["transformers"]:
                    ob = transformer.transform(ob)
                transformed_observations.append(ob)
            rollout["observations"] = np.array(transformed_observations)

    # Handle both flattened state input and image input
    # TODO: this could be done better by looking at just the shape and determining from that
    if config["img_input"]:
        obs_dims = expert_rollouts[0]['observations'][0].shape
    else:
        # import pdb; pdb.set_trace()
        obs_dims = len(expert_rollouts[0]['observations'][0])

    if "num_novice_rollouts" in config:
        number_of_sample_trajectories = config["num_novice_rollouts"]
    else:
        number_of_sample_trajectories = len(expert_rollouts)

    print(number_of_sample_trajectories)

    # Choose a policy (Conv based on images, mlp based on states)
    # TODO: may also have to switch out categorical for something else in continuous state spaces??
    # Let's just avoid that for now?
    # TODO: unclear right now if this even works ok. get poor results early on.
    if config["img_input"]:
        policy = CategoricalConvPolicy(
            name="policy",
            env_spec=env.spec,
            conv_filters=[32, 64, 64],
            conv_filter_sizes=[3, 3, 3],
            conv_strides=[1, 1, 1],
            conv_pads=['SAME', 'SAME', 'SAME'],
            # Two hidden layers (the RLGAN paper uses 100 hidden units each; 200 are used here)
            hidden_sizes=[200, 200])
    elif type(env.spec.action_space) == Discrete:
        policy = CategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            # Two hidden layers (the RLGAN paper uses 100 hidden units each; 400 and 300 are used here)
            hidden_sizes=(400, 300))
    else:
        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(100, 50, 25))

    if config["img_input"]:
        # TODO: right now the linear feature baseline is too computationally expensive to actually use
        # with full image inputs, so for now just use the zero baseline
        baseline = ZeroBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        # batch_size is used internally by the sampler; we reuse that sampler to
        # generate our samples, hence we pass it here.
        batch_size=number_of_sample_trajectories * traj_len,
        # Same with max_path_length. A cleaner way may be to create our own
        # sampler, but for now we do it this way.
        max_path_length=traj_len,
        n_itr=40,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5),
            max_backtracks=40))

    # Prune the number of rollouts if that option is enabled
    if "num_expert_rollouts" in config:
        rollouts_to_use = min(config["num_expert_rollouts"],
                              len(expert_rollouts))
        expert_rollouts = expert_rollouts[:rollouts_to_use]
        print("Only using %d expert rollouts" % rollouts_to_use)

    true_rewards = []
    actual_rewards = []

    # Extract observations to a tensor
    expert_rollouts_tensor = tensor_utils.stack_tensor_list(
        [path["observations"] for path in expert_rollouts])

    if "oversample" in config and config["oversample"]:
        oversample_rate = max(
            int(number_of_sample_trajectories / len(expert_rollouts_tensor)), 1)
        expert_rollouts_tensor = expert_rollouts_tensor.repeat(oversample_rate,
                                                               axis=0)
        print("oversampling %d times to %d" %
              (oversample_rate, len(expert_rollouts_tensor)))

    with tf.Session() as sess:
        algo.start_worker()

        cost_trainer = cost_trainer_type([num_frames, obs_dims], config=config)

        trainer = Trainer(env=env,
                          sess=sess,
                          cost_approximator=cost_trainer,
                          cost_trainer=cost_trainer,
                          novice_policy=policy,
                          novice_policy_optimizer=algo,
                          num_frames=num_frames)
        sess.run(tf.global_variables_initializer())

        for iter_step in range(0, iterations):
            # dump datapoints on the last iteration
            dump_data = (iter_step == iterations - 1) and config["generate_option_graphs"]
            true_reward, actual_reward = trainer.step(
                dump_datapoints=dump_data,
                config=config,
                expert_horizon=traj_len,
                number_of_sample_trajectories=number_of_sample_trajectories)
            true_rewards.append(true_reward)
            actual_rewards.append(actual_reward)

            # run a rollout for the video
            if "recording_env" in config:
                novice_rollouts = rollout_policy(policy,
                                                 config["recording_env"],
                                                 get_image_observations=False,
                                                 max_path_length=200)

        novice_rollouts = algo.obtain_samples(iter_step)

        rollout_rewards = [np.sum(x['rewards']) for x in novice_rollouts]

        print("Reward stats for final policy: %f +/- %f " %
              (np.mean(rollout_rewards), np.std(rollout_rewards)))
        # save the novice policy learned
        with open(trained_policy_pickle_path, "wb") as output_file:
            pickle.dump(policy, output_file)
        # TODO: also save the reward function?

        algo.shutdown_worker()

        second_true_rewards = []
        second_actual_rewards = []
        # Do our transfer learning task here:
        # TODO: move this to a separate script and save the learned weights
        if config['second_env'] is not None:
            with tf.variable_scope("second_policy"):
                #TODO: remove gross copypasta
                if not config["reset_second_policy"]:
                    second_policy = Serializable.clone(
                        policy)  # TODO: start with a fresh policy
                else:
                    # TODO: unclear right now if this even works ok. get poor results early on.
                    if config["img_input"]:
                        second_policy = CategoricalConvPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            conv_filters=[32, 64, 64],
                            conv_filter_sizes=[3, 3, 3],
                            conv_strides=[1, 1, 1],
                            conv_pads=['SAME', 'SAME', 'SAME'],
                            # Two hidden layers (the RLGAN paper uses 100 hidden units each; 200 are used here)
                            hidden_sizes=[200, 200])
                    elif type(env.spec.action_space) == Discrete:
                        second_policy = CategoricalMLPPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            # Two hidden layers (the RLGAN paper uses 100 hidden units each; 400 and 300 are used here)
                            hidden_sizes=(400, 300))
                    else:
                        second_policy = GaussianMLPPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            hidden_sizes=(100, 50, 25))

                if config["img_input"]:
                    # TODO: right now the linear feature baseline is too computationally expensive to actually use
                    # with full image inputs, so for now just use the zero baseline
                    baseline = ZeroBaseline(env_spec=config["second_env"].spec)
                else:
                    baseline = LinearFeatureBaseline(
                        env_spec=config["second_env"].spec)

                algo = TRPO(
                    env=config["second_env"],
                    policy=second_policy,
                    baseline=baseline,
                    # batch_size is used internally by the sampler; we reuse that
                    # sampler to generate our samples, hence we pass it here.
                    batch_size=number_of_sample_trajectories * traj_len,
                    # Same with max_path_length. A cleaner way may be to create
                    # our own sampler, but for now we do it this way.
                    max_path_length=traj_len,
                    n_itr=40,
                    discount=0.995,
                    step_size=0.01,
                    optimizer=ConjugateGradientOptimizer(
                        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5),
                        max_backtracks=40))

            if not config["stop_disc_training_on_second_run"] and config[
                    "use_prev_options_relearn_mixing_func"]:
                # If we're not retraining the discriminator at all in the transfer learning step,
                # just keep the old network
                options = cost_trainer.disc.discriminator_options
                cost_trainer.disc._remake_network_from_disc_options(
                    options,
                    stop_gradients=(not config["retrain_options"]),
                    num_extra_options=config["num_extra_options_on_transfer"])

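            # The same cost_trainer object serves as both the reward
            # approximator and the (optionally trained) discriminator trainer;
            # training is disabled via stop_disc_training_on_second_run.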
            trainer = Trainer(
                env=config['second_env'],
                sess=sess,
                cost_approximator=cost_trainer,
                cost_trainer=cost_trainer,
                novice_policy=second_policy,
                novice_policy_optimizer=algo,
                num_frames=num_frames,
                train_disc=(not config["stop_disc_training_on_second_run"]))
            algo.start_worker()

            initialize_uninitialized(sess)
            for iter_step in range(0, iterations):
                # Only dump option-graph data on the last iteration.
                dump_data = (iter_step == iterations - 1
                             and config["generate_option_graphs"])
                true_reward, actual_reward = trainer.step(
                    expert_rollouts_tensor=expert_rollouts_tensor,
                    dump_datapoints=dump_data,
                    config=config,
                    expert_horizon=traj_len,
                    number_of_sample_trajectories=number_of_sample_trajectories
                )
                second_true_rewards.append(true_reward)
                second_actual_rewards.append(actual_reward)

                # run a rollout for the video
                if "recording_env" in config:
                    novice_rollouts = rollout_policy(
                        second_policy,
                        config["recording_env"],
                        get_image_observations=False,
                        max_path_length=traj_len)

            novice_rollouts = algo.obtain_samples(iter_step)

            rollout_rewards = [np.sum(x['rewards']) for x in novice_rollouts]
            print("Reward stats for final policy: %f +/- %f " %
                  (np.mean(rollout_rewards), np.std(rollout_rewards)))
            # save the novice policy learned
            with open(trained_policy_pickle_path, "wb") as output_file:
                pickle.dump(second_policy, output_file)

            algo.shutdown_worker()

    return true_rewards, actual_rewards, second_true_rewards, second_actual_rewards
Example 18
    def setup(self, env, policy, start_itr):

        if self.args.algo != 'thddpg':
            # Baseline
            if self.args.baseline_type == 'linear':
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            elif self.args.baseline_type == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            else:
                raise NotImplementedError(self.args.baseline_type)

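            # Concurrent multi-agent control expects one baseline per agent;
            # note this fills the list with references to the same baseline.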
            if self.args.control == 'concurrent':
                baseline = [baseline for _ in range(len(env.agents))]
        # Logger
        default_log_dir = config.LOG_DIR
        if self.args.log_dir is None:
            log_dir = osp.join(default_log_dir, self.args.exp_name)
        else:
            log_dir = self.args.log_dir

        tabular_log_file = osp.join(log_dir, self.args.tabular_log_file)
        text_log_file = osp.join(log_dir, self.args.text_log_file)
        params_log_file = osp.join(log_dir, self.args.params_log_file)

        logger.log_parameters_lite(params_log_file, self.args)
        logger.add_text_output(text_log_file)
        logger.add_tabular_output(tabular_log_file)
        prev_snapshot_dir = logger.get_snapshot_dir()
        prev_mode = logger.get_snapshot_mode()
        logger.set_snapshot_dir(log_dir)
        logger.set_snapshot_mode(self.args.snapshot_mode)
        logger.set_log_tabular_only(self.args.log_tabular_only)
        logger.push_prefix("[%s] " % self.args.exp_name)

        if self.args.algo == 'tftrpo':
            algo = MATRPO(
                env=env,
                policy_or_policies=policy,
                baseline_or_baselines=baseline,
                batch_size=self.args.batch_size,
                start_itr=start_itr,
                max_path_length=self.args.max_path_length,
                n_itr=self.args.n_iter,
                discount=self.args.discount,
                gae_lambda=self.args.gae_lambda,
                step_size=self.args.step_size,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)) if self.args.recurrent else None,
                ma_mode=self.args.control)
        elif self.args.algo == 'thddpg':
            qfunc = thContinuousMLPQFunction(env_spec=env.spec)
            if self.args.exp_strategy == 'ou':
                es = OUStrategy(env_spec=env.spec)
            elif self.args.exp_strategy == 'gauss':
                es = GaussianStrategy(env_spec=env.spec)
            else:
                raise NotImplementedError()

            algo = thDDPG(env=env,
                          policy=policy,
                          qf=qfunc,
                          es=es,
                          batch_size=self.args.batch_size,
                          max_path_length=self.args.max_path_length,
                          epoch_length=self.args.epoch_length,
                          min_pool_size=self.args.min_pool_size,
                          replay_pool_size=self.args.replay_pool_size,
                          n_epochs=self.args.n_iter,
                          discount=self.args.discount,
                          scale_reward=0.01,
                          qf_learning_rate=self.args.qfunc_lr,
                          policy_learning_rate=self.args.policy_lr,
                          eval_samples=self.args.eval_samples,
                          mode=self.args.control)
        return algo
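
        # Hedged usage sketch (illustrative only, not from this file): the
        # returned algorithm is driven by the surrounding runner, e.g.
        #     algo = self.setup(env, policy, start_itr=0)
        #     algo.train()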
Example 19
    hidden_sizes=(100, 50, 25),
    hidden_nonlinearity=tf.nn.relu,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=5000,
            max_path_length=env.horizon,
            n_itr=args.num_epochs,
            discount=0.99,
            step_size=0.01,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_prefix="UnifiedDDPG_" + args.env + "_trpo",
    seed=1,
    mode="ec2" if args.use_ec2 else "local",
    plot=False,
    # dry=True,
)

def run_task(vv, log_dir=None, exp_name=None):
    global policy
    global baseline
    policy = None
    baseline = None

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    # Check if variant is available
    if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']:
        raise ValueError('Unrecognized model type for simulating robot')
    if vv['robot_type'] not in ['MRZR', 'RCCar']:
        raise ValueError('Unrecognized robot type')

    # Load environment
    if not vv['use_ros']:
        env = StraightEnv(
            target_velocity=vv['target_velocity'],
            dt=vv['dt'],
            model_type=vv['model_type'],
            robot_type=vv['robot_type'],
            mu_s=vv['mu_s'],
            mu_k=vv['mu_k']
        )
        env = TfEnv(env)
    else:
        from aa_simulation.envs.straight.straight_env_ros import StraightEnvROS
        env = StraightEnvROS(
            target_velocity=vv['target_velocity'],
            dt=vv['dt'],
            model_type=vv['model_type'],
            robot_type=vv['robot_type']
        )

    # Save variant information for comparison plots
    # variant_file = logger.get_snapshot_dir() + '/variant.json'
    # logger.log_variant(variant_file, vv)

    # Set variance for each action component separately for exploration
    # Note: We set the variance manually because we are not scaling our
    #       action space during training.
    init_std_speed = vv['target_velocity'] / 4
    init_std_steer = np.pi / 6
    init_std = [init_std_speed, init_std_steer]
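    # For example, with target_velocity = 2.0 m/s this gives
    # init_std of roughly [0.5, 0.52] (speed in m/s, steering in radians).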

    # Build policy and baseline networks
    # Note: Mean of policy network set to analytically computed values for
    #       faster training (rough estimates for RL to fine-tune).
    if policy is None or baseline is None:
        target_velocity = vv['target_velocity']
        target_steering = 0
        output_mean = np.array([target_velocity, target_steering])
        hidden_sizes = (32, 32)

        # In mean network, allow output b values to dominate final output
        # value by constraining the magnitude of the output W matrix. This is
        # to allow faster learning. These numbers are arbitrarily chosen.
        W_gain = min(vv['target_velocity'] / 5, np.pi / 15)
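        # For example, target_velocity = 2.0 gives
        # W_gain = min(0.4, pi / 15), i.e. roughly 0.21.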


        policy = GaussianLSTMPolicy(
            name="policy",
            env_spec=env.spec,
            # input_shape=(env.spec.observation_space.flat_dim,),
            # output_dim=env.spec.action_space.flat_dim,
            # gru_layer_cls=L.GRULayer,
        )
        # mean_network = MLP(
        #     input_shape=(env.spec.observation_space.flat_dim,),
        #     output_dim=env.spec.action_space.flat_dim,
        #     hidden_sizes=hidden_sizes,
        #     hidden_nonlinearity=LN.rectify,
        #     output_nonlinearity=None,
        #     output_W_init=LI.GlorotUniform(gain=W_gain),
        #     output_b_init=output_mean
        # )
        # policy = GaussianMLPPolicy(
        #     env_spec=env.spec,
        #     hidden_sizes=(32, 32),
        #     init_std=init_std,
        #     mean_network=mean_network
        # )
        baseline = LinearFeatureBaseline(
            env_spec=env.spec,
            target_key='returns'
        )

    # Reset variance to re-enable exploration when using pre-trained networks
    else:
        policy._l_log_std = ParamLayer(
            policy._mean_network.input_layer,
            num_units=env.spec.action_space.flat_dim,
            param=LI.Constant(np.log(init_std)),
            name='output_log_std',
            trainable=True
        )
        obs_var = policy._mean_network.input_layer.input_var
        mean_var, log_std_var = L.get_output([policy._l_mean, policy._l_log_std])
        policy._log_std_var = log_std_var
        LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std])
        policy._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var]
        )

    safety_baseline = LinearFeatureBaseline(
        env_spec=env.spec,
        target_key='safety_returns'
    )

    safety_constraint = StraightSafetyConstraint(
        max_value=1.0,
        baseline=safety_baseline
    )

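    # Plain TRPO below ignores the safety constraint; CPO enforces it via the
    # safety baseline defined above.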
    if vv['algo'] == 'TRPO':
        algo = Trpo(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=2000,
            discount=0.99,
            step_size=trpo_stepsize,
            plot=False,
            optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        )
    else:
        algo = CPO(
            env=env,
            policy=policy,
            baseline=baseline,
            safety_constraint=safety_constraint,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=2000,
            discount=0.99,
            step_size=trpo_stepsize,
            gae_lambda=0.95,
            safety_gae_lambda=1,
            optimizer_args={'subsample_factor': trpo_subsample_factor},
            plot=False
        )
    algo.train()
Example 21
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=2000,
    #max_path_length=env.horizon,
    n_itr=1000,
    discount=0.99,
    step_size=0.01,
    gae_lambda=1.0,
    optimizer=ConjugateGradientOptimizer(
        reg_coeff=1e-5, hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

name = "TRPO_Trial_Results/" + "Trial_GridWorld/"

run_experiment_lite(
    algo.train(),
    # log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="none",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    exp_name=name,
    seed=1,
    # mode="ec2" if args.use_ec2 else "local",
Example 22
    for env_name, env in envs:

        logger.log("Training Policy on %s" % env_name)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=args.batch_size,
            max_path_length=env.horizon,
            n_itr=args.num_epochs,
            discount=0.99,
            step_size=args.step_size,
            optimizer=ConjugateGradientOptimizer(
                reg_coeff=args.reg_coeff,
                hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff)))

        custom_train(algo, sess=sess)

        rollouts = algo.obtain_samples(args.num_epochs + 1)

        logger.log("Average reward for training rollouts on (%s): %f +- %f " %
                   (env_name, np.mean([np.sum(p['rewards'])
                                       for p in rollouts]),
                    np.std([np.sum(p['rewards']) for p in rollouts])))

    # Final evaluation on all environments using the learned policy

    total_rollouts = []
    for env_name, env in envs:
        rollouts = []
Example 23
def run(args):
    print("loading from:", args.params_filepath)
    print("saving to:", args.exp_name)
    exp_dir = utils.set_up_experiment(exp_name=args.exp_name, phase='imitate')
    saver_dir = os.path.join(exp_dir, 'imitate', 'log')
    saver_filepath = os.path.join(saver_dir, 'checkpoint')
    np.savez(os.path.join(saver_dir, 'args'), args=args)
    summary_writer = tf.summary.FileWriter(
        os.path.join(exp_dir, 'imitate', 'summaries'))

    # build components
    env, act_low, act_high = utils.build_ngsim_env(args,
                                                   exp_dir,
                                                   vectorize=args.vectorize)
    data = utils.load_data(args.expert_filepath,
                           act_low=act_low,
                           act_high=act_high,
                           min_length=args.env_H + args.env_primesteps,
                           clip_std_multiple=args.normalize_clip_std_multiple,
                           ngsim_filename=args.ngsim_filename)
    critic = utils.build_critic(args, data, env, summary_writer)
    policy = utils.build_policy(args, env)
    recognition_model = utils.build_recognition_model(args, env,
                                                      summary_writer)
    baseline = utils.build_baseline(args, env)
    reward_handler = utils.build_reward_handler(args, summary_writer)
    validator = auto_validator.AutoValidator(
        summary_writer,
        data['obs_mean'],
        data['obs_std'],
        render=args.validator_render,
        render_every=args.render_every,
        flat_recurrent=args.policy_recurrent)

    # build algo
    saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=.5)
    sampler_args = dict(n_envs=args.n_envs) if args.vectorize else None
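    # Recurrent policies get a conjugate-gradient optimizer with a
    # finite-difference HVP; otherwise the algorithm's default optimizer is
    # used (optimizer=None below).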
    if args.policy_recurrent:
        optimizer = ConjugateGradientOptimizer(
            max_backtracks=50, hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    else:
        optimizer = None
    algo = GAIL(critic=critic,
                recognition=recognition_model,
                reward_handler=reward_handler,
                env=env,
                policy=policy,
                baseline=baseline,
                validator=validator,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_itr,
                discount=args.discount,
                step_size=args.trpo_step_size,
                saver=saver,
                saver_filepath=saver_filepath,
                force_batch_sampler=not args.vectorize,
                sampler_args=sampler_args,
                snapshot_env=False,
                plot=False,
                optimizer=optimizer,
                optimizer_args=dict(max_backtracks=50, debug_nan=True))

    # run it
    with tf.Session() as session:

        # Run the variable initialization here so that parameters can be
        # loaded afterwards.
        # NOTE: rllab's batchpolopt also runs this initialization before
        # training, which would overwrite any parameters loaded here; that
        # initialization has to be commented out for loading to take effect.
        session.run(tf.global_variables_initializer())

        # loading
        if args.params_filepath != '':
            algo.load(args.params_filepath)

        # run training
        algo.train(sess=session)