Code Example #1
File: offline.py  Project: phillip1029/BREMEN
def get_data_from_offline_batch(params,
                                env,
                                normalization_scope=None,
                                model='dynamics',
                                split_ratio=0.666667):
    train_collection = DataCollection(batch_size=params[model]['batch_size'],
                                      max_size=params['max_train_data'],
                                      shuffle=True)
    val_collection = DataCollection(batch_size=params[model]['batch_size'],
                                    max_size=params['max_val_data'],
                                    shuffle=False)
    rollout_sampler = RolloutSampler(env)
    rl_paths = rollout_sampler.generate_offline_data(
        data_file=params['data_file'], n_train=params["n_train"])
    path_collection = PathCollection()
    obs_dim = env.observation_space.shape[0]
    normalization = add_path_data_to_collection_and_update_normalization(
        rl_paths,
        path_collection,
        train_collection,
        val_collection,
        normalization=None,
        split_ratio=split_ratio,
        obs_dim=obs_dim,
        normalization_scope=normalization_scope)
    return train_collection, val_collection, normalization, path_collection, rollout_sampler
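
A minimal sketch of a possible call site for this helper, for orientation only: the params values, the data_file path, and the environment name below are illustrative placeholders, not taken from the project's configuration files; get_env is the project helper used in the train() examples further down.

# Hypothetical call site; all values are illustrative placeholders.
params = {
    'dynamics': {'batch_size': 128},        # params[model]['batch_size'] with the default model='dynamics'
    'max_train_data': 100000,
    'max_val_data': 20000,
    'data_file': 'path/to/offline_batch',   # placeholder path
    'n_train': 50000,
}
env = get_env('HalfCheetah-v2', None)       # placeholder environment name

(train_collection, val_collection, normalization,
 path_collection, rollout_sampler) = get_data_from_offline_batch(params, env)
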
Code Example #2
File: main.py  Project: clvoloshin/mbbl-metrpo
def pre_train_dynamics(params, dyn_model, policy, algo, reset_opt, sess,
                       path_collection, train_collection, val_collection,
                       normalization, rollout_sampler):
    dyn_model.use_intrinsic_rewards_only()

    pre_train_itr = params["dynamics"].get("pre_training", {}).get("itr", 0)
    logger.info("Pre-training dynamics model for {} iterations...".format(
        pre_train_itr))
    tf.global_variables_initializer().run()

    for itr in range(pre_train_itr):
        logger.info('Pre-training itr #{} |'.format(itr))
        dyn_model.fit(train_collection, val_collection)
        rollout_sampler.update_dynamics(dyn_model)
        dyn_model.update_randomness()

        sess.run(reset_opt)

        if params['policy'].get('reinitialize_every_itr', False):
            logger.info("Re-initialize policy variables")
            policy.initialize_variables()

        train_policy_trpo(params, algo, dyn_model,
                          params["dynamics"]["pre_training"]["policy_itr"])
        rl_paths = rollout_sampler.sample(
            num_paths=params['num_path_onpol'],
            horizon=params['env_horizon'],
            visualize=params.get("rollout_visualization", False),
            visualize_path_no=params.get("rollout_record_path_no"),
        )

        returns = np.array([sum(path["rewards"]) for path in rl_paths])
        log_tabular_results(returns, itr, train_collection)

        normalization = add_path_data_to_collection_and_update_normalization(
            rl_paths, path_collection, train_collection, val_collection,
            normalization)
    logger.info("Done pre-training dynamics model.")
Code Example #3
File: main.py  Project: clvoloshin/mbbl-metrpo
def get_data_from_random_rollouts(params, env, normalization_scope=None):
    train_collection = DataCollection(
        batch_size=params['dynamics']['batch_size'],
        max_size=params['max_train_data'],
        shuffle=True)
    val_collection = DataCollection(
        batch_size=params['dynamics']['batch_size'],
        max_size=params['max_val_data'],
        shuffle=False)
    rollout_sampler = RolloutSampler(env)
    random_paths = rollout_sampler.generate_random_rollouts(
        num_paths=params['num_path_random'], horizon=params['env_horizon'])
    path_collection = PathCollection()
    obs_dim = env.observation_space.shape[0]
    normalization = add_path_data_to_collection_and_update_normalization(
        random_paths,
        path_collection,
        train_collection,
        val_collection,
        normalization=None,
        obs_dim=obs_dim,
        normalization_scope=normalization_scope)
    return train_collection, val_collection, normalization, path_collection, rollout_sampler
Code Example #4
def get_data_from_random_rollouts(params,
                                  env,
                                  random_paths,
                                  normalization_scope=None,
                                  model='dynamics',
                                  split_ratio=0.666667):
    train_collection = DataCollection(batch_size=params[model]['batch_size'],
                                      max_size=params['max_train_data'],
                                      shuffle=True)
    val_collection = DataCollection(batch_size=params[model]['batch_size'],
                                    max_size=params['max_val_data'],
                                    shuffle=False)
    path_collection = PathCollection()
    obs_dim = env.observation_space.shape[0]
    normalization = add_path_data_to_collection_and_update_normalization(
        random_paths,
        path_collection,
        train_collection,
        val_collection,
        normalization=None,
        split_ratio=split_ratio,
        obs_dim=obs_dim,
        normalization_scope=normalization_scope)
    return train_collection, val_collection, normalization, path_collection
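
Unlike Example #3, this variant expects the caller to build the RolloutSampler and the random rollouts itself, and it does not return the sampler. A condensed sketch of that call pattern, following how train() in Example #5 uses it (num_paths below is illustrative):

rollout_sampler = RolloutSampler(env)
random_paths = rollout_sampler.generate_random_rollouts(
    num_paths=10, horizon=params['env_horizon'])
train_collection, val_collection, normalization, path_collection = \
    get_data_from_random_rollouts(params, env, random_paths, split_ratio=0.85)
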
Code Example #5
def train(params):
    sess = get_session(interactive=True)
    env = get_env(params['env_name'], params.get('video_dir'))
    inner_env = get_inner_env(env)

    num_paths = int(params['n_train'] * params['interval'] /
                    params['onpol_iters'] / params['env_horizon'])

    rollout_sampler = RolloutSampler(env)
    behavior_policy_rollout_sampler = RolloutSampler(env)
    random_paths = rollout_sampler.generate_random_rollouts(
        num_paths=num_paths, horizon=params['env_horizon'])

    # get random traj
    train_collection, val_collection, normalization, path_collection = \
        get_data_from_random_rollouts(params, env, random_paths, split_ratio=0.85)

    behavior_policy_train_collection, behavior_policy_val_collection, \
        behavior_policy_normalization, behavior_policy_path_collection = \
        get_data_from_random_rollouts(params,
                                      env,
                                      random_paths,
                                      normalization_scope='behavior_policy',
                                      model='behavior_policy',
                                      split_ratio=1.0)

    # ############################################################
    # ############### create computational graph #################
    # ############################################################
    policy = create_policy_from_params(params, env, sess)
    controller = create_controller_from_policy(policy)
    rollout_sampler.update_controller(controller)

    # (approximated) behavior policy
    behavior_policy = create_behavior_policy_from_params(params, env, sess)
    behavior_policy_controller = create_controller_from_policy(behavior_policy)
    behavior_policy_rollout_sampler.update_controller(
        behavior_policy_controller)

    dyn_model = create_dynamics_model(params, env, normalization, sess)

    if params['algo'] not in ('trpo', 'vime'):
        raise NotImplementedError

    algo = create_trpo_algo(
        params,
        env,
        inner_env,
        policy,
        dyn_model,
        sess,
        behavior_policy=behavior_policy,
        offline_dataset=behavior_policy_train_collection.data["observations"])

    # ############################################################
    # ######################### learning #########################
    # ############################################################

    # init global variables
    all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope=None)
    policy_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="policy")
    behavior_policy_variables = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope="behavior_policy")
    if params['param_value']:
        value_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope="baseline")
        all_var_except_policy = [
            v for v in all_variables
            if v not in (policy_variables + behavior_policy_variables +
                         value_variables)
        ]
    else:
        all_var_except_policy = [
            v for v in all_variables
            if v not in (policy_variables + behavior_policy_variables)
        ]

    train_dyn_with_intrinsic_reward_only = params["dynamics"].get(
        "intrinsic_reward_only", False)
    logger.log("Train dynamics model with intrinsic reward only? {}".format(
        train_dyn_with_intrinsic_reward_only))

    dynamics_saver = tf.train.Saver(var_list=all_var_except_policy)
    behavior_policy_saver = tf.train.Saver(var_list=behavior_policy_variables)
    policy_saver = tf.train.Saver(var_list=policy_variables)
    tf.global_variables_initializer().run()

    if params['restart_iter'] != 0:
        start_itr = params['restart_iter'] + 1
    else:
        start_itr = params.get("start_onpol_iter", 0)
    interval = params['interval']
    end_itr = params['onpol_iters']

    if train_dyn_with_intrinsic_reward_only:
        # Note: not supported
        dyn_model.use_intrinsic_rewards_only()
    else:
        dyn_model.use_external_rewards_only()

    # for restart experiment
    if confirm_restoring_policy(params):
        restore_policy(params, policy_saver, sess)
    if confirm_restoring_dynamics_model(params):
        restore_model(params, dynamics_saver, sess)
    if confirm_restoring_behavior_policy(params):
        restore_behavior_policy(params, behavior_policy_saver, sess)
    if confirm_restoring_offline_data(params):
        train_collection, val_collection, behavior_policy_train_collection = restore_offline_data(
            params)
        policy.running_stats.update_stats(
            train_collection.data["observations"])
        behavior_policy.running_stats.update_stats(
            behavior_policy_train_collection.data["observations"])
    if confirm_restoring_value(params):
        algo.baseline.restore_value_function(params['restore_path'],
                                             params['restart_iter'])
        algo.baseline.running_stats.update_stats(
            train_collection.data["observations"])

    # training
    for itr in range(start_itr, end_itr):
        if itr % interval == 0:
            if itr != 0:
                logger.info("Collecting offline data with online interaction.")
                rl_paths = rollout_sampler.sample(
                    num_paths=num_paths,
                    horizon=params['env_horizon'],
                    evaluation=False)
                # Update data for dynamics training
                normalization = add_path_data_to_collection_and_update_normalization(
                    rl_paths,
                    path_collection,
                    train_collection,
                    val_collection,
                    normalization,
                    split_ratio=0.85)
                # Update data for BC fitting
                if not params['all_bc']:
                    behavior_policy_normalization = replace_path_data_to_collection_and_update_normalization(
                        rl_paths,
                        behavior_policy_train_collection,
                        behavior_policy_val_collection,
                        behavior_policy_normalization,
                        split_ratio=1.0)
                else:
                    behavior_policy_normalization = add_path_data_to_collection_and_update_normalization(
                        rl_paths,
                        behavior_policy_path_collection,
                        behavior_policy_train_collection,
                        behavior_policy_val_collection,
                        behavior_policy_normalization,
                        split_ratio=1.0)
                behavior_policy_train_collection.set_batch_size(
                    params['behavior_policy']['batch_size'])
            # dynamics
            logger.info("Fitting dynamics.")
            dyn_model.fit(train_collection, val_collection)
            logger.info("Done fitting dynamics.")
            save_cur_iter_dynamics_model(params, dynamics_saver, sess, itr)
            rollout_sampler.update_dynamics(dyn_model)
            # BC
            logger.info("Fitting BC.")
            behavior_policy.initialize_variables()
            behavior_policy.running_stats.update_stats(
                behavior_policy_train_collection.data["observations"])
            behavior_policy.fit_as_bc(behavior_policy_train_collection,
                                      behavior_policy_val_collection,
                                      behavior_policy_rollout_sampler)
            save_cur_iter_behavior_policy(params, behavior_policy_saver, sess,
                                          itr)
            logger.info("Done fitting BC.")
            # re-initialize TRPO policy with BC policy
            if params['bc_init']:
                logger.info("Initialize TRPO policy with BC.")
                update_weights = [
                    tf.assign(new, old)
                    for (new,
                         old) in zip(tf.trainable_variables('policy'),
                                     tf.trainable_variables('behavior_policy'))
                ]
                sess.run(update_weights)
                algo.reinit_with_source_policy(behavior_policy)
                if rollout_sampler:
                    rl_paths = rollout_sampler.sample(
                        num_paths=params['num_path_onpol'],
                        horizon=params['env_horizon'],
                        evaluation=True)
                returns = np.mean(
                    np.array([sum(path["rewards"]) for path in rl_paths]))
                logger.info(
                    "TRPO policy initialized with BC average return: {}".
                    format(returns))

            if params['pretrain_value']:
                logger.info("Fitting value function.")
                behavior_policy_train_collection.set_batch_size(
                    params['max_path_length'])
                for obses, _, _, rewards in behavior_policy_train_collection:
                    algo.pre_train_baseline(obses, rewards,
                                            params['trpo']['gamma'],
                                            params['trpo']['gae'])
                logger.info("Done fitting value function.")

            save_cur_iter_offline_data(
                params,
                train_collection,
                val_collection,
                behavior_policy_train_collection,
                itr,
            )

        logger.info('itr #%d | ' % itr)

        # Update randomness
        logger.info("Updating randomness.")
        dyn_model.update_randomness()
        logger.info("Done updating randomness.")

        # Policy training
        logger.info("Training policy using TRPO.")
        train_policy_trpo(params, algo, dyn_model,
                          params['trpo']['iterations'], sess)
        logger.info("Done training policy.")

        # Generate on-policy rollouts.
        # only for evaluation, not for updating data
        logger.info("Generating on-policy rollouts.")
        if params['eval_model']:
            rl_paths, rollouts, residuals = rollout_sampler.sample(
                num_paths=params['num_path_onpol'],
                horizon=params['env_horizon'],
                evaluation=True,
                eval_model=params['eval_model'])
        else:
            rl_paths = rollout_sampler.sample(
                num_paths=params['num_path_onpol'],
                horizon=params['env_horizon'],
                evaluation=True)
        logger.info("Done generating on-policy rollouts.")
        returns = np.array([sum(path["rewards"]) for path in rl_paths])
        log_tabular_results(returns, itr, train_collection)
        if params['eval_model']:
            n_transitions = sum([len(path["rewards"]) for path in rl_paths])
            # step_wise_analysis
            step_wise_mse = np.mean(
                [sum(np.array(path["observations"])**2) for path in residuals])
            step_wise_mse /= n_transitions
            logger.record_tabular('step_wise_mse', step_wise_mse)
            step_wise_episode_mean = np.mean(
                [sum(path["rewards"]) for path in residuals])
            logger.record_tabular('step_wise_episode_mean',
                                  step_wise_episode_mean)
            # trajectory_wise_analysis
            min_path = min([len(path["observations"]) for path in rl_paths])
            min_rollout = min(
                [len(rollout["observations"]) for rollout in rollouts])
            traj_len = min(min_path, min_rollout)
            traj_wise_mse = np.mean([
                sum((np.array(path["observations"])[:traj_len] -
                     np.array(rollout["observations"])[:traj_len])**2)
                for (path, rollout) in zip(rl_paths, rollouts)
            ])
            traj_wise_mse /= traj_len * params['num_path_onpol']
            logger.record_tabular('traj_wise_mse', traj_wise_mse)
            traj_wise_episode_mean = np.mean(
                [sum(path["rewards"][:traj_len]) for path in rollouts])
            logger.record_tabular('traj_wise_episode_mean',
                                  traj_wise_episode_mean)
            # Energy distance between \tau_{sim} and \tau_{real}
            combination_sim_real = list(itertools.product(rl_paths, rollouts))
            A = np.mean([
                sum(
                    np.sqrt((np.array(v[0]["observations"][:traj_len]) -
                             np.array(v[1]["observations"][:traj_len]))**2))
                for v in combination_sim_real
            ])
            combination_sim = list(itertools.product(rollouts, rollouts))
            B = np.mean([
                sum(
                    np.sqrt((np.array(v[0]["observations"][:traj_len]) -
                             np.array(v[1]["observations"][:traj_len]))**2))
                for v in combination_sim
            ])
            combination_real = list(itertools.product(rl_paths, rl_paths))
            C = np.mean([
                sum(
                    np.sqrt((np.array(v[0]["observations"][:traj_len]) -
                             np.array(v[1]["observations"][:traj_len]))**2))
                for v in combination_real
            ])
            energy_dist = np.sqrt(2 * A - B - C)
            logger.record_tabular('energy_distance', energy_dist)
            logger.dump_tabular()
        if itr % interval == 0 or itr == end_itr - 1:
            save_cur_iter_policy(params, policy_saver, sess, itr)
            if params['save_variables']:
                algo.baseline.save_value_function(params['exp_dir'], itr)
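
The energy-distance block near the end of train() is easier to read as a standalone helper. The sketch below reproduces the same computation as the inline code above (same truncation to traj_len, same per-coordinate distance), assuming each path is a dict with an "observations" array; it is a restatement for readability, not part of the project.

import itertools
import numpy as np

def energy_distance(real_paths, sim_paths, traj_len):
    # Mirrors the inline A/B/C computation in train() above.
    def mean_pairwise(xs, ys):
        return np.mean([
            sum(np.sqrt((np.array(a["observations"][:traj_len]) -
                         np.array(b["observations"][:traj_len])) ** 2))
            for a, b in itertools.product(xs, ys)
        ])
    A = mean_pairwise(real_paths, sim_paths)   # real vs. simulated
    B = mean_pairwise(sim_paths, sim_paths)    # simulated vs. simulated
    C = mean_pairwise(real_paths, real_paths)  # real vs. real
    return np.sqrt(2 * A - B - C)
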
Code Example #6
File: main.py  Project: clvoloshin/mbbl-metrpo
def train(params):

    sess = get_session(interactive=True)
    env = get_env(params['env_name'], params.get('video_dir'))
    # TODO(GD): change to replay_buffer
    inner_env = get_inner_env(env)

    train_collection, val_collection, normalization, path_collection, rollout_sampler = \
        get_data_from_random_rollouts(params, env)

    # ############################################################
    # ############### create computational graph #################
    # ############################################################
    policy = create_policy_from_params(params, env, sess)

    controller, reset_opt = create_controller_from_policy(policy)
    dyn_model = create_dynamics_model(params, env, normalization, sess)

    rollout_sampler.update_controller(controller)
    if params['algo'] not in ('trpo', 'vime'):
        raise NotImplementedError

    algo = create_trpo_algo(params, env, inner_env, policy, dyn_model, sess)

    # ############################################################
    # ######################### learning #########################
    # ############################################################

    # init global variables
    all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope=None)
    policy_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="policy")
    all_var_except_policy = [
        v for v in all_variables if v not in policy_variables
    ]

    train_dyn_with_intrinsic_reward_only = params["dynamics"].get(
        "intrinsic_reward_only", False)
    logger.log("Train dynamics model with intrinsic reward only? {}".format(
        train_dyn_with_intrinsic_reward_only))
    if train_dyn_with_intrinsic_reward_only:
        external_evaluation_interval = params["dynamics"][
            "external_reward_evaluation_interval"]
        policy_ext = create_policy_from_params(params,
                                               env,
                                               sess,
                                               scope='policy_ext_reward')
        controller_ext, reset_opt_ext = create_controller_from_policy(
            policy_ext)
        algo_ext = create_trpo_algo(params,
                                    env,
                                    inner_env,
                                    policy_ext,
                                    dyn_model,
                                    sess,
                                    scope="trpo_ext_reward")
        rollout_sampler_ext = RolloutSampler(env, controller=controller_ext)
    else:
        external_evaluation_interval = None
        policy_ext = None
        algo_ext = None
        rollout_sampler_ext = None

    saver = tf.train.Saver(var_list=all_var_except_policy)
    tf.global_variables_initializer().run()

    start_itr = params.get("start_onpol_iter", 0)
    end_itr = params['onpol_iters']

    # Pre-training
    pretrain_mode = params["dynamics"].get("pre_training", {}).get("mode")
    pretrain_itr = params["dynamics"].get("pre_training", {}).get("itr", 0)

    if pretrain_mode == "intrinsic_reward":
        pre_train_dynamics(params, dyn_model, policy, algo, reset_opt, sess,
                           path_collection, train_collection, val_collection,
                           normalization, rollout_sampler)

    elif pretrain_mode == "random":
        logger.log(
            "Baseline without pre-training. Generating random rollouts to match pre-train samples."
        )
        rl_paths = rollout_sampler.generate_random_rollouts(
            num_paths=pretrain_itr * params['num_path_onpol'],
            horizon=params['env_horizon'])

        normalization = add_path_data_to_collection_and_update_normalization(
            rl_paths, path_collection, train_collection, val_collection,
            normalization)

    elif pretrain_mode == "metrpo":
        # simply start a few iterations early
        start_itr -= pretrain_itr

    if train_dyn_with_intrinsic_reward_only:
        dyn_model.use_intrinsic_rewards_only()
    else:
        dyn_model.use_external_rewards_only()

    # Main training loop
    for itr in range(start_itr, end_itr):
        logger.info('itr #%d | ' % itr)

        if confirm_restoring_dynamics_model(params):
            restore_model(params, saver, sess, itr)
        else:
            # Fit the dynamics.
            logger.info("Fitting dynamics.")

            dyn_model.fit(train_collection, val_collection)

            logger.info("Done fitting dynamics.")

            rollout_sampler.update_dynamics(dyn_model)

        # Update randomness
        logger.info("Updating randomness.")
        dyn_model.update_randomness()
        logger.info("Done updating randomness.")

        # Policy training
        logger.info("Training policy using TRPO.")

        logger.info("Re-initialize init_std.")
        sess.run(reset_opt)

        if params['policy'].get('reinitialize_every_itr', False):
            logger.info("Re-initialize policy variables.")
            policy.initialize_variables()

        train_policy_trpo(params, algo, dyn_model,
                          params['trpo']['iterations'])

        logger.info("Done training policy.")

        # Generate on-policy rollouts.
        logger.info("Generating on-policy rollouts.")
        rl_paths = rollout_sampler.sample(num_paths=params['num_path_onpol'],
                                          horizon=params['env_horizon'])
        logger.info("Done generating on-policy rollouts.")

        # Update data.
        normalization = add_path_data_to_collection_and_update_normalization(
            rl_paths, path_collection, train_collection, val_collection,
            normalization)
        if train_dyn_with_intrinsic_reward_only:
            # Evaluate with external reward once in a while
            if (itr + 1) % external_evaluation_interval == 0:
                dyn_model.use_external_rewards_only()
                logger.info(
                    "Training policy with external reward to evaluate the dynamics model."
                )
                policy_ext.initialize_variables()
                train_policy_trpo(params, algo_ext, dyn_model,
                                  params['trpo_ext_reward']['iterations'])
                logger.info("Done training policy with external reward.")
                logger.info(
                    "Generating on-policy rollouts with external reward.")
                rl_paths_ext = rollout_sampler_ext.sample(
                    num_paths=params['num_path_onpol'],
                    horizon=params['env_horizon'])
                logger.info(
                    "Done generating on-policy rollouts with external reward.")
                # Compute metrics and log results
                returns = np.array(
                    [sum(path["rewards"]) for path in rl_paths_ext])
                log_tabular_results(returns, itr, train_collection)
                dyn_model.use_intrinsic_rewards_only()
        else:
            # Compute metrics and log results
            returns = np.array([sum(path["rewards"]) for path in rl_paths])
            log_tabular_results(returns, itr, train_collection)

        # save dynamics model if applicable
        save_cur_iter_dynamics_model(params, saver, sess, itr)
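
For orientation, these are the configuration keys this train() actually reads (including those consumed by the helpers it calls), collected from the body above into a single illustrative fragment; the values are placeholders, and the nested pre_training and external-reward keys matter only for the corresponding modes.

# Keys read by train() in Example #6; values are placeholders only.
params_sketch = {
    'env_name': 'HalfCheetah-v2',
    'video_dir': None,
    'algo': 'trpo',                      # must be 'trpo' or 'vime'
    'onpol_iters': 100,
    'start_onpol_iter': 0,
    'num_path_random': 10,
    'num_path_onpol': 10,
    'env_horizon': 1000,
    'max_train_data': 100000,
    'max_val_data': 20000,
    'dynamics': {
        'batch_size': 128,
        'intrinsic_reward_only': False,
        'pre_training': {'mode': 'metrpo', 'itr': 5, 'policy_itr': 20},
        'external_reward_evaluation_interval': 5,   # used only with intrinsic_reward_only
    },
    'policy': {'reinitialize_every_itr': False},
    'trpo': {'iterations': 20},
    'trpo_ext_reward': {'iterations': 20},          # used only with intrinsic_reward_only
}
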