Ejemplo n.º 1
0
def vis_data(off_data, off_label, def_data, def_label, outdir, start_idx=0):
    """Replay recorded transitions in the pretrain env and record them.

    Starting from ``start_idx``, the recorded offense and defense labels are
    packed into actions and stepped through the environment in alternation
    (offense first, then defense) until the environment reports ``done``.
    The ``gym.wrappers.Monitor`` wrapper writes its recordings to ``outdir``.

    Args
    ----
    off_data : array-like indexed as ``[idx, 0, frame, entity, :]``. The last
        frame provides the initial positions (entities ``0``, ``1:6`` and
        ``6:11``; presumably ball, offense and defense -- TODO confirm).
    off_label : array-like indexed as ``[idx, 0]``; entries ``[:3]`` and
        ``[3:]`` are packed together as the offense action.
    def_data : array-like indexed the same way as ``off_data``.
    def_label : array-like indexed as ``[idx, 0]``; packed as the defense
        action.
    outdir : directory handed to ``gym.wrappers.Monitor``.
    start_idx : int, index of the first transition to replay.
    """

    idx_ = start_idx

    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; ``np.float64`` yields the same dtype.
    init_pos = [
        np.array(off_data[idx_, 0, -1, 0, :]),
        np.array(off_data[idx_, 0, -1, 1:6, :], dtype=np.float64),
        np.array(off_data[idx_, 0, -1, 6:11, :], dtype=np.float64)
    ]
    env = gym.make('bball-pretrain-v0')
    env = BBallWrapper(env,
                       if_clip=False,
                       if_norm_obs=False,
                       if_norm_act=False,
                       init_mode=2,
                       if_vis_visual_aid=True,
                       if_vis_trajectory=False,
                       init_positions=init_pos)
    env = gym.wrappers.Monitor(env,
                               outdir,
                               lambda unused_episode_number: True,
                               force=False,
                               resume=True)
    # Close the env (and flush the monitor) even if a step raises.
    try:
        env.reset()

        while True:
            # Copy the labels so the caller's data is never modified.
            temp_off_label = np.array(off_label[idx_, 0])
            temp_def_label = np.array(def_label[idx_, 0])
            if idx_ == start_idx:
                # The env's velocity is zero right after reset, so the last
                # recorded velocity is added to the very first action.
                last_vel = off_data[idx_, 0, -1, 1:6, :] - \
                    off_data[idx_, 0, -2, 1:6, :]
                temp_off_label[5:] += last_vel.reshape([10])
                last_vel = def_data[idx_, 0, -1, 6:11, :] - \
                    def_data[idx_, 0, -2, 6:11, :]
                temp_def_label += last_vel.reshape([10])
            # offense turn
            action = pack_action([temp_off_label[:3], temp_off_label[3:]],
                                 team='offense')
            _, _, done, _ = env.step(action)
            if done:
                break
            # defense turn
            action = pack_action(temp_def_label, team='defense')
            _, _, done, _ = env.step(action)
            if done:
                break
            idx_ += 1
    finally:
        env.close()
Ejemplo n.º 2
0
def collect_results(config, steps, ppo_policy, D, denormalize_observ, generated_amount=100):
    """ test policy
    - draw episode into mpeg video
    - collect the denormalized observations of every episode into .npy files
      (for our customized player)

    Args
    -----
    config : object, providing configurations via attributes
        (``env``, ``FPS`` and ``logdir`` are read here).
    steps : int, number of Discriminator iterations (not used in this function).
    ppo_policy : object, policy to generate actions
    D : object, discriminator (not used in this function).
    denormalize_observ : function, denorm the returned observation
    generated_amount : int, number of episodes to generate.
    """
    timer = time.time()
    # read condition length
    data_len = np.load('bball_strategies/data/FixedFPS5Length.npy')
    # data_len = np.load('bball_strategies/data/WGAN/all_model_results/length.npy')
    # env to testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, data=h5py.File(
        'bball_strategies/data/OrderedGAILTransitionData_Testing.hdf5', 'r'), init_mode=1, fps=config.FPS, time_limit=np.max(data_len)-2)
    vanilla_env = MonitorWrapper(vanilla_env, directory=os.path.join(config.logdir, 'collect_result/video/'), video_callable=lambda _: True,
                                 # init from dataset
                                 init_mode=1)
    total_output = []
    index_list = []
    # Close the env (and finish the video) even if an episode fails.
    try:
        for i in range(generated_amount):
            print('generating # {} episode'.format(i))
            episode_obs = []
            vanilla_obs = vanilla_env.reset()
            for _ in range(vanilla_env.time_limit):
                vanilla_act = ppo_policy.act(
                    np.array(vanilla_obs)[None, None], stochastic=False)
                vanilla_trans_act = [
                    # Discrete(3) must be int
                    int(0),
                    # Box(2,)
                    np.array([0.0, 0.0], dtype=np.float32),
                    # Box(5, 2)
                    np.zeros(shape=[5, 2], dtype=np.float32),
                    # Box(5, 2)
                    np.reshape(vanilla_act, [5, 2])
                ]
                vanilla_obs, _, _, info = vanilla_env.step(
                    vanilla_trans_act)
                episode_obs.append(vanilla_obs)
            # ``info`` of the last step tells which dataset entry was used.
            index_list.append(info['data_idx'])
            total_output.append(denormalize_observ(np.array(episode_obs)))
        total_output = np.array(total_output)
        # save numpy
        np.save(os.path.join(config.logdir,
                             'collect_result/total_output.npy'), total_output)
        np.save(os.path.join(config.logdir,
                             'collect_result/total_output_length.npy'), data_len[index_list]-2)

        # max(..., 1) avoids a ZeroDivisionError when no episode was requested.
        print('collect_results time cost: {} per episode'.format(
            (time.time() - timer)/max(generated_amount, 1)))
    finally:
        vanilla_env.close()
Ejemplo n.º 3
0
def vis_result(sess, model, off_data, off_label, def_data, def_label, outdir,
               num_video):
    """Visualize the pretrained model by letting it interact with the env.

    For each of ``num_video`` episodes the env is initialized from the
    recorded data. The team selected by ``FLAGS.config`` is driven by
    ``model.perform`` while the other team replays its recorded labels.
    Each episode runs until the env reports ``done``; recordings are
    written to ``outdir`` by ``gym.wrappers.Monitor``.

    Args
    ----
    sess : session object forwarded to ``model.perform``.
    model : object exposing ``perform(sess, obs)``. For ``'offense'`` it must
        return ``(logits, actions)``; for ``'defense'`` its return value is
        packed directly.
    off_data : array-like indexed as ``[idx, 0, frame, entity, :]``.
    off_label : array-like indexed as ``[idx, 0]``.
    def_data : array-like indexed the same way as ``off_data``.
    def_label : array-like indexed as ``[idx, 0]``.
    outdir : directory handed to ``gym.wrappers.Monitor``.
    num_video : int, number of episodes to record.

    Raises
    ------
    ValueError : if ``FLAGS.config`` is neither ``'offense'`` nor ``'defense'``.
    """

    episode_lengths = np.load('bball_strategies/data/FixedFPS5Length.npy')
    # Inclusive running sum of the episode lengths.
    # NOTE(review): entry i is the index right after episode i, so video i
    # starts at episode i + 1 and episode 0 is never shown -- confirm intended.
    start_indices = np.cumsum(episode_lengths)
    for video_idx in range(num_video):
        start_idx = start_indices[video_idx]
        idx_ = start_idx
        # ``np.float`` was only an alias of the builtin ``float`` and has
        # been removed from NumPy; ``np.float64`` yields the same dtype.
        init_pos = [
            np.array(off_data[idx_, 0, -1, 0, :]),
            np.array(off_data[idx_, 0, -1, 1:6, :], dtype=np.float64),
            np.array(off_data[idx_, 0, -1, 6:11, :], dtype=np.float64)
        ]
        env = gym.make('bball-pretrain-v0')
        env = BBallWrapper(env,
                           if_clip=False,
                           if_norm_obs=False,
                           if_norm_act=False,
                           init_mode=2,
                           if_vis_visual_aid=True,
                           if_vis_trajectory=False,
                           init_positions=init_pos)
        env = gym.wrappers.Monitor(env,
                                   outdir,
                                   lambda unused_episode_number: True,
                                   force=False,
                                   resume=True)
        # Close the env (and flush the monitor) even if a step raises.
        try:
            obs = env.reset()

            while True:
                # Copy the labels so the caller's data is never modified.
                temp_off_label = np.array(off_label[idx_, 0])
                temp_def_label = np.array(def_label[idx_, 0])
                if idx_ == start_idx:
                    # The env's velocity is zero right after reset, so the
                    # last recorded velocity is added to the first action.
                    last_vel = off_data[idx_, 0, -1, 1:6, :] - \
                        off_data[idx_, 0, -2, 1:6, :]
                    temp_off_label[5:] += last_vel.reshape([10])
                    last_vel = def_data[idx_, 0, -1, 6:11, :] - \
                        def_data[idx_, 0, -2, 6:11, :]
                    temp_def_label += last_vel.reshape([10])
                if FLAGS.config == 'offense':
                    # offense turn: driven by the model
                    obs = norm_obs(env, obs)
                    logits, actions = model.perform(sess, obs[None, None])
                    actions = pack_action([logits[0, 0], actions[0, 0]],
                                          FLAGS.config)
                    obs, _, done, _ = env.step(actions)
                    if done:
                        break
                    # defense turn: replayed from the recorded labels
                    actions = pack_action(temp_def_label, team='defense')
                    obs, _, done, _ = env.step(actions)
                    if done:
                        break
                elif FLAGS.config == 'defense':
                    # offense turn: replayed from the recorded labels
                    actions = pack_action(
                        [temp_off_label[:3], temp_off_label[3:]],
                        team='offense')
                    obs, _, done, _ = env.step(actions)
                    if done:
                        break
                    # defense turn: driven by the model
                    obs = norm_obs(env, obs)
                    actions = model.perform(sess, obs[None, None])
                    actions = pack_action(actions, FLAGS.config)
                    obs, _, done, _ = env.step(actions)
                    if done:
                        break
                else:
                    # Without this the loop would never step the env and
                    # would only stop once idx_ ran past the data.
                    raise ValueError(
                        "FLAGS.config must be 'offense' or 'defense', "
                        "got {!r}".format(FLAGS.config))
                idx_ += 1
        finally:
            env.close()
Ejemplo n.º 4
0
def train(config, env_processes, outdir):
    """ Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. The Discriminator is built under ``/gpu:0`` and the PPO
    simulation graph under ``/cpu:0``. The loop then alternates forever
    between training/validating the Discriminator and training PPO.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering result while traning.

    Yields
    ------
    score : Evaluation scores, as produced by ``loop.run``.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        """Linearly map observations from the env's range to [-1, 1]."""
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        """Linearly map actions of action_space[3] to [-1, 1]."""
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        """Inverse of ``normalize_observ``."""
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env to testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, init_mode=1, fps=config.FPS, if_back_real=False,
                               time_limit=50)
    vanilla_env = MonitorWrapper(vanilla_env, directory=os.path.join(config.logdir, 'gail_testing_{}/'.format(config.train_len)), if_back_real=False, video_callable=lambda _: True,
                                 # init from dataset
                                 init_mode=1)
    # if not os.path.exists(os.path.join(config.logdir, 'gail_testing')):
    #     os.makedirs(os.path.join(config.logdir, 'gail_testing'))
    vanilla_env.data = np.load('bball_strategies/data/GAILEnvData_51.npy')
    # env to generate fake state
    env = gym.make(config.env)
    env = BBallWrapper(env, init_mode=3, fps=config.FPS, if_back_real=config.if_back_real,
                       time_limit=config.max_length)
    env = MonitorWrapper(env, directory=os.path.join(config.logdir, 'gail_training/'), if_back_real=config.if_back_real,
                         # init from dataset in order
                         init_mode=3)
    # Discriminator graph
    with tf.device('/gpu:0'):
        D = Discriminator(config, dummy_env)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir, is_gail=config.is_gail)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # The training loop below never terminates normally, so the environments
    # are released in ``finally``: it runs on errors, on ``exit()`` and when
    # the generator is closed by its consumer.
    try:
        # Agent to generate actions
        ppo_policy = PPOPolicy(config, env)
        # Data: each dataset is read into memory once and the file is closed.
        # ``dataset[()]`` replaces ``Dataset.value``, which h5py 3 removed.
        with h5py.File(
                'bball_strategies/data/GAILTransitionData_{}.hdf5'.format(config.train_len), 'r') as all_data:
            all_obs = all_data['OBS'][()]
            all_act = all_data['DEF_ACT'][()]
        # first 90% for training, remaining 10% for validation
        expert_data, valid_expert_data = np.split(
            all_obs, [all_obs.shape[0]*9//10])
        expert_action, valid_expert_action = np.split(
            all_act, [all_act.shape[0]*9//10])
        print('expert_data', expert_data.shape)
        print('valid_expert_data', valid_expert_data.shape)
        print('expert_action', expert_action.shape)
        print('valid_expert_action', valid_expert_action.shape)

        # TF Session
        # TODO _num_finished_episodes => Variable:0
        saver = utility.define_saver(
            exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0', r'.*Adam.*', r'.*beta.*'))
        sess_config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=config.log_device_placement)
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            utility.initialize_variables(
                sess, saver, config.logdir, resume=FLAGS.resume)
            # NOTE reset variables in optimizer
            D.reset_optimizer(sess)
            # reset PPO optimizer
            opt_reset = tf.group(
                [v.initializer for v in graph.algo._optimizer.variables()])
            sess.run(opt_reset)
            # visualization stuff
            if FLAGS.tally_only:
                tally_reward_line_chart(config, sess.run(
                    D._global_steps), ppo_policy, D, denormalize_observ, normalize_observ, normalize_action)
                exit()
            # GAIL
            cumulate_steps = sess.run(graph.step)
            episode_idx = 0
            valid_episode_idx = 0
            while True:
                # Reshuffle the training set at the start and whenever the
                # remaining episodes cannot fill one Discriminator round.
                if episode_idx > (expert_data.shape[0]-config.episodes_per_batch*config.train_d_per_ppo) or episode_idx == 0:
                    episode_idx = 0
                    perm_idx = np.random.permutation(expert_data.shape[0])
                    expert_data = expert_data[perm_idx]
                    expert_action = expert_action[perm_idx]
                # Same for the validation set.
                if valid_episode_idx > (valid_expert_data.shape[0]-config.episodes_per_batch) or valid_episode_idx == 0:
                    valid_episode_idx = 0
                    valid_perm_idx = np.random.permutation(
                        valid_expert_data.shape[0])
                    valid_expert_data = valid_expert_data[valid_perm_idx]
                    valid_expert_action = valid_expert_action[valid_perm_idx]
                # testing
                if valid_episode_idx % (100 * config.episodes_per_batch) == 0:
                    test_policy(config, vanilla_env, sess.run(D._global_steps), ppo_policy,
                                D, denormalize_observ)
                if valid_episode_idx % (1000 * config.episodes_per_batch) == 0:
                    tally_reward_line_chart(config, sess.run(
                        D._global_steps), ppo_policy, D, denormalize_observ, normalize_observ, normalize_action)
                # train Discriminator
                train_Discriminator(
                    episode_idx, config, expert_data, expert_action, env, ppo_policy, D, normalize_observ, normalize_action)
                # The chart is drawn again after the Discriminator update.
                if valid_episode_idx % (1000 * config.episodes_per_batch) == 0:
                    tally_reward_line_chart(config, sess.run(
                        D._global_steps), ppo_policy, D, denormalize_observ, normalize_observ, normalize_action)
                # valid Discriminator
                valid_Discriminator(
                    valid_episode_idx, config, valid_expert_data, valid_expert_action, env, ppo_policy, D, normalize_observ, normalize_action)
                episode_idx += config.episodes_per_batch*config.train_d_per_ppo
                valid_episode_idx += config.episodes_per_batch
                # train PPO
                print('train PPO')
                cumulate_steps += total_steps
                for score in loop.run(sess, saver, cumulate_steps):
                    yield score
    finally:
        batch_env.close()
        vanilla_env.close()
        env.close()
Ejemplo n.º 5
0
def train(config, env_processes, outdir):
    """ Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. The simulation graph is built under ``/cpu:0``. The loop
    then alternates forever between running the Discriminator update
    (``graph.gail_summary``) and training PPO.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering result while traning.

    Yields
    ------
    score : Evaluation scores, as produced by ``loop.run``.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        """Linearly map observations from the env's range to [-1, 1]."""
        # NOTE(review): this indexes the bounds with [0, 0] while
        # denormalize_observ below uses [0] -- confirm this is intended.
        min_ = dummy_env.observation_space.low[0, 0]
        max_ = dummy_env.observation_space.high[0, 0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        """Linearly map actions of action_space[3] to [-1, 1]."""
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        """Map observations from [-1, 1] back to the env's range."""
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env to testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, data=h5py.File('bball_strategies/data/OrderedGAILTransitionData_522.hdf5', 'r'), init_mode=1, fps=config.FPS, time_limit=50)
    vanilla_env = MonitorWrapper(vanilla_env, directory=os.path.join(config.logdir, 'gail_testing_G{}_D{}/'.format(config.max_length, config.D_len)), video_callable=lambda _: True,
                                 # init from dataset
                                 init_mode=1)

    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # The training loop below never terminates normally, so the environments
    # are released in ``finally``: it runs on errors, on ``exit()`` and when
    # the generator is closed by its consumer.
    try:
        # Agent to generate actions
        ppo_policy = PPOPolicy(config, dummy_env)
        # The Discriminator lives inside the algorithm object of the graph.
        discriminator = graph.algo.D
        # summary writer of Discriminator
        summary_writer = tf.summary.FileWriter(config.logdir + '/Disciminator')
        # TF Session
        # NOTE: _num_finished_episodes => Variable:0
        saver = utility.define_saver(
            exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0'))
        sess_config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=config.log_device_placement)
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            utility.initialize_variables(
                sess, saver, config.logdir, checkpoint=FLAGS.checkpoint, resume=FLAGS.resume)
            # NOTE reset variables in optimizer for different stages of curriculum learning
            opt_reset_D = tf.group(
                [v.initializer for v in discriminator.optimizer.variables()])
            # reset PPO optimizer
            opt_reset = tf.group(
                [v.initializer for v in graph.algo._optimizer.variables()])
            sess.run([opt_reset, opt_reset_D])
            # visualization stuff
            if FLAGS.tally_only:
                # Use graph.algo.D: no bare ``D`` is defined in this function.
                tally_reward_line_chart(config, sess.run(
                    discriminator._steps), ppo_policy, discriminator, normalize_observ, normalize_action)
                tally_reward_line_chart(config, sess.run(
                    discriminator._steps), ppo_policy, discriminator, normalize_observ, normalize_action, stochastic=True)
                exit()
            # GAIL
            cumulate_steps = sess.run(graph.step)
            counter = 0
            # The feed is identical for every Discriminator update.
            feed_dict = {
                graph.is_training: True,
                graph.should_log: True,
                graph.do_report: True,
                graph.force_reset: False}
            while True:
                # train Discriminator
                gail_timer = time.time()
                # More than pretrain_d_times rounds done -> regular schedule.
                if counter > config.pretrain_d_times:
                    num_d_to_train = config.train_d_per_ppo
                else:
                    num_d_to_train = config.pretrain_d_per_ppo
                for _ in range(num_d_to_train):
                    # train D
                    gail_counter = 0
                    while gail_counter < config.gail_steps:
                        gail_summary = sess.run(
                            graph.gail_summary, feed_dict=feed_dict)
                        if gail_summary:
                            summary_writer.add_summary(
                                gail_summary, global_step=sess.run(discriminator._steps))
                        gail_counter += 1
                    # testing
                    if counter % (config.vis_testing_freq) == 0:
                        test_policy(config, vanilla_env, sess.run(discriminator._steps), ppo_policy,
                                    discriminator, denormalize_observ)
                    if counter % (config.tally_line_chart_freq) == 0:
                        tally_reward_line_chart(config, sess.run(
                            discriminator._steps), ppo_policy, discriminator, normalize_observ, normalize_action)
                        tally_reward_line_chart(config, sess.run(
                            discriminator._steps), ppo_policy, discriminator, normalize_observ, normalize_action, stochastic=True)
                    counter += 1
                # max(..., 1) avoids a ZeroDivisionError when no update ran.
                print('Time Cost of Discriminator per Update: {}'.format(
                    (time.time() - gail_timer) / max(num_d_to_train, 1)))
                # train ppo
                cumulate_steps += total_steps
                for score in loop.run(sess, saver, cumulate_steps):
                    yield score
    finally:
        batch_env.close()
        vanilla_env.close()
Ejemplo n.º 6
0
def train(config, env_processes, outdir):
    """ Training and evaluation entry point yielding scores.

    Resolves some configuration attributes, creates environments, graph, and
    training loop. The simulation graph is built under ``/cpu:0``. The loop
    then alternates forever between feeding normalized expert batches to the
    Discriminator update (``graph.gail_summary``) and training PPO.

    Args
    ----
    config : Object providing configurations via attributes.
    env_processes : Whether to step environment in external processes.
    outdir : Directory path to save rendering result while traning.

    Yields
    ------
    score : Evaluation scores, as produced by ``loop.run``.

    Raises
    ------
    NotImplementedError : if ``config.is_double_curiculum`` is set while
        ``config.use_padding`` is not; that path builds no training batch.
    """
    tf.reset_default_graph()
    # env to get config
    dummy_env = gym.make(config.env)

    def normalize_observ(observ):
        """Linearly map observations from the env's range to [-1, 1]."""
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = 2.0 * (observ - min_) / (max_ - min_) - 1.0
        return observ

    def normalize_action(act):
        """Linearly map actions of action_space[3] to [-1, 1]."""
        min_ = dummy_env.action_space[3].low
        max_ = dummy_env.action_space[3].high
        act = 2.0 * (act - min_) / (max_ - min_) - 1.0
        return act

    def denormalize_observ(observ):
        """Inverse of ``normalize_observ``."""
        min_ = dummy_env.observation_space.low[0]
        max_ = dummy_env.observation_space.high[0]
        observ = (observ + 1.0) * (max_ - min_) / 2.0 + min_
        return observ

    # env to testing
    vanilla_env = gym.make(config.env)
    vanilla_env = BBallWrapper(vanilla_env, init_mode=1, fps=config.FPS, if_back_real=False,
                               time_limit=50)
    vanilla_env = MonitorWrapper(vanilla_env, directory=os.path.join(config.logdir, 'gail_testing_G{}_D{}/'.format(config.train_len, config.D_len)), if_back_real=False, video_callable=lambda _: True,
                                 # init from dataset
                                 init_mode=1)
    vanilla_env.data = np.load('bball_strategies/data/GAILEnvData_51.npy')
    # env to generate fake state
    env = gym.make(config.env)
    env = BBallWrapper(env, init_mode=3, fps=config.FPS, if_back_real=config.if_back_real,
                       time_limit=config.max_length)
    env = MonitorWrapper(env, directory=os.path.join(config.logdir, 'gail_training/'), if_back_real=config.if_back_real,
                         # init from dataset in order
                         init_mode=3)
    # PPO graph
    if config.update_every % config.num_agents:
        tf.logging.warn('Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config),
            config.num_agents, env_processes, outdir=outdir, is_gail=config.is_gail)
        graph = utility.define_simulation_graph(
            batch_env, config.algorithm, config)
        loop = _define_loop(
            graph, config.logdir,
            config.update_every * config.max_length,
            config.eval_episodes * config.max_length)
        total_steps = int(
            config.steps / config.update_every *
            (config.update_every + config.eval_episodes))
    # The training loop below never terminates normally, so the environments
    # are released in ``finally``: it runs on errors, on ``exit()`` and when
    # the generator is closed by its consumer.
    try:
        # Agent to generate actions
        ppo_policy = PPOPolicy(config, env)
        # The Discriminator lives inside the algorithm object of the graph.
        discriminator = graph.algo.D
        # Data: each dataset is read into memory once and the file is closed.
        # ``dataset[()]`` replaces ``Dataset.value``, which h5py 3 removed.
        with h5py.File(
                'bball_strategies/data/GAILTransitionData_{}.hdf5'.format(config.train_len), 'r') as all_data:
            all_obs = all_data['OBS'][()]
            all_act = all_data['DEF_ACT'][()]
        # first 90% for training, remaining 10% for validation
        expert_data, valid_expert_data = np.split(
            all_obs, [all_obs.shape[0] * 9 // 10])
        expert_action, valid_expert_action = np.split(
            all_act, [all_act.shape[0] * 9 // 10])
        print('expert_data', expert_data.shape)
        print('valid_expert_data', valid_expert_data.shape)
        print('expert_action', expert_action.shape)
        print('valid_expert_action', valid_expert_action.shape)
        # Preprocessing/ Normalization
        expert_data = normalize_observ(expert_data)
        valid_expert_data = normalize_observ(valid_expert_data)
        expert_action = normalize_action(expert_action)
        valid_expert_action = normalize_action(valid_expert_action)
        # summary writer of Discriminator
        summary_writer = tf.summary.FileWriter(config.logdir + '/Disciminator')
        # TF Session
        # TODO _num_finished_episodes => Variable:0
        saver = utility.define_saver(
            exclude=(r'.*_temporary.*', r'.*memory.*', r'Variable:0', r'.*Adam.*', r'.*beta.*'))
        sess_config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=config.log_device_placement)
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            utility.initialize_variables(
                sess, saver, config.logdir, resume=FLAGS.resume)
            # NOTE reset variables in optimizer
            # opt_reset_D = tf.group(
            #     [v.initializer for v in graph.algo.D.optimizer.variables()])
            # # reset PPO optimizer
            # opt_reset = tf.group(
            #     [v.initializer for v in graph.algo._optimizer.variables()])
            # sess.run([opt_reset, opt_reset_D])
            # visualization stuff
            if FLAGS.tally_only:
                # Use graph.algo.D: no bare ``D`` is defined in this function.
                tally_reward_line_chart(config, sess.run(
                    discriminator._steps), ppo_policy, discriminator, denormalize_observ, normalize_observ, normalize_action)
                exit()

            # GAIL
            cumulate_steps = sess.run(graph.step)
            episode_idx = 0
            while True:
                # Reshuffle the training set at the start and whenever the
                # remaining episodes cannot fill one Discriminator round.
                if episode_idx > (expert_data.shape[0] - config.episodes_per_batch * config.train_d_per_ppo) or episode_idx == 0:
                    episode_idx = 0
                    perm_idx = np.random.permutation(expert_data.shape[0])
                    expert_data = expert_data[perm_idx]
                    expert_action = expert_action[perm_idx]
                # testing
                if episode_idx % (config.train_d_per_ppo * 100 * config.episodes_per_batch) == 0:
                    test_policy(config, vanilla_env, sess.run(discriminator._steps), ppo_policy,
                                discriminator, denormalize_observ)
                if episode_idx % (config.train_d_per_ppo * 1000 * config.episodes_per_batch) == 0:
                    tally_reward_line_chart(config, sess.run(
                        discriminator._steps), ppo_policy, discriminator, denormalize_observ, normalize_observ, normalize_action)

                # train Discriminator
                gail_timer = time.time()
                for _ in range(config.train_d_per_ppo):
                    batch = slice(episode_idx,
                                  episode_idx + config.episodes_per_batch)
                    if config.is_double_curiculum:
                        if not config.use_padding:
                            # This path used to fall through without building
                            # a batch and crashed later on an unbound name.
                            raise NotImplementedError(
                                'config.is_double_curiculum requires '
                                'config.use_padding; the unpadded path is '
                                'not implemented')
                        observ = expert_data[batch, 1:]
                        action = expert_action[batch, :-1]
                        # 1. pad the front with the history of the first
                        #    step; the actions get 9 zero entries.
                        # NOTE(review): the 9 here and the +10 below look
                        # tied to a history of 10 frames -- confirm.
                        history = observ[:, 0, :-1]
                        padded_observ = np.concatenate(
                            [history, observ[:, :, -1]], axis=1)
                        padded_act = np.concatenate(
                            [np.zeros(shape=[action.shape[0], 9, 5, 2]), action], axis=1)
                        # 2. split the whole episode into training data of
                        #    Discriminator with length=config.D_len
                        window_starts = range(
                            config.max_length - config.D_len + 10)
                        training_obs = np.concatenate(
                            [padded_observ[:, i:i + config.D_len]
                             for i in window_starts], axis=0)
                        training_act = np.concatenate(
                            [padded_act[:, i:i + config.D_len]
                             for i in window_starts], axis=0)
                    else:
                        training_obs = expert_data[batch, 1:, -1]
                        training_act = expert_action[batch, :-1]
                    feed_dict = {
                        graph.is_training: True,
                        graph.should_log: True,
                        graph.do_report: True,
                        graph.force_reset: False,
                        discriminator._expert_s: training_obs,
                        discriminator._expert_a: training_act}
                    gail_counter = 0
                    while gail_counter < config.gail_steps:
                        gail_summary = sess.run(
                            graph.gail_summary, feed_dict=feed_dict)
                        if gail_summary:
                            summary_writer.add_summary(
                                gail_summary, global_step=sess.run(discriminator._steps))
                        gail_counter += 1
                    episode_idx += config.episodes_per_batch
                print('Time Cost of Discriminator per Update: {}'.format(
                    (time.time() - gail_timer) / config.train_d_per_ppo))
                # train ppo
                cumulate_steps += total_steps
                for score in loop.run(sess, saver, cumulate_steps):
                    yield score
    finally:
        batch_env.close()
        vanilla_env.close()
        env.close()