Example 1
    def gym_train(self):
        """
        Inputs:
            env:                gym environment
            gym_model:          algorithm model
            begin_episode:      initial episode
            save_frequency:     how often to save checkpoints
            max_step:           maximum number of steps in an episode
            max_episode:        maximum number of episodes in this training task
            render:             specify whether to render the env or not
            render_episode:     if 'render' is false, specify from which episode to render the env
            policy_mode:        'on-policy' or 'off-policy'
        """
        begin_episode = int(self.train_args['begin_episode'])
        render = bool(self.train_args['render'])
        render_episode = int(self.train_args.get('render_episode', 50000))
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        eval_while_train = bool(self.train_args['eval_while_train'])
        max_eval_episode = int(self.train_args.get('max_eval_episode'))
        off_policy_step_eval = bool(self.train_args['off_policy_step_eval'])
        off_policy_step_eval_num = int(
            self.train_args.get('off_policy_step_eval_num'))
        policy_mode = str(self.model_args['policy_mode'])
        moving_average_episode = int(self.train_args['moving_average_episode'])
        add_noise2buffer = bool(self.train_args['add_noise2buffer'])
        add_noise2buffer_episode_interval = int(
            self.train_args['add_noise2buffer_episode_interval'])
        add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])

        total_step_control = bool(self.train_args['total_step_control'])
        max_total_step = int(self.train_args['max_total_step'])
        if total_step_control:
            max_episode = max_total_step

        i, state, new_state = self.init_variables()
        sma = SMA(moving_average_episode)
        total_step = 0
        for episode in range(begin_episode, max_episode):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            step = 0
            r = np.zeros(self.env.n)
            last_done_step = -1
            while True:
                step += 1
                r_tem = np.zeros(self.env.n)
                if render or episode > render_episode:
                    self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
                new_state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)[0]
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                r += r_tem
                self.model.store_data(s=state[0],
                                      visual_s=state[1],
                                      a=action,
                                      r=reward,
                                      s_=new_state[0],
                                      visual_s_=new_state[1],
                                      done=done)

                if policy_mode == 'off-policy':
                    self.model.learn(episode=episode, step=1)
                    if off_policy_step_eval:
                        self.gym_step_eval(total_step, self.model,
                                           off_policy_step_eval_num, max_step)
                total_step += 1
                if total_step_control and total_step > max_total_step:
                    return

                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

                if len(self.env.dones_index):  # check whether any sub-environment needs a partial reset
                    new_state[i][
                        self.env.dones_index] = self.env.partial_reset()
                state[i] = new_state[i]

            sma.update(r)
            if policy_mode == 'on-policy':
                self.model.learn(episode=episode, step=step)
            self.model.writer_summary(episode,
                                      reward_mean=r.mean(),
                                      reward_min=r.min(),
                                      reward_max=r.max(),
                                      step=last_done_step,
                                      **sma.rs)
            self.pwi('-' * 40)
            self.pwi(
                f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}'
            )
            if episode % save_frequency == 0:
                self.model.save_checkpoint(episode)

            if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                self.gym_random_sample(steps=add_noise2buffer_steps)

            if eval_while_train and self.env.reward_threshold is not None:
                if r.max() >= self.env.reward_threshold:
                    self.pwi(
                        f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                    )
                    self.gym_evaluate()
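
Every variant in this listing feeds episode rewards into an SMA (simple moving average) tracker and later unpacks `**sma.rs` into the summary writer. The repository's implementation is not shown here; the following is a minimal sketch under the assumption that SMA keeps a fixed-length window of per-environment reward vectors and exposes aggregate statistics through an `rs` dict (the key names below are assumptions).

import numpy as np
from collections import deque

class SMA:
    """Hypothetical sketch of the moving-average helper used as SMA(moving_average_episode)."""
    def __init__(self, n):
        self.buffer = deque(maxlen=n)   # keep at most the last n episode-reward vectors
        self.rs = {}                    # aggregate statistics, consumed via **sma.rs

    def update(self, r):
        self.buffer.append(np.asarray(r, dtype=np.float64))
        rewards = np.stack(self.buffer)               # shape: (episodes_kept, n_envs)
        self.rs = {                                   # key names are assumptions
            'sma_reward_mean': rewards.mean(),
            'sma_reward_min': rewards.min(),
            'sma_reward_max': rewards.max(),
        }
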
Example 2
File: gym.py  Project: HackyLee/RLs
def gym_train(env, model, print_func, begin_train_step, begin_frame_step,
              begin_episode, render, render_episode, save_frequency,
              max_step_per_episode, max_train_episode, eval_while_train,
              max_eval_episode, off_policy_step_eval_episodes,
              off_policy_train_interval, policy_mode, moving_average_episode,
              add_noise2buffer, add_noise2buffer_episode_interval,
              add_noise2buffer_steps, off_policy_eval_interval, max_train_step,
              max_frame_step):
    """
    TODO: Annotation
    """

    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0

    for episode in range(begin_episode, max_train_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        step = 0
        r = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(
                action)
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r[unfinished_index] += reward[unfinished_index]
            model.store_data(s=state[0],
                             visual_s=state[1],
                             a=action,
                             r=reward,
                             s_=new_state[0],
                             visual_s_=new_state[1],
                             done=done)
            model.partial_reset(done)
            state[i] = correct_new_state

            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step,
                                          episode=episode,
                                          frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), train_step, model,
                                  off_policy_step_eval_episodes,
                                  max_step_per_episode)

            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
                logger.info(
                    f'End Training, learn step: {train_step}, frame_step: {frame_step}'
                )
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        sma.update(r)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
        model.writer_summary(episode,
                             reward_mean=r.mean(),
                             reward_min=r.min(),
                             reward_max=r.max(),
                             step=last_done_step,
                             **sma.rs)
        print_func('-' * 40, out_time=True)
        print_func(
            f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 2)}'
        )

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env,
                      model,
                      pre_fill_steps=add_noise2buffer_steps,
                      print_func=print_func,
                      prefill_choose=False,
                      desc='adding noise')

        if eval_while_train and env.reward_threshold is not None:
            if r.max() >= env.reward_threshold:
                print_func(
                    f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                )
                gym_evaluate(env, model, max_step_per_episode,
                             max_eval_episode, print_func)
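
Examples 2 and 3 call `init_variables(env)` and unpack `(i, state, new_state)`, then always pass `state[0]` as the vector observation and `state[1]` as the visual observation while writing env outputs into `state[i]`. A plausible reading is that `i` selects whichever slot the environment actually produces; the sketch below illustrates that convention. The `obs_type` attribute and the empty-array shapes are assumptions for illustration, not the repository's actual helper.

import numpy as np

def init_variables(env):
    """Hypothetical sketch: choose which slot (vector or visual) env outputs are written into."""
    use_visual = getattr(env, 'obs_type', 'vector') == 'visual'  # assumed attribute
    i = 1 if use_visual else 0
    # one placeholder per observation modality; the unused slot stays empty
    empty_vector = np.zeros((env.n, 0), dtype=np.float32)
    empty_visual = np.zeros((env.n, 0, 0, 0), dtype=np.float32)
    state = [empty_vector, empty_visual]
    new_state = [empty_vector.copy(), empty_visual.copy()]
    return i, state, new_state
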
Example 3
def gym_train(env, model, print_func,
              begin_episode, render, render_episode,
              save_frequency, max_step, max_episode, eval_while_train, max_eval_episode,
              off_policy_step_eval, off_policy_step_eval_num, 
              policy_mode, moving_average_episode, add_noise2buffer, add_noise2buffer_episode_interval, add_noise2buffer_steps,
              total_step_control, eval_interval, max_total_step):
    """
    TODO: Annotation
    """
    if total_step_control:
        max_episode = max_total_step

    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    total_step = 0

    for episode in range(begin_episode, max_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        step = 0
        r = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            r_tem = np.zeros(env.n)
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(action)
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r_tem[unfinished_index] = reward[unfinished_index]
            r += r_tem
            model.store_data(
                s=state[0],
                visual_s=state[1],
                a=action,
                r=reward,
                s_=new_state[0],
                visual_s_=new_state[1],
                done=done
            )
            model.partial_reset(done)
            state[i] = correct_new_state

            if policy_mode == 'off-policy':
                model.learn(episode=episode, step=1)
                if off_policy_step_eval and total_step % eval_interval == 0:
                    gym_step_eval(env.eval_env, total_step, model, off_policy_step_eval_num, max_step)
            total_step += 1
            if total_step_control and total_step > max_total_step:
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step:
                break

        sma.update(r)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, step=step)
        model.writer_summary(
            episode,
            reward_mean=r.mean(),
            reward_min=r.min(),
            reward_max=r.max(),
            step=last_done_step,
            **sma.rs
        )
        print_func('-' * 40, out_time=True)
        print_func(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}')
        if episode % save_frequency == 0:
            model.save_checkpoint(episode)

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_random_sample(env, steps=add_noise2buffer_steps, print_func=print_func)

        if eval_while_train and env.reward_threshold is not None:
            if r.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step, max_eval_episode, print_func)
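
The log lines format the per-environment reward vector with `arrprint(r, 3)` (or `arrprint(r, 2)` in Example 2). Assuming it simply renders the array with a fixed number of decimals, a stand-in could look like this:

def arrprint(arr, decimals=2):
    """Hypothetical sketch: render a 1-D reward array as a compact fixed-precision string."""
    return ', '.join(f'{x:.{decimals}f}' for x in arr)

For instance, `arrprint(np.array([1.0, 2.3456]), 3)` returns `'1.000, 2.346'`.
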
Example 4
    def train(env, gym_model, begin_episode, save_frequency, max_step, max_episode,
              eval_while_train, max_eval_episode, render, render_episode, policy_mode):
        """
        Inputs:
            env:                gym environment
            gym_model:          algorithm model
            begin_episode:      initial episode
            save_frequency:     how often to save checkpoints
            max_step:           maximum number of steps in an episode
            max_episode:        maximum number of episodes in this training task
            render:             specify whether to render the env or not
            render_episode:     if 'render' is false, specify from which episode to render the env
            policy_mode:        'on-policy' or 'off-policy'
        """
        i, state, new_state = init_variables(env)
        sma = SMA(100)
        for episode in range(begin_episode, max_episode):
            state[i] = env.reset()
            dones_flag = np.full(env.n, False)
            step = 0
            r = np.zeros(env.n)
            last_done_step = -1
            while True:
                step += 1
                r_tem = np.zeros(env.n)
                if render or episode > render_episode:
                    env.render()
                action = gym_model.choose_action(s=state[0], visual_s=state[1])
                new_state[i], reward, done, info = env.step(action)
                unfinished_index = np.where(dones_flag == False)[0]
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                r += r_tem
                gym_model.store_data(
                    s=state[0],
                    visual_s=state[1],
                    a=action,
                    r=reward,
                    s_=new_state[0],
                    visual_s_=new_state[1],
                    done=done
                )

                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

                if len(env.dones_index):    # check whether any sub-environment needs a partial reset
                    new_state[i][env.dones_index] = env.partial_reset()
                state[i] = new_state[i]

            sma.update(r)
            gym_model.learn(episode=episode, step=step)
            gym_model.writer_summary(
                episode,
                reward_mean=r.mean(),
                reward_min=r.min(),
                reward_max=r.max(),
                step=last_done_step,
                **sma.rs
            )
            print('-' * 40)
            print(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {r}')
            if episode % save_frequency == 0:
                gym_model.save_checkpoint(episode)

            if eval_while_train and env.reward_threshold is not None:
                if r.max() >= env.reward_threshold:
                    ave_r, ave_step = Loop.evaluate(env, gym_model, max_step, max_eval_episode)
                    solved = ave_r >= env.reward_threshold
                    print(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                    print(f'evaluate number: {max_eval_episode:3d} | average step: {ave_step} | average reward: {ave_r} | SOLVED: {solved}')
                    print('----------------------------------------------------------------------------------------------------------------------------')
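
A detail shared by all four variants is how per-environment episode returns are accumulated in a vectorized env: `dones_flag` latches once an environment reports done, and only the still-unfinished indices keep adding reward, so `r` ends up holding each environment's first-episode return even though the vectorized env may keep stepping. A self-contained illustration of that pattern:

import numpy as np

n_envs = 3
dones_flag = np.full(n_envs, False)
r = np.zeros(n_envs)

# simulated (reward, done) outputs for three steps of a vectorized env
steps = [
    (np.array([1.0, 1.0, 1.0]), np.array([False, True, False])),
    (np.array([1.0, 5.0, 1.0]), np.array([False, False, False])),
    (np.array([1.0, 1.0, 1.0]), np.array([True, False, True])),
]

for reward, done in steps:
    unfinished_index = np.where(dones_flag == False)[0]  # envs still in their first episode
    dones_flag += done                                    # latch environments that have finished
    r[unfinished_index] += reward[unfinished_index]       # stop accumulating after the first done

print(r)  # [3. 1. 3.] -> env 1 stopped accumulating after it finished at step 1
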