Example no. 1
    def gym_train(self):
        """
        Inputs:
            env:                gym environment
            gym_model:          algorithm model
            begin_episode:      initial episode
            save_frequency:     how often to save checkpoints
            max_step:           maximum number of steps in an episode
            max_episode:        maximum number of episodes in this training task
            render:             specify whether to render the env or not
            render_episode:     if 'render' is false, specify from which episode to render the env
            policy_mode:        'on-policy' or 'off-policy'
        """
        begin_episode = int(self.train_args['begin_episode'])
        render = bool(self.train_args['render'])
        render_episode = int(self.train_args.get('render_episode', 50000))
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        eval_while_train = bool(self.train_args['eval_while_train'])
        max_eval_episode = int(self.train_args.get('max_eval_episode'))
        off_policy_step_eval = bool(self.train_args['off_policy_step_eval'])
        off_policy_step_eval_num = int(
            self.train_args.get('off_policy_step_eval_num'))
        policy_mode = str(self.model_args['policy_mode'])
        moving_average_episode = int(self.train_args['moving_average_episode'])
        add_noise2buffer = bool(self.train_args['add_noise2buffer'])
        add_noise2buffer_episode_interval = int(
            self.train_args['add_noise2buffer_episode_interval'])
        add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])

        total_step_control = bool(self.train_args['total_step_control'])
        max_total_step = int(self.train_args['max_total_step'])
        if total_step_control:
            max_episode = max_total_step

        i, state, new_state = self.init_variables()
        sma = SMA(moving_average_episode)
        total_step = 0
        for episode in range(begin_episode, max_episode):
            state[i] = self.env.reset()
            dones_flag = np.full(self.env.n, False)
            step = 0
            r = np.zeros(self.env.n)
            last_done_step = -1
            while True:
                step += 1
                r_tem = np.zeros(self.env.n)
                if render or episode > render_episode:
                    self.env.render()
                action = self.model.choose_action(s=state[0],
                                                  visual_s=state[1])
                new_state[i], reward, done, info = self.env.step(action)
                unfinished_index = np.where(dones_flag == False)[0]
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                r += r_tem
                self.model.store_data(s=state[0],
                                      visual_s=state[1],
                                      a=action,
                                      r=reward,
                                      s_=new_state[0],
                                      visual_s_=new_state[1],
                                      done=done)

                if policy_mode == 'off-policy':
                    self.model.learn(episode=episode, step=1)
                    if off_policy_step_eval:
                        self.gym_step_eval(total_step, self.model,
                                           off_policy_step_eval_num, max_step)
                total_step += 1
                if total_step_control and total_step > max_total_step:
                    return

                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

                if len(self.env.dones_index):  # check whether any sub-environment in the worker threads needs a partial reset
                    new_state[i][
                        self.env.dones_index] = self.env.partial_reset()
                state[i] = new_state[i]

            sma.update(r)
            if policy_mode == 'on-policy':
                self.model.learn(episode=episode, step=step)
            self.model.writer_summary(episode,
                                      reward_mean=r.mean(),
                                      reward_min=r.min(),
                                      reward_max=r.max(),
                                      step=last_done_step,
                                      **sma.rs)
            self.pwi('-' * 40)
            self.pwi(
                f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}'
            )
            if episode % save_frequency == 0:
                self.model.save_checkpoint(episode)

            if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                self.gym_random_sample(steps=add_noise2buffer_steps)

            if eval_while_train and self.env.reward_threshold is not None:
                if r.max() >= self.env.reward_threshold:
                    self.pwi(
                        f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                    )
                    self.gym_evaluate()
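
The loop above leans on a small SMA helper (a simple moving average over recent episode returns): it calls sma.update(r) once per episode and unpacks **sma.rs into writer_summary. Below is a minimal sketch of such a helper, assuming only the two members the loop actually touches; the key names inside rs are illustrative and not necessarily the project's.

from collections import deque

import numpy as np


class SMA:
    """Minimal sketch: moving average of per-episode mean returns."""

    def __init__(self, window):
        self._returns = deque(maxlen=window)  # keep only the last `window` episodes
        self.rs = {}                          # summary dict unpacked into writer_summary

    def update(self, rewards):
        self._returns.append(np.asarray(rewards).mean())
        window = np.asarray(self._returns)
        self.rs = {
            'sma_reward_mean': float(window.mean()),
            'sma_reward_min': float(window.min()),
            'sma_reward_max': float(window.max()),
        }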
Example no. 2
    def unity_train(self):
        """
        Train loop. Runs until the episode count reaches its maximum or training is interrupted manually with 'ctrl+c'.
        Inputs:
            env:                    Environment for interaction.
            models:                 all models for this training task.
            save_frequency:         how often to save checkpoints.
            reset_config:           configuration to reset for Unity environment.
            max_step:               maximum number of steps for an episode.
            sampler_manager:        sampler configuration parameters for 'reset_config'.
            resampling_interval:    how often to resample parameters for env reset.
        Variables:
            brain_names:    a list of brain names set in Unity.
            state:          store a list of states for each brain. Each item contains a list of states for the agents controlled by the same brain.
            visual_state:   store a list of visual state information for each brain.
            action:         store a list of actions for each brain.
            dones_flag:     store a list of 'done' flags for each brain, used to judge whether an episode has finished for every agent.
            rewards:        used to record the rewards of the agents for each brain.
        """
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])
        moving_average_episode = int(self.train_args['moving_average_episode'])
        add_noise2buffer = bool(self.train_args['add_noise2buffer'])
        add_noise2buffer_episode_interval = int(
            self.train_args['add_noise2buffer_episode_interval'])
        add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])

        if self.use_GCN:
            adj, x, visual_state, action, dones_flag, rewards = zeros_initializer(
                self.env.brain_num, 6)
            sma = [
                SMA(moving_average_episode) for i in range(self.env.brain_num)
            ]

            for episode in range(begin_episode, max_episode):
                ObsRewDone = self.env.reset()
                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    dones_flag[i] = np.zeros(self.env.brain_agents[i])
                    rewards[i] = np.zeros(self.env.brain_agents[i])
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs
                step = 0
                last_done_step = -1
                while True:
                    step += 1
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            adj=adj[i], x=x[i], visual_s=visual_state[i])
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)

                    for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                        unfinished_index = np.where(dones_flag[i] == False)[0]
                        dones_flag[i] += _d
                        self.models[i].store_data_gcn(adj=adj[i],
                                                      x=x[i],
                                                      visual_s=visual_state[i],
                                                      a=action[i],
                                                      r=_r,
                                                      adj_=_adj,
                                                      x_=_x,
                                                      visual_s_=_vs,
                                                      done=_d)
                        rewards[i][unfinished_index] += _r[unfinished_index]
                        adj[i] = _adj
                        x[i] = _x
                        visual_state[i] = _vs
                        if policy_mode == 'off-policy':
                            self.models[i].learn(episode=episode, step=1)

                    if all([
                            all(dones_flag[i])
                            for i in range(self.env.brain_num)
                    ]):
                        if last_done_step == -1:
                            last_done_step = step
                        if policy_mode == 'off-policy':
                            break

                    if step >= max_step:
                        break

                for i in range(self.env.brain_num):
                    sma[i].update(rewards[i])
                    if policy_mode == 'on-policy':
                        self.models[i].learn(episode=episode, step=step)
                    self.models[i].writer_summary(
                        episode,
                        reward_mean=rewards[i].mean(),
                        reward_min=rewards[i].min(),
                        reward_max=rewards[i].max(),
                        step=last_done_step,
                        **sma[i].rs)
                self.pwi('-' * 40)
                self.pwi(
                    f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
                )
                for i in range(self.env.brain_num):
                    self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
                if episode % save_frequency == 0:
                    for i in range(self.env.brain_num):
                        self.models[i].save_checkpoint(episode)

                if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                    self.unity_random_sample(steps=add_noise2buffer_steps)

        else:

            state, visual_state, action, dones_flag, rewards = zeros_initializer(
                self.env.brain_num, 5)
            sma = [
                SMA(moving_average_episode) for i in range(self.env.brain_num)
            ]

            for episode in range(begin_episode, max_episode):
                ObsRewDone = self.env.reset()
                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    dones_flag[i] = np.zeros(self.env.brain_agents[i])
                    rewards[i] = np.zeros(self.env.brain_agents[i])
                    state[i] = _v
                    visual_state[i] = _vs
                step = 0
                last_done_step = -1
                while True:
                    step += 1
                    for i in range(self.env.brain_num):
                        action[i] = self.models[i].choose_action(
                            s=state[i], visual_s=visual_state[i])
                    actions = {
                        f'{brain_name}': action[i]
                        for i, brain_name in enumerate(self.env.brain_names)
                    }
                    ObsRewDone = self.env.step(vector_action=actions)

                    for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                        unfinished_index = np.where(dones_flag[i] == False)[0]
                        dones_flag[i] += _d
                        self.models[i].store_data(s=state[i],
                                                  visual_s=visual_state[i],
                                                  a=action[i],
                                                  r=_r,
                                                  s_=_v,
                                                  visual_s_=_vs,
                                                  done=_d)
                        rewards[i][unfinished_index] += _r[unfinished_index]
                        state[i] = _v
                        visual_state[i] = _vs
                        if policy_mode == 'off-policy':
                            self.models[i].learn(episode=episode, step=1)

                    if all([
                            all(dones_flag[i])
                            for i in range(self.env.brain_num)
                    ]):
                        if last_done_step == -1:
                            last_done_step = step
                        if policy_mode == 'off-policy':
                            break

                    if step >= max_step:
                        break

                for i in range(self.env.brain_num):
                    sma[i].update(rewards[i])
                    if policy_mode == 'on-policy':
                        self.models[i].learn(episode=episode, step=step)
                    self.models[i].writer_summary(
                        episode,
                        reward_mean=rewards[i].mean(),
                        reward_min=rewards[i].min(),
                        reward_max=rewards[i].max(),
                        step=last_done_step,
                        **sma[i].rs)
                self.pwi('-' * 40)
                self.pwi(
                    f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
                )
                for i in range(self.env.brain_num):
                    self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
                if episode % save_frequency == 0:
                    for i in range(self.env.brain_num):
                        self.models[i].save_checkpoint(episode)

                if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                    self.unity_random_sample(steps=add_noise2buffer_steps)
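
Both branches above unpack the result of zeros_initializer(self.env.brain_num, ...) into per-brain placeholder lists that are later overwritten with numpy arrays. A minimal sketch of what that helper is assumed to do, namely return the requested number of independent lists, each holding one zero placeholder per brain:

def zeros_initializer(n, n_args):
    """Sketch of the assumed helper: return `n_args` independent lists,
    each holding `n` zero placeholders (one slot per brain)."""
    return tuple([0] * n for _ in range(n_args))


# usage matching the non-GCN branch above (a brain_num of 2 is illustrative)
state, visual_state, action, dones_flag, rewards = zeros_initializer(2, 5)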
Example no. 3
    def unity_train(self):
        """
        Train loop. Runs until the episode count reaches its maximum or training is interrupted manually with 'ctrl+c'.
        Inputs:
            env:                    Environment for interaction.
            models:                 all models for this training task.
            save_frequency:         how often to save checkpoints.
            reset_config:           configuration to reset for Unity environment.
            max_step:               maximum number of steps for an episode.
            sampler_manager:        sampler configuration parameters for 'reset_config'.
            resampling_interval:    how often to resample parameters for env reset.
        Variables:
            brain_names:    a list of brain names set in Unity.
            state:          store a list of states for each brain. Each item contains a list of states for the agents controlled by the same brain.
            visual_state:   store a list of visual state information for each brain.
            action:         store a list of actions for each brain.
            dones_flag:     store a list of 'done' flags for each brain, used to judge whether an episode has finished for every agent.
            agents_num:     used to record the number of agents for each brain.
            rewards:        used to record the rewards of the agents for each brain.
        """
        begin_episode = int(self.train_args['begin_episode'])
        save_frequency = int(self.train_args['save_frequency'])
        max_step = int(self.train_args['max_step'])
        max_episode = int(self.train_args['max_episode'])
        policy_mode = str(self.model_args['policy_mode'])

        brains_num = len(self.env.brain_names)
        state = [0] * brains_num
        visual_state = [0] * brains_num
        action = [0] * brains_num
        dones_flag = [0] * brains_num
        agents_num = [0] * brains_num
        rewards = [0] * brains_num
        sma = [SMA(100) for i in range(brains_num)]

        for episode in range(begin_episode, max_episode):
            obs = self.env.reset()
            for i, brain_name in enumerate(self.env.brain_names):
                agents_num[i] = len(obs[brain_name].agents)
                dones_flag[i] = np.zeros(agents_num[i])
                rewards[i] = np.zeros(agents_num[i])
            step = 0
            last_done_step = -1
            while True:
                step += 1
                for i, brain_name in enumerate(self.env.brain_names):
                    state[i] = obs[brain_name].vector_observations
                    visual_state[i] = self.get_visual_input(
                        agents_num[i], self.models[i].visual_sources,
                        obs[brain_name])
                    action[i] = self.models[i].choose_action(
                        s=state[i], visual_s=visual_state[i])
                actions = {
                    f'{brain_name}': action[i]
                    for i, brain_name in enumerate(self.env.brain_names)
                }
                obs = self.env.step(vector_action=actions)

                for i, brain_name in enumerate(self.env.brain_names):
                    unfinished_index = np.where(dones_flag[i] == False)[0]
                    dones_flag[i] += obs[brain_name].local_done
                    next_state = obs[brain_name].vector_observations
                    next_visual_state = self.get_visual_input(
                        agents_num[i], self.models[i].visual_sources,
                        obs[brain_name])
                    self.models[i].store_data(
                        s=state[i],
                        visual_s=visual_state[i],
                        a=action[i],
                        r=np.asarray(obs[brain_name].rewards),
                        s_=next_state,
                        visual_s_=next_visual_state,
                        done=np.asarray(obs[brain_name].local_done))
                    rewards[i][unfinished_index] += np.asarray(
                        obs[brain_name].rewards)[unfinished_index]
                    if policy_mode == 'off-policy':
                        self.models[i].learn(episode=episode, step=1)

                if all([all(dones_flag[i]) for i in range(brains_num)]):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

            for i in range(brains_num):
                sma[i].update(rewards[i])
                if policy_mode == 'on-policy':
                    self.models[i].learn(episode=episode, step=step)
                self.models[i].writer_summary(episode,
                                              reward_mean=rewards[i].mean(),
                                              reward_min=rewards[i].min(),
                                              reward_max=rewards[i].max(),
                                              step=last_done_step,
                                              **sma[i].rs)
            self.pwi('-' * 40)
            self.pwi(
                f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
            )
            for i in range(brains_num):
                self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
            if episode % save_frequency == 0:
                for i in range(brains_num):
                    self.models[i].save_checkpoint(episode)
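
The reward bookkeeping shared by all of these loops is easy to misread: unfinished_index is computed before the new done flags are folded in, so an agent still collects the reward of the step on which it finishes, and dones_flag only ever grows, so rewards produced afterwards by that agent's auto-reset environment are ignored. A standalone sketch of the pattern with made-up numbers:

import numpy as np

# Standalone sketch of the per-agent reward bookkeeping (values are made up).
agents = 4
dones_flag = np.zeros(agents)        # 0 while running, > 0 once the agent has finished
episode_rewards = np.zeros(agents)

# one (hypothetical) environment step
step_rewards = np.array([1.0, 0.5, 2.0, 0.3])
step_dones = np.array([False, True, False, False])

unfinished_index = np.where(dones_flag == False)[0]  # agents still running before this step
dones_flag += step_dones                             # latch agents that just finished
episode_rewards[unfinished_index] += step_rewards[unfinished_index]

# Agent 1 got this step's 0.5, but from now on dones_flag[1] > 0 keeps it out
# of unfinished_index, so rewards from its auto-reset environment are dropped.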
Example no. 4
def unity_train(env, models, print_func, begin_train_step, begin_frame_step,
                begin_episode, save_frequency, max_step_per_episode,
                max_train_episode, policy_mode, moving_average_episode,
                add_noise2buffer, add_noise2buffer_episode_interval,
                add_noise2buffer_steps, max_train_step, max_frame_step,
                real_done, off_policy_train_interval):
    """
    TODO: Annotation
    Train loop. Runs until the episode count reaches its maximum or training is interrupted manually with 'ctrl+c'.
    Inputs:
        env:                    Environment for interaction.
        models:                 all models for this training task.
        save_frequency:         how often to save checkpoints.
        reset_config:           configuration to reset for Unity environment.
        max_step_per_episode:   maximum number of steps for an episode.
        sampler_manager:        sampler configuration parameters for 'reset_config'.
        resampling_interval:    how often to resample parameters for env reset.
    Variables:
        brain_names:    a list of brain names set in Unity.
        state:          store a list of states for each brain. Each item contains a list of states for the agents controlled by the same brain.
        visual_state:   store a list of visual state information for each brain.
        action:         store a list of actions for each brain.
        dones_flag:     store a list of 'done' flags for each brain, used to judge whether an episode has finished for every agent.
        rewards:        used to record the rewards of the agents for each brain.
    """

    state, visual_state, action, dones_flag, rewards = zeros_initializer(
        env.brain_num, 5)
    sma = [SMA(moving_average_episode) for i in range(env.brain_num)]
    frame_step = begin_frame_step
    min_of_all_agents = min(env.brain_agents)
    train_step = [begin_train_step for _ in range(env.brain_num)]

    for episode in range(begin_episode, max_train_episode):
        [model.reset() for model in models]
        ObsRewDone = env.reset()
        for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
            dones_flag[i] = np.zeros(env.brain_agents[i])
            rewards[i] = np.zeros(env.brain_agents[i])
            state[i] = _v
            visual_state[i] = _vs
        step = 0
        last_done_step = -1
        while True:
            step += 1
            for i in range(env.brain_num):
                action[i] = models[i].choose_action(s=state[i],
                                                    visual_s=visual_state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(env.brain_names)
            }
            ObsRewDone = env.step(actions)

            for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
                unfinished_index = np.where(dones_flag[i] == False)[0]
                dones_flag[i] += _d
                models[i].store_data(
                    s=state[i],
                    visual_s=visual_state[i],
                    a=action[i],
                    r=_r,
                    s_=_v,
                    visual_s_=_vs,
                    done=_info['real_done'] if real_done else _d)
                models[i].partial_reset(_d)
                rewards[i][unfinished_index] += _r[unfinished_index]
                state[i] = _v
                visual_state[i] = _vs
                if policy_mode == 'off-policy':
                    if train_step[i] % off_policy_train_interval == 0:
                        models[i].learn(episode=episode, train_step=train_step[i])
                    train_step[i] += 1
                    if train_step[i] % save_frequency == 0:
                        models[i].save_checkpoint(train_step=train_step[i],
                                                  episode=episode,
                                                  frame_step=frame_step)

            frame_step += min_of_all_agents
            if 0 < max_train_step < min(
                    train_step) or 0 < max_frame_step < frame_step:
                for i in range(env.brain_num):
                    models[i].save_checkpoint(train_step=train_step[i],
                                              episode=episode,
                                              frame_step=frame_step)
                logger.info(
                    f'End Training, learn step: {train_step}, frame_step: {frame_step}'
                )
                return

            if all([all(dones_flag[i]) for i in range(env.brain_num)]):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        for i in range(env.brain_num):
            sma[i].update(rewards[i])
            if policy_mode == 'on-policy':
                models[i].learn(episode=episode, train_step=train_step[i])
                train_step[i] += 1
                if train_step[i] % save_frequency == 0:
                    models[i].save_checkpoint(train_step=train_step[i],
                                              episode=episode,
                                              frame_step=frame_step)
            models[i].writer_summary(episode,
                                     reward_mean=rewards[i].mean(),
                                     reward_min=rewards[i].min(),
                                     reward_max=rewards[i].max(),
                                     step=last_done_step,
                                     **sma[i].rs)
        print_func('-' * 40, out_time=True)
        print_func(
            f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
        )
        for i, bn in enumerate(env.brain_names):
            print_func(f'{bn} reward: {arrprint(rewards[i], 2)}')

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            unity_no_op(env,
                        models,
                        print_func=print_func,
                        pre_fill_steps=add_noise2buffer_steps,
                        prefill_choose=False,
                        real_done=real_done,
                        desc='adding noise')
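
The early-exit check in this version, 0 < max_train_step < min(train_step) or 0 < max_frame_step < frame_step, doubles as an on/off switch: a budget of 0 (or a negative value) disables that limit, and since train_step is a per-brain list the slowest brain decides when the training-step budget is exhausted. A small sketch of the same condition in isolation (the function name is illustrative):

def budget_exhausted(train_step, frame_step, max_train_step, max_frame_step):
    # Mirrors the check above: a limit of 0 disables that budget, and
    # train_step is a per-brain list, so the slowest brain decides.
    return 0 < max_train_step < min(train_step) or 0 < max_frame_step < frame_step


assert budget_exhausted([10, 12], 500, max_train_step=0, max_frame_step=0) is False
assert budget_exhausted([10, 12], 500, max_train_step=9, max_frame_step=0) is True
assert budget_exhausted([10, 12], 500, max_train_step=11, max_frame_step=0) is False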
Example no. 5
def gym_train(env, model, print_func, begin_episode, render, render_episode,
              save_frequency, max_step, max_episode, eval_while_train,
              max_eval_episode, off_policy_step_eval, off_policy_step_eval_num,
              policy_mode, moving_average_episode, add_noise2buffer,
              add_noise2buffer_episode_interval, add_noise2buffer_steps,
              total_step_control, eval_interval, max_total_step):
    """
    TODO: Annotation
    """
    if total_step_control:
        max_episode = max_total_step

    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    total_step = 0

    for episode in range(begin_episode, max_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        step = 0
        r = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            r_tem = np.zeros(env.n)
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(action)
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r_tem[unfinished_index] = reward[unfinished_index]
            r += r_tem
            model.store_data(
                s=state[0],
                visual_s=state[1],
                a=action,
                r=reward,
                s_=new_state[0],
                visual_s_=new_state[1],
                done=done
            )
            model.partial_reset(done)
            state[i] = correct_new_state

            if policy_mode == 'off-policy':
                model.learn(episode=episode, step=1)
                if off_policy_step_eval and total_step % eval_interval == 0:
                    gym_step_eval(env.eval_env, total_step, model, off_policy_step_eval_num, max_step)
            total_step += 1
            if total_step_control and total_step > max_total_step:
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step:
                break

        sma.update(r)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, step=step)
        model.writer_summary(
            episode,
            reward_mean=r.mean(),
            reward_min=r.min(),
            reward_max=r.max(),
            step=last_done_step,
            **sma.rs
        )
        print_func('-' * 40, out_time=True)
        print_func(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}')
        if episode % save_frequency == 0:
            model.save_checkpoint(episode)

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_random_sample(env, steps=add_noise2buffer_steps, print_func=print_func)

        if eval_while_train and env.reward_threshold is not None:
            if r.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step, max_eval_episode, print_func)
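
This variant, like the ones above, calls choose_action(s=state[0], visual_s=state[1]) while writing observations into state[i], which only makes sense if init_variables(env) returns an index plus two 2-slot containers of the form [vector_obs, visual_obs], where i selects the slot the environment actually fills and the other slot stays a placeholder. A hedged sketch of that assumed contract; the visual flag is an assumption, since the real helper presumably inspects the environment itself:

import numpy as np


def init_variables(env, visual=False):
    """Sketch of the assumed contract: return (i, state, new_state) where
    state and new_state are [vector_obs, visual_obs] slots and `i` picks the
    slot that env.reset()/env.step() observations are written into."""
    i = 1 if visual else 0
    placeholder = np.zeros(0)          # stands in for the unused modality
    state = [placeholder, placeholder]
    new_state = [placeholder, placeholder]
    return i, state, new_state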
Example no. 6
File: gym.py Project: HackyLee/RLs
def gym_train(env, model, print_func, begin_train_step, begin_frame_step,
              begin_episode, render, render_episode, save_frequency,
              max_step_per_episode, max_train_episode, eval_while_train,
              max_eval_episode, off_policy_step_eval_episodes,
              off_policy_train_interval, policy_mode, moving_average_episode,
              add_noise2buffer, add_noise2buffer_episode_interval,
              add_noise2buffer_steps, off_policy_eval_interval, max_train_step,
              max_frame_step):
    """
    TODO: Annotation
    """

    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0

    for episode in range(begin_episode, max_train_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        step = 0
        r = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(
                action)
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r[unfinished_index] += reward[unfinished_index]
            model.store_data(s=state[0],
                             visual_s=state[1],
                             a=action,
                             r=reward,
                             s_=new_state[0],
                             visual_s_=new_state[1],
                             done=done)
            model.partial_reset(done)
            state[i] = correct_new_state

            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step,
                                          episode=episode,
                                          frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), train_step, model,
                                  off_policy_step_eval_episodes,
                                  max_step_per_episode)

            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
                logger.info(
                    f'End Training, learn step: {train_step}, frame_step: {frame_step}'
                )
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        sma.update(r)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
        model.writer_summary(episode,
                             reward_mean=r.mean(),
                             reward_min=r.min(),
                             reward_max=r.max(),
                             step=last_done_step,
                             **sma.rs)
        print_func('-' * 40, out_time=True)
        print_func(
            f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 2)}'
        )

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env,
                      model,
                      pre_fill_steps=add_noise2buffer_steps,
                      print_func=print_func,
                      prefill_choose=False,
                      desc='adding noise')

        if eval_while_train and env.reward_threshold is not None:
            if r.max() >= env.reward_threshold:
                print_func(
                    f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                )
                gym_evaluate(env, model, max_step_per_episode,
                             max_eval_episode, print_func)
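
Example no. 6 maintains three counters that are easy to conflate: total_step counts iterations of the inner while-loop, frame_step counts environment frames and advances by env.n because the environment runs n copies in parallel, and train_step counts learner updates and only advances when model.learn runs. A toy illustration of how the three diverge (the numbers are made up):

# Toy illustration of the three counters in Example no. 6 (values are made up).
env_n = 8                          # number of parallel environment copies
off_policy_train_interval = 4      # learn once every 4 loop iterations

total_step = frame_step = train_step = 0
for _ in range(12):                # 12 iterations of the while-loop body
    if total_step % off_policy_train_interval == 0:
        train_step += 1            # one gradient update
    frame_step += env_n            # every env copy advanced by one frame
    total_step += 1

print(total_step, frame_step, train_step)   # -> 12 96 3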
Example no. 7
    def train(env, gym_model, begin_episode, save_frequency, max_step, max_episode,
              eval_while_train, max_eval_episode, render, render_episode, policy_mode):
        """
        Inputs:
            env:                gym environment
            gym_model:          algorithm model
            begin_episode:      initial episode
            save_frequency:     how often to save checkpoints
            max_step:           maximum number of steps in an episode
            max_episode:        maximum number of episodes in this training task
            render:             specify whether to render the env or not
            render_episode:     if 'render' is false, specify from which episode to render the env
        """
        i, state, new_state = init_variables(env)
        sma = SMA(100)
        for episode in range(begin_episode, max_episode):
            state[i] = env.reset()
            dones_flag = np.full(env.n, False)
            step = 0
            r = np.zeros(env.n)
            last_done_step = -1
            while True:
                step += 1
                r_tem = np.zeros(env.n)
                if render or episode > render_episode:
                    env.render()
                action = gym_model.choose_action(s=state[0], visual_s=state[1])
                new_state[i], reward, done, info = env.step(action)
                unfinished_index = np.where(dones_flag == False)[0]
                dones_flag += done
                r_tem[unfinished_index] = reward[unfinished_index]
                r += r_tem
                gym_model.store_data(
                    s=state[0],
                    visual_s=state[1],
                    a=action,
                    r=reward,
                    s_=new_state[0],
                    visual_s_=new_state[1],
                    done=done
                )

                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

                if len(env.dones_index):    # check whether any sub-environment in the worker threads needs a partial reset
                    new_state[i][env.dones_index] = env.partial_reset()
                state[i] = new_state[i]

            sma.update(r)
            gym_model.learn(episode=episode, step=step)
            gym_model.writer_summary(
                episode,
                reward_mean=r.mean(),
                reward_min=r.min(),
                reward_max=r.max(),
                step=last_done_step,
                **sma.rs
            )
            print('-' * 40)
            print(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {r}')
            if episode % save_frequency == 0:
                gym_model.save_checkpoint(episode)

            if eval_while_train and env.reward_threshold is not None:
                if r.max() >= env.reward_threshold:
                    ave_r, ave_step = Loop.evaluate(env, gym_model, max_step, max_eval_episode)
                    solved = ave_r >= env.reward_threshold
                    print(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                    print(f'evaluate number: {max_eval_episode:3d} | average step: {ave_step} | average reward: {ave_r} | SOLVED: {solved}')
                    print('----------------------------------------------------------------------------------------------------------------------------')
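
The evaluation branch above expects Loop.evaluate(env, gym_model, max_step, max_eval_episode) to return an average reward and an average episode length, and declares the task solved when the average reward reaches env.reward_threshold. Below is a hedged sketch of an evaluation loop compatible with that call; it reuses the init_variables helper from the training loop, stores no data and does no learning, and may well differ from the project's actual Loop.evaluate.

import numpy as np


def evaluate(env, gym_model, max_step, max_eval_episode):
    # Hedged sketch of a Loop.evaluate compatible with how it is called above:
    # run up to max_eval_episode episodes without storing data or learning and
    # return (average episode reward, average episode length). Partial resets
    # of individual sub-environments are omitted for brevity.
    i, state, new_state = init_variables(env)   # same helper as the train loop
    returns, lengths = [], []
    for _ in range(max_eval_episode):
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        r = np.zeros(env.n)
        step = 0
        while step < max_step and not all(dones_flag):
            step += 1
            action = gym_model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info = env.step(action)
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r[unfinished_index] += reward[unfinished_index]
            state[i] = new_state[i]
        returns.append(r.mean())
        lengths.append(step)
    return float(np.mean(returns)), float(np.mean(lengths))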