Example 1
    def run(self):
        n = self.env.n
        # slot 1 holds visual observations, slot 0 holds vector observations
        i = 1 if self.env.obs_type == 'visual' else 0
        # [vector_obs, visual_obs] placeholders; only slot i is overwritten by the env
        state = [np.full((n, 0), []), np.full((n, 0), [])]
        sma = SMA(100)
        total_step = 0
        episode = 0

        while True:
            if episode % self.pull_interval == 0:  # pull the latest parameters every `pull_interval` episodes
                self.model.set_worker_params(self.callback_func())
                logger.info('pulled parameters successfully.')
            episode += 1
            self.model.reset()
            state[i] = self.env.reset()
            dones_flag = np.zeros(self.env.n)
            step = 0
            rets = np.zeros(self.env.n)
            last_done_step = -1
            while True:
                step += 1
                # env.render(record=False)
                action = self.model.choose_action(s=state[0], visual_s=state[1])
                _, reward, done, info, state[i] = self.env.step(action)
                rets += (1 - dones_flag) * reward  # accumulate rewards only for envs that are not done yet
                dones_flag = np.sign(dones_flag + done)  # latch the per-env done flag
                self.model.partial_reset(done)
                total_step += 1
                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    break

                if step >= 200:
                    break

            sma.update(rets)
            self.model.writer_summary(
                episode,
                reward_mean=rets.mean(),
                reward_min=rets.min(),
                reward_max=rets.max(),
                step=last_done_step,
                **sma.rs
            )
            logger.info(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(rets, 2)}')
            time.sleep(self.episode_sleep)
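Note: every example below relies on an SMA helper that is not part of these snippets. Below is a minimal sketch of such a simple-moving-average reward tracker, assuming (as the calls suggest) that it exposes update() and an rs dict that is spread into writer_summary via **sma.rs; the stat names used here are illustrative only.

import numpy as np
from collections import deque

class SMA:
    """Keep the last `window` episode-return vectors and expose moving-average statistics."""

    def __init__(self, window: int):
        self.buffer = deque(maxlen=window)
        self.rs = {}  # summary dict consumed as **sma.rs (field names are assumptions)

    def update(self, rets) -> None:
        self.buffer.append(np.asarray(rets).copy())
        stacked = np.stack(self.buffer)  # [episodes_in_window, n_envs]
        self.rs = {
            'sma_reward_mean': float(stacked.mean()),
            'sma_reward_min': float(stacked.min()),
            'sma_reward_max': float(stacked.max()),
        }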
Example 2
    def evaluate(env, model):
        n = env.n
        i = 1 if env.obs_type == 'visual' else 0
        state = [np.full((n, 0), []), np.full((n, 0), [])]
        sma = SMA(100)
        total_step = 0
        episode = 0

        while True:
            episode += 1
            model.reset()
            state[i] = env.reset()
            dones_flag = np.zeros(env.n)
            step = 0
            rets = np.zeros(env.n)
            last_done_step = -1
            while True:
                step += 1
                # env.render(record=False)
                action = model.choose_action(s=state[0], visual_s=state[1])
                _, reward, done, info, state[i] = env.step(action)
                rets += (1 - dones_flag) * reward
                dones_flag = np.sign(dones_flag + done)
                model.partial_reset(done)
                total_step += 1
                if all(dones_flag):
                    if last_done_step == -1:
                        last_done_step = step
                    break

                if step >= 200:
                    break

            sma.update(rets)
            model.writer_summary(
                episode,
                reward_mean=rets.mean(),
                reward_min=rets.min(),
                reward_max=rets.max(),
                step=last_done_step,
                **sma.rs
            )
            print(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(rets, 2)}')
            time.sleep(5)
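The arrprint helper used in every logging line above is likewise external to these snippets. A plausible stand-in, assuming it only renders the per-environment return vector with a fixed number of decimals:

import numpy as np

def arrprint(arr, decimals: int = 2) -> str:
    """Format a 1-D array of per-environment returns as a compact string."""
    return np.array2string(np.asarray(arr), precision=decimals, separator=', ')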
Example 3
def gym_train(env, model, print_func: Callable[[str], None],
              begin_train_step: int, begin_frame_step: int, begin_episode: int,
              render: bool, render_episode: int, save_frequency: int,
              max_step_per_episode: int, max_train_episode: int,
              eval_while_train: bool, max_eval_episode: int,
              off_policy_step_eval_episodes: int,
              off_policy_train_interval: int, policy_mode: str,
              moving_average_episode: int, add_noise2buffer: bool,
              add_noise2buffer_episode_interval: int,
              add_noise2buffer_steps: int, off_policy_eval_interval: int,
              max_train_step: int, max_frame_step: int) -> NoReturn:
    """
    TODO: Annotation
    """

    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0

    for episode in range(begin_episode, max_train_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        rets = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(
                action)
            rets += (1 - dones_flag) * reward
            dones_flag = np.sign(dones_flag + done)
            model.store_data(s=state[0],
                             visual_s=state[1],
                             a=action,
                             r=reward,
                             s_=new_state[0],
                             visual_s_=new_state[1],
                             done=done)
            model.partial_reset(done)
            state[i] = correct_new_state

            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step,
                                          episode=episode,
                                          frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), model, train_step,
                                  off_policy_step_eval_episodes,
                                  max_step_per_episode)

            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
                logger.info(
                    f'End Training, learn step: {train_step}, frame_step: {frame_step}'
                )
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        sma.update(rets)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
        model.writer_summary(episode,
                             reward_mean=rets.mean(),
                             reward_min=rets.min(),
                             reward_max=rets.max(),
                             step=last_done_step,
                             **sma.rs)
        print_func('-' * 40, out_time=True)
        print_func(
            f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(rets, 2)}'
        )

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env,
                      model,
                      pre_fill_steps=add_noise2buffer_steps,
                      print_func=print_func,
                      prefill_choose=False,
                      desc='adding noise')

        if eval_while_train and env.reward_threshold is not None:
            if rets.max() >= env.reward_threshold:
                print_func(
                    f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                )
                gym_evaluate(env, model, max_step_per_episode,
                             max_eval_episode, print_func)
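A recurring idiom in these loops is the pair rets += (1 - dones_flag) * reward and dones_flag = np.sign(dones_flag + done): the vectorized environments keep stepping until all of them have reported done at least once, but an environment's rewards stop accumulating as soon as it finishes. A small standalone illustration with made-up numbers:

import numpy as np

rets = np.zeros(3)
dones_flag = np.zeros(3)

# step 1: no env is done yet, every reward counts
reward, done = np.array([1.0, 2.0, 3.0]), np.array([0, 1, 0])
rets += (1 - dones_flag) * reward        # -> [1., 2., 3.]
dones_flag = np.sign(dones_flag + done)  # -> [0., 1., 0.] (env 1 latched as done)

# step 2: env 1 already finished, so its reward is masked out
reward, done = np.array([1.0, 5.0, 1.0]), np.array([1, 0, 0])
rets += (1 - dones_flag) * reward        # -> [2., 2., 4.]
dones_flag = np.sign(dones_flag + done)  # -> [1., 1., 0.]

print(rets, dones_flag)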
Example 4
def ma_unity_train(env,
                   model,
                   print_func: Callable[[str], None],
                   begin_train_step: int,
                   begin_frame_step: int,
                   begin_episode: int,
                   max_train_step: int,
                   max_frame_step: int,
                   off_policy_train_interval: int,
                   moving_average_episode: int,
                   save_frequency: int,
                   max_step_per_episode: int,
                   max_train_episode: int,
                   policy_mode: str,
                   real_done: bool = True) -> NoReturn:
    assert policy_mode == 'off-policy', "multi-agent algorithms currently support off-policy training only."

    frame_step = begin_frame_step
    train_step = begin_train_step

    data_change_func = multi_agents_data_preprocess(env.env_copys,
                                                    env.brain_controls)
    action_reshape_func = multi_agents_action_reshape(env.env_copys,
                                                      env.brain_controls)
    agents_num_per_copy = sum(env.brain_controls)

    sma = [SMA(moving_average_episode) for _ in range(agents_num_per_copy)]

    for episode in range(begin_episode, max_train_episode):

        dones_flag = np.zeros(env.env_copys)
        rewards = np.zeros((agents_num_per_copy, env.env_copys))

        model.reset()
        s, visual_s, _, _, _ = env.reset()
        s, visual_s = map(data_change_func, [s, visual_s])

        step = 0
        last_done_step = -1
        while True:
            action = model.choose_action(
                s=s, visual_s=visual_s)  # [total_agents, batch, dimension]
            action = action_reshape_func(action)
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(env.brain_names)
            }
            s_, visual_s_, r, done, info = env.step(
                actions)  # [Brains, Agents, Dims]
            step += 1

            if real_done:
                done = [b['real_done'] for b in info]

            # [Agents_perCopy, Copys, Dims]
            action, r, done, s_, visual_s_ = map(
                data_change_func, [action, r, done, s_, visual_s_])
            done = np.sign(np.asarray(done).sum((0, 2)))  # [Copys,]

            rewards += np.asarray(r).reshape(-1,
                                             env.env_copys) * (1 - dones_flag)

            dones_flag = np.sign(dones_flag + done)
            model.store_data(*s, *visual_s, *action, *r, *s_, *visual_s_,
                             done[np.newaxis, :])
            model.partial_reset(done)
            s = s_
            visual_s = visual_s_

            if policy_mode == 'off-policy':
                if train_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step,
                                          episode=episode,
                                          frame_step=frame_step)

            frame_step += 1
            if 0 < max_train_step < train_step or 0 < max_frame_step < frame_step:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
                logger.info(
                    f'End Training, learn step: {train_step}, frame_step: {frame_step}'
                )
                return
            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        for i in range(agents_num_per_copy):
            sma[i].update(rewards[i])
            model.writer_summary(
                episode,
                agent_idx=i,
                reward_mean=rewards[i].mean(),
                reward_min=rewards[i].min(),
                reward_max=rewards[i].max(),
                # step=last_done_step,
                **sma[i].rs)

        print_func('-' * 40, out_time=True)
        print_func(
            f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
        )
        for i in range(agents_num_per_copy):
            print_func(f'agent {i} reward: {arrprint(rewards[i], 2)}')
Example 5
def unity_train(env, models, print_func: Callable[[str],
                                                  None], begin_train_step: int,
                begin_frame_step: int, begin_episode: int, save_frequency: int,
                max_step_per_episode: int, max_train_episode: int,
                policy_mode: str, moving_average_episode: int,
                add_noise2buffer: bool, add_noise2buffer_episode_interval: int,
                add_noise2buffer_steps: int, max_train_step: int,
                max_frame_step: int, real_done: bool,
                off_policy_train_interval: int) -> NoReturn:
    """
    TODO: Annotation
    Train loop. Execute until episode reaches its maximum or press 'ctrl+c' artificially.
    Inputs:
        env:                    Environment for interaction.
        models:                 all models for this training task.
        save_frequency:         how often to save checkpoints.
        max_step_per_episode:   maximum number of steps for an episode.
        resampling_interval:    how often to resample parameters for env reset.
    Variables:
        brain_names:    a list of brain names set in Unity.
        state: store    a list of states for each brain. each item contain a list of states for each agents that controlled by the same brain.
        visual_state:   store a list of visual state information for each brain.
        action:         store a list of actions for each brain.
        dones_flag:     store a list of 'done' for each brain. use for judge whether an episode is finished for every agents.
        rewards:        use to record rewards of agents for each brain.
    """

    state, visual_state, action, dones_flag, rewards = zeros_initializer(
        env.brain_num, 5)
    sma = [SMA(moving_average_episode) for i in range(env.brain_num)]
    frame_step = begin_frame_step
    min_of_all_agents = min(env.brain_agents)
    train_step = [begin_train_step for _ in range(env.brain_num)]

    for episode in range(begin_episode, max_train_episode):
        [model.reset() for model in models]
        ObsRewDone = zip(*env.reset())
        for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
            dones_flag[i] = np.zeros(env.brain_agents[i])
            rewards[i] = np.zeros(env.brain_agents[i])
            state[i] = _v
            visual_state[i] = _vs
        step = 0
        last_done_step = -1
        while True:
            step += 1
            for i in range(env.brain_num):
                action[i] = models[i].choose_action(s=state[i],
                                                    visual_s=visual_state[i])
            actions = {
                f'{brain_name}': action[i]
                for i, brain_name in enumerate(env.brain_names)
            }
            ObsRewDone = zip(*env.step(actions))

            for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
                models[i].store_data(
                    s=state[i],
                    visual_s=visual_state[i],
                    a=action[i],
                    r=_r,
                    s_=_v,
                    visual_s_=_vs,
                    done=_info['real_done'] if real_done else _d)
                models[i].partial_reset(_d)
                rewards[i] += (1 - dones_flag[i]) * _r
                dones_flag[i] = np.sign(dones_flag[i] + _d)
                state[i] = _v
                visual_state[i] = _vs
                if policy_mode == 'off-policy':
                    if train_step[i] % off_policy_train_interval == 0:
                        models[i].learn(episode=episode, train_step=train_step[i])
                    train_step[i] += 1
                    if train_step[i] % save_frequency == 0:
                        models[i].save_checkpoint(train_step=train_step[i],
                                                  episode=episode,
                                                  frame_step=frame_step)

            frame_step += min_of_all_agents
            if 0 < max_train_step < min(
                    train_step) or 0 < max_frame_step < frame_step:
                for i in range(env.brain_num):
                    models[i].save_checkpoint(train_step=train_step[i],
                                              episode=episode,
                                              frame_step=frame_step)
                logger.info(
                    f'End Training, learn step: {train_step}, frame_step: {frame_step}'
                )
                return

            if all([all(dones_flag[i]) for i in range(env.brain_num)]):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        for i in range(env.brain_num):
            sma[i].update(rewards[i])
            if policy_mode == 'on-policy':
                models[i].learn(episode=episode, train_step=train_step[i])
                train_step[i] += 1
                if train_step[i] % save_frequency == 0:
                    models[i].save_checkpoint(train_step=train_step[i],
                                              episode=episode,
                                              frame_step=frame_step)
            models[i].writer_summary(episode,
                                     reward_mean=rewards[i].mean(),
                                     reward_min=rewards[i].min(),
                                     reward_max=rewards[i].max(),
                                     step=last_done_step,
                                     **sma[i].rs)
        print_func('-' * 40, out_time=True)
        print_func(
            f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}'
        )
        for i, bn in enumerate(env.brain_names):
            print_func(f'{bn} reward: {arrprint(rewards[i], 2)}')

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            unity_no_op(env,
                        models,
                        print_func=print_func,
                        pre_fill_steps=add_noise2buffer_steps,
                        prefill_choose=False,
                        real_done=real_done,
                        desc='adding noise')
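zeros_initializer is another project-level helper not shown here. Judging by how its result is unpacked into five per-brain containers, a minimal sketch under that assumption could be:

def zeros_initializer(n: int, n_args: int):
    """Return `n_args` independent lists, each holding `n` zero placeholders (one per brain)."""
    return [[0] * n for _ in range(n_args)]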
Example 6
def gym_train(env, model,
              print_func: Callable[[str], None],
              begin_train_step: int,
              begin_frame_step: int,
              begin_episode: int,
              render: bool,
              render_episode: int,
              save_frequency: int,
              max_step_per_episode: int,
              max_train_episode: int,
              eval_while_train: bool,
              max_eval_episode: int,
              off_policy_step_eval_episodes: int,
              off_policy_train_interval: int,
              policy_mode: str,
              moving_average_episode: int,
              add_noise2buffer: bool,
              add_noise2buffer_episode_interval: int,
              add_noise2buffer_steps: int,
              off_policy_eval_interval: int,
              max_train_step: int,
              max_frame_step: int) -> NoReturn:
    """
    TODO: Annotation
    """

    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0

    for episode in range(begin_episode, max_train_episode):
        model.reset()
        obs = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        returns = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(obs=obs)
            ret = env.step(action)
            model.store_data(BatchExperiences(obs=obs,
                                              action=action,
                                              reward=ret.reward[:, np.newaxis],  # [B, ] => [B, 1]
                                              obs_=ret.obs,
                                              done=ret.done[:, np.newaxis]))
            model.partial_reset(ret.done)
            returns += (1 - dones_flag) * ret.reward
            dones_flag = np.sign(dones_flag + ret.done)
            obs = ret.corrected_obs

            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), model, train_step, off_policy_step_eval_episodes, max_step_per_episode)

            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        sma.update(returns)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
        model.writer_summary(
            episode,
            reward_mean=returns.mean(),
            reward_min=returns.min(),
            reward_max=returns.max(),
            step=last_done_step,
            **sma.rs
        )
        print_func(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(returns, 2)}', out_time=True)

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env, model, pre_fill_steps=add_noise2buffer_steps, prefill_choose=False, desc='adding noise')

        if eval_while_train and env.reward_threshold is not None:
            if returns.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step_per_episode, max_eval_episode, print_func)
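Example 6 is a newer variant of the gym training loop in which transitions are stored as a single BatchExperiences container instead of separate keyword arguments. The real type is defined elsewhere in the project; a minimal stand-in with just the fields used above, for reference:

from typing import NamedTuple
import numpy as np

class BatchExperiences(NamedTuple):
    """One batched transition, with one row per vectorized environment."""
    obs: np.ndarray
    action: np.ndarray
    reward: np.ndarray  # shape [B, 1]
    obs_: np.ndarray
    done: np.ndarray    # shape [B, 1]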