def run(self):
    n = self.env.n
    i = 1 if self.env.obs_type == 'visual' else 0
    state = [np.full((n, 0), []), np.full((n, 0), [])]
    sma = SMA(100)
    total_step = 0
    episode = 0
    while True:
        # periodically pull the latest parameters from the learner
        if episode % self.pull_interval == 0:
            self.model.set_worker_params(self.callback_func())
            logger.info('pulled parameters successfully.')
        episode += 1
        self.model.reset()
        state[i] = self.env.reset()
        dones_flag = np.zeros(self.env.n)
        step = 0
        rets = np.zeros(self.env.n)
        last_done_step = -1
        while True:
            step += 1
            # env.render(record=False)
            action = self.model.choose_action(s=state[0], visual_s=state[1])
            _, reward, done, info, state[i] = self.env.step(action)
            rets += (1 - dones_flag) * reward
            dones_flag = np.sign(dones_flag + done)
            self.model.partial_reset(done)
            total_step += 1
            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                break
            if step >= 200:
                break
        sma.update(rets)
        self.model.writer_summary(
            episode,
            reward_mean=rets.mean(),
            reward_min=rets.min(),
            reward_max=rets.max(),
            step=last_done_step,
            **sma.rs
        )
        logger.info(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(rets, 2)}')
        time.sleep(self.episode_sleep)
def evaluate(env, model):
    n = env.n
    i = 1 if env.obs_type == 'visual' else 0
    state = [np.full((n, 0), []), np.full((n, 0), [])]
    sma = SMA(100)
    total_step = 0
    episode = 0
    while True:
        episode += 1
        model.reset()
        state[i] = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        rets = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            # env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            _, reward, done, info, state[i] = env.step(action)
            rets += (1 - dones_flag) * reward
            dones_flag = np.sign(dones_flag + done)
            model.partial_reset(done)
            total_step += 1
            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                break
            if step >= 200:
                break
        sma.update(rets)
        model.writer_summary(
            episode,
            reward_mean=rets.mean(),
            reward_min=rets.min(),
            reward_max=rets.max(),
            step=last_done_step,
            **sma.rs
        )
        print(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(rets, 2)}')
        time.sleep(5)
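# The two loops above share the same per-agent bookkeeping idiom:
# `rets += (1 - dones_flag) * reward` stops accumulating reward for any vectorized
# sub-environment once it has finished, and `dones_flag = np.sign(dones_flag + done)`
# keeps that flag latched at 1 for the rest of the episode.
# A minimal, self-contained sketch of that idiom (plain numpy, hypothetical numbers,
# not part of the training code):
def _demo_return_masking():
    import numpy as np
    dones_flag = np.zeros(3)                     # three parallel copies, none finished yet
    rets = np.zeros(3)
    for reward, done in [(np.array([1., 1., 1.]), np.array([0, 1, 0])),
                         (np.array([1., 5., 1.]), np.array([0, 0, 1]))]:
        rets += (1 - dones_flag) * reward        # finished copies receive no further reward
        dones_flag = np.sign(dones_flag + done)  # latch 'done' once it happens
    assert (rets == np.array([2., 1., 2.])).all()  # copy 1's late reward of 5 was masked out
    assert not all(dones_flag)                     # copy 0 is still running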
def gym_train(env, model,
              print_func: Callable[[str], None],
              begin_train_step: int,
              begin_frame_step: int,
              begin_episode: int,
              render: bool,
              render_episode: int,
              save_frequency: int,
              max_step_per_episode: int,
              max_train_episode: int,
              eval_while_train: bool,
              max_eval_episode: int,
              off_policy_step_eval_episodes: int,
              off_policy_train_interval: int,
              policy_mode: str,
              moving_average_episode: int,
              add_noise2buffer: bool,
              add_noise2buffer_episode_interval: int,
              add_noise2buffer_steps: int,
              off_policy_eval_interval: int,
              max_train_step: int,
              max_frame_step: int) -> NoReturn:
    """
    Train loop for gym-type environments (state/visual-state interface).
    Runs until `max_train_episode`, `max_train_step`, or `max_frame_step` is reached.
    """
    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0

    for episode in range(begin_episode, max_train_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        rets = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(action)
            rets += (1 - dones_flag) * reward
            dones_flag = np.sign(dones_flag + done)
            model.store_data(
                s=state[0],
                visual_s=state[1],
                a=action,
                r=reward,
                s_=new_state[0],
                visual_s_=new_state[1],
                done=done
            )
            model.partial_reset(done)
            state[i] = correct_new_state

            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                    if train_step % save_frequency == 0:
                        model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), model, train_step, off_policy_step_eval_episodes, max_step_per_episode)

            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        sma.update(rets)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
        model.writer_summary(
            episode,
            reward_mean=rets.mean(),
            reward_min=rets.min(),
            reward_max=rets.max(),
            step=last_done_step,
            **sma.rs
        )
        print_func('-' * 40, out_time=True)
        print_func(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(rets, 2)}')

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env, model, pre_fill_steps=add_noise2buffer_steps, print_func=print_func,
                      prefill_choose=False, desc='adding noise')

        if eval_while_train and env.reward_threshold is not None:
            if rets.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step_per_episode, max_eval_episode, print_func)
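# In `gym_train`, the learning cadence depends on `policy_mode`: off-policy models call
# `learn` every `off_policy_train_interval` environment steps inside the episode loop,
# while on-policy models call `learn` exactly once per episode, after the rollout has
# been collected. A stripped-down sketch of that control flow (all names and numbers
# below are placeholders, not the library's API):
def _demo_policy_mode_cadence(policy_mode, steps_per_episode=6, off_policy_train_interval=2):
    learn_calls = 0
    for total_step in range(steps_per_episode):        # stands in for the inner `while True` loop
        if policy_mode == 'off-policy' and total_step % off_policy_train_interval == 0:
            learn_calls += 1                            # model.learn(...) every N env steps
    if policy_mode == 'on-policy':
        learn_calls += 1                                # model.learn(...) once per rollout
    return learn_calls

# _demo_policy_mode_cadence('off-policy') -> 3; _demo_policy_mode_cadence('on-policy') -> 1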
def ma_unity_train(env, model,
                   print_func: Callable[[str], None],
                   begin_train_step: int,
                   begin_frame_step: int,
                   begin_episode: int,
                   max_train_step: int,
                   max_frame_step: int,
                   off_policy_train_interval: int,
                   moving_average_episode: int,
                   save_frequency: int,
                   max_step_per_episode: int,
                   max_train_episode: int,
                   policy_mode: str,
                   real_done: bool = True) -> NoReturn:
    assert policy_mode == 'off-policy', "multi-agent algorithms currently support off-policy only."

    frame_step = begin_frame_step
    train_step = begin_train_step

    data_change_func = multi_agents_data_preprocess(env.env_copys, env.brain_controls)
    action_reshape_func = multi_agents_action_reshape(env.env_copys, env.brain_controls)
    agents_num_per_copy = sum(env.brain_controls)

    sma = [SMA(moving_average_episode) for _ in range(agents_num_per_copy)]

    for episode in range(begin_episode, max_train_episode):
        dones_flag = np.zeros(env.env_copys)
        rewards = np.zeros((agents_num_per_copy, env.env_copys))

        model.reset()
        s, visual_s, _, _, _ = env.reset()
        s, visual_s = map(data_change_func, [s, visual_s])

        step = 0
        last_done_step = -1
        while True:
            action = model.choose_action(s=s, visual_s=visual_s)    # [total_agents, batch, dimension]
            action = action_reshape_func(action)
            actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.brain_names)}
            s_, visual_s_, r, done, info = env.step(actions)    # [Brains, Agents, Dims]
            step += 1

            if real_done:
                done = [b['real_done'] for b in info]

            # [Agents_perCopy, Copys, Dims]
            action, r, done, s_, visual_s_ = map(data_change_func, [action, r, done, s_, visual_s_])
            done = np.sign(np.asarray(done).sum((0, 2)))    # [Copys,]

            rewards += np.asarray(r).reshape(-1, env.env_copys) * (1 - dones_flag)
            dones_flag = np.sign(dones_flag + done)
            model.store_data(*s, *visual_s, *action, *r, *s_, *visual_s_, done[np.newaxis, :])
            model.partial_reset(done)
            s = s_
            visual_s = visual_s_

            if policy_mode == 'off-policy':
                if train_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                train_step += 1
                if train_step % save_frequency == 0:
                    model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)

            frame_step += 1
            if 0 < max_train_step < train_step or 0 < max_frame_step < frame_step:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        for i in range(agents_num_per_copy):
            sma[i].update(rewards[i])
            model.writer_summary(
                episode,
                agent_idx=i,
                reward_mean=rewards[i].mean(),
                reward_min=rewards[i].min(),
                reward_max=rewards[i].max(),
                # step=last_done_step,
                **sma[i].rs)
        print_func('-' * 40, out_time=True)
        print_func(f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}')
        for i in range(agents_num_per_copy):
            print_func(f'agent {i} reward: {arrprint(rewards[i], 2)}')
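# In `ma_unity_train`, `done` arrives shaped [Agents_perCopy, Copys, Dims] after
# `data_change_func`, and `np.sign(np.asarray(done).sum((0, 2)))` collapses it to one flag
# per environment copy: a copy counts as done as soon as *any* of its agents reports done.
# A small numpy-only sketch with made-up shapes (2 agents per copy, 3 copies, 1-dim flag):
def _demo_ma_done_collapse():
    import numpy as np
    done = np.array([[[0], [1], [0]],
                     [[0], [0], [0]]])          # [Agents_perCopy, Copys, Dims]
    per_copy_done = np.sign(done.sum((0, 2)))   # -> [Copys,]
    assert (per_copy_done == np.array([0, 1, 0])).all()  # only copy 1 is finished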
def unity_train(env, models,
                print_func: Callable[[str], None],
                begin_train_step: int,
                begin_frame_step: int,
                begin_episode: int,
                save_frequency: int,
                max_step_per_episode: int,
                max_train_episode: int,
                policy_mode: str,
                moving_average_episode: int,
                add_noise2buffer: bool,
                add_noise2buffer_episode_interval: int,
                add_noise2buffer_steps: int,
                max_train_step: int,
                max_frame_step: int,
                real_done: bool,
                off_policy_train_interval: int) -> NoReturn:
    """
    Train loop. Executes until `episode` reaches its maximum, or until training is interrupted manually with 'ctrl+c'.
    Inputs:
        env:                    environment for interaction.
        models:                 all models for this training task.
        save_frequency:         how often to save checkpoints.
        max_step_per_episode:   maximum number of steps in an episode.
        resampling_interval:    how often to resample parameters for env reset.
    Variables:
        brain_names:    a list of the brain names set in Unity.
        state:          a list of states for each brain; each item holds the states of all agents controlled by that brain.
        visual_state:   a list of visual-state information for each brain.
        action:         a list of actions for each brain.
        dones_flag:     a list of 'done' flags for each brain, used to judge whether an episode has finished for every agent.
        rewards:        records the rewards of the agents of each brain.
    """
    state, visual_state, action, dones_flag, rewards = zeros_initializer(env.brain_num, 5)
    sma = [SMA(moving_average_episode) for i in range(env.brain_num)]
    frame_step = begin_frame_step
    min_of_all_agents = min(env.brain_agents)
    train_step = [begin_train_step for _ in range(env.brain_num)]

    for episode in range(begin_episode, max_train_episode):
        [model.reset() for model in models]
        ObsRewDone = zip(*env.reset())
        for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
            dones_flag[i] = np.zeros(env.brain_agents[i])
            rewards[i] = np.zeros(env.brain_agents[i])
            state[i] = _v
            visual_state[i] = _vs
        step = 0
        last_done_step = -1
        while True:
            step += 1
            for i in range(env.brain_num):
                action[i] = models[i].choose_action(s=state[i], visual_s=visual_state[i])
            actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.brain_names)}
            ObsRewDone = zip(*env.step(actions))

            for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
                models[i].store_data(
                    s=state[i],
                    visual_s=visual_state[i],
                    a=action[i],
                    r=_r,
                    s_=_v,
                    visual_s_=_vs,
                    done=_info['real_done'] if real_done else _d
                )
                models[i].partial_reset(_d)
                rewards[i] += (1 - dones_flag[i]) * _r
                dones_flag[i] = np.sign(dones_flag[i] + _d)
                state[i] = _v
                visual_state[i] = _vs
                if policy_mode == 'off-policy':
                    if train_step[i] % off_policy_train_interval == 0:
                        models[i].learn(episode=episode, train_step=train_step[i])
                    train_step[i] += 1
                    if train_step[i] % save_frequency == 0:
                        models[i].save_checkpoint(train_step=train_step[i], episode=episode, frame_step=frame_step)

            frame_step += min_of_all_agents
            if 0 < max_train_step < min(train_step) or 0 < max_frame_step < frame_step:
                for i in range(env.brain_num):
                    models[i].save_checkpoint(train_step=train_step[i], episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return

            if all([all(dones_flag[i]) for i in range(env.brain_num)]):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        for i in range(env.brain_num):
            sma[i].update(rewards[i])
            if policy_mode == 'on-policy':
                models[i].learn(episode=episode, train_step=train_step[i])
                train_step[i] += 1
                if train_step[i] % save_frequency == 0:
                    models[i].save_checkpoint(train_step=train_step[i], episode=episode, frame_step=frame_step)
            models[i].writer_summary(
                episode,
                reward_mean=rewards[i].mean(),
                reward_min=rewards[i].min(),
                reward_max=rewards[i].max(),
                step=last_done_step,
                **sma[i].rs)
        print_func('-' * 40, out_time=True)
        print_func(f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}')
        for i, bn in enumerate(env.brain_names):
            print_func(f'{bn} reward: {arrprint(rewards[i], 2)}')

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            unity_no_op(env, models, print_func=print_func, pre_fill_steps=add_noise2buffer_steps,
                        prefill_choose=False, real_done=real_done, desc='adding noise')
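# `unity_train` relies on `zip(*env.reset())` / `zip(*env.step(actions))` to transpose the
# environment's field-major return value (per-field lists indexed by brain) into per-brain
# tuples `(state, visual_state, reward, done, info)`. A tiny sketch with fabricated data
# illustrating that transposition (not the library's actual return types):
def _demo_zip_transpose():
    states = ['s_brain0', 's_brain1']
    visuals = ['v_brain0', 'v_brain1']
    rewards = [1.0, 2.0]
    dones = [False, True]
    infos = [{'real_done': False}, {'real_done': True}]
    # equivalent to `zip(*(states, visuals, rewards, dones, infos))`
    per_brain = list(zip(states, visuals, rewards, dones, infos))
    assert per_brain[1] == ('s_brain1', 'v_brain1', 2.0, True, {'real_done': True})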
def gym_train(env, model,
              print_func: Callable[[str], None],
              begin_train_step: int,
              begin_frame_step: int,
              begin_episode: int,
              render: bool,
              render_episode: int,
              save_frequency: int,
              max_step_per_episode: int,
              max_train_episode: int,
              eval_while_train: bool,
              max_eval_episode: int,
              off_policy_step_eval_episodes: int,
              off_policy_train_interval: int,
              policy_mode: str,
              moving_average_episode: int,
              add_noise2buffer: bool,
              add_noise2buffer_episode_interval: int,
              add_noise2buffer_steps: int,
              off_policy_eval_interval: int,
              max_train_step: int,
              max_frame_step: int) -> NoReturn:
    """
    Train loop for gym-type environments using the unified `obs` / `BatchExperiences` interface.
    Runs until `max_train_episode`, `max_train_step`, or `max_frame_step` is reached.
    """
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0

    for episode in range(begin_episode, max_train_episode):
        model.reset()
        obs = env.reset()
        dones_flag = np.zeros(env.n)
        step = 0
        returns = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(obs=obs)
            ret = env.step(action)
            model.store_data(BatchExperiences(obs=obs,
                                              action=action,
                                              reward=ret.reward[:, np.newaxis],    # [B, ] => [B, 1]
                                              obs_=ret.obs,
                                              done=ret.done[:, np.newaxis]))
            model.partial_reset(ret.done)
            returns += (1 - dones_flag) * ret.reward
            dones_flag = np.sign(dones_flag + ret.done)
            obs = ret.corrected_obs

            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                    if train_step % save_frequency == 0:
                        model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                    gym_step_eval(deepcopy(env), model, train_step, off_policy_step_eval_episodes, max_step_per_episode)

            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        sma.update(returns)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
        model.writer_summary(
            episode,
            reward_mean=returns.mean(),
            reward_min=returns.min(),
            reward_max=returns.max(),
            step=last_done_step,
            **sma.rs
        )
        print_func(f'Eps: {episode:3d} | S: {step:4d} | LDS {last_done_step:4d} | R: {arrprint(returns, 2)}', out_time=True)

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env, model, pre_fill_steps=add_noise2buffer_steps, prefill_choose=False, desc='adding noise')

        if eval_while_train and env.reward_threshold is not None:
            if returns.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step_per_episode, max_eval_episode, print_func)
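# Both gym train loops stop on the chained comparison
# `0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step`, so passing a
# limit of 0 (or any non-positive value) disables that particular bound. A brief sketch
# of that behaviour (hypothetical helper and numbers, for illustration only):
def _demo_termination_guard():
    def should_stop(train_step, frame_step, max_train_step=0, max_frame_step=0):
        return 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step

    assert not should_stop(train_step=10**6, frame_step=10**9)             # both limits disabled
    assert should_stop(train_step=500, frame_step=0, max_train_step=500)   # train-step limit reached
    assert not should_stop(train_step=499, frame_step=0, max_train_step=500)  # not reached yet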