def train_agent(agent, env, steps, outdir, max_episode_len=None,
                step_offset=0, evaluator=None, successful_score=None,
                step_hooks=[], logger=None):

    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    r = 0

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:
            # a_t
            action = agent.act_and_train(obs, r)
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            t += 1
            episode_r += r
            episode_len += 1

            for hook in step_hooks:
                hook(env, agent, t)

            if done or episode_len == max_episode_len or t == steps:
                agent.stop_episode_and_train(obs, r, done=done)
                logger.info('outdir:%s step:%s episode:%s R:%s',
                            outdir, t, episode_idx, episode_r)
                logger.info('statistics:%s', agent.get_statistics())
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(
                        t=t, episodes=episode_idx + 1)
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0
    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix='_finish')
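# `save_agent` is used throughout this module but never defined here. A
# minimal sketch of what it plausibly does, assuming a ChainerRL-style
# agent that exposes `agent.save(dirname)`; the directory naming below is
# an assumption, not the confirmed implementation.
import os

def save_agent(agent, t, outdir, logger, suffix=''):
    # Save the agent's model (and optimizer state) under e.g. 'outdir/2000_finish'
    dirname = os.path.join(outdir, '{}{}'.format(t, suffix))
    agent.save(dirname)
    logger.info('Saved the agent to %s', dirname)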
def train_agent(agent, env, steps, outdir, max_episode_len=None,
                step_offset=0, evaluator=None, successful_score=None):

    episode_r = 0
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    r = 0
    done = False

    t = step_offset
    agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:
            # a_t
            action = agent.act_and_train(obs, r)
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            t += 1
            episode_r += r
            episode_len += 1

            if done or episode_len == max_episode_len or t == steps:
                agent.stop_episode_and_train(obs, r, done=done)
                print('outdir:{} step:{} episode:{} R:{}'.format(
                    outdir, t, episode_idx, episode_r))
                print('statistics:{}'.format(agent.get_statistics()))
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(t)
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0
                done = False
    except Exception:
        # Save the current model before being killed
        save_agent(agent, t, outdir, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, suffix='_finish')
def sending_mission_quit_commands(self, overall_reward_agent_Tom,
                                  overall_reward_agent_Jerry, time_step,
                                  obs1, r1, obs2, r2, outdir, t, tom, jerry,
                                  experiment_ID):
    self.agent_host1.sendCommand("quit")
    self.agent_host2.sendCommand("quit")
    self.agent_host3.sendCommand("quit")

    dirname = os.path.join(outdir, 'plots')
    print("dirname: ", dirname)

    # save and show the results of the reward calculations
    self.save_results(overall_reward_agent_Tom, overall_reward_agent_Jerry,
                      time_step)
    print("Final Reward Tom: ", overall_reward_agent_Tom)
    print("Final Reward Jerry: ", overall_reward_agent_Jerry)

    # end the episode and train on the final transition
    tom.stop_episode_and_train(obs1, r1, done=True)
    jerry.stop_episode_and_train(obs2, r2, done=True)
    print("outdir: %s step: %s " % (outdir, t))
    print("Tom's statistics: ", tom.get_statistics())
    print("Jerry's statistics: ", jerry.get_statistics())

    # save the final models and results
    # (`logger` is assumed to be a module-level logging.Logger here)
    save_agent(tom, t, outdir, logger, suffix='_finish_01')
    save_agent(jerry, t, outdir, logger, suffix='_finish_02')

    # save all the collected data for the evaluation graphs
    self.save_data_for_evaluation_plots(t, time_step,
                                        overall_reward_agent_Tom,
                                        overall_reward_agent_Jerry, dirname)
    time.sleep(2)

    # initialisation for the next episode: reset parameters, build a new world
    t += 1
    self.episode_counter += 1
    r1 = r2 = 0
    done1 = done2 = self.mission_end = False
    overall_reward_agent_Jerry = overall_reward_agent_Tom = 0
    self.save_new_round(t)
    obs1, obs2 = self.reset_world(experiment_ID)
    self.too_close_counter = 0
    self.winner_agent = "-"
    self.time_step_tom_won = self.time_step_jerry_won = None
    self.time_step_tom_captured_the_flag = None
    self.time_step_jerry_captured_the_flag = None
    self.time_step_agents_ran_into_each_other = None
    self.steps_tom = 0
    self.steps_jerry = 0

    # recover
    # if evaluator1 and evaluator2 is not None:
    #     evaluator1.evaluate_if_necessary(t=t, episodes=episode_idx + 1)
    #     evaluator2.evaluate_if_necessary(t=t, episodes=episode_idx + 1)
    #     if (successful_score is not None
    #             and evaluator1.max_score >= successful_score
    #             and evaluator2.max_score >= successful_score):
    #         break

    return (t, obs1, obs2, r1, r2, done1, done2,
            overall_reward_agent_Jerry, overall_reward_agent_Tom)
def train_agent_batch(agent, env, steps, outdir, log_interval=None,
                      max_episode_len=None, eval_interval=None,
                      step_offset=0, evaluator=None, successful_score=None,
                      step_hooks=[], return_window_size=100, logger=None):
    """Train an agent in a batch environment.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to
            estimate the average returns of the current agent.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value, if not None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.
    """

    logger = logger or logging.getLogger(__name__)
    recent_returns = deque(maxlen=return_window_size)

    num_envs = env.num_envs
    episode_r = np.zeros(num_envs, dtype=np.float64)
    episode_idx = np.zeros(num_envs, dtype='i')
    episode_len = np.zeros(num_envs, dtype='i')

    # o_0, r_0
    obss = env.reset()
    rs = np.zeros(num_envs, dtype='f')

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    try:
        while True:
            # a_t
            actions = agent.batch_act_and_train(obss)
            # o_{t+1}, r_{t+1}
            obss, rs, dones, infos = env.step(actions)
            episode_r += rs
            episode_len += 1

            # Compute mask for done and reset
            if max_episode_len is None:
                resets = np.zeros(num_envs, dtype=bool)
            else:
                resets = (episode_len == max_episode_len)
            resets = np.logical_or(
                resets, [info.get('needs_reset', False) for info in infos])
            # Agent observes the consequences
            agent.batch_observe_and_train(obss, rs, dones, resets)

            # Make mask. 0 if done/reset, 1 if pass
            end = np.logical_or(resets, dones)
            not_end = np.logical_not(end)

            # For episodes that end, do the following:
            #   1. increment the episode count
            #   2. record the return
            #   3. clear the record of rewards
            #   4. clear the record of the number of steps
            #   5. reset the env to start a new episode
            # 3-5 are skipped when training is already finished.
            episode_idx += end
            recent_returns.extend(episode_r[end])

            for _ in range(num_envs):
                t += 1
                for hook in step_hooks:
                    hook(env, agent, t)

            if (log_interval is not None
                    and t >= log_interval
                    and t % log_interval < num_envs):
                logger.info(
                    'outdir:{} step:{} episode:{} last_R: {} average_R:{}'.format(  # NOQA
                        outdir,
                        t,
                        np.sum(episode_idx),
                        recent_returns[-1] if recent_returns else np.nan,
                        np.mean(recent_returns) if recent_returns else np.nan,
                    ))
                logger.info('statistics: {}'.format(agent.get_statistics()))
            if evaluator:
                if evaluator.evaluate_if_necessary(
                        t=t, episodes=np.sum(episode_idx)):
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
            if t >= steps:
                break

            # Start new episodes if needed
            episode_r[end] = 0
            episode_len[end] = 0
            obss = env.reset(not_end)
    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        env.close()
        if evaluator:
            evaluator.env.close()
        raise
    else:
        # Save the final model
        save_agent(agent, t, outdir, logger, suffix='_finish')
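# A minimal sketch of a step hook compatible with the `step_hooks`
# argument documented above: any callable accepting (env, agent, step).
# The `agent.optimizer.lr` attribute used here is an assumption (it holds
# for Chainer-based agents); adapt it to the agent actually trained.
class LinearLRDecayHook(object):
    """Linearly anneal the learning rate from `start` to `end` over `total_steps`."""

    def __init__(self, total_steps, start, end):
        self.total_steps = total_steps
        self.start = start
        self.end = end

    def __call__(self, env, agent, step):
        frac = min(step / self.total_steps, 1.0)
        agent.optimizer.lr = self.start + frac * (self.end - self.start)

# Usage: train_agent_batch(..., step_hooks=[LinearLRDecayHook(10 ** 6, 2.5e-4, 0.0)])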
def train_agent(agent, env, steps, outdir, max_episode_len=None,
                step_offset=0, evaluator=None, successful_score=None,
                step_hooks=[], episode_hooks=[], logger=None):

    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    last_obs = []
    r = 0
    done = False

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:
            # a_t
            action = agent.act_and_train(obs, r)
            last_obs = copy.copy(obs)
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            # `path` is assumed to be defined at module level; the original
            # format string had one placeholder for three arguments.
            with open(path, 'a+') as f:
                f.write("episode {} step {} action is {}\n".format(
                    episode_idx, episode_len, action))
                f.write("obs is {}\n".format(obs - last_obs))
            t += 1
            episode_r += r
            episode_len += 1

            for hook in step_hooks:
                hook(env, agent, t)

            if done or episode_len == max_episode_len or t == steps:
                with open(path, 'a+') as f:
                    f.write("done\n")
                agent.stop_episode_and_train(obs, r, done=done)
                logger.info('outdir:%s step:%s episode:%s R:%s',
                            outdir, t, episode_idx, episode_r)
                logger.info('statistics:%s', agent.get_statistics())
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(
                        t=t, episodes=episode_idx + 1)
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                for hook in episode_hooks:
                    hook(env, agent, episode_idx, episode_len)
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0
                done = False
    except Exception:
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix='_finish')
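# Why `last_obs = copy.copy(obs)` above rather than `last_obs = obs`: if
# the environment mutates and returns the same array object, a bare
# reference would make the logged delta `obs - last_obs` all zeros.
# copy.copy on a NumPy array allocates an independent buffer.
import copy
import numpy as np

obs = np.array([1.0, 2.0])
last_obs = copy.copy(obs)  # independent buffer, not an alias
obs += 1.0                 # in-place mutation, as an env might do
print(obs - last_obs)      # [1. 1.] instead of [0. 0.]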
def parallel_train_agent_batch(start_weighted_size, all_agents, env, steps,
                               outdir, log_interval=None,
                               max_episode_len=None, eval_interval=None,
                               step_offset=0, evaluator=None,
                               before_evaluator=None, successful_score=None,
                               step_hooks=[], return_window_size=100,
                               logger=None, step_callback=None,
                               schedule_args={}):
    """Train a set of agents in a batch environment.

    Args:
        all_agents (list): Agents to train; one is selected to act at a time.
        env: Environment to train the agents against.
        steps (int): Number of total time steps for training.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to
            estimate the average returns of the current agent.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value, if not None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.
    """

    logger = logger or logging.getLogger(__name__)

    # TODO: set a buffer for recent returns
    n_agents = len(all_agents)
    select_probs = np.array([1 / n_agents] * n_agents)
    agent_ids = [id(agent) for agent in all_agents]
    assert len(np.unique(agent_ids)) == n_agents

    all_recent_returns = [
        deque(maxlen=return_window_size) for i in range(len(all_agents))
    ]
    all_eval_returns = [0 for i in range(len(all_agents))]

    num_envs = env.num_envs
    assert num_envs == 1
    episode_r = np.zeros(num_envs, dtype=np.float64)
    episode_idx = np.zeros(num_envs, dtype='i')
    episode_len = np.zeros(num_envs, dtype='i')

    # o_0, r_0
    obss = env.reset()
    rs = np.zeros(num_envs, dtype='f')

    t = step_offset

    # TODO: initialize an agent index for the first episode
    # (initially we use a uniform distribution)
    perf_metric = schedule_args['perf_metric']
    selected_agent_idx = np.random.randint(low=0, high=len(all_agents))
    logger.info('selected_idx: {}'.format(selected_agent_idx))

    try:
        while True:
            # TODO: take the actions from the selected agent
            agent = all_agents[selected_agent_idx]
            # a_t
            actions = agent.batch_act_and_train(obss)
            # HACK: set the other agents' batch_last_obs to [None] so only
            # the selected agent stores this transition
            for idx in range(n_agents):
                if idx != selected_agent_idx:
                    all_agents[idx].batch_last_obs = [None] * num_envs

            # o_{t+1}, r_{t+1}
            obss_next, rs, dones, infos = env.step(actions)
            obss = obss_next
            episode_r += rs
            episode_len += 1

            # Compute mask for done and reset
            if max_episode_len is None:
                resets = np.zeros(num_envs, dtype=bool)
            else:
                resets = (episode_len == max_episode_len)
            resets = np.logical_or(
                resets, [info.get('needs_reset', False) for info in infos])

            # Agents observe the consequences
            for idx, agent_in_list in enumerate(all_agents):
                # Only the selected agent will add to its replay buffer.
                # The others just update.
                agent_in_list.batch_observe_and_train(obss, rs, dones, resets)
                logger.debug('agent_{}: t = {}, selected_idx = {}'.format(
                    idx, agent_in_list.t, selected_agent_idx))
            after_ts = [agent_in_list.t for agent_in_list in all_agents]
            assert len(np.unique(after_ts)) == 1

            # Make mask. 0 if done/reset, 1 if pass
            end = np.logical_or(resets, dones)
            not_end = np.logical_not(end)

            if before_evaluator:
                before_max_mean, before_all_means = \
                    before_evaluator.evaluate_if_necessary(
                        t=t + 1, episodes=np.sum(episode_idx))

            if step_callback is not None:
                new_select_probs = step_callback(
                    t, all_recent_returns, all_eval_returns, np.any(end))
                if (new_select_probs is not None
                        and schedule_args['select_prob_update'] == 'interval'):
                    select_probs = new_select_probs

            # For episodes that end, do the following:
            #   1. increment the episode count
            #   2. record the return
            #   3. clear the record of rewards
            #   4. clear the record of the number of steps
            #   5. reset the env to start a new episode
            # 3-5 are skipped when training is already finished.
            episode_idx += end
            # TODO: append to the selected returns
            all_recent_returns[selected_agent_idx].extend(episode_r[end])

            for _ in range(num_envs):
                t += 1
                for hook in step_hooks:
                    hook(env, agent, t)

            if (log_interval is not None
                    and t >= log_interval
                    and t % log_interval < num_envs):
                logger.info(
                    'outdir:{} agent_idx: {} step:{} episode:{} last_R: {} average_R:{}'.format(  # NOQA
                        outdir,
                        selected_agent_idx,
                        t,
                        np.sum(episode_idx),
                        all_recent_returns[selected_agent_idx][-1]
                        if all_recent_returns[selected_agent_idx] else np.nan,
                        np.mean(all_recent_returns[selected_agent_idx])
                        if all_recent_returns[selected_agent_idx] else np.nan,
                    ))
                logger.info('statistics: {}'.format(agent.get_statistics()))
            if evaluator:
                max_mean, all_means = evaluator.evaluate_if_necessary(
                    t=t, episodes=np.sum(episode_idx))
                all_eval_returns = all_means
                if max_mean:
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
            if t >= steps:
                break

            # Start new episodes if needed
            episode_r[end] = 0
            episode_len[end] = 0
            obss = env.reset(not_end)

            # TODO: update the selected agent index
            if np.any(end):
                logger.debug('selected_agent.last_obs: {}'.format(
                    agent.batch_last_obs))
                # TODO: weighted selection, uniform, or eps-greedy
                if (t < start_weighted_size
                        or schedule_args['select_algo'] == 'uniform'):
                    # prevent selection before any evaluation has run
                    if t < start_weighted_size:
                        all_mean_returns = '(warm up)'
                    else:
                        if perf_metric == 'train':
                            all_mean_returns = np.asarray([
                                np.mean(recent_returns)
                                for recent_returns in all_recent_returns
                            ])
                        else:
                            all_mean_returns = np.asarray(all_eval_returns)
                    logger.info(
                        't: {} (new agent idx) selection prob.: (uniform), '
                        'mean returns: {}'.format(t, all_mean_returns))
                    selected_agent_idx = np.random.randint(
                        low=0, high=len(all_agents))
                else:
                    if perf_metric == 'train':
                        all_mean_returns = np.asarray([
                            np.mean(recent_returns)
                            for recent_returns in all_recent_returns
                        ])
                    else:
                        all_mean_returns = np.asarray(all_eval_returns)
                    # determine sample or greedy using eps or algo
                    if schedule_args['select_algo'] == 'eps-greedy':
                        eps = schedule_args['eps_schedule'].value(t)
                        use_sample = (np.random.random() < eps)
                        logger.info(
                            't: {}, eps: {}, sample: {}, mean returns: {}'.format(
                                t, eps, use_sample, all_mean_returns))
                        selected_agent_idx = (
                            np.random.choice(n_agents, 1)[0] if use_sample
                            else np.argmax(all_mean_returns))
                    else:
                        temp = 1.0
                        if 'select_prob_temp' in schedule_args:
                            temp = schedule_args['select_prob_temp']
                            logger.info('use temperature: {}'.format(
                                schedule_args['select_prob_temp']))
                        if schedule_args['select_prob_update'] == 'episode':
                            select_probs = softmax(all_mean_returns / temp)
                        else:
                            logger.info('Use periodically update')
                        logger.info(
                            't: {} (new agent idx) selection prob.: {}, '
                            'mean returns: {}'.format(
                                t, select_probs, all_mean_returns))
                        selected_agent_idx = np.random.choice(
                            n_agents, 1, p=select_probs)[0]
                logger.info('t: {} selected_idx: {}'.format(
                    t, selected_agent_idx))
            # TODO: Call step callbacks
    except (Exception, KeyboardInterrupt):
        # Save the current models before being killed
        for idx, agent in enumerate(all_agents):
            save_agent(agent, t, outdir, logger,
                       suffix='_except_{}'.format(idx))
        env.close()
        if evaluator:
            evaluator.env.close()
        raise
    else:
        # Save the final models
        for idx, agent in enumerate(all_agents):
            save_agent(agent, t, outdir, logger,
                       suffix='_finish_{}'.format(idx))
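# A self-contained sketch of the temperature-weighted agent selection in
# the scheduler branch above (`softmax` there is assumed to behave like
# scipy.special.softmax). Subtracting the max keeps the exponentials stable.
import numpy as np

def select_agent_softmax(mean_returns, temp=1.0, rng=None):
    """Sample an agent index with probability softmax(mean_returns / temp)."""
    rng = rng or np.random.default_rng()
    z = np.asarray(mean_returns, dtype=np.float64) / temp
    z -= z.max()  # numerical stability
    probs = np.exp(z) / np.exp(z).sum()
    return rng.choice(len(probs), p=probs), probs

# Higher-return agents are picked more often; a larger `temp` flattens
# the distribution toward uniform selection.
idx, probs = select_agent_softmax([1.0, 2.0, 0.5], temp=1.0)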
def train_agent_batch(agent, env, steps, outdir, log_interval=None,
                      max_episode_len=None, eval_interval=None,
                      step_offset=0, evaluator=None, successful_score=None,
                      step_hooks=[], return_window_size=100, logger=None,
                      use_humans_reward=False, humans_reward_interval=2048):
    """Train an agent in a batch environment.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to
            estimate the average returns of the current agent.
        successful_score (float): Finish training if the mean score is
            greater than or equal to this value, if not None.
        step_hooks (list): List of callable objects that accept
            (env, agent, step) as arguments. They are called every step.
            See chainerrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.
        use_humans_reward: Boolean for whether or not to use the reward
            from human sessions.
        humans_reward_interval: Number of episodes between reevaluations
            of agent performance.
    """

    env_prop = update_global_env_prop_from_cfg()
    schema_name = SchemaName(cfg.schema)
    agent_humanity_rate = -1
    log_idx = 0

    logger = logger or logging.getLogger(__name__)
    arch = ArchName(cfg.arch)
    recent_returns = deque(maxlen=return_window_size)
    recent_returns_without_humanity = deque(maxlen=return_window_size)
    recent_rewards = defaultdict(lambda: deque(maxlen=return_window_size))

    # a deque that stores the entropy values of the action distributions
    # seen during training
    entropy_values = deque(maxlen=100)

    # a counter for the number of actions of each type
    action_types_cntr = {"back": 0, "filter": 0, "group": 0}

    # a counter where the key is a function index (numerical ID) and the
    # value is its number of occurrences
    actions_cntr = defaultdict(int)

    agent_humanity_info = {}

    num_envs = env.num_envs
    episode_r = np.zeros(num_envs, dtype=np.float64)
    episode_r_without_humanity = np.zeros(num_envs, dtype=np.float64)
    episode_idx = np.zeros(num_envs, dtype='i')
    episode_len = np.zeros(num_envs, dtype='i')

    # o_0, r_0
    obss = env.reset()
    rs = np.zeros(num_envs, dtype='f')

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    try:
        while t < steps:
            # a_t
            # actions = agent.batch_act_and_train(obss)
            actions, actions_distrib = batch_act_and_train(agent, obss)

            # trace entropy values
            entropy_values.append(actions_distrib.entropy.data[0])

            # o_{t+1}, r_{t+1}
            obss, rs, dones, infos = env.step(actions)

            # Update actions of the agent to the actual actions made in the
            # environment. This is done because we change illegal filter
            # actions to legal actions inside the environment.
            # if arch is ArchName.FF_PARAM_SOFTMAX:
            #     for info_idx, info in enumerate(infos):
            #         actions[info_idx] = info["actual_parametric_softmax_idx"]

            # add action types to the counter
            action_types = []
            for action in actions:
                if arch is ArchName.FF_GAUSSIAN:
                    action = env_prop.compressed2full_range(action)
                elif arch is ArchName.FF_PARAM_SOFTMAX or arch is ArchName.FF_SOFTMAX:
                    actions_cntr[action] += 1
                    action = env_prop.static_param_softmax_idx_to_action_type(
                        action)
                action_disc = ATENAEnvCont.cont2dis(action)
                action_type = ATENAUtils.OPERATOR_TYPE_LOOKUP.get(
                    action_disc[0])
                action_types_cntr[action_type] += 1
                action_types.append(action_type)

            episode_r_without_humanity += rs

            # save rewards for logging purposes
            for i, info in enumerate(infos):
                action_type = action_types[i]
                step_reward_info = info["reward_info"]
                for reward_type, value in step_reward_info.items():
                    if (value != 0
                            or reward_type in {
                                "back", "same_display_seen_already",
                                "empty_display", "empty_groupings", "humanity"
                            }
                            or (action_type == "group" and reward_type in
                                {"compaction_gain", "diversity"})
                            or (action_type == "filter" and reward_type in
                                {"kl_distance", "diversity"})):
                        recent_rewards[reward_type].append(value)

            # add rewards to the rewards of the episode
            episode_r += rs
            episode_len += 1

            # Compute mask for done and reset
            if max_episode_len is None:
                resets = np.zeros(num_envs, dtype=bool)
            else:
                resets = (episode_len == max_episode_len)

            # Agent observes the consequences
            agent.batch_observe_and_train(obss, rs, dones, resets)

            # Make mask. 0 if done/reset, 1 if pass
            end = np.logical_or(resets, dones)
            not_end = np.logical_not(end)

            # For episodes that end, do the following:
            #   1. increment the episode count
            #   2. record the return
            #   3. clear the record of rewards
            #   4. clear the record of the number of steps
            #   5. reset the env to start a new episode
            episode_idx += end
            recent_returns.extend(episode_r[end])
            recent_returns_without_humanity.extend(
                episode_r_without_humanity[end])
            episode_r[end] = 0
            episode_r_without_humanity[end] = 0
            episode_len[end] = 0
            obss = env.reset(not_end)

            for _ in range(num_envs):
                t += 1
                for hook in step_hooks:
                    hook(env, agent, t)

            # log and save using TensorBoard
            # (`summary_writer` is assumed to be a module-level writer)
            if (log_interval is not None
                    and t >= log_interval
                    and t % log_interval < num_envs):
                logger.info(
                    'outdir:{} step:{} episode:{} last_R: {} average_R:{}'.format(  # NOQA
                        outdir,
                        t,
                        np.sum(episode_idx),
                        recent_returns[-1] if recent_returns else np.nan,
                        np.mean(recent_returns) if recent_returns else np.nan,
                    ))
                summary_writer.add_scalar('episode_reward',
                                          np.mean(recent_returns), t)
                summary_writer.add_scalar(
                    'episode_r_without_humanity',
                    np.mean(recent_returns_without_humanity), t)
                summary_writer.add_scalar('agent_humanity_rate',
                                          agent_humanity_rate, t)
                for reward_type, reward_vals in recent_rewards.items():
                    summary_writer.add_scalar(reward_type,
                                              np.mean(reward_vals), t)
                if not cfg.obs_with_step_num and cfg.stack_obs_num == 1:
                    for elem in ['success_count_per_action_type',
                                 'failure_count_per_action_type']:
                        summary_writer.add_scalars(
                            elem, agent_humanity_info[elem], t)
                summary_writer.add_scalars('action_types_count',
                                           action_types_cntr, t)
                log_idx += 1
                logger.info('statistics: {}'.format(agent.get_statistics()))
                # k_probs = 18
                # k_highest_act_probs = actions_distrib.k_highest_probablities(k_probs)
                # avg_k_highest_act_probs = np.mean(k_highest_act_probs, axis=0)
                # logger.info('actions_distribution ({} highest probs):\n{}'.format(
                #     k_probs, k_highest_act_probs))
                # summary_writer.add_scalar('avg_highest_act_prob', avg_k_highest_act_probs[0], t)
                # summary_writer.add_scalar('avg_second_highest_act_prob', avg_k_highest_act_probs[1], t)
                summary_writer.add_scalar('avg_entropy',
                                          np.mean(entropy_values), t)

            if evaluator:
                if evaluator.evaluate_if_necessary(
                        t=t, episodes=np.sum(episode_idx)):
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
    except (Exception, KeyboardInterrupt):
        # Save (serialize) the actions counter before being killed
        if arch is ArchName.FF_PARAM_SOFTMAX or arch is ArchName.FF_SOFTMAX:
            actions_cntr_path = os.path.join(outdir, 'actions_cntr.pickle')
            with open(actions_cntr_path, 'wb') as handle:
                pickle.dump(actions_cntr, handle,
                            protocol=pickle.HIGHEST_PROTOCOL)

        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        env.close()
        if evaluator:
            evaluator.env.close()

        # Save (serialize) the current difficult human observations
        # before being killed
        if not cfg.obs_with_step_num and cfg.stack_obs_num == 1:
            failure_obs_path = os.path.join(outdir, 'hard_obs.pickle')
            with open(failure_obs_path, 'wb') as handle:
                pickle.dump(agent_humanity_info['failure_obs'], handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
        raise
    else:
        # Save (serialize) the actions counter
        if arch is ArchName.FF_PARAM_SOFTMAX or arch is ArchName.FF_SOFTMAX:
            actions_cntr_path = os.path.join(outdir, 'actions_cntr.pickle')
            with open(actions_cntr_path, 'wb') as handle:
                pickle.dump(actions_cntr, handle,
                            protocol=pickle.HIGHEST_PROTOCOL)

        # Save the final model
        save_agent(agent, t, outdir, logger, suffix='_finish')

        # Save (serialize) the final difficult human observations
        if not cfg.obs_with_step_num and cfg.stack_obs_num == 1:
            failure_obs_path = os.path.join(outdir, 'hard_obs.pickle')
            with open(failure_obs_path, 'wb') as handle:
                pickle.dump(agent_humanity_info['failure_obs'], handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
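# A minimal round-trip sketch of the counter serialization above:
# defaultdict(int) pickles cleanly because `int` is a top-level callable,
# so the restored object keeps its default factory.
import pickle
from collections import defaultdict

actions_cntr = defaultdict(int)
actions_cntr[3] += 1

with open('actions_cntr.pickle', 'wb') as handle:
    pickle.dump(actions_cntr, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('actions_cntr.pickle', 'rb') as handle:
    restored = pickle.load(handle)
assert restored[3] == 1 and restored['unseen'] == 0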
def train_agent(agent, env, steps, outdir, max_episode_len=None,
                step_offset=0, evaluator=None, successful_score=None,
                step_hooks=[], logger=None):

    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    # vector where the entry at index i is the number representing the type
    # of the action taken at step i of the current episode;
    # the values are 0, 1, 2 for back, filter, group respectively
    episode_action_type_hist = []
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    r = 0

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    episode_len = 0
    try:
        while t < steps:
            # a_t
            action = agent.act_and_train(obs, r)
            if episode_idx % SUMMARY_EPISODE_SLOT == 0:
                # env.env.env exists only if args.monitor is set to True
                action = env.env_prop.compressed2full_range(action)
                action_disc = env.cont2dis(action)
                episode_action_type_hist.append(action_disc[0])
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            t += 1
            episode_r += r
            episode_len += 1

            for hook in step_hooks:
                hook(env, agent, t)

            if done or episode_len == max_episode_len or t == steps:
                agent.stop_episode_and_train(obs, r, done=done)
                # logger.info('outdir:%s step:%s episode:%s R:%s',
                #             outdir, t, episode_idx, episode_r)
                # logger.info('statistics:%s', agent.get_statistics())
                log_results(logger, outdir, t, episode_idx, episode_r,
                            agent.get_statistics(), episode_action_type_hist)
                if episode_idx % SUMMARY_EPISODE_SLOT == 0:
                    summary_writer.add_scalar('episode_reward', episode_r,
                                              episode_idx)
                    summary_writer.add_histogram(
                        'operators_hist',
                        np.array(episode_action_type_hist), episode_idx)
                    episode_action_type_hist = []
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(
                        t=t, episodes=episode_idx + 1)
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0
    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix='_finish')
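# `summary_writer` above is assumed to be a module-level TensorBoard
# writer; its construction is not shown in this module. A sketch using
# torch.utils.tensorboard (tensorboardX exposes the same interface):
import numpy as np
from torch.utils.tensorboard import SummaryWriter

summary_writer = SummaryWriter(log_dir='results/tb')
summary_writer.add_scalar('episode_reward', 12.5, 0)
summary_writer.add_histogram('operators_hist', np.array([0, 1, 1, 2]), 0)
summary_writer.close()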
def train_agent(agent, env, steps, outdir, checkpoint_freq=None,
                max_episode_len=None, step_offset=0, evaluator=None,
                successful_score=None, step_hooks=(), logger=None):

    logger = logger or logging.getLogger(__name__)

    episode_r = 0
    episode_idx = 0

    # o_0, r_0
    obs = env.reset()
    r = 0

    t = step_offset
    if hasattr(agent, 't'):
        agent.t = step_offset

    start_time = time.time()
    last_chunk_end_time = start_time
    chunk = 1000

    episode_len = 0
    try:
        while t < steps:
            # print(t, t % 1000 == 0)
            if t % chunk == 0:
                now = time.time()
                chunk_elapsed = now - last_chunk_end_time
                last_chunk_end_time = now
                logger.critical(
                    "STEPS: {} / {}, {:0.0f}s elapsed in total, {:0.0f}s for this chunk of {}".format(
                        t, steps, now - start_time, chunk_elapsed, chunk))

            # a_t
            action = agent.act_and_train(obs, r)
            # o_{t+1}, r_{t+1}
            obs, r, done, info = env.step(action)
            t += 1
            episode_r += r
            episode_len += 1

            for hook in step_hooks:
                hook(env, agent, t)

            reset = (episode_len == max_episode_len
                     or info.get('needs_reset', False))
            if done or reset or t == steps:
                agent.stop_episode_and_train(obs, r, done=done)
                logger.info('outdir:%s step:%s episode:%s R:%s',
                            outdir, t, episode_idx, episode_r)
                logger.info('statistics:%s', agent.get_statistics())
                if evaluator is not None:
                    evaluator.evaluate_if_necessary(
                        t=t, episodes=episode_idx + 1)
                    if (successful_score is not None
                            and evaluator.max_score >= successful_score):
                        break
                if t == steps:
                    break
                # Start a new episode
                episode_r = 0
                episode_idx += 1
                episode_len = 0
                obs = env.reset()
                r = 0
            if checkpoint_freq and t % checkpoint_freq == 0:
                save_agent(agent, t, outdir, logger, suffix='_checkpoint')
    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix='_except')
        raise

    # Save the final model
    save_agent(agent, t, outdir, logger, suffix='_finish')
                    hook(env, jerry, t)

                # check if an agent captured the flag
                env.check_inventory(time_step)
                print("--------------------------------------------------------------------")

                # end the mission when both agents have finished or time is
                # over, then start over again
                if env.mission_end or done1 or done2 or (time_step > 1920):
                    # 960 = 16 min | 1920 = 32 min
                    # send mission QuitCommands to tell Malmo that the
                    # mission has ended, then save and reset everything
                    t, obs1, obs2, r1, r2, done1, done2, overall_reward_agent_Jerry, overall_reward_agent_Tom = \
                        env.sending_mission_quit_commands(
                            overall_reward_agent_Tom,
                            overall_reward_agent_Jerry,
                            time_step, obs1, r1, obs2, r2,
                            outdir, t, tom, jerry, experiment_ID)
                    time_stamp_start = time.time()

                    # recover
                    time.sleep(5)

                    if t == 1001:
                        print("Mission-Set finished. Congratulations! "
                              "Check results and parameters. Start over.")
                        break
    except (Exception, KeyboardInterrupt):
        # Save the current models before being killed
        save_agent(tom, t, outdir, logger, suffix='_except01')
        save_agent(jerry, t, outdir, logger, suffix='_except02')
        raise