        # (tail of the SOTL simulation loop)
        # Check if it's over by the "done" flag
        if all(dones):
            print(i)
            break

print(f"\n--SOTL Results--")
print(f"Steps: {steps}")
print(f"Mean episode reward: {episodes_rewards / steps:.4f}")
# for metric in env.metric:
#     print(f"{metric.name}: {metric.eval():.4f}")

# Start wandb logging
u.wand_init(
    "TLC - Results C2",
    f"SOTL: {options['green_time']} {options['green_v']} {options['red_v']}",
    "SOTL")

eval_dict = {}
eval_dict["epsilon"] = 0
eval_dict["steps"] = steps
eval_dict["mean_episode_reward"] = episodes_rewards / steps
for metric in env.metric:
    eval_dict[metric.name] = metric.eval()
    print(f"{metric.name}: {metric.eval():.4f}")

# Log the same evaluation values under 200 episode indices so the fixed SOTL
# baseline can be plotted alongside the per-episode RL training curves.
for e in range(200):
    eval_dict["episode"] = e
    u.wand_log(eval_dict)
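# The u.wand_init / u.wand_log helpers come from the project's utility module, which is
# not shown in this section. The sketch below is an assumption about what they do: thin
# wrappers around the official wandb API (wandb.init / wandb.log), with the
# (project, run_name, group) argument order inferred from the call sites above.
import wandb

def wand_init(project, run_name, group):
    # Start a wandb run; the three positional arguments mirror how the
    # helper is called in this repository.
    wandb.init(project=project, name=run_name, group=group)

def wand_log(metrics):
    # Log a flat dict of scalars (episode, steps, mean_episode_reward, metrics, ...).
    wandb.log(metrics)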
def train(args, env):
    for e in range(episodes):
        for agent in agents:
            agent.reset_episode_infos()
        # Observations are scaled by 0.01 before being fed to the networks
        first_obs = np.array(env.reset()) * 0.01
        current_obs = first_obs

        if e % args.save_rate == args.save_rate - 1:
            env.eng.set_save_replay(True)
            env.eng.set_replay_file("replay_%s.txt" % e)
        else:
            env.eng.set_save_replay(False)

        episodes_rewards = [0] * n_agents
        episodes_decision_num = [0] * n_agents
        i = 0
        while i < args.steps:
            ### Request a new action (phase + time) when the current action's time runs out
            for agent_id, agent in enumerate(agents):
                agent_obs = current_obs[agent_id]
                if agent.episode_action_time <= i:
                    if agent.episode_action_time == i:
                        # Advance to the next phase, skipping phases with no waiting vehicles
                        agent.change_phase()
                        initial_phase = agent.actual_phase
                        a_phase = initial_phase
                        obs_te = env.world.get_state_of_three_by_phase(agent.I, a_phase)
                        while obs_te[0] == 0:
                            agent.change_phase()
                            a_phase = agent.actual_phase
                            obs_te = env.world.get_state_of_three_by_phase(agent.I, a_phase)
                            if initial_phase == a_phase:
                                # Every phase is empty; keep the original one
                                break
                        agent.replay()
                        # agent.action_time = -1
                        # print(i, agent.get_phase())
                    if agent.episode_action_time + yellow_phase_time + offset_phase <= i:
                        # Yellow phase and offset are over: ask the model for the green duration
                        # print(first_obs[agent_id], agent_obs)
                        # print("----")
                        first_obs[agent_id] = agent_obs
                        time = agent.get_action(first_obs[agent_id])
                        agent.action_time = time
                        ## Starts from 0 seconds plus the time chosen by the model (0, 5, 10, 15, 20, ...)
                        agent.episode_action_time += (time + 1) * 5
                        phase = agent.I.current_phase
                        # print(i, agent_obs, time, phase, agent.actual_phase)
                        # print(time)

            ### Run the environment for one action interval
            for _ in range(args.action_interval):
                actions = [agent.get_phase() for agent in agents]
                current_obs, current_rewards, dones, current_info = env.step(actions)
                current_obs = np.array(current_obs) * 0.01
                i += 1
                # u.append_new_line_states(file_name + "_0", [e, i, first_obs, current_obs,
                #     [agents[0].get_phase(), agents[0].I.current_phase],
                #     [current_rewards[0], agents[0].real_reward(first_obs[0], current_obs[0])]])
                for agent_id, agent in enumerate(agents):
                    reward = agent.real_reward(first_obs[agent_id], current_obs[agent_id])
                    # print(reward, current_rewards[agent_id])
                    agent.current_reward.append(current_rewards[agent_id] if flag_default_reward else reward)
                    if agent.episode_action_time + yellow_phase_time + offset_phase == i:
                        # The chosen green time has elapsed: build and store the transition
                        action_time = agent.action_time
                        agent_reward = (np.mean(agent.current_reward) if flag_mean_reward
                                        else agent.current_reward[-yellow_phase_time])
                        # print("Reward:", agent_reward, "; min:", np.min(agent.current_reward),
                        #       "; mean:", np.mean(agent.current_reward), "; max:", np.max(agent.current_reward),
                        #       "; count:", len(agent.current_reward))
                        agent.current_reward = []
                        phase = agent.actual_phase
                        next_p = agent.next_phase(phase)
                        u.append_new_line(file_name + f"_{agent_id}",
                                          [[first_obs[agent_id], phase], action_time, agent_reward,
                                           [current_obs[agent_id], next_p], e, i])
                        ob = first_obs[agent_id].tolist()
                        nob = current_obs[agent_id].tolist()
                        agent.remember([ob, phase], action_time, agent_reward, [nob, next_p])
                        episodes_rewards[agent_id] += agent_reward
                        episodes_decision_num[agent_id] += 1
                        if agent.total_decision > agent.learning_start:
                            agent.decay_epsilon()
                            # agent.replay()
                            agent.update_target_network()
                            # if agent.total_decision > agent.learning_start and not (agent.total_decision % agent.update_target_model_freq):

        if not (e % args.save_rate):
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            for agent in agents:
                agent.save_model(args.save_dir)

        eval_dict = {}
        logger.info(f"episode:{e}/{episodes-1}, steps:{i}")
        eval_dict["episode"] = e
        eval_dict["steps"] = i
        for metric in env.metric:
            logger.info(f"{metric.name}: {metric.eval()}")
            eval_dict[metric.name] = metric.eval()
        for agent_id, agent in enumerate(agents):
            logger.info(f"agent:{agent_id}, epsilon:{agent.epsilon}, "
                        f"mean_episode_reward:{episodes_rewards[agent_id] / episodes_decision_num[agent_id]}")
        eval_dict["epsilon"] = agents[0].epsilon
        eval_dict["mean_episode_reward"] = episodes_rewards[0] / episodes_decision_num[0]
        u.wand_log(eval_dict)

    logger.info("Parameters used")
    agent = agents[0]
    # logger.info(f"BUFFER: buffer_size:{agent.buffer_size}; batch_size:{agent.batch_size}; learning_start:{agent.learning_start};")
    # logger.info(f"MODEL UPDATE: update_model_freq:{agent.update_model_freq}; update_target_model_freq:{agent.update_target_model_freq};")
    # logger.info(f"LEARNING: gamma:{agent.gamma}; epsilon:{agent.epsilon_start}; epsilon_min:{agent.epsilon_min}; epsilon_decay:{agent.epsilon_decay}; learning_rate:{agent.learning_rate};")
    logger.info(f"PHASE: n_phases:{agent.n_phases}; start_phase:{agent.start_phase};")
    logger.info(f"TRAINING: total_decision:{agent.total_decision};")
    # logger.info(f"ACTIVATION: activation:{agent.activation};")
    logger.info(f"STATE: ob_generator:{agent.ob_generator.fns[0]};")
    logger.info(f"REWARD: reward_generator:{agent.reward_generator.fns[0]};")
    logger.info(str(info_file))
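# Hypothetical driver for the variable-green-time train() above. The argument names
# mirror the attributes the function reads (args.steps, args.action_interval,
# args.save_rate, args.save_dir); the default values are illustrative only and are not
# taken from the original experiments.
import argparse

def build_args():
    parser = argparse.ArgumentParser(description="DQN traffic-light training")
    parser.add_argument("--steps", type=int, default=3600,
                        help="simulation steps per episode")
    parser.add_argument("--action_interval", type=int, default=5,
                        help="simulation steps between control decisions")
    parser.add_argument("--save_rate", type=int, default=20,
                        help="save models/replays every N episodes")
    parser.add_argument("--save_dir", type=str, default="model/dqn",
                        help="directory for model checkpoints")
    return parser.parse_args()

# train() also relies on module-level state (agents, episodes, yellow_phase_time,
# offset_phase, logger, u, ...), so it must be called from the script that defines them:
# train(build_args(), env)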
def train(args, env):
    # Parameter-sharing variant: a single learner (agents[0]) chooses actions and
    # stores experience for every intersection.
    total_decision_num = 0
    for e in range(episodes):
        last_obs = env.reset()
        if e % args.save_rate == args.save_rate - 1:
            env.eng.set_save_replay(True)
            env.eng.set_replay_file("replay_%s.txt" % e)
        else:
            env.eng.set_save_replay(False)

        episodes_rewards = [0 for i in agents]
        episodes_decision_num = 0
        i = 0
        while i < args.steps:
            if i % action_interval == 0:
                actions = []
                for agent_id, agent in enumerate(agents):
                    if total_decision_num > agent.learning_start:
                        actions.append(agents[0].get_action(last_obs[agent_id]))
                    else:
                        actions.append(agents[0].sample())

                rewards_list = []
                for _ in range(action_interval):
                    obs, rewards, dones, _ = env.step(actions)
                    i += 1
                    rewards_list.append(rewards)
                # Each agent's reward is averaged over the action interval
                rewards = np.mean(rewards_list, axis=0)

                for agent_id, agent in enumerate(agents):
                    # u.append_new_line(file_name + f"_{agent_id}",
                    #                   [[last_obs[agent_id], -1], actions[agent_id], rewards[agent_id],
                    #                    [obs[agent_id], -1], e, i])
                    agents[0].remember(last_obs[agent_id], actions[agent_id], rewards[agent_id], obs[agent_id])
                    episodes_rewards[agent_id] += rewards[agent_id]
                    episodes_decision_num += 1
                    total_decision_num += 1
                last_obs = obs

            # for agent_id, agent in enumerate(agents):
            if total_decision_num > agents[0].learning_start and \
                    total_decision_num % agents[0].update_model_freq == agents[0].update_model_freq - 1:
                agents[0].replay()
            if total_decision_num > agents[0].learning_start and \
                    total_decision_num % agents[0].update_target_model_freq == agents[0].update_target_model_freq - 1:
                agents[0].update_target_network()
            # if all(dones):
            #     break

        if e % args.save_rate == args.save_rate - 1:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            # for agent in agents:
            agents[0].save_model(args.save_dir)
            # break

        eval_dict = {}
        logger.info(f"episode:{e}/{episodes-1}, steps:{i}")
        eval_dict["episode"] = e
        eval_dict["steps"] = i
        for agent_id, agent in enumerate(agents):
            logger.info("\tagent:{}, mean_episode_reward:{}".format(
                agent_id, episodes_rewards[agent_id] / episodes_decision_num))
        for metric in env.metric:
            logger.info(f"\t{metric.name}: {metric.eval()}")
            eval_dict[metric.name] = metric.eval()
        eval_dict["epsilon"] = agents[0].epsilon
        eval_dict["mean_episode_reward"] = episodes_rewards[0] / episodes_decision_num
        u.wand_log(eval_dict)

    # for agent in agents:
    agents[0].save_model(args.save_dir)
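# The parameter-sharing train() above funnels every intersection's transition into
# agents[0]. A minimal sketch of the replay-memory side of such a shared agent,
# assuming a standard DQN-style deque buffer; the class name, buffer_size and
# batch_size are illustrative and not taken from the project code.
import random
from collections import deque

class SharedReplayBuffer:
    def __init__(self, buffer_size=10000, batch_size=32):
        self.memory = deque(maxlen=buffer_size)   # oldest transitions are dropped first
        self.batch_size = batch_size

    def remember(self, ob, action, reward, next_ob):
        # One transition per intersection per decision step.
        self.memory.append((ob, action, reward, next_ob))

    def sample_batch(self):
        # Uniform sampling over all intersections' experience, which is what lets
        # a single network learn from every intersection at once.
        return random.sample(self.memory, min(self.batch_size, len(self.memory)))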
def train(args, env):
    total_decision_num = 0
    for e in range(args.episodes):
        last_obs = env.reset()
        if e % args.save_rate == args.save_rate - 1:
            env.eng.set_save_replay(True)
            env.eng.set_replay_file("replay_%s.txt" % e)
        else:
            env.eng.set_save_replay(False)

        episodes_rewards = [0 for i in agents]
        episodes_decision_num = 0
        i = 0
        while i < args.steps:
            if i % args.action_interval == 0:
                actions = []
                for agent_id, agent in enumerate(agents):
                    if total_decision_num > agent.learning_start:
                        actions.append(agent.get_action(last_obs[agent_id]))
                    else:
                        actions.append(agent.sample())

                rewards_list = []
                for _ in range(args.action_interval):
                    obs, rewards, dones, _ = env.step(actions)
                    i += 1
                    rewards_list.append(rewards)
                rewards = np.mean(rewards_list, axis=0)

                for agent_id, agent in enumerate(agents):
                    agent.remember(last_obs[agent_id], actions[agent_id], rewards[agent_id], obs[agent_id])
                    episodes_rewards[agent_id] += rewards[agent_id]
                    episodes_decision_num += 1
                    total_decision_num += 1
                last_obs = obs

            for agent_id, agent in enumerate(agents):
                if total_decision_num > agent.learning_start and \
                        total_decision_num % agent.update_model_freq == agent.update_model_freq - 1:
                    agent.replay()
                if total_decision_num > agent.learning_start and \
                        total_decision_num % agent.update_target_model_freq == agent.update_target_model_freq - 1:
                    agent.update_target_network()
            if all(dones):
                break

        if e % args.save_rate == args.save_rate - 1:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            for agent in agents:
                agent.save_model(args.save_dir)

        logger.info("episode:{}/{}, average travel time:{}".format(
            e, args.episodes, env.eng.get_average_travel_time()))
        for agent_id, agent in enumerate(agents):
            logger.info("agent:{}, mean_episode_reward:{}".format(
                agent_id, episodes_rewards[agent_id] / episodes_decision_num))

        eval_dict = {}
        for metric in env.metric:
            print("{} is {:.4f}".format(metric.name, metric.eval()))
            eval_dict[metric.name] = metric.eval()
        u.wand_log(eval_dict)
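# All three train() variants above assume the same agent interface. The actual agent
# class is not part of this section; the Protocol below only collects the methods and
# attributes the loops call, as a reference for readers implementing their own agent.
from typing import Any, Protocol

class TLCAgent(Protocol):
    learning_start: int            # decisions to collect before training starts
    update_model_freq: int         # decisions between replay() calls
    update_target_model_freq: int  # decisions between target-network syncs
    epsilon: float                 # current exploration rate

    def get_action(self, ob: Any) -> int: ...          # greedy/epsilon-greedy action
    def sample(self) -> int: ...                        # random action before learning_start
    def remember(self, ob: Any, action: int, reward: float, next_ob: Any) -> None: ...
    def replay(self) -> None: ...                       # one training step on a sampled batch
    def update_target_network(self) -> None: ...
    def save_model(self, save_dir: str) -> None: ...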