Example #1
    # Check whether the episode is over via the "done" flags
    if all(dones):
        print(i)
        break

print(f"\n--SOTL Results--")
print(f"Steps: {steps}")
print(f"Episodes Rewards: {episodes_rewards/steps:.4f}")
# for metric in env.metric:
#     print(f"{metric.name}: {metric.eval():.4f}")

# Start the wandb run
u.wand_init(
    "TLC - Results C2",
    f"SOTL: {options['green_time']} {options['green_v']} {options['red_v']}",
    "SOTL")

eval_dict = {}
eval_dict["epsilon"] = 0
eval_dict["steps"] = steps
eval_dict["mean_episode_reward"] = episodes_rewards / steps
for metric in env.metric:
    eval_dict[metric.name] = metric.eval()
    print(f"{metric.name}: {metric.eval():.4f}")

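# Log the same constant SOTL results for 200 episodes, presumably so the baseline
# shows as a flat line across episodes when compared against the RL agents in wandb.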
for e in range(200):
    eval_dict["episode"] = e

    u.wand_log(eval_dict)
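
These snippets log results through helpers u.wand_init and u.wand_log, whose implementation is not shown. A minimal sketch of such wrappers around the wandb API, with names and signatures inferred from the calls above (the body is an assumption, not the original code):

import wandb

def wand_init(project, run_name, group):
    # Start a wandb run; runs are grouped by controller type (e.g. "SOTL").
    wandb.init(project=project, name=run_name, group=group)

def wand_log(metrics):
    # Log a flat dict of scalar metrics for the current episode/step.
    wandb.log(metrics)
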
Example #2
def train(args, env):
    for e in range(episodes):

        for agent in agents: agent.reset_episode_infos()

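        # Scale raw observations by 0.01 to keep the network inputs small (assumed normalization choice).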
        first_obs = np.array(env.reset())*0.01
        current_obs = first_obs

        if e % args.save_rate == args.save_rate - 1:
            env.eng.set_save_replay(True)
            env.eng.set_replay_file("replay_%s.txt" % e)
        else:
            env.eng.set_save_replay(False)

        episodes_rewards = [0] * n_agents
        episodes_decision_num = [0] * n_agents

        i = 0
        while i < args.steps:
            
            ### Request a new action (phase + time) when the current action's time has run out
            for agent_id, agent in enumerate(agents):
                agent_obs = current_obs[agent_id]
                if agent.episode_action_time <= i:
                    if agent.episode_action_time == i:
                        agent.change_phase()
                        
                        initial_phase = agent.actual_phase
                        a_phase = initial_phase
                        obs_te = env.world.get_state_of_three_by_phase(agent.I,a_phase)
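                        # Skip phases whose approach has no waiting vehicles (obs_te[0] == 0),
                        # stopping if we cycle all the way back to the initial phase.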
                        while obs_te[0] == 0:
                            agent.change_phase() 
                            a_phase = agent.actual_phase
                            obs_te = env.world.get_state_of_three_by_phase(agent.I,a_phase)
                            if initial_phase == a_phase:
                                break

                        
                        agent.replay() 
                        #agent.action_time = -1
                        #print(i,agent.get_phase())

                    if agent.episode_action_time+yellow_phase_time+offset_phase <= i:
                        
                        #print(first_obs[agent_id], agent_obs)
                        #print("----")
                        first_obs[agent_id] = agent_obs
                        
                        time = agent.get_action(first_obs[agent_id])
                        agent.action_time = time
                        agent.episode_action_time += (time+1)*5 ## Starts from 0 seconds + the time decided by the model (0, 5, 10, 15, 20, ...)
                        phase = agent.I.current_phase
                        #print(i,agent_obs,time,phase,agent.actual_phase)
                        #print(time)

            ### For each step within the action interval
            for _ in range(args.action_interval):
                actions = [agent.get_phase() for agent in agents]
                current_obs, current_rewards, dones, current_info = env.step(actions)
                current_obs = np.array(current_obs)*0.01
                i += 1
                
                #u.append_new_line_states(file_name+"_0",[e,i,first_obs,current_obs,[agents[0].get_phase(),agents[0].I.current_phase],[current_rewards[0],agents[0].real_reward(first_obs[0],current_obs[0])]])
                
                for agent_id, agent in enumerate(agents):


                    reward = agent.real_reward(first_obs[agent_id],current_obs[agent_id])
                    #print(reward,current_rewards[agent_id])

                    if flag_default_reward:
                        agent.current_reward.append(current_rewards[agent_id])
                    else:
                        agent.current_reward.append(reward)

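                    # Once the green time plus yellow and offset has elapsed, store the transition for this action.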
                    if agent.episode_action_time+yellow_phase_time+offset_phase == i:
                        action_time = agent.action_time

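                        # Reward for the action window: either the mean over the window or the reward just before the yellow phase.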
                        agent_reward = np.mean(agent.current_reward) if flag_mean_reward else agent.current_reward[-yellow_phase_time]
                        #print('----------------')
                        #print("Reward: ", agent_reward,"; min:",np.min(agent.current_reward),"; Méd:",np.mean(agent.current_reward),"; Max:",np.max(agent.current_reward),"; Contagem:",len(agent.current_reward) )
                        #print('----------------')
                        agent.current_reward = []

                        phase = agent.actual_phase
                        next_p = agent.next_phase(phase)

                        u.append_new_line(file_name+f"_{agent_id}",[[first_obs[agent_id],phase], action_time, agent_reward, [current_obs[agent_id],next_p],e,i])
                        ob = first_obs[agent_id].tolist()
                        nob = current_obs[agent_id].tolist()
                        agent.remember( [ob,phase] , action_time, agent_reward, [nob,next_p])
                            
                        episodes_rewards[agent_id] += agent_reward
                        episodes_decision_num[agent_id] += 1

        # Per-episode epsilon decay and target-network sync for each agent once learning has started.
        for agent in agents:
            if agent.total_decision > agent.learning_start:
                agent.decay_epsilon()
                agent.update_target_network()

        if not (e % args.save_rate):
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            for agent in agents:
                agent.save_model(args.save_dir)
                
        eval_dict = {}

        logger.info(f"episode:{e}/{episodes-1}, steps:{i}")
        eval_dict["episode"]=e
        eval_dict["steps"]=i

        for metric in env.metric:
            logger.info(f"{metric.name}: {metric.eval()}")
            eval_dict[metric.name]=metric.eval()

        for agent_id, agent in enumerate(agents):
            logger.info(f"agent:{agent_id}, epsilon:{agent.epsilon}, mean_episode_reward:{episodes_rewards[agent_id] / episodes_decision_num[agent_id]}")

        eval_dict["epsilon"]=agents[0].epsilon
        eval_dict["mean_episode_reward"]=episodes_rewards[0] / episodes_decision_num[0]
        
        u.wand_log(eval_dict)

    logger.info("Parametros Utilizados")
    agent = agents[0]
    #logger.info(f"BUFFER: buffer_size:{agent.buffer_size}; batch_size:{agent.batch_size}; learning_start:{agent.learning_start};")
    #logger.info(f"MODEL UPDATE: update_model_freq:{agent.update_model_freq}; update_target_model_freq:{agent.update_target_model_freq};")
    #logger.info(f"LEARNING: gamma:{agent.gamma}; epsilon:{agent.epsilon_start}; epsilon_min:{agent.epsilon_min}; epsilon_decay:{agent.epsilon_decay}; learning_rate:{agent.learning_rate};")
    logger.info(f"PHASE: n_phases:{agent.n_phases}; start_phase:{agent.start_phase};")
    logger.info(f"TRAINING: total_decision:{agent.total_decision};")
    #logger.info(f"ACTIVATION: activation:{agent.activation};")
    logger.info(f"STATE: ob_generator:{agent.ob_generator.fns[0]};")
    logger.info(f"REWARD: reward_generator:{agent.reward_generator.fns[0]};")
    logger.info(str(info_file))
Example #3
def train(args, env):
    total_decision_num = 0
    for e in range(episodes):

        last_obs = env.reset()
        if e % args.save_rate == args.save_rate - 1:
            env.eng.set_save_replay(True)
            env.eng.set_replay_file("replay_%s.txt" % e)
        else:
            env.eng.set_save_replay(False)
        episodes_rewards = [0 for i in agents]
        episodes_decision_num = 0
        i = 0

        while i < args.steps:

            if i % action_interval == 0:
                actions = []
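                # A single shared model (agents[0]) chooses actions and learns for all intersections.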
                for agent_id, agent in enumerate(agents):
                    if total_decision_num > agent.learning_start:
                        actions.append(agents[0].get_action(
                            last_obs[agent_id]))
                    else:
                        actions.append(agents[0].sample())

                rewards_list = []
                for _ in range(action_interval):
                    obs, rewards, dones, _ = env.step(actions)
                    i += 1
                    rewards_list.append(rewards)
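                # Average the per-step rewards over the action interval.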
                rewards = np.mean(rewards_list, axis=0)

                for agent_id, agent in enumerate(agents):
                    #u.append_new_line(file_name+f"_{agent_id}",[[last_obs[agent_id],-1], actions[agent_id], rewards[agent_id], [obs[agent_id],-1],e,i])
                    agents[0].remember(last_obs[agent_id], actions[agent_id],
                                       rewards[agent_id], obs[agent_id])
                    episodes_rewards[agent_id] += rewards[agent_id]
                    episodes_decision_num += 1

                total_decision_num += 1
                last_obs = obs

                #for agent_id, agent in enumerate(agents):
                if (total_decision_num > agents[0].learning_start
                        and total_decision_num % agents[0].update_model_freq
                        == agents[0].update_model_freq - 1):
                    agents[0].replay()
                if (total_decision_num > agents[0].learning_start
                        and total_decision_num % agents[0].update_target_model_freq
                        == agents[0].update_target_model_freq - 1):
                    agents[0].update_target_network()

            #if all(dones):
            #    break

        if e % args.save_rate == args.save_rate - 1:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            #for agent in agents:
            agents[0].save_model(args.save_dir)
            #    break

        eval_dict = {}

        logger.info(f"episode:{e}/{episodes-1}, steps:{i}")
        eval_dict["episode"] = e
        eval_dict["steps"] = i

        for agent_id, agent in enumerate(agents):
            logger.info("\tagent:{}, mean_episode_reward:{}".format(
                agent_id, episodes_rewards[agent_id] / episodes_decision_num))

        for metric in env.metric:
            logger.info(f"\t{metric.name}: {metric.eval()}")
            eval_dict[metric.name] = metric.eval()

        eval_dict["epsilon"] = agents[0].epsilon
        eval_dict["mean_episode_reward"] = episodes_rewards[
            0] / episodes_decision_num

        u.wand_log(eval_dict)

    #for agent in agents:
    agents[0].save_model(args.save_dir)
Example #4
def train(args, env):
    total_decision_num = 0
    for e in range(args.episodes):
        
        last_obs = env.reset()
        if e % args.save_rate == args.save_rate - 1:
            env.eng.set_save_replay(True)
            env.eng.set_replay_file("replay_%s.txt" % e)
        else:
            env.eng.set_save_replay(False)
        episodes_rewards = [0 for i in agents]
        episodes_decision_num = 0
        i = 0
        while i < args.steps:
            if i % args.action_interval == 0:
                actions = []
                for agent_id, agent in enumerate(agents):
                    if total_decision_num > agent.learning_start:
                        actions.append(agent.get_action(last_obs[agent_id]))
                    else:
                        actions.append(agent.sample())

                rewards_list = []
                for _ in range(args.action_interval):
                    obs, rewards, dones, _ = env.step(actions)
                    i += 1
                    rewards_list.append(rewards)
                rewards = np.mean(rewards_list, axis=0)

                for agent_id, agent in enumerate(agents):
                    agent.remember(last_obs[agent_id], actions[agent_id], rewards[agent_id], obs[agent_id])
                    episodes_rewards[agent_id] += rewards[agent_id]
                    episodes_decision_num += 1
                    total_decision_num += 1
                
                last_obs = obs

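            # Experience replay and target-network updates fire on the last decision of each update window.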
            for agent_id, agent in enumerate(agents):
                if total_decision_num > agent.learning_start and total_decision_num % agent.update_model_freq == agent.update_model_freq - 1:
                    agent.replay()
                if total_decision_num > agent.learning_start and total_decision_num % agent.update_target_model_freq == agent.update_target_model_freq - 1:
                    agent.update_target_network()
            if all(dones):
                break
        if e % args.save_rate == args.save_rate - 1:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            for agent in agents:
                agent.save_model(args.save_dir)
        logger.info("episode:{}/{}, average travel time:{}".format(e, args.episodes, env.eng.get_average_travel_time()))
            
        for agent_id, agent in enumerate(agents):
            logger.info("agent:{}, mean_episode_reward:{}".format(agent_id, episodes_rewards[agent_id] / episodes_decision_num))
        
        eval_dict = {}

        for metric in env.metric:
            print("{} is {:.4f}".format(metric.name, metric.eval()))
            eval_dict[metric.name]=metric.eval()

        u.wand_log(eval_dict)