def main():
    ################ load ###################
    if os.path.exists(
            'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/actor.pkl'
    ):
        actor = Actor(state_size, action_size).to(device)
        actor.load_state_dict(
            torch.load(
                'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/actor.pkl'
            ))
        print('Actor Model loaded')
    else:
        actor = Actor(state_size, action_size).to(device)
    if os.path.exists(
            'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/critic.pkl'
    ):
        critic = Critic(state_size, action_size).to(device)
        critic.load_state_dict(
            torch.load(
                'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/critic.pkl'
            ))
        print('Critic Model loaded')
    else:
        critic = Critic(state_size, action_size).to(device)
    print("Waiting for GAMA...")
    ################### initialization ########################
    reset()
    lr = 0.00007

    optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999))
    optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999))

    episode = 0
    test = "GAMA"
    state, reward, done, time_pass, over = GAMA_connect(test)  #connect
    print("done:", done, "timepass:"******"restart acceleration: 0")
            send_to_GAMA([[1, 0]])
            # send first, then compute
            reward = torch.tensor([reward], dtype=torch.float, device=device)
            rewards.append(reward)  #contains the last
            total_reward = sum(rewards)
            total_rewards.append(total_reward)

            #state = torch.FloatTensor(state).reshape(1,4).to(device)
            #last_value= critic(state)

            with torch.autograd.set_detect_anomaly(True):
                advantage = reward.detach(
                ) - value  #+ last_value; V(s+1) of the final step = 0
                actor_loss = -(log_prob * advantage.detach())
                print("actor_loss, ", actor_loss, " size", actor_loss.dim())
                critic_loss = (reward.detach() - value).pow(2)  #+ last_value
                lstm_loss = critic_loss

                optimizerA.zero_grad()
                optimizerC.zero_grad()

                critic_loss.backward(retain_graph=True)
                actor_loss.backward(retain_graph=True)
                loss.append(critic_loss)

                optimizerA.step()
                optimizerC.step()

            print(
                "----------------------------------Net_Trained---------------------------------------"
            )
            print('--------------------------Iteration:', episode,
                  'over--------------------------------')
            episode += 1
            log_probs = []
            values = []
            rewards = []
            masks = []
            torch.save(
                actor.state_dict(),
                'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/actor.pkl'
            )
            torch.save(
                critic.state_dict(),
                'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/critic.pkl'
            )
            loss_sum = sum(loss)
            total_loss.append(loss_sum)
            cross_loss_curve(total_loss, total_rewards)
            loss = []
            if episode > 50:  #50
                lr = 0.0002
                if episode > 115:
                    lr = 0.0001
                new_lr = lr * (0.94**((episode - 40) // 10))  #40
                optimizerA = optim.Adam(actor.parameters(),
                                        new_lr,
                                        betas=(0.95, 0.999))
                optimizerC = optim.Adam(critic.parameters(),
                                        new_lr,
                                        betas=(0.95, 0.999))

        # the first step
        else:
            print('Iteration:', episode)
            state = np.reshape(state, (1, len(state)))  #xxx
            state = torch.FloatTensor(state).reshape(1, 4).to(device)
            value = critic(
                state)  # value is now a tensor; previously: dist, action = dist.sample()
            action, log_prob, entropy = actor(state)
            print("acceleration: ", action.cpu().numpy())
            send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])
            log_prob = log_prob.unsqueeze(0)
            entropy += entropy

        state, reward, done, time_pass, over = GAMA_connect(test)
    return None
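# ---------------------------------------------------------------------------
# The example above relies on send_to_GAMA() and GAMA_connect(), which are not
# part of this listing. A minimal sketch of what send_to_GAMA() could look
# like, assuming the same CSV hand-off that the next example performs inline
# with np.savetxt; the file paths assigned below are placeholders, not the
# project's actual values.
# ---------------------------------------------------------------------------
import numpy as np

from_python_1 = 'from_python_1.csv'  # hypothetical path polled by GAMA
from_python_2 = 'from_python_2.csv'  # hypothetical path polled by GAMA

def send_to_GAMA(rows):
    """Write the [[flag, acceleration]] command rows to the files GAMA reads."""
    np.savetxt(from_python_1, rows, delimiter=',')
    np.savetxt(from_python_2, rows, delimiter=',')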
def main():
    ################ load ###################
    if os.path.exists('D:/Software/GamaWorkspace/Python/weight/actor.pkl'):
        actor =  Actor(state_size, action_size).to(device)
        actor.load_state_dict(torch.load('D:/Software/GamaWorkspace/Python/weight/actor.pkl'))
        print('Actor Model loaded')
    else:
        actor = Actor(state_size, action_size).to(device)
    if os.path.exists('D:/Software/GamaWorkspace/Python/weight/critic.pkl'):
        critic = Critic(state_size, action_size).to(device)
        critic.load_state_dict(torch.load('D:/Software/GamaWorkspace/Python/weight/critic.pkl'))
        print('Critic Model loaded')
    else:
        critic = Critic(state_size, action_size).to(device)
    print("Waiting for GAMA...")
    ################### initialization ########################
    reset()

    optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999))#optim.Adam(actor.parameters())  
    optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999))#optim.Adam(critic.parameters())

    episode = 0
    test = "GAMA"
    state,reward,done,time_pass,over = GAMA_connect(test)
    print("done:",done,"timepass:"******"acceleration: ",action.cpu().numpy())#,"action.cpu().numpy()",type(float(action.cpu().numpy()))
            to_GAMA = [[1,float(action.cpu().numpy()*10)]] #行
            np.savetxt(from_python_1,to_GAMA,delimiter=',')
            np.savetxt(from_python_2,to_GAMA,delimiter=',')
            # reward from the previous step
            rewards.append(torch.tensor([reward], dtype=torch.float, device=device)) #contains the last
            masks.append(torch.tensor([1-done], dtype=torch.float, device=device))   #over-0; otherwise-1 contains the last
            log_prob = log_prob.unsqueeze(0) #log_prob = dist.log_prob(action).unsqueeze(0)       # entropy += dist.entropy().mean()
            log_probs.append(log_prob)
            values.append(value)
            entropy += entropy
        # episode end
        elif done == 1:
            print("restart acceleration: 0")
            to_GAMA = [[1,0]]
            np.savetxt(from_python_1,to_GAMA,delimiter=',')
            np.savetxt(from_python_2,to_GAMA,delimiter=',')
            # send first, then compute
            rewards.append(torch.tensor([reward], dtype=torch.float, device=device)) #contains the last
            masks.append(torch.tensor([1-done], dtype=torch.float, device=device))   #over-0; otherwise-1 contains the last
            
            total_reward = sum(rewards)
            total_rewards.append(total_reward)

            last_state = torch.FloatTensor(state).to(device)
            last_value = critic(last_state)
            returns = compute_returns(last_value, rewards, masks) 
            values_next = returns[1:]#values[1:]
            values_next.append(torch.tensor([0], dtype=torch.float, device=device))
            
            log_probs = torch.cat(log_probs,1).squeeze()  #Concatenates the given sequence of seq tensors in the given dimension.
            returns = torch.cat(returns).detach()
            values = torch.cat(values)
            values_next = torch.cat(values_next)
            rewards = torch.cat(rewards)

            # TD: r(s) + v(s+1) - v(s) -> rewards.detach() + values_next - values;  MC: returns.detach() - values ???
            advantage = returns.detach() - values
            actor_loss = -(log_probs * advantage.detach()).mean()
            loss = advantage.pow(2).sum().detach()
            critic_loss = (returns.detach() - values).pow(2).mean()

            optimizerA.zero_grad()
            optimizerC.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            optimizerA.step()
            optimizerC.step()

            print("--------------------------Net_Trained-------------------------------")
            print('--------------------------Iteration:',episode,'over--------------------------------')
            episode += 1
            log_probs = []
            values = []
            rewards = []
            masks = []
            torch.save(actor.state_dict(), 'D:/Software/GamaWorkspace/Python/weight/actor.pkl')
            torch.save(critic.state_dict(), 'D:/Software/GamaWorkspace/Python/weight/critic.pkl')
            #print("entropy: ",entropy,"total_rewards:",total_rewards)
            entropys.append(entropy)
            total_loss.append(loss)
            if(episode!=0):
                cross_loss_curve(total_loss,total_rewards)
            loss = 0

            if episode > 90  :
                new_lr = lr * (0.92 ** ((episode-80) // 10))
                optimizerA = optim.Adam(actor.parameters(), new_lr, betas=(0.95, 0.999))
                optimizerC = optim.Adam(critic.parameters(), new_lr, betas=(0.95, 0.999))

        # the first step
        else:
            print('Iteration:',episode)
            state = torch.FloatTensor(state).to(device)
            value = critic(state)  # value is now a tensor; previously: dist, action = dist.sample()
            action,log_prob,entropy = actor(state)
            print("acceleration: ",float(action.cpu().numpy()*10))
            to_GAMA = [[1, float(action.cpu().numpy() * 10)]]
            np.savetxt(from_python_1, to_GAMA, delimiter=',')
            np.savetxt(from_python_2, to_GAMA, delimiter=',')
            log_prob = log_prob.unsqueeze(0) #log_prob = dist.log_prob(action).unsqueeze(0) #entropy += dist.entropy().mean()
            log_probs.append(log_prob)
            values.append(value)
            entropy += entropy

        state,reward,done,time_pass,over = GAMA_connect(test)
    return None #[action,log_prob_return,value]
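# ---------------------------------------------------------------------------
# compute_returns() is called above but its definition is not part of this
# listing. A minimal sketch of the usual discounted-return helper it appears
# to correspond to (the default gamma value is an assumption):
# ---------------------------------------------------------------------------
def compute_returns(last_value, rewards, masks, gamma=0.99):
    """Bootstrapped discounted returns: R_t = r_t + gamma * mask_t * R_{t+1}."""
    R = last_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]  # mask is 0 at episode end
        returns.insert(0, R)
    return returns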
def main():
    ################ load ###################
    actor_path = os.path.abspath(
        os.curdir) + '/Generate_Traffic_Flow_MAS_RL/weight/AC_TD2_actor.pkl'
    if os.path.exists(actor_path):
        actor = Actor(state_size, action_size).to(device)
        actor.load_state_dict(torch.load(actor_path))
        print('Actor Model loaded')
    else:
        actor = Actor(state_size, action_size).to(device)
    print("Waiting for GAMA...")
    ################### initialization ########################
    reset()

    Using_LSTM = False
    test = "GAMA"
    N_agent = 20
    list_hidden = []

    count = 0
    ##################  start  #########################
    state = GAMA_connect(test)
    print("Connected")
    while True:
        if not Using_LSTM:
            state = [
                torch.DoubleTensor(elem).reshape(1, state_size).to(device)
                for elem in state
            ]
            state = torch.stack(state).to(device).detach()
            tensor_cv = generate_img()
            tensor_cv = [
                torch.from_numpy(np.transpose(elem,
                                              (2, 0, 1))).double().to(device) /
                255 for elem in tensor_cv
            ]
            tensor_cv = torch.stack(tensor_cv).to(device).detach()

            action, h_state_cv_a, h_state_n_a = actor(state, tensor_cv)

            send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])

        else:
            if len(list_hidden) < N_agent:
                state = [
                    torch.DoubleTensor(elem).reshape(1, state_size).to(device)
                    for elem in state
                ]
                state = torch.stack(state).to(device).detach()
                tensor_cv = generate_img()
                tensor_cv = [
                    torch.from_numpy(np.transpose(
                        elem, (2, 0, 1))).double().to(device) / 255
                    for elem in tensor_cv
                ]
                tensor_cv = torch.stack(tensor_cv).to(device).detach()

                action, h_state_cv_a, h_state_n_a = actor(state, tensor_cv)

                send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])
                list_hidden.append(Memory(h_state_cv_a, h_state_n_a))
                count += 1

            else:
                state = [
                    torch.DoubleTensor(elem).reshape(1, state_size).to(device)
                    for elem in state
                ]
                state = torch.stack(state).to(device).detach()
                tensor_cv = generate_img()
                tensor_cv = [
                    torch.from_numpy(np.transpose(
                        elem, (2, 0, 1))).double().to(device) / 255
                    for elem in tensor_cv
                ]
                tensor_cv = torch.stack(tensor_cv).to(device).detach()

                action, h_state_cv_a, h_state_n_a = actor(
                    state, tensor_cv,
                    list_hidden[count % N_agent].h_state_cv_a,
                    list_hidden[count % N_agent].h_state_n_a)

                send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])
                list_hidden[count % N_agent].set_hidden(
                    h_state_cv_a, h_state_n_a)
                count += 1

        state = GAMA_connect(test)

    return None
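# ---------------------------------------------------------------------------
# list_hidden above stores one Memory object per agent, used only to carry the
# actor's two LSTM hidden states between calls. A minimal sketch of such a
# holder, inferred from its usage above (the real class may hold more fields):
# ---------------------------------------------------------------------------
class Memory:
    def __init__(self, h_state_cv_a, h_state_n_a):
        self.h_state_cv_a = h_state_cv_a  # hidden state of the image (CV) branch
        self.h_state_n_a = h_state_n_a    # hidden state of the numeric branch

    def set_hidden(self, h_state_cv_a, h_state_n_a):
        self.h_state_cv_a = h_state_cv_a
        self.h_state_n_a = h_state_n_a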
def main():
    ################ load ###################
    actor_path = os.path.abspath(
        os.curdir) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD3_actor.pkl'
    critic_path = os.path.abspath(
        os.curdir
    ) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD3_critic.pkl'
    if os.path.exists(actor_path):
        actor = Actor(state_size, action_size).to(device)
        actor.load_state_dict(torch.load(actor_path))
        print('Actor Model loaded')
    else:
        actor = Actor(state_size, action_size).to(device)
    if os.path.exists(critic_path):
        critic = Critic(state_size, action_size).to(device)
        critic.load_state_dict(torch.load(critic_path))
        print('Critic Model loaded')
    else:
        critic = Critic(state_size, action_size).to(device)
    critic_next = Critic(state_size, action_size).to(device)
    critic_next.load_state_dict(critic.state_dict())
    print("Waiting for GAMA...")
    ################### initialization ########################
    reset()

    episode = 4000
    training_stage = 70  #100#80
    Decay = training_stage * 18

    lr = 0.0001
    sample_lr = [
        0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003,
        0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005,
        0.000004, 0.000003, 0.000002, 0.000001
    ]  #900 960 1020 1080 1140
    if episode >= training_stage:  #50 100
        try:
            lr = sample_lr[int(episode // training_stage)]
        except (IndexError):
            lr = 0.000001 * (0.9**((episode - Decay) // training_stage)
                             )  #100-1800 #80-1440 #65-1170 #570 -- 30

    optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999))
    optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999))

    test = "GAMA"
    state, reward, done, time_pass, over, _ = GAMA_connect(test)  #connect
    print("done:", done, "timepass:"******"----------------------------------Net_Trained---------------------------------------"
            )
            print('--------------------------Iteration:', episode,
                  'over--------------------------------')
            episode += 1

        # the first step
        else:
            print('Iteration:', episode, "lr:", lr)
            state = np.reshape(state, (1, len(state)))  #xxx
            state_img = generate_img()
            tensor_cv = torch.from_numpy(np.transpose(
                state_img, (2, 0, 1))).double().to(device) / 255
            state = torch.DoubleTensor(state).reshape(1, state_size).to(device)

            for _ in range(Memory_size):
                memory.states.append(state)
                memory.states_img.append(tensor_cv)
            state = torch.stack(memory.states).to(device).detach()  ###
            tensor_cv = torch.stack(memory.states_img).to(device).detach()
            value, h_state_cv_c, h_state_n_c, h_state_3_c = critic(
                state,
                tensor_cv)  # value is now a tensor; previously: dist, action = dist.sample()
            action, log_prob, entropy = actor(
                state, tensor_cv)  #, h_state_cv_a,h_state_n_a,h_state_3_a
            print("acceleration: ", action.cpu().numpy())
            send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])
            log_prob = log_prob.unsqueeze(0)
            #entropy += entropy

        state, reward, done, time_pass, over, average_speed_NPC = GAMA_connect(
            test)
    return None
def main():
    ############## Hyperparameters ##############
    K_epochs = 3  # update policy for K epochs; too large an lr can give NaN?
    eps_clip = 0.2
    gamma = 0.9  # keep it fairly weak; a stronger coupling also penalizes every correct step?

    episode = 3

    lr_first = 0.00001
    lr = lr_first  #random_seed = None
    state_dim = 6
    action_dim = 1
    #(self, state_dim, action_dim, lr, betas, gamma, K_epochs, eps_clip)
    actor_path = os.getcwd(
    ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\weight\\ppo_MC_actor.pkl'
    critic_path = os.getcwd(
    ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\weight\\ppo_MC_critic.pkl'
    ################ load ###################
    if episode > 30:  #50 100
        lr_first = 0.00001
        lr = lr_first * (0.7**((episode - 20) // 10))
    ppo = PPO(state_dim, action_dim, lr, gamma, K_epochs, eps_clip)
    if os.path.exists(actor_path):
        ppo.actor.load_state_dict(torch.load(actor_path))
        print('Actor Model loaded')
    if os.path.exists(critic_path):
        ppo.critic.load_state_dict(torch.load(critic_path))
        print('Critic Model loaded')
    print("Waiting for GAMA...")

    ################### initialization ########################
    save_curve_pic = os.getcwd(
    ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\result\\PPO_MC_loss_curve.png'
    save_critic_loss = os.getcwd(
    ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\training_data\\PPO_MC_critic_loss.csv'
    save_reward = os.getcwd(
    ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\training_data\\PPO_MC_reward.csv'
    reset()
    memory = Memory()

    advantages = 0  #global value
    loss = []
    total_loss = []
    rewards = []
    total_rewards = []
    test = "GAMA"
    state, reward, done, time_pass, over = GAMA_connect(test)  #connect
    #[real_speed/10, target_speed/10, elapsed_time_ratio, distance_left/100,distance_front_car/10,distance_behind_car/10,reward,done,over]
    print("done:", done, "timepass:"******"----------------------------------Net_Trained---------------------------------------"
            )
            print('--------------------------Iteration:', episode,
                  'over--------------------------------')
            episode += 1
            loss_sum = sum(loss).cpu().detach().numpy()
            total_loss.append(loss_sum)
            total_reward = sum(rewards)
            total_rewards.append(total_reward)
            cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic,
                             save_critic_loss, save_reward)
            rewards = []
            loss = []
            if episode > 30:  #50 100
                lr = lr_first * (0.94**((episode - 20) // 10))
                #if episode > 80:
                #   lr_first = 0.0001
                #  lr = lr_first * (0.94 ** ((episode-70) // 10))
            torch.save(ppo.actor.state_dict(), actor_path)
            torch.save(ppo.critic.state_dict(), critic_path)

        # the first step
        else:
            print('Iteration:', episode)
            state = torch.DoubleTensor(state).reshape(1, 6).to(device)
            state_img = generate_img(
            )  # numpy image: H x W x C (500, 500, 3) -> (3,500,500)
            tensor_cv = torch.from_numpy(np.transpose(
                state_img, (2, 0, 1))).double().to(
                    device
                )  # np.transpose( xxx,  (2, 0, 1)) torch image: C x H x W
            action = ppo.select_action(state, tensor_cv, memory)
            print("acceleration: ", action)  #.cpu().numpy()
            send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])

        state, reward, done, time_pass, over = GAMA_connect(test)
    return None
def main():

    ############## Hyperparameters ##############
    update_timestep = 1  # update policy every n timesteps; set to 1 for TD
    K_epochs = 4  # update policy for K epochs; too large an lr can give NaN?
    eps_clip = 0.2
    gamma = 0.9

    episode = 512
    sample_lr = [
        0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003,
        0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005,
        0.000004, 0.000003, 0.000002, 0.000001
    ]
    lr = 0.0001  #random_seed = None
    state_dim = 5
    action_dim = 1
    #(self, state_dim, action_dim, lr, betas, gamma, K_epochs, eps_clip)
    actor_path = os.getcwd(
    ) + '/PPO_Mixedinput_Navigation_Model/weight/ppo_TD2lstm_actor.pkl'
    critic_path = os.getcwd(
    ) + '/PPO_Mixedinput_Navigation_Model/weight/ppo_TD2lstm_critic.pkl'
    ################ load ###################
    if episode > 50:  #50 100
        try:
            lr = sample_lr[int(episode // 50)]
        except (IndexError):
            lr = 0.000001

    ppo = PPO(state_dim, action_dim, lr, gamma, K_epochs, eps_clip)
    if os.path.exists(actor_path):
        ppo.actor.load_state_dict(torch.load(actor_path))
        print('Actor Model loaded')
    if os.path.exists(critic_path):
        ppo.critic.load_state_dict(torch.load(critic_path))
        print('Critic Model loaded')
    print("Waiting for GAMA...")

    ################### initialization ########################
    save_curve_pic = os.getcwd(
    ) + '/PPO_Mixedinput_Navigation_Model/result/PPO_2LSTM_loss_curve.png'
    save_critic_loss = os.getcwd(
    ) + '/PPO_Mixedinput_Navigation_Model/training_data/PPO_TD2_critic_loss.csv'
    save_reward = os.getcwd(
    ) + '/PPO_Mixedinput_Navigation_Model/training_data/PPO_TD2_reward.csv'
    reset()
    memory = Memory()

    advantages = 0  #global value
    loss = []
    total_loss = []
    rewards = []
    total_rewards = []
    test = "GAMA"
    state, reward, done, time_pass, over = GAMA_connect(test)  #connect
    #[real_speed/10, target_speed/10, elapsed_time_ratio, distance_left/100,distance_front_car/10,distance_behind_car/10,reward,done,over]
    print("done:", done, "timepass:"******"state ",state)
            rewards.append(reward)
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            state = torch.DoubleTensor(state).reshape(1, state_dim).to(device)
            state_img = generate_img()
            tensor_cv = torch.from_numpy(np.transpose(
                state_img, (2, 0, 1))).double().to(device)
            if len(memory.states_next) == 0:
                for _ in range(3):
                    memory.states_next = memory.states
                    memory.states_next[2] = state
                    memory.states_img_next = memory.states_img
                    memory.states_img_next[2] = tensor_cv
            else:
                del memory.states_next[:1]
                del memory.states_img_next[:1]
                memory.states_next.append(state)
                memory.states_img_next.append(tensor_cv)
            loss_ = ppo.update(memory, lr, advantages, done)
            loss.append(loss_)
            del memory.logprobs[:]
            del memory.rewards[:]
            del memory.is_terminals[:]
            #memory.clear_memory()

            action = ppo.select_action(state, tensor_cv, memory)
            send_to_GAMA([[1, float(action * 10)]])
            #print("acceleration ",float(action))

        # episode end
        elif done == 1:
            # send first, then compute
            print("state_last", state)
            send_to_GAMA([[1, 0]])
            rewards.append(reward)

            del memory.states_next[:1]
            del memory.states_img_next[:1]
            state = torch.DoubleTensor(state).reshape(1, state_dim).to(
                device)  # reshape into one row
            memory.states_next.append(state)
            state_img = generate_img()
            tensor_cv = torch.from_numpy(np.transpose(
                state_img, (2, 0, 1))).double().to(device)
            memory.states_img_next.append(tensor_cv)

            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            loss_ = ppo.update(memory, lr, advantages, done)
            loss.append(loss_)
            memory.clear_memory()

            print(
                "----------------------------------Net_Trained---------------------------------------"
            )
            print('--------------------------Iteration:', episode,
                  'over--------------------------------')
            episode += 1
            loss_sum = sum(loss).cpu().detach().numpy()
            total_loss.append(loss_sum)
            total_reward = sum(rewards)
            total_rewards.append(total_reward)
            cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic,
                             save_critic_loss, save_reward)
            rewards = []
            loss = []
            if episode > 50:  #50 100
                try:
                    lr = sample_lr[int(episode // 50)]
                except (IndexError):
                    lr = 0.000001
            torch.save(ppo.actor.state_dict(), actor_path)
            torch.save(ppo.critic.state_dict(), critic_path)

        # the first step
        else:
            print('Iteration:', episode, "lr:", lr)
            state = torch.DoubleTensor(state).reshape(1, state_dim).to(device)
            state_img = generate_img(
            )  # numpy image: H x W x C (500, 500, 3) -> (3,500,500)
            tensor_cv = torch.from_numpy(np.transpose(
                state_img, (2, 0, 1))).double().to(
                    device
                )  # np.transpose( xxx,  (2, 0, 1)) torch image: C x H x W
            action = ppo.select_action(state, tensor_cv, memory)
            print("acceleration: ", action)
            send_to_GAMA([[1, float(action * 10)]])

        state, reward, done, time_pass, over = GAMA_connect(test)

    return None
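# ---------------------------------------------------------------------------
# The PPO examples above pass a Memory rollout buffer to ppo.update() and
# ppo.select_action(). A minimal sketch of a container with the attributes the
# code above touches; any further fields of the real class are not shown here.
# ---------------------------------------------------------------------------
class Memory:
    def __init__(self):
        self.states = []
        self.states_img = []
        self.states_next = []
        self.states_img_next = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.states[:]
        del self.states_img[:]
        del self.states_next[:]
        del self.states_img_next[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]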
def main():
    ################ load ###################
    #train_agent
    actor_train_path = os.path.abspath(
        os.curdir) + '/Generate_Traffic_Flow_MAS_RL/weight/AC_TD3_actor.pkl'
    critic_train_path = os.path.abspath(
        os.curdir) + '/Generate_Traffic_Flow_MAS_RL/weight/AC_TD3_critic.pkl'
    if os.path.exists(actor_train_path):
        actor_train = Actor(state_size, action_size).to(device)
        actor_train.load_state_dict(torch.load(actor_train_path))
        print('Actor_Train Model loaded')
    else:
        actor_train = Actor(state_size, action_size).to(device)
    if os.path.exists(critic_train_path):
        critic_train = Critic(state_size, action_size).to(device)
        critic_train.load_state_dict(torch.load(critic_train_path))
        print('Critic_Train Model loaded')
    else:
        critic_train = Critic(state_size, action_size).to(device)
    critic_next_train = Critic(state_size, action_size).to(device)
    critic_next_train.load_state_dict(critic_train.state_dict())
    #agents
    actor_path = os.path.abspath(
        os.curdir) + '/Generate_Traffic_Flow_MAS_RL/weight/AC_TD_MAS_actor.pkl'
    if os.path.exists(actor_path):
        actor = Actor(state_size, action_size).to(device)
        actor.load_state_dict(torch.load(actor_path))
        print('Actor Model loaded')

    print("Waiting for GAMA...")

    ################### initialization ########################
    reset()

    episode = 0

    training_stage = 65

    lr = 0.0001

    sample_lr = [
        0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003,
        0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005,
        0.000004, 0.000003, 0.000002, 0.000001
    ]
    if episode > training_stage:  #50 100
        try:
            lr = sample_lr[int(episode // training_stage)] * 0.01
        except (IndexError):
            lr = 0.000001 * 0.9  #* (0.9 ** ((episode-1000) // 60))

    optimizerA = optim.Adam(actor_train.parameters(), lr, betas=(0.95, 0.999))
    optimizerC = optim.Adam(critic_train.parameters(), lr, betas=(0.95, 0.999))

    values = []
    rewards = []
    masks = []
    total_loss = []
    total_rewards = []
    loss = []
    average_speed = []

    value = 0
    gama = 0.9
    over = 0
    log_prob = 0
    memory = Memory()

    A_T, state, reward, done, time_pass, over, average_speed_NPC = GAMA_connect(
    )
    print("Connected")
    ##################  start  #########################
    while over != 1:
        #training_agent
        if A_T == 0:
            # a normal step
            average_speed.append(state[0])
            if (done == 0 and time_pass != 0):
                # reward from the previous step
                reward = torch.tensor([reward],
                                      dtype=torch.float,
                                      device=device)
                rewards.append(reward)
                state = torch.DoubleTensor(state).reshape(
                    1, state_size).to(device)
                state_img = generate_img_train()
                tensor_cv = torch.from_numpy(np.transpose(
                    state_img, (2, 0, 1))).double().to(device) / 255
                if len(memory.states_next) == 0:
                    #for _ in range(3):
                    memory.states_next = memory.states
                    memory.states_next[2] = state
                    memory.states_img_next = memory.states_img
                    memory.states_img_next[2] = tensor_cv
                else:
                    del memory.states_next[:1]
                    del memory.states_img_next[:1]
                    memory.states_next.append(state)
                    memory.states_img_next.append(tensor_cv)

                state_next = torch.stack(
                    memory.states_next).to(device).detach()
                tensor_cv_next = torch.stack(
                    memory.states_img_next).to(device).detach()
                value_next, _, _, _ = critic_next_train(
                    state_next, tensor_cv_next, h_state_cv_c, h_state_n_c,
                    h_state_3_c)  #_next
                with torch.autograd.set_detect_anomaly(True):
                    # TD:r(s) +  gama*v(s+1) - v(s)
                    advantage = reward.detach(
                    ) + gama * value_next.detach() - value
                    actor_loss = -(log_prob * advantage.detach())
                    critic_loss = (reward.detach() +
                                   gama * value_next.detach() - value).pow(2)
                    optimizerA.zero_grad()
                    optimizerC.zero_grad()
                    critic_loss.backward()
                    actor_loss.backward()
                    loss.append(critic_loss)
                    optimizerA.step()
                    optimizerC.step()
                    critic_next_train.load_state_dict(
                        critic_train.state_dict())

                del memory.states[:1]
                del memory.states_img[:1]
                memory.states.append(state)
                memory.states_img.append(tensor_cv)
                state = torch.stack(memory.states).to(device).detach()
                tensor_cv = torch.stack(memory.states_img).to(device).detach()
                value, h_state_cv_c, h_state_n_c, h_state_3_c = critic_train(
                    state, tensor_cv, h_state_cv_c, h_state_n_c, h_state_3_c)
                action, log_prob = actor_train(state, tensor_cv)
                log_prob = log_prob.unsqueeze(0)

                send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])  # one row
                masks.append(
                    torch.tensor([1 - done], dtype=torch.float, device=device))
                values.append(value)

            # episode end
            elif done == 1:
                average_speed.append(state[0])
                send_to_GAMA([[1, 0]])
                # send first, then compute
                print(state)
                reward = torch.tensor([reward],
                                      dtype=torch.float,
                                      device=device)
                rewards.append(reward)  #contains the last
                total_reward = sum(rewards).cpu().detach().numpy()
                total_rewards.append(total_reward)

                with torch.autograd.set_detect_anomaly(True):
                    advantage = reward.detach(
                    ) - value  #+ last_value; V(s+1) of the final step = 0
                    actor_loss = -(log_prob * advantage.detach())
                    critic_loss = (reward.detach() - value).pow(
                        2)  #+ last_value

                    optimizerA.zero_grad()
                    optimizerC.zero_grad()

                    critic_loss.backward()
                    actor_loss.backward()
                    loss.append(critic_loss)

                    optimizerA.step()
                    optimizerC.step()

                    critic_next_train.load_state_dict(
                        critic_train.state_dict())

                print(
                    "----------------------------------Net_Trained---------------------------------------"
                )
                print('--------------------------Iteration:', episode,
                      'over--------------------------------')
                episode += 1
                values = []
                rewards = []
                loss_sum = sum(loss).cpu().detach().numpy()
                total_loss.append(loss_sum)
                cross_loss_curve(loss_sum.squeeze(0), total_reward,
                                 save_curve_pic, save_critic_loss, save_reward,
                                 np.mean(average_speed), save_speed,
                                 average_speed_NPC, save_NPC_speed)
                #total_loss,total_rewards#np.mean(average_speed)/10
                loss = []
                average_speed = []
                memory.clear_memory()

                torch.save(actor_train.state_dict(), actor_train_path)
                torch.save(critic_train.state_dict(), critic_train_path)

                if episode > training_stage:  #50 100
                    try:
                        lr = sample_lr[int(episode // training_stage)] * 0.01
                    except (IndexError):
                        lr = 0.000001 * 0.9  #* (0.9 ** ((episode-1000) // 60))

                optimizerA = optim.Adam(actor_train.parameters(),
                                        lr,
                                        betas=(0.95, 0.999))
                optimizerC = optim.Adam(critic_train.parameters(),
                                        lr,
                                        betas=(0.95, 0.999))

            # the first step
            if time_pass == 0:
                print('Iteration:', episode, "lr:", lr)
                state = np.reshape(state, (1, len(state)))
                state_img = generate_img_train()
                tensor_cv = torch.from_numpy(np.transpose(
                    state_img, (2, 0, 1))).double().to(device) / 255
                state = torch.DoubleTensor(state).reshape(
                    1, state_size).to(device)

                for _ in range(3):
                    memory.states.append(state)
                    memory.states_img.append(tensor_cv)

                state = torch.stack(memory.states).to(device).detach()  ###
                tensor_cv = torch.stack(memory.states_img).to(device).detach()
                value, h_state_cv_c, h_state_n_c, h_state_3_c = critic_train(
                    state,
                    tensor_cv)  # value is now a tensor; previously: dist, action = dist.sample()
                action, log_prob = actor_train(state, tensor_cv)
                print("acceleration: ", action.cpu().numpy())
                send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])

        #agents
        if A_T == 1:
            state = [
                torch.DoubleTensor(elem).reshape(1, state_size).to(device)
                for elem in state
            ]
            state = torch.stack(state).to(device).detach()
            tensor_cv_MAS = generate_img()
            tensor_cv_MAS = [
                torch.from_numpy(np.transpose(elem,
                                              (2, 0, 1))).double().to(device) /
                255 for elem in tensor_cv_MAS
            ]
            tensor_cv_MAS = torch.stack(tensor_cv_MAS).to(device).detach()

            action, _ = actor(state, tensor_cv_MAS)

            send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])

        A_T, state, reward, done, time_pass, over, average_speed_NPC = GAMA_connect(
        )

    return None
def main():

    ############## Hyperparameters ##############
    update_timestep = 1  # update policy every n timesteps; set to 1 for TD
    K_epochs = 2  # update policy for K epochs; too large an lr can give NaN?
    eps_clip = 0.2
    gamma = 0.9

    episode = 376

    lr_first = 0.0001
    lr = lr_first  #random_seed = None
    state_dim = 6
    action_dim = 1
    #(self, state_dim, action_dim, lr, betas, gamma, K_epochs, eps_clip)
    actor_path = os.getcwd(
    ) + '\\GAMA_python\\PPO_Navigation_Model\\weight\\ppo_TD_actor.pkl'
    critic_path = os.getcwd(
    ) + '\\GAMA_python\\PPO_Navigation_Model\\weight\\ppo_TD_critic.pkl'
    ################ load ###################
    if episode > 70:  #50 100
        lr_first = 0.00001
        lr = lr_first * (0.65**((episode - 60) // 10))
    ppo = PPO(state_dim, action_dim, lr, gamma, K_epochs, eps_clip)
    if os.path.exists(actor_path):
        ppo.actor.load_state_dict(torch.load(actor_path))
        print('Actor Model loaded')
    if os.path.exists(critic_path):
        ppo.critic.load_state_dict(torch.load(critic_path))
        print('Critic Model loaded')
    print("Waiting for GAMA...")

    ################### initialization ########################
    save_curve_pic = os.getcwd(
    ) + '\\GAMA_python\\PPO_Navigation_Model\\result\\PPO_TD_loss_curve.png'
    save_critic_loss = os.getcwd(
    ) + '\\GAMA_python\\PPO_Navigation_Model\\training_data\\PPO_TD_critic_loss.csv'
    save_reward = os.getcwd(
    ) + '\\GAMA_python\\PPO_Navigation_Model\\training_data\\PPO_TD_reward.csv'
    reset()
    memory = Memory()

    advantages = 0  #global value
    loss = []
    total_loss = []
    rewards = []
    total_rewards = []
    test = "GAMA"
    state, reward, done, time_pass, over = GAMA_connect(test)  #connect
    #[real_speed/10, target_speed/10, elapsed_time_ratio, distance_left/100,distance_front_car/10,distance_behind_car/10,reward,done,over]
    print("done:", done, "timepass:"******"state_last", state)
            send_to_GAMA([[1, 0]])
            rewards.append(reward)

            state = torch.DoubleTensor(state).reshape(1, 6).to(device)  # reshape into one row
            memory.states_next.append(state)
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            loss_ = ppo.update(memory, lr, advantages, done)
            loss.append(loss_)
            memory.clear_memory()

            print(
                "----------------------------------Net_Trained---------------------------------------"
            )
            print('--------------------------Iteration:', episode,
                  'over--------------------------------')
            episode += 1
            loss_sum = sum(loss).cpu().detach().numpy()
            total_loss.append(loss_sum)
            total_reward = sum(rewards)
            total_rewards.append(total_reward)
            cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic,
                             save_critic_loss, save_reward)
            rewards = []
            loss = []
            if episode > 70:  #50 100
                lr_first = 0.00001
                lr = lr_first * (0.65**((episode - 60) // 10))  #40 90
            torch.save(ppo.actor.state_dict(), actor_path)
            torch.save(ppo.critic.state_dict(), critic_path)

        # the first step
        else:
            print('Iteration:', episode)
            state = torch.DoubleTensor(state).reshape(1, 6).to(device)
            action = ppo.select_action(state, memory)
            print("acceleration: ", action)  #.cpu().numpy()
            send_to_GAMA([[1, float(action * 10)]])

        state, reward, done, time_pass, over = GAMA_connect(test)
    return None
def main():
    ################ load ###################
    actor_path = os.path.abspath(
        os.curdir) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD0_actor.pkl'
    critic_path = os.path.abspath(
        os.curdir
    ) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD0_critic.pkl'
    if os.path.exists(actor_path):
        actor = Actor(state_size, action_size).to(device)
        actor.load_state_dict(torch.load(actor_path))
        print('Actor Model loaded')
    else:
        actor = Actor(state_size, action_size).to(device)
    if os.path.exists(critic_path):
        critic = Critic(state_size, action_size).to(device)
        critic.load_state_dict(torch.load(critic_path))
        print('Critic Model loaded')
    else:
        critic = Critic(state_size, action_size).to(device)
    critic_next = Critic(state_size, action_size).to(device)
    critic_next.load_state_dict(critic.state_dict())
    print("Waiting for GAMA...")
    ################### initialization ########################
    reset()

    episode = 1257

    lr = 0.0001
    sample_lr = [
        0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003,
        0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005,
        0.000004, 0.000003, 0.000002, 0.000001
    ]
    if episode > 50:  #50 100
        try:
            lr = sample_lr[int(episode // 50)]
        except (IndexError):
            lr = 0.000001

    optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999))
    optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999))

    test = "GAMA"
    state, reward, done, time_pass, over = GAMA_connect(test)  #connect
    print("done:", done, "timepass:"******"----------------------------------Net_Trained---------------------------------------"
            )
            print('--------------------------Iteration:', episode,
                  'over--------------------------------')
            episode += 1
            log_probs = []
            values = []
            rewards = []
            masks = []
            loss_sum = sum(loss).cpu().detach().numpy()
            total_loss.append(loss_sum)
            cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic,
                             save_critic_loss,
                             save_reward)  #total_loss,total_rewards
            loss = []
            memory.clear_memory()
            if episode > 50:  #50 100
                try:
                    lr = sample_lr[int(episode // 50)]
                except (IndexError):
                    lr = 0.000001
                optimizerA = optim.Adam(actor.parameters(),
                                        lr,
                                        betas=(0.95, 0.999))
                optimizerC = optim.Adam(critic.parameters(),
                                        lr,
                                        betas=(0.95, 0.999))

            torch.save(actor.state_dict(), actor_path)
            torch.save(critic.state_dict(), critic_path)

        # the first step
        else:
            print('Iteration:', episode, "lr:", lr)
            state = np.reshape(state, (1, len(state)))  #xxx
            state_img = generate_img()
            tensor_cv = torch.from_numpy(np.transpose(
                state_img, (2, 0, 1))).double().to(device)
            state = torch.DoubleTensor(state).reshape(1, state_size).to(device)

            for _ in range(3):
                memory.states.append(state)
                memory.states_img.append(tensor_cv)
            state = torch.stack(memory.states).to(device).detach()  ###
            tensor_cv = torch.stack(memory.states_img).to(device).detach()
            value = critic(
                state,
                tensor_cv)  # value is now a tensor; previously: dist, action = dist.sample()
            action, log_prob, entropy = actor(state, tensor_cv)
            print("acceleration: ", action.cpu().numpy())
            send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])
            log_prob = log_prob.unsqueeze(0)
            entropy += entropy

        state, reward, done, time_pass, over = GAMA_connect(test)
    return None
def main():
    ################ load ###################
    actor_path = os.path.abspath(
        os.curdir
    ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\weight\\AC_TD_actor.pkl'
    critic_path = os.path.abspath(
        os.curdir
    ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\weight\\AC_TD_critic.pkl'
    if os.path.exists(actor_path):
        actor = Actor(state_size, action_size).to(device)
        actor.load_state_dict(torch.load(actor_path))
        print('Actor Model loaded')
    else:
        actor = Actor(state_size, action_size).to(device)
    if os.path.exists(critic_path):
        critic = Critic(state_size, action_size).to(device)
        critic.load_state_dict(torch.load(critic_path))
        print('Critic Model loaded')
    else:
        critic = Critic(state_size, action_size).to(device)
    critic_next = Critic(state_size, action_size).to(device)
    critic_next.load_state_dict(critic.state_dict())
    print("Waiting for GAMA...")
    ################### initialization ########################
    reset()

    episode = 237

    lr = 0.0001
    if episode > 50:
        lr = 0.00008
        new_lr = lr * (0.9**((episode - 40) // 10))
        #if episode > 110:
        #   lr = 0.000008
        #  new_lr = lr * (0.9 ** ((episode-90) // 10)) #40

    optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999))
    optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999))

    test = "GAMA"
    state, reward, done, time_pass, over = GAMA_connect(test)  #connect
    print("done:", done, "timepass:"******"restart acceleration: 0")
            send_to_GAMA([[1, 0]])
            # send first, then compute
            print(state)
            reward = torch.tensor([reward], dtype=torch.float, device=device)
            rewards.append(reward)  #contains the last
            total_reward = sum(rewards).cpu().detach().numpy()
            total_rewards.append(total_reward)

            #state = torch.FloatTensor(state).reshape(1,4).to(device)
            #last_value= critic(state)

            with torch.autograd.set_detect_anomaly(True):
                advantage = reward.detach(
                ) - value  #+ last_value; V(s+1) of the final step = 0
                actor_loss = -(log_prob * advantage.detach())
                critic_loss = (reward.detach() - value).pow(2)  #+ last_value
                lstm_loss = critic_loss

                optimizerA.zero_grad()
                optimizerC.zero_grad()

                critic_loss.backward()
                actor_loss.backward()
                loss.append(critic_loss)

                optimizerA.step()
                optimizerC.step()

                critic_next.load_state_dict(critic.state_dict())

            print(
                "----------------------------------Net_Trained---------------------------------------"
            )
            print('--------------------------Iteration:', episode,
                  'over--------------------------------')
            episode += 1
            log_probs = []
            values = []
            rewards = []
            masks = []
            loss_sum = sum(loss).cpu().detach().numpy()
            total_loss.append(loss_sum)
            cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic,
                             save_critic_loss,
                             save_reward)  #total_loss,total_rewards
            loss = []
            if episode > 50:
                lr = 0.00008
                new_lr = lr * (0.9**((episode - 40) // 10))
                #if episode > 110:
                #   lr = 0.000008
                #  new_lr = lr * (0.9 ** ((episode-90) // 10)) #40
                optimizerA = optim.Adam(actor.parameters(),
                                        new_lr,
                                        betas=(0.95, 0.999))
                optimizerC = optim.Adam(critic.parameters(),
                                        new_lr,
                                        betas=(0.95, 0.999))

            torch.save(actor.state_dict(), actor_path)
            torch.save(critic.state_dict(), critic_path)

        # the first step
        else:
            print('Iteration:', episode)
            state = np.reshape(state, (1, len(state)))  #xxx
            state_img = generate_img()
            tensor_cv = torch.from_numpy(np.transpose(
                state_img, (2, 0, 1))).double().to(device)
            state = torch.DoubleTensor(state).reshape(1, 6).to(device)
            value = critic(
                state,
                tensor_cv)  # value is now a tensor; previously: dist, action = dist.sample()
            action, log_prob, entropy = actor(state, tensor_cv)
            print("acceleration: ", action.cpu().numpy())
            send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])
            log_prob = log_prob.unsqueeze(0)
            entropy += entropy

        state, reward, done, time_pass, over = GAMA_connect(test)
    return None