# main(): actor-critic training entry point; the weight paths suggest the
# "A2C-TD single-car intersection" experiment.
#
# NOTE(review): this definition has been whitespace-collapsed onto two physical
# lines, and part of its body is replaced by a literal `******` (between the
# `print("done:", done, "timepass:"` call and the "restart acceleration: 0"
# string). The main loop header and the regular-step branch are therefore not
# visible, and the text below is not valid Python as it stands. It is kept
# byte-for-byte; restore it from version control before editing the logic.
#
# What the visible text shows:
#   * Actor/Critic are built and, if the hard-coded D:/ weight files exist,
#     their state dicts are loaded from them.
#   * reset() is called, two Adam optimizers are created with lr = 0.00007 and
#     betas = (0.95, 0.999), and the first GAMA_connect(test) is issued.
#   * Episode-end part: [[1, 0]] is sent to GAMA before any computation, the
#     advantage is `reward - value` (no bootstrap term), both losses are
#     back-propagated with retain_graph=True, both optimizers step, the weights
#     are saved and cross_loss_curve(total_loss, total_rewards) is called.
#   * The learning rate is re-derived from `episode` and both optimizers are
#     re-created. NOTE(review): with the indentation lost it cannot be told
#     whether that happens every episode or only when `episode > 50`.
#   * First-step part: the state is reshaped to (1, 4), the critic and actor are
#     evaluated and `action * 10` is sent to GAMA.
#
# NOTE(review): `rewards.append(reward)` appears twice in the episode-end part
# (raw value, then tensor), so the last reward seems to be counted twice in
# `total_reward` - confirm.
# NOTE(review): `entropy += entropy` doubles the value just returned by the
# actor rather than accumulating into a running total - confirm intent.
# NOTE(review): `rewards`, `total_rewards`, `loss`, `total_loss`, `value` and
# `log_prob` are not assigned in the visible text before use; presumably they
# are module-level or set in the elided part.
#
# English for the inline non-English notes: "先传后计算" = "send first, compute
# afterwards"; "最后一回的V(s+1) = 0" = "V(s+1) of the final step is 0";
# "最初の時" = "at the first step".
def main(): ################ load ################### if os.path.exists( 'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/actor.pkl' ): actor = Actor(state_size, action_size).to(device) actor.load_state_dict( torch.load( 'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/actor.pkl' )) print('Actor Model loaded') else: actor = Actor(state_size, action_size).to(device) if os.path.exists( 'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/critic.pkl' ): critic = Critic(state_size, action_size).to(device) critic.load_state_dict( torch.load( 'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/critic.pkl' )) print('Critic Model loaded') else: critic = Critic(state_size, action_size).to(device) print("Waiting for GAMA...") ################### initialization ######################## reset() lr = 0.00007 optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999)) episode = 0 test = "GAMA" state, reward, done, time_pass, over = GAMA_connect(test) #connect print("done:", done, "timepass:"******"restart acceleration: 0") send_to_GAMA([[1, 0]]) #先传后计算 rewards.append(reward) #contains the last reward = torch.tensor([reward], dtype=torch.float, device=device) rewards.append(reward) #contains the last total_reward = sum(rewards) total_rewards.append(total_reward) #state = torch.FloatTensor(state).reshape(1,4).to(device) #last_value= critic(state) with torch.autograd.set_detect_anomaly(True): advantage = reward.detach( ) - value #+ last_value 最后一回的V(s+1) = 0 actor_loss = -(log_prob * advantage.detach()) print("actor_loss, ", actor_loss, " size", actor_loss.dim()) critic_loss = (reward.detach() - value).pow(2) #+ last_value lstm_loss = critic_loss optimizerA.zero_grad() optimizerC.zero_grad() critic_loss.backward(retain_graph=True) actor_loss.backward(retain_graph=True) loss.append(critic_loss) optimizerA.step() 
optimizerC.step() print( "----------------------------------Net_Trained---------------------------------------" ) print('--------------------------Iteration:', episode, 'over--------------------------------') episode += 1 log_probs = [] values = [] rewards = [] masks = [] torch.save( actor.state_dict(), 'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/actor.pkl' ) torch.save( critic.state_dict(), 'D:/Software/PythonWork/GAMA_python/A2C-TD-single-car-intersection/weight/critic.pkl' ) loss_sum = sum(loss) total_loss.append(loss_sum) cross_loss_curve(total_loss, total_rewards) loss = [] if episode > 50: #50 lr = 0.0002 if episode > 115: lr = 0.0001 new_lr = lr * (0.94**((episode - 40) // 10)) #40 optimizerA = optim.Adam(actor.parameters(), new_lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic.parameters(), new_lr, betas=(0.95, 0.999)) #最初の時 else: print('Iteration:', episode) state = np.reshape(state, (1, len(state))) #xxx state = torch.FloatTensor(state).reshape(1, 4).to(device) value = critic( state) #dist, # now is a tensoraction = dist.sample() action, log_prob, entropy = actor(state) print("acceleration: ", action.cpu().numpy()) send_to_GAMA([[1, float(action.cpu().numpy() * 10)]]) log_prob = log_prob.unsqueeze(0) entropy += entropy state, reward, done, time_pass, over = GAMA_connect(test) return None
# main(): actor-critic training entry point that exchanges actions with GAMA
# through text files written with np.savetxt and updates once per episode from
# compute_returns().
#
# NOTE(review): this definition has been whitespace-collapsed onto three
# physical lines, and part of its body is replaced by a literal `******`
# (between `print("done:",done,"timepass:"` and the "acceleration: " string).
# The loop header and the start of the regular-step branch are not visible and
# the text is not valid Python as it stands. It is kept byte-for-byte; restore
# it from version control before editing the logic.
#
# What the visible text shows:
#   * Actor/Critic are built and optionally loaded from
#     'D:/Software/GamaWorkspace/Python/weight/{actor,critic}.pkl'.
#   * `lr` is used for the Adam optimizers but never assigned in the visible
#     text (presumably a module-level value - verify).
#   * Regular step: [[1, action * 10]] is written to both `from_python_1` and
#     `from_python_2`; the reward, the mask `1 - done`, log_prob and value are
#     appended to their per-episode lists.
#   * `done == 1`: [[1, 0]] is written to both files, returns are obtained from
#     compute_returns(last_value, rewards, masks), the advantage is
#     `returns - values`, both losses use `.mean()`, both optimizers step and
#     the weights are saved.
#   * The optimizers are re-created with
#     `new_lr = lr * 0.92 ** ((episode - 80) // 10)`. NOTE(review): whether only
#     for `episode > 90` or on every episode cannot be told from the collapsed
#     text.
#
# NOTE(review): `loss.detach()` discards its result, so `loss` keeps its graph
# when appended to `total_loss` - confirm.
# NOTE(review): in the first-step branch the value written is
# `action.cpu().numpy()` without the `* 10` used by the print and by the
# regular-step branch, and np.savetxt targets `from_python_1` twice and never
# `from_python_2` - both look unintended; confirm.
# NOTE(review): `entropy += entropy` doubles the value just returned by the
# actor rather than accumulating into a running total - confirm intent.
#
# English for the inline non-English notes: "行" = "row"; "前回の報酬" = "reward
# of the previous step"; "終わり" = "end (episode finished)"; "先传后计算" =
# "send first, compute afterwards"; "最初の時" = "at the first step".
def main(): ################ load ################### if os.path.exists('D:/Software/GamaWorkspace/Python/weight/actor.pkl'): actor = Actor(state_size, action_size).to(device) actor.load_state_dict(torch.load('D:/Software/GamaWorkspace/Python/weight/actor.pkl')) print('Actor Model loaded') else: actor = Actor(state_size, action_size).to(device) if os.path.exists('D:/Software/GamaWorkspace/Python/weight/critic.pkl'): critic = Critic(state_size, action_size).to(device) critic.load_state_dict(torch.load('D:/Software/GamaWorkspace/Python/weight/critic.pkl')) print('Critic Model loaded') else: critic = Critic(state_size, action_size).to(device) print("Waiting for GAMA...") ################### initialization ######################## reset() optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999))#optim.Adam(actor.parameters()) optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999))#optim.Adam(critic.parameters()) episode = 0 test = "GAMA" state,reward,done,time_pass,over = GAMA_connect(test) print("done:",done,"timepass:"******"acceleration: ",action.cpu().numpy())#,"action.cpu().numpy()",type(float(action.cpu().numpy())) to_GAMA = [[1,float(action.cpu().numpy()*10)]] #行 np.savetxt(from_python_1,to_GAMA,delimiter=',') np.savetxt(from_python_2,to_GAMA,delimiter=',') #前回の報酬 rewards.append(torch.tensor([reward], dtype=torch.float, device=device)) #contains the last masks.append(torch.tensor([1-done], dtype=torch.float, device=device)) #over-0; otherwise-1 contains the last log_prob = log_prob.unsqueeze(0) #log_prob = dist.log_prob(action).unsqueeze(0) # entropy += dist.entropy().mean() log_probs.append(log_prob) values.append(value) entropy += entropy # 終わり elif done == 1: print("restart acceleration: 0") to_GAMA = [[1,0]] np.savetxt(from_python_1,to_GAMA,delimiter=',') np.savetxt(from_python_2,to_GAMA,delimiter=',') #先传后计算 rewards.append(torch.tensor([reward], dtype=torch.float, device=device)) #contains the last 
masks.append(torch.tensor([1-done], dtype=torch.float, device=device)) #over-0; otherwise-1 contains the last total_reward = sum(rewards) total_rewards.append(total_reward) last_state = torch.FloatTensor(state).to(device) last_value = critic(last_state) returns = compute_returns(last_value, rewards, masks) values_next = returns[1:]#values[1:] values_next.append(torch.tensor([0], dtype=torch.float, device=device)) log_probs = torch.cat(log_probs,1).squeeze() #Concatenates the given sequence of seq tensors in the given dimension. returns = torch.cat(returns).detach() values = torch.cat(values) values_next = torch.cat(values_next) rewards = torch.cat(rewards) # TD:r(s) + v(s+1) - v(s) #rewards.detach() + values_next - values r(s) MC: returns.detach() - values??? advantage = returns.detach() - values actor_loss = -(log_probs * advantage.detach()).mean() loss = advantage.pow(2).sum() loss.detach() critic_loss = (returns.detach() - values).pow(2).mean() optimizerA.zero_grad() optimizerC.zero_grad() actor_loss.backward() critic_loss.backward() optimizerA.step() optimizerC.step() print("--------------------------Net_Trained-------------------------------") print('--------------------------Iteration:',episode,'over--------------------------------') episode += 1 log_probs = [] values = [] rewards = [] masks = [] torch.save(actor.state_dict(), 'D:/Software/GamaWorkspace/Python/weight/actor.pkl') torch.save(critic.state_dict(), 'D:/Software/GamaWorkspace/Python/weight/critic.pkl') #print("entropy: ",entropy,"total_rewards:",total_rewards) entropys.append(entropy) total_loss.append(loss) if(episode!=0): cross_loss_curve(total_loss,total_rewards) loss = 0 if episode > 90 : new_lr = lr * (0.92 ** ((episode-80) // 10)) optimizerA = optim.Adam(actor.parameters(), new_lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic.parameters(), new_lr, betas=(0.95, 0.999)) #最初の時 else: print('Iteration:',episode) state = torch.FloatTensor(state).to(device) value = critic(state) #dist, # now 
is a tensoraction = dist.sample() action,log_prob,entropy = actor(state) print("acceleration: ",float(action.cpu().numpy()*10)) to_GAMA = [[1,action.cpu().numpy()]] np.savetxt(from_python_1,to_GAMA,delimiter=',') np.savetxt(from_python_1,to_GAMA,delimiter=',') log_prob = log_prob.unsqueeze(0) #log_prob = dist.log_prob(action).unsqueeze(0) #entropy += dist.entropy().mean() log_probs.append(log_prob) values.append(value) entropy += entropy state,reward,done,time_pass,over = GAMA_connect(test) return None #[action,log_prob_return,value]
def _stack_agent_inputs(state):
    """Turn one GAMA observation into the two batched tensors the actor takes.

    Args:
        state: iterable with one numeric observation per agent, each holding
            ``state_size`` values.

    Returns:
        ``(state_batch, image_batch)``: the per-agent observations stacked
        along a new leading dimension, and the frames returned by
        ``generate_img()`` converted from H x W x C to C x H x W, scaled by
        1/255 and stacked the same way. Both are detached and on ``device``.
    """
    state_batch = torch.stack([
        torch.DoubleTensor(elem).reshape(1, state_size).to(device)
        for elem in state
    ]).to(device).detach()
    # generate_img() is called after the state conversion, as in the original
    # statement order.
    image_batch = torch.stack([
        torch.from_numpy(np.transpose(frame, (2, 0, 1))).double().to(device) / 255
        for frame in generate_img()
    ]).to(device).detach()
    return state_batch, image_batch


def main():
    """Drive the GAMA agents with a (pre-trained) actor; no training happens here.

    The actor is created and, if ``AC_TD2_actor.pkl`` exists under
    ``./Generate_Traffic_Flow_MAS_RL/weight``, its weights are loaded. The
    function then loops forever: it reads a state from GAMA, runs the actor and
    sends ``[[1, action * 10]]`` back.

    When ``using_lstm`` is enabled, the hidden states returned by the actor are
    cached in ``n_agent`` round-robin slots: the first ``n_agent`` steps fill
    the slots, and every later step feeds slot ``count % n_agent`` back into
    the actor and overwrites it with the new hidden state.

    The loop has no exit condition, so this function does not return.
    """
    ################ load ###################
    actor_path = os.path.join(os.path.abspath(os.curdir),
                              'Generate_Traffic_Flow_MAS_RL', 'weight',
                              'AC_TD2_actor.pkl')
    actor = Actor(state_size, action_size).to(device)
    if os.path.exists(actor_path):
        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
        actor.load_state_dict(torch.load(actor_path, map_location=device))
        print('Actor Model loaded')
    print("Waiting for GAMA...")

    ################### initialization ########################
    reset()
    using_lstm = False
    test = "GAMA"
    n_agent = 20
    list_hidden = []  # one Memory(h_state_cv_a, h_state_n_a) per slot
    count = 0

    ################## start #########################
    state = GAMA_connect(test)
    print("Connected")
    while True:
        state, tensor_cv = _stack_agent_inputs(state)
        warming_up = len(list_hidden) < n_agent

        # Pure rollout: no backward pass is ever run, so do not build autograd
        # graphs (the cached hidden states would otherwise keep them alive).
        with torch.no_grad():
            if not using_lstm or warming_up:
                action, h_state_cv_a, h_state_n_a = actor(state, tensor_cv)
            else:
                slot = list_hidden[count % n_agent]
                action, h_state_cv_a, h_state_n_a = actor(
                    state, tensor_cv, slot.h_state_cv_a, slot.h_state_n_a)

        # The action is sent before the hidden-state bookkeeping, as before.
        send_to_GAMA([[1, float(action.cpu().numpy() * 10)]])

        if using_lstm:
            if warming_up:
                list_hidden.append(Memory(h_state_cv_a, h_state_n_a))
            else:
                list_hidden[count % n_agent].set_hidden(
                    h_state_cv_a, h_state_n_a)
            count += 1

        state = GAMA_connect(test)
# main(): actor-critic training entry point for the mixed (state vector +
# image) input model; weights live under
# ./PPO_Mixedinput_Navigation_Model/weight/AC_TD3_{actor,critic}.pkl.
#
# NOTE(review): this definition has been whitespace-collapsed onto two physical
# lines, and part of its body is replaced by a literal `******` (between
# `print("done:", done, "timepass:"` and the "Net_Trained" banner string). The
# loop header and both training branches are not visible and the text is not
# valid Python as it stands. It is kept byte-for-byte; restore it from version
# control before editing the logic.
#
# What the visible text shows:
#   * Actor and Critic are built and optionally loaded; `critic_next` is a
#     second Critic initialised with a copy of `critic`'s state dict.
#   * `episode` starts at 4000 and `training_stage` is 70. The learning rate is
#     `sample_lr[episode // training_stage]`; on IndexError it falls back to
#     `0.000001 * 0.9 ** (episode - Decay // training_stage)`.
#     NOTE(review): by operator precedence `Decay // training_stage` is
#     evaluated first (= 18), so the exponent is `episode - 18`; the
#     parenthesisation looks unintended - confirm.
#   * GAMA_connect(test) returns six values in this variant.
#   * First-step branch: the image is transposed to C x H x W and divided by
#     255; `memory.states` / `memory.states_img` are filled with `Memory_size`
#     copies of the first observation, then stacked; the critic returns a value
#     plus three hidden states; the actor returns (action, log_prob, entropy);
#     `action * 10` is sent to GAMA.
#
# NOTE(review): `memory` and `Memory_size` are not assigned in the visible text;
# presumably module-level or set in the elided part.
#
# English for the inline non-English note: "最初の時" = "at the first step".
def main(): ################ load ################### actor_path = os.path.abspath( os.curdir) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD3_actor.pkl' critic_path = os.path.abspath( os.curdir ) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD3_critic.pkl' if os.path.exists(actor_path): actor = Actor(state_size, action_size).to(device) actor.load_state_dict(torch.load(actor_path)) print('Actor Model loaded') else: actor = Actor(state_size, action_size).to(device) if os.path.exists(critic_path): critic = Critic(state_size, action_size).to(device) critic.load_state_dict(torch.load(critic_path)) print('Critic Model loaded') else: critic = Critic(state_size, action_size).to(device) critic_next = Critic(state_size, action_size).to(device) critic_next.load_state_dict(critic.state_dict()) print("Waiting for GAMA...") ################### initialization ######################## reset() episode = 4000 training_stage = 70 #100#80 Decay = training_stage * 18 lr = 0.0001 sample_lr = [ 0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003, 0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005, 0.000004, 0.000003, 0.000002, 0.000001 ] #900 960 1020 1080 1140 if episode >= training_stage: #50 100 try: lr = sample_lr[int(episode // training_stage)] except (IndexError): lr = 0.000001 * (0.9**((episode - Decay // training_stage)) ) #100-1800#80-1440#65-1170#570 -- 30 optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999)) test = "GAMA" state, reward, done, time_pass, over, _ = GAMA_connect(test) #connect print("done:", done, "timepass:"******"----------------------------------Net_Trained---------------------------------------" ) print('--------------------------Iteration:', episode, 'over--------------------------------') episode += 1 #最初の時 else: print('Iteration:', episode, "lr:", lr) state = np.reshape(state, (1, len(state))) #xxx state_img = generate_img() tensor_cv 
= torch.from_numpy(np.transpose( state_img, (2, 0, 1))).double().to(device) / 255 state = torch.DoubleTensor(state).reshape(1, state_size).to(device) for _ in range(Memory_size): memory.states.append(state) memory.states_img.append(tensor_cv) state = torch.stack(memory.states).to(device).detach() ### tensor_cv = torch.stack(memory.states_img).to(device).detach() value, h_state_cv_c, h_state_n_c, h_state_3_c = critic( state, tensor_cv) #dist, # now is a tensoraction = dist.sample() action, log_prob, entropy = actor( state, tensor_cv) #, h_state_cv_a,h_state_n_a,h_state_3_a print("acceleration: ", action.cpu().numpy()) send_to_GAMA([[1, float(action.cpu().numpy() * 10)]]) log_prob = log_prob.unsqueeze(0) #entropy += entropy state, reward, done, time_pass, over, average_speed_NPC = GAMA_connect( test) return None
# main(): PPO training entry point for the mixed-input navigation model (the
# file names suggest the Monte-Carlo variant, "ppo_MC").
#
# NOTE(review): this definition has been whitespace-collapsed onto two physical
# lines, and part of its body is replaced by a literal `******` (between
# `print("done:", done, "timepass:"` and the "Net_Trained" banner string). The
# loop header, the regular-step branch and the start of the episode-end branch
# are not visible and the text is not valid Python as it stands. It is kept
# byte-for-byte; restore it from version control before editing the logic.
#
# What the visible text shows:
#   * Hyperparameters: K_epochs = 3, eps_clip = 0.2, gamma = 0.9, episode = 3,
#     lr_first = 0.00001, state_dim = 6, action_dim = 1.
#   * All paths are built as os.getcwd() plus a backslash-separated suffix
#     (Windows-style separators).
#   * A PPO object is created with (state_dim, action_dim, lr, gamma, K_epochs,
#     eps_clip) and its actor/critic state dicts are loaded when the files
#     exist.
#   * Episode-end part: the summed loss and summed reward are handed to
#     cross_loss_curve(...) together with the three save paths, the lists are
#     reset and the actor/critic weights are saved.
#   * First-step branch: the state is reshaped to (1, 6); the image is
#     transposed to C x H x W (not divided by 255 in this variant);
#     ppo.select_action(state, tensor_cv, memory) picks the action and
#     `action * 10` is sent to GAMA.
#
# NOTE(review): the start-up schedule decays `lr` with base 0.7 and offset 20,
# while the per-episode schedule uses base 0.94 - confirm which is intended.
# Whether the updated `lr` reaches the optimizer is decided in the elided part.
#
# English for the inline non-English notes: "lr太大会出现NAN?" = "does too large
# an lr produce NaN?"; "要较弱;较强关联? 对每一正确步也有打击" = roughly "should
# be weaker; strong correlation? it also penalises every correct step";
# "最初の時" = "at the first step".
def main(): ############## Hyperparameters ############## K_epochs = 3 # update policy for K epochs lr太大会出现NAN? eps_clip = 0.2 gamma = 0.9 # 要较弱;较强关联? 对每一正确步也有打击 episode = 3 lr_first = 0.00001 lr = lr_first #random_seed = None state_dim = 6 action_dim = 1 #(self, state_dim, action_dim, lr, betas, gamma, K_epochs, eps_clip) actor_path = os.getcwd( ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\weight\\ppo_MC_actor.pkl' critic_path = os.getcwd( ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\weight\\ppo_MC_critic.pkl' ################ load ################### if episode > 30: #50 100 lr_first = 0.00001 lr = lr_first * (0.7**((episode - 20) // 10)) ppo = PPO(state_dim, action_dim, lr, gamma, K_epochs, eps_clip) if os.path.exists(actor_path): ppo.actor.load_state_dict(torch.load(actor_path)) print('Actor Model loaded') if os.path.exists(critic_path): ppo.critic.load_state_dict(torch.load(critic_path)) print('Critic Model loaded') print("Waiting for GAMA...") ################### initialization ######################## save_curve_pic = os.getcwd( ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\result\\PPO_MC_loss_curve.png' save_critic_loss = os.getcwd( ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\training_data\\PPO_MC_critic_loss.csv' save_reward = os.getcwd( ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\training_data\\PPO_MC_reward.csv' reset() memory = Memory() advantages = 0 #global value loss = [] total_loss = [] rewards = [] total_rewards = [] test = "GAMA" state, reward, done, time_pass, over = GAMA_connect(test) #connect #[real_speed/10, target_speed/10, elapsed_time_ratio, distance_left/100,distance_front_car/10,distance_behind_car/10,reward,done,over] print("done:", done, "timepass:"******"----------------------------------Net_Trained---------------------------------------" ) print('--------------------------Iteration:', episode, 'over--------------------------------') episode += 1 loss_sum = sum(loss).cpu().detach().numpy() 
total_loss.append(loss_sum) total_reward = sum(rewards) total_rewards.append(total_reward) cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic, save_critic_loss, save_reward) rewards = [] loss = [] if episode > 30: #50 100 lr = lr_first * (0.94**((episode - 20) // 10)) #if episode > 80: # lr_first = 0.0001 # lr = lr_first * (0.94 ** ((episode-70) // 10)) torch.save(ppo.actor.state_dict(), actor_path) torch.save(ppo.critic.state_dict(), critic_path) #最初の時 else: print('Iteration:', episode) state = torch.DoubleTensor(state).reshape(1, 6).to(device) state_img = generate_img( ) # numpy image: H x W x C (500, 500, 3) -> (3,500,500) tensor_cv = torch.from_numpy(np.transpose( state_img, (2, 0, 1))).double().to( device ) # np.transpose( xxx, (2, 0, 1)) torch image: C x H x W action = ppo.select_action(state, tensor_cv, memory) print("acceleration: ", action) #.cpu().numpy() send_to_GAMA([[1, float(action.cpu().numpy() * 10)]]) state, reward, done, time_pass, over = GAMA_connect(test) return None
# main(): PPO training entry point with a per-step (TD) update for the
# mixed-input navigation model (file names: "ppo_TD2lstm").
#
# NOTE(review): this definition has been whitespace-collapsed onto three
# physical lines, and part of its body is replaced by a literal `******`
# (between `print("done:", done, "timepass:"` and the `"state ",state)` text).
# The loop header and the start of the regular-step branch are not visible and
# the text is not valid Python as it stands. It is kept byte-for-byte; restore
# it from version control before editing the logic.
#
# What the visible text shows:
#   * Hyperparameters: K_epochs = 4, eps_clip = 0.2, gamma = 0.9,
#     episode = 512, state_dim = 5, action_dim = 1. For `episode > 50` the
#     learning rate is `sample_lr[episode // 50]`, falling back to 0.000001 on
#     IndexError; the same rule is re-applied after every finished episode.
#   * Regular step: the reward and `done` are appended to the memory; the state
#     and the C x H x W image tensor are built (no /255 here); the
#     `states_next` / `states_img_next` windows are initialised on first use or
#     otherwise rolled by dropping the oldest entry and appending the new one;
#     ppo.update(memory, lr, advantages, done) is called; the logprobs, rewards
#     and is_terminals lists are emptied; a new action is selected and
#     `action * 10` is sent to GAMA.
#   * `done == 1`: [[1, 0]] is sent first, the windows are rolled once more,
#     ppo.update(...) is called, memory.clear_memory() is invoked, the curves
#     are written through cross_loss_curve(...) and the weights are saved.
#   * First-step branch: the state/image tensors are built and the first action
#     is selected and sent.
#
# NOTE(review): `memory.states_next = memory.states` binds the same list object
# rather than a copy, so the following `memory.states_next[2] = state` also
# writes into `memory.states` (likewise for the image lists). The enclosing
# `for _ in range(3)` presumably just repeats those assignments - confirm the
# intent.
#
# English for the inline non-English notes: "lr太大会出现NAN?" = "does too large
# an lr produce NaN?"; "終わり" = "end (episode finished)"; "先传后计算" = "send
# first, compute afterwards"; "转化成1行" = "convert into one row"; "最初の時" =
# "at the first step".
def main(): ############## Hyperparameters ############## update_timestep = 1 #TD use == 1 # update policy every n timesteps set for TD K_epochs = 4 # update policy for K epochs lr太大会出现NAN? eps_clip = 0.2 gamma = 0.9 episode = 512 sample_lr = [ 0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003, 0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005, 0.000004, 0.000003, 0.000002, 0.000001 ] lr = 0.0001 #random_seed = None state_dim = 5 action_dim = 1 #(self, state_dim, action_dim, lr, betas, gamma, K_epochs, eps_clip) actor_path = os.getcwd( ) + '/PPO_Mixedinput_Navigation_Model/weight/ppo_TD2lstm_actor.pkl' critic_path = os.getcwd( ) + '/PPO_Mixedinput_Navigation_Model/weight/ppo_TD2lstm_critic.pkl' ################ load ################### if episode > 50: #50 100 try: lr = sample_lr[int(episode // 50)] except (IndexError): lr = 0.000001 ppo = PPO(state_dim, action_dim, lr, gamma, K_epochs, eps_clip) if os.path.exists(actor_path): ppo.actor.load_state_dict(torch.load(actor_path)) print('Actor Model loaded') if os.path.exists(critic_path): ppo.critic.load_state_dict(torch.load(critic_path)) print('Critic Model loaded') print("Waiting for GAMA...") ################### initialization ######################## save_curve_pic = os.getcwd( ) + '/PPO_Mixedinput_Navigation_Model/result/PPO_2LSTM_loss_curve.png' save_critic_loss = os.getcwd( ) + '/PPO_Mixedinput_Navigation_Model/training_data/PPO_TD2_critic_loss.csv' save_reward = os.getcwd( ) + '/PPO_Mixedinput_Navigation_Model/training_data/PPO_TD2_reward.csv' reset() memory = Memory() advantages = 0 #global value loss = [] total_loss = [] rewards = [] total_rewards = [] test = "GAMA" state, reward, done, time_pass, over = GAMA_connect(test) #connect #[real_speed/10, target_speed/10, elapsed_time_ratio, distance_left/100,distance_front_car/10,distance_behind_car/10,reward,done,over] print("done:", done, "timepass:"******"state ",state) rewards.append(reward) memory.rewards.append(reward) 
memory.is_terminals.append(done) state = torch.DoubleTensor(state).reshape(1, state_dim).to(device) state_img = generate_img() tensor_cv = torch.from_numpy(np.transpose( state_img, (2, 0, 1))).double().to(device) if len(memory.states_next) == 0: for _ in range(3): memory.states_next = memory.states memory.states_next[2] = state memory.states_img_next = memory.states_img memory.states_img_next[2] = tensor_cv else: del memory.states_next[:1] del memory.states_img_next[:1] memory.states_next.append(state) memory.states_img_next.append(tensor_cv) loss_ = ppo.update(memory, lr, advantages, done) loss.append(loss_) del memory.logprobs[:] del memory.rewards[:] del memory.is_terminals[:] #memory.clear_memory() action = ppo.select_action(state, tensor_cv, memory) send_to_GAMA([[1, float(action * 10)]]) #print("acceleration ",float(action)) # 終わり elif done == 1: #先传后计算 print("state_last", state) send_to_GAMA([[1, 0]]) rewards.append(reward) del memory.states_next[:1] del memory.states_img_next[:1] state = torch.DoubleTensor(state).reshape(1, state_dim).to( device) #转化成1行 memory.states_next.append(state) state_img = generate_img() tensor_cv = torch.from_numpy(np.transpose( state_img, (2, 0, 1))).double().to(device) memory.states_img_next.append(tensor_cv) memory.rewards.append(reward) memory.is_terminals.append(done) loss_ = ppo.update(memory, lr, advantages, done) loss.append(loss_) memory.clear_memory() print( "----------------------------------Net_Trained---------------------------------------" ) print('--------------------------Iteration:', episode, 'over--------------------------------') episode += 1 loss_sum = sum(loss).cpu().detach().numpy() total_loss.append(loss_sum) total_reward = sum(rewards) total_rewards.append(total_reward) cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic, save_critic_loss, save_reward) rewards = [] loss = [] if episode > 50: #50 100 try: lr = sample_lr[int(episode // 50)] except (IndexError): lr = 0.000001 
torch.save(ppo.actor.state_dict(), actor_path) torch.save(ppo.critic.state_dict(), critic_path) #最初の時 else: print('Iteration:', episode, "lr:", lr) state = torch.DoubleTensor(state).reshape(1, state_dim).to(device) state_img = generate_img( ) # numpy image: H x W x C (500, 500, 3) -> (3,500,500) tensor_cv = torch.from_numpy(np.transpose( state_img, (2, 0, 1))).double().to( device ) # np.transpose( xxx, (2, 0, 1)) torch image: C x H x W action = ppo.select_action(state, tensor_cv, memory) print("acceleration: ", action) send_to_GAMA([[1, float(action * 10)]]) state, reward, done, time_pass, over = GAMA_connect(test) return None
# main(): trains one agent with a TD actor-critic ("*_train" networks) while a
# second, load-only actor drives the remaining agents; GAMA tells the two cases
# apart through the `A_T` flag returned by GAMA_connect().
#
# NOTE(review): this definition has been whitespace-collapsed onto four
# physical lines, so block nesting is no longer recoverable from the text and
# the lines are not valid Python as they stand. They are kept byte-for-byte;
# restore the file from version control before editing the logic.
#
# What the visible text shows:
#   * Setup: `actor_train` / `critic_train` are built and optionally loaded
#     from ./Generate_Traffic_Flow_MAS_RL/weight/AC_TD3_{actor,critic}.pkl;
#     `critic_next_train` starts as a copy of `critic_train`. For
#     `episode > training_stage` (65) the learning rate is
#     `sample_lr[episode // training_stage] * 0.01`, with `0.000001 * 0.9` as
#     the IndexError fallback.
#   * The loop runs `while over != 1`.
#   * `A_T == 0`, regular step (`done == 0 and time_pass != 0`): the state and
#     the /255-scaled image tensor are built and the `states_next` windows are
#     initialised or rolled. `critic_next_train` evaluates V(s'); the advantage
#     is `reward + gama * V(s') - value`; both losses are back-propagated and
#     both optimizers step. `critic_next_train` is then re-synchronised from
#     `critic_train`, the `states` windows are rolled, and the critic/actor are
#     evaluated for the next action, which is sent as `action * 10`.
#   * `A_T == 0`, `done == 1`: [[1, 0]] is sent first; the update uses
#     `reward - value` (no bootstrap term); the curves are written through
#     cross_loss_curve(...), the memory is cleared, the weights are saved, the
#     learning rate is re-derived and both optimizers are re-created.
#   * `A_T == 0`, `time_pass == 0`: the windows are filled with three copies of
#     the first observation and the first action is sent.
#   * `A_T == 1`: per-agent states and images are stacked and `actor` picks the
#     action that is sent to GAMA.
#
# NOTE(review): `actor` is only created when `actor_path` exists; otherwise the
# `A_T == 1` branch would hit an unbound name - confirm that the weight file is
# a hard requirement.
# NOTE(review): in the `done == 1` part `rewards.append(reward)` appears twice
# (raw value, then tensor), so the last reward seems to be counted twice in
# `total_reward` - confirm.
# NOTE(review): `memory.states_next = memory.states` binds the same list object
# rather than a copy, so later writes through one name are visible through the
# other (likewise for the image lists) - confirm the intent.
# NOTE(review): it cannot be told from the collapsed text whether the
# optimizers are re-created on every finished episode or only when
# `episode > training_stage`.
# NOTE(review): `save_curve_pic`, `save_critic_loss`, `save_reward`,
# `save_speed` and `save_NPC_speed` are not assigned in this body; presumably
# module-level values.
#
# English for the inline non-English notes: "普通の場合" = "normal case";
# "前回の報酬" = "reward of the previous step"; "行" = "row"; "終わり" = "end
# (episode finished)"; "先传后计算" = "send first, compute afterwards";
# "最后一回的V(s+1) = 0" = "V(s+1) of the final step is 0"; "最初の時" = "at the
# first step".
def main(): ################ load ################### #train_agent actor_train_path = os.path.abspath( os.curdir) + '/Generate_Traffic_Flow_MAS_RL/weight/AC_TD3_actor.pkl' critic_train_path = os.path.abspath( os.curdir) + '/Generate_Traffic_Flow_MAS_RL/weight/AC_TD3_critic.pkl' if os.path.exists(actor_train_path): actor_train = Actor(state_size, action_size).to(device) actor_train.load_state_dict(torch.load(actor_train_path)) print('Actor_Train Model loaded') else: actor_train = Actor(state_size, action_size).to(device) if os.path.exists(critic_train_path): critic_train = Critic(state_size, action_size).to(device) critic_train.load_state_dict(torch.load(critic_train_path)) print('Critic_Train Model loaded') else: critic_train = Critic(state_size, action_size).to(device) critic_next_train = Critic(state_size, action_size).to(device) critic_next_train.load_state_dict(critic_train.state_dict()) #agents actor_path = os.path.abspath( os.curdir) + '/Generate_Traffic_Flow_MAS_RL/weight/AC_TD_MAS_actor.pkl' if os.path.exists(actor_path): actor = Actor(state_size, action_size).to(device) actor.load_state_dict(torch.load(actor_path)) print('Actor Model loaded') print("Waiting for GAMA...") ################### initialization ######################## reset() episode = 0 training_stage = 65 lr = 0.0001 sample_lr = [ 0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003, 0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005, 0.000004, 0.000003, 0.000002, 0.000001 ] if episode > training_stage: #50 100 try: lr = sample_lr[int(episode // training_stage)] * 0.01 except (IndexError): lr = 0.000001 * 0.9 #* (0.9 ** ((episode-1000) // 60)) optimizerA = optim.Adam(actor_train.parameters(), lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic_train.parameters(), lr, betas=(0.95, 0.999)) values = [] rewards = [] masks = [] total_loss = [] total_rewards = [] loss = [] average_speed = [] value = 0 gama = 0.9 over = 0 log_prob = 0 memory = Memory() A_T, 
state, reward, done, time_pass, over, average_speed_NPC = GAMA_connect( ) print("Connected") ################## start ######################### while over != 1: #training_agent if A_T == 0: #普通の場合 average_speed.append(state[0]) if (done == 0 and time_pass != 0): #前回の報酬 reward = torch.tensor([reward], dtype=torch.float, device=device) rewards.append(reward) state = torch.DoubleTensor(state).reshape( 1, state_size).to(device) state_img = generate_img_train() tensor_cv = torch.from_numpy(np.transpose( state_img, (2, 0, 1))).double().to(device) / 255 if len(memory.states_next) == 0: #for _ in range(3): memory.states_next = memory.states memory.states_next[2] = state memory.states_img_next = memory.states_img memory.states_img_next[2] = tensor_cv else: del memory.states_next[:1] del memory.states_img_next[:1] memory.states_next.append(state) memory.states_img_next.append(tensor_cv) state_next = torch.stack( memory.states_next).to(device).detach() tensor_cv_next = torch.stack( memory.states_img_next).to(device).detach() value_next, _, _, _ = critic_next_train( state_next, tensor_cv_next, h_state_cv_c, h_state_n_c, h_state_3_c) #_next with torch.autograd.set_detect_anomaly(True): # TD:r(s) + gama*v(s+1) - v(s) advantage = reward.detach( ) + gama * value_next.detach() - value actor_loss = -(log_prob * advantage.detach()) critic_loss = (reward.detach() + gama * value_next.detach() - value).pow(2) optimizerA.zero_grad() optimizerC.zero_grad() critic_loss.backward() actor_loss.backward() loss.append(critic_loss) optimizerA.step() optimizerC.step() critic_next_train.load_state_dict( critic_train.state_dict()) del memory.states[:1] del memory.states_img[:1] memory.states.append(state) memory.states_img.append(tensor_cv) state = torch.stack(memory.states).to(device).detach() tensor_cv = torch.stack(memory.states_img).to(device).detach() value, h_state_cv_c, h_state_n_c, h_state_3_c = critic_train( state, tensor_cv, h_state_cv_c, h_state_n_c, h_state_3_c) action, log_prob = 
actor_train(state, tensor_cv) log_prob = log_prob.unsqueeze(0) send_to_GAMA([[1, float(action.cpu().numpy() * 10)]]) #行 masks.append( torch.tensor([1 - done], dtype=torch.float, device=device)) values.append(value) # 終わり elif done == 1: average_speed.append(state[0]) send_to_GAMA([[1, 0]]) #先传后计算 print(state) rewards.append(reward) #contains the last reward = torch.tensor([reward], dtype=torch.float, device=device) rewards.append(reward) #contains the last total_reward = sum(rewards).cpu().detach().numpy() total_rewards.append(total_reward) with torch.autograd.set_detect_anomaly(True): advantage = reward.detach( ) - value #+ last_value 最后一回的V(s+1) = 0 actor_loss = -(log_prob * advantage.detach()) critic_loss = (reward.detach() - value).pow( 2) #+ last_value optimizerA.zero_grad() optimizerC.zero_grad() critic_loss.backward() actor_loss.backward() loss.append(critic_loss) optimizerA.step() optimizerC.step() critic_next_train.load_state_dict( critic_train.state_dict()) print( "----------------------------------Net_Trained---------------------------------------" ) print('--------------------------Iteration:', episode, 'over--------------------------------') episode += 1 values = [] rewards = [] loss_sum = sum(loss).cpu().detach().numpy() total_loss.append(loss_sum) cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic, save_critic_loss, save_reward, np.mean(average_speed), save_speed, average_speed_NPC, save_NPC_speed) #total_loss,total_rewards#np.mean(average_speed)/10 loss = [] average_speed = [] memory.clear_memory() torch.save(actor_train.state_dict(), actor_train_path) torch.save(critic_train.state_dict(), critic_train_path) if episode > training_stage: #50 100 try: lr = sample_lr[int(episode // training_stage)] * 0.01 except (IndexError): lr = 0.000001 * 0.9 #* (0.9 ** ((episode-1000) // 60)) optimizerA = optim.Adam(actor_train.parameters(), lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic_train.parameters(), lr, betas=(0.95, 0.999)) #最初の時 if 
time_pass == 0: print('Iteration:', episode, "lr:", lr) state = np.reshape(state, (1, len(state))) state_img = generate_img_train() tensor_cv = torch.from_numpy(np.transpose( state_img, (2, 0, 1))).double().to(device) / 255 state = torch.DoubleTensor(state).reshape( 1, state_size).to(device) for _ in range(3): memory.states.append(state) memory.states_img.append(tensor_cv) state = torch.stack(memory.states).to(device).detach() ### tensor_cv = torch.stack(memory.states_img).to(device).detach() value, h_state_cv_c, h_state_n_c, h_state_3_c = critic_train( state, tensor_cv) #dist, # now is a tensoraction = dist.sample() action, log_prob = actor_train(state, tensor_cv) print("acceleration: ", action.cpu().numpy()) send_to_GAMA([[1, float(action.cpu().numpy() * 10)]]) #agents if A_T == 1: state = [ torch.DoubleTensor(elem).reshape(1, state_size).to(device) for elem in state ] state = torch.stack(state).to(device).detach() tensor_cv_MAS = generate_img() tensor_cv_MAS = [ torch.from_numpy(np.transpose(elem, (2, 0, 1))).double().to(device) / 255 for elem in tensor_cv_MAS ] tensor_cv_MAS = torch.stack(tensor_cv_MAS).to(device).detach() action, _ = actor(state, tensor_cv_MAS) send_to_GAMA([[1, float(action.cpu().numpy() * 10)]]) A_T, state, reward, done, time_pass, over, average_speed_NPC = GAMA_connect( ) return None
# main(): PPO training entry point with a per-step (TD) update for the
# state-vector-only navigation model (file names: "ppo_TD").
#
# NOTE(review): this definition has been whitespace-collapsed onto two physical
# lines, and part of its body is replaced by a literal `******` (between
# `print("done:", done, "timepass:"` and the `"state_last", state)` text). The
# loop header, the regular-step branch and the start of the episode-end branch
# are not visible and the text is not valid Python as it stands. It is kept
# byte-for-byte; restore it from version control before editing the logic.
#
# What the visible text shows:
#   * Hyperparameters: K_epochs = 2, eps_clip = 0.2, gamma = 0.9,
#     episode = 376, state_dim = 6, action_dim = 1. For `episode > 70`,
#     `lr_first` becomes 0.00001 and
#     `lr = lr_first * 0.65 ** ((episode - 60) // 10)`; the same rule is
#     applied again after every finished episode.
#   * All paths are built as os.getcwd() plus a backslash-separated suffix
#     (Windows-style separators).
#   * Episode-end part: [[1, 0]] is sent first; the last state, reward and
#     `done` are appended to the memory; ppo.update(memory, lr, advantages,
#     done) is called; the memory is cleared; the summed loss/reward are passed
#     to cross_loss_curve(...); the actor/critic weights are saved.
#   * First-step branch: the state is reshaped to (1, 6),
#     ppo.select_action(state, memory) is called (no image input in this
#     variant) and `action * 10` is sent to GAMA.
#
# English for the inline non-English notes: "lr太大会出现NAN?" = "does too large
# an lr produce NaN?"; "转化成1行" = "convert into one row"; "最初の時" = "at the
# first step".
def main(): ############## Hyperparameters ############## update_timestep = 1 #TD use == 1 # update policy every n timesteps set for TD K_epochs = 2 # update policy for K epochs lr太大会出现NAN? eps_clip = 0.2 gamma = 0.9 episode = 376 lr_first = 0.0001 lr = lr_first #random_seed = None state_dim = 6 action_dim = 1 #(self, state_dim, action_dim, lr, betas, gamma, K_epochs, eps_clip) actor_path = os.getcwd( ) + '\\GAMA_python\\PPO_Navigation_Model\\weight\\ppo_TD_actor.pkl' critic_path = os.getcwd( ) + '\\GAMA_python\\PPO_Navigation_Model\\weight\\ppo_TD_critic.pkl' ################ load ################### if episode > 70: #50 100 lr_first = 0.00001 lr = lr_first * (0.65**((episode - 60) // 10)) ppo = PPO(state_dim, action_dim, lr, gamma, K_epochs, eps_clip) if os.path.exists(actor_path): ppo.actor.load_state_dict(torch.load(actor_path)) print('Actor Model loaded') if os.path.exists(critic_path): ppo.critic.load_state_dict(torch.load(critic_path)) print('Critic Model loaded') print("Waiting for GAMA...") ################### initialization ######################## save_curve_pic = os.getcwd( ) + '\\GAMA_python\\PPO_Navigation_Model\\result\\PPO_TD_loss_curve.png' save_critic_loss = os.getcwd( ) + '\\GAMA_python\\PPO_Navigation_Model\\training_data\\PPO_TD_critic_loss.csv' save_reward = os.getcwd( ) + '\\GAMA_python\\PPO_Navigation_Model\\training_data\\PPO_TD_reward.csv' reset() memory = Memory() advantages = 0 #global value loss = [] total_loss = [] rewards = [] total_rewards = [] test = "GAMA" state, reward, done, time_pass, over = GAMA_connect(test) #connect #[real_speed/10, target_speed/10, elapsed_time_ratio, distance_left/100,distance_front_car/10,distance_behind_car/10,reward,done,over] print("done:", done, "timepass:"******"state_last", state) send_to_GAMA([[1, 0]]) rewards.append(reward) state = torch.DoubleTensor(state).reshape(1, 6).to(device) #转化成1行 memory.states_next.append(state) memory.rewards.append(reward) memory.is_terminals.append(done) loss_ = 
ppo.update(memory, lr, advantages, done) loss.append(loss_) memory.clear_memory() print( "----------------------------------Net_Trained---------------------------------------" ) print('--------------------------Iteration:', episode, 'over--------------------------------') episode += 1 loss_sum = sum(loss).cpu().detach().numpy() total_loss.append(loss_sum) total_reward = sum(rewards) total_rewards.append(total_reward) cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic, save_critic_loss, save_reward) rewards = [] loss = [] if episode > 70: #50 100 lr_first = 0.00001 lr = lr_first * (0.65**((episode - 60) // 10)) #40 90 torch.save(ppo.actor.state_dict(), actor_path) torch.save(ppo.critic.state_dict(), critic_path) #最初の時 else: print('Iteration:', episode) state = torch.DoubleTensor(state).reshape(1, 6).to(device) action = ppo.select_action(state, memory) print("acceleration: ", action) #.cpu().numpy() send_to_GAMA([[1, float(action * 10)]]) state, reward, done, time_pass, over = GAMA_connect(test) return None
# main() -- actor-critic variant ("AC_TD0" weights) with mixed input: a state
# vector plus an image from generate_img(). Exchanges data with the GAMA
# simulator through GAMA_connect() / send_to_GAMA().
#
# NOTE(review): this definition is stored with its line breaks and indentation
# collapsed, and everything between `"timepass:"` and the "Net_Trained" banner
# has been replaced by a literal `******` (looks like an automated redaction).
# The loop header, the regular-step branch and the whole loss/backward step are
# NOT visible, and the text below is not valid Python as stored. It is kept
# verbatim; restore it from version control before running or refactoring.
#
# What the visible code does, in order:
#   * Builds actor_path / critic_path from os.path.abspath(os.curdir) plus
#     forward-slash segments, creates Actor / Critic on `device`, and loads saved
#     weights when the files exist. critic_next is created as a copy of critic.
#   * Calls reset() and starts from a hard-coded episode = 1257.
#   * Learning rate: when episode > 50, lr = sample_lr[episode // 50], falling
#     back to 0.000001 on IndexError. sample_lr has 19 entries, so any
#     episode >= 950 (including the starting value 1257) takes the fallback.
#   * Creates Adam optimizers for actor and critic with betas=(0.95, 0.999).
#   * [redacted span]
#   * Visible end-of-episode path: increments episode, resets log_probs / values /
#     rewards / masks, passes the summed loss to cross_loss_curve(), clears
#     memory, recomputes lr, re-creates both Adam optimizers and saves the actor /
#     critic state_dicts.
#     NOTE(review): loss, total_loss, total_reward, memory and the save_* paths
#     are used here but not defined in the visible text -- presumably set up
#     inside the redacted span; confirm.
#     NOTE(review): constructing new optim.Adam objects each episode also starts
#     them with fresh optimizer state -- confirm this is intended.
#   * Visible `else:` path (tagged "first time" in the original comment):
#     reshapes state, converts the image to a channel-first double tensor via
#     np.transpose(state_img, (2, 0, 1)), appends to memory.states /
#     memory.states_img inside `for _ in range(3)` (exact loop extent is not
#     recoverable without indentation), stacks them, evaluates critic and actor,
#     and sends [[1, float(action * 10)]] to GAMA.
#     NOTE(review): `entropy += entropy` doubles the value rather than adding it
#     to a running total -- confirm intent.
#   * Each pass ends by polling GAMA_connect(test) again; the function returns None.
#
# Glossary for the non-English inline note kept verbatim below:
#   "最初の時" -> "the very first step"
def main(): ################ load ################### actor_path = os.path.abspath( os.curdir) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD0_actor.pkl' critic_path = os.path.abspath( os.curdir ) + '/PPO_Mixedinput_Navigation_Model/weight/AC_TD0_critic.pkl' if os.path.exists(actor_path): actor = Actor(state_size, action_size).to(device) actor.load_state_dict(torch.load(actor_path)) print('Actor Model loaded') else: actor = Actor(state_size, action_size).to(device) if os.path.exists(critic_path): critic = Critic(state_size, action_size).to(device) critic.load_state_dict(torch.load(critic_path)) print('Critic Model loaded') else: critic = Critic(state_size, action_size).to(device) critic_next = Critic(state_size, action_size).to(device) critic_next.load_state_dict(critic.state_dict()) print("Waiting for GAMA...") ################### initialization ######################## reset() episode = 1257 lr = 0.0001 sample_lr = [ 0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003, 0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005, 0.000004, 0.000003, 0.000002, 0.000001 ] if episode > 50: #50 100 try: lr = sample_lr[int(episode // 50)] except (IndexError): lr = 0.000001 optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999)) test = "GAMA" state, reward, done, time_pass, over = GAMA_connect(test) #connect print("done:", done, "timepass:"******"----------------------------------Net_Trained---------------------------------------" ) print('--------------------------Iteration:', episode, 'over--------------------------------') episode += 1 log_probs = [] values = [] rewards = [] masks = [] loss_sum = sum(loss).cpu().detach().numpy() total_loss.append(loss_sum) cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic, save_critic_loss, save_reward) #total_loss,total_rewards loss = [] memory.clear_memory() if episode > 50: #50 100 try: lr = 
sample_lr[int(episode // 50)] except (IndexError): lr = 0.000001 optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999)) torch.save(actor.state_dict(), actor_path) torch.save(critic.state_dict(), critic_path) #最初の時 else: print('Iteration:', episode, "lr:", lr) state = np.reshape(state, (1, len(state))) #xxx state_img = generate_img() tensor_cv = torch.from_numpy(np.transpose( state_img, (2, 0, 1))).double().to(device) state = torch.DoubleTensor(state).reshape(1, state_size).to(device) for _ in range(3): memory.states.append(state) memory.states_img.append(tensor_cv) state = torch.stack(memory.states).to(device).detach() ### tensor_cv = torch.stack(memory.states_img).to(device).detach() value = critic( state, tensor_cv) #dist, # now is a tensoraction = dist.sample() action, log_prob, entropy = actor(state, tensor_cv) print("acceleration: ", action.cpu().numpy()) send_to_GAMA([[1, float(action.cpu().numpy() * 10)]]) log_prob = log_prob.unsqueeze(0) entropy += entropy state, reward, done, time_pass, over = GAMA_connect(test) return None
# main() -- actor-critic variant ("AC_TD" weights) with mixed input: a state
# vector plus an image from generate_img(). Exchanges data with the GAMA
# simulator through GAMA_connect() / send_to_GAMA().
#
# NOTE(review): this definition is stored with its line breaks and indentation
# collapsed, and the span between `"timepass:"` and `"restart acceleration: 0"`
# has been replaced by a literal `******` (looks like an automated redaction).
# The loop header and the regular-step branch are NOT visible, and the text
# below is not valid Python as stored. It is kept verbatim; restore it from
# version control before running or refactoring.
#
# What the visible code does, in order:
#   * Builds actor_path / critic_path from os.path.abspath(os.curdir) plus
#     backslash-separated segments, creates Actor / Critic on `device`, and loads
#     saved weights when the files exist. critic_next is created as a copy of
#     critic and re-synced after every optimizer step.
#   * Calls reset() and starts from a hard-coded episode = 237.
#   * Learning rate: lr = 0.0001; when episode > 50, lr = 0.00008 and
#     new_lr = lr * 0.9 ** ((episode - 40) // 10).
#     NOTE(review): the initial optimizers are built with `lr`, not `new_lr`,
#     whereas the per-episode rebuild further down uses `new_lr` -- the decayed
#     rate appears to be ignored until the first episode ends; confirm.
#   * [redacted span]
#   * Visible end-of-episode path: sends [[1, 0]] to GAMA, then computes
#       advantage   = reward - value          (no bootstrap term)
#       actor_loss  = -(log_prob * advantage.detach())
#       critic_loss = (reward - value) ** 2
#     under torch.autograd.set_detect_anomaly(True), back-propagates both losses,
#     steps both optimizers, logs the loss through cross_loss_curve(), re-creates
#     the optimizers with new_lr and saves actor / critic state_dicts.
#     NOTE(review): `reward` is appended to `rewards` twice (once raw, once as a
#     tensor) before sum(rewards), so the terminal reward appears to be counted
#     twice in total_reward -- confirm.
#     NOTE(review): lstm_loss is assigned but not used in the visible text.
#     NOTE(review): rewards, total_rewards, loss, total_loss, value, log_prob and
#     the save_* paths are used here but not defined in the visible text --
#     presumably set up inside the redacted span; confirm.
#   * Visible `else:` path (tagged "first time" in the original comment):
#     reshapes state to a (1, 6) DoubleTensor, converts the image to a
#     channel-first double tensor via np.transpose(state_img, (2, 0, 1)),
#     evaluates critic and actor, and sends [[1, float(action * 10)]] to GAMA.
#     NOTE(review): `entropy += entropy` doubles the value rather than adding it
#     to a running total -- confirm intent.
#   * Each pass ends by polling GAMA_connect(test) again; the function returns None.
#
# Glossary for the non-English inline notes kept verbatim below:
#   "先传后计算"             -> "send first, compute afterwards"
#   "最后一回的V(s+1) = 0"   -> "V(s+1) of the final step is 0"
#   "最初の時"               -> "the very first step"
def main(): ################ load ################### actor_path = os.path.abspath( os.curdir ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\weight\\AC_TD_actor.pkl' critic_path = os.path.abspath( os.curdir ) + '\\GAMA_python\\PPO_Mixedinput_Navigation_Model\\weight\\AC_TD_critic.pkl' if os.path.exists(actor_path): actor = Actor(state_size, action_size).to(device) actor.load_state_dict(torch.load(actor_path)) print('Actor Model loaded') else: actor = Actor(state_size, action_size).to(device) if os.path.exists(critic_path): critic = Critic(state_size, action_size).to(device) critic.load_state_dict(torch.load(critic_path)) print('Critic Model loaded') else: critic = Critic(state_size, action_size).to(device) critic_next = Critic(state_size, action_size).to(device) critic_next.load_state_dict(critic.state_dict()) print("Waiting for GAMA...") ################### initialization ######################## reset() episode = 237 lr = 0.0001 if episode > 50: lr = 0.00008 new_lr = lr * (0.9**((episode - 40) // 10)) #if episode > 110: # lr = 0.000008 # new_lr = lr * (0.9 ** ((episode-90) // 10)) #40 optimizerA = optim.Adam(actor.parameters(), lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic.parameters(), lr, betas=(0.95, 0.999)) test = "GAMA" state, reward, done, time_pass, over = GAMA_connect(test) #connect print("done:", done, "timepass:"******"restart acceleration: 0") send_to_GAMA([[1, 0]]) #先传后计算 print(state) rewards.append(reward) #contains the last reward = torch.tensor([reward], dtype=torch.float, device=device) rewards.append(reward) #contains the last total_reward = sum(rewards).cpu().detach().numpy() total_rewards.append(total_reward) #state = torch.FloatTensor(state).reshape(1,4).to(device) #last_value= critic(state) with torch.autograd.set_detect_anomaly(True): advantage = reward.detach( ) - value #+ last_value 最后一回的V(s+1) = 0 actor_loss = -(log_prob * advantage.detach()) critic_loss = (reward.detach() - value).pow(2) #+ last_value lstm_loss = 
critic_loss optimizerA.zero_grad() optimizerC.zero_grad() critic_loss.backward() actor_loss.backward() loss.append(critic_loss) optimizerA.step() optimizerC.step() critic_next.load_state_dict(critic.state_dict()) print( "----------------------------------Net_Trained---------------------------------------" ) print('--------------------------Iteration:', episode, 'over--------------------------------') episode += 1 log_probs = [] values = [] rewards = [] masks = [] loss_sum = sum(loss).cpu().detach().numpy() total_loss.append(loss_sum) cross_loss_curve(loss_sum.squeeze(0), total_reward, save_curve_pic, save_critic_loss, save_reward) #total_loss,total_rewards loss = [] if episode > 50: lr = 0.00008 new_lr = lr * (0.9**((episode - 40) // 10)) #if episode > 110: # lr = 0.000008 # new_lr = lr * (0.9 ** ((episode-90) // 10)) #40 optimizerA = optim.Adam(actor.parameters(), new_lr, betas=(0.95, 0.999)) optimizerC = optim.Adam(critic.parameters(), new_lr, betas=(0.95, 0.999)) torch.save(actor.state_dict(), actor_path) torch.save(critic.state_dict(), critic_path) #最初の時 else: print('Iteration:', episode) state = np.reshape(state, (1, len(state))) #xxx state_img = generate_img() tensor_cv = torch.from_numpy(np.transpose( state_img, (2, 0, 1))).double().to(device) state = torch.DoubleTensor(state).reshape(1, 6).to(device) value = critic( state, tensor_cv) #dist, # now is a tensoraction = dist.sample() action, log_prob, entropy = actor(state, tensor_cv) print("acceleration: ", action.cpu().numpy()) send_to_GAMA([[1, float(action.cpu().numpy() * 10)]]) log_prob = log_prob.unsqueeze(0) entropy += entropy state, reward, done, time_pass, over = GAMA_connect(test) return None