import gym
import numpy as np
import matplotlib.pyplot as plt

# PPO, create_path, and the hyperparameter constants (GAME, S_DIM, A_DIM,
# EP_MAX, EP_LEN, GAMMA, A_LR, C_LR, MINI_BATCH_SIZE, EPOCHS, SAVE_INDEX)
# are defined elsewhere in this file.


def test():
    env = gym.make(GAME).unwrapped
    agent = PPO(state_space=S_DIM, action_space=A_DIM, max_episode_num=EP_MAX,
                episode_lens=EP_LEN, discount_factor=GAMMA,
                actor_learning_rate=A_LR, critic_learning_rate=C_LR,
                mini_batch_size=MINI_BATCH_SIZE, epochs=EPOCHS)
    agent.load_weights(SAVE_INDEX)
    state = env.reset()
    print(state)
    steps = 0
    episode_r = 0
    all_value = []
    while steps < 1000:
        # env.render()
        action = agent.choose_action(state)
        state, reward, done, _ = env.step(action)
        steps += 1
        episode_r += reward
        # record the critic's value estimate for every visited state
        all_value.append(agent.get_value(state))
        if done:
            # start a new episode so the full 1000 steps get tested
            state = env.reset()
    plt.plot(np.arange(len(all_value)), all_value)
    plt.xlabel('state')
    plt.ylabel('state value')
    # save before show(): show() clears the active figure in most backends,
    # and create the same directory that savefig() writes into
    create_path('weights/' + SAVE_INDEX + '/figure')
    plt.savefig('weights/' + SAVE_INDEX + '/figure/fig.png')
    plt.show()
    print("test 1000 steps, got reward: %i" % episode_r)
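# `create_path` above is referenced but not defined in this file; a minimal
# sketch of such a helper, assuming it only needs to create the target
# directory tree idempotently (an assumption, not the repo's actual code):
import os


def create_path(path):
    # make the directory and any missing parents; no error if it exists
    os.makedirs(path, exist_ok=True)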
from ppo import PPO
from gym_unity.envs import UnityEnv

GameDir = 'test.app'
multi_thread = True
if multi_thread:
    modelPath = "multi_thread_Model/"
else:
    modelPath = "single_thread_Model/"
MAXEP = 1000

if __name__ == "__main__":
    env = UnityEnv(GameDir, 0, use_visual=True)
    ppo = PPO(env, load=True, testing=True, ModelPath=modelPath)
    for ep in range(MAXEP):
        s = env.reset()
        ep_r = 0
        done = False
        while not done:
            env.render()
            # run the trained policy; v is the critic's value estimate
            a, v = ppo.choose_action(s)
            s_, r, done, _ = env.step(a)
            s = s_
            ep_r += r
        print("episode = {}, ep_r = {}".format(ep, ep_r))
import numpy as np

# get_laneID, get_queue_state, get_head_waiting_time, get_avarage_waiting_time
# and Learner are helpers defined elsewhere in this project.


class Agent:
    def __init__(self, traci, is_not_train=False):
        self.traci = traci
        self.controlTLIds = traci.trafficlight.getIDList()  # tuple ('0',)
        self.controlTLIds = self.controlTLIds[0]  # string '0'
        self.phaseDefs = ['GrrrGrrr', 'rGrrrGrr', 'rrGrrrGr', 'rrrGrrrG']
        self.yellowPhases = ['yrrryrrr', 'ryrrryrr', 'rryrrryr', 'rrryrrry']
        action_mask = [1, 1, 1, 1]
        self.detectorIDs = traci.inductionloop.getIDList()
        self.controlLanes = get_laneID(self.detectorIDs)
        self.reset()
        state_size = len(self.state)
        self.learner = Learner(state_size, action_mask, is_not_train)

    # reset before each epoch
    def reset(self):
        self.prev_avg_wait_time = 0
        self.totalReward = 0
        self.totalCost = 0
        self.state = self._get_state()
        self.action = 1

    def _get_state(self):
        # build the state vector fed to the neural network: queue lengths
        # plus head-vehicle waiting times for every controlled lane
        state = get_queue_state(self.controlLanes) + get_head_waiting_time(
            self.controlLanes)
        state = np.array(state)
        return state

    def setRYG(self, index, is_yellow):
        light = self.controlTLIds
        if is_yellow:
            ryg = self.yellowPhases[index]
        else:
            ryg = self.phaseDefs[index]
        self.traci.trafficlight.setRedYellowGreenState(light, ryg)

    def getPhase(self):
        light = self.controlTLIds
        phase_state = self.traci.trafficlight.getRedYellowGreenState(light)
        phase = 0  # default so the return value is always defined
        for i in range(len(self.phaseDefs)):
            if phase_state == self.phaseDefs[i]:
                phase = i
            if phase_state == self.yellowPhases[i]:
                phase = i
        return phase

    def agentAct(self):
        self.currentPhase = self.getPhase()
        self.action = self.learner.choose_action(self.state)
        return self.action, self.currentPhase

    def agentCulReward(self, is_sim=False):
        next_state = self._get_state()
        avg_wait_time = get_avarage_waiting_time(self.controlLanes)
        # reward is the drop in average waiting time since the last step,
        # so reducing congestion yields a positive reward
        reward = self.prev_avg_wait_time - avg_wait_time
        self.prev_avg_wait_time = avg_wait_time
        self.totalReward += reward
        self.totalCost += avg_wait_time
        if not is_sim:
            self.learner.learn(self.state, self.action, reward, next_state)
        self.state = next_state
        return reward, avg_wait_time

    def printLog(self, outfile, simulation):
        print("************agent:simulation output*********")
        print("Simulation {} group {}: totalCost {}, totalReward {}\n".format(
            simulation, self.controlTLIds, self.totalCost, self.totalReward))
        outfile.write(
            "Simulation {} group {}: totalCost {}, totalReward {}\n".format(
                simulation, self.controlTLIds, self.totalCost,
                self.totalReward))
        outfile.flush()
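# `Learner` is not shown in this file. For context on the choose_action call
# above, this is a minimal epsilon-greedy action-selection sketch; it is a
# hypothetical illustration, not the project's actual Learner:
import numpy as np


class EpsilonGreedyLearner:
    def __init__(self, n_actions, epsilon=0.1):
        self.n_actions = n_actions
        self.epsilon = epsilon

    def choose_action(self, q_values):
        # explore uniformly with probability epsilon, otherwise act greedily
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(q_values))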
import numpy as np

# field, defender, attacker, attacker_memory, MAX_EP_A and BATCH_SIZE are
# defined earlier in the original file.

defender_rewards = []
attacker_rewards = []
state = field.reset()  # (1, s_dim)
defender.init_ac(np.array([state]))
attacker.init_ac(np.array([state]))
running_mean_a = []
running_mean_d = []
all_rewards_a = []
all_rewards_d = []
for i in range(100):
    # fix the defender's policy and train the attacker against it
    print("Training attacker ... iter", i + 1)
    for ep in range(MAX_EP_A):
        state = field.get_state()
        action_d, prob_d = defender.choose_action(np.array([state]))
        pun_d = field.defense(action_d)
        r_d = -pun_d
        action_a, prob_a = attacker.choose_action(np.array([state]))
        reward_a, pun_a = field.attack(action_a)
        # attacker gains its attack reward minus its own penalty;
        # the defender loses whatever the attacker gains
        r_a = reward_a - pun_a
        r_d -= reward_a
        attacker_memory.memorize(state, action_a, r_a, prob_a, True, None)
        running_mean_d.append(r_d)
        running_mean_a.append(r_a)
        state = field.next()
        if (ep + 1) % BATCH_SIZE == 0 or ep == MAX_EP_A - 1:
            # batch update of the attacker happens here;
            # the body is truncated in the source
            pass
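# The truncated `if` above is where the attacker would be updated from the
# batch stored in `attacker_memory`. The repo's update code is not shown;
# below is a minimal, self-contained sketch of the PPO clipped surrogate
# that such an update typically maximizes (all names here are hypothetical):
import numpy as np


def ppo_clip_objective(ratio, advantage, clip_range=0.2):
    # clipped surrogate objective from PPO: take the pessimistic (minimum)
    # of the unclipped and clipped policy-ratio terms, averaged over a batch
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range) * advantage
    return np.minimum(unclipped, clipped).mean()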
from ppo import PPO

multi_thread = True
if multi_thread:
    modelPath = "multi_thread_Model/"
else:
    modelPath = "single_thread_Model/"
MAXEP = 1000

if __name__ == "__main__":
    ppo = PPO(None, load=True, testing=True, ModelPath=modelPath)
    for ep in range(MAXEP):
        s = 1  # placeholder for the first image
        ep_r = 0
        done = False
        while not done:
            # a == 1 press D, a == 2 press F, a == 0 do nothing
            a, v = ppo.choose_action(s)
            s_ = 2  # placeholder: push the button and return the next image
            s = s_
            # NOTE: `done` is never set in this stub, so the loop will not
            # terminate until a real environment hook updates it
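# The placeholders above stand in for a real capture-and-act hook. A minimal
# sketch of such a hook using pyautogui (an assumption; the repo may use a
# different input/screen-capture library):
import numpy as np
import pyautogui


def act_and_capture(a):
    # a == 1 press D, a == 2 press F, a == 0 do nothing
    if a == 1:
        pyautogui.press('d')
    elif a == 2:
        pyautogui.press('f')
    # grab the next frame as the new observation
    frame = pyautogui.screenshot()
    return np.array(frame)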
import os
import shutil
import time

# PPO, Test, SIDE, MAX_EPISODES, MAX_EP_STEPS, GOAL_REWARD and the shared
# flags l_run / r_run are module-level definitions in the original file.


def train(nameIndx):
    global l_run, r_run
    T_REWARD = []
    MU_REWARD = 0
    BEST_R = 0
    env = Test(nameIndx)  # 0 = right
    # agent = DDPG(a_dim, s_dim, a_bound, SIDE[nameIndx])
    agent = PPO(act_dim=7, obs_dim=39, lr_actor=0.0001, lr_value=0.0002,
                gamma=0.9, clip_range=0.2, name=SIDE[nameIndx])
    cnt = 0
    # handshake: clear this thread's run flag, then wait until both sides
    # are idle before starting
    if nameIndx == 0:
        r_run = False
    elif nameIndx == 1:
        l_run = False
    while r_run or l_run:
        time.sleep(0)
    time.sleep(0.5)
    t1 = time.time()
    # align both threads to the same 5-second wall-clock boundary
    while int(time.time()) % 5 != 0:
        time.sleep(0)
    for i in range(MAX_EPISODES):
        # synchronize episode resets between the two threads
        if nameIndx == 0:
            r_run = False
        elif nameIndx == 1:
            l_run = False
        while r_run or l_run:
            time.sleep(0)
        s = env.reset()
        time.sleep(0.1)
        if nameIndx == 0:
            r_run = True
        elif nameIndx == 1:
            l_run = True
        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            a, neglogp, _ = agent.choose_action(s)
            s_, r, done, info = env.step(a)
            # rescale the reward from roughly [-16, 0] to [-1, 1]
            agent.memory.store_transition(s, a, (r + 8) / 8, neglogp)
            if (j + 1) % 32 == 0 or j == MAX_EP_STEPS - 1:
                # bootstrap the cut-off trajectory with the critic's value
                # of the latest state
                _, _, last_value = agent.choose_action(s_)
                agent.learn(last_value, done, cnt)
            s = s_
            ep_reward += r
            cnt += 1
        # moving average over the last (up to) 100 episode rewards
        if len(T_REWARD) >= 100:
            T_REWARD.pop(0)
        T_REWARD.append(ep_reward)
        MU_REWARD = sum(T_REWARD) / len(T_REWARD)
        BEST_R = max(BEST_R, MU_REWARD)
        print('Episode:', i, ' Reward: %i' % int(ep_reward),
              'MU_REWARD: ', int(MU_REWARD), 'BEST_R: ', int(BEST_R),
              'steps = ', j)
        if MU_REWARD > GOAL_REWARD:
            break
    if os.path.isdir(agent.path):
        shutil.rmtree(agent.path)
    os.mkdir(agent.path)
    # checkpoint name kept from the earlier DDPG version of this script
    ckpt_path = os.path.join(agent.path, 'DDPG.ckpt')
    save_path = agent.saver.save(agent.sess, ckpt_path,
                                 write_meta_graph=False)
    print("\nSave Model %s\n" % save_path)
    print('Running time: ', time.time() - t1)
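# `agent.learn(last_value, done, cnt)` above implies the update bootstraps
# the cut-off 32-step trajectory with the critic's estimate of the latest
# state. A minimal sketch of that bootstrapped-return computation
# (hypothetical helper, not the repo's actual implementation):
import numpy as np


def discounted_returns(rewards, last_value, done, gamma=0.9):
    # if the episode ended there is nothing after the last step to bootstrap
    # from; otherwise start from the critic's estimate of the next state
    ret = 0.0 if done else last_value
    returns = np.zeros(len(rewards))
    for t in reversed(range(len(rewards))):
        ret = rewards[t] + gamma * ret
        returns[t] = ret
    return returns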