class MobileAvoidance(EnvSpace):

    def env_init(self):
        self.env = CarEnv()
        self.state = self.env.reset()
        self.send_state_get_action(self.state)
        self.var = 1  # exploration noise variance, decayed after the warm-up phase

    def on_predict_response(self, action):
        # shrink the exploration variance once the replay memory has been filled
        self.var = self.var * 0.9995 if self.ep_use_step > cfg['DDPG']['memory_capacity'] else self.var
        # add clipped Gaussian noise to the predicted action for exploration
        a = np.clip(np.random.normal(action, self.var), *self.env.action_bound)
        next_state, reward, done, _ = self.env.step(a)  # execute the exploratory action
        done = True if self.ep_use_step >= EP_MAXSTEP else done
        self.send_train_get_action(self.state, a, reward, done, next_state)
        self.state = next_state
        if self.ep >= 30 and RENDER:
            self.env.render()
        if done:
            self.state = self.env.reset()
            self.send_state_get_action(self.state)
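The exploration scheme above adds Gaussian noise to the predicted action and only starts decaying the noise variance once the replay memory is full. A minimal standalone sketch of that scheme follows; the bound, capacity, and decay rate are illustrative stand-ins for the values taken from CarEnv and cfg['DDPG'].

import numpy as np

# Standalone sketch of the variance-decayed Gaussian exploration used in
# on_predict_response(). action_bound and memory_capacity are placeholders
# for the values read from CarEnv and cfg['DDPG']; 0.9995 matches the decay above.
action_bound = (-1.0, 1.0)
memory_capacity = 10000
var = 1.0

def noisy_action(action, step):
    global var
    if step > memory_capacity:          # decay only after the warm-up phase
        var *= 0.9995
    return np.clip(np.random.normal(action, var), *action_bound)

print(noisy_action(0.3, step=1))        # early: heavily perturbed
print(noisy_action(0.3, step=20000))    # later: variance has started to shrink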
class Worker(object):

    def __init__(self, wid):
        self.wid = wid
        self.env = CarEnv()
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():      # the global PPO is updating
                    ROLLING_EVENT.wait()            # wait until the update finishes
                    buffer_s, buffer_a, buffer_r = [], [], []  # clear stale history
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)    # normalizing the reward here has been found useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1          # count towards the minimum batch size
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
                    v_s_ = self.ppo.get_v(s_)
                    # compute discounted returns, bootstrapping from v(s_)
                    discounted_r = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()       # stop collecting data
                        UPDATE_EVENT.set()          # signal the global PPO to update

                    if GLOBAL_EP >= EP_MAX:         # stop training
                        COORD.request_stop()
                        break

                    if t == EP_LEN - 1 or done:
                        break

            # record a smoothed episode reward for plotting later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
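When a batch is flushed, the worker converts its raw reward buffer into bootstrapped discounted returns, using the critic's value of the last state as the tail. The helper below isolates that computation; the GAMMA value and the example inputs are placeholders, not values from the training script.

import numpy as np

# Isolated sketch of the return computation performed at batch-flush time:
# walk the rewards backwards, bootstrapping from v(s_last).
GAMMA = 0.9   # placeholder; use the same discount factor as the training script

def bootstrapped_returns(rewards, v_last, gamma=GAMMA):
    returns = []
    running = v_last
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return np.array(returns)[:, np.newaxis]     # column vector, like discounted_r

# Example: three rewards with a bootstrap value of 1.0 for the final state.
print(bootstrapped_returns([0.0, 0.0, 1.0], v_last=1.0))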
class Worker(object):

    def __init__(self, name, globalAC):
        self.env = CarEnv()
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                # if self.name == 'W_0':
                #     self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1:
                    done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update the global net and sync the local net
                    if done:
                        v_s_ = 0        # terminal state has zero value
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:    # reverse the reward buffer
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    # record a smoothed running episode reward
                    if len(GLOBAL_RUNNING_R) == 0:
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:', test,
                    )
                    GLOBAL_EP += 1
                    break
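The A3C worker alternates between pushing an update computed from its local buffer (update_global) and pulling the latest global weights back into its local net (pull_global). The following is a conceptual, framework-free sketch of that push/pull cycle; the parameter containers, gradient arguments, and learning rate are hypothetical, while the real ops are built inside ACNet with TensorFlow.

import numpy as np

# Conceptual sketch of the synchronisation pattern behind ACNet.update_global()
# and ACNet.pull_global(). Names and shapes here are illustrative only.
class A3CSyncSketch:
    def __init__(self, global_params):
        self.global_params = global_params                     # shared by all workers
        self.local_params = [p.copy() for p in global_params]  # this worker's copy

    def update_global(self, local_grads, lr=1e-3):
        # "push": apply gradients computed from the local buffer to the global net
        for p, g in zip(self.global_params, local_grads):
            p -= lr * g

    def pull_global(self):
        # "pull": resync the local net with the freshest global weights
        self.local_params = [p.copy() for p in self.global_params]

shared = [np.zeros((2, 2))]
worker_net = A3CSyncSketch(shared)
worker_net.update_global([np.ones((2, 2))])
worker_net.pull_global()
print(worker_net.local_params[0])   # reflects the pushed update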
UPDATE_EVENT.clear()            # not updating yet
ROLLING_EVENT.set()             # start to roll out
workers = [Worker(wid=i) for i in range(N_WORKER)]

GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
GLOBAL_RUNNING_R = []
COORD = tf.train.Coordinator()
QUEUE = queue.Queue()
threads = []
for worker in workers:          # one rollout thread per worker
    t = threading.Thread(target=worker.work, args=())
    t.start()
    threads.append(t)
# add the PPO updating thread
threads.append(threading.Thread(target=GLOBAL_PPO.update, args=()))
threads[-1].start()
COORD.join(threads)

# plot the reward curve, then test the trained policy
plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
plt.xlabel('Episode')
plt.ylabel('Moving reward')
plt.ion()
plt.show()

env = CarEnv()                  # fresh environment instance for testing
while True:
    s = env.reset()
    for t in range(400):
        env.render()
        s, _, done = env.step(GLOBAL_PPO.choose_action(s))
        if done:
            break
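The main block only launches GLOBAL_PPO.update as a thread; the method itself is defined in the PPO class, which is not shown here. For orientation, below is a hedged sketch of the event/queue handshake such an updater loop typically performs on the other side of UPDATE_EVENT and ROLLING_EVENT; the train_on_batch name is a placeholder, not the real API.

import numpy as np

# Hedged sketch of the updater-side handshake that pairs with the workers above.
def update_loop(ppo, coord, update_event, rolling_event, data_queue):
    while not coord.should_stop():
        update_event.wait()                 # workers collected at least MIN_BATCH_SIZE samples
        batches = []
        while not data_queue.empty():       # drain everything the workers queued
            batches.append(data_queue.get())
        data = np.vstack(batches)           # rows of hstacked (state, action, return)
        # ppo.train_on_batch(data)          # placeholder for the real gradient steps
        update_event.clear()                # update finished
        rolling_event.set()                 # unblock the workers to roll out again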
from car_env import CarEnv
import pygame

env = CarEnv()
state = env.reset()
close_screen = False

while True:
    action = 4                      # default action when no arrow key is pressed
    for event in pygame.event.get():
        if event.type == pygame.KEYDOWN and event.key == pygame.K_DOWN:
            action = 0
        if event.type == pygame.KEYDOWN and event.key == pygame.K_RIGHT:
            action = 1
        if event.type == pygame.KEYDOWN and event.key == pygame.K_UP:
            action = 2
        if event.type == pygame.KEYDOWN and event.key == pygame.K_LEFT:
            action = 3
        if event.type == pygame.QUIT:
            close_screen = True
    next_state, reward, done, info = env.step(action)
    env.render()
    if done or close_screen:
        break

pygame.display.quit()
pygame.quit()
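For smoother manual driving, the held keys can be polled every frame with pygame.key.get_pressed() instead of reacting to single KEYDOWN events, so the chosen action persists while a key stays pressed. A sketch of that variant, reusing the same 0-4 action mapping as the script above:

from car_env import CarEnv
import pygame

# Variant of the keyboard controller above that polls held keys each frame.
# Action indices 0-3 match the arrow-key mapping above; 4 is the same default
# used when no key is pressed.
env = CarEnv()
state = env.reset()
running = True
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
    keys = pygame.key.get_pressed()
    action = 4
    if keys[pygame.K_DOWN]:
        action = 0
    elif keys[pygame.K_RIGHT]:
        action = 1
    elif keys[pygame.K_UP]:
        action = 2
    elif keys[pygame.K_LEFT]:
        action = 3
    state, reward, done, info = env.step(action)
    env.render()
    if done:
        state = env.reset()
pygame.quit()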