def sample_task():
    range_pose = 0.3
    target_pose = np.random.rand(2) * range_pose + [0.5, 0.5]  # uniform target in [0.5, 0.8) along each axis
    screen_size = 1000
    target_pose = target_pose * screen_size                    # scale to screen (pixel) coordinates
    env = Reacher(target_pos=target_pose, render=True)
    return env, target_pose
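A minimal usage sketch for sample_task, assuming the Reacher interface used in the worker code below (reset(), a three-value step(), and num_actions); the [-1, 1] action range is an illustrative assumption, not taken from the original:

import numpy as np

# Illustrative rollout with a random placeholder policy; not part of the original script.
env, target_pose = sample_task()
s = env.reset()
for _ in range(10):
    a = np.random.uniform(-1., 1., size=env.num_actions)  # random action in an assumed [-1, 1] range
    s, r, done = env.step(a)
    if done:
        s = env.reset()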
def __init__(self, wid):
    self.wid = wid
    # self.env = gym.make(GAME).unwrapped
    self.env = Reacher(render=True)
    self.ppo = GLOBAL_PPO
if __name__ == '__main__':
    GLOBAL_PPO = PPO()
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()    # no update at the start
    ROLLING_EVENT.set()     # start rolling out
    workers = [Worker(wid=i) for i in range(N_WORKER)]

    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    GLOBAL_RUNNING_R = []
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()   # workers put collected data into this queue
    threads = []
    for worker in workers:  # worker (rollout) threads
        t = threading.Thread(target=worker.work, args=())
        t.start()           # start training
        threads.append(t)
    # add a PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update,))
    threads[-1].start()
    COORD.join(threads)

    # plot the reward curve, then test the trained policy
    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('Episode')
    plt.ylabel('Moving reward')
    plt.ion()
    plt.show()

    # env = gym.make('Pendulum-v0')
    env = Reacher(render=True)
    while True:
        s = env.reset()
        for t in range(300):
            # env.render()
            s = env.step(GLOBAL_PPO.choose_action(s))[0]
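These snippets rely on imports and hyperparameters defined elsewhere in the script; a representative preamble, with values chosen here purely for illustration, would be:

import threading
import queue

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

EP_MAX = 1000          # total training episodes (illustrative value)
EP_LEN = 200           # maximum steps per episode (illustrative value)
N_WORKER = 4           # number of parallel rollout workers (illustrative value)
GAMMA = 0.9            # discount factor for the return computation (illustrative value)
MIN_BATCH_SIZE = 64    # minimum samples collected before a PPO update (illustrative value)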
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        # self.env = gym.make(GAME).unwrapped
        self.env = Reacher(render=True)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s = self.env.reset()
            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():                  # the global PPO is updating
                    ROLLING_EVENT.wait()                        # wait until the update finishes
                    buffer_s, buffer_a, buffer_r = [], [], []   # clear the history buffer; collect data with the new policy
                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                # print('a: ', a)
                # print('s: ', s_)
                # print('r: ', r)        # scalar
                # print('done: ', done)  # True/False
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r + 8) / 8)                    # normalize the reward; found to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1                      # count toward the minimum batch size; no need to wait for other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []                           # compute discounted rewards
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))          # put the data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()                   # stop collecting data
                        UPDATE_EVENT.set()                      # trigger the global PPO update

                    if GLOBAL_EP >= EP_MAX:                     # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print('{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r)
            step_set.append(step)
            epr_set.append(ep_r)
            if step % 500 == 0:                                 # save the learning curve every 500 episodes
                plt.plot(step_set, epr_set)
                plt.savefig('./ppo.png')
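The updating thread launched in the main block is the consumer of QUEUE and the counterpart of the ROLLING_EVENT/UPDATE_EVENT handshake above. The PPO class itself is not shown in this excerpt; the following is only a sketch of what its update loop might look like, relying on the globals defined in the main block, with S_DIM and A_DIM standing in for assumed state and action dimensions:

class PPO(object):
    # Sketch only: the real class also builds the actor/critic networks and
    # implements choose_action() and get_v(), which the workers call above.
    def update(self):
        global GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            UPDATE_EVENT.wait()        # block until a worker signals that enough data is ready
            data = [QUEUE.get() for _ in range(QUEUE.qsize())]  # drain all worker batches
            data = np.vstack(data)     # each row is [state, action, discounted return]
            s = data[:, :S_DIM]
            a = data[:, S_DIM:S_DIM + A_DIM]
            r = data[:, -1:]
            # ... run several epochs of the clipped-surrogate actor update and critic regression ...
            UPDATE_EVENT.clear()       # updating finished
            GLOBAL_UPDATE_COUNTER = 0  # reset the rollout counter
            ROLLING_EVENT.set()        # let workers resume data collection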
        self.rewards = []
        self.entropy = []

    def forward(self, x):
        x1 = self.hidden_acti(self.affine1(x))
        x2 = self.hidden_acti(self.affine2(x1))
        action_dis_mu = 360. * self.output_acti(self.affine3_mu(x2))  # scale to the control action range
        # exploration noise: a 1-dim tensor that should be learnable, but is not here
        # scale = torch.from_numpy(np.array(self.output_size * [1.])).float()
        scale = self.output_acti_sigma(self.affine3_sigma(x2))
        return action_dis_mu, scale
        # return Normal(loc=action_dis_mu, scale=scale)


env = Reacher(render=True)
policy = Policy(env.num_observations, 100, 200, env.num_actions)
optimizer = optim.Adam(policy.parameters(), lr=1e-3)
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    state = torch.from_numpy(state).float()               # state: 2-dim tensor
    mu, scale = policy(state)
    # print('mu: ', mu)
    # print('scale: ', scale.squeeze())
    action_dis = Normal(loc=mu, scale=scale.squeeze())    # mu is 2-d, scale is 1-d
    # scale += 1e-6
    # action_dis = Normal(loc=mu, scale=scale)
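The comment that mu is 2-d while scale is 1-d relies on broadcasting in torch.distributions.Normal; a small standalone check, with illustrative values that are not part of the original script, shows the behavior:

import torch
from torch.distributions import Normal

mu = torch.tensor([90.0, 180.0])      # per-joint means, on the same 0-360 scale as forward()
scale = torch.tensor([10.0])          # a single shared standard deviation
dist = Normal(loc=mu, scale=scale)    # scale broadcasts over both action dimensions
a = dist.sample()                     # a.shape == torch.Size([2])
logp = dist.log_prob(a).sum()         # joint log-probability of the 2-d action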