class Worker(object):
    def __init__(self, name):
        self.env = SpyndraEnv(N_S, N_A)
        self.name = name
        self.agent = RandomSearch(N_A)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        while GLOBAL_EP < MAX_GLOBAL_EP:
            rewards = []
            # evaluate each candidate gait pattern once per episode
            for idx_pattern in range(self.agent.n_patterns):
                s = self.env._reset()
                ep_r, dist_traveled = 0, -999.
                for step in range(MAX_STEP):
                    # replay this pattern's stored actions, cycling through its gates
                    a = self.agent.choose_action(idx_pattern, step % self.agent.n_gates)
                    s_, r, done, info = self.env._step(a, s)
                    #done = True if step == MAX_STEP - 1 else False
                    if self.name == 'W_0':
                        print("Ep %4i, %2i th agent, step %4i" % (GLOBAL_EP, idx_pattern, step))
                        print("distance to goal=", info, "reward=", r)
                        print("position before action=", list(s[:8]))
                        print("action=", list(a))
                        print("position after action=", list(s_[:8]))
                    # info holds the remaining distance to the goal, 10 m from the start
                    dist_traveled = 10. - info
                    print("dist_traveled", dist_traveled)
                    if done:
                        print("We got our best model!")
                        np.save('best_pattern.npy', self.agent.patterns[idx_pattern])
                        break
                    s = s_
                rewards.append(dist_traveled)
            self.agent.update(rewards)
            GLOBAL_EP += 1
            print("Best pattern traveled", max(rewards))
            with open('ep_reward.txt', 'a') as f:
                f.write('ep=%i, distance traveled=%f\n'
                        % (GLOBAL_EP, np.mean(sorted(rewards)[-5:])))
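# RandomSearch itself is not shown above. Below is a minimal sketch of the
# interface the Worker relies on (n_patterns, n_gates, patterns, choose_action,
# update); the pattern shape, the uniform sampling range, and the
# resample-all-but-best update rule are assumptions for illustration, not the
# project's actual agent.
import numpy as np

class RandomSearch(object):
    def __init__(self, n_actions, n_patterns=10, n_gates=20):
        self.n_patterns = n_patterns  # candidate gait patterns evaluated per episode
        self.n_gates = n_gates        # actions per pattern, replayed cyclically
        # each pattern is a fixed action sequence sampled uniformly at random
        self.patterns = [np.random.uniform(-1., 1., (n_gates, n_actions))
                         for _ in range(n_patterns)]

    def choose_action(self, idx_pattern, idx_gate):
        return self.patterns[idx_pattern][idx_gate]

    def update(self, rewards):
        # pure random search: keep the best pattern, resample all the others
        best = int(np.argmax(rewards))
        for i in range(self.n_patterns):
            if i != best:
                self.patterns[i] = np.random.uniform(-1., 1., self.patterns[i].shape)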
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = SpyndraEnv(N_S, N_A)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        # state, action, reward buffers
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env._reset()
            ep_r, dist_traveled = 0, -999.
            for ep_t in range(MAX_EP_STEP):
                # if self.name == 'W_0':
                #     self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done, info = self.env._step(a, s)
                #done = True if ep_t == MAX_EP_STEP - 1 else False
                if self.name == 'W_0':
                    print("Ep %4i, step %4i" % (GLOBAL_EP, ep_t))
                    print("distance to goal=", info, "reward=", r)
                    print("position before action=", list(s[:8]))
                    print("action=", list((np.array(a) * 10).round()))
                    print("position after action=", list(s_[:8]))
                ep_r += r
                # normalize the state before feeding it into the A3C network
                s = np.array(s)
                s_[14:28] = normalize(s_[14:28])
                s[:14] = normalize(s[:14])
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r / 10.)
                # info holds the remaining distance to the goal, 10 m from the start
                dist_traveled = 10. - info
                print("dist_traveled", dist_traveled)
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update the global net, then pull its weights into the local net
                    if done:
                        v_s_ = 0  # terminal state value (per the A3C pseudocode)
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # traverse rewards in reverse
                        v_s_ = r + GAMMA * v_s_  # discounted return: R <- r_i + GAMMA * R
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()
                    buffer_s, buffer_a, buffer_v_target = (
                        np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target))
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    a_l, c_l = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                    with open('loss.txt', 'a') as f:
                        f.write("A loss=" + str(a_l) + ", C loss=" + str(c_l) + "\n")
                    print("A loss = ", a_l, "C loss = ", c_l)
                s = s_
                total_step += 1
                if done:
                    # record an exponentially smoothed running episode reward
                    if len(GLOBAL_RUNNING_R) == 0:
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                    )
                    break
            GLOBAL_EP += 1
            with open('ep_reward.txt', 'a') as f:
                f.write('ep=%i, reward=%d, distance traveled=%f\n'
                        % (GLOBAL_EP, ep_r, dist_traveled))
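# normalize() used above is assumed rather than shown. A plausible sketch,
# assuming min-max scaling of each feature slice to [0, 1]; the actual scaling
# used by the project may differ.
import numpy as np

def normalize(x):
    x = np.asarray(x, dtype=np.float64)
    span = x.max() - x.min()
    if span == 0:
        return np.zeros_like(x)  # constant slice: avoid division by zero
    return (x - x.min()) / span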
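# SESS and COORD referenced in work() are assumed module-level globals. A
# minimal launch sketch under that assumption, using the usual
# one-thread-per-worker A3C setup; N_WORKERS is illustrative, and
# ACNet('Global_Net') assumes the net can be built without a parent, as in
# common A3C reference code.
import threading
import tensorflow as tf

SESS = tf.Session()
COORD = tf.train.Coordinator()

with tf.device("/cpu:0"):
    GLOBAL_AC = ACNet('Global_Net')  # shared parameters pulled/pushed by workers
    workers = [Worker('W_%i' % i, GLOBAL_AC) for i in range(N_WORKERS)]

SESS.run(tf.global_variables_initializer())

threads = []
for worker in workers:
    t = threading.Thread(target=worker.work)  # each worker runs its own episode loop
    t.start()
    threads.append(t)
COORD.join(threads)  # block until every worker thread finishes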
    # Hill-climbing variant of the Worker: same environment setup as the
    # random search version, with the search agent swapped for a HillClimber.
    def __init__(self, name):
        self.env = SpyndraEnv(N_S, N_A)
        self.name = name
        self.agent = HillClimber(N_A)
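# HillClimber is likewise assumed rather than shown. A minimal sketch that fits
# the same Worker loop as RandomSearch: perturb the best pattern found so far
# and keep the perturbation only if the traveled distance improves. The
# Gaussian noise scale and single-candidate design are assumptions.
import numpy as np

class HillClimber(object):
    def __init__(self, n_actions, n_patterns=1, n_gates=20, noise=0.1):
        self.n_patterns = n_patterns
        self.n_gates = n_gates
        self.noise = noise
        self.best_reward = -np.inf
        self.best = np.random.uniform(-1., 1., (n_gates, n_actions))
        self.patterns = [self.best + np.random.normal(0., noise, self.best.shape)]

    def choose_action(self, idx_pattern, idx_gate):
        return self.patterns[idx_pattern][idx_gate]

    def update(self, rewards):
        # accept the neighbor only when it beats the current best
        if max(rewards) > self.best_reward:
            self.best_reward = max(rewards)
            self.best = self.patterns[int(np.argmax(rewards))]
        # propose fresh neighbors of the best pattern for the next episode
        self.patterns = [self.best + np.random.normal(0., self.noise, self.best.shape)
                         for _ in range(self.n_patterns)]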