class TestWorker(object):
    def __init__(self, name, globalAC):
        self.env = Market(filename, size, train_size, test_size, valid_size, CATEGORY)
        self.name = name
        self.AC = ACNet(name, globalAC)
        self.AC.pull_global()

    def reset(self):
        # re-sync the local net with the latest global parameters
        self.AC.pull_global()

    def work(self, flag="test"):
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        s = self.env.reset_for_test(flag)
        while True:
            a = self.AC.choose_action(s, test=True)
            s_, r, done, info = self.env.step(a)
            print("a:", a, "r:", r, "time:", self.env.time,
                  "len:", len(self.env.observation_space))
            buffer_s.append(s[-1])
            buffer_a.append(a)
            buffer_r.append(r)
            s = s_
            total_step += 1
            if done:
                # per-step growth factors (1 + r); their product is the cumulative profit
                prob = np.array(buffer_r) + 1
                print("prof:", prob.prod(), "len:", len(prob))
                with open(LOG_DIR + "/state_detail.txt", 'w') as f:
                    f.write(str(buffer_s))
                break
        return prob
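Once training has finished, the test worker is meant to be run on the held-out data: each element of the returned array is a per-step growth factor (1 + r), so their product is the cumulative profit over the test episode. A minimal usage sketch, assuming a trained global network named global_ac (the name is hypothetical and stands in for whatever the driver script calls the shared ACNet):

    test_worker = TestWorker("Test", global_ac)   # local net mirroring the trained global net
    test_worker.reset()                           # pull the latest global weights
    factors = test_worker.work(flag="test")       # array of per-step (1 + r) factors
    print("cumulative profit:", factors.prod())   # product over the whole test episode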
class Worker(object):
    def __init__(self, name, globalAC, step=60):
        self.env = Market(filename, size, train_size, test_size, valid_size, CATEGORY)
        self.name = name
        self.AC = ACNet(name, globalAC)
        self.step = step  # push to the global net every `step` transitions

    def set_scope(self, scope):
        return self.env.set_env(scope[0], scope[1])

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            R = [1]  # cumulative return, starting from a normalized capital of 1
            while True:
                a = self.AC.choose_action(s)
                s_, r, done, info = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                R.append((r + 1) * R[-1])

                if total_step % self.step == 0 or done:
                    # update the global net and pull the new weights into the local net
                    if done:
                        v_s_ = 0  # terminal state has zero value
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    # bootstrap the n-step value targets backwards through the reward buffer
                    buffer_v_target = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = (
                        np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target))
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    GLOBAL_RUNNING_R[self.name].append(R[-1])
                    GLOBAL_EP += 1
                    print(self.name, "Ep:", GLOBAL_EP, "prof:", R[-1], "len:", len(R))
                    break
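Worker.work() relies on a shared TensorFlow session (SESS), a coordinator (COORD), and the global episode counters already existing at module level; the driver that wires this together is not shown in this section. Below is a minimal sketch of the usual A3C launch pattern, assuming names such as SESS, COORD, N_WORKERS, and GLOBAL_NET_SCOPE and a single-argument ACNet constructor for the global net (all of these are assumptions and may differ from the actual script):

    import threading
    import tensorflow as tf

    SESS = tf.Session()
    with tf.device("/cpu:0"):
        GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE)          # shared global network (assumed constructor)
        workers = [Worker("W_%i" % i, GLOBAL_AC) for i in range(N_WORKERS)]

    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())

    threads = []
    for worker in workers:
        t = threading.Thread(target=worker.work)     # each worker trains asynchronously
        t.start()
        threads.append(t)
    COORD.join(threads)                              # block until every worker stops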
if OUTPUT_GRAPH:
    summary_writer = tf.summary.FileWriter("logs/", sess.graph)

prob = []
ep_rs_nextsum = []
for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    path = []
    loss = []
    while True:
        a = actor.choose_action(s)
        s_next, r, done, info = env.step(a)
        track_r.append(r)

        # The critic receives (s, r, s_next) from the actor's step, estimates the
        # values of s and s_next (V and V_), and returns the TD error, which
        # replaces the discounted return (tf_vt) used in plain policy gradient.
        td_error = critic.learn(s, r, s_next)  # gradient = grad[r + gamma * V(s_next) - V(s)]
        loss.append(td_error)
        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        s = s_next
        t += 1
        if done: