Example #1
class Worker(object):
    def __init__(self, name, globalAC, step=60):
        # Each worker owns its own copy of the trading environment and a local
        # actor-critic network that syncs with the shared global network.
        # Market and its arguments (filename, size, train_size, test_size,
        # valid_size, CATEGORY) are expected to be defined at module level.
        self.env = Market(filename, size, train_size, test_size, valid_size, CATEGORY)
        self.name = name
        self.AC = ACNet(name, globalAC)
        self.step = step  # number of environment steps between global updates

    def set_scope(self, scope):
        # Forward a (start, end) range to the environment.
        return self.env.set_env(scope[0], scope[1])


    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            R = [1]   # running compounded return; updated as R[t+1] = (1 + r) * R[t]
            while True:
                a = self.AC.choose_action(s)
                s_, r, done, info = self.env.step(a)
                #print("a:",a,"r:",r,"time:",self.env.time,"len:",len(self.env.observation_space))
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                R.append((r + 1) * R[-1])   # compound the per-step return

                if total_step % self.step == 0 or done:   # update global and assign to local net
                    if done:
                        v_s_ = 0   # terminal state: no bootstrap value
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]   # bootstrap from the critic
                    buffer_v_target = []
                    for r in buffer_r[::-1]:    # walk rewards backwards to build discounted value targets
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    self.AC.update_global(feed_dict)   # push the local gradients to the global net

                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()              # pull the updated weights back into the local net

                s = s_
                total_step += 1   
                if done:
                    GLOBAL_RUNNING_R[self.name].append(R[-1])
                    GLOBAL_EP += 1
                    print(self.name, "Ep:", GLOBAL_EP, "prof:", R[-1], "len", len(R))
                    #for temp in R:
                        #print(temp+1)                                                 

                    break
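
For context, a Worker like this is usually driven by a short launcher that builds the shared network, creates one worker per CPU core, and runs each worker's work() loop in its own thread under a tf.train.Coordinator. The following is a minimal sketch of that standard A3C launch pattern, not code from the original project: it assumes ACNet can be constructed as a standalone global network, that Market's arguments (filename, size, train_size, test_size, valid_size, CATEGORY) are defined at module level, and the constant values and names such as GLOBAL_AC are illustrative.

import multiprocessing
import threading

import tensorflow as tf

if __name__ == "__main__":
    GAMMA = 0.9            # illustrative values; the original constants are not shown
    MAX_GLOBAL_EP = 2000
    GLOBAL_EP = 0

    SESS = tf.Session()
    with tf.device("/cpu:0"):
        GLOBAL_AC = ACNet("Global_Net")          # assumed: ACNet can be built as a standalone global net
        workers = [Worker("W_%i" % i, GLOBAL_AC)
                   for i in range(multiprocessing.cpu_count())]

    GLOBAL_RUNNING_R = {w.name: [] for w in workers}   # per-worker episode returns, as used in work()
    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())

    threads = []
    for worker in workers:
        t = threading.Thread(target=worker.work)       # each worker trains in its own thread
        t.start()
        threads.append(t)
    COORD.join(threads)                                 # block until every worker stops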
Example #2
sess = tf.Session()

actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(
    sess, n_features=N_F, lr=LR_C
)  # we need a good teacher, so the teacher should learn faster than the actor

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    summary_writer = tf.summary.FileWriter("logs/", sess.graph)

prob = []
ep_rs_nextsum = []
for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    path = []
    loss = []
    while True:
        a = actor.choose_action(s)

        s_next, r, done, info = env.step(a)
        # print(( a, env.state, r, env.close[env.time],env.close[env.time-1]))

        track_r.append(r)
        # The actor passes the reward r and next state s_next obtained in state s to the
        # critic, which computes the values of s and s_next (V and V_).
        # The resulting TD error is passed back to the actor, replacing tf_vt from the
        # plain policy-gradient update.
        td_error = critic.learn(
            s, r, s_next)  # gradient = grad[r + gamma * V(s_next) - V(s)]
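        # Illustrative note (not part of the original example): Critic.learn is
        # assumed to implement a TD(0) update roughly like
        #   v, v_next = V(s), V(s_next)                 # critic's value estimates
        #   td_error  = r + gamma * v_next - v          # temporal-difference error
        # minimizing the squared TD error and returning td_error so the actor can
        # use it in place of a Monte-Carlo return.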