Example #1
import numpy as np


class TestWorker(object):
    def __init__(self, name, globalAC):
        # filename, size, train_size, test_size, valid_size and CATEGORY are
        # module-level constants defined elsewhere in the script.
        self.env = Market(filename, size, train_size, test_size, valid_size,
                          CATEGORY)
        self.name = name
        self.AC = ACNet(name, globalAC)
        self.AC.pull_global()

    def reset(self):
        self.AC.pull_global()

    def work(self, flag="test"):
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        s = self.env.reset_for_test(flag)
        while True:
            a = self.AC.choose_action(s, test=True)
            s_, r, done, info = self.env.step(a)
            print("a:", a, "r:", r, "time:", self.env.time, "len:",
                  len(self.env.observation_space))
            buffer_s.append(s[-1])
            buffer_a.append(a)
            buffer_r.append(r)

            s = s_
            total_step += 1
            if done:
                # buffer_r holds per-step returns; adding 1 converts them to
                # gross growth factors whose product is the cumulative profit.
                prof = np.array(buffer_r) + 1
                print("prof:", prof.prod(), "len", len(prof))

                with open(LOG_DIR + "/state_detail.txt", 'w') as f:
                    f.write(str(buffer_s))
                break
        return prof
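
A brief usage sketch (not part of the original example): it assumes the TensorFlow session, the shared ACNet instance globalAC, and the module-level constants used by Market have already been set up by the surrounding script.

# Hypothetical driver code for the TestWorker class above.
tester = TestWorker("W_test", globalAC)
tester.reset()                             # sync local weights from the global net
test_returns = tester.work(flag="test")    # array of per-step gross return factors
print("cumulative test profit:", test_returns.prod())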
Example #2
import numpy as np


class Worker(object):
    def __init__(self, name, globalAC, step=60):
        self.env = Market(filename, size, train_size, test_size, valid_size,
                          CATEGORY)
        self.name = name
        self.AC = ACNet(name, globalAC)
        self.step = step

    def set_scope(self, scope):
        return self.env.set_env(scope[0], scope[1])

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            R = [1]
            while True:
                a = self.AC.choose_action(s)
                s_, r, done, info = self.env.step(a)
                #print("a:",a,"r:",r,"time:",self.env.time,"len:",len(self.env.observation_space))
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                R.append((r + 1) * R[-1])  # compound per-step return into a running profit factor

                if total_step % self.step == 0 or done:   # update global and assign to local net
                    if done:
                        v_s_ = 0   # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:    # walk rewards backwards to build discounted n-step returns
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s = np.vstack(buffer_s)
                    buffer_a = np.array(buffer_a)
                    buffer_v_target = np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    self.AC.update_global(feed_dict)

                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    GLOBAL_RUNNING_R[self.name].append(R[-1])
                    GLOBAL_EP += 1
                    print(self.name, "Ep:", GLOBAL_EP, "prof:", R[-1], "len", len(R))
                    break
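
This worker relies on script-level globals (SESS, COORD, GAMMA, MAX_GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_EP). Below is a sketch of how such workers are typically launched in a TF1-style A3C setup; N_WORKERS, GLOBAL_NET_SCOPE and the single-argument ACNet constructor for the global net are assumptions, not taken from the original (GAMMA and MAX_GLOBAL_EP are assumed to be defined elsewhere).

import threading

import tensorflow as tf

# Hypothetical launch code for the Worker class above (TF1-style A3C);
# N_WORKERS, GLOBAL_NET_SCOPE and the global ACNet constructor are assumed names.
N_WORKERS = 4
GLOBAL_EP = 0
GLOBAL_RUNNING_R = {"W_%i" % i: [] for i in range(N_WORKERS)}

SESS = tf.Session()
COORD = tf.train.Coordinator()

with tf.device("/cpu:0"):
    globalAC = ACNet(GLOBAL_NET_SCOPE)        # shared global net; constructor signature assumed
    workers = [Worker("W_%i" % i, globalAC) for i in range(N_WORKERS)]

SESS.run(tf.global_variables_initializer())

worker_threads = []
for worker in workers:
    t = threading.Thread(target=worker.work)  # each worker trains in its own thread
    t.start()
    worker_threads.append(t)
COORD.join(worker_threads)                    # blocks until every worker's loop ends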
Example #3
import tensorflow as tf

if OUTPUT_GRAPH:
    summary_writer = tf.summary.FileWriter("logs/", sess.graph)

prob = []
ep_rs_nextsum = []
for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    path = []
    loss = []
    while True:
        a = actor.choose_action(s)

        s_next, r, done, info = env.step(a)
        # print(( a, env.state, r, env.close[env.time],env.close[env.time-1]))

        track_r.append(r)
        # The critic receives the reward r obtained in state s and the next
        # state s_next, evaluates the values of s and s_next (V and V_), and
        # returns the TD error, which the actor uses in place of tf_vt from
        # plain policy gradient.
        td_error = critic.learn(s, r, s_next)  # gradient = grad[r + gamma * V(s_next) - V(s)]
        loss.append(td_error)
        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        s = s_next
        t += 1

        if done: