def main(train=False):
    # Load the price series, skipping the CSV header row.
    data = np.loadtxt('./data.csv', delimiter=',', skiprows=1)
    data = data[230:-1]  # drop the first day's data
    agent = DQN_Trade()
    for i in range(10):
        iters = len(data) // 240  # number of complete 240-step episodes
        for iter_step in range(iters):
            # print(iter_step)
            iter_data = data[iter_step * 240:iter_step * 240 + 240]
            env = StockEnv(iter_data)
            s = env.reset()
            while True:
                action = agent.egreedy_action(s)
                s_, reward, done = env.gostep(action)
                print(action)
                agent.precive(s, action, reward, s_, done)
                s = s_
                if done:
                    break
        agent.save_model(step=i)
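# The DQN_Trade agent above only appears through its egreedy_action / precive /
# save_model calls.  As a hedged illustration of what such an agent could look
# like, here is a minimal self-contained sketch; the class name MiniDQNAgent,
# the network sizes and the hyperparameters are assumptions for this example,
# not the project's actual implementation.
import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn


class MiniDQNAgent:
    def __init__(self, state_dim, n_actions, epsilon=0.1, gamma=0.9):
        self.q_net = nn.Sequential(
            nn.Linear(state_dim, 64), nn.ReLU(), nn.Linear(64, n_actions))
        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=1e-3)
        self.replay = deque(maxlen=10000)
        self.epsilon, self.gamma, self.n_actions = epsilon, gamma, n_actions

    def egreedy_action(self, s):
        # explore with probability epsilon, otherwise act greedily on Q(s, .)
        if random.random() < self.epsilon:
            return random.randrange(self.n_actions)
        with torch.no_grad():
            q = self.q_net(torch.as_tensor(s, dtype=torch.float32))
        return int(q.argmax())

    def perceive(self, s, a, r, s_, done, batch_size=32):
        # store the transition and, once enough samples exist, train on a minibatch
        self.replay.append((s, a, r, s_, done))
        if len(self.replay) < batch_size:
            return
        batch = random.sample(self.replay, batch_size)
        s_b, a_b, r_b, s2_b, d_b = map(np.array, zip(*batch))
        s_b = torch.as_tensor(s_b, dtype=torch.float32)
        s2_b = torch.as_tensor(s2_b, dtype=torch.float32)
        q = self.q_net(s_b).gather(
            1, torch.as_tensor(a_b).long().unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            target = torch.as_tensor(r_b, dtype=torch.float32) + \
                self.gamma * self.q_net(s2_b).max(1).values * \
                (1.0 - torch.as_tensor(d_b, dtype=torch.float32))
        loss = nn.functional.mse_loss(q, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


# usage on a toy 4-dimensional state space with 3 discrete actions
agent = MiniDQNAgent(state_dim=4, n_actions=3)
s = np.zeros(4, dtype=np.float32)
a = agent.egreedy_action(s)
agent.perceive(s, a, 1.0, np.ones(4, dtype=np.float32), False)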
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1:
                    done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = (
                        np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target))
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:', test,
                    )
                    GLOBAL_EP += 1
                    break
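# The reversed-buffer loop above computes n-step discounted value targets:
# v_target[t] = r[t] + GAMMA * v_target[t+1], bootstrapped from the critic's
# value of the last state (or 0 at a terminal).  Below is a small,
# self-contained numpy sketch of that same computation for clarity; the
# function name discounted_value_targets is ours, not part of the original code.
import numpy as np


def discounted_value_targets(rewards, bootstrap_value, gamma=0.9):
    """Return the discounted value targets, oldest transition first."""
    v_s_ = bootstrap_value
    targets = []
    for r in reversed(rewards):
        v_s_ = r + gamma * v_s_
        targets.append(v_s_)
    targets.reverse()
    return np.asarray(targets, dtype=np.float32)


# Example: three rewards and a bootstrapped critic value of 1.0.
# The targets are 0.2 + 0.9*1.0 = 1.1, then 0.0 + 0.9*1.1 = 0.99,
# then 0.1 + 0.9*0.99 = 0.991, returned oldest-first.
print(discounted_value_targets([0.1, 0.0, 0.2], 1.0))  # -> [0.991, 0.99, 1.1]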
MAX_GLOBAL_EP = 2000
MAX_EP_STEP = 300
UPDATE_GLOBAL_ITER = 5
N_WORKERS = multiprocessing.cpu_count()
LR_A = 1e-4    # learning rate for actor
LR_C = 2e-4    # learning rate for critic
GAMMA = 0.9    # reward discount
# MODE = ['easy', 'hard']
# n_model = 1
GLOBAL_NET_SCOPE = 'Global_Net'
ENTROPY_BETA = 0.01
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

env = StockEnv()
N_S = env.reset().shape[0]
N_A = 1
A_BOUND = env.action_bound[1]
del env


class ACNet(object):
    def __init__(self, scope, globalAC=None):
        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self._build_net()
                self.a_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
                self.c_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
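# ENTROPY_BETA above suggests the actor loss in ACNet includes an entropy bonus
# to keep exploration alive, and A_BOUND bounds the single continuous action.
# Since the TF1 _build_net body is not shown here, the sketch below is a hedged
# illustration only, written in PyTorch for a Gaussian policy: the function
# name gaussian_a3c_losses and the toy shapes are ours, and the original
# network's exact loss terms may differ.
import torch
from torch.distributions import Normal

ENTROPY_BETA = 0.01


def gaussian_a3c_losses(mu, sigma, v, a_his, v_target, a_bound):
    # critic: squared TD error against the n-step value targets
    td = v_target - v
    critic_loss = td.pow(2).mean()
    # actor: policy-gradient term weighted by the TD error, plus entropy bonus
    dist = Normal(mu * a_bound, sigma + 1e-4)
    log_prob = dist.log_prob(a_his)
    entropy = dist.entropy()
    actor_loss = -(log_prob * td.detach() + ENTROPY_BETA * entropy).mean()
    return actor_loss, critic_loss


# toy shapes: batch of 5 one-dimensional actions (N_A = 1, as above)
mu, sigma = torch.zeros(5, 1), torch.ones(5, 1)
v, a_his, v_target = torch.zeros(5, 1), torch.zeros(5, 1), torch.ones(5, 1)
print(gaussian_a3c_losses(mu, sigma, v, a_his, v_target, a_bound=1.0))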
            # tail of the agent's portfolio-value update: holdings marked to
            # market, plus the cash balance
            total += self.portfolio[i] * p
        self.value.append(total + self.balance)


### Begin Simulation ###
env = StockEnv(NUM_SECTORS)
agent = q_agent(len(env.sectors))

fig = plt.figure()
ax = fig.add_subplot(111)
fig.suptitle('Hard Coded Agent')

for episode in range(NUM_EPISODES):
    # reset the environment and initialize the portfolio value
    agent.reset()
    p0 = env.reset()
    agent.update_value(p0)
    for t in range(MAX_T):
        # select the next action
        action = agent.select_action(p0)
        # execute the next action and get next state and reward
        p = env.step()
        for i, a in enumerate(action):
            agent.act(i, a, p[i])
        agent.update_value(p)
        # render the portfolio value graph
        env.render(ax, agent.value)
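# The two-line fragment at the top of this block accumulates holdings times
# current prices and appends the result plus cash to self.value.  A standalone
# sketch of that mark-to-market computation follows; the helper name
# portfolio_value is ours, introduced only for illustration.
def portfolio_value(holdings, prices, balance):
    """Mark the portfolio to market: shares * price summed over sectors, plus cash."""
    total = 0.0
    for shares, price in zip(holdings, prices):
        total += shares * price
    return total + balance


# Example: 10 and 5 shares of two sectors at prices 3.0 and 2.0, with 100 cash.
print(portfolio_value([10, 5], [3.0, 2.0], 100.0))  # -> 140.0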
num_episodes = 30000
gamma = 0.9
reward_history = []
duration_history = []
prev_score = avg_return = None
decaying = 0.99
e = 1.0

for episode in range(num_episodes):
    # epsilon schedules tried so far; the hyperbolic decay below is the active one
    # e = 1.0 / (np.sqrt(episode) * 1 + 1)
    e = 1.0 / (episode / 10 + 1)
    # e = random_action_prob
    # e = 0.2 / (episode / 5000 + 1) * 0.5 * (1 + np.cos(2 * np.pi * episode / 5000))
    if episode > 0.9 * num_episodes:
        e = 0.0  # act fully greedily for the last 10% of episodes

    state = env.reset()
    reward_sum = 0.0
    for step in range(5000):
        # if episode % 100 == 0:
        #     env.render()
        if np.random.rand(1) < e:
            a = env.random_action()
        else:
            Qs = predDQN(state)
            _, i = torch.max(Qs.data, 0)
            a = i[0]

        new_state, reward, done, info = env.step(a)
        replay_buffer.append(Replay(state, a, new_state, reward, done))
        if len(replay_buffer) > BUFFER_SIZE:
            replay_buffer.pop(0)  # keep the buffer bounded at BUFFER_SIZE
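# The replay_buffer above is a plain list capped by dropping the oldest entry
# once it exceeds BUFFER_SIZE.  An equivalent, slightly more idiomatic variant
# uses collections.deque with maxlen, which evicts old transitions
# automatically; this is a sketch under that assumption, not the project's
# code, and Replay is assumed here to be a simple namedtuple.
import random
from collections import deque, namedtuple

Replay = namedtuple('Replay', ['state', 'action', 'new_state', 'reward', 'done'])

BUFFER_SIZE = 50000
replay_buffer = deque(maxlen=BUFFER_SIZE)  # old transitions fall out automatically


def sample_minibatch(buffer, batch_size=64):
    """Draw a random minibatch of transitions for a DQN update."""
    return random.sample(buffer, min(batch_size, len(buffer)))


# usage: append transitions during the rollout, then sample for training
replay_buffer.append(
    Replay(state=[0.0], action=0, new_state=[0.1], reward=1.0, done=False))
print(len(sample_minibatch(replay_buffer, batch_size=8)))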
if not test:
    ac.train()
else:
    df = pd.read_csv("csvs/norm_all_stocks_5yr.csv")
    stock_df = df[df.Name == env.test_stock_name]

    policy_mlp = ac.policy_mlp
    policy_mlp.load_state_dict(torch.load("policy_mlp.pth"))
    value_mlp = ac.value_mlp
    value_mlp.load_state_dict(torch.load("value_mlp.pth"))

    # roll out the trained policy on the test stock
    obss = []
    actions = []
    rewards = []
    obs = env.reset()
    while True:
        obss.append(obs)
        action, _ = policy_mlp(torch.as_tensor(obs, dtype=torch.float32))
        obs, reward, done, _ = env.step(action.detach().numpy())
        actions.append(action)
        rewards.append(reward)
        if done:
            break

    # roll out a passive baseline for comparison
    obss_passive = []
    actions_passive = []
    rewards_passive = []
    obs = env.reset()
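# Once both rollouts finish, rewards and rewards_passive can be compared as
# cumulative returns over the test episode.  The sketch below shows one way to
# do that; the helper names cumulative_returns / plot_agent_vs_passive and the
# plotting choices are ours, not taken from the project.
import numpy as np
import matplotlib.pyplot as plt


def cumulative_returns(rewards):
    """Running sum of per-step rewards over one episode."""
    return np.cumsum(np.asarray(rewards, dtype=np.float64))


def plot_agent_vs_passive(rewards, rewards_passive):
    plt.plot(cumulative_returns(rewards), label='trained policy')
    plt.plot(cumulative_returns(rewards_passive), label='passive baseline')
    plt.xlabel('step')
    plt.ylabel('cumulative reward')
    plt.legend()
    plt.show()


# usage with toy data
plot_agent_vs_passive([0.1, -0.05, 0.2], [0.0, 0.05, 0.05])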
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name, self.env.get_state().shape[0], 4, globalAC)

    def _update_global_reward(self, ep_r):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
            GLOBAL_RUNNING_R.append(ep_r)
        else:
            GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
        logger.debug(
            [self.name, "Ep:", GLOBAL_EP, "| Ep_r: %i" % GLOBAL_RUNNING_R[-1]]
        )
        GLOBAL_EP += 1

    def _update_global_acnet(self, done, s_, buffer_s, buffer_a, buffer_r):
        if done:
            v_s_ = 0  # terminal
        else:
            v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
        buffer_v_target = []
        for r in buffer_r[::-1]:  # reverse buffer r
            v_s_ = r + GAMMA * v_s_
            buffer_v_target.append(v_s_)
        buffer_v_target.reverse()

        buffer_s, buffer_a, buffer_v_target = (
            np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target))
        feed_dict = {
            self.AC.s: buffer_s,
            self.AC.a_his: buffer_a,
            self.AC.v_target: buffer_v_target,
        }
        self.AC.update_global(feed_dict)

    def work(self):
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        self.env.reset()
        if self.name == 'W_0':
            self.env.render()
        while not COORD.should_stop():
            ep_r = 0
            while True:
                s = self.env._get_state()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done:
                    r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    self._update_global_acnet(done, s_, buffer_s, buffer_a, buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                # s = s_
                total_step += 1
                if done:
                    self._update_global_reward(ep_r)
                    break
                if self.name == 'W_0':
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r,
                                  " total_step:", total_step, 'total', self.env.total])
                    time.sleep(0.5)

    def train(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            while True:
                # if self.name == 'W_0':
                #     self.env.render()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done:
                    r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update global and assign to local net
                    self._update_global_acnet(done, s_, buffer_s, buffer_a, buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                if done:
                    self._update_global_reward(ep_r)
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r,
                                  " total_step:", total_step, 'total', self.env.total])
                    break
                s = s_
                total_step += 1
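# update_global (push the local worker's gradients to the shared network) and
# pull_global (copy the shared weights back into the local net) are used above
# but defined elsewhere in the project.  As a hedged illustration of the same
# A3C sync pattern in PyTorch (the names push_gradients / pull_weights are
# ours), the worker's update step can look like this:
import torch


def push_gradients(local_net, global_net, global_optimizer, loss):
    """Compute gradients on the local net and apply them to the global net."""
    global_optimizer.zero_grad()
    loss.backward()
    for lp, gp in zip(local_net.parameters(), global_net.parameters()):
        gp.grad = lp.grad.clone() if lp.grad is not None else None
    global_optimizer.step()


def pull_weights(local_net, global_net):
    """Overwrite the local worker's weights with the freshly updated global ones."""
    local_net.load_state_dict(global_net.state_dict())


# usage with two tiny identical networks
local_net = torch.nn.Linear(4, 2)
global_net = torch.nn.Linear(4, 2)
opt = torch.optim.Adam(global_net.parameters(), lr=1e-3)
loss = local_net(torch.randn(3, 4)).pow(2).mean()
push_gradients(local_net, global_net, opt, loss)
pull_weights(local_net, global_net)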
class Worker(object):
    GAMMA = 0.9
    GLOBAL_RUNNING_R = []
    GLOBAL_EP = 0

    def __init__(self, sess, name, N_S, N_A, globalAC):
        self.SESS = sess
        self.N_S = N_S
        self.N_A = N_A
        self.env = StockEnv()
        self.name = name
        self.AC = A3CNet(self.SESS, self.name, self.N_S, self.N_A, globalAC)
        # self.saver = tf.train.Saver()

    def _record_global_reward_and_print(self, global_running_rs, ep_r, global_ep, total_step):
        global_running_rs.append(ep_r)
        try:
            print(self.name, "Ep:", global_ep,
                  "| Ep_r: %i" % global_running_rs[-1],
                  "| total step:", total_step)
        except Exception as e:
            print(e)

    def train(self):
        buffer_s, buffer_a, buffer_r = [], [], []
        s = self.env.reset()
        ep_r = 0
        total_step = 1

        def reset():
            nonlocal ep_r, total_step
            self.env.reset()
            ep_r = 0
            total_step = 1

        while not COORD.should_stop() and self.GLOBAL_EP < MAX_GLOBAL_EP:
            # s = self.env.reset()
            # ep_r = 0
            # total_step = 1
            reset()
            while total_step < MAX_TOTAL_STEP:
                try:
                    s = self.env.get_state()
                    a, p = self.AC.choose_action(s)
                    s_, r, done = self.env.step(a)
                    if done:
                        r = -2
                    ep_r += r
                    buffer_s.append(s)
                    buffer_a.append(a)
                    buffer_r.append(r)
                    if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                        # update global and assign to local net
                        self.AC.update(done, s_, buffer_r, buffer_s, buffer_a)
                        buffer_s, buffer_a, buffer_r = [], [], []
                    if done:
                        self._record_global_reward_and_print(
                            self.GLOBAL_RUNNING_R, ep_r, self.GLOBAL_EP, total_step)
                        self.GLOBAL_EP += 1
                        reset()
                    # s = s_
                    total_step += 1
                    if self.name == 'W_0':
                        self.env.render()
                        time.sleep(0.05)
                        logger.debug([
                            "s ", s, " v ", self.AC.get_v(s), " a ", a, " p ", p,
                            " ep_r ", ep_r, " total ", self.env.total,
                            " acct ", self.env.acct
                        ])
                except Exception as e:
                    print(e)
            try:
                print(self.name, " episode not finished within MAX_TOTAL_STEP",
                      " total_step:", total_step)
            except Exception as e:
                print(e)