def __init__(self, sess, name, N_S, N_A, globalAC):
    self.SESS = sess
    self.N_S = N_S
    self.N_A = N_A
    self.env = StockEnv()
    self.name = name
    self.AC = A3CNet(self.SESS, self.name, self.N_S, self.N_A, globalAC)
def main(train=False):
    data = np.loadtxt('./data.csv', delimiter=',', skiprows=1)
    data = data[230:-1]  # delete the first day's data
    agent = DQN_Trade()
    for i in range(0, 10):
        iters = len(data) // 240  # integer division: number of full 240-row windows
        for iter_step in range(0, iters):
            # print(iter_step)
            iter_data = data[iter_step * 240:iter_step * 240 + 240]
            env = StockEnv(iter_data)
            s = env.reset()
            while True:
                action = agent.egreedy_action(s)
                s_, reward, done = env.gostep(action)
                print(action)
                agent.precive(s, action, reward, s_, done)  # store the transition (method name as defined in DQN_Trade)
                s = s_
                if done:
                    break
        agent.save_model(step=i)
def run_stock():
    n_epoch = 100
    mean = []
    for x in range(n_epoch):
        env = Env(STOCK.Baidu)
        env.set_count(999)  # trade over the window [1000, 2000)
        for i in range(1000):
            action = random.random() * 2 - 1  # random action in [-1, 1]
            env.step(action)
        mean.append(env.asset - 10000)
        # while True:
        #     action = random.random() * 2 - 1  # [-1, 1]
        #     observation_, reward, done = env.step(action)
        #
        #     if done:
        #         mean.append(env.asset - 10000)
        #         break
    print(np.mean(mean), np.var(mean))
    # end of game
    print('game over')
def test(test_data, model, tickers, randomize, num_rand_stocks=0):
    """
    The test function: evaluate the agent on test_data.

    If randomize == True, training used randomized stocks, so we test on
    num_rand_stocks randomly selected stocks from test_data; otherwise we
    use the entire test_data set.

    :param test_data: the testing set
    :param model: the trained model
    :param tickers: stock tickers corresponding to test_data, including "CASH"
    :param randomize: boolean indicating whether we have randomized stocks
    :param num_rand_stocks: number of stocks randomized in training
    """
    if randomize:
        # the last element of tickers is "CASH"; we don't include "CASH" in randomization
        rand_stock_indices = np.random.choice(len(tickers) - 1, num_rand_stocks, replace=False)
        # get the randomly selected stock names
        episode_tickers = [tickers[index] for index in rand_stock_indices]
        episode_tickers.append("CASH")
        rand_stock_indices = tf.reshape(rand_stock_indices, (len(rand_stock_indices), 1))
        # save the randomized selection to a new variable so we don't modify test_data
        episode_input = tf.gather_nd(test_data, rand_stock_indices)
    else:
        episode_input = test_data
        episode_tickers = tickers

    env = StockEnv(episode_input, episode_tickers, is_testing=True)
    states, actions, rewards = env.generate_episode(model)

    min_testing_episode_len = 20
    while len(rewards) < min_testing_episode_len:
        print("test episode not long enough")
        states, actions, rewards = env.generate_episode(model)
    print(f'final portfolio total value: {rewards[-1]}')
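# A minimal, hypothetical usage sketch for test() above. load_test_data and
# build_model are placeholder names introduced here for illustration (they are
# not part of the original project); substitute the project's own data-loading
# and model-construction code.
test_data, tickers = load_test_data()   # assumed shape [num_stocks, num_days, datum_size]; tickers ends with "CASH"
model = build_model()                   # assumed: the trained model expected by env.generate_episode
test(test_data, model, tickers, randomize=True, num_rand_stocks=5)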
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1:
                    done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update the global net and assign its params to the local net
                    if done:
                        v_s_ = 0  # terminal state has zero value
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # walk the reward buffer backwards
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = (
                        np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target))
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:', test,
                    )
                    GLOBAL_EP += 1
                    break
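# The reversed-buffer loop above computes n-step bootstrapped value targets
# for the critic. The same computation as a small standalone helper, shown
# here only as a clarifying sketch (it is not part of the original file):
def discounted_value_targets(rewards, v_bootstrap, gamma):
    """Return [r_t + gamma * r_{t+1} + ... + gamma^k * v_bootstrap] for each step t."""
    targets = []
    v = v_bootstrap  # value estimate of the state following the last buffered step (0 if terminal)
    for r in reversed(rewards):
        v = r + gamma * v
        targets.append(v)
    targets.reverse()
    return targets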
MAX_GLOBAL_EP = 2000
MAX_EP_STEP = 300
UPDATE_GLOBAL_ITER = 5
N_WORKERS = multiprocessing.cpu_count()
LR_A = 1e-4    # learning rate for actor
LR_C = 2e-4    # learning rate for critic
GAMMA = 0.9    # reward discount
# MODE = ['easy', 'hard']
# n_model = 1
GLOBAL_NET_SCOPE = 'Global_Net'
ENTROPY_BETA = 0.01
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

env = StockEnv()
N_S = env.reset().shape[0]
N_A = 1
A_BOUND = env.action_bound[1]
del env


class ACNet(object):
    def __init__(self, scope, globalAC=None):
        if scope == GLOBAL_NET_SCOPE:  # get the global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self._build_net()
                self.a_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
            self.portfolio[i] -= 1
        elif a == 1 and self.balance - p - TRANS_FEE >= 0:  # buy
            self.balance -= (p + TRANS_FEE)
            self.portfolio[i] += 1
        # else: the action is hold, so do nothing

    def update_value(self, prices):
        total = 0
        for i, p in enumerate(prices):
            total += self.portfolio[i] * p
        self.value.append(total + self.balance)


### Begin Simulation ###
env = StockEnv(NUM_SECTORS)
agent = q_agent(len(env.sectors))

fig = plt.figure()
ax = fig.add_subplot(111)
fig.suptitle('Hard Coded Agent')

for episode in range(NUM_EPISODES):
    # reset the environment and initialize the portfolio value
    agent.reset()
    p0 = env.reset()
    agent.update_value(p0)
    for t in range(MAX_T):
        # select the next action
        action = agent.select_action(p0)
        # execute the action and get the next state and reward
def train(train_data, model, tickers, randomize, num_rand_stocks=0, episode_max_days=200):
    """
    The train function: train the model for an entire epoch.

    :param train_data: the preprocessed training data, of shape [num_stocks, num_days, datum_size]
    :param model: the model to be trained
    :param tickers: stock tickers corresponding to train_data, including "CASH"
    :param randomize: boolean indicating whether we have randomized stocks
    :param num_rand_stocks: number of stocks randomized in training
    :param episode_max_days: the maximum number of days of trading actions in an episode
    :return: losses and rewards
    """
    num_days = train_data.shape[1]
    loss_list = []
    offset = model.past_num - 1  # extra days of price history needed at the beginning
    start = 0                                 # start of price history slice (inclusive)
    end = start + episode_max_days + offset   # end of price history slice (exclusive)
    num_episodes = (num_days - offset) // episode_max_days

    # a list of total cash value
    rewards_list = []

    for episode in range(num_episodes):
        print(f"Training episode {episode + 1} of {num_episodes}")
        if randomize:
            # the last element of tickers is "CASH"; we don't include "CASH" in randomization
            rand_stock_indices = np.random.choice(len(tickers) - 1, num_rand_stocks, replace=False)
            # get the randomly selected stock names
            episode_tickers = [tickers[index] for index in rand_stock_indices]
            episode_tickers.append("CASH")
            rand_stock_indices = tf.reshape(rand_stock_indices, (len(rand_stock_indices), 1))
            episode_input = tf.gather_nd(train_data, rand_stock_indices)
        else:
            episode_input = train_data
            episode_tickers = tickers

        # slice of pricing history to generate this episode on
        episode_input = episode_input[:, start:end, :]
        start += episode_max_days
        end += episode_max_days

        # (REMOVED) randomize starting date in each episode
        # rand_start = randint(0, int(episode_max_days / 5))
        # episode_input = episode_input[:, rand_start:episode_max_days, :]

        # hyperparameters to be adjusted below:
        env = StockEnv(episode_input, episode_tickers, interest_annual=0.1,
                       borrow_interest_annual=0.2, transaction_penalty=0.0001)

        with tf.GradientTape() as tape:
            states, actions, rewards = env.generate_episode(model)
            rewards_list.extend(rewards)
            discounted_rewards = discount(rewards)
            model.remember(states, actions, discounted_rewards)
            repl_states, repl_actions, repl_discounted_rewards = model.experience_replay()
            model_loss = model.loss(repl_states, repl_actions, repl_discounted_rewards)

        gradients = tape.gradient(model_loss, model.trainable_variables)
        model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        loss_list.append(model_loss.numpy())  # loss recorded at the end of each episode

    return list(loss_list), rewards_list
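# train() calls discount(rewards), which isn't shown in this snippet. Below is
# a minimal sketch of a standard discounted-return helper matching how it is
# used above; the discount factor value (0.99) and the exact behavior of the
# project's own helper are assumptions.
def discount(rewards, discount_factor=0.99):
    """Return the discounted cumulative rewards, computed back to front."""
    discounted = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_factor * running
        discounted[t] = running
    return discounted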
class Worker(object):
    GAMMA = 0.9
    GLOBAL_RUNNING_R = []
    GLOBAL_EP = 0

    def __init__(self, sess, name, N_S, N_A, globalAC):
        self.SESS = sess
        self.N_S = N_S
        self.N_A = N_A
        self.env = StockEnv()
        self.name = name
        self.AC = A3CNet(self.SESS, self.name, self.N_S, self.N_A, globalAC)
        # self.saver = tf.train.Saver()

    def _record_global_reward_and_print(self, global_runing_rs, ep_r, global_ep, total_step):
        global_runing_rs.append(ep_r)
        try:
            print(self.name, "Ep:", global_ep,
                  "| Ep_r: %i" % global_runing_rs[-1],
                  "| total step:", total_step)
        except Exception as e:
            print(e)

    def train(self):
        buffer_s, buffer_a, buffer_r = [], [], []
        s = self.env.reset()
        ep_r = 0
        total_step = 1

        def reset():
            nonlocal ep_r, total_step
            self.env.reset()
            ep_r = 0
            total_step = 1

        while not COORD.should_stop() and self.GLOBAL_EP < MAX_GLOBAL_EP:
            # s = self.env.reset()
            # ep_r = 0
            # total_step = 1
            reset()
            while total_step < MAX_TOTAL_STEP:
                try:
                    s = self.env.get_state()
                    a, p = self.AC.choose_action(s)
                    s_, r, done = self.env.step(a)
                    if done:
                        r = -2
                    ep_r += r
                    buffer_s.append(s)
                    buffer_a.append(a)
                    buffer_r.append(r)

                    if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                        # update the global net and assign its params to the local net
                        self.AC.update(done, s_, buffer_r, buffer_s, buffer_a)
                        buffer_s, buffer_a, buffer_r = [], [], []

                    if done:
                        self._record_global_reward_and_print(
                            self.GLOBAL_RUNNING_R, ep_r, self.GLOBAL_EP, total_step)
                        self.GLOBAL_EP += 1
                        reset()

                    # s = s_
                    total_step += 1

                    if self.name == 'W_0':
                        self.env.render()
                        time.sleep(0.05)
                        logger.debug([
                            "s ", s, " v ", self.AC.get_v(s), " a ", a, " p ", p,
                            " ep_r ", ep_r, " total ", self.env.total,
                            " acct ", self.env.acct
                        ])
                except Exception as e:
                    print(e)
            try:
                print(self.name, " not done, may be donkey!", " total_step:", total_step)
            except Exception as e:
                print(e)
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from stock_env import StockEnv
from actor_critic_eligibility_trace import ActorCriticEligibilityTrace

test = True

env = StockEnv(test=test, random_flips=False)
env_passive = StockEnv(test=test)
ac = ActorCriticEligibilityTrace(env)

if not test:
    ac.train()
else:
    df = pd.read_csv("csvs/norm_all_stocks_5yr.csv")
    stock_df = df[df.Name == env.test_stock_name]

    policy_mlp = ac.policy_mlp
    policy_mlp.load_state_dict(torch.load("policy_mlp.pth"))
    value_mlp = ac.value_mlp
    value_mlp.load_state_dict(torch.load("value_mlp.pth"))

    obss = []
    actions = []
    rewards = []
    obs = env.reset()
    while True:
        obss.append(obs)
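# The evaluation loop above is cut off after obss.append(obs). Below is a
# minimal, hypothetical sketch of how such a greedy rollout is usually
# finished; the env.step(action) -> (obs, reward, done) signature and the use
# of an argmax over the policy network's output are assumptions, not this
# project's confirmed interface.
def greedy_rollout(env, policy_mlp):
    obss, actions, rewards = [], [], []
    obs = env.reset()
    while True:
        obss.append(obs)
        with torch.no_grad():
            scores = policy_mlp(torch.as_tensor(obs, dtype=torch.float32))
        action = int(torch.argmax(scores).item())  # greedy action for evaluation
        actions.append(action)
        obs, reward, done = env.step(action)       # assumed return signature
        rewards.append(reward)
        if done:
            return obss, actions, rewards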
def __init__(self, name, globalAC):
    self.env = StockEnv()
    self.name = name
    self.AC = ACNet(name, self.env.get_state().shape[0], 4, globalAC)
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name, self.env.get_state().shape[0], 4, globalAC)

    def _update_global_reward(self, ep_r):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
            GLOBAL_RUNNING_R.append(ep_r)
        else:
            GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
        logger.debug(
            [self.name, "Ep:", GLOBAL_EP, "| Ep_r: %i" % GLOBAL_RUNNING_R[-1]]
        )
        GLOBAL_EP += 1

    def _update_global_acnet(self, done, s_, buffer_s, buffer_a, buffer_r):
        if done:
            v_s_ = 0  # terminal state has zero value
        else:
            v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
        buffer_v_target = []
        for r in buffer_r[::-1]:  # walk the reward buffer backwards
            v_s_ = r + GAMMA * v_s_
            buffer_v_target.append(v_s_)
        buffer_v_target.reverse()

        buffer_s, buffer_a, buffer_v_target = (
            np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target))
        feed_dict = {
            self.AC.s: buffer_s,
            self.AC.a_his: buffer_a,
            self.AC.v_target: buffer_v_target,
        }
        self.AC.update_global(feed_dict)

    def work(self):
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        self.env.reset()
        if self.name == 'W_0':
            self.env.render()
        while not COORD.should_stop():
            ep_r = 0
            while True:
                s = self.env._get_state()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done:
                    r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    self._update_global_acnet(done, s_, buffer_s, buffer_a, buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                # s = s_
                total_step += 1
                if done:
                    self._update_global_reward(ep_r)
                    break
                if self.name == 'W_0':
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r,
                                  " total_step:", total_step, 'total', self.env.total])
                    time.sleep(0.5)

    def train(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            while True:
                # if self.name == 'W_0':
                #     self.env.render()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done:
                    r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update the global net and assign its params to the local net
                    self._update_global_acnet(done, s_, buffer_s, buffer_a, buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                if done:
                    self._update_global_reward(ep_r)
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r,
                                  " total_step:", total_step, 'total', self.env.total])
                    break
                s = s_
                total_step += 1
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_GLOBAL_EP = 50
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 50
# GAMMA = 0.9
GAMMA = 0.8
# ENTROPY_BETA = 0.001
ENTROPY_BETA = 0.1
LR_A = 0.001   # learning rate for actor
# LR_C = 0.001  # learning rate for critic
LR_C = 0.01    # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

env = StockEnv()
N_S = env.get_state().shape[0]
N_A = 4

logger = Logger('A3C')


class ACNet(object):
    def __init__(self, scope, N_S, A_S, globalAC=None):
        if scope == GLOBAL_NET_SCOPE:  # get the global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculate losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
from stock_env import StockEnv
from stock_env import STOCK
from stock_env_discrete import StockEnv as StockEnvD
from stock_env_discrete import STOCK as STOCKD
from sklearn.externals import joblib
import warnings

if __name__ == '__main__':
    warnings.filterwarnings("ignore")  # ignore warnings
    stockHMM = StockHMM(STOCK.Baidu)
    # load the trained HMM
    stockHMM.model = joblib.load("BaiDuHMM.pkl")

    print('Continuous: ')
    env = StockEnv(STOCK.Baidu)
    # trade over the window [1000, 2000)
    env.set_count(999)
    for x in range(1000):
        p_states = stockHMM.predict(x + 1000)
        # state order: 2 3 4 1 0
        my_action = (p_states[2] + p_states[3] * 0.5
                     - p_states[1] * 0.5 - p_states[0])
        env.step(my_action)
    print(env.asset - 10000)

    print('Discrete: ')
    env = StockEnvD(STOCKD.Baidu)
    # trade over the window [1000, 2000)
    env.set_count(999)
def __init__(self, name, globalAC):
    self.env = StockEnv()
    self.name = name
    self.AC = ACNet(name, globalAC)
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

from stock_env import StockEnv
import models

INPUT_SHAPE = (30, 180)
WINDOW_LENGTH = 4

model = models.build_combined_model()

# Get the environment and extract the number of actions.
env = StockEnv()
nb_actions = 3

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=10000000, window_length=WINDOW_LENGTH)

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so
# that the agent initially explores the environment (high eps) and then gradually sticks to what it
# knows (low eps). We also set a dedicated eps value that is used during testing. Note that we set
# it to 0.05 so that the agent still performs some random actions. This ensures that the agent
# cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
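# The snippet above is cut off in the middle of the LinearAnnealedPolicy call.
# A sketch of how the setup typically continues in keras-rl is shown below,
# using the annealing schedule described in the comment (eps from 1.0 to 0.1
# over 1M steps, eps=0.05 at test time). The optimizer, warmup, and training
# hyperparameters are assumptions, not values confirmed by this project.
from keras.optimizers import Adam

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                              value_max=1.0, value_min=0.1, value_test=0.05,
                              nb_steps=1000000)
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               nb_steps_warmup=50000, gamma=0.99, target_model_update=10000,
               train_interval=4, delta_clip=1.0)
dqn.compile(Adam(lr=0.00025), metrics=['mae'])  # assumed learning rate
dqn.fit(env, nb_steps=1750000, log_interval=10000)  # assumed step budget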
    loss = torch.mean((Qpred - Qs) ** 2)
    # print(f"loss: {loss.data[0]}")
    loss.backward()
    optim.step()


'''
035720 Kakao
005930 Samsung Electronics
000660 SK hynix
000120 CJ Logistics
285130 SK Chemicals
008970 Dongyang Steel Pipe
'''
# env = StockEnv("000660")
env = StockEnv("000660")

Replay = namedtuple("Replay", ["state", "action", "new_state", "reward", "done"])

predDQN = DQN(env.num_state(), env.num_action(), 40)
targetDQN = DQN(env.num_state(), env.num_action(), 40)

# optim = torch.optim.Adam(predDQN.parameters(), lr=0.1)
optim = torch.optim.SGD(predDQN.parameters(), lr=0.1)

replay_buffer = deque()
BUFFER_SIZE = 5000

num_episods = 30000
gamma = 0.9

reward_history = []
duration_history = []
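# The fragment above shows only the tail of the update step (the MSE loss on
# Qpred vs Qs and the optimizer step). A minimal sketch of how those two
# quantities are typically built from a sampled minibatch with a separate
# target network is below; the batch size, the use of the Replay fields, and
# the tensor shapes are assumptions about this project, not confirmed code.
import random
import torch

def train_from_replay(predDQN, targetDQN, optim, replay_buffer, gamma, batch_size=64):
    batch = random.sample(replay_buffer, min(batch_size, len(replay_buffer)))
    states = torch.tensor([r.state for r in batch], dtype=torch.float32)
    actions = torch.tensor([r.action for r in batch], dtype=torch.int64)
    new_states = torch.tensor([r.new_state for r in batch], dtype=torch.float32)
    rewards = torch.tensor([r.reward for r in batch], dtype=torch.float32)
    dones = torch.tensor([r.done for r in batch], dtype=torch.float32)

    # Q(s, a) predicted by the online network for the actions actually taken
    Qpred = predDQN(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    # Bellman target: r + gamma * max_a' Q_target(s', a'), zeroed at terminal states
    with torch.no_grad():
        Qs = rewards + gamma * (1.0 - dones) * targetDQN(new_states).max(dim=1)[0]

    optim.zero_grad()
    loss = torch.mean((Qpred - Qs) ** 2)
    loss.backward()
    optim.step()
    return loss.item()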
from stock_env import StockEnv

env = StockEnv()

if __name__ == '__main__':
    env.render()
    # print(env.step(1))
    s, r, done = env.step(1)
    print(s)
    print(s.shape)
    print(r)
    print("=====================")

    s, r, done = env.step(0)
    print(s)
    print(r)
    print("====================")

    s, r, done = env.step(2)
    print(s)
    print(r)
if __name__ == "__main__":
    GLOBAL_NET_SCOPE = 'Global_Net'
    N_A = 4
    N_S = StockEnv().get_state().shape[0]

    SESS = tf.Session()
    with tf.device("/cpu:0"):
        GLOBAL_AC = A3CNet(SESS, GLOBAL_NET_SCOPE, N_S, N_A)  # we only need its params
        workers = []
        # Create one worker per CPU core
        for i in range(N_WORKERS):
            i_name = 'W_%i' % i  # worker name
            workers.append(Worker(SESS, i_name, N_S, N_A, GLOBAL_AC))

    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())
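    # The snippet ends right after initializing the TF variables. In the usual
    # A3C pattern the main script continues by launching one thread per worker
    # and joining them through the coordinator; a minimal sketch of that
    # continuation is below (the call to worker.train() matches the Worker
    # class shown earlier, everything else is an assumption about this
    # project's main script).
    import threading

    worker_threads = []
    for worker in workers:
        t = threading.Thread(target=worker.train)  # each worker trains against the shared global net
        t.start()
        worker_threads.append(t)
    COORD.join(worker_threads)  # block until all workers report they are done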