Example #1
def __init__(self, sess, name, N_S, N_A, globalAC):
    self.SESS = sess
    self.N_S = N_S
    self.N_A = N_A
    self.env = StockEnv()
    self.name = name
    self.AC = A3CNet(self.SESS, self.name, self.N_S, self.N_A, globalAC)
Example #2
def main(train=False):
    data = np.loadtxt('./data.csv', delimiter=',', skiprows=1)
    data = data[230:-1]  # drop the first day's data
    agent = DQN_Trade()

    for i in range(0, 10):
        iters = len(data) // 240  # integer division: number of full 240-row windows
        for iter_step in range(0, iters):
            # print(iter_step)
            iter_data = data[iter_step * 240:iter_step * 240 + 240]
            env = StockEnv(iter_data)
            s = env.reset()
            while True:
                action = agent.egreedy_action(s)
                s_, reward, done = env.gostep(action)
                print(action)
                agent.precive(s, action, reward, s_, done)
                s = s_
                if done:
                    break
        agent.save_model(step=i)
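The loop above relies on a project-specific StockEnv exposing reset() and gostep(action), plus a DQN_Trade agent, neither of which appears on this page. For smoke-testing the control flow only, a minimal stand-in environment could look like the sketch below; the class name, the reward rule, and the assumption that each row of data.csv starts with a price column are all invented here.

import numpy as np

class StubStockEnv:
    """Illustrative stand-in for StockEnv: fixed-length episodes over a price array."""

    def __init__(self, data, episode_len=240):
        self.data = np.asarray(data, dtype=np.float32)
        self.episode_len = min(episode_len, len(self.data))
        self.t = 0

    def reset(self):
        self.t = 0
        return self.data[self.t]

    def gostep(self, action):
        # Reward is the next price change, signed by the action (1 = long, 2 = short, 0 = flat).
        self.t += 1
        delta = float(self.data[self.t][0] - self.data[self.t - 1][0])
        reward = delta if action == 1 else -delta if action == 2 else 0.0
        done = self.t >= self.episode_len - 1
        return self.data[self.t], reward, done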
Example #4
def run_stock():
    n_epoch = 100
    mean = []
    for x in range(n_epoch):
        env = Env(STOCK.Baidu)
        env.set_count(999)  # from 1000 to 2000
        for i in range(1000):
            action = random.random() * 2 - 1  # [-1, 1]
            env.step(action)

        mean.append(env.asset - 10000)

        # while True:
        #     action = random.random() * 2 - 1  # [-1, 1]
        #     observation_, reward, done = env.step(action)
        #
        #     if done:
        #         mean.append(env.asset - 10000)
        #         break

    print(np.mean(mean), np.var(mean))
    # end of game
    print('game over')
Example #5
def test(test_data, model, tickers, randomize, num_rand_stocks=0):
    """
    the test function: test the agent on test_data.
    If randomize is True, stocks were randomized during training,
    so we test on num_rand_stocks randomly selected stocks from test_data;
    otherwise, we use the entire test_data set.

    :param test_data: the testing set
    :param model: the trained model
    :param tickers: stock tickers corresponding to test_data, including "CASH"
    :param randomize: boolean indicating whether we have randomized stocks
    :param num_rand_stocks: number of stocks randomized in training
    """
    if randomize:
        # the last element of tickers is "CASH", we don't include "CASH" in randomization
        rand_stock_indices = np.random.choice(len(tickers) - 1,
                                              num_rand_stocks,
                                              replace=False)
        # get randomly selected stock names
        episode_tickers = [tickers[index] for index in rand_stock_indices]
        episode_tickers.append("CASH")
        rand_stock_indices = tf.reshape(rand_stock_indices,
                                        (len(rand_stock_indices), 1))
        # saving the randomization to a new variable so we don't mess w/ test_data
        episode_input = tf.gather_nd(test_data, rand_stock_indices)
    else:
        episode_input = test_data
        episode_tickers = tickers

    env = StockEnv(episode_input, episode_tickers, is_testing=True)
    states, actions, rewards = env.generate_episode(model)
    min_testing_episode_len = 20
    while len(rewards) < min_testing_episode_len:
        print("test episode not long enough")
        states, actions, rewards = env.generate_episode(model)
    print(f'final portfolio total value: {rewards[-1]}')
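Both this test function and the train function in Example #9 select random stocks with the same trick: indices reshaped to (k, 1) and passed to tf.gather_nd pick k whole rows (stocks) out of a [num_stocks, num_days, datum_size] tensor. A tiny self-contained illustration with made-up shapes:

import numpy as np
import tensorflow as tf

data = tf.constant(np.arange(4 * 3 * 2).reshape(4, 3, 2), dtype=tf.float32)  # [4 stocks, 3 days, 2 features]
indices = tf.reshape(tf.constant([2, 0]), (2, 1))  # select stocks 2 and 0
subset = tf.gather_nd(data, indices)               # shape (2, 3, 2)
print(subset.shape)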
Example #6
File: A3C.py Project: linbirg/RL
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1: done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v,
                                        {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(
                        buffer_s), np.vstack(buffer_a), np.vstack(
                            buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R
                           ) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] +
                                                0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:",
                        GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:',
                        test,
                    )
                    GLOBAL_EP += 1
                    break
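The reversed loop over buffer_r above builds bootstrapped value targets v_t = r_t + GAMMA * v_{t+1}, seeded with 0 at a terminal state or with the critic's estimate otherwise. The same computation in isolation, as a plain NumPy helper for reference:

import numpy as np

def bootstrapped_value_targets(rewards, bootstrap_value, gamma=0.9):
    """v_t = r_t + gamma * v_{t+1}, seeded with the value of the state after the last reward."""
    targets = []
    v = bootstrap_value
    for r in reversed(rewards):
        v = r + gamma * v
        targets.append(v)
    targets.reverse()
    return np.array(targets, dtype=np.float32).reshape(-1, 1)  # column vector, like np.vstack above

# bootstrapped_value_targets([1.0, 0.0], bootstrap_value=2.0) -> [[2.62], [1.8]]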
Example #7
File: A3C.py Project: linbirg/RL
MAX_GLOBAL_EP = 2000
MAX_EP_STEP = 300
UPDATE_GLOBAL_ITER = 5
N_WORKERS = multiprocessing.cpu_count()
LR_A = 1e-4  # learning rate for actor
LR_C = 2e-4  # learning rate for critic
GAMMA = 0.9  # reward discount
# MODE = ['easy', 'hard']
# n_model = 1
GLOBAL_NET_SCOPE = 'Global_Net'
ENTROPY_BETA = 0.01
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

env = StockEnv()
N_S = env.reset().shape[0]
N_A = 1
A_BOUND = env.action_bound[1]
del env


class ACNet(object):
    def __init__(self, scope, globalAC=None):

        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self._build_net()
                self.a_params = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
Example #8
            self.portfolio[i] -= 1
        elif a == 1 and self.balance - p - TRANS_FEE >= 0:  # buy
            self.balance -= (p + TRANS_FEE)
            self.portfolio[i] += 1
        # else is hold so you do nothing

    def update_value(self, prices):
        total = 0
        for i, p in enumerate(prices):
            total += self.portfolio[i] * p
        self.value.append(total + self.balance)
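update_value above marks the portfolio to market: value = balance + sum over i of shares_i * price_i. The same computation as a stand-alone helper (names invented here, for reference only):

def portfolio_value(balance, shares, prices):
    """Cash balance plus the market value of all held shares."""
    return balance + sum(s * p for s, p in zip(shares, prices))

# portfolio_value(100.0, [2, 0, 1], [10.0, 5.0, 3.0]) -> 123.0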


### Begin Simulation ###

env = StockEnv(NUM_SECTORS)
agent = q_agent(len(env.sectors))
fig = plt.figure()
ax = fig.add_subplot(111)
fig.suptitle('Hard Coded Agent')

for episode in range(NUM_EPISODES):
    # reset the environment and initialize the portfolio value
    agent.reset()
    p0 = env.reset()
    agent.update_value(p0)

    for t in range(MAX_T):
        # select the next action
        action = agent.select_action(p0)
        # execute the next action and get next state and reward
Example #9
def train(train_data,
          model,
          tickers,
          randomize,
          num_rand_stocks=0,
          episode_max_days=200):
    """
    the train function: trains the model for one entire epoch

    :param train_data: the preprocessed training data, of shape [num_stocks, num_days, datum_size]
    :param model: the model to be trained
    :param tickers: stock tickers corresponding to train_data, including "CASH"
    :param randomize: boolean indicating whether we have randomized stocks
    :param num_rand_stocks: number of stocks randomized in training
    :param episode_max_days: the maximum number of days of trading actions in an episode

    :return: the list of per-episode losses and the list of rewards
    """
    num_days = train_data.shape[1]
    loss_list = []

    offset = model.past_num - 1  # extra days of price history needed at beginning
    start = 0  # start of price history slice (inclusive)
    end = start + episode_max_days + offset  # end of price history slice (exclusive)
    num_episodes = (num_days - offset) // episode_max_days

    # a list of total cash value
    rewards_list = []

    for episode in range(num_episodes):
        print(f"Training episode {episode+1} of {num_episodes}")

        if randomize:
            # the last element of tickers is "CASH", we don't include "CASH" in randomization
            rand_stock_indices = np.random.choice(len(tickers) - 1,
                                                  num_rand_stocks,
                                                  replace=False)
            # get randomly selected stock names
            episode_tickers = [tickers[index] for index in rand_stock_indices]
            episode_tickers.append("CASH")
            rand_stock_indices = tf.reshape(rand_stock_indices,
                                            (len(rand_stock_indices), 1))
            episode_input = tf.gather_nd(train_data, rand_stock_indices)
        else:
            episode_input = train_data
            episode_tickers = tickers

        # Slice of pricing history to generate this episode on
        episode_input = episode_input[:, start:end, :]
        start += episode_max_days
        end += episode_max_days

        # (REMOVED) randomize starting date in each episode
        # rand_start = randint(0, int(episode_max_days / 5))
        # episode_input = episode_input[:, rand_start:episode_max_days,:]

        # Hyperparameters to be adjusted below:
        env = StockEnv(episode_input,
                       episode_tickers,
                       interest_annual=0.1,
                       borrow_interest_annual=0.2,
                       transaction_penalty=0.0001)

        with tf.GradientTape() as tape:
            states, actions, rewards = env.generate_episode(model)
            rewards_list.extend(rewards)
            discounted_rewards = discount(rewards)
            model.remember(states, actions, discounted_rewards)
            repl_states, repl_actions, repl_discounted_rewards = model.experience_replay(
            )
            model_loss = model.loss(repl_states, repl_actions,
                                    repl_discounted_rewards)
        gradients = tape.gradient(model_loss, model.trainable_variables)
        model.optimizer.apply_gradients(
            zip(gradients, model.trainable_variables))
        loss_list.append(model_loss.numpy())  # record this episode's loss
    return list(loss_list), rewards_list
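train() calls a discount(rewards) helper that is not included in this snippet. A common implementation of plain discounted returns is sketched below; the helper name matches the call above, but the body and the discount factor are assumptions.

import numpy as np

def discount(rewards, gamma=0.99):
    """Discounted returns G_t = r_t + gamma * G_{t+1} (gamma value assumed here)."""
    discounted = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted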
Example #10
class Worker(object):
    GAMMA = 0.9
    GLOBAL_RUNNING_R = []
    GLOBAL_EP = 0

    def __init__(self, sess, name, N_S, N_A, globalAC):
        self.SESS = sess
        self.N_S = N_S
        self.N_A = N_A
        self.env = StockEnv()
        self.name = name
        self.AC = A3CNet(self.SESS, self.name, self.N_S, self.N_A, globalAC)
        # self.saver = tf.train.Saver()

    def _record_global_reward_and_print(self, global_runing_rs, ep_r,
                                        global_ep, total_step):
        global_runing_rs.append(ep_r)
        try:
            print(self.name, "Ep:", global_ep,
                  "| Ep_r: %i" % global_runing_rs[-1], "| total step:",
                  total_step)
        except Exception as e:
            print(e)

    def train(self):
        buffer_s, buffer_a, buffer_r = [], [], []
        s = self.env.reset()
        ep_r = 0
        total_step = 1

        def reset():
            nonlocal ep_r, total_step
            self.env.reset()
            ep_r = 0
            total_step = 1

        while not COORD.should_stop() and self.GLOBAL_EP < MAX_GLOBAL_EP:
            # s = self.env.reset()
            # ep_r = 0
            # total_step = 1
            reset()
            while total_step < MAX_TOTAL_STEP:
                try:
                    s = self.env.get_state()
                    a, p = self.AC.choose_action(s)
                    s_, r, done = self.env.step(a)
                    if done:
                        r = -2

                    ep_r += r
                    buffer_s.append(s)
                    buffer_a.append(a)
                    buffer_r.append(r)

                    if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                        self.AC.update(done, s_, buffer_r, buffer_s, buffer_a)
                        buffer_s, buffer_a, buffer_r = [], [], []

                    if done:
                        self._record_global_reward_and_print(
                            self.GLOBAL_RUNNING_R, ep_r, self.GLOBAL_EP,
                            total_step)
                        self.GLOBAL_EP += 1
                        reset()

                    # s = s_
                    total_step += 1
                    if self.name == 'W_0':
                        self.env.render()
                        time.sleep(0.05)
                        logger.debug([
                            "s ", s, " v ",
                            self.AC.get_v(s), " a ", a, " p ", p, " ep_r ",
                            ep_r, " total ", self.env.total, " acct ",
                            self.env.acct
                        ])
                except Exception as e:
                    print(e)

            try:
                print(self.name, " not done,may be donkey!", " total_step:",
                      total_step)
            except Exception as e:
                print(e)
Example #11
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from stock_env import StockEnv
from actor_critic_eligibility_trace import ActorCriticEligibilityTrace

test = True

env = StockEnv(test=test, random_flips=False)
env_passive = StockEnv(test=test)
ac = ActorCriticEligibilityTrace(env)

if not test:
    ac.train()
else:
    df = pd.read_csv("csvs/norm_all_stocks_5yr.csv")
    stock_df = df[df.Name == env.test_stock_name]

    policy_mlp = ac.policy_mlp
    policy_mlp.load_state_dict(torch.load("policy_mlp.pth"))
    value_mlp = ac.value_mlp
    value_mlp.load_state_dict(torch.load("value_mlp.pth"))

    obss = []
    actions = []
    rewards = []

    obs = env.reset()
    while True:
        obss.append(obs)
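The snippet is cut off just after the evaluation loop starts collecting observations. Below is a self-contained sketch of a greedy evaluation rollout; it assumes, as the other examples on this page do, that env.step returns (obs, reward, done) and that the policy network outputs one score per action, both of which are assumptions about this project rather than its actual code.

import torch

def greedy_rollout(env, policy_mlp, max_steps=10_000):
    """Run one greedy test episode with a trained policy network."""
    obss, actions, rewards = [], [], []
    obs = env.reset()
    for _ in range(max_steps):
        obss.append(obs)
        with torch.no_grad():
            scores = policy_mlp(torch.as_tensor(obs, dtype=torch.float32))
        action = int(torch.argmax(scores).item())  # greedy at test time
        obs, reward, done = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            break
    return obss, actions, rewards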
Example #12
def __init__(self, name, globalAC):
    self.env = StockEnv()
    self.name = name
    self.AC = ACNet(name, self.env.get_state().shape[0], 4, globalAC)
Example #13
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name, self.env.get_state().shape[0], 4, globalAC)

    def _update_global_reward(self, ep_r):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
            GLOBAL_RUNNING_R.append(ep_r)
        else:
            GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
            logger.debug(
                [self.name,
                "Ep:",
                GLOBAL_EP,
                "| Ep_r: %i" % GLOBAL_RUNNING_R[-1]]
            )
            GLOBAL_EP += 1

    def _update_global_acnet(self, done, s_, buffer_s, buffer_a, buffer_r):
        if done:
            v_s_ = 0  # terminal state: no bootstrap value
        else:
            v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
        buffer_v_target = []
        for r in buffer_r[::-1]:  # reverse buffer r
            v_s_ = r + GAMMA * v_s_
            buffer_v_target.append(v_s_)
        buffer_v_target.reverse()

        buffer_s, buffer_a, buffer_v_target = np.vstack(
            buffer_s), np.array(buffer_a), np.vstack(buffer_v_target)
        feed_dict = {
            self.AC.s: buffer_s,
            self.AC.a_his: buffer_a,
            self.AC.v_target: buffer_v_target,
        }
        self.AC.update_global(feed_dict)

    def work(self):
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        self.env.reset()
        if self.name == 'W_0':
            self.env.render()
        while not COORD.should_stop():
            ep_r = 0
            while True:
                s = self.env._get_state()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done: 
                    r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    self._update_global_acnet(done, s_, buffer_s, buffer_a,
                                              buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                # s = s_
                total_step += 1
                if done:
                    self._update_global_reward(ep_r)
                    break
                
                if self.name == 'W_0':
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r, " total_step:", total_step, 'total', self.env.total])
                    time.sleep(0.5)

    def train(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            while True:
                # if self.name == 'W_0':
                    # self.env.render()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done: r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    self._update_global_acnet(done, s_, buffer_s, buffer_a,
                                              buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                if done:
                    self._update_global_reward(ep_r)
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r, " total_step:", total_step, 'total', self.env.total])
                    break

                s = s_
                total_step += 1
Example #14
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_GLOBAL_EP = 50
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 50
# GAMMA = 0.9
GAMMA = 0.8
# ENTROPY_BETA = 0.001
ENTROPY_BETA = 0.1
LR_A = 0.001  # learning rate for actor
# LR_C = 0.001  # learning rate for critic
LR_C = 0.01  # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

env = StockEnv()
N_S = env.get_state().shape[0]
N_A = 4

logger = Logger('A3C')

class ACNet(object):
    def __init__(self, scope, N_S, A_S, globalAC=None):

        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculate losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
Example #15
from stock_env import StockEnv
from stock_env import STOCK
from stock_env_discrete import StockEnv as StockEnvD
from stock_env_discrete import STOCK as STOCKD
from sklearn.externals import joblib  # deprecated in scikit-learn 0.21 and removed in 0.23; use `import joblib` on newer versions
import warnings

if __name__ == '__main__':
    warnings.filterwarnings("ignore")  # ignore warnings

    stockHMM = StockHMM(STOCK.Baidu)
    # load model
    stockHMM.model = joblib.load("BaiDuHMM.pkl")

    print('Continuous: ')
    env = StockEnv(STOCK.Baidu)
    # [1000, 2000)
    env.set_count(999)
    for x in range(1000):
        p_states = stockHMM.predict(x + 1000)
        # order: 2 3 4 1 0
        my_action = p_states[
            2] + p_states[3] * 0.5 - p_states[1] * 0.5 - p_states[0]
        env.step(my_action)

    print(env.asset - 10000)

    print('Discrete: ')
    env = StockEnvD(STOCKD.Baidu)
    # [1000, 2000)
    env.set_count(999)
Example #16
File: A3C.py Project: linbirg/RL
def __init__(self, name, globalAC):
    self.env = StockEnv()
    self.name = name
    self.AC = ACNet(name, globalAC)
Example #17
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint
from stock_env import StockEnv
import models

INPUT_SHAPE = (30, 180)
WINDOW_LENGTH = 4

model = models.build_combined_model()

# Get the environment and extract the number of actions.
env = StockEnv()
nb_actions = 3

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=10000000, window_length=WINDOW_LENGTH)

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
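The LinearAnnealedPolicy call is truncated here. Using keras-rl's published DQN examples as a template, the rest of the setup usually looks like the sketch below; the hyperparameter values and the optimizer choice are assumptions, not this project's actual settings.

from keras.optimizers import Adam

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                              value_max=1.0, value_min=0.1, value_test=0.05,
                              nb_steps=1000000)

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               nb_steps_warmup=50000, gamma=0.99, target_model_update=10000,
               train_interval=4, delta_clip=1.0)
dqn.compile(Adam(lr=0.00025), metrics=['mae'])
dqn.fit(env, nb_steps=1750000, log_interval=10000)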
Example #18
    loss = torch.mean((Qpred - Qs) ** 2)
    # print(f"loss:{loss.data[0]}")
    loss.backward()
    optim.step()

'''
035720 카카오 (Kakao)
005930 삼성전자 (Samsung Electronics)
000660 SK하이닉스 (SK hynix)
000120 CJ대한통운 (CJ Logistics)
285130 SK케미칼 (SK chemicals)
008970 동양철관 (Dongyang Steel Pipe)
'''
# env = StockEnv("000660")
env = StockEnv("000660")
Replay = namedtuple("Replay",
    ["state", "action", "new_state", "reward", "done"])
predDQN = DQN(env.num_state(), env.num_action(), 40)
targetDQN = DQN(env.num_state(), env.num_action(), 40)
# optim = torch.optim.Adam(predDQN.parameters(), lr=0.1)
optim = torch.optim.SGD(predDQN.parameters(), lr=0.1)

replay_buffer = deque()
BUFFER_SIZE = 5000

num_episods = 30000
gamma = 0.9

reward_history = []
duration_history = []
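The loss fragment at the top of this example minimizes the squared TD error between Qpred and Qs, but the replay sampling and target construction are not shown. A self-contained sketch of such an update step follows; it reuses the names from the snippet (predDQN, targetDQN, optim, replay_buffer, Replay), while the batch sampling and the max-over-actions target are assumptions.

import random
import torch

def train_step(predDQN, targetDQN, optim, replay_buffer, batch_size=64, gamma=0.9):
    """One DQN update: sample replays, build bootstrapped targets, minimize squared TD error."""
    batch = random.sample(list(replay_buffer), min(batch_size, len(replay_buffer)))
    states = torch.stack([torch.as_tensor(r.state, dtype=torch.float32) for r in batch])
    new_states = torch.stack([torch.as_tensor(r.new_state, dtype=torch.float32) for r in batch])
    actions = torch.tensor([r.action for r in batch], dtype=torch.int64)
    rewards = torch.tensor([r.reward for r in batch], dtype=torch.float32)
    dones = torch.tensor([float(r.done) for r in batch])

    Qpred = predDQN(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        Qs = rewards + gamma * (1.0 - dones) * targetDQN(new_states).max(dim=1).values

    optim.zero_grad()
    loss = torch.mean((Qpred - Qs) ** 2)
    loss.backward()
    optim.step()
    return loss.item()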
Example #19
from stock_env import StockEnv


env = StockEnv()

if __name__ == '__main__':
    env.render()
    # print(env.step(1))
    s,r,done = env.step(1)
    print(s)
    print(s.shape)
    print(r)

    print("=====================")
    s,r,done = env.step(0)
    print(s)
    print(r)

    print("====================")
    s,r,done = env.step(2)
    print(s)
    print(r)


    
Example #20
                            self.env.acct
                        ])
                except Exception as e:
                    print(e)

            try:
                print(self.name, " not done,may be donkey!", " total_step:",
                      total_step)
            except Exception as e:
                print(e)


if __name__ == "__main__":
    GLOBAL_NET_SCOPE = 'Global_Net'
    N_A = 4
    N_S = StockEnv().get_state().shape[0]

    SESS = tf.Session()

    with tf.device("/cpu:0"):
        GLOBAL_AC = A3CNet(SESS, GLOBAL_NET_SCOPE, N_S,
                           N_A)  # we only need its params
        workers = []

        # Create worker
        for i in range(N_WORKERS):
            i_name = 'W_%i' % i  # worker name
            workers.append(Worker(SESS, i_name, N_S, N_A, GLOBAL_AC))

    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())
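The main block stops right after the variables are initialized. In the A3C tutorial layout this code follows, the remaining step is to launch one thread per worker and join them through the coordinator. A hedged sketch of that continuation (this project's Worker exposes train(), while other variants on this page name the method work()):

    import threading

    worker_threads = []
    for worker in workers:
        t = threading.Thread(target=worker.train)  # use worker.work in the variants that define work()
        t.start()
        worker_threads.append(t)
    COORD.join(worker_threads)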