Example #1
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# EnvFactory is project-specific; it builds the Gym env named by args.env.

def main(args):
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        # Load a pretrained PPO expert; its observation space may be shorter
        # than the evaluation env's (the obs is sliced to match below).
        expert = PPO.load(args.policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            # No expert loaded: feed a batched zero action to the vec env.
            action = np.zeros((env.num_envs,) + env.action_space.shape)
        else:
            # Slice the evaluation obs down to the expert's input size.
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        # Accumulate the unnormalized reward, not the VecNormalize-scaled one.
        reward = env.get_original_reward()
        total_reward += reward[0]
        if done[0]:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0
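A minimal, self-contained sketch of the same evaluation pattern (load frozen VecNormalize statistics, then track unnormalized rewards), using the stock CartPole-v1 env instead of the project-specific EnvFactory; both file names are hypothetical:

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
env = VecNormalize.load("vecnormalize.pkl", env)  # hypothetical stats file
env.training = False     # freeze the running mean/var at evaluation time
env.norm_reward = False  # report raw rewards

model = PPO.load("ppo_cartpole")  # hypothetical model file
obs = env.reset()
done = [False]
total_reward = 0.0
while not done[0]:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    total_reward += env.get_original_reward()[0]
print("Total reward: {:.3f}".format(total_reward))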
Example #2
import datetime

import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize


def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env,
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=False)  # custom kwarg from the authors' PPO fork, not stock SB3
    model.learn(total_timesteps=40000000)
    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")  # persist the VecNormalize statistics

    # Dump every policy tensor and the observation statistics as plain text,
    # e.g. for loading the trained controller outside Python.
    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)  # custom kwarg, as above

    # Enjoy the trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)

        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards[0]

        if dones[0]:
            # DummyVecEnv auto-resets the underlying env when the episode ends.
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
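The obs_mean.txt / obs_variance.txt files exported above are enough to reproduce VecNormalize's observation transform outside of stable-baselines3 (e.g., in a standalone controller). A hedged sketch of that transform; clip_obs matches the value used above and epsilon is the stable-baselines3 default:

import numpy as np

obs_mean = np.loadtxt("./result/obs_mean.txt")
obs_var = np.loadtxt("./result/obs_variance.txt")

def normalize_obs(obs, clip_obs=2.0, epsilon=1e-8):
    # The same transform VecNormalize applies before the policy sees the
    # observation: standardize with the running statistics, then clip.
    return np.clip((obs - obs_mean) / np.sqrt(obs_var + epsilon),
                   -clip_obs, clip_obs)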
Example #3
    def DRL_prediction(self, model, name, last_state, iter_num, turbulence_threshold, initial):
        ### make a prediction based on the trained model ###

        ## trading env
        trade_data = data_split(self.df, start=self.unique_trade_date[iter_num - self.rebalance_window], end=self.unique_trade_date[iter_num])
        trade_env = DummyVecEnv([lambda: StockTradingEnv(trade_data,
                                                        self.stock_dim,
                                                        self.hmax,
                                                        self.initial_amount,
                                                        self.buy_cost_pct,
                                                        self.sell_cost_pct,
                                                        self.reward_scaling,
                                                        self.state_space,
                                                        self.action_space,
                                                        self.tech_indicator_list,
                                                        turbulence_threshold=turbulence_threshold,
                                                        initial=initial,
                                                        previous_state=last_state,
                                                        model_name=name,
                                                        mode='trade',
                                                        iteration=iter_num,
                                                        print_verbosity=self.print_verbosity)])

        trade_obs = trade_env.reset()

        for i in range(len(trade_data.index.unique())):
            action, _states = model.predict(trade_obs)
            trade_obs, rewards, dones, info = trade_env.step(action)
            if i == (len(trade_data.index.unique()) - 2):
                # On the second-to-last step, render() returns the env state,
                # which is carried into the next rebalancing window.
                last_state = trade_env.render()

        df_last_state = pd.DataFrame({'last_state': last_state})
        df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i), index=False)
        return last_state
Example #4
def DRL_prediction(df,
                   model,
                   name,
                   last_state,
                   iter_num,
                   unique_trade_date,
                   rebalance_window,
                   turbulence_threshold,
                   initial):
    ### make a prediction based on the trained model ###

    ## trading env
    trade_data = data_split(df, start=unique_trade_date[iter_num - rebalance_window], end=unique_trade_date[iter_num])
    env_trade = DummyVecEnv([lambda: StockEnvTrade(trade_data,
                                                   turbulence_threshold=turbulence_threshold,
                                                   initial=initial,
                                                   previous_state=last_state,
                                                   model_name=name,
                                                   iteration=iter_num)])
    obs_trade = env_trade.reset()

    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(obs_trade)
        obs_trade, rewards, dones, info = env_trade.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            # As in Example #3, render() returns the env state here, which is
            # carried into the next rebalancing window.
            last_state = env_trade.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i), index=False)
    return last_state
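Both prediction variants above lean on FinRL's data_split helper to carve the trading window out of the full dataframe. A sketch of what that helper does, assuming a long-format dataframe with 'date' and 'tic' columns (an approximation for reference, not the exact FinRL source):

def data_split(df, start, end):
    # Keep rows with start <= date < end, then re-index so that each
    # trading day maps to one integer index (one env step per day).
    data = df[(df.date >= start) & (df.date < end)]
    data = data.sort_values(["date", "tic"], ignore_index=True)
    data.index = data.date.factorize()[0]
    return data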
Example #5
def random_train_test():
    import matplotlib.pyplot as plt

    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import DummyVecEnv

    from lutils.stock import LTdxHq

    ltdxhq = LTdxHq()
    df = ltdxhq.get_k_data_1min('000032') # 000032 300142 603636 
    df = df[-240:]
    ltdxhq.close()

    model = PPO.load('ppo_stock')  # 'ppo_stock' is saved by the training run in Example #11

    # LStockDailyEnv and NEXT_OBSERVATION_SIZE come from the surrounding module.
    env = DummyVecEnv([lambda: LStockDailyEnv(df)])
    obs = env.reset()

    rewards = []
    actions = []
    net_worths = []
    for i in range(NEXT_OBSERVATION_SIZE, df.shape[0]):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        rewards.append(reward)
        actions.append(action[0][0])
        net_worths.append(info[0]['net_worth'])
        # print(info[0]['current_step'])
        env.render()

    fig, ax = plt.subplots()
    ax.plot(rewards, label='rewards')
    ax.plot(actions, label='actions')
    ax.legend()
    ax2 = ax.twinx()
    ax2.plot(net_worths, label='net worth', color='red')
    ax2.legend()
    plt.show()
Example #6
def stock_trade(stock_file):
    day_profits = []
    df = pd.read_csv(stock_file)
    df = df.sort_values('date')

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: StockTradingEnv(df)])

    model = PPO('MlpPolicy', env, verbose=0, tensorboard_log='./log')
    model.learn(total_timesteps=int(1e6))

    df_test = pd.read_csv(stock_file.replace('train', 'test'))

    env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
    obs = env.reset()
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
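        # This custom StockTradingEnv returns the day's profit from render().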
        profit = env.render()
        day_profits.append(profit)
        if done:
            break
    return day_profits
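A hedged usage sketch for the function above; the path is hypothetical, and its 'test' counterpart (as produced by stock_file.replace('train', 'test')) is assumed to exist:

import matplotlib.pyplot as plt

profits = stock_trade('stockdata/train/AAPL.csv')
plt.plot(profits, label='daily profit')
plt.legend()
plt.show()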
Example #7
import pandas as pd

from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO

from env.StockTradingEnv import StockTradingEnv

df = pd.read_csv('./data/AAPL.csv')
df = df.sort_values('Date')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: StockTradingEnv(df)])

model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=20000)

obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
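The script above trains and runs the agent in one process but never persists it. Saving and reloading uses the standard stable-baselines3 calls; the file name here is hypothetical:

model.save("ppo_stock_trading")                 # writes ppo_stock_trading.zip
model = PPO.load("ppo_stock_trading", env=env)  # reload and reattach the env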
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Project  : crypto
# @Time     : 2021/4/26 13:48
# @Author   : Adolf
# @File     : ppo_base.py
# @Function  :
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

from strategy.reinforcement_base.environment.crypto_env_v0 import CryptoEnv

df = pd.read_csv("dataset/1d/BTC.csv")
df_test = pd.read_csv("dataset/1d/ETH.csv")

env = DummyVecEnv([lambda: CryptoEnv(df)])

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=20000)

env_test = DummyVecEnv([lambda: CryptoEnv(df_test)])  # evaluate on the held-out ETH data
obs = env_test.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env_test.step(action)
    env_test.render()
Example #9
# Fragment: make_env, num_envs, and the imports (numpy as np, PPO,
# SubprocVecEnv, DummyVecEnv, VecFrameStack) are defined elsewhere in
# the original script.
envs = SubprocVecEnv([make_env] * num_envs)
envs = VecFrameStack(envs, n_stack=4)

model = PPO.load("./subzero_model")
model.set_env(envs)
obs = envs.reset()
print(obs.shape)

# Create one env for testing
env = DummyVecEnv([make_env])
env = VecFrameStack(env, n_stack=4)
obs = env.reset()

# model.predict(obs) would throw an error here because the test batch has
# fewer envs than the training batch, so pad the observation batch with
# zeros and keep the real observation in slot 0.
zero_completed_obs = np.zeros((num_envs,) + envs.observation_space.shape)
zero_completed_obs[0, :] = obs
obs = zero_completed_obs

while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render(mode="human")
    if dones.all():
        break

    zero_completed_obs = np.zeros((num_envs,) + envs.observation_space.shape)
    zero_completed_obs[0, :] = obs
    obs = zero_completed_obs
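Note that the loop above steps the single test env with a batch of num_envs actions, of which only the first is used. stable-baselines3's feed-forward policies accept observation batches of any size in model.predict, so the zero-padding is mainly a holdover from stable-baselines2 recurrent policies; a simpler single-env loop would be:

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render(mode="human")
    if dones[0]:
        break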
Example #10
import gym

#from stable_baselines.common.policies import MlpPolicy
from stable_baselines3.ppo import MlpPolicy
#from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import DummyVecEnv
#from stable_baselines import PPO2
from stable_baselines3 import PPO

from env.StockTradingEnv import StockTradingEnv

import pandas as pd

df = pd.read_csv('./data/MSFT.csv')
df = df.sort_values('Date')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: StockTradingEnv(df)])

model = PPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=50)

obs = env.reset()
print(f"Number of iterations {len(df['Date'])}")
for i in range(len(df['Date'])):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render(mode='live')
Example #11
def test_rl():
    import gym
    import datetime as dt
    import matplotlib.pyplot as plt

    # from stable_baselines.common.policies import MlpPolicy, CnnPolicy, MlpLstmPolicy, ActorCriticPolicy, LstmPolicy
    # from stable_baselines.common.vec_env import DummyVecEnv
    # from stable_baselines import PPO2, PPO1, A2C, DQN, TD3, SAC

    # from stable_baselines3.common.policies import MlpPolicy
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import DummyVecEnv
    from stable_baselines3.common.evaluation import evaluate_policy

    from sklearn import preprocessing

    import pandas as pd

    from lutils.stock import LTdxHq

    ltdxhq = LTdxHq()
    code = '600519' # 000032 300142 603636 600519
    df = ltdxhq.get_k_data_1min(code, end='2021-09-02') # 000032 300142 603636 600519
    # df = ltdxhq.get_k_data_daily('603636', end='2019-01-01') # 000032 300142 603636 600519
    # StockDataFrame and LStockDailyEnv come from the surrounding module.
    df = StockDataFrame(df.rename(columns={'vol': 'volume'}))

    # min_max_scaler = preprocessing.MinMaxScaler()
    # df = pd.DataFrame(min_max_scaler.fit_transform(df.drop(columns=['date', 'code'])))
    # df.columns = ['open', 'close', 'high', 'low', 'volume', 'amount']

    df_eval = ltdxhq.get_k_data_1min(code, start='2021-09-01')
    df_eval = StockDataFrame(df_eval.rename(columns={'vol': 'volume'}))

    ltdxhq.close()
    # df = ltdxhq.get_k_data_5min('603636')
    # df = ltdxhq.get_k_data_daily('603636')

    # df1 = df[:-240]
    # df2 = df[-240:]
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: LStockDailyEnv(df)])
    # model = PPO2(MlpPolicy, env, verbose=1) # , tensorboard_log='log')
    model = PPO('MlpPolicy', env, verbose=1) # , tensorboard_log='log')
    model.learn(100000)
    # model = PPO1(LstmPolicy, env, verbose=1)
    # model.learn(total_timesteps=1000)

    # env.set_attr('df', df2)
    # obs = env.reset()

    # rewards = []
    # actions = []
    # net_worths = []
    # # for i in range(220):
    # for i in range(NEXT_OBSERVATION_SIZE, df2.shape[0]):
    #     # actual_obs = observation(df2, i)
    #     # action, _states = model.predict(actual_obs)
    #     # action = [action]
    #     action, _states = model.predict(obs)
    #     obs, reward, done, info = env.step(action)
    #     rewards.append(reward)
    #     actions.append(action[0][0])
    #     net_worths.append(info[0]['net_worth'])
    #     # print(info[0]['current_step'])
    #     env.render()

    # mean_reward, _  = evaluate_policy(model, eval_env, n_eval_episodes=1, render=True) # EVAL_EPS

    # print(mean_reward)

    model.save('ppo_stock')
    # model = PPO.load('ppo_stock')

    eval_env = DummyVecEnv([lambda: LStockDailyEnv(df_eval)])
    obs = eval_env.reset()

    net_worths = []
    actions = []
    state = None
    done = [False]
    while not done[0]:
        action, state = model.predict(obs, state=state, deterministic=True)
        obs, reward, done, _info = eval_env.step(action)
        net_worths.append(_info[0]['net_worth'])
        # if is_recurrent:
        #     obs[0, :] = new_obs
        # else:
        #     obs = new_obs

        # if action[0] < Actions.Buy: # Buy
        #     actions.append(1)
        # elif action[0] < Actions.Sell: # Sell
        #     actions.append(2)
        # else:
        #     actions.append(0)
        actions.append(action[0])
        eval_env.render()

    plt.plot(net_worths)
    plt.plot(actions)
    plt.show()
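evaluate_policy is imported above but only appears in a commented-out line. For reference, the call returns the mean and standard deviation of the episode reward (standard stable-baselines3 API):

mean_reward, std_reward = evaluate_policy(model, eval_env,
                                          n_eval_episodes=10,
                                          deterministic=True)
print('mean reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))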