def optimize_agent(trial):
    env_params = optimize_envs(trial)
    print("Trial with params")
    print(env_params)

    train_env = DummyVecEnv(
        [lambda: BitcoinTradingEnv(train_df, **env_params)])
    test_env = DummyVecEnv(
        [lambda: BitcoinTradingEnv(test_df, **env_params)])

    model_params = optimize_ppo2(trial)

    # model = PPO2(MlpLnLstmPolicy, train_env, verbose=0, nminibatches=1,
    #              tensorboard_log="./tensorboard", **model_params)
    model = PPO2(MlpLnLstmPolicy, train_env, verbose=1, nminibatches=1,
                 **model_params)

    last_reward = -np.finfo(np.float16).max
    evaluation_interval = int(len(train_df) / n_evaluations)

    for eval_idx in range(n_evaluations):
        print("Eval index: " + str(eval_idx))
        model.learn(evaluation_interval)

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        obs = test_env.reset()
        while n_episodes < n_test_episodes:
            action, _ = model.predict(obs)
            obs, reward, done, _ = test_env.step(action)
            reward_sum += reward

            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = test_env.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
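# A minimal driver sketch for optimize_agent, assuming the standard Optuna
# API; the study name, storage URL, and trial count are placeholders.
study = optuna.create_study(study_name='ppo2_profit',
                            storage='sqlite:///params.db',
                            load_if_exists=True)
study.optimize(optimize_agent, n_trials=100)
print('Best trial params:', study.best_trial.params)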
def test(self, model_instance: int = 0):
    study_name = 'ppo2_' + self.reward_strategy
    study = optuna.load_study(study_name=study_name,
                              storage=self.params_db_file)
    params = study.best_trial.params

    test_env = DummyVecEnv([
        lambda: BitcoinTradingEnv(self.test_df,
                                  reward_func=self.reward_strategy,
                                  forecast_len=int(params['forecast_len']),
                                  confidence_interval=params['confidence_interval'])
    ])

    model_params = self.model_params(params)

    model = PPO2.load(os.path.join(
        '.', 'agents',
        'ppo2_' + self.reward_strategy + '_' + str(model_instance) + '.pkl'),
        env=test_env)

    obs, done = test_env.reset(), False
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = test_env.step(action)

        test_env.render(mode="human")
def optimize_envs(trial):
    params = {
        'n_forecasts': int(trial.suggest_loguniform('n_forecasts', 4, 100)),
        'confidence_interval': trial.suggest_uniform('confidence_interval', 0.7, 0.99),
    }

    df = pd.read_csv('./data/coinbase_hourly.csv')
    df = df.drop(['Symbol'], axis=1)

    test_len = int(len(df) * 0.2)
    train_len = int(len(df)) - test_len

    train_df = df[:train_len]
    test_df = df[train_len:]

    train_env = DummyVecEnv([lambda: BitcoinTradingEnv(
        train_df, reward_func='profit', **params)])
    test_env = DummyVecEnv([lambda: BitcoinTradingEnv(
        test_df, reward_func='profit', **params)])

    return train_env, test_env
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import functools

from env.BitcoinTradingEnv import BitcoinTradingEnv
from env.indicators import prepare_indicators
from REINFORCE import PolicyNetwork, update_policy

if __name__ == '__main__':
    sdf = prepare_indicators('data/bitstampUSD_1-min_data_2012-01-01_to_2019-08-12.csv')

    N = 500_000
    train_df = sdf[:N]

    train_env = BitcoinTradingEnv(train_df,
                                  lookback_window_size=60,
                                  commission=1e-4,
                                  initial_balance=1000,
                                  serial=False)

    input_dim, seq_length = train_env.observation_space.shape
    output_dim1 = train_env.action_space.nvec[0]
    output_dim2 = train_env.action_space.nvec[1]
    hidden_dim = 128
    lstm_layers = 2

    # Choose device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = torch.device('cpu')
    print(f"Device used: {device}")

    policy_net = PolicyNetwork(input_dim, output_dim1, output_dim2,
                               hidden_dim, n_layers=lstm_layers)

    # Loading the best model
    model_name = 'model/state_dict3.pt'
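    # A minimal sketch of restoring the saved weights, assuming the checkpoint
    # at model_name is a plain state_dict written with
    # torch.save(policy_net.state_dict(), model_name).
    policy_net.load_state_dict(torch.load(model_name, map_location=device))
    policy_net.to(device)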
# Hyperparameters
T_horizon = 30
n_episodes = 10000
print_interval = 2
config = {
    'lr': 0.0005,
    'gamma': 0.90,
    'lmbda': 0.95,
    'eps_clip': 0.1,
    'K_epoch': 3,
}

df = pd.read_csv('./data/1 Dec 2017 - 1 Dec 2018.csv')
test_env = BitcoinTradingEnv(df, serial=True)

print('observation space:', test_env.observation_space.shape)
print('action space:', test_env.action_space)

memory = Memory()
model = MODEL(c_in=test_env.observation_space.shape[0],
              c_out=test_env.action_space.n,
              seq_len=test_env.observation_space.shape[1])
model = model.to(device)
agent = PPO(model=model, memory=memory, config=config, device=device)

if os.path.exists('./save/model.m5'):
    agent.model.load_state_dict(torch.load('./save/model.m5'))
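# A hedged sketch of the training loop this setup implies; select_action,
# put_data, and train are assumed method names, since the PPO class's
# interface is not shown in this snippet.
for n_epi in range(n_episodes):
    obs, done, score = test_env.reset(), False, 0.0
    while not done:
        for t in range(T_horizon):
            action = agent.select_action(obs)  # assumed API
            next_obs, reward, done, _ = test_env.step(action)
            agent.put_data((obs, action, reward, next_obs, done))  # assumed API
            obs = next_obs
            score += reward
            if done:
                break
        agent.train()  # assumed API: runs K_epoch PPO updates on the buffer
    if n_epi % print_interval == 0 and n_epi != 0:
        print("episode: {}, score: {:.1f}".format(n_epi, score))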
print(params)

df = pd.read_csv('./data/coinbase_hourly.csv')
df = df.drop(['Symbol'], axis=1)
df = df.sort_values(['Date'])

test_len = int(len(df) * 0.2)
train_len = 100  # int(len(df)) - test_len

train_df = df[:train_len]
test_df = df[train_len:]

train_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(train_df,
                              n_forecasts=int(params['n_forecasts']),
                              confidence_interval=params['confidence_interval'])
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam']
}

model = PPO2(MlpLstmPolicy, train_env,
             verbose=1, nminibatches=1, **model_params)
df = requestCandles(api, gran, from_, to, instr)

saveDf = True
filename = join('data', '{}.{}.out'.format(instr, gran))
if saveDf:
    df.to_csv(filename)

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len

train_df = df[:train_len]
test_df = df[train_len:]

# ====== ENVIRONMENT SETUP =======
trainEnv = DummyVecEnv([lambda: BitcoinTradingEnv(train_df)])
testEnv = DummyVecEnv([lambda: BitcoinTradingEnv(test_df)])

model_params = {
    'n_steps': 243,
    'gamma': 0.94715,
    'learning_rate': 0.00157,
    'ent_coef': 2.29869,
    'cliprange': 0.38388,
    'noptepochs': 35,
    'lam': 0.89837,
}

# This is stupid
if curr_idx == -1:
train_df, test_df = getDatasets(mainparams.get('input_data_file'),
                                percentageToUse=mainparams.get('dataset_percentage'))

for td in testDirs:
    params = getConfiguration(join(td, 'config.yaml'))

    # ====== IMPORT MODEL ======
    modelToUse = selectFunctionAccordingToParams('model', params.get('model'))
    policyToUse = selectFunctionAccordingToParams('policy', params.get('policy'))

    # ====== ENVIRONMENT SETUP =======
    trainEnv = DummyVecEnv([
        lambda: BitcoinTradingEnv(train_df,
                                  reward_func=params.get('reward_strategy'),
                                  forecast_len=params.get('forecast_len'),
                                  confidence_interval=params.get('confidence_interval'))
    ])
    testEnv = DummyVecEnv([
        lambda: BitcoinTradingEnv(test_df,
                                  reward_func=params.get('reward_strategy'),
                                  forecast_len=params.get('forecast_len'),
                                  confidence_interval=params.get('confidence_interval'))
    ])

    boardDir = join(td, 'tensorboard')
    if not exists(boardDir):
        makedirs(boardDir)
df = pd.read_csv('./data/coinbase_hourly.csv')
df = df.drop(['Symbol'], axis=1)
df = df.sort_values(['Date'])
df = add_indicators(df.reset_index())

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len

train_df = df[:train_len]
test_df = df[train_len:]

train_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(train_df,
                              reward_func="calmar",
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval'])
])
test_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(test_df,
                              reward_func="calmar",
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval'])
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}
df = pd.read_csv('./data/wdo_small.csv')
# df = df.drop(['Symbol'], axis=1)
df = df.sort_values(['Date'])
df = add_indicators(df.reset_index())

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len
test_df = df[train_len:]

profit_study = optuna.load_study(study_name='ppo2_profit',
                                 storage='sqlite:///params.db')
profit_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(
        test_df,
        reward_func="profit",
        forecast_len=int(profit_study.best_trial.params['forecast_len']),
        confidence_interval=profit_study.best_trial.params['confidence_interval'])
])

sortino_study = optuna.load_study(study_name='ppo2_sortino',
                                  storage='sqlite:///params.db')
# Both environments report the profit reward, so the two agents' test
# results are directly comparable.
sortino_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(
        test_df,
        reward_func="profit",
        forecast_len=int(sortino_study.best_trial.params['forecast_len']),
        confidence_interval=sortino_study.best_trial.params['confidence_interval'])
])
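# A hedged sketch of comparing the two tuned agents; the checkpoint
# filenames (instance index 0) are assumptions based on the naming scheme
# used elsewhere in this repo.
profit_model = PPO2.load('./agents/ppo2_profit_0.pkl', env=profit_env)
sortino_model = PPO2.load('./agents/ppo2_sortino_0.pkl', env=sortino_env)

for name, model, env in [('profit', profit_model, profit_env),
                         ('sortino', sortino_model, sortino_env)]:
    obs, done, total = env.reset(), False, 0.0
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        total += reward
    print(name, 'agent total reward:', total)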
def train(self):
    if self.train_df is None:
        self.logger.info("Running built-in data preparation")
        self.prepare_data()
    else:
        self.logger.info("Using provided data (Length: %d)" %
                         len(self.train_df))

    study_name = 'ppo2_' + self.reward_strategy
    study = optuna.load_study(study_name=study_name,
                              storage=self.params_db_file)
    params = study.best_trial.params

    train_env = DummyVecEnv([
        lambda: BitcoinTradingEnv(self.train_df,
                                  reward_func=self.reward_strategy,
                                  forecast_len=int(params['forecast_len']),
                                  confidence_interval=params['confidence_interval'])
    ])
    test_env = DummyVecEnv([
        lambda: BitcoinTradingEnv(self.test_df,
                                  reward_func=self.reward_strategy,
                                  forecast_len=int(params['forecast_len']),
                                  confidence_interval=params['confidence_interval'])
    ])

    model_params = self.model_params(params)

    model = PPO2(MlpLnLstmPolicy,
                 train_env,
                 verbose=0,
                 nminibatches=1,
                 tensorboard_log=os.path.join('.', 'tensorboard'),
                 **model_params)

    models_to_train = 1
    self.logger.info("Training {} model instances".format(models_to_train))

    for idx in range(models_to_train):  # Not sure why we are doing this, tbh
        self.logger.info('[%d] Training for: %d time steps' %
                         (idx, len(self.train_df)))

        model.learn(total_timesteps=len(self.train_df))

        obs = test_env.reset()
        done, reward_sum = False, 0

        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = test_env.step(action)
            reward_sum += reward

        self.logger.info('[%d] Total reward: %s (%s)' %
                         (idx, reward_sum, self.reward_strategy))

        model.save(os.path.join(
            '.', 'agents',
            'ppo2_' + self.reward_strategy + '_' + str(idx) + '.pkl'))

    self.logger.info("Trained {} model instances".format(models_to_train))
params = study.best_trial.params

print("Testing PPO2 agent with params:", params)
print("Best trial:", study.best_trial.value)

df = pd.read_csv('./data/coinbase_hourly.csv')
df = df.drop(['Symbol'], axis=1)
df = df.sort_values(['Date'])
df = add_indicators(df.reset_index())

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len
test_df = df[train_len:]

test_env = DummyVecEnv([lambda: BitcoinTradingEnv(
    test_df,
    reward_func="sortino",
    forecast_len=int(params['forecast_len']),
    confidence_interval=params['confidence_interval'])])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

model = PPO2.load('./agents/ppo2_sortino_3.pkl', env=test_env)

obs, done = test_env.reset(), False
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = test_env.step(action)

    test_env.render(mode="human")
import pandas as pd
import torch
from env.BitcoinTradingEnv import BitcoinTradingEnv
import numpy as np
import torch.optim as optim
from agent.ACER import ACER
from model.GRUFCN.models.RNN_FCN import MGRU_FCN as MODEL
from agent.ActorCritic import ActorCritic
from utils.ReplayMemory import ReplayBuffer
import gym
from torch.distributions.categorical import Categorical
import os

df = pd.read_csv('./data/1 Dec 2019 - 1 Dec 2020.csv')
test_env = BitcoinTradingEnv(df, serial=True)

print('observation space:', test_env.observation_space.shape)
print('action space:', test_env.action_space)

model = MODEL(c_in=test_env.observation_space.shape[0],
              c_out=test_env.action_space.n,
              seq_len=test_env.observation_space.shape[1])
# acer = ACER(model=model, memory=memory, config=acer_config)

if os.path.exists('./save/model.m5'):
    model.load_state_dict(torch.load('./save/model.m5'))

avg_t = 0
avg_r = 0
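# A sketch of the evaluation loop the avg_t / avg_r counters suggest; it
# assumes MODEL returns action logits for a batched observation, which this
# snippet does not confirm.
n_eval_episodes = 10  # hypothetical episode count
model.eval()
for _ in range(n_eval_episodes):
    obs, done, t, r = test_env.reset(), False, 0, 0.0
    while not done:
        with torch.no_grad():
            logits = model(torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0))
        action = Categorical(logits=logits).sample().item()
        obs, reward, done, _ = test_env.step(action)
        t += 1
        r += reward
    avg_t += t / n_eval_episodes
    avg_r += r / n_eval_episodes
print('avg timesteps:', avg_t, '| avg reward:', avg_r)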
def objective(trial):
    # Define what to optimize in the environment
    envParams = {
        'reward_func': reward_strategy,
        'forecast_len': int(trial.suggest_loguniform('forecast_len', 1, 200)),
        'confidence_interval': trial.suggest_uniform('confidence_interval', 0.7, 0.99),
    }

    train_df, test_df = getDatasets(
        params.get('input_data_file'),
        percentageToUse=params.get('dataset_percentage'))

    trainEnv = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, **envParams)])
    testEnv = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, **envParams)])

    # Define what to optimize in the agent
    agentParams = {
        'n_steps': int(trial.suggest_loguniform('n_steps', 16, 2048)),
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
        'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
        'noptepochs': int(trial.suggest_loguniform('noptepochs', 1, 48)),
        'lam': trial.suggest_uniform('lam', 0.8, 1.)
    }

    model = PPO2(MlpLnLstmPolicy,
                 trainEnv,
                 verbose=0,
                 nminibatches=1,
                 **agentParams)

    # Run the trial: train in chunks and evaluate after each chunk
    last_reward = -np.finfo(np.float16).max
    evaluation_interval = int(len(train_df) / params.get('n_evaluations'))

    for eval_idx in range(params.get('n_evaluations')):
        model.learn(evaluation_interval)

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        obs = testEnv.reset()
        while n_episodes < params.get('n_test_episodes'):
            action, _ = model.predict(obs)
            obs, reward, done, _ = testEnv.step(action)
            reward_sum += reward

            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = testEnv.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
import pandas as pd

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import A2C

from env.BitcoinTradingEnv import BitcoinTradingEnv

df = pd.read_csv('./data/bitstamp.csv')
df = df.sort_values('Timestamp')

slice_point = int(len(df) - 50000)
train_df = df[:slice_point]
test_df = df[slice_point:]

train_env = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, serial=True)])

model = A2C(MlpPolicy, train_env, verbose=1, tensorboard_log="./tensorboard/")
model.learn(total_timesteps=200000)

test_env = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, serial=True)])

obs = test_env.reset()
for i in range(50000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)

    test_env.render(mode="system", title="BTC")

test_env.close()
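# Optional follow-up: persist the trained agent so it can be restored later
# with A2C.load(); the output path here is a placeholder.
model.save('./agents/a2c_btc.pkl')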
print("Training PPO2 agent with params:", params) print("Best trial reward:", -1 * study.best_trial.value) df = pd.read_csv(input_data_file) df = df.drop(['Symbol'], axis=1) df = df.sort_values(['Date']) df = add_indicators(df.reset_index()) test_len = int(len(df) * 0.2) train_len = int(len(df)) - test_len train_df = df[:train_len] test_df = df[train_len:] train_env = DummyVecEnv([lambda: BitcoinTradingEnv( train_df, reward_func=reward_strategy, forecast_len=int(params['forecast_len']), confidence_interval=params['confidence_interval'])]) test_env = DummyVecEnv([lambda: BitcoinTradingEnv( test_df, reward_func=reward_strategy, forecast_len=int(params['forecast_len']), confidence_interval=params['confidence_interval'])]) model_params = { 'n_steps': int(params['n_steps']), 'gamma': params['gamma'], 'learning_rate': params['learning_rate'], 'ent_coef': params['ent_coef'], 'cliprange': params['cliprange'], 'noptepochs': int(params['noptepochs']), 'lam': params['lam'], } if curr_idx == -1:
df = df.sort_values(['Date'])
df = add_indicators(df.reset_index())

test_len = int(len(df) * 0.2)
train_len = int(len(df)) - test_len

train_df = df[:train_len]
test_df = df[train_len:]

# Enable multiprocess environments
n_cpu = 32
train_env = SubprocVecEnv([
    lambda: BitcoinTradingEnv(train_df,
                              reward_func=reward_strategy,
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval'])
    for i in range(n_cpu)
])
test_env = SubprocVecEnv([
    lambda: BitcoinTradingEnv(test_df,
                              reward_func=reward_strategy,
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval'])
    for i in range(n_cpu)
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}