def setUp(self):
    self.order_book_id_number = 100
    self.feature_number = 10
    self.toy_data = create_toy_data(order_book_ids_number=self.order_book_id_number,
                                    feature_number=self.feature_number,
                                    start="2019-05-01", end="2019-12-12", frequency="D")
    self.env_2d = PortfolioTradingGym(data_df=self.toy_data, sequence_window=1,
                                      add_cash=False, mode="numpy")
    self.sequence_window = 3
    self.env_3d = PortfolioTradingGym(data_df=self.toy_data, sequence_window=self.sequence_window,
                                      add_cash=False, mode="numpy")
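# A minimal shape check that could follow setUp -- a sketch only: the test name and
# the assumed observation shapes ((number_asset, feature_number) for the
# sequence_window=1 env, (number_asset, sequence_window, feature_number) for the
# windowed env, matching the unpacking used in the RL examples below) are
# illustrative assumptions, not taken from the original suite.
def test_reset_observation_shape(self):
    obs_2d = self.env_2d.reset()
    obs_3d = self.env_3d.reset()
    self.assertEqual(obs_2d.shape, (self.order_book_id_number, self.feature_number))
    self.assertEqual(obs_3d.shape, (self.order_book_id_number, self.sequence_window,
                                    self.feature_number))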
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from trading_gym.utils.data.toy import create_toy_data
from trading_gym.envs.portfolio_gym.portfolio_gym import PortfolioTradingGym

order_book_id_number = 100
toy_data = create_toy_data(order_book_ids_number=order_book_id_number, feature_number=10,
                           start="2019-05-01", end="2019-12-12", frequency="D")
env = PortfolioTradingGym(data_df=toy_data, sequence_window=1, add_cash=False)

state = env.reset()
while True:
    next_state, reward, done, info = env.step(action=None)
    label = info["one_step_fwd_returns"]
    print(state)
    print(label)

    # fit a cross-sectional regression: current features -> one-step forward returns
    regressor = LinearRegression()
    regressor.fit(state.values, label.values)

    # display and store
    print(regressor.coef_)

    if done:
        break
    state = next_state
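# Pooled variant (sketch, not part of the original script): instead of refitting on
# every cross-section, stack all (state, label) pairs over one episode and fit a
# single regression at the end. It reuses only the conventions shown above
# (env.step(action=None), info["one_step_fwd_returns"], DataFrame .values).
states, labels = [], []
state = env.reset()
while True:
    next_state, reward, done, info = env.step(action=None)
    states.append(state.values)
    labels.append(info["one_step_fwd_returns"].values)
    if done:
        break
    state = next_state

pooled_regressor = LinearRegression()
pooled_regressor.fit(np.concatenate(states), np.concatenate(labels))
print(pooled_regressor.coef_)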
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number, feature_number=20,
                               start="2019-05-01", end="2019-12-12", frequency="D")
    env = PortfolioTradingGym(data_df=toy_data, sequence_window=5, add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create networks
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    actor = Actor(input_size=input_size, hidden_size=50, action_size=action_size)
    critic = Critic(input_size=input_size, hidden_size=50, action_size=action_size)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE_ACTOR)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE_CRITIC)
    replay = ch.ExperienceReplay()
    ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(action_size))

    def get_action(state):
        # deterministic policy plus exploration noise
        action = actor(state)
        action = action + ou_noise()[0]
        return action

    def get_random_action(state):
        # random portfolio weights for the warm-up phase
        action = torch.softmax(torch.randn(action_size), dim=0)
        return action

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
        replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(-1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            rewards = ch.normalize(batch.reward())
            # rewards = batch.reward() / 100.0  # scaling the reward instead changes convergence a lot
            value_loss = ch.algorithms.ddpg.state_value_loss(values,
                                                             next_values.detach(),
                                                             rewards,
                                                             batch.done(),
                                                             DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)
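# The helpers create_target_network and OrnsteinUhlenbeckNoise are referenced above
# but defined elsewhere in the example. A minimal sketch of what they could look
# like is below; the theta/sigma/dt defaults are illustrative DDPG-style choices,
# not values taken from the original code.
import copy

def create_target_network(network):
    # frozen copy of a network, updated only through polyak averaging
    target = copy.deepcopy(network)
    for param in target.parameters():
        param.requires_grad = False
    return target

class OrnsteinUhlenbeckNoise:
    # temporally correlated exploration noise:
    # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.zeros_like(mu)

    def __call__(self):
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x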
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np

from trading_gym.utils.data.toy import create_toy_data
from trading_gym.envs import PortfolioTradingGym
from trading_gym.envs.portfolio_gym.costs import TCostModel

np.random.seed(64)

commitment_fee = TCostModel(half_spread=0.01)

mock_data = create_toy_data(order_book_ids_number=2, feature_number=3,
                            start="2019-01-01", end="2019-01-6")
'''
0001.XSHE  2019-01-01    0.0219
           2019-01-02   -0.0103
           2019-01-03    0.0175
           2019-01-04   -0.0017
           2019-01-05   -0.0039
           2019-01-06    0.0059
           2019-01-07   -0.0049
           2019-01-08   -0.0003
           2019-01-09   -0.0136
           2019-01-10    0.0068
           2019-01-11    0.0077
0002.XSHE  2019-01-01    0.0136
           2019-01-02   -0.0022
           2019-01-03   -0.0012
           2019-01-04   -0.0186
           2019-01-05    0.0098
           2019-01-06   -0.0030
           2019-01-07    0.0065
'''
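# Back-of-the-envelope sketch of what a half-spread of 0.01 would charge on a
# rebalance, assuming the common linear convention cost = half_spread * turnover.
# This is an illustrative assumption about the cost convention, not necessarily
# TCostModel's exact formula.
w_current = np.array([0.5, 0.5])
w_target = np.array([0.8, 0.2])
turnover = np.abs(w_target - w_current).sum()   # 0.6 of portfolio value traded
print(0.01 * turnover)                          # 0.006 of portfolio value paid in costs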
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number, feature_number=20,
                               start="2019-05-01", end="2019-12-12", frequency="D")
    env = PortfolioTradingGym(data_df=toy_data, sequence_window=5, add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    agent = ActorCritic(input_size=input_size, hidden_size=HIDDEN_SIZE, action_size=action_size)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            # aliases for the first PPO epoch, recomputed on later epochs
            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value']
                    new_log_probs = masses.log_prob(replay.action()).unsqueeze(-1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
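# The ActorCritic agent is defined elsewhere in the example. Below is a minimal
# sketch of an agent compatible with the call sites above (agent.actor / agent.critic
# parameter groups, and a forward returning (action, info) with 'mass', 'log_prob'
# and 'value' entries). The Dirichlet policy over portfolio weights and the
# mean-pooling of the (asset, window, feature) state are illustrative assumptions,
# not the original architecture.
import torch
import torch.nn as nn
from torch.distributions import Dirichlet

class ActorCritic(nn.Module):
    def __init__(self, input_size, hidden_size, action_size):
        super().__init__()
        # actor outputs Dirichlet concentrations, critic outputs a state value
        self.actor = nn.Sequential(nn.Linear(input_size, hidden_size), nn.Tanh(),
                                   nn.Linear(hidden_size, action_size), nn.Softplus())
        self.critic = nn.Sequential(nn.Linear(input_size, hidden_size), nn.Tanh(),
                                    nn.Linear(hidden_size, 1))

    def forward(self, state):
        # pool over the asset and window axes -> one feature vector per sample
        pooled = state.mean(dim=(-3, -2))
        concentration = self.actor(pooled) + 1e-3
        mass = Dirichlet(concentration)
        action = mass.sample()                     # portfolio weights summing to 1
        info = {'mass': mass,
                'log_prob': mass.log_prob(action).unsqueeze(-1),
                'value': self.critic(pooled)}
        return action, info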
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np

from trading_gym.utils.data.toy import create_toy_data
from trading_gym.envs.portfolio_gym.portfolio_gym import PortfolioTradingGym

# ============================================= #
# todo:                                         #
# ============================================= #

order_book_id_number = 1
toy_data = create_toy_data(order_book_ids_number=order_book_id_number, feature_number=3,
                           start="2019-05-01", end="2019-7-12", frequency="D", random_seed=123)
env = PortfolioTradingGym(data_df=toy_data, sequence_window=1, add_cash=False)

observation = env.reset()
print(observation)

action = np.array([1.])
total_steps = list(range(2))
for step in total_steps:
    next_state, reward, done, info = env.step(action)
    print(next_state, reward)