from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import SubprocVecEnv

import trading_vix_env


def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # number of parallel worker processes
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])
    model = A2C('MlpPolicy', env, verbose=1, n_steps=5)
    model.learn(total_timesteps=2500000000)
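# make_env is referenced above but not defined in this excerpt. Below is a
# minimal sketch of what it might look like, assuming it follows the usual
# SubprocVecEnv pattern (one thunk per worker returning an independent,
# rank-seeded copy of the environment); the project's actual helper may differ.
import copy

from stable_baselines3.common.utils import set_random_seed


def make_env(base_env, rank, seed=0):
    def _init():
        env = copy.deepcopy(base_env)  # independent env copy for this worker
        env.seed(seed + rank)          # assumes the custom env exposes gym's seed()
        return env
    set_random_seed(seed)
    return _init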
import os

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, VecMonitor

import custom_call_back
import trading_vix_env


def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # number of parallel worker processes
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])
    # Create log dir
    log_dir = './ppo_data'
    os.makedirs(log_dir, exist_ok=True)
    env = VecMonitor(env, log_dir)
    callback = custom_call_back.CustomCallback(check_freq=1000, log_dir=log_dir)
    model = PPO('MlpPolicy', env, verbose=1, n_steps=500, batch_size=10000)
    model.learn(total_timesteps=2500000000, callback=callback)
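# custom_call_back.CustomCallback is not shown in this excerpt. Below is a
# sketch of a compatible callback, assuming it mirrors Stable-Baselines3's
# SaveOnBestTrainingRewardCallback (check the (Vec)Monitor logs every
# check_freq steps and keep the best model); the actual implementation may differ.
import os

import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy


class CustomCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # Read the episode returns logged by (Vec)Monitor into log_dir
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])  # mean return over the last 100 episodes
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)  # keep the best model seen so far
        return True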
import os

import numpy as np
from stable_baselines3 import DDPG
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.noise import NormalActionNoise

import custom_call_back
import trading_vix_env


def main():
    # Create log dir
    log_dir = './ddpg_data'
    os.makedirs(log_dir, exist_ok=True)
    vix_env = trading_vix_env.trading_vix_env()
    env = Monitor(vix_env, log_dir)
    # Create action noise because TD3 and DDPG use a deterministic policy
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    # Create the callback: check every 20000 steps
    callback = custom_call_back.CustomCallback(check_freq=20000, log_dir=log_dir)
    # Create the RL model
    model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=2, batch_size=10000)
    # Train the agent
    model.learn(total_timesteps=int(5e9), callback=callback)
import os

import numpy as np
import matplotlib.pyplot as plt
import jsonpickle

import config as C
import train
import trading_vix
import trading_vix_env
import utils

if __name__ == '__main__':
    np.random.seed(1234)
    # env = trading_vix.trading_vix()
    env = trading_vix_env.trading_vix_env()
    # N: number of trajectories to sample in each training iteration
    # T: number of iterations to train the model
    # delta: trust region size (TRPO constraint)
    # env: the environment the policy learns in
    theta, episode_rewards = train.train(N=C.batch_size, T=C.training_epoch, delta=1e-2, env=env)

    # # Test the training result
    # observation = env.reset()
    # current_feature = utils.extract_features(observation, C.output_dim)
    # for t in range(200):