Example #1
# Imports assumed (Stable-Baselines3 API)
import trading_vix_env
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import SubprocVecEnv

def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment (one environment copy per worker process)
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    model = A2C('MlpPolicy', env, verbose=1, n_steps=5)
    model.learn(total_timesteps=2500000000)
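The make_env factory used in Examples #1 and #2 is not shown in these snippets. A minimal sketch of what it could look like, assuming it returns a thunk that gives each SubprocVecEnv worker its own seeded copy of the shared environment instance (the deep-copy and seeding strategy here are assumptions, not taken from the original code):

import copy

def make_env(base_env, rank, seed=0):
    # Return a thunk; SubprocVecEnv calls it inside a separate worker
    # process, so every worker ends up with an independent environment.
    def _init():
        env = copy.deepcopy(base_env)
        # Assumes the environment exposes a Gym-style seed() method
        env.seed(seed + rank)
        return env
    return _init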
Example #2
# Imports assumed (Stable-Baselines3 API)
import os
import custom_call_back
import trading_vix_env
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, VecMonitor

def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    # Create log dir and wrap the vectorized environment with a monitor
    log_dir = './ppo_data'
    os.makedirs(log_dir, exist_ok=True)
    env = VecMonitor(env, log_dir)
    callback = custom_call_back.CustomCallback(check_freq=1000, log_dir=log_dir)

    # n_steps=500 per env * 20 envs = 10000 samples per rollout, matching batch_size
    model = PPO('MlpPolicy', env, verbose=1, n_steps=500, batch_size=10000)
    model.learn(total_timesteps=2500000000, callback=callback)
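The custom_call_back.CustomCallback used in Examples #2 and #3 is project-specific and not shown here. A sketch of one common pattern for such a callback, assuming it follows the Stable-Baselines3 "save the best model on training reward" recipe (only check_freq and log_dir come from the calls above; everything else is an assumption):

import os
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy

class CustomCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # Read the episode rewards written by (Vec)Monitor into log_dir
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True  # returning False would stop training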
Example #3
# Imports assumed (Stable-Baselines3 API)
import os
import numpy as np
import custom_call_back
import trading_vix_env
from stable_baselines3 import DDPG
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.noise import NormalActionNoise

def main():
    # Create log dir
    log_dir = './ddpg_data'
    os.makedirs(log_dir, exist_ok=True)

    vix_env = trading_vix_env.trading_vix_env()
    env = Monitor(vix_env, log_dir)

    # Create action noise because TD3 and DDPG use a deterministic policy
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    # Create the callback: check every 20000 steps
    callback = custom_call_back.CustomCallback(check_freq=20000, log_dir=log_dir)
    # Create the RL model
    model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=2, batch_size=10000)
    # Train the agent
    model.learn(total_timesteps=int(5e9), callback=callback)
Example #4
import config as C
import numpy as np
import trading_vix
import trading_vix_env
import utils
import matplotlib.pyplot as plt
import jsonpickle
import train
import os

if __name__ == '__main__':

    np.random.seed(1234)
    #env = trading_vix.trading_vix()
    env = trading_vix_env.trading_vix_env()

    theta, episode_rewards = train.train(N=C.batch_size,
                                         T=C.training_epoch,
                                         delta=1e-2,
                                         env=env)
    '''
    param N: number of trajectories to sample in each training iteration
    param T: number of iterations to train the model
    param delta: trust region size (the policy is updated with TRPO)
    param env: the environment for the policy to learn in
    '''

    # #test the training result
    # observation = env.reset()
    # current_feature = utils.extract_features(observation,C.output_dim)
    # for t in range(200):