def main(env_id, train, logdir, exploration, env_dt, **kwargs):
    sess = tf.InteractiveSession()  # start a TensorFlow session
    env = gym.make(env_id)
    u_bound = (env.action_space.low, env.action_space.high)

    # Keep only the kwargs that DDPG's constructor actually accepts.
    sig = inspect.signature(DDPG)
    ddpg_kwargs = dict()
    for key in sig.parameters:
        if key in kwargs:
            ddpg_kwargs[key] = kwargs[key]
            kwargs.pop(key)

    action_dim = env.action_space.shape[0]  # dimension of the action space
    state_dim = env.observation_space.shape[0]  # dimension of the state space

    noise = U.OUNoise(action_dim, 0.0, 0.15, 0.2, 0.05, exploration, env_dt)  # Ornstein-Uhlenbeck exploration noise
    print(noise)

    agent = DDPG(sess, state_dim, action_dim, u_bound=u_bound, noise=noise, **ddpg_kwargs)  # create the DDPG agent
    play = U.Play(sess, env, agent, logdir)  # helper that runs the agent in the environment

    if train:
        play.train(kwargs['nb_episodes'], kwargs['nb_eval_episodes'])  # training loop
    play.run_env(train=False)
    env.close()
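For context, a minimal sketch of how an entry point like this might be invoked from the command line. The flag names and defaults below are hypothetical and only illustrate passing `nb_episodes` / `nb_eval_episodes` through `**kwargs`; the actual project may wire this differently.

# Hypothetical CLI wrapper for the main() above; flag names and defaults are illustrative only.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env-id', default='Pendulum-v0')
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--logdir', default='./logs')
    parser.add_argument('--exploration', type=int, default=100000)
    parser.add_argument('--env-dt', type=float, default=0.05)
    parser.add_argument('--nb-episodes', type=int, default=1000)
    parser.add_argument('--nb-eval-episodes', type=int, default=10)
    args = parser.parse_args()

    main(args.env_id, args.train, args.logdir, args.exploration, args.env_dt,
         nb_episodes=args.nb_episodes, nb_eval_episodes=args.nb_eval_episodes)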
def get_agent(self):
    agent_type = self.cfg["Agent"]["Type"]
    mode = self.cfg["Agent"]["Setup"]["mode"]
    if agent_type == "DDPG":
        return DDPG(self.action_mode, self.obs_config, self.task_class, self.cfg)
    elif agent_type == "TD3":
        return TD3(self.action_mode, self.obs_config, self.task_class, self.cfg)
    elif agent_type == "OpenAIES":
        # We use DDPG's validation methods as it is faster
        if mode == "validation_mult":
            return DDPG(self.action_mode, self.obs_config, self.task_class, self.cfg)
        else:
            return OpenAIES(self.action_mode, self.obs_config, self.task_class, self.cfg)
    else:
        raise ValueError(
            "%s is not a supported agent type. Please check your config-file." % agent_type)
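The factory above only reads two configuration keys. The dict below is a guess at that part of the schema, just to show which values drive the branching; the real config file almost certainly carries many more settings.

# Hypothetical config fragment matching what get_agent() reads.
cfg = {
    "Agent": {
        "Type": "TD3",                 # "DDPG", "TD3", or "OpenAIES"
        "Setup": {"mode": "train"},    # "validation_mult" re-routes OpenAIES to DDPG
    },
}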
def session(config, mode):
    from data.environment import Environment
    codes, start_date, end_date, features, agent_config, market, predictor, \
        framework, window_length, noise_flag, record_flag, plot_flag, \
        reload_flag, trainable, method = parse_config(config, mode)
    env = Environment(start_date, end_date, codes, features, int(window_length), market)

    global M
    M = len(codes) + 1

    if framework == 'DDPG':
        print("*-----------------Loading DDPG Agent---------------------*")
        from agents.ddpg import DDPG
        agent = DDPG(predictor, len(codes) + 1, int(window_length), len(features),
                     '-'.join(agent_config), reload_flag, trainable)
    elif framework == 'PPO':
        print("*-----------------Loading PPO Agent---------------------*")
        from agents.ppo import PPO
        agent = PPO(predictor, len(codes) + 1, int(window_length), len(features),
                    '-'.join(agent_config), reload_flag, trainable)

    stocktrader = StockTrader()

    if mode == 'train':
        print("Training for {:d} epochs".format(epochs))
        for epoch in range(epochs):
            print("Now we are at epoch", epoch)
            traversal(stocktrader, agent, env, epoch, noise_flag, framework, method, trainable)
            if record_flag == 'True':
                stocktrader.write(epoch)
            if plot_flag == 'True':
                stocktrader.plot_result()
            stocktrader.print_result(epoch, agent)
            stocktrader.reset()
    elif mode == 'test':
        traversal(stocktrader, agent, env, 1, noise_flag, framework, method, trainable)
        stocktrader.write(1)
        stocktrader.plot_result()
        stocktrader.print_result(1, agent)
def session(config, args):
    global PATH_prefix
    codes, start_date, end_date, features, agent_config, market, predictor, \
        framework, window_length, noise_flag, record_flag, plot_flag, \
        reload_flag, trainable, method = parse_config(config, args)
    env = Environment()

    global M
    M = codes + 1

    stocktrader = StockTrader()
    PATH_prefix = "result/DDPG/" + str(args['num']) + '/'

    if args['mode'] == 'train':
        if not os.path.exists(PATH_prefix):
            os.makedirs(PATH_prefix)
            train_start_date, train_end_date, test_start_date, test_end_date, codes = env.get_repo(
                start_date, end_date, codes, market)
            env.get_data(train_start_date, train_end_date, features, window_length, market, codes)
            print("Codes:", codes)
            print('Training Time Period:', train_start_date, ' ', train_end_date)
            print('Testing Time Period:', test_start_date, ' ', test_end_date)
            with open(PATH_prefix + 'config.json', 'w') as f:
                json.dump(
                    {
                        "train_start_date": train_start_date.strftime('%Y-%m-%d'),
                        "train_end_date": train_end_date.strftime('%Y-%m-%d'),
                        "test_start_date": test_start_date.strftime('%Y-%m-%d'),
                        "test_end_date": test_end_date.strftime('%Y-%m-%d'),
                        "codes": codes
                    }, f)
                print("finish writing config")
        else:
            with open("result/DDPG/" + str(args['num']) + '/config.json', 'r') as f:
                dict_data = json.load(f)
                print("successfully load config")
            train_start_date, train_end_date, codes = datetime.datetime.strptime(
                dict_data['train_start_date'], '%Y-%m-%d'), datetime.datetime.strptime(
                    dict_data['train_end_date'], '%Y-%m-%d'), dict_data['codes']
            env.get_data(train_start_date, train_end_date, features, window_length, market, codes)

        for noise_flag in ['True']:  # ['False', 'True'] to train agents with and without noise in asset prices
            print("*-----------------Loading DDPG Agent---------------------*")
            agent = DDPG(predictor, len(codes) + 1, int(window_length), len(features),
                         '-'.join(agent_config), reload_flag, trainable)
            print("Training for {:d} epochs".format(epochs))
            for epoch in range(epochs):
                print("Now we are at epoch", epoch)
                traversal(stocktrader, agent, env, epoch, noise_flag, framework, method, trainable)
                if record_flag == 'True':
                    stocktrader.write(epoch, framework)
                if plot_flag == 'True':
                    stocktrader.plot_result()
                agent.reset_buffer()
                stocktrader.print_result(epoch, agent, noise_flag)
                stocktrader.reset()
            agent.close()
            del agent

    elif args['mode'] == 'test':
        with open(PATH_prefix + 'config.json', 'r') as f:
            dict_data = json.load(f)
        test_start_date, test_end_date, codes = datetime.datetime.strptime(
            dict_data['test_start_date'], '%Y-%m-%d'), datetime.datetime.strptime(
                dict_data['test_end_date'], '%Y-%m-%d'), dict_data['codes']
        env.get_data(test_start_date, test_end_date, features, window_length, market, codes)
        backtest([
            DDPG(predictor, len(codes) + 1, int(window_length), len(features),
                 '-'.join(agent_config), "True", "False")
        ], env)
    from agents.reinforce import REINFORCE
    agent = REINFORCE(env, model, buffer, logger, args)
elif args.algo == "vpg":
    from agents.vpg import VPG
    agent = VPG(env, model, buffer, logger, args)
elif args.algo == "ppo":
    from agents.ppo import PPO
    agent = PPO(env, model, buffer, logger, args)
elif args.algo in args.q_learning:
    if isinstance(env.action_space, Box):
        # Action limit for clamping.
        # Critically: assumes all dimensions share the same bound!
        args.act_limit = env.action_space.high[0]
    if args.algo == "ddpg":
        from agents.ddpg import DDPG
        agent = DDPG(env, model, buffer, logger, args)
    elif args.algo == "td3":
        from agents.td3 import TD3
        agent = TD3(env, model, buffer, logger, args)
    elif args.algo == "sac":
        from agents.sac import SAC
        agent = SAC(env, model, buffer, logger, args)
else:
    algos = tuple(args.policy_gradient + args.q_learning)
    raise NotImplementedError(f"Expected `algo` argument to be one of "
                              f"{algos}, but got '{args.algo}'.")

"""
Train
"""
# Train the agent!
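The `act_limit` line above takes `action_space.high[0]` as the bound for every action dimension. A small, self-contained check of that assumption is sketched below; the `Box` space is made up for illustration and the assertion is not part of the original code.

import numpy as np
from gym.spaces import Box

# Illustrative only: verify the single-scalar act_limit assumption before relying on it.
space = Box(low=-2.0, high=2.0, shape=(3,), dtype=np.float32)  # hypothetical action space
assert np.allclose(space.high, space.high[0]) and np.allclose(space.low, -space.high)
act_limit = float(space.high[0])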
import rospy
import matplotlib.pyplot as plt

save = 0
current_path = os.getcwd()
t_listener = TorqueListener()
env = KomodoEnvironment()
state_shape = env.state_shape
action_shape = env.action_shape

model = 'ddpg'
if model == 'ddpg':
    agent = DDPG(state_shape, action_shape,
                 batch_size=128, gamma=0.995, tau=0.001,
                 actor_lr=0.0005, critic_lr=0.001, use_layer_norm=True)
    print('DDPG agent configured')
    agent.load_model(agent.current_path + '/model/model.ckpt')
elif model == 'a2c':
    agent = A2C(state_shape, action_shape,
                gamma=0.995, actor_lr=0.0002, critic_lr=0.001, use_layer_norm=True)
    print('A2C agent configured')
    agent.load_model(agent.current_path + '/model_a2c/model.ckpt')
def _get_action(self, o):
    # Get actions just like DDPG does.
    return DDPG._get_action(self, o)
import matplotlib.pyplot as plt

HALF_KOMODO = 0.53 / 2
np.set_printoptions(precision=1)
current_path = os.getcwd()
env = KomodoEnvironment()
state_shape = env.state_shape
action_shape = env.action_shape

model = 'a2c'
if model == 'ddpg':
    agent = DDPG(state_shape, action_shape,
                 batch_size=128, gamma=0.995, tau=0.001,
                 actor_lr=0.0001, critic_lr=0.001, use_layer_norm=True)
    print('DDPG agent configured')
elif model == 'a2c':
    agent = A2C(state_shape, action_shape,
                gamma=0.995, actor_lr=0.0001, critic_lr=0.001, use_layer_norm=True)
    print('A2C agent configured')

max_episode = 1000
tot_rewards = []
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size,
          memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging only for a single (rank-0) worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is concerned,
                    # every action is in [-1, 1]).
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        # Scale for execution in env (as far as DDPG is concerned,
                        # every action is in [-1, 1]).
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable-length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
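The rollout and evaluation loops above rely on one scaling convention: the agent always emits actions in [-1, 1], and the environment receives `max_action * action`. A small self-contained sketch of that convention, with a made-up bound purely for illustration:

import numpy as np

max_action = np.array([2.0])           # illustrative bound; a real env exposes action_space.high
raw_action = np.tanh(np.array([0.7]))  # pretend policy output, squashed into [-1, 1]

env_action = max_action * raw_action   # what env.step() actually receives
assert (np.abs(env_action) <= max_action).all()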
def _update_networks(self):
    # Update networks just like DDPG (but overloading the functions below).
    return DDPG._update_networks(self)
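The explicit `DDPG._update_networks(self)` call suggests this method lives in a subclass of DDPG; the class name is not shown, so the sketch below assumes one. It only illustrates why the pattern works: the parent's update routine calls helper methods that the subclass overrides, and dynamic dispatch picks up the overridden versions.

class TD3(DDPG):  # the subclass/parent names here are assumptions, not confirmed by the snippet
    def _update_networks(self):
        # Equivalent to super()._update_networks(); the explicit form keeps the
        # delegation target obvious while the helpers it calls are overridden here.
        return DDPG._update_networks(self)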
def __init__(self, state_size, action_size, seed, num_agents, memory,
             ActorNetwork, CriticNetwork, device,
             BOOTSTRAP_SIZE=5,
             GAMMA=0.99,
             TAU=1e-3,
             LR_CRITIC=5e-4,
             LR_ACTOR=5e-4,
             UPDATE_EVERY=1,
             TRANSFER_EVERY=2,
             UPDATE_LOOP=10,
             ADD_NOISE_EVERY=5,
             WEIGHT_DECAY=0,
             FILE_NAME="multi_ddpg"):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        num_agents: number of running agents
        memory: instance of ReplayBuffer
        ActorNetwork: class inheriting from torch.nn.Module that defines the structure of the actor network
        CriticNetwork: class inheriting from torch.nn.Module that defines the structure of the critic network
        device: cpu or cuda:0 if available
        BOOTSTRAP_SIZE: length of the bootstrap (n-step return)
        GAMMA: discount factor
        TAU: for soft update of target parameters
        LR_CRITIC: learning rate of the critics
        LR_ACTOR: learning rate of the actors
        UPDATE_EVERY: how often to update the networks
        TRANSFER_EVERY: after how many updates to copy the online networks into the fixed target networks
        UPDATE_LOOP: number of learning passes whenever the agent learns
        ADD_NOISE_EVERY: how often to add noise to favor exploration
        WEIGHT_DECAY: weight-decay parameter of the critic's Adam optimizer
        FILE_NAME: default prefix for the saved model
    """
    # Instantiate one DDPG agent (with its own networks) per running agent.
    self.agents = [
        DDPG(state_size, action_size, seed, memory, ActorNetwork, CriticNetwork,
             device, BOOTSTRAP_SIZE, GAMMA, TAU, LR_CRITIC, LR_ACTOR,
             UPDATE_EVERY, TRANSFER_EVERY, UPDATE_LOOP, ADD_NOISE_EVERY,
             WEIGHT_DECAY, FILE_NAME=FILE_NAME + "_" + str(i))
        for i in range(num_agents)
    ]
    self.rewards = [deque(maxlen=BOOTSTRAP_SIZE) for i in range(num_agents)]
    self.states = [deque(maxlen=BOOTSTRAP_SIZE) for i in range(num_agents)]
    self.gammas = np.array([GAMMA ** i for i in range(BOOTSTRAP_SIZE)])
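The learning code that consumes `self.gammas` and the per-agent reward deques is not shown. A minimal sketch of how a `BOOTSTRAP_SIZE`-length reward buffer and the precomputed discount vector are typically combined into an n-step return (the reward values are made up for illustration):

import numpy as np
from collections import deque

BOOTSTRAP_SIZE = 5
GAMMA = 0.99
gammas = np.array([GAMMA ** i for i in range(BOOTSTRAP_SIZE)])

rewards = deque(maxlen=BOOTSTRAP_SIZE)
for r in [1.0, 0.0, 0.5, 1.0, 0.2]:   # illustrative rewards only
    rewards.append(r)

# n-step discounted return over the buffered rewards; a bootstrap estimate of the
# state reached after BOOTSTRAP_SIZE steps would normally be added on top.
n_step_return = float(np.dot(gammas, np.array(rewards)))
print(n_step_return)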
def session(config, args):
    global PATH_prefix
    from data.environment import Environment
    codes, start_date, end_date, features, agent_config, market, predictor, \
        framework, window_length, noise_flag, record_flag, plot_flag, \
        reload_flag, trainable, method = parse_config(config, args)
    env = Environment()

    global M
    if market == 'China':
        M = codes + 1
    else:
        M = len(codes) + 1
    # M is the number of assets (stocks plus cash); it also sizes the action noise.

    from agents.ornstein_uhlenbeck import OrnsteinUhlenbeckActionNoise

    stocktrader = StockTrader()
    PATH_prefix = "./result_new/PG/" + str(args['num']) + '/'

    if args['mode'] == 'train':
        if not os.path.exists(PATH_prefix):
            print('Create new path at', PATH_prefix)
            os.makedirs(PATH_prefix)
            if market == "China":
                train_start_date, train_end_date, test_start_date, test_end_date, codes = env.get_repo(
                    start_date, end_date, codes, market)
            else:
                train_start_date, train_end_date, test_start_date, test_end_date, codes = env.get_repo(
                    start_date, end_date, len(codes), market)
            env.get_data(train_start_date, train_end_date, features, window_length, market, codes)
            print("Codes:", codes)
            print('Training Time Period:', train_start_date, ' ', train_end_date)
            print('Testing Time Period:', test_start_date, ' ', test_end_date)
            with open(PATH_prefix + 'config.json', 'w') as f:
                json.dump(
                    {
                        "train_start_date": train_start_date.strftime('%Y-%m-%d'),
                        "train_end_date": train_end_date.strftime('%Y-%m-%d'),
                        "test_start_date": test_start_date.strftime('%Y-%m-%d'),
                        "test_end_date": test_end_date.strftime('%Y-%m-%d'),
                        "codes": codes
                    }, f)
                print("finish writing config")
        else:
            with open("./result_new/PG/" + str(args['num']) + '/config.json', 'r') as f:
                dict_data = json.load(f)
                print("successfully load config")
            if market == "China":
                train_start_date, train_end_date, test_start_date, test_end_date, codes = env.get_repo(
                    start_date, end_date, codes, market)
            else:
                train_start_date, train_end_date, test_start_date, test_end_date, codes = env.get_repo(
                    start_date, end_date, len(codes), market)
            env.get_data(train_start_date, train_end_date, features, window_length, market, codes)

        for noise_flag in ['True']:  # ['False', 'True'] to train agents with and without noise in asset prices
            if framework == 'PG':
                print("*-----------------Loading PG Agent---------------------*")
                agent = PG(len(codes) + 1, int(window_length), len(features),
                           '-'.join(agent_config), reload_flag, trainable,
                           noise_flag, args['num'])
                print("Finish import {}".format(agent.name))
            elif framework == 'DDPG':
                print("*-----------------Loading DDPG Agent---------------------*")
                from agents.ddpg import DDPG
                agent = DDPG(predictor, len(codes) + 1, int(window_length),
                             len(features), '-'.join(agent_config), reload_flag,
                             trainable)
                print("Finish import {}".format(agent.name))

            print("Training for {:d} epochs".format(epochs))
            for epoch in range(epochs):
                print("Now we are at epoch", epoch)
                traversal(stocktrader, agent, env, epoch, noise_flag, framework, method, trainable)
                if record_flag == 'True':
                    stocktrader.write(epoch, framework)
                if plot_flag == 'True':
                    stocktrader.plot_result()
                agent.reset_buffer()
                stocktrader.print_result(epoch, agent, noise_flag)
                stocktrader.reset()
            agent.close()
            del agent

    # Testing
    elif args['mode'] == 'test':
        with open("./result_new/PG/" + str(args['num']) + '/config.json', 'r') as f:
            dict_data = json.load(f)
        test_start_date, test_end_date, codes = datetime.datetime.strptime(
            dict_data['test_start_date'], '%Y-%m-%d'), datetime.datetime.strptime(
                dict_data['test_end_date'], '%Y-%m-%d'), dict_data['codes']
        env.get_data(test_start_date, test_end_date, features, window_length, market, codes)
        backtest([
            PG(len(codes) + 1, int(window_length), len(features),
               '-'.join(agent_config), 'True', 'False', 'True', args['num'])
        ], env, market)
def session(config, args):
    codes, start_date, end_date, features, agent_config, market, predictor, \
        framework, window_length, noise_flag, record_flag, plot_flag, \
        reload_flag, trainable, method, epochs = parse_config(config, args)
    env = Environment(args.seed)
    stocktrader = StockTrader()
    path = "result/{}/{}/".format(framework, args.num)

    logger.info('Mode: {}'.format(args.mode))
    if args.mode == 'train':
        if not os.path.exists(path):
            os.makedirs(path)
            train_start_date, train_end_date, test_start_date, test_end_date, codes = env.get_repo(
                start_date, end_date, codes, market)
            logger.debug("Training with codes: {}".format(codes))
            env.get_data(train_start_date, train_end_date, features, window_length, market, codes)
            with open(path + 'config.json', 'w') as f:
                print(train_start_date)
                print(train_end_date)
                print(test_start_date)
                print(test_end_date)
                json.dump(
                    {
                        "train_start_date": train_start_date.strftime('%Y-%m-%d'),
                        "train_end_date": train_end_date.strftime('%Y-%m-%d'),
                        "test_start_date": test_start_date.strftime('%Y-%m-%d'),
                        "test_end_date": test_end_date.strftime('%Y-%m-%d'),
                        "codes": codes
                    }, f)
        else:
            with open('result/{}/{}/config.json'.format(framework, args.num), 'r') as f:
                dict_data = json.load(f)
            train_start_date, train_end_date, codes = datetime.strptime(
                dict_data['train_start_date'], '%Y-%m-%d'), datetime.strptime(
                    dict_data['train_end_date'], '%Y-%m-%d'), dict_data['codes']
            env.get_data(train_start_date, train_end_date, features, window_length, market, codes)

        if framework == 'PG':
            logger.debug("Loading PG Agent")
            agent = PG(len(codes) + 1, int(window_length), len(features),
                       '-'.join(agent_config), reload_flag, trainable, args.num)
        elif framework == 'DDPG':
            logger.debug("Loading DDPG Agent")
            agent = DDPG(len(codes) + 1, int(window_length), len(features),
                         '-'.join(agent_config), reload_flag, trainable, args.num)

        logger.info("Training: %d epochs", epochs)
        for epoch in range(epochs):
            traversal(stocktrader, agent, env, epoch, True, framework, method, trainable)
            if record_flag:
                stocktrader.write(epoch, framework)
            if plot_flag:
                stocktrader.plot_result()
            agent.reset_buffer()
            stocktrader.print_result(epoch, agent, True)
            stocktrader.reset()
        agent.close()

    elif args.mode == 'test':
        with open("result/{}/{}/config.json".format(framework, args.num), 'r') as f:
            dict_data = json.load(f)
        test_start_date, test_end_date, codes = datetime.strptime(
            dict_data['test_start_date'], '%Y-%m-%d'), datetime.strptime(
                dict_data['test_end_date'], '%Y-%m-%d'), dict_data['codes']
        env.get_data(test_start_date, test_end_date, features, window_length, market, codes)

        if framework == 'PG':
            logger.info("Loading PG Agent")
            agent = PG(len(codes) + 1, int(window_length), len(features),
                       '-'.join(agent_config), True, False, args.num)
        elif framework == 'DDPG':
            logger.info("Loading DDPG Agent")
            agent = DDPG(len(codes) + 1, int(window_length), len(features),
                         '-'.join(agent_config), True, False, args.num)

        backtest([agent], env, "result/{}/{}/".format(framework, args.num), framework)
        processor=processor)
else:
    assert not opt.recurrent

    # Setup random process for exploration
    random_process = [
        GaussianWhiteNoiseProcess(sigma=0.0, mu=1.0),
        GaussianWhiteNoiseProcess(sigma=1.0, mu=0.0)
    ]

    # Setup DDPG agent model
    actor, critic, action_input = DDPG_Model(
        window_length=opt.ddpg_window_length,
        num_actions=env.available_actions)

    # Setup DDPG agent
    agent = DDPG(actor=actor,
                 critic=critic,
                 critic_action_input=action_input,
                 num_actions=env.available_actions,
                 processor=processor,
                 random_process=random_process)

print(mission_name + ' initialized.')

# Setup weights path
path = os.path.join('weights', 'Malmo', '{}'.format(mission_name))
if not os.path.exists(path):
    os.makedirs(path)
weights_path = os.path.join(path, '{}.hdf5'.format(name))

# Run the agent
agent.fit(env=env,
          num_steps=args.steps,
          weights_path=weights_path,
else:
    writer = None

env = gym.make(args.env_name)
action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]
state_rms = RunningMeanStd(state_dim)

if args.algo == 'ppo':
    agent = PPO(writer, device, state_dim, action_dim, agent_args)
elif args.algo == 'sac':
    agent = SAC(writer, device, state_dim, action_dim, agent_args)
elif args.algo == 'ddpg':
    from utils.noise import OUNoise
    noise = OUNoise(action_dim, 0)
    agent = DDPG(writer, device, state_dim, action_dim, agent_args, noise)

if (torch.cuda.is_available()) and (args.use_cuda):
    agent = agent.cuda()

if args.load != 'no':
    agent.load_state_dict(torch.load("./model_weights/" + args.load))

score_lst = []
state_lst = []

if agent_args.on_policy == True:
    score = 0.0
    state_ = env.reset()
    state = np.clip((state_ - state_rms.mean) / (state_rms.var ** 0.5 + 1e-8), -5, 5)
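The project's `utils.noise.OUNoise` is constructed above with only `(action_dim, mu)`; its internals are not shown. The sketch below is a common textbook Ornstein-Uhlenbeck formulation, assuming typical `theta`/`sigma`/`dt` parameters, and is not the project's actual API.

import numpy as np

class OUNoiseSketch:
    """Textbook Ornstein-Uhlenbeck process; everything beyond (action_dim, mu)
    is an assumption, not the actual utils.noise.OUNoise interface."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = mu * np.ones(action_dim)
        self.theta, self.sigma, self.dt = theta, sigma, dt
        self.x = np.copy(self.mu)

    def sample(self):
        # Mean-reverting drift toward mu plus Gaussian diffusion.
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(len(self.x))
        self.x = self.x + dx
        return self.x

noise = OUNoiseSketch(action_dim=2)
exploratory_action = np.clip(np.zeros(2) + noise.sample(), -1.0, 1.0)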