def process_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = common.calc_loss(batch, net, tgt_net.target_model,
                              gamma=GAMMA**REWARD_STEPS, device=device)
    loss_v.backward()
    optimizer.step()
    eps_tracker.frame(engine.state.iteration)
    if getattr(engine.state, "eval_states", None) is None:
        eval_states = buffer.sample(STATES_TO_EVALUATE)
        eval_states = [
            np.array(transition.state, copy=False)
            for transition in eval_states
        ]
        engine.state.eval_states = np.array(eval_states, copy=False)
    return {
        "loss": loss_v.item(),
        "epsilon": selector.epsilon,
    }
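# For context: a minimal sketch of how a process_batch function like the one
# above is typically wired into an ignite Engine with ptan's helpers. The
# batch_generator helper here is an assumption modeled on this codebase's
# style; BATCH_SIZE and REPLAY_INITIAL are assumed constants.
from ignite.engine import Engine
import ptan.ignite as ptan_ignite

def batch_generator(buffer, initial, batch_size):
    # Fill the replay buffer before training starts, then yield batches forever
    buffer.populate(initial)
    while True:
        buffer.populate(1)
        yield buffer.sample(batch_size)

engine = Engine(process_batch)
# Fires ignite events at episode boundaries so reward handlers can run
ptan_ignite.EndOfEpisodeHandler(exp_source).attach(engine)
engine.run(batch_generator(buffer, REPLAY_INITIAL, BATCH_SIZE))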
eval_states = buffer.sample(STATES_TO_EVALUATE)
eval_states = [np.array(transition.state, copy=False)
               for transition in eval_states]
eval_states = np.array(eval_states, copy=False)

if step_idx % EVAL_EVERY_STEP == 0:
    mean_val = common.calc_values_of_states(eval_states, net, device=device)
    writer.add_scalar("values_mean", mean_val, step_idx)
    if best_mean_val is None or best_mean_val < mean_val:
        if best_mean_val is not None:
            print("%d: Best mean value updated %.3f -> %.3f" % (
                step_idx, best_mean_val, mean_val))
        best_mean_val = mean_val
        torch.save(net.state_dict(),
                   os.path.join(saves_path, "mean_val-%.3f.data" % mean_val))

optimizer.zero_grad()
batch = buffer.sample(BATCH_SIZE)
loss_v = common.calc_loss(batch, net, tgt_net.target_model,
                          GAMMA ** REWARD_STEPS, device=device)
loss_v.backward()
optimizer.step()

if step_idx % TARGET_NET_SYNC == 0:
    tgt_net.sync()

if step_idx % CHECKPOINT_EVERY_STEP == 0:
    idx = step_idx // CHECKPOINT_EVERY_STEP
    # Zero-pad the index (%03d) so filenames contain no spaces and sort correctly
    torch.save(net.state_dict(),
               os.path.join(saves_path, "checkpoint-%03d.data" % idx))

if step_idx % VALIDATION_EVERY_STEP == 0:
    res = validation.validation_run(env_tst, net, device=device)
    for key, val in res.items():
        writer.add_scalar(key + "_test", val, step_idx)
    res = validation.validation_run(env_val, net, device=device)
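# A hedged sketch of what a helper like common.calc_values_of_states computes:
# the mean of the best Q-values the current network assigns to the held-out
# states. The batch splitting is an assumption to keep memory bounded.
import numpy as np
import torch

def calc_values_of_states_sketch(states, net, device="cpu"):
    mean_vals = []
    with torch.no_grad():
        for batch in np.array_split(states, 64):
            states_v = torch.tensor(batch).to(device)
            action_values_v = net(states_v)              # Q-values, shape (B, n_actions)
            best_action_values_v = action_values_v.max(1)[0]
            mean_vals.append(best_action_values_v.mean().item())
    return float(np.mean(mean_vals))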
new_rewards = exp_source.pop_rewards_steps()
if new_rewards:
    reward_tracker.reward(new_rewards[0], step_idx, selector.epsilon)

if len(buffer) < REPLAY_INITIAL:
    continue

optimizer.zero_grad()
batch = buffer.sample(BATCH_SIZE)
# Init the hidden state in both the network and the target network
net_processor.train_mode(batch_size=BATCH_SIZE)
loss_v = common.calc_loss(batch, net, tgt_net.target_model,
                          GAMMA**REWARD_STEPS, train_on_gpu=TRAIN_ON_GPU)
loss_v.backward()
optimizer.step()
loss_value = loss_v.item()
loss_tracker.loss(loss_value, step_idx)

if step_idx % TARGET_NET_SYNC == 0:
    tgt_net.sync()

if step_idx % CHECKPOINT_EVERY_STEP == 0:
    # idx = step_idx // CHECKPOINT_EVERY_STEP
    checkpoint = {"state_dict": net.state_dict()}
    with open(
            os.path.join(NET_SAVE_PATH,
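# A minimal sketch of an n-step DQN loss like the calc_loss used above,
# assuming batch holds ptan ExperienceFirstLast transitions; the actual
# helper in common may differ (e.g. the train_on_gpu flag in this variant).
import numpy as np
import torch
import torch.nn as nn

def calc_loss_sketch(batch, net, tgt_net, gamma, device="cpu"):
    states = torch.tensor(np.array([e.state for e in batch])).to(device)
    actions = torch.tensor([e.action for e in batch]).to(device)
    rewards = torch.tensor([e.reward for e in batch]).to(device)
    dones = torch.BoolTensor([e.last_state is None for e in batch]).to(device)
    next_states = torch.tensor(np.array([
        e.state if e.last_state is None else e.last_state for e in batch
    ])).to(device)

    # Q(s, a) for the actions actually taken
    state_action_vals = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        # Bootstrap from the target network; terminal states contribute 0
        next_state_vals = tgt_net(next_states).max(1)[0]
        next_state_vals[dones] = 0.0
        # gamma arrives already raised to REWARD_STEPS for n-step transitions
        expected_vals = rewards + gamma * next_state_vals
    return nn.MSELoss()(state_action_vals, expected_vals)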
def train_agent(
    run_name,
    data_paths=conf.default_data_paths,
    validation_paths=conf.default_validation_paths,
    model=models.DQNConv1D,
    large=False,
    load_checkpoint=None,
    saves_path=None,
    eps_steps=None,
):
    """
    Main function for training the agents

    :param run_name: a string of choice that dictates where to save
    :param data_paths: dict specifying what data to train with
    :param validation_paths: dict specifying what data to validate with
    :param model: what model to use
    :param large: whether or not to use the large feature set
    :param load_checkpoint: an optional path to a checkpoint to load from
    :param saves_path: where to save checkpoints (defaults to "saves/<run_name>")
    :param eps_steps: overrides conf.EPSILON_STEPS when given
    """
    print("=" * 80)
    print("Training starting".rjust(40 + 17 // 2))
    print("=" * 80)

    # Get training data
    stock_data = data.get_data_as_dict(data_paths, large=large)
    val_data = data.get_data_as_dict(validation_paths, large=large)

    # Setup before training can begin
    step_idx = 0
    eval_states = None
    best_mean_val = None
    EPSILON_STEPS = eps_steps if eps_steps is not None else conf.EPSILON_STEPS

    # Use GPU if available, else fall back on CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[Info] Using device: {device}")

    # Set up the path to save the checkpoints to
    if saves_path is None:
        saves_path = os.path.join("saves", run_name)
    else:
        saves_path = os.path.join(saves_path, run_name)
    print(f"[Info] Saving to path: {saves_path}")
    os.makedirs(saves_path, exist_ok=True)

    # Create the gym environment that the agent will interact with during training
    env = environ.StocksEnv(
        stock_data,
        bars_count=conf.BARS_COUNT,
        reset_on_close=conf.RESET_ON_CLOSE,
        random_ofs_on_reset=conf.RANDOM_OFS_ON_RESET,
        reward_on_close=conf.REWARD_ON_CLOSE,
        large=large,
    )
    env = wrappers.TimeLimit(env, max_episode_steps=1000)

    # Create the gym environment that the agent will interact with when validating
    env_val = environ.StocksEnv(
        val_data,
        bars_count=conf.BARS_COUNT,
        reset_on_close=conf.RESET_ON_CLOSE,
        random_ofs_on_reset=conf.RANDOM_OFS_ON_RESET,
        reward_on_close=conf.REWARD_ON_CLOSE,
        large=large,
    )

    # Create the model
    net = model(env.observation_space.shape, env.action_space.n).to(device)
    print("Using network:".rjust(40 + 14 // 2))
    print("=" * 80)
    print(net)

    # Initialize the agent and the epsilon-greedy action selector from the
    # ptan package, which provides helpers and wrappers that ease
    # reinforcement learning
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(conf.EPSILON_START)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, conf.GAMMA, steps_count=conf.REWARD_STEPS)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, conf.REPLAY_SIZE)
    optimizer = optim.Adam(net.parameters(), lr=conf.LEARNING_RATE)

    # If a checkpoint is supplied to the function -> resume the training from there
    if load_checkpoint is not None:
        state = torch.load(load_checkpoint)
        net.load_state_dict(state["model_state_dict"])
        optimizer.load_state_dict(state["optimizer_state_dict"])
        step_idx = state["step_idx"]
        best_mean_val = state["best_mean_val"]
        print(f"State loaded -> step index: {step_idx}, best mean val: {best_mean_val}")

    net.train()

    # Create a reward tracker, i.e. an object that keeps track of the
    # rewards the agent gets during training
    reward_tracker = common.RewardTracker(np.inf, group_rewards=100)

    # The main training loop
    print("Training loop starting".rjust(40 + 22 // 2))
    print("=" * 80)
    while True:
        step_idx += 1
        buffer.populate(1)

        # Get current epsilon for epsilon-greedy action selection
        selector.epsilon = max(conf.EPSILON_STOP,
                               conf.EPSILON_START - step_idx / EPSILON_STEPS)

        # Take a step and get rewards
        new_rewards = exp_source.pop_rewards_steps()
        if new_rewards:
            reward_tracker.reward(new_rewards[0], step_idx, selector.epsilon)

        # As long as not enough data is in the buffer, go back to the top
        if len(buffer) < conf.REPLAY_INITIAL:
            continue

        if eval_states is None:
            print("Initial buffer populated, start training")
            eval_states = buffer.sample(conf.STATES_TO_EVALUATE)
            eval_states = [
                np.array(transition.state, copy=False)
                for transition in eval_states
            ]
            eval_states = np.array(eval_states, copy=False)

        # Evaluate the model every EVAL_EVERY_STEP steps and update the
        # currently best performance if a better value was obtained
        if step_idx % conf.EVAL_EVERY_STEP == 0:
            mean_val = common.calc_values_of_states(eval_states, net, device=device)
            # If new best value -> save the model, both with metadata for
            # resuming training and as the full object for use in testing
            if best_mean_val is None or best_mean_val < mean_val:
                if best_mean_val is not None:
                    print(f"{step_idx}: Best mean value updated "
                          f"{best_mean_val:.3f} -> {mean_val:.3f}")
                best_mean_val = mean_val
                # Save checkpoint with metadata
                torch.save(
                    {
                        "model_state_dict": net.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "step_idx": step_idx,
                        "best_mean_val": best_mean_val,
                    },
                    os.path.join(saves_path, f"mean_val-{mean_val:.3f}.data"),
                )
                # Save full object for testing
                torch.save(
                    net,
                    os.path.join(saves_path, f"mean_val-{mean_val:.3f}-fullmodel.data"),
                )

        # Reset the optimizer's gradients before the optimization step
        optimizer.zero_grad()
        batch = buffer.sample(conf.BATCH_SIZE)
        # Calculate the loss
        loss_v = common.calc_loss(
            batch,
            net,
            tgt_net.target_model,
            conf.GAMMA**conf.REWARD_STEPS,
            device=device,
        )
        # Calculate the gradients
        loss_v.backward()
        # Do one step of gradient descent
        optimizer.step()

        # Sync up the two networks we're using; keeping a frozen target
        # network in this manner should improve the agent's ability to converge
        if step_idx % conf.TARGET_NET_SYNC == 0:
            tgt_net.sync()

        # Every CHECKPOINT_EVERY_STEP steps, save the model in case something
        # happens, so that training can be resumed
        if step_idx % conf.CHECKPOINT_EVERY_STEP == 0:
            idx = step_idx // conf.CHECKPOINT_EVERY_STEP
            torch.save(
                {
                    "model_state_dict": net.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "step_idx": step_idx,
                    "best_mean_val": best_mean_val,
                },
                os.path.join(saves_path, f"checkpoint-{idx}.data"),
            )
            torch.save(net, os.path.join(saves_path, f"fullmodel-{idx}.data"))

    print("Training done")
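# Illustrative call of train_agent; the run name and epsilon-decay override
# are example values, not from the source.
train_agent(
    "dqn-conv1d-demo",
    model=models.DQNConv1D,
    large=False,
    eps_steps=1_000_000,
)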
def train_model(cuda, phase, premodel, pdays):
    """
    cuda    : True / False
    phase   : 1~3
    premodel: data/phase1_model.data
    pdays   : integer
    """
    device = torch.device("cuda" if cuda else "cpu")

    phase = int(phase)
    if phase == 1:
        config = sconfig
    elif phase == 2:
        config = mconfig
    elif phase == 3:
        config = pconfig

    run_name = "v" + config.version + "-phase" + str(phase)
    saves_path = os.path.join("saves", run_name)
    os.makedirs(saves_path, exist_ok=True)
    save_name = ""
    # Defined up front so the final save_model call works in every phase
    predict_days = None
    writer = SummaryWriter(comment=run_name)

    prices_list, val_prices_list = data.load_prices(config.choices)

    if phase == 1:
        s_env = environ.StocksEnvS(prices_list)
        stock_env = s_env
        val_stock_env = environ.StocksEnvS(val_prices_list)
        save_name = "{}.data".format(run_name)
    elif phase == 2:
        # Load the network graph from phase 1
        s_env = environ.StocksEnvS(prices_list)
        prenet = models.SimpleFFDQN(s_env.observation_space.shape[0],
                                    s_env.action_space.n)  # .to(device)
        models.load_model(premodel, prenet)
        # Create the phase 2 environment
        stock_env = environ.StocksEnvM(prices_list, prenet)
        val_stock_env = environ.StocksEnvM(val_prices_list, prenet)
        save_name = "{}.data".format(run_name)
    elif phase == 3:
        predict_days = int(pdays)
        stock_env = pdenviron.PredEnv(prices_list=prices_list,
                                      predict_days=predict_days)
        val_stock_env = pdenviron.PredEnv(prices_list=val_prices_list,
                                          predict_days=predict_days)
        save_name = "{}-{}.data".format(run_name, predict_days)

    net = models.SimpleFFDQN(stock_env.observation_space.shape[0],
                             stock_env.action_space.n).to(device)
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(config.epsilon_start)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        stock_env, agent, config.gamma, steps_count=config.reward_steps)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, config.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=config.learning_rate)

    # Main training loop
    step_idx = 0
    eval_states = None
    best_mean_val = None

    with common.RewardTracker(writer, np.inf, group_rewards=100) as reward_tracker:
        while step_idx < config.end_step:
            step_idx += 1
            buffer.populate(1)
            selector.epsilon = max(
                config.epsilon_stop,
                config.epsilon_start - step_idx / config.epsilon_steps)

            new_rewards = exp_source.pop_rewards_steps()
            if new_rewards:
                reward_tracker.reward(new_rewards[0], step_idx, selector.epsilon)

            if len(buffer) < config.replay_initial:
                continue

            if eval_states is None:
                print("Initial buffer populated, start training")
                eval_states = buffer.sample(config.states_to_evaluate)
                eval_states = [
                    np.array(transition.state, copy=False)
                    for transition in eval_states
                ]
                eval_states = np.array(eval_states, copy=False)

            if step_idx % config.eval_every_step == 0:
                mean_val = common.calc_values_of_states(eval_states, net,
                                                        device=device)
                writer.add_scalar("values_mean", mean_val, step_idx)
                if best_mean_val is None or best_mean_val < mean_val:
                    if best_mean_val is not None:
                        print("%d: Best mean value updated %.3f -> %.3f" % (
                            step_idx, best_mean_val, mean_val))
                    best_mean_val = mean_val
                    # torch.save(net.state_dict(),
                    #            os.path.join(saves_path,
                    #                         "mean_val-%.3f.data" % mean_val))

            optimizer.zero_grad()
            batch = buffer.sample(config.batch_size)
            loss_v = common.calc_loss(batch, net, tgt_net.target_model,
                                      config.gamma**config.reward_steps,
                                      device=device)
            loss_v.backward()
            optimizer.step()

            if step_idx % config.target_net_sync == 0:
                tgt_net.sync()

            if step_idx % config.checkpoint_every_step == 0:
                idx = step_idx // config.checkpoint_every_step
                torch.save(net.state_dict(),
                           os.path.join(saves_path, "checkpoint-%d.data" % idx))

            if step_idx % config.validation_every_step == 0:
                res = validation.validation_run(stock_env, net, device=device)
                for key, val in res.items():
                    writer.add_scalar(key + "_test", val, step_idx)
                res = validation.validation_run(val_stock_env, net, device=device)
                for key, val in res.items():
                    writer.add_scalar(key + "_val", val, step_idx)

    models.save_model(os.path.join(saves_path, save_name), net,
                      {"predict_days": predict_days})
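# Illustrative three-phase invocation; the argument values are examples only.
# Phase 2 wraps the frozen phase-1 network inside the environment, so it
# needs a phase-1 checkpoint; phase 3 needs a prediction horizon in days.
train_model(cuda=True, phase=1, premodel=None, pdays=None)
train_model(cuda=True, phase=2, premodel="data/phase1_model.data", pdays=None)
train_model(cuda=True, phase=3, premodel=None, pdays=7)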
writer.add_scalar("values_mean", mean_val, step_idx)
if best_mean_val is None or best_mean_val < mean_val:
    if best_mean_val is not None:
        print("%d: Best mean value updated %.3f -> %.3f" % (
            step_idx, best_mean_val, mean_val))
    best_mean_val = mean_val
    torch.save(net.state_dict(),
               os.path.join(saves_path, "mean_val-%.3f.data" % mean_val))

optimizer.zero_grad()
batch = buffer.sample(sconfig.batch_size)
loss_v = common.calc_loss(batch, net, tgt_net.target_model,
                          sconfig.gamma**sconfig.reward_steps, device=device)
loss_v.backward()
optimizer.step()

if step_idx % sconfig.target_net_sync == 0:
    tgt_net.sync()

if step_idx % sconfig.checkpoint_every_step == 0:
    idx = step_idx // sconfig.checkpoint_every_step
    # Zero-pad the index (%03d) so filenames contain no spaces and sort correctly
    torch.save(net.state_dict(),
               os.path.join(saves_path, "checkpoint-%03d.data" % idx))

if step_idx % sconfig.validation_every_step == 0:
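# For reference: in ptan, tgt_net.sync() is a hard update that copies the
# online network's weights into the frozen target network, equivalent to:
tgt_net.target_model.load_state_dict(net.state_dict())
# ptan's TargetNet also provides alpha_sync() for soft (Polyak) updates.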
eval_states = buffer.sample(STATES_TO_EVALUATE)
eval_states = [np.array(transition.state, copy=False)
               for transition in eval_states]
eval_states = np.array(eval_states, copy=False)

if step_idx % EVAL_EVERY_STEP == 0:
    mean_val = common.calc_values_of_states(eval_states, net, cuda=args.cuda)
    writer.add_scalar("values_mean", mean_val, step_idx)
    if best_mean_val is None or best_mean_val < mean_val:
        if best_mean_val is not None:
            print("%d: Best mean value updated %.3f -> %.3f" % (
                step_idx, best_mean_val, mean_val))
        best_mean_val = mean_val
        torch.save(net.state_dict(),
                   os.path.join(saves_path, "mean_val-%.3f.data" % mean_val))

optimizer.zero_grad()
batch = buffer.sample(BATCH_SIZE)
loss_v = common.calc_loss(batch, net, tgt_net.target_model,
                          GAMMA ** REWARD_STEPS, cuda=args.cuda)
loss_v.backward()
optimizer.step()

if step_idx % TARGET_NET_SYNC == 0:
    tgt_net.sync()

if step_idx % CHECKPOINT_EVERY_STEP == 0:
    idx = step_idx // CHECKPOINT_EVERY_STEP
    # Zero-pad the index (%03d) so filenames contain no spaces and sort correctly
    torch.save(net.state_dict(),
               os.path.join(saves_path, "checkpoint-%03d.data" % idx))

if step_idx % VALIDATION_EVERY_STEP == 0:
    res = validation.validation_run(env_tst, net, cuda=args.cuda)
    for key, val in res.items():
        writer.add_scalar(key + "_test", val, step_idx)
    res = validation.validation_run(env_val, net, cuda=args.cuda)
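# This older variant threads a boolean cuda flag rather than a torch.device.
# A minimal sketch of the argument parsing it assumes (the flag name is an
# assumption):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--cuda", default=False, action="store_true",
                    help="compute on GPU if available")
args = parser.parse_args()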