test_env_config['min_episode_length'] = args.min_episode_length
test_env_config['max_episode_length'] = args.max_episode_length
test_env_config['max_position'] = (args.num_actions - 1) / 2

# Initialize and load the SDAE model (TODO: add an argument to specify the file name)
#sdae_model_name = "AAL_sdae_model_lr4_g_noise_var0.001_pre100000fine500000.pt"
sdae_model_name = "AAPL_p130_p260_sdae_model_lr5_g_noise_var1e-09_pre100000fine1000000_filtered_fyear2008.pt"
#sdae_model_name = "AAPL_p110_p225_sdae_model_lr5_g_noise_var1e-09_pre100000fine1000000_filtered_fyear2008.pt"
#sdae_model_name = "AAPL_p110_p225_sdae_model_lr5_g_noise_var1e-09_pre100000fine1000000.pt"
sdae_model = SDAE(args.input_dim)
sdae_saved_state = torch.load(SDAE_PATH + sdae_model_name, map_location=lambda storage, loc: storage)
sdae_model.load_state_dict(sdae_saved_state)

# Initialize the shared model
shared_model = A3C_LSTM(args.rl_input_dim, args.num_actions)
if args.load:
    saved_state = torch.load('{0}.pt'.format(args.load_model_path), map_location=lambda storage, loc: storage)
    shared_model.load_state_dict(saved_state)
shared_model.share_memory()

if args.shared_optimizer:
    if args.optimizer == 'RMSprop':
        optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
    if args.optimizer == 'Adam':
        optimizer = SharedAdam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    optimizer.share_memory()
else:
    # Each worker builds its own local optimizer in train() when no shared optimizer is passed.
    optimizer = None
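# ---------------------------------------------------------------------------
# Illustrative sketch (not from the source): one way the shared model, optimizer
# and finish flags built above could be handed to the train/test workers via
# torch.multiprocessing elsewhere in this main script. `args.workers` and the
# separate `train_env_config` are assumptions used only for this example; the
# train()/test() signatures match the functions defined below.
# ---------------------------------------------------------------------------
import torch.multiprocessing as mp

train_process_finish_flags = torch.zeros(args.workers).share_memory_()
processes = []
p = mp.Process(target=test, args=(args, sdae_model, shared_model, test_env_config, train_process_finish_flags))
p.start()
processes.append(p)
for rank in range(args.workers):
    p = mp.Process(target=train, args=(rank, args, sdae_model, shared_model, optimizer, train_env_config, train_process_finish_flags))
    p.start()
    processes.append(p)
for p in processes:
    p.join()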
def test_one_episode(args, sdae_model, shared_model, env_config):
    # Environment variables
    stock_raw_data = env_config['stock_raw_data']
    stock_norm_data = env_config['stock_norm_data']
    starting_capital = env_config['starting_capital']
    min_episode_length = env_config['min_episode_length']
    max_episode_length = env_config['max_episode_length']
    max_position = env_config['max_position']
    trans_cost_rate = env_config['trans_cost_rate']
    slippage_rate = env_config['slippage_rate']
    gpu_id = args.gpu_ids[-1]

    # Set seed
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)

    # Initialize environment
    if trans_cost_rate is not None and slippage_rate is not None:
        if args.full_env:
            env = Single_Stock_Full_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, trans_cost_rate, slippage_rate, full_data_episode=True)
        else:
            env = Single_Stock_BS_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, trans_cost_rate, slippage_rate, full_data_episode=True)
    else:
        if args.full_env:
            env = Single_Stock_Full_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, full_data_episode=True)
        else:
            env = Single_Stock_BS_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, full_data_episode=True)
    state = env.get_current_input_to_model()

    agent_model = A3C_LSTM(args.rl_input_dim, args.num_actions)
    agent = Agent(sdae_model, agent_model, args)
    agent.gpu_id = gpu_id

    cx = Variable(torch.zeros(1, LSTM_SIZE))
    hx = Variable(torch.zeros(1, LSTM_SIZE))
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            agent.model = agent.model.cuda()
            agent.model.train()
            cx = cx.cuda()
            hx = hx.cuda()
            state = state.cuda()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            agent.model.load_state_dict(shared_model.state_dict())
    else:
        agent.model.load_state_dict(shared_model.state_dict())

    episodic_reward = 0.0
    count = 0
    actions = []
    rewards = []
    pv_list = []
    pv_change_list = []
    while env.done is False:
        action, (next_hx, next_cx) = agent.select_action(state, (hx, cx), training=False)
        actions.append(action)
        reward, next_state, _ = env.step(action)
        rewards.append(reward)
        """
        pv_list.append(env.calc_total_portfolio_value())
        if(count == 0):
            pv_change_list.append(0.0)
        else:
            pv_change_list.append(pv_list[count] - pv_list[count - 1])
        """
        episodic_reward += reward
        state = next_state
        (hx, cx) = (next_hx, next_cx)
        count += 1

    index_list = [i for i in range(1, len(pv_list) + 1)]
    """
    #print(pv_list)
    print(max(pv_list))
    print(min(pv_list))
    print(sum(rewards))
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
    ax1.plot(index_list, pv_list)
    ax2.plot(index_list, rewards)
    ax3.plot(index_list, pv_change_list)
    plt.show()
    exit()
    """

    # Results logging
    port_value = env.calc_total_portfolio_value()
    #print("Test num: " + str(test_num) + " | Test reward: " + str(episodic_reward) + " | Final equity: " + str(port_value))
    #print(env.curr_holdings)
    print("Test reward: {0} | Holdings: {1}/{2} | End Capital: {3} | Final equity : {4}"
          .format(episodic_reward, env.curr_holdings[0], env.curr_holdings[1], env.curr_capital, port_value))
    print("\n")
    sys.stdout.flush()
    return episodic_reward, rewards, actions
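# Illustrative usage sketch (not from the source): evaluating a saved state_dict
# once with test_one_episode. It assumes args.load_model_path points at a
# checkpoint saved in the same state_dict format used by test() below.
eval_model = A3C_LSTM(args.rl_input_dim, args.num_actions)
eval_model.load_state_dict(torch.load('{0}.pt'.format(args.load_model_path), map_location=lambda storage, loc: storage))
episodic_reward, step_rewards, taken_actions = test_one_episode(args, sdae_model, eval_model, test_env_config)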
)

if config["mode"] == "resnet":
    shared_model = ResNet_LSTM(config["agent"], config["task"]["num-actions"])
elif config["mode"] == "conv-stacked":
    shared_model = A3C_ConvStackedLSTM(config["agent"], config["task"]["num-actions"])
elif config["mode"] == "stacked":
    shared_model = A3C_StackedLSTM(config["agent"], config["task"]["num-actions"])
elif config["mode"] == "conv-vanilla":
    shared_model = A3C_ConvLSTM(config["agent"], config["task"]["num-actions"])
elif config["mode"] == "vanilla":
    shared_model = A3C_LSTM(config["agent"], config["task"]["num-actions"])
else:
    raise ValueError(config["mode"])

print(shared_model)
shared_model.share_memory()
shared_model.to(config['device'])

optim_class = SharedAdam if config["optimizer"] == "adam" else SharedRMSprop
optimizer = optim_class(shared_model.parameters(), lr=config["agent"]["lr"])
optimizer.share_memory()

processes = []
update_counter = 0
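# Illustrative sketch (not from the original file): launching the Harlow A3C
# workers once `processes` and `update_counter` exist. `config["agent"]["num-workers"]`
# and `mp` (torch.multiprocessing) are assumptions for this example; `task_config`
# is assumed to be defined earlier in this script, and the train() signature
# matches the worker function defined below.
import torch.multiprocessing as mp

for rank in range(config["agent"]["num-workers"]):
    p = mp.Process(target=train, args=(config, shared_model, optimizer, rank, task_config, update_counter))
    p.start()
    processes.append(p)
for p in processes:
    p.join()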
def test(args, sdae_model, shared_model, env_config, train_process_finish_flags):
    # Environment variables
    stock_raw_data = env_config['stock_raw_data']
    stock_norm_data = env_config['stock_norm_data']
    starting_capital = env_config['starting_capital']
    min_episode_length = env_config['min_episode_length']
    max_episode_length = env_config['max_episode_length']
    max_position = env_config['max_position']
    trans_cost_rate = env_config['trans_cost_rate']
    slippage_rate = env_config['slippage_rate']
    gpu_id = args.gpu_ids[-1]

    # Set seed
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)

    # Initialize environment
    if trans_cost_rate is not None and slippage_rate is not None:
        if args.full_env:
            env = Single_Stock_Full_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, trans_cost_rate, slippage_rate, full_data_episode=True)
        else:
            env = Single_Stock_BS_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, trans_cost_rate, slippage_rate, full_data_episode=True)
    else:
        if args.full_env:
            env = Single_Stock_Full_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, full_data_episode=True)
        else:
            env = Single_Stock_BS_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, full_data_episode=True)
    state = env.get_current_input_to_model()

    agent_model = A3C_LSTM(args.rl_input_dim, args.num_actions)
    agent = Agent(sdae_model, agent_model, args)
    agent.gpu_id = gpu_id

    cx = Variable(torch.zeros(1, LSTM_SIZE))
    hx = Variable(torch.zeros(1, LSTM_SIZE))
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            agent.model = agent.model.cuda()
            agent.model.train()
            cx = cx.cuda()
            hx = hx.cuda()
            state = state.cuda()

    test_num = 0
    reward_list = []
    final_equity_list = []
    max_reward = -1e10
    # If all training processes have ended, this flag becomes True and one more
    # evaluation run is done to capture the final result.
    terminate_next_iter = False

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                agent.model.load_state_dict(shared_model.state_dict())
        else:
            agent.model.load_state_dict(shared_model.state_dict())

        episodic_reward = 0.0
        count = 0
        actions = []
        rewards = []
        pv_list = []
        pv_change_list = []
        while env.done is False:
            action, (next_hx, next_cx) = agent.select_action(state, (hx, cx), training=False)
            actions.append(action - 3)
            reward, next_state, _ = env.step(action)
            """
            rewards.append(reward)
            pv_list.append(env.calc_total_portfolio_value())
            if(count == 0):
                pv_change_list.append(0.0)
            else:
                pv_change_list.append(pv_list[count] - pv_list[count - 1])
            """
            episodic_reward += reward
            state = next_state
            (hx, cx) = (next_hx, next_cx)
            count += 1

        index_list = [i for i in range(1, len(pv_list) + 1)]
        """
        #print(pv_list)
        print(max(pv_list))
        print(min(pv_list))
        print(sum(rewards))
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
        ax1.plot(index_list, pv_list)
        ax2.plot(index_list, rewards)
        ax3.plot(index_list, pv_change_list)
        plt.show()
        exit()
        """

        # Results logging
        reward_list.append(episodic_reward)
        port_value = env.calc_total_portfolio_value()
        final_equity_list.append(port_value)
        test_num += 1
        #print("Test num: " + str(test_num) + " | Test reward: " + str(episodic_reward) + " | Final equity: " + str(port_value))
        #print(env.curr_holdings)
        print("Test num: {0} | Test reward: {1} | Holdings: {2} | End Capital: {3} | Final equity : {4}"
              .format(test_num, episodic_reward, env.curr_holdings[0], env.curr_capital, port_value))
        print(Counter(actions))
        print("\n")
        sys.stdout.flush()

        env.reset()
        state = env.get_current_input_to_model()
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                hx = Variable(torch.zeros(1, LSTM_SIZE).cuda())
                cx = Variable(torch.zeros(1, LSTM_SIZE).cuda())
                state = state.cuda()
        else:
            hx = Variable(torch.zeros(1, LSTM_SIZE))
            cx = Variable(torch.zeros(1, LSTM_SIZE))

        # Save model
        if args.use_filter_data:
            model_name = (args.stock_env + "_p1" + str(args.period_1) + "_p2" + str(args.period_2)
                          + "_minEL" + str(args.min_episode_length) + "_maxEL" + str(args.max_episode_length)
                          + "_nstep" + str(args.num_steps) + "_ntrainstep" + str(args.num_train_steps)
                          + "_lr" + str(args.lr) + "_gamma" + str(args.gamma) + "_tau" + str(args.tau)
                          + "_best_filtered_fyear" + str(args.filter_by_year)
                          + ("_full" if args.full_env else "") + ".pt")
        else:
            model_name = (args.stock_env + "_p1" + str(args.period_1) + "_p2" + str(args.period_2)
                          + "_minEL" + str(args.min_episode_length) + "_maxEL" + str(args.max_episode_length)
                          + "_nstep" + str(args.num_steps) + "_ntrainstep" + str(args.num_train_steps)
                          + "_lr" + str(args.lr) + "_gamma" + str(args.gamma) + "_tau" + str(args.tau)
                          + ("_full" if args.full_env else "") + "_best.pt")

        if terminate_next_iter:
            if args.use_filter_data:
                model_name = (args.stock_env + "_p1" + str(args.period_1) + "_p2" + str(args.period_2)
                              + "_minEL" + str(args.min_episode_length) + "_maxEL" + str(args.max_episode_length)
                              + "_nstep" + str(args.num_steps) + "_ntrainstep" + str(args.num_train_steps)
                              + "_lr" + str(args.lr) + "_gamma" + str(args.gamma) + "_tau" + str(args.tau)
                              + "_final_filtered_fyear" + str(args.filter_by_year)
                              + ("_full" if args.full_env else "") + ".pt")
            else:
                model_name = (args.stock_env + "_p1" + str(args.period_1) + "_p2" + str(args.period_2)
                              + "_minEL" + str(args.min_episode_length) + "_maxEL" + str(args.max_episode_length)
                              + "_nstep" + str(args.num_steps) + "_ntrainstep" + str(args.num_train_steps)
                              + "_lr" + str(args.lr) + "_gamma" + str(args.gamma) + "_tau" + str(args.tau)
                              + ("_full" if args.full_env else "") + "_final.pt")

            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    state_to_save = agent.model.state_dict()
                    torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, model_name))
            else:
                state_to_save = agent.model.state_dict()
                torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, model_name))
            print("saved final")
            break
        else:
            if episodic_reward > max_reward:
                # Track the best evaluation reward so only improved models overwrite the "best" checkpoint
                max_reward = episodic_reward
                #model_name = args.stock_env + "_p1" + str(args.period_1) + "_p2" + str(args.period_2) + "_minEL" + str(args.min_episode_length) + "_maxEL" + str(args.max_episode_length) + "_nstep" + str(args.num_steps) + "_ntrainstep" + str(args.num_train_steps) + "_lr" + str(args.lr) + "_gamma" + str(args.gamma) + "_tau" + str(args.tau) + "_best.pt"
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = agent.model.state_dict()
                        torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, model_name))
                else:
                    state_to_save = agent.model.state_dict()
                    torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, model_name))

        # Save results
        if args.use_filter_data:
            np.save(RESULT_DATA_PATH + "epi_reward_filtered_" + model_name, np.array(reward_list))
            np.save(RESULT_DATA_PATH + "portfolio_filtered_" + model_name, np.array(final_equity_list))
        else:
            np.save(RESULT_DATA_PATH + "epi_reward_" + model_name, np.array(reward_list))
            np.save(RESULT_DATA_PATH + "portfolio_" + model_name, np.array(final_equity_list))

        if torch.all(train_process_finish_flags == torch.ones(train_process_finish_flags.size(0))):
            terminate_next_iter = True
            print("From test process: all training processes terminated")
            sys.stdout.flush()
def train(config, shared_model, optimizer, rank, task_config, counter):
    T.manual_seed(config["seed"] + rank)
    np.random.seed(config["seed"] + rank)
    T.random.manual_seed(config["seed"] + rank)

    device = config["device"]

    lab_env = lab.Lab("contributed/psychlab/harlow", ['RGB_INTERLEAVED'], config=task_config)
    env = HarlowWrapper(lab_env, config, rank)

    if config["mode"] == "resnet":
        agent = ResNet_LSTM(config["agent"], env.num_actions)
    elif config["mode"] == "conv-vanilla":
        agent = A3C_ConvLSTM(config["agent"], env.num_actions)
    elif config["mode"] == "vanilla":
        agent = A3C_LSTM(config["agent"], env.num_actions)
    else:
        raise ValueError(config["mode"])
    agent.to(device)
    agent.train()

    ### hyper-parameters ###
    gamma = config["agent"]["gamma"]
    gae_lambda = config["agent"]["gae-lambda"]
    val_coeff = config["agent"]["value-loss-weight"]
    entropy_coeff = config["agent"]["entropy-weight"]
    n_step_update = config["agent"]["n-step-update"]

    writer = SummaryWriter(log_dir=os.path.join(config["log-path"], config["run-title"] + f"_{rank}"))
    save_path = os.path.join(config["save-path"], config["run-title"], config["run-title"] + "_{epi:04d}")
    save_interval = config["save-interval"]

    done = True
    state = env.reset()
    p_action, p_reward = [0] * config["task"]["num-actions"], 0

    print('=' * 50)
    print(f"Starting Worker {rank}")
    print('=' * 50)

    episode_reward = 0
    update_counter = counter
    total_rewards = []

    while True:
        agent.load_state_dict(shared_model.state_dict())

        if done:
            rnn_state = agent.get_init_states(device)
        else:
            if config["agent"]["cell-type"] == "lstm":
                rnn_state = rnn_state[0].detach(), rnn_state[1].detach()
            else:
                rnn_state = rnn_state.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for _ in range(n_step_update):
            logit, value, rnn_state = agent(
                T.tensor([state]).to(device),
                (T.tensor([p_action]).float().to(device),
                 T.tensor([[p_reward]]).float().to(device)),
                rnn_state
            )
            logit = logit.squeeze(0)

            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies += [entropy]

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(int(action))
            # if done:
            #     env.save_frames(os.path.join(config["save-path"], "frames.gif"))
            #     exit()

            episode_reward += reward
            p_action = np.eye(env.num_actions)[int(action)]
            p_reward = reward

            log_probs += [log_prob]
            values += [value]
            rewards += [reward]

            if done:
                state = env.reset()

                total_rewards += [episode_reward]
                avg_reward_100 = np.array(total_rewards[-100:]).mean()
                writer.add_scalar("perf/reward_t", episode_reward, env.episode_num)
                writer.add_scalar("perf/avg_reward_100", avg_reward_100, env.episode_num)
                episode_reward = 0

                if env.episode_num % save_interval == 0:
                    T.save({
                        "state_dict": shared_model.state_dict(),
                        "avg_reward_100": avg_reward_100,
                        "update_counter": update_counter
                    }, save_path.format(epi=env.episode_num) + ".pt")
                break

        R = T.zeros(1, 1).to(device)
        if not done:
            _, value, _ = agent(
                T.tensor([state]).to(device),
                (T.tensor([p_action]).float().to(device),
                 T.tensor([[p_reward]]).float().to(device)),
                rnn_state
            )
            R = value.detach()
        values += [R]

        policy_loss = 0
        value_loss = 0
        gae = T.zeros(1, 1).to(device)
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + gamma * values[i + 1] - values[i]
            gae = gae * gamma * gae_lambda + delta_t
            policy_loss = policy_loss - log_probs[i] * gae.detach() - entropy_coeff * entropies[i]

        loss = policy_loss + val_coeff * value_loss

        optimizer.zero_grad()
        loss.backward()
        ensure_shared_grads(agent, shared_model)
        optimizer.step()
        update_counter += 1

        writer.add_scalar("losses/total_loss", loss.item(), update_counter)
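# ensure_shared_grads is called above but not defined in this section. A minimal
# sketch of the usual A3C helper is shown here for reference (an assumption about
# this repository's implementation): it copies each worker gradient into the shared
# model the first time around, so the shared optimizer can apply the update.
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad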
    yaml.dump(config, fout)

############## Start Here ##############
print(f"> Running {config['run-title']} {config['mode']} using {config['optimizer']}")

params = (config["agent"], config["task"]["num-actions"])
if config["mode"] == "densenet-stacked":
    agent = DenseNet_StackedLSTM(*params)
elif config["mode"] == "conv-stacked":
    agent = A3C_ConvStackedLSTM(*params)
elif config["mode"] == "stacked":
    agent = A3C_StackedLSTM(*params)
elif config["mode"] == "conv-vanilla":
    agent = A3C_ConvLSTM(*params)
elif config["mode"] == "vanilla":
    agent = A3C_LSTM(*params)
else:
    raise ValueError(config["mode"])

print(agent)
agent.to(config['device'])

optim_class = T.optim.RMSprop if config["optimizer"] == "rmsprop" else T.optim.AdamW
optimizer = optim_class(agent.parameters(), lr=config["agent"]["lr"])

T.manual_seed(config["seed"])
np.random.seed(config["seed"])
T.random.manual_seed(config["seed"])

update_counter = 0

if config["copy-encoder"]:
n_seeds = 1
device = config["device"]

############## Start Here ##############
print(f"> Running {config['run-title']} {config['mode']}")

if config["mode"] == "conv-stacked":
    agent = A3C_ConvStackedLSTM(config["agent"], config["task"]["num-actions"])
elif config["mode"] == "stacked":
    agent = A3C_StackedLSTM(config["agent"], config["task"]["num-actions"])
elif config["mode"] == "conv-vanilla":
    agent = A3C_ConvLSTM(config["agent"], config["task"]["num-actions"])
elif config["mode"] == "vanilla":
    agent = A3C_LSTM(config["agent"], config["task"]["num-actions"])
else:
    raise ValueError(config["mode"])

filepath = os.path.join(config["save-path"], config["load-title"],
                        f"{config['load-title']}_{config['start-episode']:04d}.pt")
print(f"> Loading Checkpoint {filepath}")
agent.load_state_dict(T.load(filepath, map_location=T.device(config["device"]))["state_dict"])

lab_env = lab.Lab("contributed/psychlab/harlow", ['RGB_INTERLEAVED'], config=task_config)
env = HarlowWrapper(lab_env, config, 0)
def train(rank, args, sdae_model, shared_model, optimizer, env_config, train_process_finish_flags):
    # Environment variables
    stock_raw_data = env_config['stock_raw_data']
    stock_norm_data = env_config['stock_norm_data']
    starting_capital = env_config['starting_capital']
    min_episode_length = env_config['min_episode_length']
    max_episode_length = env_config['max_episode_length']
    max_position = env_config['max_position']
    trans_cost_rate = env_config['trans_cost_rate']
    slippage_rate = env_config['slippage_rate']
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]

    # Set seed
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)

    # Initialize environment
    if trans_cost_rate is not None and slippage_rate is not None:
        if args.full_env:
            env = Single_Stock_Full_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, trans_cost_rate, slippage_rate)
        else:
            env = Single_Stock_BS_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position, trans_cost_rate, slippage_rate)
    else:
        if args.full_env:
            env = Single_Stock_Full_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position)
        else:
            env = Single_Stock_BS_Env(stock_raw_data, stock_norm_data, starting_capital, min_episode_length, max_episode_length, max_position)
    state = env.get_current_input_to_model()

    # Initialize a local (non-shared) optimizer if none was passed in
    if optimizer is None:
        if args.optimizer_type == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer_type == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)

    agent_model = A3C_LSTM(args.rl_input_dim, args.num_actions)
    agent = Agent(sdae_model, agent_model, args)
    agent.gpu_id = gpu_id

    cx = Variable(torch.zeros(1, LSTM_SIZE))
    hx = Variable(torch.zeros(1, LSTM_SIZE))
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            agent.model = agent.model.cuda()
            agent.model.train()
            cx = cx.cuda()
            hx = hx.cuda()
            state = state.cuda()

    eps_num = 0
    total_steps = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                agent.model.load_state_dict(shared_model.state_dict())
        else:
            agent.model.load_state_dict(shared_model.state_dict())

        # For truncating the LSTM's update
        if env.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    hx = Variable(torch.zeros(1, LSTM_SIZE).cuda())
                    cx = Variable(torch.zeros(1, LSTM_SIZE).cuda())
            else:
                hx = Variable(torch.zeros(1, LSTM_SIZE))
                cx = Variable(torch.zeros(1, LSTM_SIZE))
        else:
            hx = Variable(hx.data)
            cx = Variable(cx.data)

        for step in range(args.num_steps):
            action, val, log_prob, entropy, (next_hx, next_cx) = agent.select_action(state, (hx, cx))
            #print("Before act")
            #print(env.curr_holdings)
            #print(env.curr_capital)
            #print(state)
            #reward, next_state, _ = env.step(6 if step == 0 else 1)
            reward, next_state, _ = env.step(action)
            #print("After act")
            #print(env.curr_holdings)
            #print(env.curr_capital)
            #print(next_state)
            #print(reward)
            #if(step == 1):
            #    exit()
            agent.step(val, log_prob, entropy, reward)
            state = next_state
            (hx, cx) = (next_hx, next_cx)
            total_steps += 1
            if (total_steps % 500000) == 0:
                print("Rank: " + str(rank) + " | Training episode: " + str(eps_num) + " | Total steps: " + str(total_steps))
                sys.stdout.flush()
            if total_steps >= args.num_train_steps:
                break
            if env.done:
                break

        R = torch.zeros(1, 1).float()
        # Get the value of the current state if the episode is not done
        if not env.done:
            env_state, private_state = state
            env_state = torch.from_numpy(env_state).float()
            private_state = torch.from_numpy(private_state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    env_state = env_state.cuda()
                    private_state = private_state.cuda()
            with torch.no_grad():
                sdae_state = agent.sdae_model(env_state, training=False)
            value, _, _ = agent.model((Variable(torch.cat((sdae_state, private_state)).unsqueeze(0)), (hx, cx)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        agent.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(agent.rewards))):
            R = args.gamma * R + agent.rewards[i]
            advantage = R - agent.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = agent.rewards[i] + args.gamma * agent.values[i + 1].data - agent.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - agent.log_probs[i] * Variable(gae) - 0.01 * agent.entropies[i]

        agent.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        #(policy_loss + value_loss).backward()
        ensure_shared_grads(agent.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        agent.clear_values()

        if env.done:
            eps_num += 1
            env.reset()
            state = env.get_current_input_to_model()

        if total_steps >= args.num_train_steps:
            train_process_finish_flags[rank] = 1
            print("Train worker " + str(rank) + " done")
            sys.stdout.flush()
            break
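# ensure_shared_grads is called above with a `gpu` flag but is not defined in this
# section. A minimal sketch of a GPU-aware variant, modeled on common A3C
# implementations, is given here as an assumption about what that helper does:
# gradients computed on a GPU worker are copied to the CPU-resident shared model.
def ensure_shared_grads(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            return
        elif not gpu:
            shared_param._grad = param.grad
        else:
            shared_param._grad = param.grad.cpu()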