import math
import pickle

import numpy as np
import torch
from torch import LongTensor
from torch.autograd import Variable

# Value (and the other networks used below) come from this repo's model
# modules; L2PenaltyLoss is the penalized regression loss for the envelope.


def train_v_upper_envelope(states, actions, returns, state_dim, device, seed,
                           upper_learning_rate=3e-3, weight_decay=0.02,
                           max_step_num=int(1e6), consecutive_steps=4, k=10000):
    states = torch.from_numpy(np.array(states))
    actions = torch.from_numpy(np.array(actions))
    returns = torch.from_numpy(np.array(returns))  # returns are the Monte-Carlo returns Gt

    use_gpu = (device == "cuda:0")

    # init upper envelope nets (relu activation)
    upper_envelope = Value(state_dim, activation='relu')
    upper_envelope_retrain = Value(state_dim, activation='relu')
    optimizer_upper = torch.optim.Adam(upper_envelope.parameters(),
                                       lr=upper_learning_rate,
                                       weight_decay=weight_decay)
    optimizer_upper_retrain = torch.optim.Adam(upper_envelope_retrain.parameters(),
                                               lr=upper_learning_rate,
                                               weight_decay=weight_decay)

    if use_gpu:
        upper_envelope = upper_envelope.cuda()
        upper_envelope_retrain = upper_envelope_retrain.cuda()

    # =========================== #
    # Split the data into training and testing sets,
    # making sure the highest return Ri ends up in the training set.

    # pick out the highest data point
    highestR, indice = torch.max(returns, 0)
    highestR = highestR.view(-1, 1)
    highestS = states[indice]
    highestA = actions[indice]
    print("HighestR:", highestR)

    statesW = torch.cat((states[:indice], states[indice + 1:]))
    actionsW = torch.cat((actions[:indice], actions[indice + 1:]))
    returnsW = torch.cat((returns[:indice], returns[indice + 1:]))

    # shuffle the data
    perm = np.arange(statesW.shape[0])
    np.random.shuffle(perm)
    perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)
    statesW, actionsW, returnsW = statesW[perm], actionsW[perm], returnsW[perm]

    # 80/20 train/test split
    divide = int(states.shape[0] * 0.8)
    train_states, train_actions, train_returns = \
        statesW[:divide], actionsW[:divide], returnsW[:divide]
    test_states, test_actions, test_returns = \
        statesW[divide:], actionsW[divide:], returnsW[divide:]

    # add the highest data point back into the training set
    print(train_states.size(), highestS.size())
    print(train_actions.size(), highestA.size())
    print(train_returns.size(), highestR.size())
    train_states = torch.cat((train_states.squeeze(), highestS.unsqueeze(0)))
    train_actions = torch.cat((train_actions.squeeze(), highestA.unsqueeze(0)))
    train_returns = torch.cat((train_returns.squeeze(),
                               highestR.squeeze().unsqueeze(0)))

    # train the upper envelope
    batch_size = 64
    optim_iter_num = int(math.ceil(train_states.shape[0] / batch_size))
    num_increase = 0
    previous_loss = math.inf
    calculate_vali = 2  # number of epochs between validation checks
    best_parameters = upper_envelope.state_dict()
    running_training_steps = 0
    best_training_steps = running_training_steps

    # upper envelope training with early stopping
    upper_envelope.train()
    while num_increase < consecutive_steps:
        # update theta for calculate_vali epochs before each validation check
        for _ in range(calculate_vali):
            train_loss = 0
            perm = np.arange(train_states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)
            train_states, train_actions, train_returns = \
                train_states[perm], train_actions[perm], train_returns[perm]

            for i in range(optim_iter_num):
                ind = slice(i * batch_size,
                            min((i + 1) * batch_size, train_states.shape[0]))
                states_b, returns_b = train_states[ind], train_returns[ind]
                states_b = Variable(states_b.float())
                returns_b = Variable(returns_b.float())
                Vsi = upper_envelope(states_b)
                loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
                train_loss += loss.detach()
                upper_envelope.zero_grad()
                loss.backward()
                optimizer_upper.step()

        running_training_steps += calculate_vali

        # compute validation error for early stopping
        test_iter = int(math.ceil(test_states.shape[0] / batch_size))
        validation_loss = 0
        for n in range(test_iter):
            ind = slice(n * batch_size,
                        min((n + 1) * batch_size, test_states.shape[0]))
            states_t, returns_t = test_states[ind], test_returns[ind]
            states_t = Variable(states_t.float())
            returns_t = Variable(returns_t.float())
            Vsi = upper_envelope(states_t)
            loss = L2PenaltyLoss(Vsi, returns_t, k_val=k)
            validation_loss += loss

        if validation_loss < previous_loss:
            best_training_steps = running_training_steps
            previous_loss = validation_loss
            best_parameters = upper_envelope.state_dict()
            num_increase = 0
        else:
            num_increase += 1

    print("best_training_steps:", best_training_steps)
    upper_envelope.load_state_dict(best_parameters)

    # retrain on the whole dataset for best_training_steps epochs
    upper_envelope_retrain.train()
    optim_iter_num = int(math.ceil(states.shape[0] / batch_size))
    for _ in range(best_training_steps):
        train_loss = 0
        perm = np.arange(states.shape[0])
        np.random.shuffle(perm)
        perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)
        states, actions, returns = states[perm], actions[perm], returns[perm]

        for i in range(optim_iter_num):
            ind = slice(i * batch_size,
                        min((i + 1) * batch_size, states.shape[0]))
            states_b, returns_b = states[ind], returns[ind]
            states_b = Variable(states_b.float())
            returns_b = Variable(returns_b.float())
            Vsi = upper_envelope_retrain(states_b)
            loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
            train_loss += loss.detach()
            upper_envelope_retrain.zero_grad()
            loss.backward()
            optimizer_upper_retrain.step()

    upper_envelope.load_state_dict(upper_envelope_retrain.state_dict())
    print("Upper envelope training is complete.")
    return upper_envelope
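# A minimal sketch of the L2PenaltyLoss referenced above, which is defined
# elsewhere in the repo. Assumption: it is a penalized squared error for
# upper-envelope regression, where points the envelope under-estimates
# (Vsi < returns) are penalized k_val times more heavily, pushing the fitted
# value function above the observed returns. Illustration only, not the
# repo's own definition.
def L2PenaltyLoss(predicted, target, k_val=10000):
    diff = predicted.view(-1) - target.view(-1)
    # weight 1 where the envelope is above the return, k_val where it is below
    weights = torch.where(diff < 0,
                          torch.full_like(diff, float(k_val)),
                          torch.ones_like(diff))
    return (weights * diff.pow(2)).mean()


# Example call (argument shapes and device are assumptions):
# upper_envelope = train_v_upper_envelope(states, actions, gts, state_dim,
#                                         device="cuda:0", seed=0)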
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

"""create agent"""
agent = Agent(env_factory, policy_net, running_state=running_state,
              render=args.render, num_threads=args.num_threads)


def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
    if use_gpu:
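# update_params is truncated above. A minimal sketch of the GAE advantage
# estimation such an update would typically perform next; the helper name,
# signature, and hyperparameters here are assumptions for illustration, not
# this repo's own code.
def estimate_advantages_sketch(rewards, masks, values, gamma=0.99, tau=0.95):
    # masks[i] == 0 marks the last transition of an episode
    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for i in reversed(range(rewards.size(0))):
        # one-step TD error, cut off at episode boundaries
        delta = rewards[i] + gamma * prev_value * masks[i] - values[i]
        # GAE recursion: A_t = delta_t + gamma * tau * A_{t+1}
        advantages[i] = delta + gamma * tau * prev_advantage * masks[i]
        prev_value, prev_advantage = values[i], advantages[i]
    returns = advantages + values
    return advantages, returns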
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0],
                            hidden_size=policy_size, scale_cov=args.scale_cov)
        # policy_net = Policy(state_dim, env_dummy.action_space.shape[0],
        #                     hidden_size=policy_size, log_std=0)
    value_net = Value(state_dim, hidden_size=critic_size)
    advantage_net = Advantage((state_dim, action_dim),
                              hidden_size=advantage_size)
else:
    policy_net, value_net, advantage_net, running_state = pickle.load(
        open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
    advantage_net = advantage_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)
optimizer_advantage = torch.optim.Adam(advantage_net.parameters(),
                                       lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
optim_batch_size = 4096

"""create agent"""
agent = Agent(env_factory,
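# A minimal sketch of the PPO clipped-surrogate policy update that the
# optim_epochs / optim_batch_size settings above would drive; clip_epsilon
# and the policy_net.get_log_prob interface are assumptions for illustration.
def ppo_policy_step_sketch(policy_net, optimizer_policy, states, actions,
                           advantages, fixed_log_probs, clip_epsilon=0.2):
    log_probs = policy_net.get_log_prob(states, actions)
    # probability ratio between the current and the data-collecting policy
    ratio = torch.exp(log_probs - fixed_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    # clipped surrogate objective (maximized, hence the negation)
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()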