class A3CLocal:
    def __init__(self, config):
        self.config = config
        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)
        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)

    # Sample an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append a transition to the replay memory
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Read out and clear the replay memory
    def get_replay(self):
        # Arrange the collected history as arrays
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])
        return states, actions, rewards, next_states

    # Copy the global networks' weights into the local networks
    def update_local_model(self, actor_dict, critic_dict):
        self.actor.load_state_dict(actor_dict)
        self.critic.load_state_dict(critic_dict)

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
def run_rl_game(i):
    print("Generating RL game number {} from generation {}".format(i, rl_step - 1))
    np.random.seed(int.from_bytes(os.urandom(4), byteorder='little'))
    random.seed(int.from_bytes(os.urandom(4), byteorder='little'))
    value_net = ValueNet(rl_model_filepath, rl_step - 1)  # load from previous generation
    rl_player = RLValueMinimaxPlayer(value_net, rl_minimax_depth)
    _, _, trace = play_game(rl_player, rl_player, verbose=1, limit_to_draw=limit_to_draw,
                            random_burn_in=random_burn_in, trace_min=trace_min)
    return trace
def load_checkpoint(file_dir, i_epoch, layer_sizes, input_size, device='cuda'):
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch), map_location=device)

    policy_net = PolicyNet(layer_sizes).to(device)
    value_net = ValueNet(input_size).to(device)
    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()
    value_net.load_state_dict(checkpoint["value_net"])
    value_net.train()

    policy_lr = checkpoint["policy_lr"]
    valuenet_lr = checkpoint["valuenet_lr"]
    policynet_optim = optim.Adam(policy_net.parameters(), lr=policy_lr)
    policynet_optim.load_state_dict(checkpoint["policynet_optim"])
    valuenet_optim = optim.Adam(value_net.parameters(), lr=valuenet_lr)
    valuenet_optim.load_state_dict(checkpoint["valuenet_optim"])

    # Drop the entries that have already been consumed; what remains is extra training info
    checkpoint.pop("policy_net")
    checkpoint.pop("value_net")
    checkpoint.pop("policynet_optim")
    checkpoint.pop("valuenet_optim")
    checkpoint.pop("i_epoch")
    checkpoint.pop("policy_lr")
    checkpoint.pop("valuenet_lr")

    return policy_net, value_net, policynet_optim, valuenet_optim, checkpoint
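# A minimal usage sketch for load_checkpoint above. The directory, epoch index,
# and network sizes are hypothetical placeholders for illustration, not values
# from the original project; adjust them to match your own checkpoints.
ckpt_dir = "./checkpoints"  # assumed directory containing ckpt_eps<N>.pt files
policy_net, value_net, policy_optim, value_optim, extra = load_checkpoint(
    ckpt_dir, i_epoch=100, layer_sizes=[4, 64, 2], input_size=4, device='cpu')
# 'extra' holds whatever additional training information was saved in the checkpoint
print(sorted(extra.keys()))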
def run_test_game(i):
    np.random.seed(int.from_bytes(os.urandom(4), byteorder='little'))
    value_net = ValueNet(rl_model_filepath, rl_step - 1)  # load from previous generation
    rl_player = RLValueMinimaxPlayer(value_net, rl_minimax_depth)
    minimax_player = SimpleMinimaxPlayer(base_minimax_depth)
    if i % 2:
        win, step, _ = play_game(rl_player, minimax_player, verbose=0, limit_to_draw=limit_to_draw)
    else:
        win, step, _ = play_game(minimax_player, rl_player, verbose=0, limit_to_draw=limit_to_draw)
    return win, step
def play(board, color):
    rl_minimax_depth = 8
    rl_step = 21
    rl_model_filepath = './mlp_200_model.h5'
    value_net = ValueNet(rl_model_filepath, rl_step)  # load from previous generation
    rl_player = RLValueMinimaxPlayer(value_net, rl_minimax_depth)
    nboard = board if isinstance(board, np.ndarray) else board_to_numpy(board)
    best_move, _ = rl_player.play(nboard, color)
    return best_move
def load_model_checkpoint(c):
    # Returns the model restored from the latest checkpoint in c.model_dir
    dir_name = tf.train.latest_checkpoint(c.model_dir)
    # if ver_name == 'None':
    #     check_or_make_dir(dir_name)
    # else:
    #     dir_name = os.path.join(dir_name, ver_name)

    # A dummy environment is only used to build the model's variables with one forward pass
    dummy_env = TFPyEnvironment(StockEnvBasic(**c.default_env))
    time_step = dummy_env.reset()
    temp = ValueNet(**c.model_vars)  # initialize model
    temp(time_step.observation)
    checkpoint2 = tf.train.Checkpoint(module=temp)
    status = checkpoint2.restore(dir_name)
    return temp, checkpoint2
class SAC:
    def __init__(self, env, gamma, tau, buffer_maxlen, value_lr, q_lr, policy_lr):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_range = [env.action_space.low, env.action_space.high]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.value_net = ValueNet(self.state_dim).to(device)
        self.target_value_net = ValueNet(self.state_dim).to(device)
        self.q1_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.q2_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.policy_net = PolicyNet(self.state_dim, self.action_dim).to(device)

        # Load the target value network parameters
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        # Initialize the optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q2_net.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        # Initialize the replay buffer
        self.buffer = ReplayBeffer(buffer_maxlen)

    def get_action(self, state):
        action = self.policy_net.action(state)
        # Rescale the squashed action from [-1, 1] to the environment's action range
        action = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
                 (self.action_range[1] + self.action_range[0]) / 2.0
        return action

    def update(self, batch_size):
        state, action, reward, next_state, done = self.buffer.sample(batch_size)
        new_action, log_prob = self.policy_net.evaluate(state)

        # V value loss
        value = self.value_net(state)
        new_q1_value = self.q1_net(state, new_action)
        new_q2_value = self.q2_net(state, new_action)
        next_value = torch.min(new_q1_value, new_q2_value) - log_prob
        value_loss = F.mse_loss(value, next_value.detach())

        # Soft q loss ('done' is assumed to act as a continuation mask, i.e. 0 at terminal states)
        q1_value = self.q1_net(state, action)
        q2_value = self.q2_net(state, action)
        target_value = self.target_value_net(next_state)
        target_q_value = reward + done * self.gamma * target_value
        q1_value_loss = F.mse_loss(q1_value, target_q_value.detach())
        q2_value_loss = F.mse_loss(q2_value, target_q_value.detach())

        # Policy loss
        policy_loss = (log_prob - torch.min(new_q1_value, new_q2_value)).mean()

        # Update V
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update Soft q
        self.q1_optimizer.zero_grad()
        self.q2_optimizer.zero_grad()
        q1_value_loss.backward()
        q2_value_loss.backward()
        self.q1_optimizer.step()
        self.q2_optimizer.step()

        # Update Policy
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Soft-update the target value network
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
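# A minimal training-loop sketch for the SAC class above, assuming a continuous-control
# Gym environment. The environment name, the hyperparameter values, and the buffer's
# push()/len() interface are assumptions for illustration, not part of the original code.
env = gym.make("Pendulum-v1")
agent = SAC(env, gamma=0.99, tau=0.01, buffer_maxlen=100000,
            value_lr=3e-3, q_lr=3e-3, policy_lr=3e-3)

batch_size = 128
for episode in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        # Assumed buffer API: store the transition, then update once enough samples exist
        agent.buffer.push((state, action, reward, next_state, done))
        if len(agent.buffer) > batch_size:
            agent.update(batch_size)
        state = next_state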
class Qlearning:
    def __init__(self):
        self.value_net = ValueNet(2)
        self.target_value_net = ValueNet(2)
        # self.value_net.load_state_dict(torch.load('./value_net.pkl'))
        torch.save(self.value_net.state_dict(), './value_net.pkl')
        self.target_value_net.load_state_dict(torch.load('./value_net.pkl'))
        self.episode = 0
        self.explore = 0.3
        self.buffer = []
        self.buffer_capacity = 20
        self.buffer_index = 0

    def value(self, board):
        board_np = np.array(board, dtype=np.float32)
        board_flat = board_np.flatten()
        board_tensor = torch.from_numpy(board_flat)
        values_tensor = self.value_net(board_tensor.detach())
        return values_tensor

    def target_value(self, board):
        board_np = np.array(board, dtype=np.float32)
        board_flat = board_np.flatten()
        board_tensor = torch.from_numpy(board_flat)
        target_values_tensor = self.target_value_net(board_tensor.detach())
        return target_values_tensor

    def action(self, board):
        # Epsilon-greedy action selection; the exploitation probability grows over time
        epsilon = random.random()
        if self.explore <= 0.9:
            self.explore *= 1.001
        print('explore = %.3f' % self.explore)
        if epsilon > self.explore:
            action_num = random.randint(0, 3)
            return action_num
        board_np = np.array(board, dtype=np.float32)
        board_flat = board_np.flatten()
        board_tensor = torch.from_numpy(board_flat)
        values_tensor = self.value(board_tensor.detach())
        values_np = values_tensor.detach().numpy()
        max_idx = np.argmax(values_np.tolist())
        return max_idx

    def train(self):
        # Incomplete in the original: randomly select samples as one batch,
        # then regress the value network toward the target values
        train_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        sample_size = 10  # self.buffer_capacity
        indices = np.random.choice(range(10), sample_size)
        sampler1 = torch.utils.data.SubsetRandomSampler(indices)
        print('sampler1 = ', sampler1)
        exit()
        # v, v_, reward and idx are expected to come from the sampled batch (not defined here)
        value_loss = self.value_net.loss_function(v, v_.detach(), reward)
        self.value_net.zero_grad()
        value_loss[idx].backward()
        self.value_net.opt_Adam.step()

    def update(self):
        self.episode += 1
        print('episode = ', self.episode)
        # Sync the target network from disk every other episode
        if self.episode % 2 == 0:
            torch.save(self.value_net.state_dict(), './value_net.pkl')
            self.target_value_net.load_state_dict(torch.load('./value_net.pkl'))
def main():
    config = Settings()
    # |TODO| go to Setting()
    train_filename = config.train_file
    # train_filename_1 = config.train_file_1
    # train_filename_2 = config.train_file_2
    test_filename = config.test_file
    dataset_path = os.path.join(os.getcwd(), config.path)

    if not os.path.exists(config.exp_dir):
        os.mkdir(config.exp_dir)
    model_dir = os.path.join(config.exp_dir, config.model_name)
    logger = SummaryWriter(model_dir)

    if config.data_type == 'success':
        # with open(os.path.join(dataset_path, train_filename), 'rb') as f:
        #     train_dataset = pickle.load(f)
        # with open(os.path.join(dataset_path, test_filename), 'rb') as f:
        #     test_dataset = pickle.load(f)

        dataset = glob.glob(f'{dataset_path}/{train_filename}/*.pickle')
        # test_dataset = glob.glob(f'{dataset_path}/{test_filename}/*.pickle')
        # train_dataset = dataset[:1500000]
        # test_dataset = dataset[-200000:]
        train_dataset = dataset[:-20000]
        test_dataset = dataset[-20000:]

        print('#trajectories of train_dataset:', len(train_dataset))
        print('#trajectories of test_dataset:', len(test_dataset))

    elif config.data_type == 'mcts':
        dataset = glob.glob(f'{dataset_path}/{train_filename}/*.pickle')
        train_dataset = dataset[:-20000]
        test_dataset = dataset[-20000:]
        # train_dataset = glob.glob(f'{dataset_path}/{train_filename}/*.pickle')
        # test_dataset = glob.glob(f'{dataset_path}/{test_filename}/*.pickle')

        if config.filter:
            filtered_data_train = []
            filtered_data_test = []
            total_reward_filt = []
            total_reward_not_filt = []
            avg_total_reward_not_filt = 0
            avg_total_reward_filt = 0
            for data in train_dataset:
                with open(data, 'rb') as f:
                    traj = pickle.load(f)
                avg_total_reward_not_filt += traj[-1]
                total_reward_not_filt.append(traj[-1])
                if traj[-1] > config.filter:
                    filtered_data_train.append(data)
                    avg_total_reward_filt += traj[-1]
                    total_reward_filt.append(traj[-1])

            for data in test_dataset:
                with open(data, 'rb') as f:
                    traj = pickle.load(f)
                if traj[-1] > config.filter:
                    filtered_data_test.append(data)

            total_reward_not_filt_std = np.std(np.asarray(total_reward_not_filt))
            total_reward_filt_std = np.std(np.asarray(total_reward_filt))
            print('Average of total reward(not filtered):', avg_total_reward_not_filt / len(train_dataset))
            print('std of total reward(not filtered):', total_reward_not_filt_std)
            print('Average of total reward(filtered):', avg_total_reward_filt / len(filtered_data_train))
            print('std of total reward(filtered):', total_reward_filt_std)

            train_dataset = filtered_data_train
            test_dataset = filtered_data_test
            print('#trajectories of train_dataset:', len(train_dataset))
            print('#trajectories of test_dataset:', len(test_dataset))

    # # For mixed dataset
    # train_dataset_1 = glob.glob(f'{dataset_path}/{train_filename_1}/*.pickle')
    # dataset_2 = glob.glob(f'{dataset_path}/{train_filename_2}/*.pickle')
    # train_dataset_2 = dataset_2[:100000]
    # test_dataset = dataset_2[100000:]
    # if config.filter:
    #     filtered_data_train = []
    #     filtered_data_test = []
    #     total_reward_filt = []
    #     total_reward_not_filt = []
    #     avg_total_reward_not_filt = 0
    #     avg_total_reward_filt = 0
    #     for data in train_dataset_2:
    #         with open(data, 'rb') as f:
    #             traj = pickle.load(f)
    #         avg_total_reward_not_filt += traj[-1]
    #         total_reward_not_filt.append(traj[-1])
    #         if traj[-1] > config.filter:
    #             filtered_data_train.append(data)
    #             avg_total_reward_filt += traj[-1]
    #             total_reward_filt.append(traj[-1])
    #     for data in test_dataset:
    #         with open(data, 'rb') as f:
    #             traj = pickle.load(f)
    #         if traj[-1] > config.filter:
    #             filtered_data_test.append(data)
    #     total_reward_not_filt_std = np.std(np.asarray(total_reward_not_filt))
    #     total_reward_filt_std = np.std(np.asarray(total_reward_filt))
    #     print('Average of total reward(not filtered):', avg_total_reward_not_filt/len(train_dataset_2))
    #     print('std of total reward(not filtered):', total_reward_not_filt_std)
    #     print('Average of total reward(filtered):', avg_total_reward_filt/len(filtered_data_train))
    #     print('std of total reward(filtered):', total_reward_filt_std)
    #     train_dataset = train_dataset_1 + filtered_data_train
    #     test_dataset = filtered_data_test
    # print('#trajectories of train_dataset:', len(train_dataset))
    # print('#trajectories of test_dataset:', len(test_dataset))

    # generate dataloader
    train_loader = get_loader(config, train_dataset)
    test_loader = get_loader(config, test_dataset)

    # model
    device = th.device(config.device)
    if config.model == 'GPT':
        model = GPT2(config).to(device)
    elif config.model == 'RNN':
        model = RNN(config).to(device)
    elif config.model == 'LSTM':
        model = LSTM(config).to(device)
    elif config.model == 'CVAE' or config.model == 'PolicyValueNet':
        model = CVAE(config).to(device)
    elif config.model == 'ValueNet':
        model = ValueNet(config).to(device)
    else:
        raise Exception(f'"{config.model}" is not supported!! You should select "GPT", "RNN", "LSTM", "CVAE", "ValueNet", or "PolicyValueNet".')

    # optimizer
    optimizer = th.optim.AdamW(model.parameters(),
                               lr=config.learning_rate,
                               weight_decay=config.weight_decay)

    # learning rate scheduler
    if config.optimizer == 'AdamW':
        scheduler = th.optim.lr_scheduler.LambdaLR(
            optimizer, lambda step: min((step + 1) / config.warmup_step, 1))
    elif config.optimizer == 'AdamWR':
        scheduler = CosineAnnealingWarmUpRestarts(optimizer=optimizer,
                                                  T_0=config.T_0,
                                                  T_mult=config.T_mult,
                                                  eta_max=config.lr_max,
                                                  T_up=config.warmup_step,
                                                  gamma=config.lr_mult)
    else:
        raise Exception(f'"{config.optimizer}" is not supported!! You should select "AdamW" or "AdamWR".')

    # Metric
    # |TODO| implement Chamfer distance
    if config.model == 'CVAE':
        loss_fn = ELBOLoss(config)
        eval_fn = ELBOLoss(config)
    elif config.model == 'ValueNet':
        loss_fn = RegressionLossValue(config)
        eval_fn = RegressionLossValue(config)
    elif config.model == 'PolicyValueNet':
        loss_fn = None
        eval_fn = None
    else:
        loss_fn = RegressionLossPolicy(config)
        eval_fn = RegressionLossPolicy(config)

    # Trainer & Evaluator
    trainer = Trainer(config=config,
                      loader=train_loader,
                      model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      loss_fn=loss_fn,
                      eval_fn=eval_fn)
    evaluator = Evaluator(config=config,
                          loader=test_loader,
                          model=model,
                          eval_fn=eval_fn)

    # save configuration
    config.save(model_dir + '/config.yaml')

    # Logging model graph
    dummy = next(iter(test_loader))
    for k in dummy:
        dummy[k].to(device).detach()
    logger.add_graph(ModelAsTuple(config, model), dummy)

    start_epoch = 1
    best_error = 10000.
    # load checkpoint for resuming
    if config.resume is not None:
        filename = os.path.join(model_dir, config.resume)
        if os.path.isfile(filename):
            start_epoch, best_error, model, optimizer, scheduler = load_checkpoint(
                config, filename, model, optimizer, scheduler)
            start_epoch += 1
            print("Loaded checkpoint '{}' (epoch {})".format(config.resume, start_epoch))
        else:
            raise Exception("No checkpoint found at '{}'".format(config.resume))

    # load checkpoint for pre-trained
    if config.pre_trained is not None:
        pre_trained_path = os.path.join(config.exp_dir, config.pre_trained)
        if os.path.isfile(pre_trained_path):
            start_epoch, best_error, model, optimizer, scheduler = load_checkpoint(
                config, pre_trained_path, model, optimizer, scheduler)
            start_epoch = 1
            print("Loaded checkpoint '{}'".format(config.pre_trained))
        else:
            raise Exception("No checkpoint found at '{}'".format(config.pre_trained))

    for epoch in range(start_epoch, config.epochs + 1):
        print(f'===== Start {epoch} epoch =====')

        # Training one epoch
        print("Training...")
        train_loss, train_val = trainer.train(epoch)

        # Logging
        if config.model == 'CVAE':
            logger.add_scalar('Loss(total)/train', train_loss['total'], epoch)
            logger.add_scalar('Loss(Reconstruction)/train', train_loss['Recon'], epoch)
            logger.add_scalar('Loss(KL_divergence)/train', train_loss['KL_div'], epoch)
        elif config.model == 'ValueNet':
            logger.add_scalar('Loss/train', train_loss['total'], epoch)
        elif config.model == 'PolicyValueNet':
            logger.add_scalar('Loss(total)/train', train_loss['total'], epoch)
            logger.add_scalar('Loss(action)/train', train_loss['action'], epoch)
            logger.add_scalar('Loss(accumulated reward)/train', train_loss['accumulated_reward'], epoch)
            # logger.add_scalar('Eval(action)/train', train_val['action'], epoch)
        else:
            logger.add_scalar('Loss(total)/train', train_loss['total'], epoch)
            logger.add_scalar('Loss(action)/train', train_loss['action'], epoch)
            # if config.use_reward:
            #     logger.add_scalar('Loss(reward)/train', train_loss['reward'], epoch)
            # logger.add_scalar('Eval(action)/train', train_val['action'], epoch)
            # if config.use_reward:
            #     logger.add_scalar('Eval(reward)/train', train_val['reward'], epoch)

        # |FIXME| debug for eff_grad: "RuntimeError: Boolean value of Tensor with more than one value is ambiguous"
        log_gradients(model, logger, epoch,
                      log_grad=config.log_grad,
                      log_param=config.log_para,
                      eff_grad=config.eff_grad,
                      print_num_para=config.print_num_para)

        # evaluating
        if epoch % config.test_eval_freq == 0:
            print("Validating...")
            test_val = evaluator.eval(epoch)

            # save the best model
            # |TODO| change 'action' to 'total' @ trainer.py & evaluator.py -> merge 'CVAE' & others
            if config.model == 'CVAE' or config.model == 'ValueNet' or config.model == 'PolicyValueNet':
                if test_val['total'] < best_error:
                    best_error = test_val['total']
                    save_checkpoint('Saving the best model!',
                                    os.path.join(model_dir, 'best.pth'),
                                    epoch, best_error, model, optimizer, scheduler)
            else:
                if test_val['action'] < best_error:
                    best_error = test_val['action']
                    save_checkpoint('Saving the best model!',
                                    os.path.join(model_dir, 'best.pth'),
                                    epoch, best_error, model, optimizer, scheduler)

            # Logging
            if config.model == 'CVAE':
                logger.add_scalar('Eval(total)/test', test_val['total'], epoch)
                logger.add_scalar('Eval(Reconstruction)/test', test_val['Recon'], epoch)
                logger.add_scalar('Eval(KL_divergence)/test', test_val['KL_div'], epoch)
            elif config.model == 'ValueNet':
                logger.add_scalar('Eval/test', test_val['total'], epoch)
            elif config.model == 'PolicyValueNet':
                logger.add_scalar('Eval(total)/test', test_val['total'], epoch)
                logger.add_scalar('Eval(action)/test', test_val['action'], epoch)
                logger.add_scalar('Eval(accumulated reward)/test', test_val['accumulated_reward'], epoch)
            else:
                logger.add_scalar('Eval(action)/test', test_val['action'], epoch)
                # if config.use_reward:
                #     logger.add_scalar('Eval(reward)/test', test_val['reward'], epoch)

        # save the model
        if epoch % config.save_freq == 0:
            save_checkpoint('Saving...',
                            os.path.join(model_dir, f'ckpt_epoch_{epoch}.pth'),
                            epoch, best_error, model, optimizer, scheduler)

        print(f'===== End {epoch} epoch =====')
import gym
import torch
from torch import optim, distributions
import torch.nn.functional as F

env = gym.make("CartPole-v1")
# observation = env.reset()
# print(observation)
# print(env.observation_space)

MAXSTEP = 100
BATCHSIZE = 16
EPOCH = 1000
GAMMA = 0.99

policy_net = PolicyNet()
value_net = ValueNet()
policy_net.cuda()
value_net.cuda()
opt1 = optim.Adam(policy_net.parameters(), lr=1e-3)
opt2 = optim.Adam(value_net.parameters(), lr=1e-3)


# train one epoch
def train_step():
    observ_batch = []
    reward_batch = []
    action_batch = []
    mask_batch = []
def main():
    # Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    # Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    beta = 0.1
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    # Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time()))
        for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti)

    # Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_real, a_real = pkl.load(open(expert_path, "rb"))
        sa_real = []

        if args.conti:
            for i in range(len(s_real)):
                sa_real.append(np.concatenate([s_real[i], a_real[i]], 1))
        else:
            for i in range(len(s_real)):
                a_real_onehot = np.zeros((len(a_real[i]), a_dim), dtype=np.float32)
                for j in range(len(a_real[i])):
                    a_real_onehot[j, a_real[i][j]] = 1
                sa_real.append(np.concatenate([s_real[i], a_real_onehot], 1))

        sa_real = np.concatenate(sa_real, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    # Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    dis_net = DiscriminatorNet(s_dim + a_dim).to(device)
    agent = PPO(policy_net, value_net, dis_net, a_dim, beta, lr, max_grad_norm,
                ent_weight, clip_val, sample_n_epoch, sample_mb_size, mb_size,
                device=device, conti=args.conti)

    # Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    # Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        # Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net, dis_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        # Train
        pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
            policy_net, value_net, dis_net, mb_obs, mb_actions, mb_values,
            mb_advs, mb_returns, mb_old_a_logps, sa_real)

        # Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_true_return, std_true_return, mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("dis loss = {:.6f}".format(dis_loss))
            print("entropy = {:.6f}".format(ent))
            print("avg_kl = {:.6f}".format(avg_kl))
            print("beta = {:.6f}".format(agent.beta))
            print("mean true return = {:.6f}".format(mean_true_return))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print("dis_real = {:.3f}".format(dis_real))
            print("dis_fake = {:.3f}".format(dis_fake))
            print()

        # Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "beta": agent.beta,
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict(),
                "DiscriminatorNet": dis_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
class A3CGlobal:
    def __init__(self, config):
        self.config = config
        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)
        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # Compute discounted returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at each time step
    def train_model(self, states, actions, rewards, next_states, done):
        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)
        # Compute returns
        returns = self.get_returns(rewards, done, next_values[-1])
        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # Update the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # Update the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
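# A minimal sketch of how A3CLocal and A3CGlobal above might interact in one worker
# step, assuming a CartPole-style Gym environment. The helper name worker_step and
# the n-step rollout length are illustrative assumptions, not part of the original code.
def worker_step(env, global_agent, local_agent, n_steps=20):
    # Sync the local networks from the global networks
    local_agent.update_local_model(global_agent.actor.state_dict(),
                                   global_agent.critic.state_dict())
    state = np.reshape(env.reset(), [1, -1])
    done = False
    for _ in range(n_steps):
        action = local_agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, -1])
        local_agent.append_replay(state, action, reward, next_state)
        state = next_state
        if done:
            break
    # Train the global networks on the locally collected rollout
    states, actions, rewards, next_states = local_agent.get_replay()
    return global_agent.train_model(states, actions, rewards, next_states, done)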
def learn(i):
    boards, values = zip(*data_buffer)
    value_net = ValueNet(rl_model_filepath, rl_step - 1)
    value_net.learn(boards, values, epochs=epochs, batch_size=batch_size)
class DQNAgent:
    def __init__(self, config):
        self.config = config
        self.epsilon = config.epsilon
        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)
        # Create the value network
        self.model = ValueNet(self.config.n_state, self.config.n_action)
        self.model.to(device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.learning_rate)

    # Select an action epsilon-greedily from the value network's output
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.config.n_action)
        else:
            state = torch.tensor(state, dtype=torch.float).to(self.config.device)
            output = self.model(state)
            return output.argmax().item()

    # Append a transition to the replay memory
    def append_replay(self, state, action, reward, next_state, done):
        self.replay_memory.append((state, action, reward, next_state, done))

    # Update the value network at each time step
    def train_model(self):
        # Decay the exploration rate as training proceeds
        if self.epsilon > self.config.epsilon_min:
            self.epsilon *= self.config.epsilon_decay

        # Arrange a sampled mini-batch as arrays
        replay_memory = np.array(random.sample(self.replay_memory, self.config.n_batch))
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])
        dones = list(replay_memory[:, 4])

        states = torch.tensor(states, dtype=torch.float).to(device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(device)
        targets = self.model(states)
        next_values = self.model(next_states)

        for i in range(len(targets)):
            if dones[i]:
                targets[i][actions[i]] = rewards[i]  # Vt = Rt+1
            else:
                targets[i][actions[i]] = rewards[i] + self.config.discount_factor * (
                    torch.max(next_values[i]))  # Vt = Rt+1 + rVt+1

        loss = self.train_value(states, targets)
        return loss

    # Update the value network
    def train_value(self, states, targets):
        values = self.model(states)
        loss = torch.mean(torch.pow(targets - values, 2))

        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()

        return loss.item()

    # Save the model's weights to a file
    def save(self):
        torch.save(self.model.state_dict(), self.config.save_file)

    # Load the model's weights from a file
    def load(self):
        self.model.load_state_dict(torch.load(self.config.save_file))

    # Release GPU memory
    def close(self):
        del self.model
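# A minimal training-loop sketch for DQNAgent above, assuming a CartPole-style Gym
# environment. Config is a hypothetical holder for the fields the class uses
# (n_state, n_action, n_batch, epsilon, ...); the environment name and episode
# count are likewise assumptions for illustration only.
config = Config()  # hypothetical config object
env = gym.make("CartPole-v1")
agent = DQNAgent(config)

for episode in range(500):
    state = np.reshape(env.reset(), [1, config.n_state])
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, config.n_state])
        agent.append_replay(state, action, reward, next_state, done)
        # Start learning once the replay memory holds at least one mini-batch
        if len(agent.replay_memory) >= config.n_batch:
            agent.train_model()
        state = next_state

agent.save()
agent.close()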
# Turn on pyplot's interactive mode
# VERY IMPORTANT because otherwise the training stats plot will halt
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes, action_lim).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)                 # Value network

# Set up memory
memory = Memory(capacity, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)
valuenet_optimizer = optim.Adam(value_net.parameters(), lr=valuenet_lr)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
def main():
    # Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    # Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"

    # Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i, env_id=args.env, unwrap=args.unwrap, rand_seed=int(time.time()))
        for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env, s_dim, a_dim, n_step, gamma, lamb, device=device, conti=args.conti)

    # Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net, value_net, lr, max_grad_norm, ent_weight, clip_val,
                sample_n_epoch, sample_mb_size, mb_size, device=device)

    # Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    # Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        # Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        # Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs, mb_actions,
                                           mb_values, mb_advs, mb_returns, mb_old_a_logps)

        # Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print()

        # Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
class A2CAgent:
    def __init__(self, config):
        self.config = config
        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)
        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)
        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # Sample an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append a transition to the replay memory
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute discounted returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at each time step
    def train_model(self, done):
        # Arrange the collected history as arrays and clear it
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)
        # Compute returns
        returns = self.get_returns(rewards, done, next_values[-1])
        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # Update the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # Update the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # Save the models' weights to files
    def save(self):
        torch.save(self.actor.state_dict(), self.config.save_file + ".actor")
        torch.save(self.critic.state_dict(), self.config.save_file + ".critic")

    # Load the models' weights from files
    def load(self):
        self.actor.load_state_dict(torch.load(self.config.save_file + ".actor"))
        self.critic.load_state_dict(torch.load(self.config.save_file + ".critic"))

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
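# A minimal episode-loop sketch for A2CAgent above, assuming a CartPole-style Gym
# environment. Config is a hypothetical holder for the fields the class uses, and
# the environment name, episode count, and end-of-episode update schedule are
# assumptions for illustration only.
config = Config()  # hypothetical config object
env = gym.make("CartPole-v1")
agent = A2CAgent(config)

for episode in range(1000):
    state = np.reshape(env.reset(), [1, config.n_state])
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, config.n_state])
        agent.append_replay(state, action, reward, next_state)
        state = next_state
    # Update both networks from the rollout collected during the episode
    actor_loss, critic_loss = agent.train_model(done)

agent.save()
agent.close()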
def main():
    # Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    # Parameters
    #----------------------------
    clip_val = 0.2
    sample_mb_size = 64
    sample_n_epoch = 4
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.01
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 10000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Create environment
    #----------------------------
    env = gym.make(args.env)

    if args.discrete:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    runner = EnvRunner(s_dim, a_dim, gamma, lamb, max_step=2048, device=device, conti=not args.discrete)

    # Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net, value_net, lr, max_grad_norm, ent_weight, clip_val,
                sample_n_epoch, sample_mb_size, device=device)

    # Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    # Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()
    mean_total_reward = 0
    mean_length = 0

    for it in range(start_it, n_iter):
        # Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run(
                env, policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        # Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs, mb_actions,
                                           mb_values, mb_advs, mb_returns, mb_old_a_logps)
        mean_total_reward += mb_rewards.sum()
        mean_length += len(mb_obs)
        print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format(
            it, mb_rewards.sum(), len(mb_obs)))

        # Print the result
        if it % disp_step == 0:
            print("\n[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Elapsed time = {:.2f} sec".format(time.time() - t_start))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_total_reward / disp_step))
            print("mean length = {:.2f}".format(mean_length / disp_step))
            print()

            agent.lr_decay(it, n_iter)
            mean_total_reward = 0
            mean_length = 0

        # Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict(),
                "ValueNet": value_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()