def __init__(self, meta_controller_experience_memory=None,
             lr=0.00025, alpha=0.95, eps=0.01,
             batch_size=32, gamma=0.99, num_options=12):
    # experience replay memory
    self.meta_controller_experience_memory = meta_controller_experience_memory
    self.lr = lr          # learning rate
    self.alpha = alpha    # optimizer parameter
    self.eps = eps        # optimizer parameter
    self.gamma = gamma    # discount factor

    # BUILD MODEL
    USE_CUDA = torch.cuda.is_available()
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        self.device = torch.device("cuda:1")
    elif torch.cuda.device_count() == 1:
        self.device = torch.device("cuda:0")
    else:
        self.device = torch.device("cpu")

    dfloat_cpu = torch.FloatTensor
    dfloat_gpu = torch.cuda.FloatTensor
    dlong_cpu = torch.LongTensor
    dlong_gpu = torch.cuda.LongTensor
    duint_cpu = torch.ByteTensor
    duint_gpu = torch.cuda.ByteTensor

    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
    dlongtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
    duinttype = torch.cuda.ByteTensor if torch.cuda.is_available() else torch.ByteTensor
    self.dtype = dtype
    self.dlongtype = dlongtype
    self.duinttype = duinttype

    # online network Q and frozen target network Q_t
    Q = DQN(in_channels=4, num_actions=num_options).type(dtype)
    Q_t = DQN(in_channels=4, num_actions=num_options).type(dtype)
    Q_t.load_state_dict(Q.state_dict())
    Q_t.eval()
    for param in Q_t.parameters():
        param.requires_grad = False

    Q = Q.to(self.device)
    Q_t = Q_t.to(self.device)

    self.batch_size = batch_size
    self.Q = Q
    self.Q_t = Q_t

    # optimizer
    optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha, eps=eps)
    self.optimizer = optimizer
    print('init: Meta Controller --> OK')
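# --- Added illustration (not part of the original file) ---
# A minimal sketch of how the meta-controller built above could pick an option
# epsilon-greedily. `meta` is assumed to be an instance of the class whose
# __init__ is shown above (exposing .Q and .device); the (4, 84, 84) state shape
# and the helper name `select_option` are assumptions for illustration only.
import numpy as np
import torch


def select_option(meta, state, num_options, epsilon=0.1):
    """Pick an option index epsilon-greedily from the meta-controller's Q-network."""
    if np.random.rand() < epsilon:
        return np.random.randint(num_options)
    with torch.no_grad():
        s = torch.from_numpy(state).float().unsqueeze(0).to(meta.device)
        return int(meta.Q(s).argmax(dim=1).item())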
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyperparameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 500000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)
        self.target_net = DQN(action_size)
        self.target_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=scheduler_step_size,
                                                   gamma=scheduler_gamma)

        # Initialize the target network to the policy net
        ### CODE ###
        self.update_target_net()

    def load_policy_net(self, path):
        self.policy_net = torch.load(path)

    # after some time interval update the target net to be same with policy net
    def update_target_net(self):
        ### CODE ###
        self.target_net.load_state_dict(self.policy_net.state_dict())

    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE #### (copy over from agent.py!)
            # explore: random action
            return torch.tensor([[random.randrange(self.action_size)]],
                                device=device, dtype=torch.long)
        else:
            ### CODE #### (copy over from agent.py!)
            # exploit: greedy action from the policy net
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).cuda()
                return self.policy_net(state).max(1)[1].view(1, 1)

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        next_states = torch.from_numpy(next_states).cuda()
        dones = mini_batch[3]  # checks if the game is over
        # non-final mask: True where the episode has NOT terminated
        mask = torch.tensor(list(map(int, dones == False)), dtype=torch.bool, device=device)

        # Your agent.py code here with double DQN modifications
        ### CODE ###

        # Compute Q(s_t, a), the Q-value of the current state
        ### CODE ####
        state_action_values = self.policy_net(states).gather(1, actions.unsqueeze(1))

        # Compute Q function of next state: zero for terminal transitions,
        # otherwise the target net's maximum Q-value
        ### CODE ####
        next_state_values = torch.zeros(states.size(0), device=device)
        non_final_next_states = next_states[mask]
        next_state_values[mask] = self.target_net(non_final_next_states).max(1)[0].detach()

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.discount_factor) + rewards

        # Compute the Huber Loss
        ### CODE ####
        loss = F.smooth_l1_loss(state_action_values.squeeze(1), expected_state_action_values)

        # Optimize the model, .step() both the optimizer and the scheduler!
        ### CODE ####
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyperparameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 1000000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=scheduler_step_size,
                                                   gamma=scheduler_gamma)

        # Initialize a target network and initialize the target network to the policy net
        ### CODE ###
        self.target_net = DQN(action_size).to(device)
        self.update_target_net()
        self.target_net.eval()

    def load_policy_net(self, path):
        self.policy_net = torch.load(path)

    # after some time interval update the target net to be same with policy net
    def update_target_net(self):
        ### CODE ###
        self.target_net.load_state_dict(self.policy_net.state_dict())

    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE #### (copy over from agent.py!)
            # explore: random action
            a = torch.tensor([[random.randrange(self.action_size)]],
                             device=device, dtype=torch.long)
        else:
            ### CODE #### (copy over from agent.py!)
            # exploit: greedy action from the policy net
            with torch.no_grad():
                state = torch.from_numpy(state).reshape(1, 4, 84, 84).to(device)
                a = self.policy_net(state).max(1)[1].view(1, 1)
        return a

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        dones = mini_batch[3]  # checks if the game is over
        # non-final mask: True where the episode has NOT terminated
        mask = torch.tensor(list(map(int, dones == False)), dtype=torch.bool, device=device)

        # Your agent.py code here with double DQN modifications
        ### CODE ###
        # Q(s_t, a) for the actions actually taken
        curr_Q = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # V(s_{t+1}): zero for terminal transitions, otherwise the target net's max Q-value
        next_state_values = torch.zeros(32, device=device)
        next_states = torch.from_numpy(next_states).to(device)
        next_state_values[mask] = self.target_net(next_states[mask]).max(1)[0].detach()
        # Double DQN alternative: select the action with the policy net, evaluate it with the target net
        # next_state_values[mask] = self.target_net(next_states[mask]).detach().gather(1, self.policy_net(next_states[mask]).argmax(1).unsqueeze(1)).squeeze(1)

        target_Q = next_state_values * self.discount_factor + rewards

        # Huber loss between predicted and expected Q-values
        loss = F.smooth_l1_loss(curr_Q, target_Q)

        self.optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 10)
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()
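# --- Added illustration (not part of the original file) ---
# A minimal sketch of the Double DQN target that the commented-out line above
# computes: the policy net selects the next action, the target net evaluates it.
# The standalone function name and argument layout are assumptions; shapes follow
# the batch layout used in train_policy_net above.
import torch


def double_dqn_next_values(policy_net, target_net, next_states, mask, batch_size, device):
    """Return V(s') per transition, zero where the episode terminated (mask is False)."""
    next_state_values = torch.zeros(batch_size, device=device)
    with torch.no_grad():
        # action selection with the online (policy) network
        best_actions = policy_net(next_states[mask]).argmax(dim=1, keepdim=True)
        # action evaluation with the target network
        next_state_values[mask] = target_net(next_states[mask]).gather(1, best_actions).squeeze(1)
    return next_state_values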
def __init__(self, experience_memory=None,
             lr=0.00025, alpha=0.95, eps=0.01,
             batch_size=32, gamma=0.99,
             load_pretrained=False, saved_model_path='./models/a.model'):
    self.experience_memory = experience_memory  # experience replay memory
    self.lr = lr          # learning rate
    self.alpha = alpha    # optimizer parameter
    self.eps = eps        # optimizer parameter
    self.gamma = gamma    # discount factor

    # BUILD MODEL
    USE_CUDA = torch.cuda.is_available()
    if torch.cuda.is_available():
        self.device = torch.device("cuda:0")
    else:
        self.device = torch.device("cpu")

    dfloat_cpu = torch.FloatTensor
    dfloat_gpu = torch.cuda.FloatTensor
    dlong_cpu = torch.LongTensor
    dlong_gpu = torch.cuda.LongTensor
    duint_cpu = torch.ByteTensor
    duint_gpu = torch.cuda.ByteTensor

    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
    dlongtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
    duinttype = torch.cuda.ByteTensor if torch.cuda.is_available() else torch.ByteTensor
    self.dtype = dtype
    self.dlongtype = dlongtype
    self.duinttype = duinttype

    # online network Q (optionally restored from a checkpoint) and frozen target network Q_t
    Q = DQN(in_channels=5, num_actions=18).type(dtype)
    if load_pretrained:
        Q.load_state_dict(torch.load(saved_model_path))
    Q_t = DQN(in_channels=5, num_actions=18).type(dtype)
    Q_t.load_state_dict(Q.state_dict())
    Q_t.eval()
    for param in Q_t.parameters():
        param.requires_grad = False

    Q = Q.to(self.device)
    Q_t = Q_t.to(self.device)

    # if torch.cuda.device_count() > 0:
    #     Q = nn.DataParallel(Q).to(self.device)
    #     Q_t = nn.DataParallel(Q_t).to(self.device)
    #     batch_size = batch_size * torch.cuda.device_count()
    # else:
    #     batch_size = batch_size

    self.batch_size = batch_size
    self.Q = Q
    self.Q_t = Q_t

    # optimizer
    optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha, eps=eps)
    self.optimizer = optimizer
    print('init: Controller --> OK')
class Agent(object):
    def __init__(self, args, obs):
        self.net = DQN(args.n_obs, args.n_action)
        self.target_net = DQN(args.n_obs, args.n_action)
        if os.path.isfile('./weights/ckpt.pth'):
            self.net.load_state_dict(torch.load('./weights/ckpt.pth'))
            self.target_net.load_state_dict(torch.load('./weights/ckpt.pth'))

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.state_preproc = StatePreproc(self.device)

        self.n_action = args.n_action
        self.gamma = args.gamma
        self.max_grad_norm = args.max_grad_norm
        self.num_procs = args.num_procs

        self.memory = ReplayBuffer(args)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=args.lr, betas=(0.9, 0.99))
        self.criterion = torch.nn.MSELoss()

        # log
        self.log_episode_rewards = torch.zeros(self.num_procs, device=self.device, dtype=torch.float)
        self.episode_rewards = deque([0] * 100, maxlen=100)
        self.episode = 1

        self.init(obs)

        # eval
        self.test_episode = args.test_episode

    def init(self, obs):
        self.net.to(self.device)
        self.target_net.to(self.device)
        self.obs_tensor = self.state_preproc(obs)  # size: [num_proc, 4, height, width]

    def act(self, obs, epsilon):
        if random.random() > epsilon:
            with torch.no_grad():
                q_vals = self.net(obs)
                action = q_vals.argmax(dim=1)
        else:
            action = torch.tensor(np.random.randint(0, self.n_action, size=obs.shape[0]),
                                  device=self.device, dtype=torch.int64)
        return action

    def collect_experiences(self, env, num_frames, epsilon):
        for i in range(num_frames):
            actions = self.act(self.obs_tensor, epsilon)  # size: [num_proc]
            next_obs, rewards, dones, _ = env.step(actions.cpu().numpy())
            next_obs_tensor = self.state_preproc(next_obs)
            rewards_tensor = torch.tensor(rewards, device=self.device, dtype=torch.float)  # size: [num_proc]
            # 1 where the episode continues, 0 where it terminated; size: [num_proc]
            dones_tensor = 1 - torch.tensor(dones, device=self.device, dtype=torch.float)

            self.memory.add(self.obs_tensor, actions, rewards_tensor, next_obs_tensor, dones_tensor)
            self.obs_tensor = next_obs_tensor

            # for log
            self.log_episode_rewards += rewards_tensor
            for i, done in enumerate(dones):
                if done:
                    self.episode_rewards.append(self.log_episode_rewards[i].item())
                    self.log_episode_rewards[i] = 0
                    self.episode += 1

        log = {
            'episode': self.episode,
            'average_reward': np.mean(self.episode_rewards)
        }
        return log

    def improve_policy(self, update_times):
        for _ in range(update_times):
            states, acts, rewards, next_states, dones = self.memory.sample()
            with torch.no_grad():
                # next_states size: [batch_size * num_proc, h, w, channel]
                q_vals = self.target_net(next_states)
                # `dones` holds (1 - done), so terminal transitions do not bootstrap
                target_max_q = rewards + self.gamma * torch.max(q_vals, 1)[0] * dones
            curr_q_vals = self.net(states)
            curr_max_q = curr_q_vals.gather(1, acts.unsqueeze(1)).squeeze(1)
            # actions = torch.zeros([acts.shape[0], self.n_action], device=self.device, dtype=torch.float)
            # for i, act in enumerate(acts):
            #     actions[i][act.item()] = 1.0
            # curr_max_q = curr_q_vals * actions

            loss = self.criterion(curr_max_q, target_max_q)
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
            self.optimizer.step()

        info = {'value': curr_max_q.mean().item(), 'loss': loss.item()}
        return info

    def update_target_net(self):
        for target_param, param in zip(self.target_net.parameters(), self.net.parameters()):
            target_param.data.copy_(param.data)

    def save_weights(self):
        torch.save(self.net.state_dict(), './weights/ckpt.pth')

    def evaluate(self, env):
        self.net.eval()
        episode_return_list = []
        for i in range(self.test_episode):
            seed = np.random.randint(0, 0xFFFFFF)
            env.seed(seed)
            obs = env.reset()
            done = False
            episode_return = 0
            while not done:
                obs_tensor = self.state_preproc([obs])
                action = self.act(obs_tensor, 0.0)
                obs, reward, done, _ = env.step(action.cpu().numpy())
                episode_return += reward
            episode_return_list.append(episode_return)

        info = {'average_return': np.mean(episode_return_list)}
        self.net.train()
        return info

    def display(self, env):
        self.net.eval()
        seed = np.random.randint(0, 0xFFFFFF)
        env.seed(seed)
        obs = env.reset()

        need_key = True
        episode = 0
        episode_return = 0
        print('`Enter`: next step\n`E`: Run until end-of-episode\n`Q`: Quit')
        while True:
            if need_key:
                key = input('Press key:')
                if key == 'q':  # quit
                    break
                if key == 'e':  # run until end-of-episode
                    need_key = False

            env.render()
            obs_tensor = self.state_preproc([obs])
            action = self.act(obs_tensor, 0.0).squeeze(0)
            obs, reward, done, _ = env.step(action.cpu().numpy())
            episode_return += reward

            if not need_key:
                time.sleep(0.1)

            if done:
                episode += 1
                obs = env.reset()
                print('episode: {}, episode_return: {}'.format(episode, episode_return))
                episode_return = 0
                need_key = True

        self.net.train()
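# --- Added illustration (not part of the original file) ---
# A minimal sketch of how the methods above are typically wired together in an
# outer training loop. The frame counts, epsilon schedule, and update intervals
# here are illustrative assumptions, not values taken from the original script.
def train(agent, env, total_updates=10000, frames_per_update=4,
          update_times=1, target_sync_interval=1000):
    epsilon = 1.0
    for step in range(total_updates):
        epsilon = max(0.05, epsilon - 1e-4)                 # assumed linear decay
        log = agent.collect_experiences(env, frames_per_update, epsilon)
        info = agent.improve_policy(update_times)
        if step % target_sync_interval == 0:
            agent.update_target_net()                       # periodic target sync
            agent.save_weights()
        if step % 100 == 0:
            print(step, log['average_reward'], info['loss'])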
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyperparameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 500000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=scheduler_step_size,
                                                   gamma=scheduler_gamma)

        # Initialize a target network and initialize the target network to the policy net
        ### CODE ###
        self.target_net = DQN(action_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.to(device)

    def load_policy_net(self, path):
        self.policy_net = torch.load(path)

    # after some time interval update the target net to be same with policy net
    def update_target_net(self):
        ### CODE ###
        self.target_net.load_state_dict(self.policy_net.state_dict())

    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE ####
            # Choose a random action
            a = torch.tensor([[random.randrange(self.action_size)]],
                             device=device, dtype=torch.long)
        else:
            ### CODE ####
            # Choose the best action
            state = torch.from_numpy(state).unsqueeze(0).cuda()
            with torch.no_grad():
                a = self.policy_net(state).max(1)[1].view(1, 1)
        return a

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        dones = mini_batch[3]  # checks if the game is over
        # non-final mask: True where the episode has NOT terminated
        mask = torch.tensor(list(map(int, dones == False)), dtype=torch.bool, device=device)

        # Compute Q(s_t, a), the Q-value of the current state
        state_action_values = self.policy_net(states).gather(1, actions.unsqueeze(1))

        # Compute Q function of next state: zero for terminal transitions
        next_states = torch.from_numpy(next_states).cuda()
        non_final_next_states = next_states[mask]
        next_state_values = torch.zeros(states.size(0), device=device)

        # Find maximum Q-value of action at next state from the target net
        net_outputs = self.target_net(non_final_next_states)
        next_state_values[mask] = net_outputs.max(1)[0].detach()

        # Compute the Huber Loss
        expected_state_action_values = rewards + self.discount_factor * next_state_values
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model, .step() both the optimizer and the scheduler!
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()
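# --- Added illustration (not part of the original file) ---
# A minimal sketch of how self.train_start and self.update_target are usually
# consumed by the outer loop for this kind of agent. The environment interface,
# the memory.push call, and the function name are assumptions for illustration;
# the original ReplayMemory only shows a sample_mini_batch method.
def run_episode(agent, env, state, frame):
    """Play one episode, training after the warm-up period and syncing the target net periodically."""
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action.item())
        agent.memory.push(state, action, reward, done)       # assumed ReplayMemory API
        frame += 1
        if frame >= agent.train_start:                       # warm-up before learning
            agent.train_policy_net(frame)
        if frame % agent.update_target == 0:                 # periodic target sync
            agent.update_target_net()
        state = next_state
    return frame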
def __init__(self, experience_memory=None, num_actions=4,
             lr=0.00025, alpha=0.95, eps=0.01,
             batch_size=32, gamma=0.99,
             load_pretrained=False, saved_model_path='./models/a.model',
             optim_method='RMSprop', use_multiple_gpu=True):
    self.experience_memory = experience_memory  # experience replay memory
    self.lr = lr          # learning rate
    self.alpha = alpha    # optimizer parameter
    self.eps = eps        # optimizer parameter
    self.gamma = gamma    # discount factor
    self.num_actions = num_actions
    self.use_multiple_gpu = use_multiple_gpu
    self.loss_list = []
    self.L = 0.0

    # BUILD MODEL
    if torch.cuda.is_available():
        self.device = torch.device("cuda:0")
    else:
        self.device = torch.device("cpu")

    dfloat_cpu = torch.FloatTensor
    dfloat_gpu = torch.cuda.FloatTensor
    dlong_cpu = torch.LongTensor
    dlong_gpu = torch.cuda.LongTensor
    duint_cpu = torch.ByteTensor
    duint_gpu = torch.cuda.ByteTensor

    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
    dlongtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
    duinttype = torch.cuda.ByteTensor if torch.cuda.is_available() else torch.ByteTensor
    self.dtype = dtype
    self.dlongtype = dlongtype
    self.duinttype = duinttype

    # online network Q (optionally restored from a checkpoint) and frozen target network Q_t
    Q = DQN(in_channels=4, num_actions=num_actions).type(dtype)
    if load_pretrained:
        Q.load_state_dict(torch.load(saved_model_path))
    Q_t = DQN(in_channels=4, num_actions=num_actions).type(dtype)
    Q_t.load_state_dict(Q.state_dict())
    Q_t.eval()
    for param in Q_t.parameters():
        param.requires_grad = False

    Q = Q.to(self.device)
    Q_t = Q_t.to(self.device)
    if torch.cuda.device_count() > 1 and self.use_multiple_gpu:
        Q = nn.DataParallel(Q).to(self.device)
        Q_t = nn.DataParallel(Q_t).to(self.device)

    self.batch_size = batch_size
    self.Q = Q
    self.Q_t = Q_t

    # optimizer
    if optim_method == 'SGD':
        optimizer = torch.optim.SGD(Q.parameters(), lr=self.lr)
    elif optim_method == 'RMSprop':
        optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha, eps=eps)
    else:
        optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha, eps=eps)
    self.optimizer = optimizer
    print('init: Controller --> OK')
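# --- Added illustration (not part of the original file) ---
# A minimal sketch of a single TD update for the controller built above, using
# the members it defines (self.Q, self.Q_t, self.gamma, self.optimizer). The
# standalone function name and argument layout are assumptions; batch tensors are
# assumed to already live on ctrl.device with float states and long actions.
import torch


def td_update(ctrl, states, actions, rewards, next_states, non_terminal_mask):
    """One Q-learning step; `non_terminal_mask` is 1.0 where the episode continues, 0.0 otherwise."""
    # Q(s, a) for the actions actually taken
    q_sa = ctrl.Q(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # bootstrap from the frozen target network
        next_v = ctrl.Q_t(next_states).max(dim=1)[0]
    target = rewards + ctrl.gamma * next_v * non_terminal_mask
    loss = torch.nn.functional.smooth_l1_loss(q_sa, target)
    ctrl.optimizer.zero_grad()
    loss.backward()
    ctrl.optimizer.step()
    return loss.item()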
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyperparameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 500000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=scheduler_step_size,
                                                   gamma=scheduler_gamma)

    def load_policy_net(self, path):
        self.policy_net = torch.load(path)

    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE ####
            # Choose a random action
            a = torch.tensor([[random.randrange(self.action_size)]],
                             device=device, dtype=torch.long)
        else:
            ### CODE ####
            # Choose the best action
            with torch.no_grad():
                state = torch.from_numpy(state).reshape(1, 4, 84, 84).to(device)
                a = self.policy_net(state).max(1)[1].view(1, 1)
        return a

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        dones = mini_batch[3]  # checks if the game is over
        # non-final mask: True where the episode has NOT terminated
        mask = torch.tensor(list(map(int, dones == False)), dtype=torch.bool, device=device)

        # Compute Q(s_t, a), the Q-value of the current state
        ### CODE ####
        curr_Q = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q function of next state (this agent has no separate target net,
        # so the maximum Q-value at the next state comes from the policy net itself)
        ### CODE ####
        next_state_values = torch.zeros(32, device=device)
        next_states = torch.from_numpy(next_states).to(device)
        next_state_values[mask] = self.policy_net(next_states[mask]).max(1)[0].detach()

        ### CODE ####
        target_Q = next_state_values * self.discount_factor + rewards

        # Compute the Huber Loss
        ### CODE ####
        loss = F.smooth_l1_loss(curr_Q, target_Q)

        # Optimize the model, .step() both the optimizer and the scheduler!
        ### CODE ####
        self.optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 10)
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()