import pickle

import gym
import torch
import torch.optim as optim


def initialize(game, model_name, warm_start):
    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    capacity = int(1e4)

    # Cold start: build a fresh model, optimizer, and replay memory
    if not warm_start:
        model = DQN(in_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(), lr=1.0e-4,
                                  weight_decay=0.01)
        memory_buffer = ReplayMemory(capacity)

        # Initialize statistics
        running_reward = None
        running_rewards = []

    # Warm start: restore statistics, model, optimizer, and replay memory
    # from disk, falling back to a cold start if the files are missing
    else:
        data_file = 'results/{}_{}.p'.format(game, model_name)
        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
            running_reward = running_rewards[-1]
            prior_eps = len(running_rewards)
            model_file = 'saved_models/{}_{}_ep_{}.p'.format(
                game, model_name, prior_eps)
            with open(model_file, 'rb') as f:
                saved_model = pickle.load(f)
            model, optimizer, memory_buffer = saved_model
        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = DQN(in_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(), lr=1.0e-4,
                                      weight_decay=0.01)
            memory_buffer = ReplayMemory(capacity)
            running_reward = None
            running_rewards = []

    cuda = torch.cuda.is_available()
    if cuda:
        model = model.cuda()
    criterion = torch.nn.MSELoss()

    return (env, model, optimizer, criterion, memory_buffer, cuda,
            running_reward, running_rewards)
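# NOTE: `initialize` above assumes a `DQN` network and a `ReplayMemory`
# class defined elsewhere. A minimal sketch of what they might look like;
# the layer sizes follow the standard Atari DQN architecture (84x84 stacked
# frames), but the originals may differ:

import random
from collections import deque

import torch.nn as nn


class DQN(nn.Module):
    """Convolutional Q-network mapping stacked frames to action values."""

    def __init__(self, in_channels=4, num_actions=4):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),  # 7x7 assumes 84x84 input
            nn.Linear(512, num_actions),
        )

    def forward(self, x):
        return self.net(x)


class ReplayMemory:
    """Fixed-capacity transition buffer with uniform random sampling."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, transition):
        self.buffer.append(transition)

    def get_batch(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)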
import numpy as np
import torch
import torch.nn as nn

# DQN, Memory, and the hyperparameter constants (EPSILON, GAMMA, etc.)
# are defined elsewhere in the module.


class Agent:
    def __init__(self):
        self.model, self.target = DQN(), DQN()
        # Start the target network from the same weights as the online model
        self.target.load_state_dict(self.model.state_dict())
        if USE_CUDA:
            self.model.cuda()
            self.target.cuda()
        self.exp_buffer = Memory()
        self.exp_number = 0     # size of exp buffer so far
        self.param_updates = 0  # track how many times params updated
        self.opt = torch.optim.RMSprop(self.model.parameters(),
                                       lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
        else:
            # Send state to model and take the greedy action
            a_vec = self.model(state)
            a = int(torch.argmax(torch.squeeze(a_vec)))
        return a

    # Clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = Memory()
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.exp_buffer.add(exp)
        self.exp_number += 1

    # Replay gets a batch and trains on it
    def replay(self, batch_size):
        # If experience buffer isn't big enough yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE:
            return

        # Get batch from experience buffer
        batch = self.exp_buffer.get_batch(batch_size)
        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last item (it is None: terminal state)

        # Turn the batch into something we can run through the model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r).unsqueeze(1)
        s_new = torch.cat(s_new)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()

        # Q-values the model predicted for the actions actually taken;
        # .gather picks out the q value for the specific action a
        pred_q_vals = self.model(s).gather(1, a)

        # Having chosen a in s, what is the highest possible reward we can
        # get from s_new? Bootstrap from the target network (detached so no
        # gradients flow through it) and cat a 0 to the end for the
        # terminal state
        s_new_q_vals = self.target(s_new).detach().max(1)[0]
        zero = torch.zeros(1)
        if USE_CUDA:
            zero = zero.cuda()
        s_new_q_vals = torch.cat((s_new_q_vals, zero))
        exp_q_vals = r + s_new_q_vals.unsqueeze(1) * GAMMA

        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()
        if WEIGHT_CLIPPING:
            # Despite the flag's name this clamps gradients, not weights,
            # to avoid exploding gradients; it must happen before opt.step()
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
        self.opt.step()

        # Periodically copy the online weights into the target network
        if self.param_updates % TARGET_UPDATE_INTERVAL == 0:
            self.target.load_state_dict(self.model.state_dict())
        self.param_updates += 1

        # Anneal exploration
        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY

        return myloss.item()
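# NOTE: a minimal sketch of a training loop driving the Agent above. The
# loop structure, the hypothetical preprocess() helper, and the classic
# 4-tuple Gym step API are assumptions; the original training script may
# differ. States are assumed to be FloatTensors of shape (1, C, H, W) so
# that torch.cat in replay() stacks them into a batch, and terminal next
# states are stored as None, matching the s_new[:-1] handling in replay().

def train(agent, env, num_episodes, batch_size=32):
    for episode in range(num_episodes):
        state = preprocess(env.reset())  # preprocess() is hypothetical
        done = False
        total_reward = 0
        while not done:
            action = agent.act(state)
            next_obs, reward, done, _ = env.step(action)
            next_state = None if done else preprocess(next_obs)
            agent.add_exp((state, action, reward, next_state, done))
            agent.replay(batch_size)  # one gradient step per env step
            state = next_state
            total_reward += reward
        print('Episode {}: reward {}'.format(episode, total_reward))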
import random

import numpy as np
import torch
import torch.nn as nn

# DQN, VAE, init_weights, and the hyperparameter constants are defined
# elsewhere in the module.


class Agent:
    def __init__(self):
        self.controller, self.target = DQN(), DQN()  # For RL
        self.vision = VAE()
        if USE_CUDA:
            self.controller.cuda()
            self.target.cuda()
            self.vision.cuda()

        # Init weights based on init function
        self.controller.apply(init_weights)
        self.vision.apply(init_weights)
        # Load controller params into target
        self.target.load_state_dict(self.controller.state_dict())

        self.action_number = 0  # actions taken (to decide when to update)

        # NOTE: the DQN exp buffer stores embeddings generated by the vision
        # module; the vision module (aka the VAE) keeps its own memory of
        # raw game states
        self.exp_buffer = []  # exp buffer
        self.exp_number = 0   # size of exp buffer so far

        self.opt = torch.optim.Adam(self.controller.parameters(),
                                    lr=DQN_LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        self.action_number += 1
        # Periodically copy the controller weights into the target network
        if self.action_number % TARGET_INTERVAL == 0:
            self.target.load_state_dict(self.controller.state_dict())
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            return np.random.randint(NUM_ACTIONS)
        # Encode the state with the VAE, then let the controller choose
        a_vec = self.controller(self.vision.encode(state))
        return int(torch.argmax(torch.squeeze(a_vec)))

    def load_params(self):
        # Looks in the current directory for params for the DQN and the VAE
        if LOAD_CHECKPOINT_VAE:
            try:
                self.vision.load_state_dict(torch.load("VAEparams.pt"))
                print("Loaded checkpoint for VAE")
            except (OSError, RuntimeError):
                print("Could not load VAE checkpoint")
        if LOAD_CHECKPOINT_DQN:
            try:
                self.controller.load_state_dict(torch.load("DQNparams.pt"))
                self.target.load_state_dict(torch.load("DQNparams.pt"))
                print("Loaded checkpoint for DQN")
            except (OSError, RuntimeError):
                print("Could not load DQN checkpoint")

    def save_params(self):
        torch.save(self.controller.state_dict(), "DQNparams.pt")
        torch.save(self.vision.state_dict(), "VAEparams.pt")

    # Clear the buffer (both the DQN's and the VAE's)
    def clear_exp_buffer(self):
        self.exp_buffer = []
        self.exp_number = 0
        self.vision.memory = []
        self.vision.memory_num = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.vision.remember(exp[0])
        if self.exp_number >= EXP_BUFFER_MAX:
            del self.exp_buffer[0]
        else:
            self.exp_number += 1
        # Store VAE embeddings rather than raw frames
        exp[0] = self.vision.encode(exp[0])
        exp[3] = self.vision.encode(exp[3])
        self.exp_buffer.append(exp)

    # Replay gets a batch and trains on it
    # Returns [vision loss, controller loss]
    def replay(self, batch_size):
        v_loss, q_loss = 0, 0  # In case we return without any training

        # Train the vision component first
        if self.action_number % VAE_UPDATE_INTERVAL == 0:
            v_loss = self.vision.replay()

        # If experience buffer isn't big enough yet, or this isn't a
        # training step, don't do anything
        if (self.exp_number < EXP_BUFFER_MIN
                or self.action_number % TRAINING_INTERVAL != 0):
            return [v_loss, q_loss]

        # Get batch from experience buffer
        batch = random.sample(self.exp_buffer, batch_size)
        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last item (it is None: terminal state)

        # Turn the batch into something we can run through the model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r)
        s_new = torch.cat(s_new)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()

        # Q-values the controller predicted for the actions actually taken;
        # .gather picks out the q value for the specific action a
        pred_q_vals = self.controller(s).gather(1, a).squeeze()

        # Having chosen a in s, what is the highest possible reward we can
        # get from s_new? Bootstrap from the target network (detached so no
        # gradients flow through it) and cat a 0 to the end for the
        # terminal state
        s_new_q_vals = self.target(s_new).detach().max(1)[0]
        zero = torch.zeros(1)
        if USE_CUDA:
            zero = zero.cuda()
        s_new_q_vals = torch.cat((s_new_q_vals, zero))
        exp_q_vals = r + s_new_q_vals * GAMMA

        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()
        if WEIGHT_CLIPPING:
            # Despite the flag's name this clamps gradients, not weights,
            # to avoid exploding gradients
            for param in self.controller.parameters():
                param.grad.data.clamp_(-1, 1)
        self.opt.step()

        # Anneal exploration
        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY

        return [v_loss, myloss.item()]
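# NOTE: the vision-augmented Agent above assumes a `VAE` class exposing
# encode(), remember(), replay(), and the attributes memory / memory_num.
# A minimal sketch of that interface; the latent size, the MLP
# architecture, and the default batch size are assumptions, and frames are
# assumed to be single-channel 84x84 tensors normalized to [0, 1]:

import random

import torch
import torch.nn as nn


class VAE(nn.Module):
    def __init__(self, input_dim=84 * 84, latent_dim=32):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, 256), nn.ReLU())
        self.mu_head = nn.Linear(256, latent_dim)
        self.logvar_head = nn.Linear(256, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256), nn.ReLU(),
            nn.Linear(256, input_dim), nn.Sigmoid(),
        )
        self.memory = []     # raw game states, filled by remember()
        self.memory_num = 0
        self.opt = torch.optim.Adam(self.parameters(), lr=1e-4)

    def encode(self, state):
        # Deterministic embedding for the controller: use the mean,
        # detached so controller gradients don't flow into the VAE
        h = self.encoder(state.view(state.size(0), -1))
        return self.mu_head(h).detach()

    def remember(self, state):
        self.memory.append(state)
        self.memory_num += 1

    def replay(self, batch_size=32):
        # One reconstruction + KL gradient step on remembered states
        if self.memory_num < batch_size:
            return 0
        x = torch.cat(random.sample(self.memory, batch_size))
        x = x.view(x.size(0), -1)
        h = self.encoder(x)
        mu, logvar = self.mu_head(h), self.logvar_head(h)
        # Reparameterization trick: sample z = mu + sigma * noise
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)
        recon = self.decoder(z)
        recon_loss = nn.functional.binary_cross_entropy(
            recon, x, reduction='sum')
        kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        loss = recon_loss + kl
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.item()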