class Agent_DQN(Agent):
    """DQN agent with optional dueling/CRNN networks, double-DQN targets,
    prioritized replay, LR decay and (experimental) multiprocess training."""

    def __init__(self, env, args):
        """
        Initialize everything you need here. For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        # Declare variables
        self.exp_id = uuid.uuid4().__str__().replace('-', '_')  # unique run id used for the save dir
        self.args = args
        self.env = env
        self.eps_threshold = None
        self.nA = env.action_space.n
        self.action_list = np.arange(self.nA)
        # Rolling windows for episode statistics (reward / max-Q / loss)
        self.reward_list = deque(maxlen=args.window)
        self.max_q_list = deque(maxlen=args.window)
        self.loss_list = deque(maxlen=args.window)
        self.probability_list = np.zeros(env.action_space.n, np.float32)
        self.cur_eps = self.args.eps
        self.t = 0
        self.ep_len = 0
        self.mode = None
        if self.args.use_pri_buffer:
            self.replay_buffer = NaivePrioritizedBuffer(capacity=self.args.capacity, args=self.args)
        else:
            self.replay_buffer = ReplayBuffer(capacity=self.args.capacity, args=self.args)
        self.position = 0
        self.args.save_dir += f'/{self.exp_id}/'
        # Portable replacement for `os.system("mkdir -p ...")` (no shell dependency).
        os.makedirs(self.args.save_dir, exist_ok=True)
        self.meta = MetaData(fp=open(os.path.join(self.args.save_dir, 'result.csv'), 'w'),
                             args=self.args)
        # Linear epsilon decay per step down to eps_min.
        self.eps_delta = (self.args.eps - self.args.eps_min) / self.args.eps_decay_window
        # Beta annealing schedule for prioritized-replay importance sampling.
        self.beta_by_frame = lambda frame_idx: min(
            1.0, args.pri_beta_start + frame_idx * (1.0 - args.pri_beta_start) / args.pri_beta_decay)

        # Create Policy and Target Networks
        if self.args.use_dueling:
            print("Using dueling dqn . . .")
            self.policy_net = DuelingDQN(env, self.args).to(self.args.device)
            self.target_net = DuelingDQN(env, self.args).to(self.args.device)
        elif self.args.use_crnn:
            print("Using dueling crnn . . .")
            self.policy_net = CrnnDQN(env).to(self.args.device)
            self.target_net = CrnnDQN(env).to(self.args.device)
        else:
            self.policy_net = DQN(env, self.args).to(self.args.device)
            self.target_net = DQN(env, self.args).to(self.args.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.args.lr, eps=self.args.optimizer_eps)
        if self.args.lr_scheduler:
            print("Enabling LR Decay . . .")
            self.scheduler = optim.lr_scheduler.ExponentialLR(
                optimizer=self.optimizer, gamma=self.args.lr_decay)
        self.cur_lr = self.optimizer.param_groups[0]['lr']
        # Huber loss for TD-error
        self.loss = F.smooth_l1_loss

        # todo: Support for Multiprocessing. Bug in pytorch - https://github.com/pytorch/examples/issues/370
        self.policy_net.share_memory()
        self.target_net.share_memory()

        # Set defaults for networks; target net is a frozen copy of the policy net.
        # (The original code loaded the state dict twice; once is sufficient.)
        self.policy_net.train()
        self.target_net.eval()
        self.target_net.load_state_dict(self.policy_net.state_dict())

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            self.load_model()
        if args.use_pri_buffer:
            print('Using priority buffer . . .')
        if args.use_double_dqn:
            print('Using double dqn . . .')
        if args.use_bnorm:
            print("Using batch normalization . . .")
        print("Arguments: \n", json.dumps(vars(self.args), indent=2), '\n')

    def init_game_setting(self):
        # Nothing to reset between test games.
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent.

        Input:
            observation: np.array, stack of 4 last preprocessed frames, shape (84, 84, 4)
        Return:
            In test mode (args.test_dqn): the greedy action only.
            In training mode: (action:int, q:float) — the sampled epsilon-greedy
            action and the max Q-value for logging.
        """
        with torch.no_grad():
            if self.args.test_dqn:
                q, argq = self.policy_net(
                    Variable(self.channel_first(observation))).data.cpu().max(1)
                return self.action_list[argq]
            # Fill up probability list equal for all actions
            self.probability_list.fill(self.cur_eps / self.nA)
            # Fetch q from the model prediction
            q, argq = self.policy_net(
                Variable(self.channel_first(observation))).data.cpu().max(1)
            # Increase the probability for the selected best action
            self.probability_list[argq[0].item()] += 1 - self.cur_eps
            # Use random choice to decide between a random action / best action
            action = torch.tensor(
                [np.random.choice(self.action_list, p=self.probability_list)])
        return action.item(), q.item()

    def optimize_model(self):
        """
        Perform one optimization step on the policy network.
        :return: loss value (0 while the replay buffer is still warming up)
        """
        # Return if initial buffer is not filled.
        if len(self.replay_buffer.memory) < self.args.mem_init_size:
            return 0
        if self.args.use_pri_buffer:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done, indices, weights = \
                self.replay_buffer.sample(self.args.batch_size, beta=self.beta_by_frame(self.t))
        else:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done = \
                self.replay_buffer.sample(self.args.batch_size)
        batch_state = Variable(self.channel_first(
            torch.tensor(np.array(batch_state), dtype=torch.float32)))
        batch_action = Variable(torch.tensor(np.array(batch_action), dtype=torch.long))
        batch_next_state = Variable(self.channel_first(
            torch.tensor(np.array(batch_next_state), dtype=torch.float32)))
        batch_reward = Variable(torch.tensor(np.array(batch_reward), dtype=torch.float32))
        batch_done = Variable(torch.tensor(np.array(batch_done), dtype=torch.float32))

        # Q(s, a) for the actions actually taken.
        policy_max_q = self.policy_net(batch_state).gather(
            1, batch_action.unsqueeze(1)).squeeze(1)
        if self.args.use_double_dqn:
            # Double DQN: action selection by the policy net, evaluation by the
            # target net. BUGFIX: detach the target-net output so no gradients
            # flow into the target network through the loss.
            policy_ns_max_q = self.policy_net(batch_next_state)
            next_q_value = self.target_net(batch_next_state).gather(
                1, torch.max(policy_ns_max_q, 1)[1].unsqueeze(1)).squeeze(1).detach()
            target_max_q = next_q_value * self.args.gamma * (1 - batch_done)
        else:
            target_max_q = self.target_net(batch_next_state).detach().max(
                1)[0].squeeze(0) * self.args.gamma * (1 - batch_done)

        # Compute loss (weighted MSE for prioritized replay, else Huber).
        if self.args.use_pri_buffer:
            loss = (policy_max_q - (batch_reward + target_max_q.detach())).pow(2) * \
                Variable(torch.tensor(weights, dtype=torch.float32))
            prios = loss + 1e-5  # new priorities; epsilon keeps them non-zero
            loss = loss.mean()
        else:
            loss = self.loss(policy_max_q, batch_reward + target_max_q)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # Clip gradients between -1 and 1
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        if self.args.use_pri_buffer:
            self.replay_buffer.update_priorities(indices, prios.data.cpu().numpy())
        self.optimizer.step()
        return loss.cpu().detach().numpy()

    def train(self):
        """
        Training loop; spawns 4 worker processes all running `train_fn`
        (experimental — see the pytorch multiprocessing todo in __init__).
        """
        def train_fn():
            self.t = 1
            self.mode = "Random"
            train_start = time.time()
            if not self.args.load_dir == '':
                self.load_model()
            for i_episode in range(1, self.args.max_episodes + 1):
                # Initialize the environment and state
                start_time = time.time()
                state = self.env.reset()
                self.reward_list.append(0)
                self.loss_list.append(0)
                self.max_q_list.append(0)
                self.ep_len = 0
                done = False

                # Save Model
                self.save_model(i_episode)
                # Collect garbage
                self.collect_garbage(i_episode)

                # Run the game
                while not done:
                    # Update the target network, copying all weights and biases in DQN
                    if self.t % self.args.target_update == 0:
                        print("Updating target network . . .")
                        self.target_net.load_state_dict(self.policy_net.state_dict())
                    # Select and perform an action (linear epsilon decay;
                    # cur_eps hits eps_min exactly because of the max()).
                    self.cur_eps = max(self.args.eps_min, self.cur_eps - self.eps_delta)
                    if self.cur_eps == self.args.eps_min:
                        self.mode = 'Exploit'
                    else:
                        self.mode = "Explore"
                    action, q = self.make_action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    self.reward_list[-1] += reward
                    self.max_q_list[-1] = max(self.max_q_list[-1], q)
                    # Store the transition in memory
                    self.replay_buffer.push(state, action, next_state, reward, done)
                    self.meta.update_step(self.t, self.cur_eps, self.reward_list[-1],
                                          self.max_q_list[-1], self.loss_list[-1],
                                          self.cur_lr)
                    # Increment step and Episode Length
                    self.t += 1
                    self.ep_len += 1
                    # Move to the next state
                    state = next_state
                    # Perform one step of the optimization (on the policy network)
                    if self.ep_len % self.args.learn_freq == 0:
                        loss = self.optimize_model()
                        self.loss_list[-1] += loss

                # Average the accumulated loss over the episode length.
                self.loss_list[-1] /= self.ep_len

                # Decay Step:
                if self.args.lr_scheduler:
                    self.cur_lr = self.scheduler.get_lr()[0]
                    if i_episode % self.args.lr_decay_step == 0 and self.cur_lr > self.args.lr_min:
                        self.scheduler.step(i_episode)

                # Update meta
                self.meta.update_episode(
                    i_episode, self.t,
                    time.time() - start_time, time.time() - train_start,
                    self.ep_len, len(self.replay_buffer.memory), self.cur_eps,
                    self.reward_list[-1], np.mean(self.reward_list),
                    self.max_q_list[-1], np.mean(self.max_q_list),
                    self.loss_list[-1], np.mean(self.loss_list),
                    self.mode, self.cur_lr)

        import multiprocessing as mp
        processes = []
        for rank in range(4):
            p = mp.Process(target=train_fn)
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
class Agent():
    """
    RL Agent that interacts with a given environment, learns and adapts
    successful behaviour (Double DQN with experience replay and a
    soft-updated target network).
    """

    def __init__(self, state_size, action_size, batch_size, learn_step_size,
                 buffer_size, gamma, learning_rate, tau, seed):
        """
        Initialize the agent and its learning parameter set.

        Parameters
        =========
            state_size (int): Size of the state space
            action_size (int): Size of the action space
            batch_size (int): Size of the batch used in each learning step
            learn_step_size (int): Number of steps until agent is trained again
            buffer_size (int): Size of replay memory buffer
            gamma (float): Discount rate that scales future discounts
            learning_rate (float): Learning rate of neural network
            tau (float): Update strength between local and target network
            seed (float): Random seed for initialization
        """
        # ----- Parameter init -----
        # State and action size from environment
        self.state_size = state_size
        self.action_size = action_size
        # Replay buffer and learning properties
        self.batch_size = batch_size
        self.learn_step_size = learn_step_size
        self.gamma = gamma
        self.tau = tau
        # General (random.seed() returns None; seeding the RNG is the side effect)
        self.seed = random.seed(seed)

        # ----- Network and memory init -----
        # Init identical NNs as local and target networks and set optimizer
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)
        # Initialize replay memory and time step (for updating every learn_step_size steps)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """
        Append information of past step in memory and trigger learning.

        Parameters
        ==========
            state (array_like): State before action
            action (array_like): Action that was taken
            reward (float): Reward for action
            next_state (array_like): State after action
            done (bool): Indicator if env was solved after action
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        # Learn every learn_step_size time steps.
        self.t_step = (self.t_step + 1) % self.learn_step_size
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.get_memory_size() > self.batch_size:
                self.learn()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Parameters
        ==========
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Transform state to PyTorch tensor
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Get action scores for state from network (eval mode, no gradients)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """
        Get sample of experience tuples and update value parameters and the
        target network (Double DQN update).
        """
        # Get tuples from experience buffer
        experiences = self.memory.get_sample()
        states, actions, rewards, next_states, dones = experiences

        # ----- DQN ----- (plain DQN alternative to the Double DQN below)
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # ----- Double DQN -----
        # Detach so the target computation does not update weights.
        # max(1)[1] selects the argmax action per row; unsqueeze(1) adds a
        # column dimension back so the result can be used with gather.
        expected_next_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        # Get Q values for next actions from target Q-network
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, expected_next_actions)
        # Compute Q targets for current states (no bootstrap on terminal states)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model; gather picks the Q of the taken action
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----- Update target network -----
        # Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
class Agent_DQN():
    """DQN agent (CUDA-only) with a deque replay memory, a periodically
    hard-copied target network and linear epsilon decay."""

    def __init__(self, env, test=False):
        # NOTE: this implementation assumes a CUDA device is available.
        self.cuda = torch.device('cuda')
        print("Using device: " + torch.cuda.get_device_name(self.cuda), flush=True)
        self.env = env
        self.state_shape = env.observation_space.shape
        self.n_actions = env.action_space.n
        # Replay memory and learning hyperparameters
        self.memory = deque(maxlen=100000)
        self.batch_size = 32
        self.mem_threshold = 50000   # minimum transitions before learning starts
        self.gamma = 0.99
        self.learning_rate = 1e-4
        # Linear epsilon decay from 1.0 to 0.05 over epsilon_period steps
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_period = 10000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.epsilon_period
        self.update_rate = 4         # learn every 4 env steps
        self.start_epoch = 1
        self.epochs = 10
        self.epoch = 10000           # episodes per epoch
        self.model = DQN(self.state_shape, self.n_actions).to(self.cuda)
        print("DQN parameters: {}".format(count_parameters(self.model)))
        self.target = DQN(self.state_shape, self.n_actions).to(self.cuda)
        self.target.eval()
        self.target_update = 10000   # hard target-copy interval (steps)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        if test:
            self.model.load_state_dict(torch.load('model.pt'))

    def init_game_setting(self):
        # Nothing to reset between test games.
        pass

    def make_action(self, observation, test=False):
        """Epsilon-greedy action for `observation`; eps is fixed at 0.01 in test mode."""
        epsilon = 0.01 if test else self.epsilon
        # turn observation into tensor
        observation = torch.tensor(observation, device=self.cuda, dtype=torch.float)
        # turn off learning
        self.model.eval()
        # epsilon greedy policy
        if random.random() > epsilon:
            # no need to calculate gradient
            with torch.no_grad():
                # choose highest value action, breaking ties uniformly at random
                b = self.model(observation)
                b = b.cpu().data.numpy()
                action = np.random.choice(np.flatnonzero(np.isclose(b, b.max())))
        else:
            # random action
            action = random.choice(np.arange(self.n_actions))
        # turn learning back on
        self.model.train()
        return action

    def replay_buffer(self):
        """Sample a random minibatch and return it as CUDA tensors
        (states, actions, rewards, next_states, dones)."""
        states, actions, rewards, next_states, dones = zip(
            *random.sample(self.memory, self.batch_size))
        states = torch.tensor(np.vstack(states), device=self.cuda, dtype=torch.float)
        actions = torch.tensor(np.array(actions), device=self.cuda, dtype=torch.long)
        rewards = torch.tensor(np.array(rewards, dtype=np.float32),
                               device=self.cuda, dtype=torch.float)
        next_states = torch.tensor(np.vstack(next_states), device=self.cuda,
                                   dtype=torch.float)
        dones = torch.tensor(np.array(dones, dtype=np.float32),
                             device=self.cuda, dtype=torch.float)
        return states, actions, rewards, next_states, dones

    def experience_replay(self, n=0):
        """One TD-learning step on a sampled minibatch (Huber loss)."""
        # clamp gradient
        clamp = False
        # Reset gradient (because it accumulates by default)
        self.optimizer.zero_grad()
        # sample experience memory
        states, actions, rewards, next_states, dones = self.replay_buffer()
        # get Q(s,a) for sample
        Q = self.model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        # get max_a' Q(s',a') from the (frozen) target network
        Q_prime = self.target(next_states).detach().max(1)[0]
        # calculate y = r + gamma * max_a' Q(s',a') for non-terminal states
        Y = rewards + (self.gamma * Q_prime) * (1 - dones)
        # Huber loss of Q and Y
        loss = F.smooth_l1_loss(Q, Y)
        # Compute dloss/dx
        loss.backward()
        # Clamp gradient
        if clamp:
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
        # Change the weights
        self.optimizer.step()

    def train(self):
        """Run `epochs` x `epoch` training episodes, logging a 30-episode
        moving average and saving model/plots every epoch."""
        step = 0
        learn_step = 0
        print("Begin Training:", flush=True)
        learn_curve = []
        last30 = deque(maxlen=30)
        for epoch in range(self.start_epoch, self.epochs + 1):
            durations = []
            rewards = []
            flag = []
            # progress bar
            epoch_bar = tqdm(range(self.epoch), total=self.epoch, ncols=200)
            for episode in epoch_bar:
                # reset state
                state = self.env.reset()
                # decay epsilon
                if self.epsilon > self.epsilon_min:
                    self.epsilon -= self.epsilon_decay
                # run one episode
                done = False
                ep_duration = 0
                ep_reward = 0
                while not done:
                    step += 1
                    ep_duration += 1
                    # get epsilon-greedy action
                    action = self.make_action(state)
                    # do action
                    next_state, reward, done, info = self.env.step(action)
                    ep_reward += reward
                    # add transition to replay memory
                    self.memory.append(
                        Transition(state, action, reward, next_state, done))
                    state = next_state
                    # learn from experience, if available
                    if step % self.update_rate == 0 and len(self.memory) > self.mem_threshold:
                        self.experience_replay(learn_step)
                        learn_step += 1
                    # update target network
                    if step % self.target_update == 1:
                        self.target.load_state_dict(self.model.state_dict())
                durations.append(ep_duration)
                rewards.append(ep_reward)
                last30.append(ep_reward)
                learn_curve.append(np.mean(last30))
                flag.append(info['flag_get'])
                # BUGFIX: "{:2f}" (width 2) -> "{:.2f}" (2 decimal places)
                epoch_bar.set_description(
                    "epoch {}/{}, avg duration = {:.2f}, avg reward = {:.2f}, last30 = {:.2f}"
                    .format(epoch, self.epochs, np.mean(durations),
                            np.mean(rewards), learn_curve[-1]))
            # save model, reward history and learning-curve plot every epoch
            plt.clf()
            plt.plot(learn_curve)
            plt.title(f"DQN Epoch {epoch} with {save_prefix} Reward")
            plt.xlabel('Episodes')
            plt.ylabel('Moving Average Reward')
            if not os.path.exists(f"{save_prefix}_DQN"):
                os.mkdir(f"{save_prefix}_DQN")
            torch.save(self.model.state_dict(),
                       f'{save_prefix}_DQN/DQN_model_ep{epoch}.pt')
            pickle.dump(
                rewards,
                open(f"{save_prefix}_DQN/DQN_reward_ep{epoch}.pkl", 'wb'))
            pickle.dump(flag, open(f"{save_prefix}_DQN/flag_ep{epoch}.pkl", 'wb'))
            plt.savefig(f"{save_prefix}_DQN/epoch{epoch}.png")
            learn_curve = []
class Agent_DQN(Agent):
    """DQN agent driven entirely by an `args` namespace: list-based replay
    buffer, periodic hard target updates and epsilon-greedy exploration."""

    def __init__(self, env, args):
        """
        Initialize everything you need here. For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        self.env = env
        self.args = args
        self.gamma = self.args.gamma
        self.batch_size = self.args.batch_size
        self.memory_cap = self.args.memory_cap
        self.n_episode = self.args.n_episode
        self.lr = self.args.learning_rate
        # Linear epsilon decay from epsilon to epsilon_min
        self.epsilon = self.args.epsilon
        self.epsilon_decay_window = self.args.epsilon_decay_window
        self.epsilon_min = self.args.epsilon_min
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.epsilon_decay_window
        self.n_step = self.args.n_step
        self.f_update = self.args.f_update
        self.load_model = self.args.load_model
        self.action_size = self.args.action_size
        # BUGFIX: self.algorithm was referenced in the test branch below but
        # its assignment was commented out, raising AttributeError whenever
        # args.test_dqn was set. Default to plain 'DQN'.
        self.algorithm = getattr(self.args, 'algorithm', 'DQN')
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        print('using device ', torch.cuda.get_device_name(0))
        self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        self.LongTensor = torch.cuda.LongTensor if self.use_cuda else torch.LongTensor
        self.ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor
        self.Tensor = self.FloatTensor

        # Create the policy net and the target net
        self.policy_net = DQN()
        self.policy_net.to(self.device)
        # if self.algorithm == 'DDQN':
        #     self.policy_net_2 = DQN()
        #     self.policy_net_2.to(self.device)
        self.target_net = DQN()
        self.target_net.to(self.device)
        self.policy_net.train()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=self.lr)

        # Replay buffer (plain list, bounded by memory_cap in push())
        self.memory = []
        # Logging / output configuration
        self.mean_window = 100
        self.print_frequency = 100
        self.out_dir = "DQN_Module_b1_1/"

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            self.policy_net.load_state_dict(
                torch.load('model.pth', map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            if self.algorithm == 'DDQN':
                # NOTE(review): policy_net_2 is only created when the DDQN
                # construction above is un-commented.
                self.policy_net_2.load_state_dict(
                    torch.load('model.pth', map_location=self.device))
            self.print_test = True

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        pass

    def make_action(self, observation, test=False):
        """
        Return predicted action of your agent.

        Input:
            observation: np.array, stack of 4 last preprocessed frames,
                shape (84, 84, 4). In test mode the raw frames are
                normalized here; in training the caller normalizes.
        Return:
            action: int, the predicted action from trained model
        """
        if test:
            # Near-greedy evaluation policy
            self.epsilon = self.epsilon_min * 0.5
            observation = observation / 255.
        else:
            self.epsilon = max(self.epsilon - self.epsilon_decay, self.epsilon_min)
        if random.random() > self.epsilon:
            # HWC stack -> NCHW tensor expected by the conv net
            observation = self.Tensor(observation.reshape(
                (1, 84, 84, 4))).transpose(1, 3).transpose(2, 3)
            state_action_value = self.policy_net(observation).data.cpu().numpy()
            action = np.argmax(state_action_value)
        else:
            action = random.randint(0, self.action_size - 1)
        return action

    def push(self, state, action, reward, next_state, done):
        """
        Push new data to buffer and remove the old one if the buffer is full.
        """
        if len(self.memory) >= self.memory_cap:
            self.memory.pop(0)
        self.memory.append((state, action, reward, next_state, done))

    def replay_buffer(self):
        """
        Select a random minibatch from the buffer into self.mini_batch.
        """
        self.mini_batch = random.sample(self.memory, self.batch_size)
        return

    def train(self):
        """
        Implement your training algorithm here
        """
        self.steps_done = 0
        self.steps = []
        self.rewards = []
        self.mean_rewards = []
        self.time = []
        self.best_reward = 0
        self.last_saved_reward = 0
        self.start_time = time.time()
        print('train')
        # continue training from where it stopped
        if self.load_model:
            self.policy_net.load_state_dict(
                torch.load(self.out_dir + 'model.pth', map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.epsilon = self.epsilon_min
            print('Loaded')
        for episode in range(self.n_episode):
            # Initialize the environment and state (frames normalized to [0,1])
            state = self.env.reset() / 255.
            total_reward = 0
            self.step = 0
            done = False
            while (not done) and self.step < 10000:
                # move to next state
                self.step += 1
                self.steps_done += 1
                action = self.make_action(state)
                next_state, reward, done, life = self.env.step(action)
                next_state = next_state / 255.
                # Store the transition in memory
                self.push(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward
                if done:
                    self.rewards.append(total_reward)
                    self.mean_reward = np.mean(self.rewards[-self.mean_window:])
                    self.mean_rewards.append(self.mean_reward)
                    self.time.append(time.time() - self.start_time)
                    self.steps.append(self.step)
                    # print the process to terminal
                    progress = "episode: " + str(episode) + \
                        ",\t epsilon: " + str(self.epsilon) + \
                        ",\t Current mean reward: " + "{:.2f}".format(self.mean_reward)
                    progress += ',\t Best mean reward: ' + "{:.2f}".format(self.best_reward) + \
                        ",\t time: " + time.strftime('%H:%M:%S', time.gmtime(self.time[-1]))
                    print(progress)
                    if episode % self.print_frequency == 0:
                        self.print_and_plot()
                    # save the best model
                    if self.mean_reward > self.best_reward and len(self.memory) >= 5000:
                        print('~~~~~~~~~~<Model updated with best reward = ',
                              self.mean_reward, '>~~~~~~~~~~')
                        checkpoint_path = self.out_dir + 'model.pth'
                        torch.save(self.policy_net.state_dict(), checkpoint_path)
                        self.last_saved_reward = self.mean_reward
                        self.best_reward = self.mean_reward
                # learn every 4 steps once the buffer is warm
                if len(self.memory) >= 5000 and self.steps_done % 4 == 0:
                    self.optimize_DQN()
                # hard-copy policy weights into the target net periodically
                # (assumed to run every step; confirm against original nesting)
                if self.steps_done % self.f_update == 0:
                    self.target_net.load_state_dict(self.policy_net.state_dict())

    def optimize_DQN(self):
        """One TD-learning step on a sampled minibatch (Huber loss)."""
        # sample
        self.replay_buffer()
        state, action, reward, next_state, done = zip(*self.mini_batch)
        # transfer N*84*84*4 (NHWC) to N*4*84*84 (NCHW), i.e. permute 0,3,1,2
        state = self.Tensor(np.float32(state)).permute(0, 3, 1, 2).to(self.device)
        action = self.LongTensor(action).to(self.device)
        reward = self.Tensor(reward).to(self.device)
        next_state = self.Tensor(np.float32(next_state)).permute(0, 3, 1, 2).to(self.device)
        done = self.Tensor(done).to(self.device)
        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        # Compute next Q from the frozen target net
        next_state_values = self.target_net(next_state).detach().max(1)[0]
        # Compute the expected Q value; stop bootstrap on terminal states
        expected_state_action_values = reward + (next_state_values * self.gamma) * (1 - done)
        # Compute Huber loss
        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.data)
        # Optimize the model
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        return

    def print_and_plot(self):
        """Save steps / mean-reward / elapsed-time training curves to out_dir."""
        fig1 = plt.figure(1)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Steps')
        plt.plot(self.steps)
        fig1.savefig(self.out_dir + 'steps.png')
        fig2 = plt.figure(2)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.plot(self.mean_rewards)
        fig2.savefig(self.out_dir + 'rewards.png')
        fig2 = plt.figure(3)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Time')
        plt.plot(self.time)
        fig2.savefig(self.out_dir + 'time.png')
class DQNAgent():
    """Interacts with and learns from the environment (Double DQN with a
    soft-updated target network)."""

    def __init__(self, state_size, action_size, seed, alpha, gamma, tau):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            alpha (float): learning rate for the Adam optimizer
            gamma (float): discount factor
            tau (float): interpolation factor for the soft target update
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seeding the RNG is the side effect
        self.seed = random.seed(seed)
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau

        # Q-Learning networks (local is trained, target is soft-updated)
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.alpha)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Record one transition and learn every UPDATE_EVERY steps."""
        # Save experience in replay memory
        self.memory.fill_replay_buffer(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            # (idiom fix: len(...) instead of calling __len__ directly)
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.get_sample_replay_buffer()
                self.learn_DDQN(experiences, self.gamma, self.alpha, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate the network without tracking gradients
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn_DDQN(self, experiences, gamma, alpha, tau):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            alpha (float): learning rate (unused here; optimizer already configured)
            tau (float): soft-update interpolation parameter
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: select next actions with the local net...
        Q_argmax = self.qnetwork_local(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        # ...and evaluate them with the target net
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(
            1, a_prime.unsqueeze(1))
        # Compute Q targets for current states (no bootstrap on terminal states)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model for the taken actions
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
reward = 10 # 蓝色胜利,负的奖励 elif GAME_STATE == BLUE_WIN: if GAME_OVER_LX == OUT_OF_MAP: reward = -10 elif GAME_OVER_LX == ATTACKED: reward = -1 # 谁都没赢,没奖励 else: reward = 0 # todo 新增如果距离太远就发弹,则惩罚agent 负的reward pass agent.remember(s, player1_action, next_s, reward) agent.train() score += reward # 如果游戏结束了 if GAME_STATE: score_list.append(score) print('episode:', episode+1, 'score:', score, 'max:', max(score_list)) break FPS_COUNT = 0 s = player1.get_obs(player2, bullet_list) player1_action = agent.act(s) # player1执行action listen_model_action(player1_action, player1, player2) # 如果游戏结束了,这个逻辑也不需要执行了,等待进入下一次玩家1决策的FPS_COUNT即可 if not GAME_STATE: # 玩家二行为,在玩家二类中定义,自动决策
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            paramters for neural network
            initialize Q net and target Q net
            parameters for repaly buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.env = env
        self.batch_size = BATCH_SIZE
        self.gamma = 0.999
        self.eps_start = EPS_START
        self.eps_decay = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE
        # Online network is optimized; target network is a periodically
        # synchronized frozen copy used for bootstrapping.
        self.policy_net = DQN(self.env.action_space.n)
        self.target_net = DQN(self.env.action_space.n)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        if use_cuda:
            self.policy_net.cuda()
            self.target_net.cuda()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-5)
        # Replay buffer: deque(maxlen=...) evicts the oldest transition
        # automatically once full.
        self.memory = deque(maxlen=10000)
        if args.test_dqn:
            # you can load your model here
            print('loading trained model')

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent

        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: LongTensor of shape (1, 1) holding the chosen action index
        """
        global steps_done
        self.policy_net.eval()
        sample = random.random()
        # Exponentially decaying epsilon.
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * steps_done / EPS_DECAY)
        steps_done += 1
        if sample > eps_threshold:
            # BUG FIX: 'volatile=True' Variables were removed from PyTorch;
            # torch.no_grad() is the supported way to disable autograd here.
            with torch.no_grad():
                q_values = self.policy_net(
                    torch.from_numpy(observation).type(FloatTensor))
            return q_values.max(1)[1].view(1, 1)
        # Explore: uniform random action.  (An unreachable 'return action'
        # statement after this branch was removed.)
        return LongTensor([[random.randrange(self.env.action_space.n)]])

    def push(self, s, a, r, s_, done):
        """
        Push new data to buffer and remove the old one if the buffer is full.
        """
        # BUG FIX: the original referenced undefined attributes
        # (self.maxlen, self.replay_memory_store, self.memory_counter).
        # deque(maxlen=...) already discards the oldest entry on append.
        self.memory.append((s, a, r, s_, done))

    def replay_buffer(self):
        """
        Select a random minibatch from the buffer.

        Returns:
            list of batch_size (s, a, r, s_, done) tuples.
        """
        # BUG FIX: the original used undefined self.BATCH_SIZE (the attribute
        # is self.batch_size) and applied an invalid ndarray transpose to a
        # list of heterogeneous transition tuples.
        return random.sample(self.memory, self.batch_size)

    def optimize_model(self):
        """Perform one DQN optimization step on a sampled minibatch."""
        if len(self.memory) < BATCH_SIZE:
            return
        # BUG FIX: self.memory is a deque and has no .sample(); use the
        # replay_buffer() helper for sampling.
        transitions = self.replay_buffer()
        states, actions, rewards, next_states, dones = zip(*transitions)

        # Mask of non-terminal transitions; terminal next-states get V = 0.
        non_final_mask = torch.tensor([not d for d in dones],
                                      dtype=torch.bool)
        state_batch = torch.cat(states).type(FloatTensor)
        action_batch = torch.cat(actions)
        reward_batch = torch.cat(rewards).type(Tensor)

        # Q(s_t, a): Q-values of the actions actually taken.
        self.policy_net.train()
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # V(s_{t+1}) from the frozen target network, without gradients.
        next_state_values = torch.zeros(BATCH_SIZE).type(Tensor)
        with torch.no_grad():
            if non_final_mask.any():
                non_final_next_states = torch.cat(
                    [s for s, d in zip(next_states, dones)
                     if not d]).type(FloatTensor)
                next_state_values[non_final_mask] = self.target_net(
                    non_final_next_states).max(1)[0]
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Huber loss; unsqueeze so shapes match: (B, 1) vs (B, 1).
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        # Optimize the model with element-wise gradient clipping.
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self):
        """
        Implement your training algorithm here
        """
        num_episodes = 1400000
        for i_episode in range(num_episodes):
            # Initialize the environment and state (HWC -> 1CHW).
            observation = self.env.reset()
            observation = observation.transpose((2, 0, 1))
            observation = observation[np.newaxis, :]
            state = observation
            for t in count():
                # Select and perform an action
                action = self.make_action(state, test=False)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                next_state = next_state.transpose((2, 0, 1))
                next_state = next_state[np.newaxis, :]
                reward = Tensor([reward])

                # BUG FIX: deque has no .push() and the done flag was lost;
                # store through self.push() with the signature it declares.
                self.push(torch.from_numpy(state), action, reward,
                          torch.from_numpy(next_state), done)

                if not done:
                    state = next_state

                # Perform one step of the optimization (on the policy network)
                self.optimize_model()
                if done:
                    print(
                        'resetting env. episode %d \'s reward total was %d.' %
                        (i_episode + 1, t + 1))
                    break
            # Periodically sync the target network with the policy network.
            if i_episode % TARGET_UPDATE == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            if i_episode % 50 == 0:
                checkpoint_path = os.path.join('save_dir', 'model-best.pth')
                torch.save(self.policy_net.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            paramters for neural network
            initialize Q net and target Q net
            parameters for repaly buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        self.action = env.get_action_space()
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Using device:', self.device)
        self.model = DQN().to(self.device)
        self.model_target = DQN().to(self.device)
        self.episode = 100000
        self.max_steps_per_episode = 14000
        self.update_target_network = 10000
        # Linear epsilon decay from 1.0 down to 0.1 over 1e6 frames.
        self.epsilon = 1.0
        self.min_epsilon = 0.1
        self.step_epsilon = (self.epsilon - self.min_epsilon) / (1E6)
        self.env = env
        self.history = []
        # Warm-up threshold: act randomly until the buffer holds this many.
        self.buffer_size = min(args.history_size // 5, 2000)
        self.history_size = args.history_size
        self.learning_rate = 1e-4
        self.name = args.name
        self.batch_size = 32
        self.gamma = 0.99
        self.priority = []
        # Frame geometry; observations are viewed as (1, 12, h, w).
        self.w = 144
        self.h = 256
        self.mode = args.mode
        self.delay = args.delay
        self.epoch = args.continue_epoch
        if args.test_dqn or self.epoch > 0:
            #you can load your model here
            print('loading trained model')
            ###########################
            self.model.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))
            self.model_target.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent

        Input:
            observation: np.array of the current (preprocessed) state
        Return:
            action: int index during training; the mapped env action in test mode
        """
        self.model.eval()
        with torch.no_grad():
            if test == False:
                # Training: epsilon-greedy, and purely random until warm-up.
                if np.random.random() < self.epsilon or len(
                        self.history) < self.buffer_size:
                    action = int(np.random.choice([0, 1], 1)[0])
                else:
                    obs = torch.from_numpy(observation).to(self.device).float()
                    action_prob = self.model(obs.view(1, 12, self.h, self.w))
                    action = torch.argmax(action_prob).detach().item()
                return action
            else:
                # Test: raw frames arrive unprocessed — swap axes and scale.
                observation = np.swapaxes(observation, 0, 2) / 255.
                obs = torch.from_numpy(observation).to(self.device).float()
                action_prob = self.model(obs.view(1, 12, self.h, self.w))
                action = torch.argmax(action_prob).detach().item()
                return self.action[action]

    def push(self, state, action, reward, done, state_next, smooth=None):
        """
        Push new data to buffer and remove the old one if the buffer is full.
        """
        self.history.append(
            np.array([state, action, reward, done, state_next, smooth]))
        if len(self.history) > self.history_size:
            self.history.pop(0)

    def replay_buffer(self, refresh=False):
        """
        Select a batch of indices into self.history.

        With refresh=True (prioritized mode) recompute the TD-error-based
        priority vector instead and return 0.
        """
        if 'prioritized' in self.mode.split('_'):
            if refresh:
                self.priority = np.zeros(len(self.history))
                for i in range(len(self.history)):
                    # Target-network value of the next state.
                    max_reward, _ = torch.max(self.model_target(
                        torch.from_numpy(self.history[i][4]).to(
                            self.device).float().view(1, 12, self.h, self.w)),
                                              axis=1)
                    max_reward = max_reward.detach().item()
                    # Online-network Q-value of the stored action.
                    Q = self.model(
                        torch.from_numpy(
                            self.history[i][0]).to(self.device).float().view(
                                1, 12, self.h,
                                self.w))[0, self.history[i][1]].detach().item()
                    # Priority = |TD error|.
                    self.priority[i] = abs(
                        (self.history[i][2] + self.gamma * max_reward - Q))
                # BUG FIX: guard against division by zero (all-zero TD errors
                # would fill the priority vector with NaNs).
                total = self.priority.sum()
                if total > 0:
                    self.priority = self.priority / total
                return 0
            priority = np.zeros(len(self.history))
            priority[:len(self.priority)] = self.priority
            if sum(priority) == 0:
                indices = np.random.choice(range(len(self.history)),
                                           size=self.batch_size)
            else:
                # BUG FIX: if the buffer grew since the last refresh the
                # padded vector no longer sums to 1, which makes
                # np.random.choice raise; renormalize before sampling.
                priority = priority / priority.sum()
                indices = np.random.choice(range(len(self.history)),
                                           size=self.batch_size,
                                           p=priority)
            return indices
        else:
            return np.random.choice(range(len(self.history)),
                                    size=self.batch_size)

    def train(self):
        """
        Implement your training algorithm here
        """
        episode_reward_history = []
        best_reward = -10
        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=self.learning_rate)
        loss_fn = torch.nn.SmoothL1Loss()
        frame_count = 0
        # Append to the log when resuming, otherwise start a fresh one.
        if self.epoch > 0:
            f = open(self.name + '.txt', "a")
        else:
            f = open(self.name + '.txt', "w")
        done = False
        for ep in range(self.epoch, self.episode):
            state = self.env.reset()
            state = np.swapaxes(state, 0, 2) / 255.
            episode_reward = 0
            pre_action = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            smooth = 0
            for timestep in range(0, self.max_steps_per_episode):
                frame_count += 1
                action = self.make_action(state, test=False)
                if done:
                    action = 1
                # Decay epsilon linearly down to the floor.
                self.epsilon -= self.step_epsilon
                self.epsilon = max(self.epsilon, self.min_epsilon)
                # Next frame.
                state_next, reward, done, _ = self.env.step(
                    self.action[action])
                state_next = np.swapaxes(state_next, 0, 2) / 255.
                episode_reward += reward
                # Smoothness penalty signal over the last 10 actions.
                if 'smooth1' in self.mode.split('_'):
                    pre_action.pop(0)
                    pre_action.append(action)
                    smooth = float(np.mean(pre_action) - 0.5)
                # BUG FIX: the original assigned state = state_next BEFORE
                # pushing, so the buffer stored (s', a, r, done, s') and the
                #真 current state was lost.  Push first, then advance.
                self.push(state, action, reward, done, state_next, smooth)
                state = state_next
                # Learn every 8 frames once the buffer is warmed up.
                if frame_count % 8 == 0 and len(
                        self.history) >= self.buffer_size:
                    # BUG FIX: precedence — 'x % y // 10' parses as
                    # '(x % y) // 10', which is 0 for almost every frame;
                    # the intended refresh period is history_size // 10.
                    if frame_count % (self.history_size // 10) == 0 and \
                            'prioritized' in self.mode.split('_'):
                        # update priority vector
                        self.replay_buffer(refresh=True)
                    indice = self.replay_buffer()
                    self.model.train()
                    state_sample = torch.from_numpy(
                        np.array([self.history[i][0]
                                  for i in indice])).to(self.device).float()
                    action_sample = torch.from_numpy(
                        np.array([self.history[i][1]
                                  for i in indice])).to(self.device).float()
                    rewards_sample = torch.from_numpy(
                        np.array([self.history[i][2]
                                  for i in indice])).to(self.device).float()
                    done_sample = torch.from_numpy(
                        np.array([self.history[i][3]
                                  for i in indice])).to(self.device).float()
                    next_state_sample = torch.from_numpy(
                        np.array([self.history[i][4]
                                  for i in indice])).to(self.device).float()
                    smooth_sample = torch.from_numpy(
                        np.array([self.history[i][5]
                                  for i in indice])).to(self.device).float()
                    # Bellman target from the frozen target network;
                    # terminal transitions are pinned to -1.
                    future_rewards = self.model_target(next_state_sample)
                    max_reward, _ = torch.max(future_rewards, axis=1)
                    updated_q_values = rewards_sample + self.gamma * max_reward
                    updated_q_values = updated_q_values * (
                        1 - done_sample) - done_sample
                    # One-hot mask selects the Q-value of the taken action.
                    mask = F.one_hot(action_sample.long(),
                                     2).to(self.device).float()
                    q_values = self.model(state_sample)
                    q_action = torch.sum(q_values * mask, axis=1)
                    loss = loss_fn(q_action, updated_q_values)
                    # Optional action-smoothness penalty, phased in after
                    # 'delay' episodes.
                    if 'smooth1' in self.mode.split('_') and self.delay < ep:
                        penalty = torch.abs((ep - self.delay) / self.episode *
                                            torch.sum(smooth_sample))
                        loss += penalty
                    optimizer.zero_grad()
                    loss.backward()
                    # BUG FIX: torch.nn.utils.clip_grad_norm was deprecated
                    # and removed; use the in-place clip_grad_norm_.
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   1.0)
                    optimizer.step()
                if frame_count % self.update_target_network == 0:
                    self.model_target.load_state_dict(self.model.state_dict())
                if done:
                    break
            # Running average over the last 30 episodes.
            episode_reward_history.append(episode_reward)
            if len(episode_reward_history) > 30:
                del episode_reward_history[:1]
            running_reward = np.mean(episode_reward_history)
            f.write("Episode:\t{},\t Avereged reward: {:.2f}\n".format(
                ep, running_reward))
            # Checkpoint whenever the running average improves.
            if running_reward > best_reward:
                best_reward = running_reward
                torch.save(self.model.state_dict(), self.name + '.pth')
        f.close()