def main():
    policy_net = DQN(U_num, num_actions).to(device)        # initialize the Q network
    policy_net.apply(init_weights)
    if pretrained:
        ckp = torch.load('/data2/jiangjigang/ckp/dqn.pth')
        policy_net.load_state_dict(
            {k.replace('module.', ''): v for k, v in ckp.items()})
    target_net = DQN(U_num, num_actions).to(device)        # initialize the target Q network
    target_net.load_state_dict(policy_net.state_dict())    # copy the Q network's parameters into the target network
    target_net.eval()
    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=learning_rate)  # Adam optimizer (can be swapped for another)
    # Replay buffer: stores experience tuples from which random batches are later
    # drawn to update the network parameters (implemented in Buffer.py).
    buffer = ReplayBuffer(buffer_size)
    criterion = torch.nn.MSELoss(reduction='sum')

    # training
    for i_episode in range(num_episodes):
        state0 = [user_loc, user_dis, node_loc, use_buff]  # initial state
        error = 0.0
        all_reward = 0
        for t in count():
            # select an action
            action = e_greedy_select_action(state0, policy_net)
            a = np.array([action.data.cpu().numpy()])
            # print("action selected by e_greedy is {}".format(action))

            # Transition function: next state reached by taking the chosen action
            # in the current state, and whether that next state is terminal.
            state1, done, flag = transition_function(state0, action)
            # Reward function: reward obtained for this transition.
            reward, cost_migration = reward_function(state0, action, state1, flag)
            all_reward = all_reward + reward
            # store the experience in the buffer
            buffer.add(state0, a, reward, state1, done)

            # exit an episode after MAX_T steps
            if t > MAX_T:
                break

            # Only start updating the network after the first episodes, so the
            # buffer already holds enough data and training is more stable.
            if i_episode > 1:
                # sample a batch of BATCH_SIZE experiences from the buffer
                batch = buffer.getBatch(BATCH_SIZE)
                policy_net, target_net, bellman_error = optimize_model(
                    batch, policy_net, target_net, optimizer_policy, criterion)
                error = error + bellman_error.data.cpu().numpy()

            # move to the next state
            state0 = state1

        ave_error = error / (t * 1.00)
        ave_reward = all_reward / (t * 1.00)
        print(ave_error, ave_reward)
        torch.save(policy_net.state_dict(), '/data2/jiangjigang/ckp/dqn.pth')


# Problem setup (main program section)
node_loc = np.random.randint(0, 101, U_num).tolist()  # edge node locations, numbered 1-100 (this assumes node_num == U_num)
user_loc = np.random.randint(0, 101, U_num).tolist()  # user locations, numbered 1-100
user_dis = random_displacement(user_loc)              # upcoming user displacement: up/down/left/right -> -10, 10, -1, 1
use_buff = np.random.randint(3, 8, U_num).tolist()    # resources required by each user
state0 = [user_loc, user_dis, node_loc, use_buff]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
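# The driver above constructs ReplayBuffer(buffer_size) and calls buffer.add(...)
# and buffer.getBatch(BATCH_SIZE), but Buffer.py is not part of this excerpt.
# The class below is only a minimal sketch of a buffer with that interface:
# the two method names and the 5-tuple layout come from the calls above,
# everything else is an assumption.
import random
from collections import deque


class ReplayBuffer:
    """Fixed-size FIFO replay memory of (state, action, reward, next_state, done) tuples."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # Once maxlen is reached, the oldest experience is dropped automatically.
        self.buffer.append((state, action, reward, next_state, done))

    def getBatch(self, batch_size):
        # Uniform random sample; if fewer items are stored, return all of them.
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def __len__(self):
        return len(self.buffer)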
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) self.action = env.get_action_space() ########################### # YOUR IMPLEMENTATION HERE # self.device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') print('Using device:', self.device) self.model = DQN().to(self.device) self.model_target = DQN().to(self.device) self.episode = 100000 self.max_steps_per_episode = 14000 self.update_target_network = 10000 self.epsilon = 1.0 self.min_epsilon = 0.1 self.step_epsilon = (self.epsilon - self.min_epsilon) / (1E6) self.env = env self.history = [] self.buffer_size = min(args.history_size // 5, 2000) self.history_size = args.history_size self.learning_rate = 1e-4 self.name = args.name self.batch_size = 32 self.gamma = 0.99 self.priority = [] self.w = 144 self.h = 256 self.mode = args.mode self.delay = args.delay self.epoch = args.continue_epoch if args.test_dqn or self.epoch > 0: #you can load your model here print('loading trained model') ########################### self.model.load_state_dict( torch.load(self.name + '.pth', map_location=self.device)) self.model_target.load_state_dict( torch.load(self.name + '.pth', map_location=self.device)) # YOUR IMPLEMENTATION HERE # def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # ########################### pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # self.model.eval() with torch.no_grad(): if test == False: if np.random.random() < self.epsilon or len( self.history) < self.buffer_size: action = int(np.random.choice([0, 1], 1)[0]) else: obs = torch.from_numpy(observation).to(self.device).float() action_prob = self.model(obs.view(1, 12, self.h, self.w)) action = torch.argmax(action_prob).detach().item() return action else: observation = np.swapaxes(observation, 0, 2) / 255. obs = torch.from_numpy(observation).to(self.device).float() action_prob = self.model(obs.view(1, 12, self.h, self.w)) action = torch.argmax(action_prob).detach().item() return self.action[action] ########################### def push(self, state, action, reward, done, state_next, smooth=None): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # self.history.append( np.array([state, action, reward, done, state_next, smooth])) if len(self.history) > self.history_size: self.history.pop(0) ########################### def replay_buffer(self, refresh=False): """ You can add additional arguments as you need. Select batch from buffer. 
""" ########################### # YOUR IMPLEMENTATION HERE # if 'prioritized' in self.mode.split('_'): if refresh: self.priority = np.zeros(len(self.history)) for i in range(len(self.history)): max_reward, _ = torch.max(self.model_target( torch.from_numpy(self.history[i][4]).to( self.device).float().view(1, 12, self.h, self.w)), axis=1) max_reward = max_reward.detach().item() Q = self.model( torch.from_numpy( self.history[i][0]).to(self.device).float().view( 1, 12, self.h, self.w))[0, self.history[i][1]].detach().item() self.priority[i] = abs( (self.history[i][2] + self.gamma * max_reward - Q)) self.priority = self.priority / sum(self.priority) return 0 priority = np.zeros(len(self.history)) priority[:len(self.priority)] = self.priority if sum(priority) == 0: indices = np.random.choice(range(len(self.history)), size=self.batch_size) else: indices = np.random.choice(range(len(self.history)), size=self.batch_size, p=priority) ########################### return indices else: return np.random.choice(range(len(self.history)), size=self.batch_size) def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # episode_reward_history = [] best_reward = -10 optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate) # optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.learning_rate,momentum=0.5) loss_fn = torch.nn.SmoothL1Loss() frame_count = 0 if self.epoch > 0: f = open(self.name + '.txt', "a") else: f = open(self.name + '.txt', "w") done = False for ep in range(self.epoch, self.episode): state = self.env.reset() state = np.swapaxes(state, 0, 2) / 255. episode_reward = 0 pre_action = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] smooth = 0 for timestep in range(0, self.max_steps_per_episode): frame_count += 1 action = self.make_action(state, test=False) if done: action = 1 # Decay self.epsilon -= self.step_epsilon self.epsilon = max(self.epsilon, self.min_epsilon) # next frame state_next, reward, done, _ = self.env.step( self.action[action]) state_next = np.swapaxes(state_next, 0, 2) / 255. 
episode_reward += reward # print(reward) #normalize reward # reward = np.sign(reward) # Save actions and states in replay buffer state = state_next if 'smooth1' in self.mode.split('_'): pre_action.pop(0) pre_action.append(action) smooth = float(np.mean(pre_action) - 0.5) self.push(state, action, reward, done, state_next, smooth) if frame_count % 8 == 0 and len( self.history) >= self.buffer_size: if frame_count % self.history_size // 10 == 0 and 'prioritized' in self.mode.split( '_'): #update priority vector self.replay_buffer(refresh=True) indice = self.replay_buffer() self.model.train() # data_batch = torch.from_numpy(np.array(self.history)[indice]).to(self.device).float() state_sample = torch.from_numpy( np.array([self.history[i][0] for i in indice])).to(self.device).float() action_sample = torch.from_numpy( np.array([self.history[i][1] for i in indice])).to(self.device).float() rewards_sample = torch.from_numpy( np.array([self.history[i][2] for i in indice])).to(self.device).float() done_sample = torch.from_numpy( np.array([self.history[i][3] for i in indice])).to(self.device).float() next_state_sample = torch.from_numpy( np.array([self.history[i][4] for i in indice])).to(self.device).float() smooth_sample = torch.from_numpy( np.array([self.history[i][5] for i in indice])).to(self.device).float() future_rewards = self.model_target(next_state_sample) max_reward, _ = torch.max(future_rewards, axis=1) updated_q_values = rewards_sample + self.gamma * max_reward updated_q_values = updated_q_values * ( 1 - done_sample) - done_sample mask = F.one_hot(action_sample.long(), 2).to(self.device).float() q_values = self.model(state_sample) q_action = torch.sum(q_values * mask, axis=1) loss = loss_fn(q_action, updated_q_values) if 'smooth1' in self.mode.split('_') and self.delay < ep: penalty = torch.abs((ep - self.delay) / self.episode * torch.sum(smooth_sample)) loss += penalty optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm(self.model.parameters(), 1.0) optimizer.step() if frame_count % self.update_target_network == 0: self.model_target.load_state_dict(self.model.state_dict()) if done: break episode_reward_history.append(episode_reward) if len(episode_reward_history) > 30: del episode_reward_history[:1] running_reward = np.mean(episode_reward_history) # if ep%500==0: # print("Episode:\t{},\t Avereged reward: {:.2f}\n".format(ep,running_reward)) f.write("Episode:\t{},\t Avereged reward: {:.2f}\n".format( ep, running_reward)) if running_reward > best_reward: best_reward = running_reward torch.save(self.model.state_dict(), self.name + '.pth') f.close()
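# The Agent_DQN above reads history_size, name, mode, delay, continue_epoch and
# test_dqn from `args`, but the argument parser lives outside this excerpt. The
# parser below is a sketch that would satisfy those attribute accesses; the
# defaults are placeholders, not values from the original project.
import argparse


def build_parser():
    parser = argparse.ArgumentParser(description="DQN agent arguments (sketch)")
    parser.add_argument("--history_size", type=int, default=10000,
                        help="replay memory capacity")
    parser.add_argument("--name", type=str, default="dqn_run",
                        help="prefix for the .pth checkpoint and the .txt log")
    parser.add_argument("--mode", type=str, default="plain",
                        help="underscore-separated flags, e.g. 'prioritized_smooth1'")
    parser.add_argument("--delay", type=int, default=0,
                        help="episode after which the smoothness penalty is applied")
    parser.add_argument("--continue_epoch", type=int, default=0,
                        help="resume training from this episode")
    parser.add_argument("--test_dqn", action="store_true",
                        help="load a trained model instead of training")
    return parser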
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function, i.e. build the model. ###### model = DQN(in_channels=input_arg, num_actions=num_actions) target_Q = DQN(in_channels=input_arg, num_actions=num_actions) if USE_CUDA: target_Q = target_Q.cuda() model = model.cuda() ###### # Construct Q network optimizer function optimizer = optimizer_spec.constructor(model.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) 
into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) ##### OUR CODE idx = replay_buffer.store_frame(last_obs) encoded_obs = replay_buffer.encode_recent_observation() if t > learning_starts: action = select_epilson_greedy_action(model, encoded_obs, t)[0] else: action = random.randrange(num_actions) obs, reward, done, info = env.step(action) reward = max(-1.0, min(reward, 1.0)) replay_buffer.store_effect(idx, action, reward, done) if done: obs = env.reset() last_obs = obs ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) act_batch = Variable( torch.Tensor(act_batch).type(torch.LongTensor)) rew_batch = Variable(torch.from_numpy(rew_batch)) done_mask = Variable( torch.Tensor([1. if val == 0 else 0. for val in done_mask])) if USE_CUDA: done_mask = done_mask.cuda() act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() obs_batch = obs_batch.cuda() next_obs_batch = next_obs_batch.cuda() # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. 
# Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # We choose Q based on action taken. current_Q_values = model(obs_batch).gather( 1, act_batch.unsqueeze(1)) #[0, act_batch] # 5. Obtain maxQ' and set our target value for chosen action using the bellman equation. next_max_q = target_Q(next_obs_batch).detach().max(1)[0] next_Q_values = torch.mul(done_mask, next_max_q) target_Q_values = rew_batch + (gamma * next_Q_values) if USE_CUDA: target_Q_values = target_Q_values.cuda() d_error = target_Q_values.unsqueeze(1) - current_Q_values d_error = d_error.clamp(-1, 1) * -1 # 3.c: train the model. To do this, use the bellman error you calculated perviously. # Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. optimizer.zero_grad() current_Q_values.backward(d_error) optimizer.step() num_param_updates += 1 # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. # you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(model.state_dict()) # YOUR CODE HERE ##### ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if hasattr(exploration, 'add_reward'): exploration.add_reward(episode_rewards) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
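# dqn_learing above backpropagates the clipped TD error directly through
# current_Q_values.backward(d_error). For comparison only (not part of the
# original code), the same gradient can be produced with a smooth L1 (Huber)
# loss: with reduction='sum', its derivative w.r.t. the current Q-values is the
# TD error clamped to [-1, 1], i.e. exactly the d_error computed above.
import torch.nn.functional as F


def huber_q_update(current_q, target_q, optimizer):
    # current_q: (B, 1) Q(s, a) for the taken actions; target_q: (B,) Bellman targets.
    loss = F.smooth_l1_loss(current_q, target_q.unsqueeze(1).detach(),
                            reduction='sum')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss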
class Agent_DQN(): def __init__(self, env, args): # Parameters for q-learning super(Agent_DQN, self).__init__() self.env = env state = env.reset() state = state.transpose(2, 0, 1) self.policy_net = DQN(state.shape, self.env.action_space.n) # Behavior Q self.target_net = DQN(state.shape, self.env.action_space.n) # Target Q self.target_net.load_state_dict(self.policy_net.state_dict()) #Initial Q if USE_CUDA: print("Using CUDA . . . ") self.policy_net = self.policy_net.cuda() self.target_net = self.target_net.cuda() print('hyperparameters and network initialized') if args.test_dqn or LOAD == True: print('loading trained model') checkpoint = torch.load('trainData') self.policy_net.load_state_dict(checkpoint['model_state_dict']) self.target_net.load_state_dict(self.policy_net.state_dict()) def init_game_setting(self): print('loading trained model') checkpoint = torch.load('trainData') self.policy_net.load_state_dict(checkpoint['model_state_dict']) def push(self, state, action, reward, next_state, done): state = np.expand_dims(state, 0) next_state = np.expand_dims(next_state, 0) memory.append((state, action, reward, next_state, done)) def replay_buffer(self): state, action, reward, next_state, done = zip( *random.sample(memory, batch_size)) return np.concatenate(state), action, reward, np.concatenate( next_state), done def __len__(self): return len(self.buffer) def make_action(self, observation, test=True): observation = observation.transpose(2, 0, 1) if np.random.random() > EPSILON or test == True: observation = Variable(torch.FloatTensor( np.float32(observation)).unsqueeze(0), volatile=True) q_value = self.policy_net.forward(observation) action = q_value.max(1)[1].data[0] action = int(action.item()) else: action = random.randrange(4) return action def optimize_model(self): states, actions, next_states, rewards, dones = self.replay_buffer() states_v = Variable(torch.FloatTensor(np.float32(states))) next_states_v = Variable(torch.FloatTensor(np.float32(next_states)), volatile=True) actions_v = Variable(torch.LongTensor(actions)) rewards_v = Variable(torch.FloatTensor(rewards)) done = Variable(torch.FloatTensor(dones)) state_action_values = self.policy_net(states_v).gather( 1, actions_v.unsqueeze(1)).squeeze(1) next_state_values = self.target_net(next_states_v).max(1)[0] expected_q_value = rewards_v + next_state_values * GAMMA * ( 1 - done) #+ rewards_v loss = (state_action_values - Variable(expected_q_value.data)).pow(2).mean() return loss def train(self): optimizer = optim.Adam(self.policy_net.parameters(), lr=ALPHA) # Fill the memory with experiences print('Gathering experiences ...') meanScore = 0 AvgRewards = [] AllScores = [] step = 1 iEpisode = 0 while meanScore < 50: state = self.env.reset() done = False EpisodeScore = 0 tBegin = time.time() done = False while not done: action = self.make_action(state) nextState, reward, done, _ = self.env.step(action) self.push(state.transpose(2, 0, 1), action, nextState.transpose(2, 0, 1), reward, done) state = nextState if len(memory) > StartLearning: loss = self.optimize_model() optimizer.zero_grad() loss.backward() optimizer.step() else: iEpisode = 0 continue # Update exploration factor EPSILON = EPS_END + (EPS_START - EPS_END) * math.exp( -1. * step / EPS_DECAY) storeEpsilon.append(EPSILON) step += 1 EpisodeScore += reward if step % TARGET_UPDATE == 0: print('Updating Target Network . . 
.') self.target_net.load_state_dict( self.policy_net.state_dict()) iEpisode += 1 AllScores.append(EpisodeScore) meanScore = np.mean(AllScores[-100:]) AvgRewards.append(meanScore) if len(memory) > StartLearning: print('Episode: ', iEpisode, ' score:', EpisodeScore, ' Avg Score:', meanScore, ' epsilon: ', EPSILON, ' t: ', time.time() - tBegin, ' loss:', loss.item()) else: print('Gathering Data . . .') if iEpisode % 500 == 0: torch.save( { 'epoch': iEpisode, 'model_state_dict': self.policy_net.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': loss, 'AvgRewards': AvgRewards }, 'trainData') os.remove("Rewards.csv") with open('Rewards.csv', mode='w') as dataFile: rewardwriter = csv.writer(dataFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) rewardwriter.writerow(AvgRewards) print('======== Complete ========') torch.save( { 'epoch': iEpisode, 'model_state_dict': self.policy_net.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': loss, 'AvgRewards': AvgRewards }, 'trainData') with open('Rewards.csv', mode='w') as dataFile: rewardwriter = csv.writer(dataFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) rewardwriter.writerow(AvgRewards)
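# The agent above relies on module-level names (memory, batch_size, EPSILON,
# EPS_START, EPS_END, EPS_DECAY, GAMMA, ALPHA, TARGET_UPDATE, StartLearning,
# storeEpsilon, USE_CUDA, LOAD) that are defined outside this excerpt. The block
# below sketches those definitions; the names come from the code, the concrete
# values are placeholders rather than the original project's settings.
import torch
from collections import deque

USE_CUDA = torch.cuda.is_available()
LOAD = False                    # resume from the 'trainData' checkpoint if True

batch_size = 32
GAMMA = 0.99
ALPHA = 1e-4                    # Adam learning rate
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.02, 100000
EPSILON = EPS_START
TARGET_UPDATE = 10000           # steps between target-network syncs
StartLearning = 10000           # transitions to collect before optimization starts

memory = deque(maxlen=100000)   # replay memory shared by push()/replay_buffer()
storeEpsilon = []               # record of the exploration rate over time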
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # self.memory = [] self.env = env self.n_actions = env.env.action_space.n self.policy_net = DQN(4, self.n_actions).to(device).float() self.target_net = DQN(4, self.n_actions).to(device).float() # self.policy_net.load_state_dict(torch.load("best_weights_model.pt")) self.target_net.load_state_dict(self.policy_net.state_dict()) self.eps_threshold = EPS_START self.args = args self.test_count = 0 self.max_reward = 0 self.max_reward_so_far = 0 self.reward_buffer = [] self.flag = 0 self.steps_done = 0 # self.target_net.eval() self.transition = [] self.test_mean_reward = 0 self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE) if args.test_dqn: #you can load your model here print('loading trained model') ########################### # YOUR IMPLEMENTATION HERE # self.policy_net.load_state_dict( torch.load("best_weights_model.pt", map_location=torch.device('cpu'))) def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # # self.env.reset() ########################### pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # observation = np.transpose(observation, (2, 0, 1)) if not test: # print("Helllo") # global steps_done sample = random.random() self.eps_threshold = self.eps_threshold - (EPS_START - EPS_END) / EPS_DECAY if self.eps_threshold < EPS_END: self.eps_threshold = EPS_END # print("Steps after increment ", self.steps_done) if sample > self.eps_threshold: with torch.no_grad(): q_sa = self.policy_net( torch.from_numpy(observation).unsqueeze(0).to(device)) index = torch.argmax(q_sa.data, dim=1).item() return index else: return np.random.randint(0, self.n_actions) else: q_sa = self.policy_net( torch.from_numpy(observation).unsqueeze(0).to(device)) index = torch.argmax(q_sa.data, dim=1).item() return index ########################### # return action def push(self): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # if len(self.memory) >= 50000: self.memory.pop(0) self.memory.append(self.transition) # if(len(self.memory)%500==0 or len(self.memory)>= 50000): # print("Memory size : ", len(self.memory)) ########################### def replay_buffer(self, batch_size): """ You can add additional arguments as you need. Select batch from buffer. 
""" ########################### # YOUR IMPLEMENTATION HERE # transitions = random.sample(self.memory, BATCH_SIZE) batch = Transition(*zip(*transitions)) ########################### return batch def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # # reward_buffer = deque([]) current_loss = 0.0 mean_reward = 0.0 for i_episode in range(NUM_EPISODES): # Initialize the environment and state # self.env.reset() # last_screen = get_screen() # current_screen = get_screen() state = self.env.reset() # state = np.transpose(state,(2,0,1)) #New # state = torch.tensor([state]) episode_Reward = 0.0 for t in range(EPISODE_STEP_LIMIT): # Render here # self.env.env.render() self.steps_done += 1 action = self.make_action(state, False) # 'Transition',('state', 'action', 'next_state', 'reward', 'done')) next_state, reward, done, _ = self.env.step(action) episode_Reward += reward state = np.transpose(state, (2, 0, 1)) #New next_state = np.transpose(next_state, (2, 0, 1)) self.transition = (state, action, next_state, reward, done) self.push() # Move to the next state state = next_state # self.env.render() # Update the target network, copying all weights and biases in DQN # print("Steps : ",steps_done) if self.steps_done % TARGET_UPDATE == 0: print("**********Updating Target********") self.target_net.load_state_dict( self.policy_net.state_dict()) # Perform one step of the optimization (on the target network) # optimize step start # print("Memory Size", len(self.memory)) # print("Completed 10,000 steps") if len(self.memory) > 10000 and len(self.memory) % 4 == 0: if self.flag == 0: print("Crossed 10000") self.flag = 1 batch = self.replay_buffer(BATCH_SIZE) # 'Transition',('state', 'action', 'next_state', 'reward', 'done')) state_batch = torch.from_numpy(np.asarray(batch[0])) action_batch = torch.from_numpy(np.asarray(batch[1])) next_state_batch = torch.from_numpy(np.asarray(batch[2])) reward_batch = torch.from_numpy(np.asarray( batch[3])).to(device) done_batch = torch.from_numpy(np.asarray( batch[4])).to(device) state_action_values = self.policy_net( state_batch.to(device)).gather( 1, action_batch[:, None].to(device)).squeeze(1) q_max = self.target_net( next_state_batch.to(device)).max(1)[0].detach() q_max[done_batch] = 0 expected_state_action_values = ( q_max) * GAMMA + reward_batch #print (state_action_values.double().size()) #print (expected_state_action_values.double().size()) loss = F.smooth_l1_loss( state_action_values.double(), expected_state_action_values.double()) current_loss = loss # print("Episode : ", i_episode, ", iteration : ",t, " Loss : ", current_loss, " Steps : ", steps_done," Epsilon : ", self.eps_threshold, " Mean Reward : ", mean_reward) #optimze the model self.optimizer.zero_grad() loss.backward() self.optimizer.step() if done: if len(self.reward_buffer) >= REWARD_BUFFER_SIZE: self.reward_buffer.pop(0) self.reward_buffer.append(episode_Reward) mean_reward = np.mean(self.reward_buffer) break if (i_episode % 500 == 0): env2 = env('BreakoutNoFrameskip-v4', self.args, atari_wrapper=True, test=True) test(self, env2, total_episodes=100) writer.add_scalar('Test Mean Reward', self.test_mean_reward, i_episode) if self.test_mean_reward > self.max_reward_so_far: torch.save(self.policy_net.state_dict(), "best_weights_model.pt") self.max_reward_so_far = self.test_mean_reward writer.add_scalar('Train Mean Reward', mean_reward, i_episode) writer.add_scalar('Training LOSS', current_loss, i_episode) # To calculate mean reward if i_episode % 
100 == 0: # print("*****************") print("TRAIN Mean Reward after ", i_episode, " episodes is ", mean_reward, " Epsilon ", self.eps_threshold) if i_episode % 500 == 0: torch.save(self.policy_net.state_dict(), "saved_model.pt") print("Saved Model after ", i_episode, " episodes") self.env.env.close() writer.close()
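# replay_buffer above turns a list of sampled transitions into one batch with
# Transition(*zip(*transitions)); the namedtuple itself is defined outside this
# excerpt. The sketch below is consistent with the (state, action, next_state,
# reward, done) order used when the batch is indexed as batch[0]..batch[4] in
# train().
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

# Transition(*zip(*transitions)) transposes per-step tuples into per-field tuples:
steps = [Transition(1, 0, 2, 1.0, False), Transition(2, 1, 3, 0.0, True)]
batch = Transition(*zip(*steps))
assert batch.state == (1, 2) and batch.done == (False, True)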
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize every things you need here. For example: building your model """ super(Agent_DQN, self).__init__(env) self.env = env self.args = args self.episode = 0 self.n_actions = self.env.action_space.n self.epsilon_start = 1.0 self.epsilon_final = 0.025 self.epsilon_decay = 3000 self.epsilon_by_frame = lambda frame_idx: self.epsilon_final + ( self.epsilon_start - self.epsilon_final) * math.exp( -1. * frame_idx / self.epsilon_decay) self.epsilon = 0 self.eval_net = DQN().cuda() self.target_net = DQN().cuda() self.target_net.load_state_dict(self.eval_net.state_dict()) self.criterion = nn.MSELoss() #self._model = Net(self.env.observation_space.shape, self.env.action_space.n) self._use_cuda = torch.cuda.is_available() self.optim = torch.optim.Adam(self.eval_net.parameters(), lr=self.args.learning_rate) if self._use_cuda: self.eval_net = self.eval_net.cuda() self.target_net = self.target_net.cuda() self.criterion = self.criterion.cuda() # self.replaybuffer = ReplayBuffer(args.buffer_size) self.buffer = deque(maxlen=10000) if args.test_dqn: #you can load your model here print('loading trained model') self.eval_net.load_state_dict(torch.load(args.model_dqn)) self.target_net.load_state_dict(self.eval_net.state_dict()) if self._use_cuda: self.eval_net = self.eval_net.cuda() self.target_net = self.target_net.cuda() ################## # YOUR CODE HERE # ################## def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary """ ################## # YOUR CODE HERE # ################## pass def push(self, state, action, reward, next_state, done): state = np.expand_dims(state, 0) next_state = np.expand_dims(next_state, 0) self.buffer.append((state, action, reward, next_state, done)) def replay_buffer(self, batch_size): state, action, reward, next_state, done = zip( *random.sample(self.buffer, batch_size)) return np.concatenate(state), action, reward, np.concatenate( next_state), done def train(self): """ Implement your training algorithm here """ ################## # YOUR CODE HERE # ################## print('begin train...') # if self.args.log_file is not None: # fp_log = open(self.args.log_file, 'w', buffering=1) fout = open('dqn_score.log', 'w') if os.path.exists('model') == False: os.makedirs('model') losses = [] all_rewards = [] episode_reward = 0 best_mean_reward = 0 state = self.env.reset() for i_step in range(self.args.max_steps): self.epsilon = self.epsilon_by_frame(i_step) action = self.make_action(state) next_state, reward, done, _ = self.env.step(action) self.push(state, action, reward, next_state, done) state = next_state episode_reward += reward if done: state = self.env.reset() all_rewards.append(episode_reward) self.episode += 1 print('{},{}'.format(self.episode, episode_reward)) fout.write('Episode{},episode_reward{}\n'.format( self.episode, episode_reward)) episode_reward = 0 if len(self.buffer) == self.args.buffer_size: if i_step % self.args.eval_net_update_step == 0: loss = self.optimize_model() losses.append(loss) if i_step % self.args.target_net_update_step == 0: self.target_net.load_state_dict(self.eval_net.state_dict()) if i_step % self.args.save_freq == 0: mean_reward = \ sum(all_rewards[-100:]) / 100 if best_mean_reward < mean_reward: print('save best model with mean reward = %f' % mean_reward) best_mean_reward = mean_reward torch.save(self.eval_net.state_dict(), self.args.model_dqn) def make_action(self, observation, 
test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ################## # YOUR CODE HERE # ################## observation = torch.cuda.FloatTensor( observation.reshape((1, 84, 84, 4))).transpose(1, 3).transpose(2, 3) # print(type(observation)) Q_value = self.eval_net.forward(observation).data.cpu().numpy() if random.random() > self.epsilon: action = np.argmax(Q_value) else: action = self.env.get_random_action() return action def optimize_model(self): state, action, reward, next_state, done = self.replay_buffer( self.args.batch_size) state = torch.FloatTensor(np.float32(state)).permute(0, 3, 1, 2) next_state = torch.FloatTensor(np.float32(next_state)).permute( 0, 3, 1, 2) action = torch.LongTensor(action) reward = torch.FloatTensor(reward) done = torch.ByteTensor(done) if self._use_cuda: state = state.cuda() next_state = next_state.cuda() action = action.cuda() reward = reward.cuda() done = done.cuda() q_values = self.eval_net(state) # next_q_values = self.target_net(next_state).detach() q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) next_q_values = self.target_net(next_state).detach() next_q_value = next_q_values.max(1)[0] expected_q_value = reward + self.args.gamma * next_q_value * (1 - done) loss = self.criterion(q_value, expected_q_value.data) self.optim.zero_grad() loss.backward() self.optim.step() return loss
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.env = env self.buffer = ReplayBuffer() self.num_action = self.env.get_action_space().n self.cur_step = 0 self.greedyPolicy = EpsilonGreedyStrategy(1, 0.025, 0.01) self.policy_net = DQN().to(self.device) self.target_net = DQN().to(self.device) self.num_episode = args.num_episode self.learning_rate = args.learning_rate self.sample_batch_size = args.sample_batch_size self.gamma = args.gamma self.e = 1 if args.test_dqn: # you can load your model here print('loading trained model') ########################### # YOUR IMPLEMENTATION HERE # def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # ########################### pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # if test: with torch.no_grad(): action = self.policy_net(observation).argmax(dim=1).item() else: if self.e > random.random(): action = random.randrange(self.num_action) else: observation = self.transform(observation) with torch.no_grad(): action = self.policy_net(observation).argmax(dim=1).item() self.e -= self.greedyPolicy.get_exploration_rate() ########################### return action def push(self,experience): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # self.buffer.append(experience) ########################### def replay_buffer(self, batch_size=32): """ You can add additional arguments as you need. Select batch from buffer. """ ########################### # YOUR IMPLEMENTATION HERE # experience = self.buffer.sample(batch_size) ########################### return experience def transform(self, state): state = np.asarray(state) / 255. state = torch.tensor(state) state = state.unsqueeze(0) state = state.permute(0, 3, 1, 2) state = state.to(device=self.device, dtype=torch.float) return state def extract_tensors(self, experiences): batch = Experience(*zip(*experiences)) t1 = batch.state t2 = batch.action t3 = batch.next_state t4 = batch.reward t5 = batch.termination return t1, t2, t3, t4, t5 def get_current_q(self, states, actions): states = np.asarray(states) / 255. a = np.count_nonzero(states) states = torch.tensor(states, device=self.device, dtype=torch.float) states = states.permute(0, 3, 1, 2) actions = torch.tensor(np.asarray(actions), device=self.device, dtype=torch.long).unsqueeze(-1) QS = self.policy_net(states).gather(1, actions)#.requires_grad_(True) QS = QS.permute(1, 0) return QS[0] def get_next_q(self, next_states, terminations): next_states = np.asarray(next_states) / 255. 
next_states = torch.tensor(next_states,device=self.device, dtype=torch.float) next_states = next_states.permute(0, 3, 1, 2) QS = self.target_net(next_states).max(1)[0].detach()#.requires_grad_(True) QS = QS * torch.tensor(terminations, device=self.device, dtype=torch.float, requires_grad= True) return QS def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() optimizor = optim.Adam(params=self.policy_net.parameters(), lr = self.learning_rate, betas = (0.5, 0.999)) max_reward = -1 rewards_sum = 0 reward_collection = [] episode_collection = [] print(self.device) for episode in range(self.num_episode): done = False state = self.env.reset() while not done: action = self.make_action(state, False) next_state, reward, done, info = self.env.step(action) self.push( Experience(state, action, next_state, reward, (not done) ) ) rewards_sum += reward state = next_state if self.buffer.can_sample(): experiences = self.buffer.sample(self.sample_batch_size) states, actions, next_states, rewards, terminations = self.extract_tensors(experiences) current_q = self.get_current_q(states, actions) next_q = self.get_next_q(next_states, terminations) target_q = self.gamma * next_q + torch.tensor(rewards, device=self.device, dtype=torch.float) loss = F.smooth_l1_loss(current_q, target_q) optimizor.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) optimizor.step() if episode % 3000 == 0: self.target_net.load_state_dict(self.policy_net.state_dict()) if episode % 30 == 0: print("episode: ", episode, "\t", "average reward :", rewards_sum/30) reward_collection.append(rewards_sum/30) episode_collection.append(episode) if rewards_sum > max_reward: torch.save(self.policy_net.state_dict(), "model/policy_net_max_reward.pth") rewards_sum = 0 if episode%1000 == 0: torch.save(self.policy_net.state_dict(), "model/policy_net.pth") torch.save(self.policy_net.state_dict(), "model/policy_net.pth") x = episode_collection y = reward_collection plt.plot(x,y) plt.show() plt.savefig('episode-reward.png')
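# The agent above assumes an Experience namedtuple, an EpsilonGreedyStrategy with
# a get_exploration_rate() method, and a buffer exposing append()/sample()/
# can_sample(); none of them appear in this excerpt. Below is a sketch of the
# first two. The field names and the call EpsilonGreedyStrategy(1, 0.025, 0.01)
# come from the code above; treating get_exploration_rate() as the per-step
# decrement subtracted from self.e is an assumption.
from collections import namedtuple

Experience = namedtuple('Experience',
                        ('state', 'action', 'next_state', 'reward', 'termination'))


class EpsilonGreedyStrategy:
    """Exploration schedule, constructed above as EpsilonGreedyStrategy(1, 0.025, 0.01)."""

    def __init__(self, start, end, decay):
        self.start = start      # initial epsilon
        self.end = end          # floor for epsilon
        self.decay = decay      # amount subtracted from epsilon per greedy step

    def get_exploration_rate(self):
        return self.decay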
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # self.env = env self.batch_size = BATCH_SIZE self.gamma = 0.999 self.eps_start = EPS_START self.eps_decay = EPS_DECAY self.TARGET_UPDATE = TARGET_UPDATE self.policy_net = DQN(self.env.action_space.n) self.target_net = DQN(self.env.action_space.n) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() if use_cuda: self.policy_net.cuda() self.target_net.cuda() self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-5) self.memory = deque(maxlen=10000) if args.test_dqn: # you can load your model here print('loading trained model') ########################### # YOUR IMPLEMENTATION HERE # def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # ########################### pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # global steps_done self.policy_net.eval() sample = random.random() eps_threshold = EPS_END + (EPS_START - EPS_END) * \ math.exp(-1. * steps_done / EPS_DECAY) steps_done += 1 if sample > eps_threshold: return self.policy_net( Variable(torch.from_numpy(observation), volatile=True).type(FloatTensor)).data.max(1)[1].view( 1, 1) else: return LongTensor([[random.randrange(self.env.action_space.n)]]) ########################### return action def push(self, s, a, r, s_, done): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # self.memory.append((s, a, r, s_, done)) if len(self.memory) > self.maxlen: self.replay_memory_store.popleft() self.memory_counter += 1 ########################### def replay_buffer(self): """ You can add additional arguments as you need. Select batch from buffer. """ ########################### # YOUR IMPLEMENTATION HERE # #print("memory", len(self.memory), self.BATCH_SIZE) minibatch = random.sample(self.memory, self.BATCH_SIZE) minibatch = np.array(minibatch).transpose(0, 3, 1, 2) minibatch = torch.tensor(minibatch / 255.0) ########################### return minibatch def optimize_model(self): if len(self.memory) < BATCH_SIZE: return transitions = self.memory.sample(BATCH_SIZE) # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for # detailed explanation). 
batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements non_final_mask = ByteTensor( tuple(map(lambda s: s is not None, batch.next_state))) non_final_next_states = Variable(torch.cat( [s for s in batch.next_state if s is not None]), volatile=True).cuda() state_batch = Variable(torch.cat(batch.state)).cuda() action_batch = Variable(torch.cat(batch.action)).cuda() reward_batch = Variable(torch.cat(batch.reward)).cuda() # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken self.policy_net.train() state_action_values = self.policy_net(state_batch).gather( 1, action_batch) # Compute V(s_{t+1}) for all next states. next_state_values = Variable( torch.zeros(BATCH_SIZE).type(Tensor)).cuda() next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0] # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch # Undo volatility (which was used to prevent unnecessary gradients) expected_state_action_values = Variable( expected_state_action_values.data).cuda() # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # num_episodes = 1400000 for i_episode in range(num_episodes): # Initialize the environment and state observation = self.env.reset() observation = observation.transpose((2, 0, 1)) observation = observation[np.newaxis, :] state = observation for t in count(): # Select and perform an action action = self.make_action(state) next_state, reward, done, _ = self.env.step(action[0, 0]) next_state = next_state.transpose((2, 0, 1)) next_state = next_state[np.newaxis, :] reward = Tensor([reward]) # Store the transition in memory self.memory.push(torch.from_numpy(state), action, torch.from_numpy(next_state), reward) # Observe new state if not done: state = next_state else: state = None # Perform one step of the optimization (on the target network) self.optimize_model() if done: print( 'resetting env. episode %d \'s reward total was %d.' % (i_episode + 1, t + 1)) break # Update the target network if i_episode % TARGET_UPDATE == 0: self.target_net.load_state_dict(self.policy_net.state_dict()) if i_episode % 50 == 0: checkpoint_path = os.path.join('save_dir', 'model-best.pth') torch.save(self.policy_net.state_dict(), checkpoint_path) print("model saved to {}".format(checkpoint_path))
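# The optimizer step above is written against the pre-0.4 torch API (Variable,
# volatile=True, ByteTensor masks). The function below is a sketch of the same
# masked Bellman update in current PyTorch, assuming the four-field Transition
# used by the class above; it is an illustration, not a drop-in replacement.
import torch
import torch.nn.functional as F
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


def optimize_model_modern(policy_net, target_net, optimizer, transitions, gamma, device):
    batch = Transition(*zip(*transitions))

    # Mask of transitions whose next state is non-terminal (next_state is not None).
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None]).to(device)
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(batch.action).to(device)
    reward_batch = torch.cat(batch.reward).to(device)

    # Q(s_t, a) for the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a Q_target(s_{t+1}, a); zero for terminal next states.
    next_state_values = torch.zeros(len(transitions), device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
    expected_values = reward_batch + gamma * next_state_values

    loss = F.smooth_l1_loss(state_action_values, expected_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1.0)
    optimizer.step()
    return loss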
def ddqn_rank_train(env, exploreScheduler, betaScheduler, optimizer_constructor, model_type, batch_size, rp_start, rp_size, exp_frame, exp_initial, exp_final, prob_alpha, gamma, target_update_steps, frames_per_epoch, frames_per_state, output_directory, last_checkpoint, max_frames, envo): """ Implementation of the training algorithm for DDQN using Rank-based prioritization. Information with regards to the algorithm can be found in the paper, "Prioritized Experience Replay" by Tom Schaul, John Quan, Ioannis Antonoglou and David Silver. Refer to section 3.3 in the paper for more info. """ gym.undo_logger_setup() logging.basicConfig(filename=envo + '_' + 'ddqn_rank_weighted_training.log', level=logging.INFO) num_actions = env.action_space.n env.reset() print('No. of actions: ', num_actions) print(env.unwrapped.get_action_meanings()) # initialize action value and target network with the same weights model = DQN(num_actions) target = DQN(num_actions) if use_cuda: model.cuda() target.cuda() frames_count = 1 if last_checkpoint: model.load_state_dict(torch.load(last_checkpoint)) print(last_checkpoint) print('weights loaded...') #TODO: Implementation of resume # exp_replay = util.initialize_rank_replay_resume(env, rp_start, rp_size, frames_per_state, # model, target, gamma, batch_size) # frames_count = get_index_from_checkpoint_path(last_checkpoint) else: exp_replay = util.initialize_rank_replay(env, rp_start, rp_size, frames_per_state, model, target, gamma, prob_alpha) target.load_state_dict(model.state_dict()) optimizer = optimizer_constructor.type( model.parameters(), lr=optimizer_constructor.kwargs['lr'], alpha=optimizer_constructor.kwargs['alpha'], eps=optimizer_constructor.kwargs['eps']) episodes_count = 1 epsiodes_durations = [] rewards_per_episode = 0 rewards_duration = [] loss_per_epoch = [] current_state, _, _, _ = util.play_game(env, frames_per_state) wLoss_func = Weighted_Loss() print('Starting training...') for frames_count in range(1, max_frames): epsilon = exploreScheduler.anneal_linear(frames_count) beta = betaScheduler.anneal_linear(frames_count) choice = random.uniform(0, 1) # epsilon greedy algorithm if choice <= epsilon: action = LongTensor([[random.randrange(num_actions)]]) else: action = util.get_greedy_action(model, current_state) curr_obs, reward, done, _ = util.play_game(env, frames_per_state, action[0][0]) rewards_per_episode += reward reward = Tensor([[reward]]) td_error = 1 temp_exp = Experience(current_state, action, reward, curr_obs, td_error) current_state = curr_obs # compute y if len(exp_replay) >= batch_size: # Get batch samples # start = time.time() if frames_count % rp_size == 0: obs_samples, obs_priorityVals = exp_replay.sample(batch_size - 1, prob_alpha, sort=True) else: obs_samples, obs_priorityVals = exp_replay.sample(batch_size - 1, prob_alpha, sort=False) obs_samples.append(temp_exp) obs_priorityVals.append(td_error) obs_pVals_tensor = torch.from_numpy(np.array(obs_priorityVals)) # print("P(i): ", obs_pVals_tensor) IS_weights = torch.pow((obs_pVals_tensor * rp_size), -beta) max_weight = torch.max(IS_weights) IS_weights_norm = torch.div(IS_weights, max_weight).type(Tensor) IS_weights_norm[-1] = torch.max(IS_weights_norm) # print("Norm W(i): ", IS_weights_norm) batch = Experience(*zip(*obs_samples)) loss, new_weights = ddqn_compute_y(batch, batch_size, model, target, gamma, IS_weights_norm, wLoss_func) new_weights = torch.pow(new_weights, prob_alpha) new_exp = Experience(temp_exp.state, temp_exp.action, temp_exp.reward, temp_exp.next_state, 
new_weights[batch_size - 1]) exp_replay.update(obs_samples, new_weights, new_exp) optimizer.zero_grad() loss.backward() # print("loss: ", loss.data) optimizer.step() loss_per_epoch.append(loss.data.cpu().numpy()[0]) else: exp_replay.push(new_exp.state, new_exp.action, new_exp.reward, new_exp.next_state, td_error) # end = time.time() # duration = end-start # print('duration : ', duration) if done: # print('Game: ', rewards_per_episode) rewards_duration.append(rewards_per_episode) rewards_per_episode = 0 episodes_count += 1 env.reset() current_state, _, _, _ = util.play_game(env, frames_per_state) if episodes_count % 100 == 0: avg_episode_reward = sum(rewards_duration) / 100.0 avg_reward_content = 'Episode from', episodes_count - 99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum( loss_per_epoch) print(avg_reward_content) logging.info(avg_reward_content) rewards_duration = [] loss_per_epoch = [] # update weights of target network for every TARGET_UPDATE_FREQ steps if frames_count % target_update_steps == 0: target.load_state_dict(model.state_dict()) #Save weights every 250k frames if frames_count % 250000 == 0: util.make_sure_path_exists(output_directory + '/' + envo + '/') torch.save( model.state_dict(), output_directory + '/' + envo + '/rank_uniform' + str(frames_count) + '.pth') #Print frame count and sort experience replay for every 1000000 (one million) frames: if frames_count % 1000000 == 0: training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon print(training_update) logging.info(training_update)
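# The loop above forms importance-sampling weights as (P(i) * N)^(-beta) and
# normalizes them by the largest weight, following "Prioritized Experience
# Replay" (Schaul et al.). The helper below is a small standalone NumPy sketch
# of that computation, for reference only.
import numpy as np


def importance_weights(probabilities, buffer_size, beta):
    # w_i = (N * P(i)) ** (-beta), divided by max_j w_j so every weight is <= 1
    # and the weights only scale the loss down, never up.
    p = np.asarray(probabilities, dtype=np.float64)
    w = np.power(buffer_size * p, -beta)
    return w / w.max()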
def ddqn_rankWeight_train(env, exploreScheduler, betaScheduler, optimizer_constructor, model_type, batch_size, rp_start, rp_size, exp_frame, exp_initial, exp_final, prob_alpha, gamma, target_update_steps, frames_per_epoch, frames_per_state, output_directory, last_checkpoint, max_frames, envo): """ Implementation of the training algorithm for DDQN using Rank-based prioritization. Information with regards to the algorithm can be found in the paper, "Prioritized Experience Replay" by Tom Schaul, John Quan, Ioannis Antonoglou and David Silver. Refer to section 3.3 in the paper for more info. """ gym.undo_logger_setup() logging.basicConfig(filename=envo+'_'+'ddqn_rank_weightedLoss_training.log',level=logging.INFO) num_actions = env.action_space.n env.reset() print('No. of actions: ', num_actions) print(env.unwrapped.get_action_meanings()) # initialize action value and target network with the same weights model = DQN(num_actions) target = DQN(num_actions) if use_cuda: model.cuda() target.cuda() frames_count = 1 if last_checkpoint: model.load_state_dict(torch.load(last_checkpoint)) print(last_checkpoint) print('weights loaded...') #TODO: Implementation of resume # exp_replay = util.initialize_rank_replay_resume(env, rp_start, rp_size, frames_per_state, # model, target, gamma, batch_size) # frames_count = get_index_from_checkpoint_path(last_checkpoint) else: exp_replay = util.initialize_rank_replay(env, rp_start, rp_size, frames_per_state, model, target, gamma, prob_alpha) target.load_state_dict(model.state_dict()) optimizer = optimizer_constructor.type(model.parameters(), lr=optimizer_constructor.kwargs['lr'], alpha=optimizer_constructor.kwargs['alpha'], eps=optimizer_constructor.kwargs['eps'] ) episodes_count = 1 frames_per_episode = 1 epsiodes_durations = [] rewards_per_episode = 0 rewards_duration = [] loss_per_epoch = [] wLoss_func = Weighted_Loss() current_state, _, _, _ = util.play_game(env, frames_per_state) print('Starting training...') for frames_count in range(1, max_frames): epsilon=exploreScheduler.anneal_linear(frames_count) beta = betaScheduler.anneal_linear(frames_count) choice = random.uniform(0,1) # epsilon greedy algorithm if choice <= epsilon: action = LongTensor([[random.randrange(num_actions)]]) else: action = util.get_greedy_action(model, current_state) curr_obs, reward, done, _ = util.play_game(env, frames_per_state, action[0][0]) rewards_per_episode += reward reward = Tensor([[reward]]) current_state_ex = Variable(current_state, volatile=True) curr_obs_ex = Variable(curr_obs, volatile=True) action_ex = Variable(action, volatile=True) reward_ex = Variable(reward, volatile=True) #compute td-error for one sample td_error = ddqn_compute_td_error(batch_size=1, state_batch=current_state_ex, reward_batch=reward_ex, action_batch=action_ex, next_state_batch=curr_obs_ex, model=model, target=target, gamma=gamma) td_error = torch.pow(torch.abs(td_error)+1e-8, prob_alpha) exp_replay.push(current_state, action, reward, curr_obs, td_error) current_state = curr_obs # compute y if len(exp_replay) >= batch_size: # Get batch samples obs_samples, obs_ranks, obs_priorityVals = exp_replay.sample(batch_size) num_samples_per_batch = len(obs_samples) obs_priorityTensor = torch.from_numpy(np.array(obs_priorityVals)) p_batch = 1/ obs_priorityTensor w_batch_raw = (1/len(exp_replay) * p_batch)**beta max_weight = exp_replay.get_max_weight(beta) w_batch = w_batch_raw/max_weight w_batch = w_batch.type(Tensor) batch = Experience(*zip(*obs_samples)) loss, new_weights = ddqn_compute_y(batch, 
num_samples_per_batch, model, target, gamma, w_batch, wLoss_func) loss_abs = torch.abs(new_weights) exp_replay.update(obs_ranks, loss_abs) optimizer.zero_grad() loss.backward() for param in model.parameters(): param.grad.data.clamp_(-1,1) optimizer.step() loss_per_epoch.append(loss.data.cpu().numpy()[0]) frames_per_episode+= frames_per_state if done: rewards_duration.append(rewards_per_episode) rewards_per_episode = 0 frames_per_episode=1 episodes_count+=1 env.reset() current_state, _, _, _ = util.play_game(env, frames_per_state) if episodes_count % 100 == 0: avg_episode_reward = sum(rewards_duration)/100.0 avg_reward_content = 'Episode from', episodes_count-99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum(loss_per_epoch) print(avg_reward_content) logging.info(avg_reward_content) rewards_duration = [] loss_per_epoch = [] # update weights of target network for every TARGET_UPDATE_FREQ steps if frames_count % target_update_steps == 0: target.load_state_dict(model.state_dict()) # sort memory replay every half of it's capacity iterations if frames_count % int(rp_size/2) == 0: exp_replay.sort() #Save weights every 250k frames if frames_count % 250000 == 0: util.make_sure_path_exists(output_directory+'/'+envo+'/') torch.save(model.state_dict(), output_directory+'/'+envo+'/rank_weightedLoss_'+ str(frames_count)+'.pth') #Print frame count and sort experience replay for every 1000000 (one million) frames: if frames_count % 1000000 == 0: training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon print(training_update) logging.info(training_update)
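Because this trainer stores |TD error|^alpha as the priority and periodically sorts the replay by it, the effective sampling distribution is the rank-based one from section 3.3 of Schaul et al. A small, self-contained sketch of that distribution; the helper is hypothetical and only illustrates P(k) proportional to (1/rank k)^alpha.

import numpy as np

def rank_based_probabilities(td_errors, alpha):
    order = np.argsort(-np.abs(td_errors))        # rank 1 = largest |TD error|
    ranks = np.empty_like(order)
    ranks[order] = np.arange(1, len(td_errors) + 1)
    probs = (1.0 / ranks) ** alpha
    return probs / probs.sum()

# toy example: the transition with the largest |TD error| is sampled most often
print(rank_based_probabilities(np.array([0.1, 2.0, 0.5]), alpha=0.7))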
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN,self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # self.epsilon_start = 1 self.epsilon_end = 0.02 self.epsilon_decay = 200000 self.epsilon = self.epsilon_start self.gamma = 0.99 self.env = env self.buffer_size = 30000 self.buffer = deque(maxlen=30000) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.policy_net = DQN().to(self.device) self.target_net = DQN().to(self.device) self.target_net.load_state_dict(self.policy_net.state_dict()) self.optimizer = torch.optim.Adam(self.policy_net.parameters(),lr=0.00015) self.reward_array = [] self.reward_x_axis = [] self.batch_size = 32 if args.test_dqn: #you can load your model here print('loading trained model') ########################### # YOUR IMPLEMENTATION HERE # def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # self.policy_net.load_state_dict(torch.load('policy_model')) self.target_net.load_state_dict(self.policy_net.state_dict()) self.policy_net = self.policy_net.cuda() self.target_net = self.target_net.cuda() ########################### pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # if test==True: self.epsilon = 0 observation=torch.cuda.FloatTensor(observation.reshape((1,84,84,4))).transpose(1,3).transpose(2,3) q = self.policy_net(observation).data.cpu().numpy() if random.random() > self.epsilon: action = np.argmax(q) else: action = random.randint(0,4) return action def push(self,data): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # self.buffer.append(data) ########################### def replay_buffer(self,batch_size): """ You can add additional arguments as you need. Select batch from buffer. 
""" ########################### # YOUR IMPLEMENTATION HERE # ########################### return random.sample(self.buffer,batch_size) def play_game(self,start_state): action = self.make_action(start_state) n_s,r,terminal,_ = self.env.step(action) self.push((start_state,action,r,n_s,terminal)) return n_s,r,terminal def loss_function(self): data = self.replay_buffer(self.batch_size) s,a,r,n_s,terminal = zip(*data) s = torch.FloatTensor(np.float32(s)).permute(0,3,1,2).to(self.device) a = torch.LongTensor(a).to(self.device) r = torch.FloatTensor(r).to(self.device) n_s = torch.FloatTensor(np.float32(n_s)).permute(0,3,1,2).to(self.device).to(self.device) terminal = torch.FloatTensor(terminal).to(self.device) q = self.policy_net(s).gather(1,a.unsqueeze(1)).squeeze(1) n_q = self.target_net(n_s).detach().max(1)[0] expected_q = r + self.gamma * n_q * (1 - terminal) loss = F.smooth_l1_loss(q, expected_q.data) self.optimizer.zero_grad() loss.backward() self.optimizer.step() def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # rewards_array = [] reward_ = 0 best_mean = 0 print_rate = 100 last_saved = None start_state = self.env.reset() for frames in range (3500000): self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. *frames / self.epsilon_decay) n_s,r,terminal = self.play_game(start_state) start_state = n_s reward_ += r if terminal: start_state = self.env.reset() rewards_array.append(reward_) if len(rewards_array) % print_rate==0: print('%%%%%%%%%%%%%%%%%%%%%%%%%') print('Frames = ', frames) print('Current Epsilon = ', self.epsilon) print('Episode = ', len(rewards_array)) print('Reward = ', np.mean(rewards_array[-100:]))#sum(rewards_array[-100:]) / 100) print('Buffer Length = ', len(self.buffer)) self.reward_array.append(np.mean(rewards_array[-100:])) self.reward_x_axis.append(len(rewards_array)) self.print_graph() if last_saved != None: print("last saved = ", best_mean) print('%%%%%%%%%%%%%%%%%%%%%%%%%') reward_ = 0 if len(self.buffer)<10000: continue if len(self.buffer) > 10000 and frames % 4 ==0: self.loss_function() if frames % 1000 == 0: print("Target net updated") self.target_net.load_state_dict(self.policy_net.state_dict()) mean_reward = np.mean(rewards_array[-100:]) if mean_reward > best_mean and frames % 100==0: print("Saving model with reward = ", mean_reward) best_mean = mean_reward last_saved = mean_reward torch.save(self.policy_net.state_dict(), 'policy_model_') ########################### def print_graph(self): fig = plt.figure() ax = plt.subplot(111) ax.plot(self.reward_x_axis,self.reward_array,label='$y = Rewards, $x = episodes') ax.legend() fig.savefig('plot.png')
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN,self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # self.epochs = 10 self.n_episodes = 1000000 self.env = env self.nA = self.env.action_space.n # self.nS = self.env.observation_space self.batch_size = 32 self.DQN = DQN() self.Target_DQN = DQN() self.buffer_memory = 1000000 self.train_buffer_size = 4 self.min_buffer_size = 10000 self.target_update_buffer = 10000 self.learning_rate = 0.0001 self.discount_factor = 0.999 self.epsilon = 1 self.min_epsilon = 0.01 # self.decay_rate = 0.999 self.ep_decrement = (self.epsilon - self.min_epsilon)/self.n_episodes self.criteria = nn.MSELoss() self.optimiser = optim.Adam(self.DQN.parameters(),self.learning_rate) self.buffer=[] self.Evaluation = 100000 self.total_evaluation__episodes = 100 self.full_train = 100000 if args.test_dqn: #you can load your model here print('loading trained model') ########################### # YOUR IMPLEMENTATION HERE # def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # obs = self.env.reset() ########################### pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # if not test: p = random.random() if p < self.epsilon: action = np.random.randint(0,self.nA) else: a = self.DQN(torch.from_numpy(observation).unsqueeze(0)) action = np.argmax(a.detach().numpy()) else: a = self.Target_DQN(torch.from_numpy(observation).unsqueeze(0)) action = np.argmax(a.detach().numpy()) ########################### return action def push(self,episode): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # if len(self.buffer) < self.buffer_memory: self.buffer.append(episode) else: self.buffer.pop(0) self.buffer.append(episode) ########################### def replay_buffer(self): """ You can add additional arguments as you need. Select batch from buffer. 
""" ########################### # YOUR IMPLEMENTATION HERE # batch = random.sample(self.buffer,self.batch_size) # print(np.shape(batch[0][:])) batch = list(zip(*batch)) # print(np.asarray(batch[1])) batch_x = torch.from_numpy(np.asarray(batch[0])) act = torch.from_numpy(np.vstack(batch[1])) rew = torch.from_numpy(np.asarray(batch[2])) dones = torch.from_numpy(np.asarray(batch[3])) batch_y = torch.from_numpy(np.asarray(batch[4])) # print(act.shape) ########################### return batch_x,act,rew,dones,batch_y def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # current = 1 reward_list =[] loss_list= [] current_train = 1 current_target = 1 for x in range(self.n_episodes): obs = np.transpose(self.env.reset(),(2,0,1)) # print(obs[0][40][:]) e_list=[] done = False accumulated_rewards = 0 while not done: # self.env.render() action = self.make_action(obs,False) next_obs,reward,done,info = self.env.step(action) next_obs = np.transpose(next_obs,(2,0,1)) # print(info['ale.lives']) # print(np.shape(e_list[-1])) accumulated_rewards+=reward self.push([obs,action,reward,done,next_obs]) self.epsilon-=self.ep_decrement if current_train % self.train_buffer_size == 0 and len(self.buffer) > self.min_buffer_size: batch_x,act,rew,dones,batch_y = self.replay_buffer() self.optimiser.zero_grad() future_return = self.Target_DQN(batch_y).max(1)[0].detach() * self.discount_factor future_return[dones] = 0 y = rew + future_return c_q = self.DQN(batch_x).gather(1,act) loss = self.criteria(c_q.double(),(y.double()).unsqueeze(1)) loss_list.append(loss.detach()) loss.backward() # self.env.render() self.optimiser.step() current_train = 1 if current_target > self.target_update_buffer: self.Target_DQN.load_state_dict(self.DQN.state_dict()) current_target = 1 if current % self.full_train == 0: # current = 1 # print("\n Weights: \n",list(self.DQN.parameters()),"\r") dataset = my_dataset(self.buffer) for i in range(self.epochs): loader = torch.utils.data.DataLoader(dataset, batch_size = 32, shuffle = True) # print(len(list(loader))) for batch in list(loader): batch_x,act,rew,dones,batch_y=batch self.optimiser.zero_grad() future_return = self.Target_DQN(batch_y).max(1)[0].detach() * self.discount_factor future_return[dones] = 0 y = rew + future_return c_q = self.DQN(batch_x).gather(1,act.unsqueeze(1)) loss = self.criteria(c_q.double(),y.double().unsqueeze(1)) loss_list.append(loss.detach()) loss.backward() self.optimiser.step() if current % self.Evaluation == 0: # current = 1 # print("\n Weights: \n",list(self.DQN.parameters()),"\r") print("\n","#" * 40, "Evaluation number %d"%(current/self.Evaluation),"#" * 40) for i in range(self.total_evaluation__episodes): state = np.transpose(self.env.reset(),(2,0,1)) done = False episode_reward = 0.0 rewards=[] #playing one game while(not done): action = self.make_action(state, test=True) state, reward, done, info = self.env.step(action) episode_reward += reward state = np.transpose(state,(2,0,1)) rewards.append(episode_reward) print('Run %d episodes'%(self.total_evaluation__episodes)) print('Mean:', np.mean(rewards)) print("#" * 40, "Evaluation Ended!","#" * 40,"\n") current+=1 current_train += 1 current_target += 1 obs = next_obs reward_list.append(accumulated_rewards) if len(reward_list) % 200 == 0: reward_list = reward_list[-150:] # print(reward_list) loss_list = loss_list[-150:] if x%100 == 0: print("Current = %d, episode = %d, Average_reward = %0.2f, epsilon = %0.2f"%(current, x+1, np.mean(reward_list[-100:]), 
self.epsilon)) ###########################
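Both update paths in train() above zero the bootstrapped term for terminal transitions before forming the target y = r + gamma * max_a Q_target(s', a). A toy, runnable illustration of that masking; the reward and Q values are made up, and gamma matches the class's discount factor of 0.999.

import torch

gamma = 0.999
rew = torch.tensor([1.0, 0.0, 1.0])
q_next = torch.tensor([2.0, 3.0, 5.0])        # stand-in for Target_DQN(next_states).max(1)[0]
dones = torch.tensor([False, True, False])

future_return = gamma * q_next
future_return[dones] = 0.0                    # no bootstrapping past a terminal state
y = rew + future_return
print(y)                                      # tensor([2.9980, 0.0000, 5.9950])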
class Agent_DQN_Trainer(object): def __init__(self, env, args): # Training Parameters self.args = args self.env = env self.batch_size = args.batch_size self.lr = args.lr self.gamma = args.gamma_reward_decay self.n_actions = env.action_space.n self.output_logs = args.output_logs self.step = 8e6 self.curr_step = 0 self.ckpt_path = args.save_dir self.epsilon = args.eps_start self.eps_end = args.eps_end self.target_update = args.update_target self.observe_steps = args.observe_steps self.explore_steps = args.explore_steps self.saver_steps = args.saver_steps self.resume = args.resume self.writer = TensorboardSummary(self.args.log_dir).create_summary() # Model Settings self.cuda = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') self.policy_net = DQN(4, self.n_actions) self.target_net = DQN(4, self.n_actions) self.target_net.load_state_dict(self.policy_net.state_dict()) if self.cuda: self.policy_net.to(self.cuda) self.target_net.to(self.cuda) self.target_net.eval() train_params = self.policy_net.parameters() self.optimizer = optim.RMSprop(train_params, self.lr, momentum=0.95, eps=0.01) self.memory = ReplayMemory(args.replay_memory_size) if args.resume: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format( args.resume)) checkpoint = torch.load(args.resume) self.epsilon = checkpoint['epsilon'] self.curr_step = checkpoint['step'] self.policy_net.load_state_dict(checkpoint['policy_state_dict']) self.target_net.load_state_dict(checkpoint['target_state_dict']) self.optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['episode'])) def epsilon_greedy_policy(self, observation, nA, test=False): observation = to_float(observation).to(self.cuda) # print("size of observation->"+str(sys.getsizeof(observation.storage()))) sample = random.random() if test: return self.policy_net(observation).max(1)[1].view(1, 1).item() if sample <= self.epsilon: action = torch.tensor([[random.randrange(self.n_actions)]], device=self.cuda, dtype=torch.long) else: with torch.no_grad(): action = self.policy_net(observation).max(1)[1].view(1, 1) return action def optimize_model(self): transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=self.cuda, dtype=torch.bool) non_final_next_states = torch.cat( [to_float(s) for s in batch.next_state if s is not None]) state_batch = torch.cat( [to_float(s).to(self.cuda) for s in batch.state]) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) state_action_values = self.policy_net(state_batch).gather( 1, action_batch) next_state_values = torch.zeros(self.batch_size, device=self.cuda) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() expected_state_action_values = (next_state_values * self.gamma) + reward_batch loss = F.smooth_l1_loss( state_action_values.float(), expected_state_action_values.unsqueeze(1).float()) self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() return loss.item() def train(self): current_loss = 0 train_rewards = [] train_episode_len = 0.0 file_loss = open(self.output_logs, "a") file_loss.write("episode,step,epsilon,reward,loss,length\n") print("Training Started") episode = 0 loss = 0.0 while self.curr_step < self.step: state = to_tensor(self.env.reset()) # * State is 
in torch.uint8 format , convert before passing to model*# done = False episode_reward = 0.0 train_loss = 0 s = 0 # length of episode while not done: # self.env.env.render() action = self.epsilon_greedy_policy(state, self.n_actions) new_state, reward, done, _ = self.env.step( action.item()) # new_state torch.uint8 format new_state, reward = to_tensor(new_state).to( self.cuda), torch.tensor([reward], device=self.cuda) episode_reward += reward self.memory.push(state, action, new_state, reward) if (self.curr_step > self.observe_steps) and ( self.curr_step % self.args.update_current) == 0: loss = self.optimize_model() train_loss += loss print( 'Step: %i, Episode: %i, Action: %i, Reward: %.0f, Epsilon: %.5f, Loss: %.5f' % (self.curr_step, episode, action.item(), reward.item(), self.epsilon, loss), end='\r') if self.curr_step > self.observe_steps and self.curr_step % self.target_update == 0: self.target_net.load_state_dict( self.policy_net.state_dict()) # TO CHECK APPROXIMATELY HOW MUCH GPU MEMORY OUR REPLAY MEMORY IS CONSUMING print(torch.cuda.get_device_name(0)) print('Memory Usage:') print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024**3, 1), 'GB') print('Cached: ', round(torch.cuda.memory_cached(0) / 1024**3, 1), 'GB') if self.epsilon > self.args.eps_end and self.curr_step > self.observe_steps: interval = self.args.eps_start - self.args.eps_end self.epsilon -= interval / float(self.args.explore_steps) self.curr_step += 1 state = new_state s += 1 if self.curr_step % self.args.saver_steps == 0 and episode != 0 and self.curr_step != 0: k = { 'step': self.curr_step, 'epsilon': self.epsilon, 'episode': episode, 'policy_state_dict': self.policy_net.state_dict(), 'target_state_dict': self.target_net.state_dict(), 'optimizer': self.optimizer.state_dict() } filename = os.path.join(self.ckpt_path, 'ckpt.pth.tar') torch.save(k, filename) episode += 1 train_rewards.append(episode_reward.item()) train_episode_len += s if episode % self.args.num_eval == 0 and episode != 0: current_loss = train_loss avg_reward_train = np.mean(train_rewards) train_rewards = [] avg_episode_len_train = train_episode_len / float( self.args.num_eval) train_episode_len = 0.0 file_loss.write( str(episode) + "," + str(self.curr_step) + "," + "{:.4f}".format(self.epsilon) + "," + "{:.2f}".format(avg_reward_train) + "," + "{:.4f}".format(current_loss) + "," + "{:.2f}".format(avg_episode_len_train) + "\n") file_loss.flush() self.writer.add_scalar('train_loss/episode(avg100)', current_loss, episode) self.writer.add_scalar('episode_reward/episode(avg100)', avg_reward_train, episode) self.writer.add_scalar('length of episode/episode(avg100)', avg_episode_len_train, episode) self.writer.add_scalar('train_loss/episode', train_loss, episode) self.writer.add_scalar('episode_reward/episode', episode_reward, episode) self.writer.add_scalar('epsilon/num_steps', self.epsilon, self.curr_step) self.writer.add_scalar('length of episode/episode', s, episode) print("GAME OVER")
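The trainer above keeps epsilon at eps_start during an observation phase and then decreases it by (eps_start - eps_end) / explore_steps per frame until it reaches eps_end. A compact sketch of that schedule; the default values below are hypothetical, since the real ones come from the argparse args.

def linear_epsilon(step, eps_start=1.0, eps_end=0.05,
                   observe_steps=10_000, explore_steps=1_000_000):
    if step <= observe_steps:
        return eps_start
    frac = min(1.0, (step - observe_steps) / explore_steps)
    return eps_start - frac * (eps_start - eps_end)

for step in (0, 10_000, 510_000, 2_000_000):
    print(step, round(linear_epsilon(step), 3))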
class Agent(): """ RL Agent that interacts with a given environment, learns and adapts succesfull behaviour. """ def __init__(self,state_size, action_size ,batch_size,learn_step_size,buffer_size ,gamma , learning_rate, tau ,seed): """ Intialize the agent and its learning parameter set. Parameters ========= state_size (int): Size of the state space action_size (int): Size of the action space batch_size (int): Size of the batch size used in each learning step learn_step_size (int): Number of steps until agent ist trained again buffer_size (int): Size of replay memory buffer gamma (float): Discount rate that scales future discounts learning_rate (float): Learning rate of neural network tau (float): Update strenght between local and target network seed (float): Random set for initialization """ # ----- Parameter init ----- # State and action size from environment self.state_size = state_size self.action_size = action_size # Replay buffer and learning properties self.batch_size = batch_size self.learn_step_size = learn_step_size self.gamma = gamma self.tau = tau # General self.seed = random.seed(seed) # ----- Network and memory init ----- # Init identical NN as local and target networks and set optimizer self.qnetwork_local = DQN(state_size, action_size, seed).to(device) self.qnetwork_target = DQN(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate) # Initialize replay memory and time step (for updating every learn_step_size steps) self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed) self.t_step = 0 def step(self, state, action, reward, next_state, done): """ Append information of past step in memory and trigger learning. Parameters ========== state (array_like): State before action action (array_like): Action that was taken reward (float): Reward for action next_state (array_like): State after action done (bool): Indicator if env was solved after action """ # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every learn_step_size time steps. self.t_step = (self.t_step + 1) % self.learn_step_size if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if self.memory.get_memory_size() > self.batch_size: self.learn() def act(self, state, eps=0.): """Returns actions for given state as per current policy. Parameters ========== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ # Transform state to PyTorch tensor state = torch.from_numpy(state).float().unsqueeze(0).to(device) # Get action scores for state from network self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self): """ Get sample of experience tuples and value parameters target network. 
""" # Get tuples from experience buffer experiences = self.memory.get_sample() states, actions, rewards, next_states, dones = experiences # -----DQN ----- #Optional: to be replaced with Double DQN (see below) #Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # ----- Double DQN ----- # Detach to not update weights during learning # Select maximum value # Unsqueeze to reduce the tensor dimension to one expected_next_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1) # Get Q values for next actions from target Q-network Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, expected_next_actions) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model # Gather values alon an axis specified by dim Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ----- Update target network ----- #Soft update model parameters. #θ_target = τ*θ_local + (1 - τ)*θ_target for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()): target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # #Gym parameters self.num_actions = env.action_space.n # parameters for repaly buffer self.buffer_max_len = 20000 self.buffer = deque(maxlen=self.buffer_max_len) self.episode_reward_list = [] self.moving_reward_avg = [] # paramters for neural network self.batch_size = 32 self.gamma = 0.999 self.eps_threshold = 0 self.eps_start = 1 self.eps_end = 0.025 self.max_expisode_decay = 10000 self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") #Training self.steps_done = 0 self.num_episode = 20000 self.target_update = 5000 self.learning_rate = 1.5e-4 # Neural Network self.policy_net = DQN().to(self.device) self.target_net = DQN().to(self.device) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) if args.test_dqn: #you can load your model here print('loading trained model') self.policy_net = torch.load('policy_net.hb5') self.policy_net.eval() ########################### # YOUR IMPLEMENTATION HERE # def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # ########################### pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # with torch.no_grad(): sample = random.random() ## Check if this is the best way to decline observation = torch.tensor(observation, dtype=torch.float, device=self.device).permute( 2, 0, 1).unsqueeze(0) if test: print("testing") return self.policy_net(observation).max(1)[1].item() if sample > self.eps_threshold: #print("Above threshold") return self.policy_net(observation).max(1)[1].item() else: #print("Below Threshold") return self.env.action_space.sample() ########################### def push(self, state, reward, action, next_state, done): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # self.buffer.append((state, reward, action, next_state, done)) ########################### def replay_buffer(self, batch_size): """ You can add additional arguments as you need. Select batch from buffer. 
""" ########################### # YOUR IMPLEMENTATION HERE # batch = random.sample(self.buffer, batch_size) states = [] rewards = [] actions = [] next_states = [] dones = [] for sample in batch: state, reward, action, next_state, done = sample states.append(state) rewards.append(reward) actions.append(action) next_states.append(next_state) dones.append(done) ########################### return states, rewards, actions, next_states, dones def update(self): if self.steps_done < 5000: return states, rewards, actions, next_states, dones = self.replay_buffer( self.batch_size) loss = self.compute_loss(states, rewards, actions, next_states, dones) self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp(-1, 1) self.optimizer.step() def compute_loss(self, states, rewards, actions, next_states, dones): non_final_mask = [not done for done in dones] states = torch.tensor(states, dtype=torch.float).permute(0, 3, 1, 2).to(self.device) rewards = torch.tensor(rewards, dtype=torch.float).to(self.device) actions = torch.tensor(actions, dtype=torch.long).to(self.device) next_states = torch.tensor(next_states, dtype=torch.float).permute( 0, 3, 1, 2).to(self.device) dones = torch.tensor(dones, dtype=torch.long).to(self.device) Q_current = self.policy_net.forward(states).gather( 1, actions.unsqueeze(1)) Q_current = Q_current.squeeze(1) ## Should do this with no grad next_state_values = torch.zeros(self.batch_size, device=self.device) next_state_values[non_final_mask] = self.target_net( next_states[non_final_mask]).max(1)[0].detach() expected_state_action_values = (next_state_values * self.gamma) + rewards loss = F.smooth_l1_loss(Q_current, expected_state_action_values) del states, rewards, actions, next_states, dones, Q_current, next_state_values, expected_state_action_values return loss def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # for episode in range(self.num_episode): #Check this please observation = self.env.reset() / 255 self.eps_threshold = max( 1 + (((self.eps_end - self.eps_start) / self.max_expisode_decay) * episode), self.eps_end) episode_steps = 0 done = False episode_reward = 0 ## Not sure if this is the right way to do this? while not done: action = self.make_action(observation, test=False) new_observation, reward, done, _ = self.env.step(action) new_observation = new_observation / 255 episode_reward += reward self.steps_done += 1 episode_steps += 1 self.push(observation, reward, action, new_observation, done) ## Updating the network self.update() observation = new_observation if self.steps_done % self.target_update == 0: self.target_net.load_state_dict( self.policy_net.state_dict()) self.episode_reward_list.append(episode_reward) if episode % 100 == 0: print('episode: {} reward: {} episode length: {}'.format( episode, episode_reward, episode_steps)) torch.save(self.policy_net.state_dict(), 'test_model.pt') ########################### print("Done")
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: parameters for neural network initialize Q net and target Q net parameters for replay buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) if torch.cuda.is_available(): self.device = 'cuda' print("Using GPU!!!!") else: 'cpu' print("WARNING") print("WARNING") print("Using CPU") self.state_size = env.get_state()[0].as1xnArray().shape[0] self.action_size = 4 self.memory = deque(maxlen=10000) self.thirty_ep_ep = deque(maxlen=10000) self.thirty_ep_reward = deque(maxlen=10000) # Discount Factor self.gamma = 0.99 # Exploration Rate: at the beginning do 100% exploration self.epsilon = 1.0 # Decay epsilon so we can shift from exploration to exploitation self.epsilon_decay = 0.995 # Set floor for how low epsilon can go self.epsilon_min = 0.01 # Set the learning rate self.learning_rate = 0.00015 # batch_size self.batch_size = 32 self.epsilon_decay_frames = 1.0/500000 self.policy_net = DQN(self.state_size, self.action_size).to(self.device) self.target_net = DQN(self.state_size, self.action_size).to(self.device) self.target_net.load_state_dict(self.policy_net.state_dict()) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) self.loss = 0 self.file_path = 'trained_models_2/./Q_Network_Parameters_' with open('trained_models_2/log2.txt', 'w+') as log: log.write("episode,avg_reward,epsilon\n") if args.test_dqn: # load trained model print('loading trained model') file_number_to_load = 1933 load_file_path = self.file_path+str(file_number_to_load)+'.pth' self.policy_net.load_state_dict(torch.load(load_file_path, map_location=lambda storage, loc: storage)) # for name, param in self.policy_net.named_parameters(): # print(name, '\t\t', param.shape) print('loaded weights') print(self.policy_net.head.weight) def train(self, n_episodes=100000): ep_epsilon = [] accumulated_reward = 0 rewards_30 = [] for i_episode in range(n_episodes): results = self.env.reset() state, reward, done, _ = self.unpack(results) render = os.path.isfile('.makePicture') # Counters for Reward Averages per episode: ep_reward = 0.0 while not done: action = self.make_action(state, False) results = self.env.step({0: action}) next_state, reward, done, _ = self.unpack(results) # print(reward, done) self.push(state, action, reward, next_state, done) state = next_state ep_reward += reward accumulated_reward += reward if i_episode > 1000 and len(self.memory) > self.batch_size: self.learn() if i_episode % 5000 == 0: print('------------ UPDATING TARGET -------------') self.target_net.load_state_dict(self.policy_net.state_dict()) rewards_30.append(ep_reward) # print(rewards_30) if len(rewards_30) > 30: # print("IN HERE") del rewards_30[0] ep_epsilon.append(self.epsilon) # Print average reward for the episode: # print('Episode ', i_episode, 'had a reward of: ', ep_reward) # print('Epsilon: ', self.epsilon) # Logging the average reward over 30 episodes if i_episode % 30 == 0: self.thirty_ep_reward.append(accumulated_reward/30.0) self.thirty_ep_ep.append(i_episode) with open('trained_models_2/log.txt', 'a+') as log: log.write(str(i_episode)+' had a reward of ' + str(accumulated_reward/30.0)+' over 30 ep\n') with open('trained_models_2/log2.txt', 'a+') as log: log.write(str(i_episode) + ',' + str(sum(rewards_30)/30.0) + ',' + str(self.epsilon) + '\n') accumulated_reward = 0.0 # Save network weights after we have started to learn if i_episode > 3000 and i_episode % 1000 == 
0: print('saving... ', i_episode) save_file_path = self.file_path+str(i_episode)+'.pth' torch.save(self.policy_net.state_dict(), save_file_path) fig = plt.figure() plt.plot(ep_epsilon) plt.title('Epsilon decay') plt.xlabel('Episodes') plt.ylabel('Epsilon Value') plt.savefig('trained_models_2/epsilon.png') plt.close() fig = plt.figure() plt.plot(self.thirty_ep_ep, self.thirty_ep_reward) plt.title('Average Reward per 30 Episodes') plt.xlabel('Episodes') plt.ylabel('Average Reward') plt.savefig('trained_models_2/reward.png') plt.close() if i_episode % 200 == 0: print('Episode: ',i_episode ,'Avg reward of last 30 episodes: ', sum(rewards_30)/30.0) def learn(self): sampled_batch = self.replay_buffer(self.batch_size) states, actions, rewards, next_states, dones = list(zip(*sampled_batch)) states = torch.from_numpy(np.stack(states)).to(self.device) actions = torch.from_numpy(np.stack(actions)).to(self.device) rewards = torch.from_numpy(np.stack(rewards)).to(self.device) next_states = torch.from_numpy(np.stack(next_states)).to(self.device) dones = torch.from_numpy(np.stack(dones)).to(self.device) states = states.float() next_states = next_states.float() actions = actions.unsqueeze(1) qfun = self.policy_net(states) state_action_values = qfun.gather(1, actions.long()).squeeze() next_state_values = self.target_net(next_states).max(1).values.detach() TD_error = rewards + self.gamma*next_state_values*(1-dones) self.loss = f.smooth_l1_loss(state_action_values, TD_error) self.optimizer.zero_grad() self.loss.backward() # for param in self.policy_net.parameters(): # param.grad.data.clamp_(-1, 1) self.optimizer.step() def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ observation = torch.tensor(observation, dtype=torch.float32).to(self.device) observation = observation.unsqueeze(0) if not test: if np.random.rand() <= self.epsilon: action = random.randrange(self.action_size) else: with torch.no_grad(): # action = torch.argmax(self.policy_net(observation)).item() action = self.target_net(observation).max(1)[1].view(1, 1).item() # print(action) if self.epsilon > self.epsilon_min: self.epsilon = max(0, self.epsilon - self.epsilon_decay_frames) else: with torch.no_grad(): action = torch.argmax(self.policy_net(observation)).item() return action def push(self, state, action, reward, next_state, done): """ Push new data to buffer and remove the old one if the buffer is full. """ action = np.array(action, dtype=np.uint8) reward = np.array(reward, dtype=np.float32) done = np.array(done, dtype=np.float32) self.memory.append((state, action, reward, next_state, done)) def replay_buffer(self, batch_size): """ Select batch from buffer. """ return random.sample(self.memory, batch_size) def unpack(self, results): result = results[0] state, reward, done, info = result.asTuple() return state.as1xnArray(), reward, done, info def init_game_setting(self): pass
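learn() above fits the TD target with a smooth-L1 (Huber) loss rather than plain MSE; the Huber form is quadratic for small errors and linear for large ones, so a single noisy transition cannot blow up the gradient. A quick comparison with toy numbers:

import torch
import torch.nn.functional as F

pred = torch.tensor([0.0, 0.0, 0.0])
target = torch.tensor([0.5, 2.0, 10.0])

print(F.smooth_l1_loss(pred, target, reduction='none'))
# tensor([0.1250, 1.5000, 9.5000])   quadratic below |error| = 1, linear above
print(F.mse_loss(pred, target, reduction='none'))
# tensor([  0.2500,   4.0000, 100.0000])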
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize every things you need here. For example: building your model """ super(Agent_DQN, self).__init__(env) if args.test_dqn: # you can load your model here print('loading trained model') ################## # YOUR CODE HERE # ################## self.env = env self.batch_size = BATCH_SIZE self.gamma = GAMMA self.eps_start = EPS_START self.eps_decay = EPS_DECAY self.TARGET_UPDATE = TARGET_UPDATE self.policy_net = DQN(self.env.action_space.n) self.target_net = DQN(self.env.action_space.n) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.policy_net.to(device) self.target_net.to(device) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1.5e-4) self.memory = ReplayMemory(10000) if args.test_dqn: # you can load your model here print('loading trained model') self.policy_net.load_state_dict( torch.load(os.path.join('save_dir/' 'model-best.pth'), map_location=torch.device('cpu'))) self.policy_net.eval() def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary """ ################## # YOUR CODE HERE # ################## pass def train(self): """ Implement your training algorithm here """ ################## # YOUR CODE HERE # ################## logfile = open('simple_dqn.log', 'w+') step = 0 num_episodes = 1400000 for i_episode in range(num_episodes): # Initialize the environment and state observation = self.env.reset() observation = observation.transpose((2, 0, 1)) observation = observation[np.newaxis, :] state = observation sum_reward = 0 for t in count(): # Select and perform an action action = self.make_action(state, test=False) next_state, reward, done, _ = self.env.step(action.item()) reward = np.clip(reward, -1., 1.) next_state = next_state.transpose((2, 0, 1)) next_state = next_state[np.newaxis, :] sum_reward += reward reward = Tensor([reward]) step += 1 # Store the transition in memory self.memory.push(torch.from_numpy(state), action, torch.from_numpy(next_state), reward) # Observe new state if not done: state = next_state else: state = None if step >= 5000 and step % 5000 == 0: self.optimize_model() self.target_net.load_state_dict( self.policy_net.state_dict()) # Perform one step of the optimization (on the target network) if done: print( 'resetting env. episode %d \'s step=%d reward total was %d.' % (i_episode + 1, step, sum_reward)) print( 'resetting env. episode %d \'s step=%d reward total was %d.' 
% (i_episode + 1, step, sum_reward), file=logfile) logfile.flush() break # Update the target network # if i_episode % TARGET_UPDATE == 0: # print("Update the target net.") # # print(self.policy_net.state_dict()) # self.target_net.load_state_dict(self.policy_net.state_dict()) if i_episode % 50 == 0: checkpoint_path = os.path.join('save_dir', 'model-best.pth') torch.save(self.policy_net.state_dict(), checkpoint_path) print("model saved to {}".format(checkpoint_path)) def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ################## # YOUR CODE HERE # ################## global steps_done if test: observation = observation.transpose((2, 0, 1)) observation = observation[np.newaxis, :] # self.policy_net.eval() return self.policy_net( Variable(torch.from_numpy(observation), volatile=True).type(FloatTensor)).data.max(1)[1].view( 1, 1).item() else: self.policy_net.eval() sample = random.random() eps_threshold = EPS_END + (EPS_START - EPS_END) * \ math.exp(-1. * steps_done / EPS_DECAY) steps_done += 1 if sample > eps_threshold: return self.policy_net( Variable( torch.from_numpy(observation), volatile=True).type(FloatTensor)).data.max(1)[1].view( 1, 1) else: return LongTensor([[random.randrange(self.env.action_space.n)] ]) def optimize_model(self): if len(self.memory) < BATCH_SIZE: return transitions = self.memory.sample(BATCH_SIZE) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=device, dtype=torch.bool) non_final_next_states = torch.cat( [s for s in batch.next_state if s is not None]).to(device) state_batch = torch.cat(batch.state).to(device) action_batch = torch.cat(batch.action).to(device) reward_batch = torch.cat(batch.reward).to(device) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch.float()).gather( 1, action_batch) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. next_state_values = torch.zeros(BATCH_SIZE, device=device) next_state_values[non_final_mask] = self.target_net( non_final_next_states.float()).max(1)[0].detach() # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step()
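optimize_model() above assumes a `Transition` namedtuple and a `ReplayMemory` with push / sample / __len__, in the style of the official PyTorch DQN tutorial. Their actual definitions are not shown in this snippet; a minimal version that is compatible with how they are called here would be:

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Save a transition, evicting the oldest when full."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)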
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN,self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.state_size = env.observation_space.shape self.action_size = env.action_space.n self.memory = deque(maxlen = 1000000) self.thirty_ep_reward = deque(maxlen = 100000) #print(self.state_size, self.action_size) # Discount Factor self.gamma = 0.99 # Exploration Rate: at the beginning do 100% exploration self.epsilon = 1.0 # Decay epsilon so we can shift from exploration to exploitation self.epsilon_decay = 0.995 # Set floor for how low epsilon can go self.epsilon_min = 0.01 # Set the learning rate self.learning_rate = 0.00015 # batch_size self.batch_size = 32 self.epsilon_decay_frames = 1.0/500000 self.qnetwork = DQN(self.state_size[0], self.state_size[1], self.action_size).to(self.device) print('initial weights:') print(self.qnetwork.head.weight) self.q_prime = DQN(self.state_size[0], self.state_size[1], self.action_size).to(self.device) self.q_prime.load_state_dict(self.qnetwork.state_dict()) self.optimizer = optim.Adam(self.qnetwork.parameters(), lr = self.learning_rate) self.loss = 0 self.file_path = 'trained_models_2/./Q_Network_Parameters_' if args.test_dqn: #you can load your model here print('loading trained model') ########################### # YOUR IMPLEMENTATION HERE # file_number_to_load = 1933 load_file_path = self.file_path+str(file_number_to_load)+'.pth' self.qnetwork.load_state_dict(torch.load(load_file_path, map_location = lambda storage, loc: storage)) #for name, param in self.qnetwork.named_parameters(): # print(name, '\t\t', param.shape) print('loaded weights') print(self.qnetwork.head.weight) def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # self.curr_state = self.env.reset() ########################### pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # observation = observation[np.newaxis,:] observation = torch.tensor(observation, dtype = torch.float32).to(self.device) observation = observation.permute(0 , 3, 1, 2) if not test: if np.random.rand()<=self.epsilon: action = random.randrange(self.action_size) else: action = torch.argmax(self.qnetwork(observation)).item() else: action = torch.argmax(self.qnetwork(observation)).item() ########################### return action def push(self, state, action, reward, next_state, done): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. 
Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # action = np.array(action, dtype = np.uint8) reward = np.array(reward, dtype = np.float32) done = np.array(done, dtype = np.float32) self.memory.append((state, action, reward, next_state, done)) ########################### def replay_buffer(self, batch_size): """ You can add additional arguments as you need. Select batch from buffer. """ ########################### # YOUR IMPLEMENTATION HERE # minibatch = random.sample(self.memory, self.batch_size) ########################### return minibatch def learn(self): minibatch = self.replay_buffer(self.batch_size) states, actions, rewards, next_states, dones = list(zip(*minibatch)) states = torch.from_numpy(np.stack(states)).to(self.device) actions = torch.from_numpy(np.stack(actions)).to(self.device) rewards = torch.from_numpy(np.stack(rewards)).to(self.device) next_states = torch.from_numpy(np.stack(next_states)).to(self.device) dones = torch.from_numpy(np.stack(dones)).to(self.device) states = states.permute(0 , 3, 1, 2).float() next_states = next_states.permute(0, 3, 1, 2).float() actions = actions.unsqueeze(1) qfun = self.qnetwork(states) #print('input...\n',states[1][1].shape) #fig = plt.figure() #plt.imshow(states[0,0,:,:].cpu()) #plt.title('State') #plt.savefig('state.png') #plt.close() state_action_values = qfun.gather(1, actions.long()).squeeze() next_state_values = self.q_prime(next_states).max(1).values.detach() TD_error = rewards + self.gamma*next_state_values*(1-dones) self.loss = F.smooth_l1_loss(state_action_values, TD_error) self.optimizer.zero_grad() self.loss.backward() if self.epsilon > self.epsilon_min: self.epsilon = max(0, self.epsilon - self.epsilon_decay_frames) for param in self.qnetwork.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() #print(torch.sum(self.qnetwork.conv1.weight.data)) def train(self, n_episodes = 100000): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # # Initializing counters and lists for average reward over 30 episodes: ep_counter = 0.0 time_steps = 0.0 thirty_reward = 0.0 ep_epsilon = [] thirty_ep_reward = [] thirty_ep_ep = [] naming_counter = 0 log = open('trained_models_2/log.txt', 'w+') log.write('Beginning of Log\n') log.close() frames = 0.0 for e in range(n_episodes): running_loss = 0.0 ep_counter += 1 state = self.env.reset() done = False render = os.path.isfile('.makePicture') # Counters for Reward Averages per episode: ep_reward = 0.0 counter = 0.0 while not done: frames += 1 counter += 1 time_steps += 1 if render: self.env.env.render() action = self.make_action(state, False) next_state, reward, done, _ = self.env.step(action) reward = np.clip(reward, -1, 1) self.push(state, action, reward, next_state, done) state = next_state #if done: # reward = -1 if frames > 500000: if len(self.memory) > self.batch_size: self.learn() if frames%5000 == 0: print('------------ UPDATING TARGET -------------') self.q_prime.load_state_dict(self.qnetwork.state_dict()) running_loss+= self.loss ep_reward+=reward thirty_reward += reward ep_epsilon.append(self.epsilon) # Print average reward for the episode: print('Episode ', e, 'had a reward of: ', ep_reward) print('Epsilon: ', self.epsilon) # Loging the average reward over 30 episodes if ep_counter%30 == 0: print('Frame: ', frames) thirty_ep_reward.append(thirty_reward/30) thirty_ep_ep.append(e) print('The Avereage Reward over 30 Episodes: ', thirty_reward/30.0) with 
open('trained_models_2/log.txt', 'a+') as log: log.write(str(naming_counter)+' had a reward of '+ str(thirty_reward/30.0)+' over 30 ep\n') time_steps = 0.0 thirty_reward = 0.0 # Save network weights after we have started to learn if e > 3000: print('saving... ', naming_counter) save_file_path = self.file_path+str(naming_counter)+'.pth' torch.save(self.qnetwork.state_dict(), save_file_path) naming_counter += 1 fig = plt.figure() plt.plot(ep_epsilon) plt.title('Epsilon decay') plt.xlabel('Episodes') plt.ylabel('Epsilon Value') plt.savefig('trained_models_2/epsilon.png') plt.close() fig = plt.figure() plt.plot(thirty_ep_ep, thirty_ep_reward) plt.title('Average Reward per 30 Episodes') plt.xlabel('Episodes') plt.ylabel('Average Reward') plt.savefig('trained_models_2/reward.png') plt.close()
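The logging in train() above rebuilds its 30-episode reward average from manual counters that must be reset by hand. A small helper built on a bounded deque does the same bookkeeping with less state; this is only an illustrative alternative, not code from the agent.

from collections import deque

class RollingReward:
    def __init__(self, window=30):
        self.rewards = deque(maxlen=window)

    def add(self, episode_reward):
        self.rewards.append(episode_reward)

    def mean(self):
        return sum(self.rewards) / len(self.rewards) if self.rewards else 0.0

tracker = RollingReward()
for r in (0.0, 1.0, 3.0, 2.0):
    tracker.add(r)
print(tracker.mean())   # 1.5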
class DQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, alpha, gamma, tau): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.alpha = alpha self.gamma = gamma self.tau = tau # Q Learning Network self.qnetwork_local = DQN(state_size, action_size, seed).to(device) self.qnetwork_target = DQN(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.alpha) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.fill_replay_buffer(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if self.memory.__len__() > BATCH_SIZE: experiences = self.memory.get_sample_replay_buffer() self.learn_DDQN(experiences, self.gamma, self.alpha, self.tau) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn_DDQN(self, experiences, gamma, alpha, tau): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get index of maximum value for next state from Q_expected Q_argmax = self.qnetwork_local(next_states).detach() _, a_prime = Q_argmax.max(1) #print (self.qnetwork_local(states).detach()) # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().gather( 1, a_prime.unsqueeze(1)) #print (Q_targets_next.shape) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) #print (Q_targets.shape) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) #print (Q_expected.shape) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
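soft_update() above blends the two parameter sets with θ_target ← τ·θ_local + (1 − τ)·θ_target. The toy check below applies one such update to two small linear layers so the blending can be verified by eye; τ = 0.1 is an arbitrary value chosen for the demonstration.

import torch
import torch.nn as nn

tau = 0.1
local = nn.Linear(2, 2, bias=False)
target = nn.Linear(2, 2, bias=False)
with torch.no_grad():
    local.weight.fill_(1.0)
    target.weight.fill_(0.0)

for t_param, l_param in zip(target.parameters(), local.parameters()):
    t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

print(target.weight)   # every entry is 0.1 after a single soft update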
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy """ super(Agent_DQN, self).__init__(env) ########################### # initializations for replay memory self.env = env self.buffer = collections.deque( maxlen=REPLAY_SIZE) # initializing a replay memory buffer #initializations of agent self._reset() self.last_action = 0 self.net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE) self.target_net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE) LOAD_MODEL = True if args.test_dqn: #you can load your model here print('preparing to load trained model') ########################### LOAD_MODEL = True if LOAD_MODEL: self.net.load_state_dict( torch.load(MODEL, map_location=lambda storage, loc: storage)) print('loaded trained model') self.target_net.load_state_dict(self.net.state_dict()) def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### ########################### pass def push(self, experience): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. """ ########################### self.buffer.append(experience) ########################### def replay_buffer(self, batch_size): """ You can add additional arguments as you need. Select batch from buffer. sample a batch of 32 from the experience collected """ ########################### indices = np.random.choice(len(self.buffer), batch_size, replace=False) states, actions, rewards, dones, next_states = zip( *[self.buffer[idx] for idx in indices]) ########################### # The 'states' below are already in the transposed form because they are sampled from experience return np.array(states, dtype=np.float32), np.array(actions), np.array( rewards, dtype=np.float32), np.array(dones, dtype=np.bool), np.array(next_states) def _reset(self): self.state = self.env.reset() self.total_reward = 0.0 def make_action(self, observation, test=True): """ this is exclusively for testing our actions select action """ state_a_test = np.array([observation.transpose(2, 0, 1)], copy=False) #torch.tensor opperation appends a '1' at the start of the numpy array state_v_test = torch.tensor(state_a_test).to('cpu') #feeding observation to the network Q_values_v_test = self.net.forward(state_v_test) # picking the action with maximum probability #picking the best action _, action_v_test = torch.max(Q_values_v_test, dim=1) #coverting tensor to int action_test = int(action_v_test.item()) ########################### return action_test def make_action_train(self, net, epsilon=0.0, device=DEVICE): """ select action using epsilon greedy method for training purposes """ if np.random.random() < self.epsilon: action = random.randrange(self.env.action_space.n) else: state_a = np.array([self.state.transpose(2, 0, 1)], copy=False) #torch.tensor opperation appends a '1' at the start of the numpy array # and makes it a tensor to be fed to the net state_v = Variable(torch.FloatTensor(state_a).to(device)) #Q_values_v = self.net(state_v) Q_values_v = self.net.forward(state_v) #picking the best action _, action_v = torch.max(Q_values_v, dim=1) #coverting tensor to int action = int(action_v.item()) ########################### 
return action def take_a_step(self, net, epsilon=0.0, device=DEVICE): """ execute action and take a step in the environment add the state,action,rewards to the experience replay return the total_reward """ done_reward = None action_for_exp = self.make_action_train(self.net, self.epsilon, DEVICE) new_state, reward, is_done, _ = self.env.step(action_for_exp) #Here total reward is the reward for each episode self.total_reward += reward new_state = new_state #remember that the state that comes in from taking a step in our environment # will be in the form of width X height X depth # But whatever state goes into experience will be in the form of depth X height X width # i.e the experience buffer will have state in the transposed format # because this is the format that pytorch input should look like exp = Experience(self.state.transpose(2, 0, 1), action_for_exp, reward, is_done, new_state.transpose(2, 0, 1)) #adding experiences in our replay memory self.push(exp) self.state = new_state if is_done: done_reward = self.total_reward self._reset() return done_reward def loss_function(self, batch, net, target_net, optimizer, device=DEVICE): states, actions, rewards, dones, next_states = batch states_v = Variable(torch.FloatTensor(states).to(device)) next_states_v = Variable(torch.FloatTensor(next_states).to(device)) actions_v = Variable(torch.LongTensor(actions).to(device)) rewards_v = Variable(torch.FloatTensor(rewards).to(device)) done = Variable(torch.FloatTensor(dones).to(device)) #Q_vals state_action_values = self.net(states_v).gather( 1, actions_v.long().unsqueeze(-1)).squeeze(-1) #next_Q_vals next_state_values = self.target_net(next_states_v).max(1)[0] #next_state_values[done] = 0.0 #next_state_values = next_state_values.detach() expected_state_action_values = rewards_v + next_state_values * GAMMA * ( 1 - done) loss = (state_action_values - Variable(expected_state_action_values)).pow(2).mean() # we dont wanna accumilate our gradients # hence it is importent to make them zero at every iteration optimizer.zero_grad() loss.backward() optimizer.step() return loss def train(self): """ Implement your training algorithm here """ ########################### device = torch.device(DEVICE) #defining the optimizer for your neural network optimizer = optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE) #empty list of total rewards total_rewards = [] best_mean_reward = None # initializations for time and speed calculation frame_idx = 0 timestep_frame = 0 timestep = time.time() while True: frame_idx += 1 self.epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * math.exp( -1. * frame_idx / EPSILON_DECAY) reward = self.take_a_step(self.net, self.epsilon, device=device) if reward is not None: #appending rewards in an empty list of total_rewards total_rewards.append(reward) # not asked to calculate speed speed = (frame_idx - timestep_frame) / (time.time() - timestep) timestep_frame = frame_idx timestep = time.time() #calculating mean of last(recent) 1000 rewards mean_reward = np.mean(total_rewards[-100:]) print( "{} frames: done {} games, mean reward {}, epsilon {}, speed {} frames/s" .format(frame_idx, len(total_rewards), round(mean_reward, 3), round(self.epsilon, 2), round(speed, 2))) if best_mean_reward is None or best_mean_reward < mean_reward or len( total_rewards) % 25 == 0: if best_mean_reward is not None: print("New best mean reward {} -> {}, model saved". 
format(round(best_mean_reward, 3), round(mean_reward, 3))) if frame_idx % SAVE_INTERVAL == 0: torch.save(self.net.state_dict(), 'breakoutNoFrameSkip-4v1' + '.dat') #checking the replay memory if len(self.buffer) < LEARNING_STARTS: continue #check if we need to update our target network if frame_idx % TARGET_UPDATE_INTERVAL == 0: self.target_net.load_state_dict(self.net.state_dict()) # sampling a batch from buffer batch = self.replay_buffer(BATCH_SIZE) #calculate loss and backpropagate loss_t = self.loss_function(batch, self.net, self.target_net, optimizer, device) #printing loss at every 100 episodes if len(total_rewards) % 100 == 0: print("loss at episode " + str(len(total_rewards)) + " is " + str(float(loss_t.item()))) with open('rewards_collection-100mean.csv', mode='w') as dataFile: writer = csv.writer(dataFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow(total_rewards) self.env.close()
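The agent above depends on an Experience record and several module-level constants (DEVICE, MODEL, REPLAY_SIZE, LEARNING_STARTS, TARGET_UPDATE_INTERVAL, SAVE_INTERVAL, BATCH_SIZE, GAMMA, LEARNING_RATE, EPSILON_START/END/DECAY) that are defined outside this excerpt. A minimal sketch of plausible definitions follows; every concrete value is an assumption rather than the original configuration, and the field order of Experience is inferred from the push and replay_buffer code above.

import collections
import torch

# Assumed definitions -- the real module imported by the Agent_DQN above is not shown.
Experience = collections.namedtuple(
    'Experience', ['state', 'action', 'reward', 'done', 'new_state'])

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL = 'breakoutNoFrameSkip-4v1.dat'   # checkpoint path loaded in __init__ (name guessed from the save call)
REPLAY_SIZE = 100_000                   # capacity of the deque replay buffer
LEARNING_STARTS = 10_000                # minimum buffer length before optimization begins
TARGET_UPDATE_INTERVAL = 1_000          # frames between target-network syncs
SAVE_INTERVAL = 10_000                  # frames between checkpoint saves
BATCH_SIZE = 32
GAMMA = 0.99
LEARNING_RATE = 1e-4
EPSILON_START, EPSILON_END, EPSILON_DECAY = 1.0, 0.02, 100_000

Separately, note that loss_function above leaves next_state_values attached to the computation graph (the .detach() call is commented out); detaching the bootstrap target is the conventional choice, since only self.net should receive gradients.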
def ddqn_train(env, scheduler, optimizer_constructor, model_type, batch_size, rp_start, rp_size, exp_frame, exp_initial, exp_final, gamma, target_update_steps, frames_per_epoch, frames_per_state, output_directory, last_checkpoint, envo): """ Implementation of the training algorithm for DDQN. """ gym.undo_logger_setup() logging.basicConfig(filename=envo + '_' + model_type + 'ddqn_training.log', level=logging.INFO) num_actions = env.action_space.n env.reset() print('No. of actions: ', num_actions) print(env.unwrapped.get_action_meanings()) # initialize action value and target network with the same weights model = DQN(num_actions, use_bn=False) target = DQN(num_actions, use_bn=False) if use_cuda: model.cuda() target.cuda() exp_replay = None episodes_count = 1 if last_checkpoint: model.load_state_dict(torch.load(last_checkpoint)) print(last_checkpoint) print('weights loaded...') exp_replay = initialize_replay_resume(env, rp_start, rp_size, frames_per_state, model) episodes_count = get_index_from_checkpoint_path(last_checkpoint) else: exp_replay = initialize_replay(env, rp_start, rp_size, frames_per_state) target.load_state_dict(model.state_dict()) # scheduler = Scheduler(exp_frame, exp_initial, exp_final) optimizer = optimizer_constructor.type( model.parameters(), lr=optimizer_constructor.kwargs['lr'], alpha=optimizer_constructor.kwargs['alpha'], eps=optimizer_constructor.kwargs['eps']) frames_count = 1 frames_per_episode = 1 epsiodes_durations = [] rewards_per_episode = 0 rewards_duration = [] loss_per_epoch = [] current_state, _, _, _ = play_game(env, frames_per_state) print('Starting training...') count = 0 while True: epsilon = scheduler.anneal_linear(frames_count) choice = random.uniform(0, 1) # epsilon greedy algorithm if choice <= epsilon: action = LongTensor([[random.randrange(num_actions)]]) else: action = get_greedy_action(model, current_state) curr_obs, reward, done, _ = play_game(env, frames_per_state, action[0][0]) rewards_per_episode += reward reward = Tensor([reward]) exp_replay.push(current_state, action, reward, curr_obs) current_state = curr_obs #sample random mini-batch obs_sample = exp_replay.sample(batch_size) batch = Experience( *zip(*obs_sample) ) #unpack the batch into states, actions, rewards and next_states #compute y if len(exp_replay) >= batch_size: loss = ddqn_compute_y(batch, batch_size, model, target, gamma) optimizer.zero_grad() loss.backward() for param in model.parameters(): param.grad.data.clamp_(-1, 1) optimizer.step() loss_per_epoch.append(loss.data.cpu().numpy()) frames_count += 1 frames_per_episode += frames_per_state if done: rewards_duration.append(rewards_per_episode) rewards_per_episode = 0 frames_per_episode = 1 episodes_count += 1 env.reset() current_state, _, _, _ = play_game(env, frames_per_state) if episodes_count % 100 == 0: avg_episode_reward = sum(rewards_duration) / 100.0 avg_reward_content = 'Episode from', episodes_count - 99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum( loss_per_epoch) print(avg_reward_content) logging.info(avg_reward_content) rewards_duration = [] loss_per_epoch = [] # update weights of target network for every TARGET_UPDATE_FREQ steps if frames_count % target_update_steps == 0: target.load_state_dict(model.state_dict()) # print('weights updated at frame no. 
', frames_count) #Save weights every 250k frames if frames_count % 250000 == 0: util.make_sure_path_exists(output_directory + '/' + envo + '/') torch.save( model.state_dict(), output_directory + envo + '_' + model_type + '/weights_' + str(frames_count) + '.pth') #Print frame count for every 1000000 (one million) frames: if frames_count % 1000000 == 0: training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon print(training_update) logging.info(training_update)
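ddqn_compute_y is called in the training loop above but not defined in this excerpt (the rank-based variant later in this section calls a keyword-argument version of it with per-sample batches). Below is a minimal sketch of a Double-DQN target computation matching the positional call here; the batch field names and the Huber loss are assumptions, not the original helper:

import torch
import torch.nn.functional as F

def ddqn_compute_y(batch, batch_size, model, target, gamma):
    """Sketch of a Double-DQN loss: the online net picks the argmax action,
    the target net evaluates it. `batch` is assumed to be the Experience
    namedtuple of per-sample tensors built by Experience(*zip(*obs_sample)).
    Terminal masking is omitted because the replay push above does not store
    a done flag. `batch_size` is kept only to mirror the call site."""
    states = torch.cat(batch.state)            # (B, C, H, W)
    actions = torch.cat(batch.action)          # (B, 1), long
    rewards = torch.cat(batch.reward)          # (B,)
    next_states = torch.cat(batch.next_state)  # (B, C, H, W)

    q_sa = model(states).gather(1, actions).squeeze(1)
    with torch.no_grad():
        best_next = model(next_states).argmax(dim=1, keepdim=True)
        next_q = target(next_states).gather(1, best_next).squeeze(1)
        y = rewards + gamma * next_q
    return F.smooth_l1_loss(q_sa, y)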
class Agent_DQN(): def __init__(self, env, test=False): self.cuda = torch.device('cuda') print("Using device: " + torch.cuda.get_device_name(self.cuda), flush=True) self.env = env self.state_shape = env.observation_space.shape self.n_actions = env.action_space.n self.memory = deque(maxlen=100000) self.batch_size = 32 self.mem_threshold = 50000 self.gamma = 0.99 self.learning_rate = 1e-4 self.epsilon = 1.0 self.epsilon_min = 0.05 self.epsilon_period = 10000 self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.epsilon_period self.update_rate = 4 self.start_epoch = 1 self.epochs = 10 self.epoch = 10000 self.model = DQN(self.state_shape, self.n_actions).to(self.cuda) print("DQN parameters: {}".format(count_parameters(self.model))) self.target = DQN(self.state_shape, self.n_actions).to(self.cuda) self.target.eval() self.target_update = 10000 self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate) if test: self.model.load_state_dict(torch.load('model.pt')) def init_game_setting(self): pass def make_action(self, observation, test=False): epsilon = 0.01 if test else self.epsilon # turn action into tensor observation = torch.tensor(observation, device=self.cuda, dtype=torch.float) # turn off learning self.model.eval() # epsilon greedy policy if random.random() > epsilon: # no need to calculate gradient with torch.no_grad(): # choose highest value action b = self.model(observation) b = b.cpu().data.numpy() action = np.random.choice( np.flatnonzero(np.isclose(b, b.max()))) else: # random action action = random.choice(np.arange(self.n_actions)) # turn learning back on self.model.train() return action def replay_buffer(self): # Return tuple of sars transitions states, actions, rewards, next_states, dones = zip( *random.sample(self.memory, self.batch_size)) states = torch.tensor(np.vstack(states), device=self.cuda, dtype=torch.float) actions = torch.tensor(np.array(actions), device=self.cuda, dtype=torch.long) rewards = torch.tensor(np.array(rewards, dtype=np.float32), device=self.cuda, dtype=torch.float) next_states = torch.tensor(np.vstack(next_states), device=self.cuda, dtype=torch.float) dones = torch.tensor(np.array(dones, dtype=np.float32), device=self.cuda, dtype=torch.float) return states, actions, rewards, next_states, dones def experience_replay(self, n=0): # clamp gradient clamp = False # Reset gradient (because it accumulates by default) self.optimizer.zero_grad() # sample experience memory states, actions, rewards, next_states, dones = self.replay_buffer() # get Q(s,a) for sample Q = self.model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1) # get max_a' Q(s',a') Q_prime = self.target(next_states).detach().max(1)[0] # calculate y = r + gamma * max_a' Q(s',a') for non-terminal states Y = rewards + (self.gamma * Q_prime) * (1 - dones) # Huber loss of Q and Y loss = F.smooth_l1_loss(Q, Y) # Compute dloss/dx loss.backward() # Clamp gradient if clamp: for param in self.model.parameters(): param.grad.data.clamp_(-1, 1) # Change the weights self.optimizer.step() def train(self): step = 0 learn_step = 0 print("Begin Training:", flush=True) learn_curve = [] last30 = deque(maxlen=30) for epoch in range(self.start_epoch, self.epochs + 1): durations = [] rewards = [] flag = [] # progress bar epoch_bar = tqdm(range(self.epoch), total=self.epoch, ncols=200) for episode in epoch_bar: # reset state state = self.env.reset() # decay epsilon if self.epsilon > self.epsilon_min: self.epsilon -= self.epsilon_decay # run one episode done = False ep_duration = 0 ep_reward = 0 
while not done: step += 1 ep_duration += 1 # get epsilon-greedy action action = self.make_action(state) # do action next_state, reward, done, info = self.env.step(action) ep_reward += reward # add transition to replay memory self.memory.append( Transition(state, action, reward, next_state, done)) state = next_state # learn from experience, if available if step % self.update_rate == 0 and len( self.memory) > self.mem_threshold: self.experience_replay(learn_step) learn_step += 1 # update target network if step % self.target_update == 1: self.target.load_state_dict(self.model.state_dict()) durations.append(ep_duration) rewards.append(ep_reward) last30.append(ep_reward) learn_curve.append(np.mean(last30)) flag.append(info['flag_get']) epoch_bar.set_description( "epoch {}/{}, avg duration = {:.2f}, avg reward = {:.2f}, last30 = {:2f}" .format(epoch, self.epochs, np.mean(durations), np.mean(rewards), learn_curve[-1])) # save model every epoch plt.clf() plt.plot(learn_curve) plt.title(f"DQN Epoch {epoch} with {save_prefix} Reward") plt.xlabel('Episodes') plt.ylabel('Moving Average Reward') if not os.path.exists(f"{save_prefix}_DQN"): os.mkdir(f"{save_prefix}_DQN") torch.save(self.model.state_dict(), f'{save_prefix}_DQN/DQN_model_ep{epoch}.pt') pickle.dump( rewards, open(f"{save_prefix}_DQN/DQN_reward_ep{epoch}.pkl", 'wb')) pickle.dump(flag, open(f"{save_prefix}_DQN/flag_ep{epoch}.pkl", 'wb')) plt.savefig(f"{save_prefix}_DQN/epoch{epoch}.png") learn_curve = []
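The Transition record, the count_parameters helper, and the save_prefix string used by the agent above are not shown in this excerpt. A minimal sketch under the assumption that they look as follows (the save_prefix value is purely illustrative):

from collections import namedtuple

# Field order must match both the push site in train() and the unpacking in replay_buffer().
Transition = namedtuple('Transition',
                        ['state', 'action', 'reward', 'next_state', 'done'])

def count_parameters(model):
    """Number of trainable parameters, as printed in __init__ above."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

save_prefix = "mario"  # assumed tag used in the plot titles and checkpoint paths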
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # self.env = env self.args = args self.gamma = self.args.gamma self.batch_size = self.args.batch_size self.memory_cap = self.args.memory_cap self.n_episode = self.args.n_episode self.lr = self.args.learning_rate self.epsilon = self.args.epsilon self.epsilon_decay_window = self.args.epsilon_decay_window self.epsilon_min = self.args.epsilon_min self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.epsilon_decay_window self.n_step = self.args.n_step self.f_update = self.args.f_update self.load_model = self.args.load_model self.action_size = self.args.action_size # self.algorithm = self.args.algorithm self.use_cuda = torch.cuda.is_available() self.device = torch.device("cuda" if self.use_cuda else "cpu") print('using device ', torch.cuda.get_device_name(0)) self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor self.LongTensor = torch.cuda.LongTensor if self.use_cuda else torch.LongTensor self.ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor self.Tensor = self.FloatTensor # Create the policy net and the target net self.policy_net = DQN() self.policy_net.to(self.device) # if self.algorithm == 'DDQN': # self.policy_net_2 = DQN() # self.policy_net_2.to(self.device) self.target_net = DQN() self.target_net.to(self.device) self.policy_net.train() self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=self.lr) # buffer self.memory = [] ## self.mean_window = 100 self.print_frequency = 100 self.out_dir = "DQN_Module_b1_1/" if args.test_dqn: #you can load your model here print('loading trained model') ########################### # YOUR IMPLEMENTATION HERE # self.policy_net.load_state_dict( torch.load('model.pth', map_location=self.device)) self.target_net.load_state_dict(self.policy_net.state_dict()) if self.algorithm == 'DDQN': self.policy_net_2.load_state_dict( torch.load('model.pth', map_location=self.device)) self.print_test = True def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # ########################### pass def make_action(self, observation, test=False): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # if test: self.epsilon = self.epsilon_min * 0.5 observation = observation / 255. 
else: self.epsilon = max(self.epsilon - self.epsilon_decay, self.epsilon_min) if random.random() > self.epsilon: observation = self.Tensor(observation.reshape( (1, 84, 84, 4))).transpose(1, 3).transpose(2, 3) state_action_value = self.policy_net( observation).data.cpu().numpy() action = np.argmax(state_action_value) else: action = random.randint(0, self.action_size - 1) ########################### return action def push(self, state, action, reward, next_state, done): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # if len(self.memory) >= self.memory_cap: self.memory.pop(0) self.memory.append((state, action, reward, next_state, done)) ########################### def replay_buffer(self): """ You can add additional arguments as you need. Select batch from buffer. """ ########################### # YOUR IMPLEMENTATION HERE # self.mini_batch = random.sample(self.memory, self.batch_size) ########################### return def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # self.steps_done = 0 self.steps = [] self.rewards = [] self.mean_rewards = [] self.time = [] self.best_reward = 0 self.last_saved_reward = 0 self.start_time = time.time() print('train') # continue training from where it stopped if self.load_model: self.policy_net.load_state_dict( torch.load(self.out_dir + 'model.pth', map_location=self.device)) self.target_net.load_state_dict(self.policy_net.state_dict()) self.epsilon = self.epsilon_min print('Loaded') for episode in range(self.n_episode): # Initialize the environment and state state = self.env.reset() / 255. # self.last_life = 5 total_reward = 0 self.step = 0 done = False while (not done) and self.step < 10000: # move to next state self.step += 1 self.steps_done += 1 action = self.make_action(state) next_state, reward, done, life = self.env.step(action) # lives matter # self.now_life = life['ale.lives'] # dead = self.now_life < self.last_life # self.last_life = self.now_life next_state = next_state / 255. 
# Store the transition in memory self.push(state, action, reward, next_state, done) state = next_state total_reward += reward if done: self.rewards.append(total_reward) self.mean_reward = np.mean( self.rewards[-self.mean_window:]) self.mean_rewards.append(self.mean_reward) self.time.append(time.time() - self.start_time) self.steps.append(self.step) # print the process to terminal progress = "episode: " + str( episode) + ",\t epsilon: " + str( self.epsilon ) + ",\t Current mean reward: " + "{:.2f}".format( self.mean_reward) progress += ',\t Best mean reward: ' + "{:.2f}".format( self.best_reward) + ",\t time: " + time.strftime( '%H:%M:%S', time.gmtime(self.time[-1])) print(progress) if episode % self.print_frequency == 0: self.print_and_plot() # save the best model if self.mean_reward > self.best_reward and len( self.memory) >= 5000: print('~~~~~~~~~~<Model updated with best reward = ', self.mean_reward, '>~~~~~~~~~~') checkpoint_path = self.out_dir + 'model.pth' torch.save(self.policy_net.state_dict(), checkpoint_path) self.last_saved_reward = self.mean_reward self.best_reward = self.mean_reward if len(self.memory) >= 5000 and self.steps_done % 4 == 0: # if self.algorithm == 'DQN': self.optimize_DQN() if self.steps_done % self.f_update == 0: self.target_net.load_state_dict( self.policy_net.state_dict()) # print('-------<target net updated at step,',self.steps_done,'>-------') ########################### def optimize_DQN(self): # sample self.replay_buffer() state, action, reward, next_state, done = zip(*self.mini_batch) # transfer 1*84*84*4 to 1*4*84*84, which is 0,3,1,2 state = self.Tensor(np.float32(state)).permute(0, 3, 1, 2).to(self.device) action = self.LongTensor(action).to(self.device) reward = self.Tensor(reward).to(self.device) next_state = self.Tensor(np.float32(next_state)).permute( 0, 3, 1, 2).to(self.device) done = self.Tensor(done).to(self.device) # Compute Q(s_t, a) state_action_values = self.policy_net(state).gather( 1, action.unsqueeze(1)).squeeze(1) # Compute next Q, including the mask next_state_values = self.target_net(next_state).detach().max(1)[0] # Compute the expected Q value. stop update if done expected_state_action_values = reward + (next_state_values * self.gamma) * (1 - done) # Compute Huber loss self.loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.data) # Optimize the model self.optimizer.zero_grad() self.loss.backward() self.optimizer.step() return def print_and_plot(self): fig1 = plt.figure(1) plt.clf() plt.title('Training...') plt.xlabel('Episode') plt.ylabel('Steps') plt.plot(self.steps) fig1.savefig(self.out_dir + 'steps.png') fig2 = plt.figure(2) plt.clf() plt.title('Training...') plt.xlabel('Episode') plt.ylabel('Reward') plt.plot(self.mean_rewards) fig2.savefig(self.out_dir + 'rewards.png') fig2 = plt.figure(3) plt.clf() plt.title('Training...') plt.xlabel('Episode') plt.ylabel('Time') plt.plot(self.time) fig2.savefig(self.out_dir + 'time.png')
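make_action above reshapes an (84, 84, 4) frame stack with two transpose calls, while optimize_DQN uses permute(0, 3, 1, 2); both produce the same NCHW layout. A small self-contained check (shapes are illustrative):

import torch

x = torch.randn(1, 84, 84, 4)                       # N, H, W, C as returned by the env wrapper
via_transpose = x.transpose(1, 3).transpose(2, 3)   # as in make_action
via_permute = x.permute(0, 3, 1, 2)                 # as in optimize_DQN
assert via_permute.shape == (1, 4, 84, 84)
assert torch.equal(via_transpose, via_permute)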
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # # Declare variables self.exp_id = uuid.uuid4().__str__().replace('-', '_') self.args = args self.env = env self.eps_threshold = None self.nA = env.action_space.n self.action_list = np.arange(self.nA) self.reward_list = deque( maxlen=args.window) # np.zeros(args.window, np.float32) self.max_q_list = deque( maxlen=args.window) # np.zeros(args.window, np.float32) self.loss_list = deque( maxlen=args.window) # np.zeros(args.window, np.float32) self.probability_list = np.zeros(env.action_space.n, np.float32) self.cur_eps = self.args.eps self.t = 0 self.ep_len = 0 self.mode = None if self.args.use_pri_buffer: self.replay_buffer = NaivePrioritizedBuffer( capacity=self.args.capacity, args=self.args) else: self.replay_buffer = ReplayBuffer(capacity=self.args.capacity, args=self.args) self.position = 0 self.args.save_dir += f'/{self.exp_id}/' os.system(f"mkdir -p {self.args.save_dir}") self.meta = MetaData(fp=open( os.path.join(self.args.save_dir, 'result.csv'), 'w'), args=self.args) self.eps_delta = (self.args.eps - self.args.eps_min) / self.args.eps_decay_window self.beta_by_frame = lambda frame_idx: min( 1.0, args.pri_beta_start + frame_idx * (1.0 - args.pri_beta_start) / args.pri_beta_decay) # Create Policy and Target Networks if self.args.use_dueling: print("Using dueling dqn . . .") self.policy_net = DuelingDQN(env, self.args).to(self.args.device) self.target_net = DuelingDQN(env, self.args).to(self.args.device) elif self.args.use_crnn: print("Using dueling crnn . . .") self.policy_net = CrnnDQN(env).to(self.args.device) self.target_net = CrnnDQN(env).to(self.args.device) else: self.policy_net = DQN(env, self.args).to(self.args.device) self.target_net = DQN(env, self.args).to(self.args.device) self.target_net.load_state_dict(self.policy_net.state_dict()) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.args.lr, eps=self.args.optimizer_eps) if self.args.lr_scheduler: print("Enabling LR Decay . . .") self.scheduler = optim.lr_scheduler.ExponentialLR( optimizer=self.optimizer, gamma=self.args.lr_decay) self.cur_lr = self.optimizer.param_groups[0]['lr'] # Compute Huber loss self.loss = F.smooth_l1_loss # todo: Support for Multiprocessing. Bug in pytorch - https://github.com/pytorch/examples/issues/370 self.policy_net.share_memory() self.target_net.share_memory() # Set defaults for networks self.policy_net.train() self.target_net.eval() self.target_net.load_state_dict(self.policy_net.state_dict()) if args.test_dqn: # you can load your model here ########################### # YOUR IMPLEMENTATION HERE # print('loading trained model') self.load_model() if args.use_pri_buffer: print('Using priority buffer . . .') if args.use_double_dqn: print('Using double dqn . . .') if args.use_bnorm: print("Using batch normalization . . 
.") print("Arguments: \n", json.dumps(vars(self.args), indent=2), '\n') def init_game_setting(self): pass def make_action(self, observation, test=True): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # with torch.no_grad(): if self.args.test_dqn: q, argq = self.policy_net( Variable( self.channel_first(observation))).data.cpu().max(1) return self.action_list[argq] # Fill up probability list equal for all actions self.probability_list.fill(self.cur_eps / self.nA) # Fetch q from the model prediction q, argq = self.policy_net(Variable( self.channel_first(observation))).data.cpu().max(1) # Increase the probability for the selected best action self.probability_list[argq[0].item()] += 1 - self.cur_eps # Use random choice to decide between a random action / best action action = torch.tensor( [np.random.choice(self.action_list, p=self.probability_list)]) ########################### return action.item(), q.item() def optimize_model(self): """ Function to perform optimization on DL Network :return: Loss """ # Return if initial buffer is not filled. if len(self.replay_buffer.memory) < self.args.mem_init_size: return 0 if self.args.use_pri_buffer: batch_state, batch_action, batch_next_state, batch_reward, batch_done, indices, weights = self.replay_buffer.sample( self.args.batch_size, beta=self.beta_by_frame(self.t)) else: batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.replay_buffer.sample( self.args.batch_size) batch_state = Variable( self.channel_first( torch.tensor(np.array(batch_state), dtype=torch.float32))) batch_action = Variable( torch.tensor(np.array(batch_action), dtype=torch.long)) batch_next_state = Variable( self.channel_first( torch.tensor(np.array(batch_next_state), dtype=torch.float32))) batch_reward = Variable( torch.tensor(np.array(batch_reward), dtype=torch.float32)) batch_done = Variable( torch.tensor(np.array(batch_done), dtype=torch.float32)) policy_max_q = self.policy_net(batch_state).gather( 1, batch_action.unsqueeze(1)).squeeze(1) if self.args.use_double_dqn: policy_ns_max_q = self.policy_net(batch_next_state) next_q_value = self.target_net(batch_next_state).gather( 1, torch.max(policy_ns_max_q, 1)[1].unsqueeze(1)).squeeze(1) target_max_q = next_q_value * self.args.gamma * (1 - batch_done) else: target_max_q = self.target_net(batch_next_state).detach().max( 1)[0].squeeze(0) * self.args.gamma * (1 - batch_done) # Compute Huber loss if self.args.use_pri_buffer: loss = (policy_max_q - (batch_reward + target_max_q.detach())).pow(2) * Variable( torch.tensor(weights, dtype=torch.float32)) prios = loss + 1e-5 loss = loss.mean() else: loss = self.loss(policy_max_q, batch_reward + target_max_q) # Optimize the model self.optimizer.zero_grad() loss.backward() # Clip gradients between -1 and 1 for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) if self.args.use_pri_buffer: self.replay_buffer.update_priorities(indices, prios.data.cpu().numpy()) self.optimizer.step() return loss.cpu().detach().numpy() def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # def train_fn(): self.t = 1 self.mode = "Random" train_start = time.time() if not self.args.load_dir == '': self.load_model() for i_episode in range(1, self.args.max_episodes + 1): # Initialize the environment and state start_time = 
time.time() state = self.env.reset() self.reward_list.append(0) self.loss_list.append(0) self.max_q_list.append(0) self.ep_len = 0 done = False # Save Model self.save_model(i_episode) # Collect garbage self.collect_garbage(i_episode) # Run the game while not done: # Update the target network, copying all weights and biases in DQN if self.t % self.args.target_update == 0: print("Updating target network . . .") self.target_net.load_state_dict( self.policy_net.state_dict()) # Select and perform an action self.cur_eps = max(self.args.eps_min, self.cur_eps - self.eps_delta) if self.cur_eps == self.args.eps_min: self.mode = 'Exploit' else: self.mode = "Explore" action, q = self.make_action(state) next_state, reward, done, _ = self.env.step(action) self.reward_list[-1] += reward self.max_q_list[-1] = max(self.max_q_list[-1], q) # Store the transition in memory self.replay_buffer.push(state, action, next_state, reward, done) self.meta.update_step(self.t, self.cur_eps, self.reward_list[-1], self.max_q_list[-1], self.loss_list[-1], self.cur_lr) # Increment step and Episode Length self.t += 1 self.ep_len += 1 # Move to the next state state = next_state # Perform one step of the optimization (on the target network) if self.ep_len % self.args.learn_freq == 0: loss = self.optimize_model() self.loss_list[-1] += loss self.loss_list[-1] /= self.ep_len # Decay Step: if self.args.lr_scheduler: self.cur_lr = self.scheduler.get_lr()[0] if i_episode % self.args.lr_decay_step == 0 and self.cur_lr > self.args.lr_min: self.scheduler.step(i_episode) # Update meta self.meta.update_episode( i_episode, self.t, time.time() - start_time, time.time() - train_start, self.ep_len, len(self.replay_buffer.memory), self.cur_eps, self.reward_list[-1], np.mean(self.reward_list), self.max_q_list[-1], np.mean(self.max_q_list), self.loss_list[-1], np.mean(self.loss_list), self.mode, self.cur_lr) import multiprocessing as mp processes = [] for rank in range(4): p = mp.Process(target=train_fn) p.start() processes.append(p) for p in processes: p.join()
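The channel_first, save_model, collect_garbage, load_model and MetaData helpers referenced by this agent are defined elsewhere. A plausible sketch of channel_first only, written here as a free function (in the class it is a method, and it may additionally normalize the frames and move them to self.args.device):

import numpy as np
import torch

def channel_first(x):
    """Assumed helper: convert an HWC observation or an NHWC batch into a
    float NCHW tensor, covering the two call sites above (a single
    (84, 84, 4) observation in make_action, a (batch, 84, 84, 4) tensor
    in optimize_model)."""
    if isinstance(x, np.ndarray):
        x = torch.tensor(x, dtype=torch.float32)
    if x.dim() == 3:              # single observation -> add a batch dimension
        x = x.unsqueeze(0)
    return x.permute(0, 3, 1, 2)  # NHWC -> NCHW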
class Agent_DQN(Agent): def __init__(self, env, args): """ Initialize everything you need here. For example: paramters for neural network initialize Q net and target Q net parameters for repaly buffer parameters for q-learning; decaying epsilon-greedy ... """ super(Agent_DQN, self).__init__(env) ########################### # YOUR IMPLEMENTATION HERE # # import arguments self.args = args self.env = env self.batch_size = self.args.batch_size self.gamma = self.args.gamma self.lr = self.args.learning_rate self.memory_cap = self.args.memory_cap self.n_episode = self.args.n_episode self.n_step = self.args.n_step self.update_f = self.args.update_f self.explore_step = self.args.explore_step self.action_size = self.args.action_size self.algorithm = self.args.algorithm self.save_path = "dqn/" print('using algorithm ', self.algorithm) # whether continue training self.load_model = self.args.load_model # unify tensor tpye according to device names self.use_cuda = torch.cuda.is_available() self.device = torch.device("cuda" if self.use_cuda else "cpu") print('using device ', torch.cuda.get_device_name(0)) self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor self.LongTensor = torch.cuda.LongTensor if self.use_cuda else torch.LongTensor self.ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor self.Tensor = self.FloatTensor # default type # epsilon decay self.epsilon = 1.0 self.epsilon_min = 0.025 self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step # Create the policy net and the target net self.policy_net = DQN() self.policy_net.to(self.device) if self.algorithm == 'DDQN': self.policy_net_2 = DQN() self.policy_net_2.to(self.device) self.target_net = DQN() self.target_net.to(self.device) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # replay buffer self.memory = [] # optimizer self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=self.lr) if self.algorithm == 'DDQN': self.optimizer_2 = optim.Adam( params=self.policy_net_2.parameters(), lr=self.lr) # other self.f_skip = 4 # frame skip self.n_avg_reward = 100 self.f_print = 100 self.print_test = False if args.test_dqn: #you can load your model here print('loading trained model') ########################### # YOUR IMPLEMENTATION HERE # self.policy_net.load_state_dict( torch.load('model.pth', map_location=self.device)) self.target_net.load_state_dict(self.policy_net.state_dict()) if self.algorithm == 'DDQN': self.policy_net_2.load_state_dict( torch.load('model.pth', map_location=self.device)) self.print_test = True def init_game_setting(self): """ Testing function will call this function at the begining of new game Put anything you want to initialize if necessary. If no parameters need to be initialized, you can leave it as blank. """ ########################### # YOUR IMPLEMENTATION HERE # state = self.env.reset() / 255. self.last_life = 5 self.step = 0 done = False total_reward = 0 ########################### return state, done, total_reward def make_action(self, observation, test=False): """ Return predicted action of your agent Input: observation: np.array stack 4 last preprocessed frames, shape: (84, 84, 4) Return: action: int the predicted action from trained model """ ########################### # YOUR IMPLEMENTATION HERE # if test: self.epsilon = self.epsilon_min observation = observation / 255. 
else: self.epsilon = max(self.epsilon - self.epsilon_decay, self.epsilon_min) if random.random() > self.epsilon: observation = self.Tensor(observation.reshape( (1, 84, 84, 4))).transpose(1, 3).transpose(2, 3) state_action_value = self.policy_net( observation).data.cpu().numpy() action = np.argmax(state_action_value) else: action = random.randint(0, self.action_size - 1) ########################### return action def push(self, state, action, reward, next_state, dead, done): """ You can add additional arguments as you need. Push new data to buffer and remove the old one if the buffer is full. Hints: ----- you can consider deque(maxlen = 10000) list """ ########################### # YOUR IMPLEMENTATION HERE # if len(self.memory) >= self.memory_cap: self.memory.pop(0) self.memory.append((state, action, reward, next_state, dead, done)) ########################### def replay_buffer(self): """ You can add additional arguments as you need. Select batch from buffer. """ ########################### # YOUR IMPLEMENTATION HERE # self.mini_batch = random.sample(self.memory, self.batch_size) ########################### return def train(self): """ Implement your training algorithm here """ ########################### # YOUR IMPLEMENTATION HERE # # initialize self.steps_done = 0 self.steps = [] self.rewards = [] self.mean_rewards = [] self.best_reward = 0 self.last_saved_reward = 0 start = time.time() logfile = open('dqn.log', 'w+') # continue training if self.load_model: self.policy_net.load_state_dict( torch.load(self.save_path + 'model.pth', map_location=self.device)) self.target_net.load_state_dict(self.policy_net.state_dict()) self.epsilon = self.epsilon_min for episode in range(self.n_episode): state, done, total_reward = self.init_game_setting() while (not done) and self.step < 10000: # move to next state self.step += 1 self.steps_done += 1 action = self.make_action(state) next_state, reward, done, life = self.env.step(action) # lives matter now_life = life['ale.lives'] dead = (now_life < self.last_life) self.last_life = now_life next_state = next_state / 255. 
# Store the transition in memory self.push(state, action, reward, next_state, dead, done) state = next_state total_reward += reward if len(self.memory ) >= self.n_step and self.steps_done % self.f_skip == 0: if self.algorithm == 'DQN': self.optimize_DQN() elif self.algorithm == 'DDQN': self.optimize_DDQN() if self.steps_done % self.update_f == 0: self.target_net.load_state_dict( self.policy_net.state_dict()) self.rewards.append(total_reward) self.mean_reward = np.mean(self.rewards[-self.n_avg_reward:]) self.mean_rewards.append(self.mean_reward) self.steps.append(self.step) # print progress in terminal progress = "Episode: " + str( episode) + ",\tCurrent mean reward: " + "{:.2f}".format( self.mean_reward ) + ',\tBest mean reward: ' + "{:.2f}".format(self.best_reward) progress += ",\tCurerent Reward: " + str( total_reward) + ",\tTime: " + time.strftime( '%H:%M:%S', time.gmtime(time.time() - start)) print(progress) print(episode, self.mean_reward, self.best_reward, total_reward, time.time() - start, file=logfile) logfile.flush() if (episode + 1) % self.f_print == 0: self.plots() # save the best model if self.mean_reward > self.best_reward and self.steps_done > self.n_step: checkpoint_path = self.save_path + 'model.pth' torch.save(self.policy_net.state_dict(), checkpoint_path) self.last_saved_reward = self.mean_reward self.best_reward = max(self.mean_reward, self.best_reward) ########################### def optimize_DQN(self): # sample self.replay_buffer() state, action, reward, next_state, dead, done = zip(*self.mini_batch) state = self.Tensor(np.float32(state)).permute(0, 3, 1, 2).to(self.device) action = self.LongTensor(action).to(self.device) reward = self.Tensor(reward).to(self.device) next_state = self.Tensor(np.float32(next_state)).permute( 0, 3, 1, 2).to(self.device) dead = self.Tensor(dead).to(self.device) done = self.Tensor(done).to(self.device) # Compute Q(s_t, a) state_action_values = self.policy_net(state).gather( 1, action.unsqueeze(1)).squeeze(1) # Compute next Q, including the mask next_state_values = self.target_net(next_state).detach().max(1)[0] # Compute the expected Q value. stop update if done expected_state_action_values = reward + (next_state_values * self.gamma) * (1 - done) # Compute Huber loss self.loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.data) # Optimize the model self.optimizer.zero_grad() self.loss.backward() self.optimizer.step() return def optimize_DDQN(self): # sample self.replay_buffer() state, action, reward, next_state, dead, done = zip(*self.mini_batch) # transfer 1*84*84*4 to 1*4*84*84, which is 0,3,1,2 state = self.Tensor(np.float32(state)).permute(0, 3, 1, 2).to(self.device) action = self.LongTensor(action).to(self.device) reward = self.Tensor(reward).to(self.device) next_state = self.Tensor(np.float32(next_state)).permute( 0, 3, 1, 2).to(self.device) dead = self.Tensor(dead).to(self.device) done = self.Tensor(done).to(self.device) # Compute Q(s_t, a) state_action_values = self.policy_net(state).gather( 1, action.unsqueeze(1)).squeeze(1) state_action_values_2 = self.policy_net_2(state).gather( 1, action.unsqueeze(1)).squeeze(1) # Compute next Q, including the mask next_state_values = self.target_net(next_state).detach().max(1)[0] next_state_values_2 = self.target_net(next_state).detach().max(1)[0] next_state_values = torch.min(next_state_values, next_state_values_2) # Compute the expected Q value. 
stop update if done expected_state_action_values = reward + (next_state_values * self.gamma) * (1 - done) # Compute Huber loss self.loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.data) self.loss_2 = F.smooth_l1_loss(state_action_values_2, expected_state_action_values.data) # Optimize the model self.optimizer.zero_grad() self.loss.backward() self.optimizer.step() self.optimizer_2.zero_grad() self.loss_2.backward() self.optimizer_2.step() return def plots(self): fig1 = plt.figure(1) plt.clf() plt.title('Training_Steps_per_Episode') plt.xlabel('Episode') plt.ylabel('Steps') plt.plot(self.steps) fig1.savefig(self.save_path + 'steps.png') fig2 = plt.figure(2) plt.clf() plt.title('Training...') plt.xlabel('Episode') plt.ylabel('Reward') plt.plot(self.rewards) if len(self.rewards) >= self.n_avg_reward: plt.plot(self.mean_rewards) fig2.savefig(self.save_path + 'rewards.png') rewards = np.array(self.rewards) np.save(self.save_path + 'rewards.npy', rewards)
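In optimize_DDQN above, next_state_values and next_state_values_2 are both computed from the same target_net, so the torch.min is a no-op as written; presumably one of them was meant to come from a second network. For comparison, a minimal sketch of the classic Double-DQN bootstrap target, where the policy network selects the action and the target network evaluates it (argument names mirror the tensors built inside optimize_DDQN):

import torch

def double_dqn_target(policy_net, target_net, reward, next_state, done, gamma):
    """Sketch only: y = r + gamma * Q_target(s', argmax_a Q_policy(s', a)) * (1 - done)."""
    with torch.no_grad():
        best_actions = policy_net(next_state).argmax(dim=1, keepdim=True)
        next_q = target_net(next_state).gather(1, best_actions).squeeze(1)
        return reward + gamma * next_q * (1 - done)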
class Simulation: """ Simulation for the game of 3D Pong. Parameters ---------- params: dict Dictionary of all the simulation parameters """ def __init__(self, params, player_n=0): # unpack the parameters: #### simulation self.device = params["device"] self.env_name = params["env_name"] self.training_frames = params["training_frames"] self.skip_frames = params["skip_frames"] self.nactions = params["nactions"] self.messages_enabled = params["messages_enabled"] self.selfplay = params["selfplay"] #### qnet model self.learning_rate = params["learning_rate"] self.sync = params["sync"] self.load_from = params["load_from"] #### buffer self.batch_size = params["batch_size"] self.replay_size = params["replay_size"] self.nstep = params["nstep"] #### agent model self.gamma = params["gamma"] self.eps_start = params["eps_start"] self.eps_end = params["eps_end"] self.eps_decay_rate = params["eps_decay_rate"] self.player_n = player_n self.double = params["double"] # initialize the simulation with shared properties self.env = gym.make( self.env_name ) # environment, agent etc. can"t be created jointly in a server simulation self.net = DQN(self.env.observation_space.shape[0], self.nactions**2).to(self.device) def _create_environment(self): """ create a gym environment for the simulation. Actions are discretized into nactions and frames are skipped for faster training :return: env """ env = gym.make(self.env_name) if self.selfplay: env.unwrapped.multiplayer(env, game_server_guid="selfplayer", player_n=self.player_n) env = wrappers.action_space_discretizer(env, n=self.nactions) env = wrappers.SkipEnv(env, skip=self.skip_frames) return env def _create_agent(self, env): """ Create agent with buffer for the simulation. :return: agent """ # buffer = ExperienceBuffer(self.replay_size) buffer = Extendedbuffer(self.replay_size, nstep=self.nstep, gamma=self.gamma) agent = pongagent.Pongagent(env, self.player_n, buffer) return agent def _create_model(self): """ Create a deep Q model for function approximation with Adam optimizer. :return: net, tgt_net, optimizer """ tgt_net = DQN(self.env.observation_space.shape[0], self.nactions**2).to(self.device) if self.load_from is not None: assert type( self.load_from ) == str, "Name of model to be loaded has to be a string!" 
self.net.load_state_dict(torch.load(self.load_from)) tgt_net.load_state_dict(torch.load(self.load_from)) optimizer = optim.Adam(self.net.parameters(), lr=self.learning_rate) return tgt_net, optimizer def _init_non_shared(self, player_n): env = self._create_environment() tgt_net, optimizer = self._create_model() agent = self._create_agent(env) writer = SummaryWriter( comment="-" + "player" + str(player_n) + "batch" + str(self.batch_size) + "_n" + str(env.action_space.n) + "_eps" + str(self.eps_decay_rate) + "_skip" + str(self.skip_frames) + "learning_rate" + str(self.learning_rate)) return env, agent, tgt_net, optimizer, writer def _fill_buffer(self, agent): if self.messages_enabled: print("Player populating Buffer ...") agent.exp_buffer.fill(agent.env, self.replay_size, self.nstep) if self.messages_enabled: print("Buffer_populated!") def train(self, net, player_n=0): self.net = net env, agent, tgt_net, optimizer, writer = self._init_non_shared( player_n) self._fill_buffer(agent) if self.messages_enabled: print("Player %i start training: " % player_n) reward = [] for frame in range(self.training_frames): epsilon = max(self.eps_end, self.eps_start - frame / self.eps_decay_rate) ep_reward = agent.play_step(net, epsilon, self.device) if ep_reward: reward.append(ep_reward) writer.add_scalar("episode_reward", ep_reward, frame) writer.add_scalar("mean100_reward", np.mean(reward[-100:]), frame) if (frame % self.sync) == 0: tgt_net.load_state_dict( net.state_dict()) # Syncs target and Standard net if self.messages_enabled: print("We are at: %7i / %7i frames" % (frame, self.training_frames)) if player_n == 0: torch.save(net.state_dict(), self.env_name + "-time_update.dat") optimizer.zero_grad() batch = agent.exp_buffer.sample(self.batch_size) loss_t = calc_loss(batch, net, tgt_net, self.gamma**self.nstep, self.double, self.device) loss_t.backward() optimizer.step() writer.add_scalar("loss", loss_t, frame) writer.add_scalar("epsilon", epsilon, frame) writer.close() if self.messages_enabled: print("Player %i end training!" % player_n) torch.save(net.state_dict(), self.env_name + "end_of_training.dat") return np.mean(reward[-len(reward) // 2:]) # TODO: clean this function! def run(self, mode="play"): """ runs the simulation. :param mode: str, either "play" or "train" :return: mean reward over all episodes with eps_end """ if mode == "train": reward = self.train(self.net) return reward elif mode == "play": # Run play.py to see model in action pass else: raise Exception("Mode should be either play or train")
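The Extendedbuffer and calc_loss helpers used by Simulation are not shown; calc_loss is handed gamma**self.nstep because the buffer stores n-step transitions whose reward already folds in the intermediate discounts. A rough sketch of how such an n-step transition is typically accumulated (names and structure are assumptions about Extendedbuffer's internals):

from collections import namedtuple

NStepTransition = namedtuple('NStepTransition',
                             ['state', 'action', 'reward', 'next_state', 'done'])

def accumulate_nstep(window, gamma):
    """Collapse a non-empty window of 1-step transitions into one n-step transition.

    The accumulated reward is sum_k gamma**k * r_k, which is why the TD target
    in calc_loss then discounts the bootstrap term by gamma**nstep."""
    R, done = 0.0, False
    for k, t in enumerate(window):
        R += (gamma ** k) * t.reward
        if t.done:
            done = True
            break
    first, last = window[0], window[k]
    return NStepTransition(first.state, first.action, R, last.next_state, done)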
def ddqn_rankBatch_train(env, scheduler, optimizer_constructor, model_type, batch_size, rp_start, rp_size, exp_frame, exp_initial, exp_final, inital_beta, gamma, target_update_steps, frames_per_epoch, frames_per_state, output_directory, last_checkpoint): """ Implementation of the training algorithm for DDQN using Rank-based prioritization. Information with regards to the algorithm can be found in the paper, "Prioritized Experience Replay" by Tom Schaul, John Quan, Ioannis Antonoglou and David Silver. Refer to section 3.3 in the paper for more info. """ gym.undo_logger_setup() logging.basicConfig(filename='ddqn_rank_training.log', level=logging.INFO) num_actions = env.action_space.n env.reset() print('No. of actions: ', num_actions) print(env.unwrapped.get_action_meanings()) # initialize action value and target network with the same weights model = DQN(num_actions, use_bn=False) target = DQN(num_actions, use_bn=False) if use_cuda: model.cuda() target.cuda() frames_count = 1 if last_checkpoint: model.load_state_dict(torch.load(last_checkpoint)) print(last_checkpoint) print('weights loaded...') exp_replay = util.initialize_rank_replay_resume( env, rp_start, rp_size, frames_per_state, model, target, gamma, batch_size) frames_count = get_index_from_checkpoint_path(last_checkpoint) else: exp_replay = util.initialize_rank_replay(env, rp_start, rp_size, frames_per_state, model, target, gamma) target.load_state_dict(model.state_dict()) optimizer = optimizer_constructor.type( model.parameters(), lr=optimizer_constructor.kwargs['lr'], alpha=optimizer_constructor.kwargs['alpha'], eps=optimizer_constructor.kwargs['eps']) episodes_count = 1 frames_per_episode = 1 epsiodes_durations = [] rewards_per_episode = 0 rewards_duration = [] loss_per_epoch = [] current_state, _, _, _ = util.play_game(env, frames_per_state) print('Starting training...') count = 0 while True: epsilon = scheduler.anneal_linear(frames_count) choice = random.uniform(0, 1) # epsilon greedy algorithm if choice <= epsilon: action = LongTensor([[random.randrange(num_actions)]]) else: action = util.get_greedy_action(model, current_state) curr_obs, reward, done, _ = util.play_game(env, frames_per_state, action[0][0]) rewards_per_episode += reward reward = Tensor([[reward]]) current_state_ex = Variable(current_state, volatile=True) curr_obs_ex = Variable(curr_obs, volatile=True) action_ex = Variable(action, volatile=True) reward_ex = Variable(reward, volatile=True) #compute td-error for one sample td_error = ddqn_compute_td_error(batch_size=1, state_batch=current_state_ex, reward_batch=reward_ex, action_batch=action_ex, next_state_batch=curr_obs_ex, model=model, target=target, gamma=gamma) td_error = torch.abs(td_error) exp_replay.push(current_state_ex, action_ex, reward_ex, curr_obs_ex, td_error) current_state = curr_obs # compute y if len(exp_replay) >= batch_size: # Get batch samples obs_samples, obs_ranks, obs_priorityVals = exp_replay.sample( batch_size) obs_priorityTensor = torch.from_numpy(np.array(obs_priorityVals)) p_batch = 1 / obs_priorityTensor w_batch = (1 / len(exp_replay) * p_batch)**inital_beta max_weight = exp_replay.get_max_weight(inital_beta) params_grad = [] for i in range(len(obs_samples)): sample = obs_samples[i] sample.state.volatile = False sample.next_state.volatile = False sample.reward.volatile = False sample.action.volatile = False loss = ddqn_compute_y(batch_size=1, state_batch=sample.state, reward_batch=sample.reward, action_batch=sample.action, next_state_batch=sample.next_state, model=model, target=target, 
gamma=gamma) loss_abs = torch.abs(loss) exp_replay.update(obs_ranks[i], loss_abs) for param in model.parameters(): if param.grad is not None: param.grad.data.zero_() loss.backward() #accumulate weight change if i == 0: for param in model.parameters(): tmp = ((w_batch[i] / max_weight) * loss.data[0]) * param.grad.data params_grad.append(tmp) else: paramIndex = 0 for param in model.parameters(): tmp = ((w_batch[i] / max_weight) * loss.data[0]) * param.grad.data params_grad[paramIndex] = tmp + params_grad[paramIndex] paramIndex += 1 # update weights paramIndex = 0 for param in model.parameters(): param.data += params_grad[paramIndex].mul( optimizer_constructor.kwargs['lr']).type(Tensor) paramIndex += 1 frames_count += 1 frames_per_episode += frames_per_state if done: rewards_duration.append(rewards_per_episode) rewards_per_episode = 0 frames_per_episode = 1 episodes_count += 1 env.reset() current_state, _, _, _ = util.play_game(env, frames_per_state) if episodes_count % 100 == 0: avg_episode_reward = sum(rewards_duration) / 100.0 avg_reward_content = 'Episode from', episodes_count - 99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum( loss_per_epoch) print(avg_reward_content) logging.info(avg_reward_content) rewards_duration = [] loss_per_epoch = [] # update weights of target network for every TARGET_UPDATE_FREQ steps if frames_count % target_update_steps == 0: target.load_state_dict(model.state_dict()) # print('weights updated at frame no. ', frames_count) #Save weights every 250k frames if frames_count % 250000 == 0: util.make_sure_path_exists(output_directory + model_type + '/') torch.save(model.state_dict(), 'rank_weights_' + str(frames_count) + '.pth') #Print frame count and sort experience replay for every 1000000 (one million) frames: if frames_count % 1000000 == 0: training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon print(training_update) logging.info(training_update) exp_replay.sort()
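For reference, the weighting used above matches the importance-sampling correction from the Prioritized Experience Replay paper: w_i = (N * P(i))^(-beta), normalized by the maximum weight, which is what w_batch = (1 / len(exp_replay) * p_batch)**inital_beta followed by division by max_weight computes. A tiny worked sketch with made-up rank-based priorities:

import numpy as np

# Hypothetical rank-based priorities: P(i) proportional to 1/rank(i).
ranks = np.array([1.0, 2.0, 3.0, 4.0])
P = (1.0 / ranks) / np.sum(1.0 / ranks)

N = 4        # replay size (tiny, for illustration only)
beta = 0.5   # plays the role of `inital_beta` above

w = (N * P) ** (-beta)   # importance-sampling weights
w /= w.max()             # normalized over this sample for brevity; the code above divides by the
                         # maximum weight over the whole replay (get_max_weight)
print(w)                 # the highest-priority sample gets the smallest weight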