def testReplayMemory(self):
    od = [84, 84, 4]   # observation shape
    ad = [8, 10]       # action shape
    rd = [5]           # reward shape
    s = 10000          # buffer capacity
    b = 32             # batch size
    rm = ReplayMemory(obs_dim=od, act_dim=ad, r_dim=rd, size=s)

    o = self.get_rand(od)
    a = self.get_rand(ad)
    r = self.get_rand(rd)
    d = 0
    for _ in range(1000):
        rm.store(o, a, r, o, d)

    o_s, a_s, r_s, on_s, d_s = rm.sample(b)
    self.assertEqual(o_s.shape, combined_shape(b, od))
    self.assertEqual(a_s.shape, combined_shape(b, ad))
    self.assertEqual(r_s.shape, combined_shape(b, rd))
    self.assertEqual(on_s.shape, combined_shape(b, od))
    self.assertEqual(d_s.shape, combined_shape(b))
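# The test above assumes a NumPy-backed buffer with store()/sample() and a
# combined_shape() helper, neither of which is shown here. The following is a
# minimal sketch of plausible implementations, with names and signatures
# inferred from the test rather than taken from the original code.
import numpy as np


def combined_shape(length, shape=None):
    # (length,) if no shape is given, else (length, *shape)
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)


class ReplayMemory:
    def __init__(self, obs_dim, act_dim, r_dim, size):
        self.obs_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(combined_shape(size, r_dim), dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # overwrite the oldest entries once the buffer is full
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.obs2_buf[self.ptr] = next_obs
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return (self.obs_buf[idxs], self.act_buf[idxs], self.rew_buf[idxs],
                self.obs2_buf[idxs], self.done_buf[idxs])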
import copy
import math
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# DQN, QMixer, ReplayMemory, Transition and PairAlgorithm are project-local
# modules not shown in this snippet.


class Agent:
    def __init__(self,
                 env,
                 input_size,
                 output_size,
                 hidden_size,
                 max_cars=10,
                 max_passengers=10,
                 mix_hidden=32,
                 batch_size=128,
                 lr=0.001,
                 gamma=.999,
                 eps_start=0.9,
                 eps_end=0.05,
                 eps_decay=750,
                 replay_capacity=10000,
                 num_save=200,
                 num_episodes=10000,
                 mode="random",
                 training=False,
                 load_file=None):
        self.env = env
        self.orig_env = copy.deepcopy(env)
        self.grid_map = env.grid_map
        self.cars = env.grid_map.cars
        self.num_cars = len(self.cars)
        self.passengers = env.grid_map.passengers
        self.num_passengers = len(self.passengers)
        self.max_cars = max_cars
        self.max_passengers = max_passengers
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.replay_capacity = replay_capacity
        self.num_episodes = num_episodes
        self.steps_done = 0
        self.lr = lr
        self.mode = mode
        self.num_save = num_save
        self.training = training
        self.algorithm = PairAlgorithm()

        self.episode_durations = []
        self.duration_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.count_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.loss_history = []
        self.memory = ReplayMemory(self.replay_capacity)

        self.device = torch.device("cpu")
        print("Device being used:", self.device)

        self.policy_net = DQN(self.input_size, self.output_size,
                              self.hidden_size).to(self.device)
        self.params = list(self.policy_net.parameters())

        if self.mode == "qmix":
            self.mixer = QMixer(self.input_size, self.max_passengers,
                                mix_hidden).to(self.device)
            self.params += list(self.mixer.parameters())

        if load_file:
            self.policy_net.load_state_dict(torch.load(load_file))
            if self.mode == "qmix":
                self.mixer.load_state_dict(torch.load("mixer_" + load_file))
                self.mixer.eval()
            self.policy_net.eval()
            self.load_file = "Pretrained_" + load_file
            print("Checkpoint loaded")
        else:
            self.load_file = self.mode + "_model_num_cars_" + str(self.num_cars) + \
                "_num_passengers_" + str(self.num_passengers) + \
                "_num_episodes_" + str(self.num_episodes) + \
                "_hidden_size_" + str(self.hidden_size) + ".pth"

        self.optimizer = optim.RMSprop(self.params, lr=self.lr)
        #self.optimizer = optim.Adam(self.params, lr=self.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
        #self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1500, gamma=0.1)

    def select_action(self, state):
        # Select action with epsilon greedy
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)
        print(eps_threshold)
        self.steps_done += 1
        if not self.training:
            eps_threshold = 0.0

        if sample > eps_threshold:
            # Choose best action
            with torch.no_grad():
                self.policy_net.eval()
                action = self.policy_net(state).view(
                    self.max_passengers,
                    self.max_cars)[:, :self.num_cars].max(1)[1].view(
                        1, self.max_passengers)
                action[0, self.num_passengers:] = self.max_cars
                return action
        else:
            # Choose random action
            action = torch.tensor(
                [[random.randrange(self.num_cars)
                  for car in range(self.max_passengers)]],
                device=self.device,
                dtype=torch.long)
            action[0, self.num_passengers:] = self.max_cars
            return action

    def random_action(self, state):
        return torch.tensor(
            [[random.randrange(self.num_cars)
              for car in range(self.num_passengers)]],
            device=self.device,
            dtype=torch.long)

    def get_state(self):
        # Cars (px, py, 1=matched), Passengers (pickup_x, pickup_y, dest_x, dest_y, 1=matched)
        # Vector size = 3*C + 5*P
        cars = self.cars
        passengers = self.passengers
        indicator_cars_vec = np.zeros(self.max_cars)
        indicator_passengers_vec = np.zeros(self.max_passengers)

        # Encode information about cars
        cars_vec = np.array([0] * (2 * self.max_cars))
        for i, car in enumerate(cars):
            cars_vec[2 * i:2 * i + 2] = [car.position[0], car.position[1]]
            indicator_cars_vec[i] = 1

        # Encode information about passengers
        passengers_vec = np.array([0] * (4 * self.max_passengers))
        for i, passenger in enumerate(passengers):
            passengers_vec[4 * i:4 * i + 4] = [
                passenger.pick_up_point[0], passenger.pick_up_point[1],
                passenger.drop_off_point[0], passenger.drop_off_point[1]
            ]
            indicator_passengers_vec[i] = 1

        return torch.tensor(np.concatenate(
            (cars_vec, indicator_cars_vec, passengers_vec,
             indicator_passengers_vec)),
                            device=self.device,
                            dtype=torch.float).unsqueeze(0)

    def train(self):
        duration_sum = 0.0
        for episode in range(self.num_episodes):
            self.reset_different_num()
            #self.reset()
            #self.reset_orig_env()
            state = self.get_state()

            if self.mode == "dqn" or self.mode == "qmix":
                action = self.select_action(state)
            elif self.mode == "random":
                action = self.random_action([state])
            elif self.mode == "greedy":
                action = [self.algorithm.greedy_fcfs(self.grid_map)]
                action = torch.tensor(action,
                                      device=self.device,
                                      dtype=torch.long)

            #print(action.size())
            #print(action[:,:self.num_passengers])
            reward, duration = self.env.step(action[:, :self.num_passengers],
                                             self.mode)
            if self.mode == "dqn":
                reward.extend([0] * (self.max_passengers - self.num_passengers))

            self.episode_durations.append(duration)
            count = self.count_matrix[self.num_passengers - 1, self.num_cars - 1]
            self.duration_matrix[self.num_passengers - 1, self.num_cars - 1] = \
                self.duration_matrix[self.num_passengers - 1, self.num_cars - 1] * \
                (count / (count + 1)) + duration / (count + 1)
            self.count_matrix[self.num_passengers - 1, self.num_cars - 1] += 1
            duration_sum += duration

            if self.training:
                self.memory.push(
                    state, action,
                    torch.tensor(reward, device=self.device,
                                 dtype=torch.float).unsqueeze(0))
                self.optimize_model()
                self.plot_durations(self.mode)
                self.plot_loss_history(self.mode)

            if self.training and episode % self.num_save == 0:
                torch.save(self.policy_net.state_dict(),
                           "episode_" + str(episode) + "_" + self.load_file)
                if self.mode == "qmix":
                    torch.save(
                        self.mixer.state_dict(),
                        "mixer_episode_" + str(episode) + "_" + self.load_file)
                print("Checkpoint saved")

            print("Episode: ", episode)

        if self.training:
            torch.save(self.policy_net.state_dict(), self.load_file)
            if self.mode == "qmix":
                torch.save(self.mixer.state_dict(), "mixer_" + self.load_file)
            print("Checkpoint saved")
print("Average duration was ", duration_sum / self.num_episodes) print("Finished") np.save("Duration_matrix", self.duration_matrix) np.save("Count_matrix", self.count_matrix) print(self.duration_matrix) print(self.count_matrix) def reset(self): self.env.reset() self.grid_map = self.env.grid_map self.cars = self.env.grid_map.cars self.passengers = self.env.grid_map.passengers def reset_different_num(self): self.env.grid_map.cars = [] self.env.grid_map.passengers = [] self.env.grid_map.num_passengers = random.randint( 1, self.max_passengers) self.env.grid_map.num_cars = random.randint(1, self.max_cars) self.env.grid_map.add_passenger(self.env.grid_map.num_passengers) self.env.grid_map.add_cars(self.env.grid_map.num_cars) self.grid_map = self.env.grid_map self.num_passengers = self.env.grid_map.num_passengers self.num_cars = self.env.grid_map.num_cars self.cars = self.env.grid_map.cars self.passengers = self.env.grid_map.passengers def reset_orig_env(self): self.env = copy.deepcopy(self.orig_env) self.grid_map = self.env.grid_map self.cars = self.env.grid_map.cars self.passengers = self.env.grid_map.passengers self.grid_map.init_zero_map_cost() def optimize_model(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) state_batch = torch.cat(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) self.policy_net.train() q_values = self.policy_net(state_batch).view(self.batch_size, self.max_passengers, self.max_cars) q_values = torch.cat((q_values, torch.zeros( (self.batch_size, self.max_passengers, 1), device=self.device)), 2) state_action_values = q_values.gather( 2, action_batch.unsqueeze(2)).squeeze() # Compute the expected Q values expected_state_action_values = reward_batch # Compute Huber loss if self.mode == "dqn": loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) elif self.mode == "qmix": self.mixer.train() chosen_action_qvals = self.mixer(state_action_values, state_batch) loss = F.smooth_l1_loss(chosen_action_qvals, reward_batch.view(-1, 1, 1)) #loss = F.mse_loss(chosen_action_qvals, reward_batch.view(-1, 1, 1)) self.loss_history.append(loss.item()) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() def plot_durations(self, filename): print("Saving durations plot ...") plt.figure(2) plt.clf() total_steps = np.array(self.episode_durations) N = len(total_steps) window_size = 200 if N < window_size: total_steps_smoothed = total_steps else: total_steps_smoothed = np.zeros(N - window_size) for i in range(N - window_size): window_steps = total_steps[i:i + window_size] total_steps_smoothed[i] = np.average(window_steps) plt.title('Episode Duration history') plt.xlabel('Episode') plt.ylabel('Duration') plt.plot(total_steps_smoothed) np.save("Duration_" + filename, total_steps_smoothed) #plt.savefig("Durations_history_" + filename) def plot_loss_history(self, filename): print("Saving loss history ...") plt.figure(2) plt.clf() #loss = torch.tensor(self.loss_history, dtype=torch.float) total_loss = np.array(self.loss_history) N = len(total_loss) window_size = 50 if N < window_size: total_loss_smoothed = total_loss else: total_loss_smoothed = np.zeros(N - window_size) for i in range(N - window_size): window_steps = total_loss[i:i + window_size] total_loss_smoothed[i] = np.average(window_steps) plt.title('Loss history') plt.xlabel('Episodes') 
        plt.ylabel('Loss')
        plt.plot(self.loss_history)
        np.save("Loss_" + filename, total_loss_smoothed)
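# The epsilon-greedy schedule in Agent.select_action() decays exponentially
# from eps_start to eps_end with time constant eps_decay. A self-contained
# sketch of the same formula, handy for checking how fast exploration anneals
# under the defaults above (the step values below are purely illustrative):
import math


def eps_threshold(steps_done, eps_start=0.9, eps_end=0.05, eps_decay=750):
    return eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)


if __name__ == "__main__":
    for step in (0, 500, 1000, 2000, 5000):
        print(step, round(eps_threshold(step), 3))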
class RLAgent(Player):
    def __init__(self,
                 name,
                 others=None,
                 last_n=10,
                 load_path=None,
                 checkpoint=5000,
                 fixed_strategy=False,
                 eps_decay=0.00005):
        if others is None:
            others = [1, 2]
        self.others = others
        self.last_n = last_n
        self.prev_points = 0
        self.batch_size = 32
        self.gamma = 0.9
        self.eps_start = 1
        self.eps_end = 0.01
        self.eps_decay = eps_decay
        self.target_update = 100
        self.plot_at = 1000
        self.q_max = []
        self.q_list = []
        self.checkpoint = checkpoint
        self.memory_size = 1000
        self.lr = 0.00001
        self.train = True
        self.input_dim = len(others) * 6
        self.output_dim = 3
        self.current_step = 1
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(self.memory_size)

        # Initialize the policy and target networks
        self.policy_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        if load_path is not None:
            checkpoint = torch.load(load_path)
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])
            self.policy_net.eval()
            self.eps_start = 0
            self.eps_end = 0
            self.train = False

        if fixed_strategy:
            self.strategy = FixedStrategy()
        else:
            self.strategy = EpsilonGreedyStrategy(self.eps_start, self.eps_end,
                                                  self.eps_decay)

        # Set the optimizer
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        self.loss = None

        # Push to replay memory
        self.prev_state = None
        self.action = None
        self.reward = None
        self.current_state = None

        super().__init__(name)

    def select_action(self, valid_actions, history):
        # print(self.memory.can_provide_sample(self.batch_size))
        if self.memory.can_provide_sample(self.batch_size) and self.train:
            self.train_model()

        if len(history) > self.last_n + 1:
            self.prev_state, self.current_state = self.get_states(history)
            self.reward = self.get_reward()
            if self.action is not None and self.train:
                self.memory.push(
                    Experience(self.prev_state, self.action,
                               self.current_state, self.reward))
            self.action = self.get_action(valid_actions)
            return self.action.item()
        else:
            return np.random.choice(valid_actions)

    def get_states(self, history):
        prev_state, current_state = [], []
        if len(history) > self.last_n + 1:
            for other in self.others:
                other_history = [i[other] for i in history]
                other_last_n = other_history[-self.last_n:]
                other_last_n_p = other_history[-self.last_n - 1:-1]
                other_policy_total = get_policy(other_history)
                other_policy_last_n = get_policy(other_last_n)
                other_policy_total_p = get_policy(other_history[:-1])
                other_policy_last_n_p = get_policy(other_last_n_p)
                prev_state.extend(other_policy_total_p + other_policy_last_n_p)
                current_state.extend(other_policy_total + other_policy_last_n)
            return torch.as_tensor(prev_state).unsqueeze(-2), torch.as_tensor(
                current_state).unsqueeze(-2)

    def get_reward(self):
        reward = self.points - self.prev_points
        self.prev_points = self.points
        return torch.tensor([reward])

    def get_action(self, valid_actions):
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1
        if rate > random.random():
            # For random, we can pass the allowable_moves vector and choose from it randomly
            action = np.random.choice(valid_actions)
            return torch.tensor([action]).to(self.device)  # explore
        else:
            with torch.no_grad():
                self.q_max.append(
                    self.policy_net(self.current_state).max().item())
                return self.policy_net(self.current_state).max(1)[1].to(
                    self.device)  # exploit

    def train_model(self):
        experiences = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states = extract_tensors(experiences)

        if self.current_step % self.target_update == 0:
            print('UPDATE TARGET NET', self.current_step)
            self.q_list.extend(self.q_max)
            print('Q Max', sum(self.q_max) / self.target_update)
            q_max_list.append(sum(self.q_max) / self.target_update)
            self.q_max = []
            self.target_net.load_state_dict(self.policy_net.state_dict())

        if self.current_step % self.plot_at == 0:
            e_ = self.memory.memory[-100:]
            batch = Experience(*zip(*e_))
            print('\n', '*' * 42)
            print('EXPLORATION RATE',
                  self.strategy.get_exploration_rate(self.current_step))
            print('REWARD', sum(batch.reward).item())
            print('POLICY', get_policy([i.item() for i in batch.action]))
            print('*' * 42, '\n')
            plt.plot(range(len(q_max_list)), q_max_list)
            plt.show()

        if self.current_step % self.checkpoint == 0:
            print('SAVE CHECKPOINT AT', self.current_step)
            # q_max_list, checkpoint_folder, checkpoint_prefix and
            # checkpoint_suffix are module-level names defined elsewhere
            checkpoint_path = checkpoint_folder + checkpoint_prefix + str(
                self.current_step) + checkpoint_suffix
            torch.save({'model_state_dict': self.policy_net.state_dict()},
                       checkpoint_path)

        current_q_values = QValues.get_current(self.policy_net, states, actions)
        next_q_values = QValues.get_next(self.policy_net, self.target_net,
                                         next_states)
        target_q_values = (next_q_values * self.gamma) + rewards

        self.loss = F.mse_loss(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
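# RLAgent relies on an Experience tuple and an extract_tensors() helper that
# are not shown in this snippet. A minimal sketch of what they might look like,
# matching how select_action() pushes (state, action, next_state, reward) and
# how train_model() unpacks states, actions, rewards, next_states; these are
# assumptions, not the original implementations.
from collections import namedtuple

import torch

Experience = namedtuple('Experience',
                        ('state', 'action', 'next_state', 'reward'))


def extract_tensors(experiences):
    # transpose a list of Experiences into one Experience of batched tensors
    batch = Experience(*zip(*experiences))
    states = torch.cat(batch.state)
    actions = torch.cat(batch.action)
    rewards = torch.cat(batch.reward)
    next_states = torch.cat(batch.next_state)
    return states, actions, rewards, next_states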
class DQNAgent:
    def __init__(self, inputs, n_actions):
        self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()
        self.set_params()
        self.optimizer = torch.optim.Adam(self.brain.parameters())
        self.memory = ReplayMemory(50000)
        self.action_space = [0, 1]

    def set_params(self):
        self.batch_size = 64
        self.max_exploration_rate = 1
        self.min_exploration_rate = 0.05
        self.exploration_decay_rate = 0.0005
        self.steps_done = 0

    def select_action(self, state):
        sample = np.random.random()
        exploration_rate = self.min_exploration_rate + (
            self.max_exploration_rate - self.min_exploration_rate) * np.exp(
                -self.steps_done * self.exploration_decay_rate)
        self.steps_done += 1
        if sample > exploration_rate:
            with torch.no_grad():
                actions = self.brain(state)
                return torch.argmax(actions).item()
        else:
            return np.random.choice(self.action_space)

    def learn(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)),
            dtype=torch.bool,
        )
        non_final_next_states = torch.tensor(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.tensor(batch.state)
        action_batch = torch.tensor(batch.action)
        reward_batch = torch.tensor(batch.reward, dtype=torch.float)

        state_action_values = self.brain(state_batch).gather(
            1, action_batch.unsqueeze(-1))

        # target-network bootstrap; terminal states keep a value of zero
        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_brain(
            non_final_next_states).max(1)[0].detach()

        gamma = 0.99
        expected_state_action_values = (gamma * next_state_values +
                                        reward_batch / reward_batch.max())

        self.loss = torch.nn.MSELoss()(
            expected_state_action_values.unsqueeze(-1), state_action_values)

        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
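# DQNAgent.learn() assumes a Transition namedtuple and a capacity-bounded
# ReplayMemory with push()/sample()/__len__, in the style of the standard
# PyTorch DQN tutorial. Neither is defined in this snippet, so the following is
# a plausible sketch rather than the original implementation.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        # saves a transition, discarding the oldest once capacity is reached
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)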
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 p=1.0,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=10,
                 target_update=10,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        q = (1.0 - p) / 2
        self.stochastic_actions = {
            '←': [[0, 2, 3], [p, q, q]],
            '→': [[1, 2, 3], [p, q, q]],
            '↑': [[2, 0, 1], [p, q, q]],
            '↓': [[3, 0, 1], [p, q, q]]
        }
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height
        self.nn = Model(in_features=self.state_len,
                        hidden=[],
                        out_features=len(Agent.actions))
        self.target_nn = Model(in_features=self.state_len,
                               hidden=[],
                               out_features=len(Agent.actions))
        self.target_nn.load_state_dict(self.nn.state_dict())
        self.target_nn.eval()
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
        self.target_update = target_update

    def step(self, state, action):
        # simulating Markov Process, desired action happens with probability p
        # but with the probability (1-p) / 2 the agent goes sideways
        sa = self.stochastic_actions[action]
        mp_action = np.random.choice(sa[0], p=sa[1])
        action = Agent.actions[mp_action]
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self._predict_q_policy(s)
                a = torch.argmax(q_predicted, 0).item()
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        z = np.zeros(self.state_len)
        z[s] = 1
        return torch.tensor(z, dtype=torch.float)

    def _predict_q_policy(self, s):
        return self.nn(self._encode_state(s))

    def _predict_q_target(self, s):
        return self.target_nn(self._encode_state(s))

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self._predict_q_policy(s)
            q_target = q_predicted.clone().detach()
            q_target[a] = r + self.y * self._predict_q_target(s1).max().item()
            loss = self.criterion(q_predicted, q_target)
            self.losses.append(loss.item())
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        episode_number = len(self.rewards)
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self._predict_q_policy(s)
            a = torch.argmax(q_predicted, 0).item()
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
        if episode_number % self.target_update == 0:
            self.target_nn.load_state_dict(self.nn.state_dict())
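# The gridworld agents above and below construct Model(in_features=...,
# hidden=[...], out_features=...), but the class itself is not shown. A minimal
# sketch of an MLP with that constructor signature (an assumption, not the
# original code): an empty hidden list yields a single linear layer.
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, in_features, hidden, out_features):
        super().__init__()
        layers = []
        last = in_features
        for h in hidden:
            layers += [nn.Linear(last, h), nn.ReLU()]
            last = h
        layers.append(nn.Linear(last, out_features))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)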
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=25,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height
        self.nn = Model(in_features=2,
                        hidden=[self.state_len, self.state_len],
                        out_features=len(Agent.actions))
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self._predict_q(s)
                a = torch.argmax(q_predicted, 0).item()
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        # one-hot alternative:
        # z = np.zeros(self.state_len)
        # z[s] = 1
        # return torch.tensor(z, dtype=torch.float)
        w = self.env.width
        x, y = s % w, s // w
        return torch.tensor([x, y], dtype=torch.float)

    def _predict_q(self, s):
        return self.nn(self._encode_state(s))

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self._predict_q(s)
            q_target = q_predicted.clone().detach()
            q_target[a] = r + self.y * self._predict_q(s1).max().item()
            loss = self.criterion(q_predicted, q_target)
            self.losses.append(loss.item())
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self._predict_q(s)
            a = torch.argmax(q_predicted, 0).item()
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 model,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=10,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.model = model
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self.predict_q(s)
                a = np.argmax(q_predicted)
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        z = np.zeros(self.env.length)
        z[s] = 1.0
        return np.array([z])

    def predict_q(self, s):
        return self.model.predict(self._encode_state(s))[0]

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self.predict_q(s)
            q_target = q_predicted
            q_target[a] = r + self.y * self.predict_q(s1).max()
            history = self.model.fit(x=self._encode_state(s),
                                     y=np.array([q_target]),
                                     epochs=1,
                                     verbose=False)
            self.losses.append(history.history["loss"][-1])

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self.predict_q(s)
            a = np.argmax(q_predicted)
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
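# The Keras-style Agent above expects a model with predict()/fit() to be passed
# in by the caller; the model itself is not part of this snippet. A sketch of a
# compatible network for an environment with `length` states and four actions;
# the layer sizes, optimizer, and helper name build_model are illustrative
# assumptions, not the original code.
from tensorflow import keras


def build_model(num_states, num_actions=4, lr=0.001):
    model = keras.Sequential([
        keras.layers.Dense(32, activation='relu', input_shape=(num_states,)),
        keras.layers.Dense(num_actions, activation='linear'),
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
                  loss='mse')
    return model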
def train(args):
    device = torch.device("cuda" if args.gpu else "cpu")

    env = Environment(draw=False,
                      fps=args.fps,
                      debug=args.debug,
                      dist_to_pipe=args.dist_to_pipe,
                      dist_between_pipes=args.dist_between_pipes,
                      obs_this_pipe=args.obs_this_pipe)
    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    policy_network = DQN(observation_space, action_space).to(device)
    target_network = DQN(observation_space, action_space).to(device)
    optimizer = torch.optim.Adam(policy_network.parameters(), lr=args.lr)
    replay_buffer = ReplayMemory(args.replay_capacity)
    writer = SummaryWriter()

    if args.inference:
        target_network.load_checkpoint()

    best_reward = None
    iteration = 0
    total_reward = 0.0
    rewards = []
    state = env.reset()

    while True:
        epsilon = max(args.final_eps,
                      args.start_eps - iteration / args.eps_decay_final_step)
        iteration += 1
        episode_reward = None

        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.get_action_random()
        else:
            state_v = torch.tensor(np.array([state], copy=False)).to(device)
            q_vals_v = policy_network(state_v.float())
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        next_state, reward, done = env.step(action)
        total_reward += reward
        replay_buffer.push(state, action, next_state, reward, done)
        state = next_state

        if done:
            episode_reward = total_reward
            state = env.reset()
            total_reward = 0.0

        if episode_reward is not None:
            rewards.append(episode_reward)
            mean_reward = np.mean(rewards[-80:])
            print(
                f"Episode {iteration}: eps {epsilon} mean reward {mean_reward} episode reward {episode_reward}"
            )
            writer.add_scalar("epsilon", epsilon, iteration)
            writer.add_scalar("mean_reward", mean_reward, iteration)
            writer.add_scalar("reward", episode_reward, iteration)

            if best_reward is None or best_reward < mean_reward:
                torch.save(policy_network.state_dict(),
                           f"./models/checkpoint_{iteration}")
                print(f"New best reward found: {best_reward} -> {mean_reward}")
                best_reward = mean_reward

            if mean_reward > args.goal_reward:
                print(f"Achieved in {iteration} steps.")
                break

        if len(replay_buffer) < args.replay_start_step:
            continue

        if iteration % args.target_update_iterations == 0:
            target_network.load_state_dict(policy_network.state_dict())

        optimizer.zero_grad()
        batch = replay_buffer.sample(args.batch_size)
        loss = calculate_loss(batch,
                              policy_network,
                              target_network,
                              args.gamma,
                              device=device)
        loss.backward()
        optimizer.step()

    writer.close()
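# calculate_loss() is called in train() but not defined in this snippet. Below
# is a sketch of a standard DQN TD-error loss, under the assumption that
# replay_buffer.sample() returns a list of (state, action, next_state, reward,
# done) tuples in the same order they are pushed above; the original helper may
# differ.
import numpy as np
import torch
import torch.nn as nn


def calculate_loss(batch, policy_network, target_network, gamma, device="cpu"):
    states, actions, next_states, rewards, dones = zip(*batch)

    states_v = torch.tensor(np.array(states), dtype=torch.float32).to(device)
    next_states_v = torch.tensor(np.array(next_states),
                                 dtype=torch.float32).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards, dtype=torch.float32).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) from the online network for the actions actually taken
    state_action_values = policy_network(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)

    # max_a' Q_target(s', a'), with terminal states zeroed out
    with torch.no_grad():
        next_state_values = target_network(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_values = rewards_v + gamma * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)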