# ReplayMemory is defined elsewhere in this repo.
import copy
import random

import torch
import torch.nn as nn
import torch.optim as optim


class DQNagent:

    def __init__(self, mem_size, epsilon, mini_batch_size, learning_rate, gamma):
        self.epsilon = epsilon
        self.mini_batch_size = mini_batch_size
        self.gamma = gamma
        self.update_counter = 0

        self.net = nn.Sequential(
            nn.Linear(2, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 3)
        ).float()
        self.net_target = copy.deepcopy(self.net)
        self.net = self.net.cuda()
        self.net_target = self.net_target.cuda()

        self.replay_memory = ReplayMemory(max_size=mem_size)
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)

    def get_action(self, obs, mode='e-greedy'):
        if mode == 'random':
            action = random.choice([0, 1, 2])
        elif mode == 'greedy':
            obs = torch.tensor(obs, dtype=torch.float).cuda()
            with torch.no_grad():
                action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        elif mode == 'e-greedy':
            action = random.choice([0, 1, 2])
            if random.random() >= self.epsilon:
                obs = torch.tensor(obs, dtype=torch.float).cuda()
                with torch.no_grad():
                    action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        assert isinstance(action, int)
        return action

    def store_transition(self, obs, action, reward, new_obs, done):
        self.replay_memory.push(obs, action, reward, new_obs, done)

    def update(self):
        if len(self.replay_memory) < self.mini_batch_size:
            return
        obs_batch, action_batch, reward_batch, new_obs_batch, done_batch = \
            self.replay_memory.sample(self.mini_batch_size)

        new_obs_batch = torch.tensor(new_obs_batch, dtype=torch.float).cuda()
        with torch.no_grad():
            # TD target: r + gamma * max_a Q_target(s', a) for non-terminal transitions
            target_batch = torch.tensor(reward_batch, dtype=torch.float).cuda()
            vals_new_obs = torch.max(self.net_target(new_obs_batch), dim=1)[0]
            for i in range(self.mini_batch_size):
                if not done_batch[i]:
                    target_batch[i] += self.gamma * vals_new_obs[i]

        obs_batch = torch.tensor(obs_batch, dtype=torch.float).cuda()
        pred_batch = self.net(obs_batch)
        action_batch = torch.tensor(action_batch, dtype=torch.long).cuda()
        # Q-values of the actions actually taken
        pred_batch_ = pred_batch.gather(1, action_batch.unsqueeze(1)).squeeze(1)

        loss = self.criterion(pred_batch_, target_batch)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Hard-copy the online network into the target network every 20 updates
        self.update_counter += 1
        if self.update_counter % 20 == 0:
            self.update_counter = 0
            for target_param, param in zip(self.net_target.parameters(),
                                           self.net.parameters()):
                target_param.data.copy_(param)
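# The per-sample loop in update() above, which adds gamma * max_a Q_target(s', a)
# only for non-terminal transitions, can also be written without a Python loop.
# A minimal standalone sketch of the same TD-target computation (names and
# shapes here are illustrative assumptions, not part of the class above):
import torch


def td_targets(rewards, next_q_max, dones, gamma=0.99):
    """rewards, next_q_max, dones: 1-D tensors of equal length."""
    # (1 - done) zeroes the bootstrap term for terminal transitions
    return rewards + gamma * next_q_max * (1.0 - dones.float())


if __name__ == "__main__":
    r = torch.tensor([1.0, 0.0, -1.0])
    q_next = torch.tensor([2.0, 3.0, 4.0])
    d = torch.tensor([0.0, 0.0, 1.0])
    print(td_targets(r, q_next, d))  # tensor([ 2.9800,  2.9700, -1.0000])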
class EGreedyAgent(MAEAgent): """Epsilon greedy agent. """ def __init__(self, default_reward, name, color, env, agent_type, features_n, memory_capacity, init_value=0.0, batch_size=64, gamma=0.99, eps_start=0.9, eps_end=0.01, eps_decay=50, need_reload=False, reload_path=None, need_exploit=True): super(EGreedyAgent, self).__init__((0, 0), default_reward=default_reward, color=color, env=env, name=name, default_type=agent_type, default_value=init_value) self.actions_n = env.action_space.n # discounted value self.gamma = gamma self.batch_size = batch_size self.eps_start = eps_start self.eps_end = eps_end self.eps_decay = eps_decay self.features_n = features_n self.memory_capacity = memory_capacity self.memory = ReplayMemory(self.memory_capacity) self.steps_count = 0 self.device = 'cpu' # for evaluate Q_value self.policy_net = DQN(self.features_n, self.actions_n, 50, 50, 50) # evaluate Q_target self.target_net = DQN(self.features_n, self.actions_n, 50, 50, 50) if need_reload: self.restore(reload_path) # let target net has the same params as policy net self.target_net.eval() self.target_net.load_state_dict(self.policy_net.state_dict()) self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0.001) self.save_file_path = './model/' self.need_exploit = need_exploit def act(self, state): """Chose action greedily. """ # Trans list state to tensor, shape is (1, 4) # [[1,2,3,4]] state = torch.FloatTensor([state]) sample = random.random() # chose action randomly at the beginning, then slowly chose max Q_value eps_threhold = self.eps_end + (self.eps_start - self.eps_end) * \ math.exp(-1. * self.steps_count / self.eps_decay) \ if self.need_exploit else 0.01 self.steps_count += 1 if sample > eps_threhold: with torch.no_grad(): return self.policy_net(state).max(1)[1].view(1, 1).item() else: return random.randrange(self.actions_n) def optimize_model(self): """ Train model. """ if len(self.memory) < self.batch_size: return 0.0 transitions = self.memory.sample(self.batch_size) # batch is ([state], [action], [next_state], [reward]) batch = Transition(*zip(*transitions)) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=self.device) non_final_next_states = torch.cat([ torch.tensor([s], dtype=torch.float) for s in batch.next_state if s is not None ]) state_batch = torch.cat( [torch.tensor([s], dtype=torch.float) for s in batch.state]) action_batch = torch.cat( [torch.tensor([[s]], dtype=torch.long) for s in batch.action]) reward_batch = torch.cat( [torch.tensor([[s]], dtype=torch.float) for s in batch.reward]) q_eval = self.policy_net(state_batch).gather(1, action_batch) q_next = torch.zeros(self.batch_size, device=self.device) q_next[non_final_mask] = self.target_net(non_final_next_states).max( 1)[0].detach() q_target = (q_next * self.gamma) + reward_batch.squeeze() loss = F.mse_loss(q_eval, q_target.unsqueeze(1)) self.optimizer.zero_grad() loss.backward() self.optimizer.step() return loss.item() def save(self, name): """ Save trained model to model/`name. """ torch.save(self.target_net.state_dict(), self.save_file_path + name) def restore(self, path): """ Restore model from `path. """ params = torch.load(path) self.target_net.load_state_dict(params) self.policy_net.load_state_dict(params)
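# The exponential epsilon schedule used in act() above, reproduced in isolation
# so the decay behaviour is easy to inspect. The constants are the class
# defaults; the function name is just for this sketch.
import math


def eps_threshold(step, eps_start=0.9, eps_end=0.01, eps_decay=50):
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)


if __name__ == "__main__":
    for step in (0, 50, 150, 500):
        print(step, round(eps_threshold(step), 4))
    # 0 -> 0.9, 50 -> ~0.3374, 150 -> ~0.0543, 500 -> ~0.01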
class Agent(object): """RL agent for the Atari game""" def __init__( self, player_id: int = 1, name: str = "Ugo", batch_size: int = 128, gamma: float = 0.98, memory_size: int = 40000, ) -> None: """Initialization for the DQN agent Args: player_id (int, optional): Side of the board on which to play. Defaults to 1. name (str, optional): Name of the player. Defaults to "Ugo". batch_size (int, optional): Batch size of the update. Defaults to 128. gamma (float, optional): Gamme value for update decay. Defaults to 0.98. memory_size (int, optional): Experience memory capacity. Defaults to 40000. """ # list of parameters of the agent self.player_id = player_id self.name = name self.batch_size = batch_size # size of batch for update self.gamma = gamma # discount factor self.memory_size = memory_size # size of replay memory self.memory = ReplayMemory(self.memory_size, train_buffer_capacity=4, test_buffer_capacity=4) # networks self.policy_net = DQN(action_space_dim=3, hidden_dim=256).to(torch.device(device)) self.target_net = DQN(action_space_dim=3, hidden_dim=256).to(torch.device(device)) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4) def update_policy_net(self) -> None: """Update policy_net via Q-learning approximation""" # check if memory has enough elements to sample if len(self.memory) < self.batch_size: return # get transitions transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) # get elements from batch non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8).to( torch.device(device)) non_final_mask = non_final_mask.type(torch.bool) non_final_next_obs = torch.stack([ ob for nonfinal, ob in zip(non_final_mask, batch.next_ob) if nonfinal ]).to(torch.device(device)) ob_batch = torch.stack(batch.ob).to(torch.device(device)) rew_batch = torch.stack(batch.rew).to(torch.device(device)) action_batch = torch.stack(batch.action).to(torch.device(device)) # estimate Q(st, a) with the policy network state_action_values = (self.policy_net.forward(ob_batch).gather( 1, action_batch).squeeze()) # estimate V(st+1) with target network next_state_values = torch.zeros(self.batch_size).to( torch.device(device)) next_state_values[non_final_mask] = ( self.target_net.forward(non_final_next_obs).max(1)[0].detach()) # expected Q value expected_state_action_values = (rew_batch.squeeze() + self.gamma * next_state_values) # loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) # optimize the network self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-0.1, 0.1) self.optimizer.step() def update_target_net(self) -> None: """Update target net""" self.target_net.load_state_dict(self.policy_net.state_dict()) def get_action(self, ob: np.ndarray, epsilon: float = 0.1, train: bool = False) -> int: """Interface function that returns the action that the agent took based on the observation ob Args: ob (np.ndarray, optional): Current observation from the game. epsilon (float, optional): Epsilon for epsilon greedy. Defaults to 0.1. train (bool, optional): Identifies if the agent is in testing or training phase. Defaults to False. 
Returns: int: the action taken by the agent policy """ # epsilon greedy action selection if train and np.random.rand() < epsilon: action = np.random.randint(0, 3) else: # get stack of obeservations if train: ob_stack = self.get_stack_from_train_buffer(ob) else: ob_stack = self.get_stack_from_test_buffer(ob) ob_stack = ob_stack.unsqueeze(0) # predict best action with torch.no_grad(): action = self.policy_net.forward(ob_stack).argmax().item() if not train: self.push_to_test_buffer(ob) return action def get_name(self) -> str: """Return name of the agent Returns: str: name of the agent """ return self.name def reset(self) -> None: """Clean the buffers of the memory""" self.memory.test_buffer = [] self.memory.train_buffer = [] def load_model( self, path_ai: str = "weights/hibrid_tuned_best.ai", path_optm: str = None, ) -> None: """Load model weights and optimizer from a certain path Args: path_ai (str, optional): Path to model weights. Defaults to "weights/hibrid_tuned_best.ai". path_optm (str, optional): Path to optimizer weights. Defaults to None. """ # load model weights self.policy_net.load_state_dict( torch.load(path_ai, map_location=torch.device(device))) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # load optimizer parameters if path_optm is not None: try: self.optimizer.load_state_dict( torch.load(path_optm, map_location=torch.device(device))) except: print( "WARNING: No optimizer state_dict found! Remember to load the optimizer state_dict when retraining the model!" ) def save_model(self, dir: str, ep: int) -> None: """Save model to file Args: dir (str): Directory to where save the model ep (int): episode number """ torch.save(self.policy_net.state_dict(), dir + f"/DQN_{ep+1}.ai") torch.save(self.optimizer.state_dict(), dir + f"/DQN_{ep+1}.optm") def push_to_train_buffer(self, ob: np.ndarray, action: int, reward: int, next_ob: np.ndarray, done: bool) -> None: """Push a transition to the memory train buffer Args: ob (np.ndarray): Obsertation/state at time t action (int): Action at time t reward (int): Reward for taking action a in state s at time t next_ob (np.ndarray): Observation/state at time t+1 done (bool): Defines if the game is finished or not """ # preprocess observations ob = self.preprocess_ob(ob) next_ob = self.preprocess_ob(next_ob) # save to buffer action = torch.Tensor([action]).long().to(torch.device(device)) reward = torch.tensor([reward], dtype=torch.float32).to(torch.device(device)) self.memory.push_to_train_buffer(ob, action, next_ob, reward, done) # check if I need to push to memory if len(self.memory.train_buffer ) == self.memory.train_buffer_capacity or done: # get the buffer and transition elements to push into memory buffer = self.memory.train_buffer ob_stack = torch.stack((buffer[0].ob, buffer[1].ob, buffer[2].ob, buffer[3].ob)).to(torch.device(device)) next_ob_stack = torch.stack(( buffer[0].next_ob, buffer[1].next_ob, buffer[2].next_ob, buffer[3].next_ob, )).to(torch.device(device)) # push to memory self.memory.push_to_memory( ob_stack, buffer[3].action, next_ob_stack, buffer[3].rew, buffer[3].done, ) # if not done delete the firt row in the buffer if not done: self.memory.train_buffer = self.memory.train_buffer[1:] # if done reset everything if done: self.reset() def push_to_test_buffer(self, ob: np.ndarray) -> None: """Push a transition to the train buffer Args: ob (np.ndarray): Observation to push to the buffer """ # preprocess observation and push to test buffer ob = self.preprocess_ob(ob) 
self.memory.push_to_test_buffer(ob) # check if I have filled it if len(self.memory.test_buffer) == self.memory.test_buffer_capacity: self.memory.test_buffer = self.memory.test_buffer[1:] def get_stack_from_train_buffer(self, ob: np.ndarray) -> Tensor: """Get stack of preprocessed observations/states from train buffer Args: ob (np.ndarray): Current observation/state Returns: Tensor: Stack of preprocessed observations/states """ ob = self.preprocess_ob(ob) # get observations from train buffer obs = ([x.ob for x in self.memory.train_buffer] if len(self.memory.train_buffer) != 0 else [ob]) obs.append(ob) # complete the sequence while len(obs) != self.memory.train_buffer_capacity: obs.append(obs[-1]) # stack observations and return them ob_stack = torch.stack(obs).to(torch.device(device)) return ob_stack def get_stack_from_test_buffer(self, ob: np.ndarray) -> Tensor: """Get stack of preprocessed observations/states from test buffer Args: ob (np.ndarray): Current observation/state Returns: Tensor: Stack of preprocessed observations/states """ ob = self.preprocess_ob(ob) # get observations from test buffer obs = ([x for x in self.memory.test_buffer] if len(self.memory.test_buffer) != 0 else [ob]) obs.append(ob) # complete the sequence while len(obs) != self.memory.test_buffer_capacity: obs.append(obs[-1]) # stack observations and return them ob_stack = torch.stack(obs).to(torch.device(device)) return ob_stack def preprocess_ob(self, ob: np.ndarray) -> Tensor: """Preprocess observation:\n - shrink the image to 100x100\n - transform it to black and white\n - transform it into a Tensor\n Args: ob (np.ndarray): Observation to preprocess Returns: Tensor: Preprocessed observation """ # shrink image ob = Image.fromarray(ob) ob = ob.resize((100, 100)) ob = np.asarray(ob) # grayscale image ob = rgb2grayscale(ob) ob[ob != ob[0][0]] = 1 ob[ob == ob[0][0]] = 0 # Tensor definition ob = torch.from_numpy(ob).float().to(torch.device(device)) return ob
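# A self-contained sketch of the preprocessing idea used in preprocess_ob()
# above: shrink the frame, grayscale it, then binarize against the background
# colour taken from the top-left pixel. rgb2grayscale in the class above lives
# elsewhere in this repo; here a plain luminance mix is used instead, so the
# numbers are an assumption, not the repo's exact transform.
import numpy as np
import torch
from PIL import Image


def preprocess_frame(frame: np.ndarray) -> torch.Tensor:
    img = Image.fromarray(frame).resize((100, 100))
    gray = np.asarray(img).astype(np.float32) @ np.array(
        [0.299, 0.587, 0.114], dtype=np.float32)
    background = gray[0, 0]
    binary = (gray != background).astype(np.float32)  # 1 = object, 0 = background
    return torch.from_numpy(binary)


if __name__ == "__main__":
    dummy = np.zeros((200, 200, 3), dtype=np.uint8)
    dummy[90:110, 90:110] = 255  # a white "ball" on a black background
    out = preprocess_frame(dummy)
    print(out.shape, out.sum())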
class DQNAgent(Agent): def __init__(self, model, env, **kwargs): Agent.__init__(self, **kwargs) self.update_step = 0 self.eps = self.EPS_START self.global_step = 0 self.model = model self.target_model = copy.deepcopy(model) self.in_size = model.in_size self.out_size = model.out_size self.memory = ReplayMemory(self.REPLAY_CAPACITY) self.opt = torch.optim.Adam(self.model.parameters(), lr=self.LR) self.env = env self.container = Container(self.model.SAVE_MODEL_NAME) def select_action(self, state): if self.is_training: self.global_step += 1 self.eps = self.EPS_START - (self.EPS_START - self.EPS_END ) / self.EPS_DECAY * self.global_step if self.eps < self.EPS_END: self.eps = self.EPS_END if self.is_training and np.random.rand() < self.eps: return LongTensor([[np.random.randint(self.out_size)]]) else: var = Variable(state).type(FloatTensor) out = self.model(var) return out.max(1)[1].data.view(1, 1) def _DQ_loss(self, y_pred, reward_batch, non_final_mask, non_final_next_states): q_next = Variable(torch.zeros(self.BATCH_SIZE).type(FloatTensor)) target_q = self.target_model(non_final_next_states) if self.DOUBLE_DQN: max_act = self.model(non_final_next_states).max(1)[1].view(-1, 1) q_next[non_final_mask] = target_q.gather(1, max_act).data.view( target_q.gather(1, max_act).data.shape[0]) else: q_next[non_final_mask] = target_q.max(1)[0].data # next_state_values.volatile = False y = q_next * self.GAMMA + reward_batch loss = nn.functional.mse_loss(y_pred, y) return loss def _calc_loss(self): batch = self.memory.sample(self.BATCH_SIZE) non_final_mask = ByteTensor( tuple([s is not None for s in batch.next_state])) non_final_next_states = Variable( torch.cat([s for s in batch.next_state if s is not None])) state_batch = Variable( torch.cat([s for s in batch.state if s is not None])) action_batch = Variable( torch.cat([s for s in batch.action if s is not None])) reward_batch = Variable( torch.cat([s for s in batch.reward if s is not None])) y_pred = self.model(state_batch).gather(1, action_batch).squeeze() loss = self._DQ_loss(y_pred, reward_batch, non_final_mask, non_final_next_states) self.container.add("y_pred", torch.mean(y_pred.data)) self.container.add("loss", loss.data.item()) return loss def update_policy(self): loss = self._calc_loss() self.opt.zero_grad() loss.backward() if self.GRADIENT_CLIPPING: for param in self.model.parameters(): param.grad.data.clamp_(-self.GRADIENT_CLIPPING, self.GRADIENT_CLIPPING) self.opt.step() def update_target_network(self): if not self.SOFT_UPDATE: self.update_step = (self.update_step + 1) % self.TARGET_UPDATE_FREQ if self.update_step == 0: state_dict = self.model.state_dict() self.target_model.load_state_dict(copy.deepcopy(state_dict)) else: tw = self.target_model.state_dict().values() sw = self.model.state_dict().values() for t, s in zip(tw, sw): t.add_(self.TARGET_UPDATE_FREQ * (s - t)) def _forward(self, obs, is_train, update_memory): if self.state_processor: state = self.state_processor(obs) else: temp = obs[None, :] if len(obs.shape) == 1 else obs[None, None, :] state = torch.from_numpy(temp).type(FloatTensor) if self.GET_DEMO: action = self.rule_processor(obs) else: action = self.select_action(state) act = action.numpy().squeeze() if self.VERBOSE: print("action: {}".format(act)) action_step = self.ACTION_REPEAT reward = 0 done = False while action_step > 0: action_step -= 1 next_obs, r, done, _ = self.env.step(act) # CartPole reward # x, x_dot, theta, theta_dot = next_obs # r1 = (self.env.x_threshold - abs(x)) / self.env.x_threshold - 0.8 # r2 = 
(self.env.theta_threshold_radians - abs(theta)) / self.env.theta_threshold_radians - 0.5 # r = r1 + r2 # MountainCar reward # position, velocity = next_obs # r = abs(position - (-0.5)) reward += r if done: break self.reward_episode += reward if update_memory: reward = FloatTensor([reward]) self.memory.push(state, action, reward) if done: self.memory.push(None, None, None) if len(self.memory) >= self.REPLAY_START and is_train: self.update_policy() self.update_target_network() if self.is_render: self.env.render() return next_obs, done def fit(self, is_train, update_memory=True, num_step=np.inf, num_episode=np.inf, max_episode_length=np.inf, is_render=False): if num_step == np.inf and num_episode == np.inf: raise Exception("") if num_step != np.inf and num_episode != np.inf: raise Exception("") self.is_render = is_render while self.i_episode < num_episode and self.i_step < num_step: self.i_episode += 1 print("------------------------") print("episode: {}, step: {}".format(self.i_episode, self.i_step)) obs = self.env.reset() self.reward_episode = 0 episode_step = 0 while episode_step < max_episode_length: episode_step += 1 self.i_step += 1 obs, done = self._forward(obs, is_train, update_memory) if done: self.reward_step_pairs.push(self.reward_episode, self.i_step) if self.is_test: self.container.add("reward", self.reward_episode, self.record_i_step) self.print(is_train) break def train(self, **kwargs): self.is_training = True if kwargs.pop("clear", True): self.i_episode = 0 self.i_step = 0 self.reward_step_pairs.reset() print("Training starts...") self.fit(True, **kwargs) # self.model.save() self.container.save() def run(self, **kwargs): self.is_training = False if kwargs.pop("clear", True): self.i_episode = 0 self.i_step = 0 self.reward_step_pairs.reset() print("Running starts...") self.fit(False, **kwargs) def _test(self, num_step): self.record_i_episode = self.i_episode self.record_i_step = self.i_step self.is_test = True self.run(num_step=num_step) self.i_episode = self.record_i_episode self.i_step = self.record_i_step self.is_test = False def train_test(self, num_step, test_period=1000, test_step=100): self.i_episode = 0 self.i_step = 0 while self.i_step < num_step: self._test(test_step) self.train(num_step=self.record_i_step + test_period, clear=False) self._test(test_step) def print(self, is_train): print("reward_episode {}".format(self.reward_episode)) print("eps {}".format(self.eps)) if is_train: print("loss_episode {}".format(self.container.get("loss"))) print("y_pred_episode {}".format(self.container.get("y_pred")))
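# The DOUBLE_DQN branch of _DQ_loss() above, in isolation: the online network
# picks the argmax action and the target network evaluates it. A minimal sketch
# with random tensors (shapes and names are assumptions for illustration only):
import torch


def double_dqn_bootstrap(online_q_next, target_q_next):
    """Both inputs: (batch, n_actions) Q-values for the next states."""
    greedy_actions = online_q_next.argmax(dim=1, keepdim=True)   # pick with online net
    return target_q_next.gather(1, greedy_actions).squeeze(1)    # evaluate with target net


if __name__ == "__main__":
    torch.manual_seed(0)
    online, target = torch.rand(4, 3), torch.rand(4, 3)
    print(double_dqn_bootstrap(online, target))  # Double DQN bootstrap values
    print(target.max(dim=1)[0])                  # vanilla DQN bootstrap, for comparison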
class Agent(object): def __init__(self, num_actions, gamma=0.98, memory_size=5000, batch_size=32): self.scaler = None self.featurizer = None self.q_functions = None self.gamma = gamma self.batch_size = batch_size self.num_actions = num_actions self.memory = ReplayMemory(memory_size) self.initialize_model() def initialize_model(self): # Draw some samples from the observation range and initialize the scaler obs_limit = np.array([4.8, 5, 0.5, 5]) samples = np.random.uniform(-obs_limit, obs_limit, (1000, obs_limit.shape[0])) self.scaler = StandardScaler() self.scaler.fit(samples) # Initialize the RBF featurizer self.featurizer = FeatureUnion([ ("rbf1", RBFSampler(gamma=5.0, n_components=100)), ("rbf2", RBFSampler(gamma=2.0, n_components=80)), ("rbf3", RBFSampler(gamma=1.0, n_components=50)), ]) self.featurizer.fit(self.scaler.transform(samples)) # Create a value approximator for each action self.q_functions = [ SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3) for _ in range(self.num_actions) ] # Initialize it to whatever values; implementation detail for q_a in self.q_functions: q_a.partial_fit(self.featurize(samples), np.zeros((samples.shape[0], ))) def featurize(self, state): """ Test two different features for state representations """ if len(state.shape) == 1: state = state.reshape(1, -1) # Task 1a: TODO: Use (s, abs(s)) as features # handcrafted feature vector: s = [1, -2, 3, -4], then (s, abs(s)) = [1, -2, 3, -4, 1, 2, 3, 4] (see slack discussion) #return np.concatenate((state, abs(state)), axis=1) # Task 1b: RBF features # radial basis function representations return self.featurizer.transform(self.scaler.transform(state)) def get_action(self, state, epsilon=0.0): if np.random.random() < epsilon: a = int(np.random.random() * self.num_actions) return a else: featurized = self.featurize(state) qs = [q.predict(featurized)[0] for q in self.q_functions] qs = np.array(qs) a = np.argmax(qs, axis=0) return a def single_update(self, state, action, next_state, reward, done): # Calculate feature representations of the # Task 1: TODO: Set the feature state and feature next state featurized_state = self.featurize(state) featurized_next_state = self.featurize(next_state) # Task 1: TODO Get Q(s', a) for the next state predictions = [] for q_func in self.q_functions: # one function approximator for each of the two actions predictions.append( q_func.predict(featurized_next_state) ) # calculate prediction for every function approximator q_function next_qs = np.max(predictions) # chose highest predicted value # Calculate the updated target Q- values # Task 1: TODO: Calculate target based on rewards and next_qs if done: # terminal state target = [reward + self.gamma * 0] else: # not terminal state target = [reward + self.gamma * next_qs] # Update Q-value estimation self.q_functions[action].partial_fit( featurized_state, target) # partial_fit() for mini-batch learning (see sklearn docs) def update_estimator(self): if len(self.memory) < self.batch_size: # Use the whole memory samples = self.memory.memory else: # Sample some data samples = self.memory.sample( self.batch_size ) # return random sample; length=32 # print("", ) # Task 2: TODO: Reformat data in the minibatch states = np.array( [sample.state for sample in samples] ) # pick all the states from the batch, we have to retrieve the data of the batches action = np.array([ sample.action for sample in samples ]) # return array with 32 elements (number of batch size) next_states = np.array([sample.next_state for sample in samples]) rewards = 
np.array([sample.reward for sample in samples]) dones = np.array([sample.done for sample in samples]) # Task 2: TODO: Calculate Q(s', a) featurized_next_states = self.featurize(next_states) # we need to do the same for next_qs as in single_update but for every sample in the batch next_qs = [] # 32x1 (#samples x #functions) for s in featurized_next_states: arr = np.array([q.predict([s]) for q in self.q_functions]) next_qs.append(np.max(arr)) next_qs = np.array(next_qs) # Calculate the updated target values # Task 2: TODO: Calculate target based on rewards and next_qs targets = rewards + self.gamma * next_qs * (1 - dones) # Calculate featurized states featurized_states = self.featurize(states) # Get new weights for each action separately for a in range(self.num_actions): # Find states where a was taken idx = action == a # If a not present in the batch, skip and move to the next action if np.any(idx): act_states = featurized_states[idx] act_targets = targets[idx] # Perform a single SGD step on the Q-function params self.q_functions[a].partial_fit(act_states, act_targets) def store_transition(self, *args): self.memory.push(*args)
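# The featurization pipeline from initialize_model()/featurize() above
# (StandardScaler -> union of RBFSamplers), shown on its own so the resulting
# feature dimensionality is explicit. The observation limits are the
# CartPole-like bounds assumed in initialize_model().
import numpy as np
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler

obs_limit = np.array([4.8, 5, 0.5, 5])
samples = np.random.uniform(-obs_limit, obs_limit, (1000, 4))

scaler = StandardScaler().fit(samples)
featurizer = FeatureUnion([
    ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
    ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
    ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
]).fit(scaler.transform(samples))

state = np.array([[0.1, 0.0, 0.02, 0.0]])
print(featurizer.transform(scaler.transform(state)).shape)  # (1, 230)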
# ReplayMemory, Transition and GenericNetwork are defined elsewhere in this repo.
import random

import torch as T
import torch.nn.functional as F


class Agent:

    def __init__(self, state_space, n_actions, replay_buffer_size=50000,
                 batch_size=32, hidden_size=64, gamma=0.99):
        self.n_actions = n_actions
        self.state_space_dim = state_space
        self.policy_net = GenericNetwork(state_space, n_actions, hidden_size,
                                         name='dqn_network_')
        self.target_net = GenericNetwork(state_space, n_actions, hidden_size,
                                         name='target_dqn_network_')
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.action = {}
        self.j = 0

    def learn(self):
        """Sample a batch from replay memory and take one DQN optimization step."""
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~T.tensor(batch.done, dtype=T.bool)

        # Resample if every transition in the batch is terminal, to avoid an
        # empty tensor of non-final next states.
        while not non_final_mask.any():
            transitions = self.memory.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = ~T.tensor(batch.done, dtype=T.bool)

        non_final_next_states = T.stack([
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal
        ])
        state_batch = T.stack(batch.state)
        action_batch = T.cat(batch.action)
        reward_batch = T.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        next_state_values = T.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        # Compute MSE loss
        loss = F.mse_loss(state_action_values.squeeze(),
                          expected_state_action_values)

        # Optimize the model
        self.policy_net.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.policy_net.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        """Epsilon-greedy action selection.

        :param state: current observation (numpy array)
        :param epsilon: exploration probability
        :return: action (1-indexed, as expected by the environment)
        """
        sample = random.random()
        if sample > epsilon:
            with T.no_grad():
                state = T.from_numpy(state).float()
                q_values = self.policy_net(state)
                self.action[self.j] = {
                    'list_of_actions': q_values,
                    'max': T.argmax(q_values).item()
                }
                self.j += 1
                return T.argmax(q_values).item() + 1
        else:
            action = random.randrange(self.n_actions)
            return action + 1

    def update_target_network(self):
        """Copy the policy network weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, reward, next_state, done):
        """Store a transition <s, a, r, s', done> for replay purposes."""
        action = T.Tensor([[action]]).long()
        reward = T.tensor([reward], dtype=T.float32)
        next_state = T.from_numpy(next_state).float()
        state = T.from_numpy(state).float()
        self.memory.push(state, action, reward, next_state, done)

    def save_models(self):
        """Save policy and target network checkpoints."""
        self.policy_net.save_checkpoint()
        self.target_net.save_checkpoint()

    def load_models(self):
        """Load the policy network checkpoint."""
        self.policy_net.load_checkpoint()
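# The masked bootstrap used in learn() above, in isolation: next-state values
# are written only at the indices of non-terminal transitions, everything else
# stays zero. Values and shapes are illustrative assumptions.
import torch

done = torch.tensor([False, True, False])
non_final_mask = ~done
next_state_values = torch.zeros(3)
target_q_of_non_final = torch.tensor([1.5, 2.5])  # Q_target(s').max(1)[0] for non-final s'
next_state_values[non_final_mask] = target_q_of_non_final
print(next_state_values)  # tensor([1.5000, 0.0000, 2.5000])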
class DQNagent(object): def __init__(self, filename='dqn0'): self.filename = './trained_agents/' + filename self.policy_net = DQN(self.filename + '.cfg') self.target_net = DQN(self.filename + '.cfg') self.memory = ReplayMemory(16384) self.gamma = 0.999 def select_action(self, state, epsilon): if np.random.rand() < epsilon: idx = LongTensor([[random.randrange(self.policy_net.output_size)]]) else: idx = self.policy_net( Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view( 1, 1) return idx def update(self, batch_size=16): if len(self.memory.memory) < batch_size: batch_size = len(self.memory.memory) transitions = self.memory.sample(batch_size) batch = Transition(*zip(*transitions)) state_batch = Variable(torch.cat(batch.state)) action_batch = Variable(torch.cat(batch.action)) reward_batch = Variable(torch.cat(batch.reward)) non_final_mask = ByteTensor( tuple(map(lambda s: s is not None, batch.next_state))) non_final_next_states = Variable(torch.cat( [s for s in batch.next_state if s is not None]), volatile=True) state_action_values = self.policy_net(state_batch).gather( 1, action_batch) next_state_values = Variable(torch.zeros(batch_size).type(Tensor)) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0] expected_state_action_values = (next_state_values * self.gamma) + reward_batch expected_state_action_values = Variable( expected_state_action_values.data) loss = F.mse_loss(state_action_values, expected_state_action_values) old_params = freeze_as_np_dict(self.policy_net.state_dict()) self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): logging.debug(param.grad.data.sum()) param.grad.data.clamp_(-1., 1.) self.optimizer.step() new_params = freeze_as_np_dict(self.policy_net.state_dict()) check_params_changed(old_params, new_params) return loss.data[0] def train(self, env, n_epochs=30, epsilon_init=1., epsilon_schedule='exp', eps_decay=None, lr=0.001, batch_size=32): if epsilon_schedule == 'linear': eps_range = np.linspace(epsilon_init, 0., n_epochs) elif epsilon_schedule == 'constant': eps_range = [epsilon_init for _ in range(n_epochs)] elif epsilon_schedule == 'exp': if not eps_decay: eps_decay = n_epochs // 4 eps_range = [ epsilon_init * math.exp(-1. 
* i / eps_decay) for i in range(n_epochs) ] history_file = open(self.filename + 'history', mode='a+') self.policy_net = self.policy_net.cuda() self.target_net = self.target_net.cuda() self.target_net.eval() self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr) losses, rewards, change_history = [], [], [] for epoch in range(n_epochs): env.reset() last_screen = get_screen(env) current_screen = get_screen(env) state = current_screen - last_screen done = False epoch_losses = [] epoch_rewards = [] video = [] while not done: if epoch % 10 == 1: video.append(last_screen) action = self.select_action(state, eps_range[epoch]) _, reward, done, _ = env.step(action[0, 0]) last_screen = current_screen current_screen = get_screen(env) reward = Tensor([reward]) if not done: next_state = current_screen - last_screen else: next_state = None self.memory.push(state, action, next_state, reward) state = next_state loss = self.update(batch_size=batch_size) epoch_losses.append(loss) epoch_rewards.append(reward) history_file.write( 'Epoch {}: loss= {}, reward= {}, duration= {}\n'.format( epoch, np.mean(epoch_losses), np.sum(epoch_rewards), len(epoch_rewards))) losses.append(np.mean(epoch_losses)) rewards.append(np.sum(epoch_rewards)) if epoch % 10 == 1: self.target_net.load_state_dict(self.policy_net.state_dict()) self.save(ext=str(epoch)) self.make_video(video, ext='_train_' + str(epoch)) with open(self.filename + '.train_losses', 'a+') as f: for l in losses: f.write(str(l) + '\n') losses = [] with open(self.filename + '.train_rewards', 'a+') as f: for r in rewards: f.write(str(r) + '\n') rewards = [] self.save() def test(self, env, n_epochs=30, verbose=False): rewards = [] self.policy_net = self.policy_net.cuda() self.target_net = self.target_net.cuda() self.target_net.eval() for epoch in range(n_epochs): env.reset() done = False epoch_rewards = [] video = [] last_screen = get_screen(env) current_screen = get_screen(env) state = current_screen - last_screen while not done: if epoch % 5 == 0: video.append(last_screen) action = self.select_action(state, 0.) _, reward, done, _ = env.step(action[0, 0]) last_screen = current_screen current_screen = get_screen(env) if not done: next_state = current_screen - last_screen else: next_state = None epoch_rewards.append(reward) reward = Tensor([reward]) state = next_state logging.debug( 'Test epoch {} : reward= {}, duration= {}'.format( epoch, np.sum(epoch_rewards), len(epoch_rewards))) rewards.append(np.sum(epoch_rewards)) if epoch % 5 == 0: self.make_video(video, ext='_test_' + str(epoch)) logging.info('Performance estimate : {} pm {}'.format( np.mean(rewards), np.std(rewards))) def make_video(self, replay, ext=''): n_frames = len(replay) b_s, n_channels, n_w, n_h = replay[0].shape writer = VideoWriter(self.filename + ext + '.mp4') for i in range(n_frames): writer.writeFrame(replay[i][0][[1, 2, 0]] * 255) writer.close() def save(self, ext=''): torch.save(self.policy_net.state_dict(), self.filename + ext + '.pol.ckpt') torch.save(self.target_net.state_dict(), self.filename + ext + '.tgt.ckpt') def load(self, filename): self.policy_net.load_state_dict( torch.load('./trained_agents/' + filename + '.pol.ckpt')) self.target_net.load_state_dict( torch.load('./trained_agents/' + filename + '.tgt.ckpt'))
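# The three epsilon schedules offered by train() above (linear, constant,
# exponential), sketched standalone so they can be compared or plotted. The
# helper name and defaults are assumptions for this sketch.
import math

import numpy as np


def epsilon_schedule(kind, n_epochs, epsilon_init=1.0, eps_decay=None):
    if kind == 'linear':
        return np.linspace(epsilon_init, 0.0, n_epochs)
    if kind == 'constant':
        return np.full(n_epochs, epsilon_init)
    if kind == 'exp':
        eps_decay = eps_decay or n_epochs // 4
        return np.array([epsilon_init * math.exp(-i / eps_decay)
                         for i in range(n_epochs)])
    raise ValueError(kind)


if __name__ == "__main__":
    print(epsilon_schedule('exp', 8))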
class DDPG_Agent: def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args): self.args = args self.alow = alow self.ahigh = ahigh self.policy = Policy_net(ob_sp, act_sp) self.policy_targ = Policy_net(ob_sp, act_sp) self.qnet = Q_net(ob_sp, act_sp) self.qnet_targ = Q_net(ob_sp, act_sp) self.policy.to(device) self.qnet.to(device) self.policy_targ.to(device) self.qnet_targ.to(device) self.MSE_loss = nn.MSELoss() self.noise = OUNoise(1, 1) hard_update(self.policy_targ, self.policy) hard_update(self.qnet_targ, self.qnet) self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR) self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR) self.memory = ReplayMemory(int(1e6)) self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS, FINAL_STD, INITIAL_STD, warmup_steps=WARMUP_STEPS) self.n_steps = 0 self.n_updates = 0 self.writer = writer def get_action(self, state): if self.args.use_ounoise: noise = self.noise.sample()[0] else: noise = np.random.normal( 0, self.epsilon_scheduler.value(self.n_steps)) st = torch.from_numpy(state).view(1, -1).float() action = self.policy(st) action_with_noise = np.clip(action.item() + noise, self.alow, self.ahigh) if self.args.use_writer: self.writer.add_scalar("action mean", action.item(), self.n_steps) self.writer.add_scalar("action noise", noise, self.n_steps) self.writer.add_scalar("epsilon", self.epsilon_scheduler.value(self.n_steps), self.n_steps) self.writer.add_scalar("action", action_with_noise, self.n_steps) self.n_steps += 1 return action_with_noise def store_transition(self, state, action, reward, next_state, done): self.memory.push(torch.from_numpy(state), torch.tensor(action), torch.tensor(reward), torch.from_numpy(next_state), torch.tensor(done)) def reset(self): self.noise.reset() def train(self): batch = self.memory.sample(min(BATCH_SIZE, len(self.memory))) b_dict = [torch.stack(elem) for elem in Transition(*zip(*batch))] states, actions, rewards, next_states, dones = \ b_dict[0], b_dict[1].view(-1, 1), \ b_dict[2].view(-1, 1).float().to(device), b_dict[3], \ b_dict[4].view(-1, 1).float().to(device) # CRITIC LOSS: Q(s, a) += (r + gamma*Q'(s, π'(s)) - Q(s, a)) # inputs computation inputs_critic = self.qnet(states, actions) # targets with torch.no_grad(): policy_acts = self.policy_targ(next_states) targ_values = self.qnet_targ(next_states, policy_acts) targets_critics = rewards + GAMMA * (1 - dones) * targ_values loss_critic = self.MSE_loss(inputs_critic, targets_critics) self.q_optimizer.zero_grad() loss_critic.backward() # nn.utils.clip_grad_norm_(self.qnet.parameters(), GRAD_CLIP) self.q_optimizer.step() # ACTOR objective: derivative of Q(s, π(s | ø)) with respect to ø actor_loss = -self.qnet(states, self.policy(states)).mean() self.p_optimizer.zero_grad() actor_loss.backward() # nn.utils.clip_grad_norm_(self.policy.parameters(), GRAD_CLIP) self.p_optimizer.step() soft_update(self.policy_targ, self.policy, TAU) soft_update(self.qnet_targ, self.qnet, TAU) if self.args.use_writer: self.writer.add_scalar("critic_loss", loss_critic.item(), self.n_updates) self.writer.add_scalar("actor_loss", actor_loss.item(), self.n_updates) self.n_updates += 1
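# hard_update() and soft_update() used by DDPG_Agent above are imported from
# elsewhere in this repo; a common formulation (assumed here, not copied from
# the repo) is the Polyak average below. The tau value is illustrative.
import torch
import torch.nn as nn


def soft_update(target: nn.Module, source: nn.Module, tau: float) -> None:
    # target <- (1 - tau) * target + tau * source, parameter by parameter
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)


def hard_update(target: nn.Module, source: nn.Module) -> None:
    target.load_state_dict(source.state_dict())


if __name__ == "__main__":
    src, tgt = nn.Linear(2, 2), nn.Linear(2, 2)
    hard_update(tgt, src)
    soft_update(tgt, src, tau=0.005)  # right after a hard update this changes nothing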
class Agent: """DQN Agent class for training a OpenAI-gym environment """ def __init__(self, learning_rate, gamma, state_shape, actions, batch_size, epsilon_initial=0.9, epsilon_decay=1e-3, epsilon_final=0.01, replay_buffer_capacity=1000000, model_name='dqn_model.h5', model_dir='models/dqn_model', ckpt_dir='models/dqn_model/checkpoints', log_dir='logs'): """Initialize DQN agent Args: learning_rate (float): Optimizer learning rate gamma (float): Discount factor in Bellman equation state_shape (np.shape): Shape of state space of the environment actions (int): Number of actions batch_size (int): Size of batch from which agent would learn epsilon_initial (float): Initial value of epsilon epsilon_decay (float): Decay rate of epsilon epsilon_final (float): Final value of epsilon after complete decay replay_buffer_capacity (int): Maximum size of experience replay buffer model_name (str): Name of the model file to save/load model_dir (str): Directory in which model file is stored ckpt_dir (str): Model Checkpoint directory log_dir (str): Directory where tensorflow logs are stored """ self.learning_rate = learning_rate self.gamma = gamma self.actions = actions self.batch_size = batch_size self.epsilon = epsilon_initial self.epsilon_decay = epsilon_decay self.epsilon_final = epsilon_final self.buffer = ReplayMemory(replay_buffer_capacity, state_shape) self.q_network = self._get_model() self.model_file = f'{model_dir}/{model_name}' self.checkpoint_dir = ckpt_dir def select_action(self, state): """Select action according to epsilon greedy policy Args: state (list|np.array): Current state of the environment """ if np.random.random() < self.epsilon: return np.random.choice(range(self.actions)) else: return np.argmax(self.q_network.predict(np.array([state]))) def train(self): """Optimize the model for the current batch""" if self.buffer.current_size >= self.batch_size: states, actions, rewards, next_states, dones = self.buffer.sample( self.batch_size) q_target = np.copy(self.q_network.predict(states)) # Q*(s,a) q_values_next = self.q_network.predict(next_states) # Q*(s',a') batch = np.arange(self.batch_size, dtype=np.int32) # Bellman equation update q_target[batch, actions] = rewards + ( self.gamma * np.max(q_values_next, axis=1) * dones) # Train using fixed q-targets self.q_network.train_on_batch(states, q_target) # Update epsilon if self.epsilon > self.epsilon_final: self.epsilon -= self.epsilon_decay else: self.epsilon = self.epsilon_final def store_experience(self, state, action, reward, next_state, done): """Store tuple <s, a, r, s', done> to the buffer""" self.buffer.store(state, action, reward, next_state, done) def save_model(self): self.q_network.save(self.model_file) def load_model(self): self.q_network = keras.models.load_model(self.model_file) def save_checkpoint(self, id): self.q_network.save(f'{self.checkpoint_dir}/{id}.h5') def load_checkpoint(self, id): self.q_network = keras.models.load_model( f'{self.checkpoint_dir}/{id}.h5') def _get_model(self): # 2 hidden layers, 1 FC layer model = keras.Sequential([ keras.layers.Dense(256, activation='relu'), keras.layers.Dense(256, activation='relu'), keras.layers.Dense(self.actions, activation=None) ]) # Use Adam optimizer optimizer = keras.optimizers.Adam(learning_rate=self.learning_rate) model.compile(optimizer=optimizer, loss='mean_squared_error') return model
# ReplayMemory and Transition are defined elsewhere in this repo.
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class Agent(nn.Module):

    def __init__(self, q_models, target_model, hyperbolic, k, gamma,
                 model_params, replay_buffer_size, batch_size, inp_dim, lr):
        super(Agent, self).__init__()
        if hyperbolic:
            self.q_models = torch.nn.ModuleList(q_models)
            self.target_models = torch.nn.ModuleList(target_model)
        else:
            self.q_models = q_models
            self.target_models = target_model
        self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=1e-5)
        self.hyperbolic = hyperbolic
        self.n_actions = model_params.act_space
        self.k = k
        self.gamma = gamma
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.inp_dim = inp_dim

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    @staticmethod
    def get_hyperbolic_train_coeffs(k, num_models):
        # Mixture weights for approximating hyperbolic discounting with a set
        # of exponentially-discounted Q-heads; normalized to sum to one.
        coeffs = []
        gamma_intervals = np.linspace(0, 1, num_models + 2)
        for i in range(1, num_models + 1):
            coeffs.append((gamma_intervals[i + 1] - gamma_intervals[i]) *
                          (1 / k) * gamma_intervals[i]**((1 / k) - 1))
        return torch.tensor(coeffs) / sum(coeffs)

    def get_action(self, state_batch, epsilon=0.05):
        # Epsilon-greedy: explore with probability epsilon, otherwise act
        # greedily with respect to the hyperbolic mixture of the Q-heads.
        if random.random() < epsilon:
            return random.randrange(self.n_actions)
        if self.hyperbolic:
            with torch.no_grad():
                state_batch = torch.tensor(state_batch,
                                           dtype=torch.float32).view(
                                               -1, self.inp_dim)
                model_outputs = []
                for mdl in self.q_models:
                    model_outputs.append(mdl(state_batch))
                coeff = self.get_hyperbolic_train_coeffs(
                    self.k, len(self.q_models))
                model_outputs = torch.cat(model_outputs, 1).reshape(
                    -1, len(self.q_models))
                model_outputs = (model_outputs * coeff).sum(dim=1)
                return torch.argmax(model_outputs).item()

    def get_state_act_vals(self, state_batch, action_batch=None):
        if self.hyperbolic:
            model_outputs = []
            for mdl in self.q_models:
                model_outputs.append(mdl(state_batch).gather(1, action_batch))
            model_outputs = torch.cat(model_outputs,
                                      1).reshape(-1, len(self.q_models))
            coeffs = self.get_hyperbolic_train_coeffs(self.k,
                                                      len(self.q_models))
            model_outputs = model_outputs * coeffs
            return model_outputs.sum(dim=1).reshape(-1, 1)
        else:
            model_output = self.q_models(state_batch).gather(1, action_batch)
            return model_output

    def get_max_next_state_vals(self, non_final_mask, non_final_next_states):
        if self.hyperbolic:
            target_outputs = []
            gammas = torch.tensor(np.linspace(0, 1, len(self.q_models) + 1),
                                  dtype=torch.float)[1:]
            for mdl in self.target_models:
                next_state_values = torch.zeros(self.batch_size)
                next_state_values[non_final_mask] = mdl(
                    non_final_next_states).max(1)[0].detach()
                target_outputs.append(next_state_values)
            target_outputs = torch.cat(target_outputs,
                                       0).reshape(-1, len(self.target_models))
            target_outputs = target_outputs * gammas
            return target_outputs

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a): the model computes Q(s_t), then we select the
        # columns of the actions taken. These are the actions which would've
        # been taken for each batch state according to the policy heads.
        state_action_values = self.get_state_act_vals(state_batch,
                                                      action_batch)

        # Compute V(s_{t+1}) for all next states. Expected values of actions
        # for non_final_next_states are computed based on the "older" target
        # heads, selecting their best reward with max(1)[0]. This is merged
        # based on the mask, so we have either the expected state value or 0
        # in case the state was final.
        state_action_values = state_action_values.view(-1, 1).repeat(
            1, len(self.q_models))
        next_state_values = self.get_max_next_state_vals(
            non_final_mask, non_final_next_states)
        expected_state_action_values = next_state_values + reward_batch.view(
            -1, 1).repeat(1, len(self.q_models))

        loss = (state_action_values - expected_state_action_values)**2
        coefs = self.get_hyperbolic_train_coeffs(self.k, len(self.q_models))
        loss = torch.sum(loss * coefs)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        # Sync every target Q-head with its corresponding online Q-head.
        if self.hyperbolic:
            for mdl, target_mdl in zip(self.q_models, self.target_models):
                target_mdl.load_state_dict(mdl.state_dict())
        else:
            self.target_models.load_state_dict(self.q_models.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
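# The hyperbolic-discounting mixture weights from get_hyperbolic_train_coeffs()
# above, reproduced standalone so they can be inspected: each head i gets a
# weight proportional to (gamma_{i+1} - gamma_i) * (1/k) * gamma_i**(1/k - 1),
# and the weights are then normalized to sum to one. k and the head count
# below are illustrative values.
import numpy as np
import torch


def hyperbolic_coeffs(k: float, num_models: int) -> torch.Tensor:
    gammas = np.linspace(0, 1, num_models + 2)
    raw = [(gammas[i + 1] - gammas[i]) * (1.0 / k) * gammas[i]**(1.0 / k - 1.0)
           for i in range(1, num_models + 1)]
    coeffs = torch.tensor(raw)
    return coeffs / coeffs.sum()


if __name__ == "__main__":
    w = hyperbolic_coeffs(k=0.1, num_models=5)
    print(w, w.sum())  # weights over the 5 Q-heads, summing to 1.0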
class ActorCritic: def __init__(self, sess, training_steps=5000000, learning_rate=0.0001, momentum=0.95, memory_size=100000, discount_rate=0.95, eps_min=0.05): self.activation = tf.nn.relu self.optimizer = tf.train.MomentumOptimizer self.learning_rate = learning_rate self.momentum = momentum self._build_graph() self.memory_size = memory_size self.memory = ReplayMemory(self.memory_size) ''' The discount rate is the parameter that indicates how many actions will be considered in the future to evaluate the reward of a given action. A value of 0 means the agent only considers the present action, and a value close to 1 means the agent considers actions very far in the future. ''' self.discount_rate = discount_rate self.eps_min = eps_min self.eps_decay_steps = int(training_steps / 2) self.sess = sess self.init = tf.global_variables_initializer() def cnn_model(self, X_state, name): """ Creates a CNN network with two convolutional layers followed by two fully connected layers. :param X_state: Placeholder for the state of the game :param name: Name of the network (actor or critic) :return : The output (logits) layer and the trainable variables """ initializer = tf.contrib.layers.variance_scaling_initializer() conv1_fmaps = 32 conv1_ksize = 8 conv1_stride = 2 conv1_pad = 'SAME' conv2_fmaps = 64 conv2_ksize = 4 conv2_stride = 2 conv2_pad = 'SAME' n_fc1 = 256 with tf.variable_scope(name) as scope: conv1 = tf.layers.conv2d(X_state, filters=conv1_fmaps, kernel_size=conv1_ksize, activation=self.activation, strides=conv1_stride, padding=conv1_pad, name='conv1') conv2 = tf.layers.conv2d(conv1, filters=conv2_fmaps, kernel_size=conv2_ksize, activation=self.activation, strides=conv2_stride, padding=conv2_pad, name='conv2') conv2_flat = tf.reshape(conv2, shape=[-1, conv2_fmaps * 5 * 5]) fc1 = tf.layers.dense(conv2_flat, n_fc1, activation=self.activation, name='fc1', kernel_initializer=initializer) logits = tf.layers.dense(fc1, N_OUTPUTS, kernel_initializer=initializer) trainable_vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name) trainable_vars_by_name = { var.name[len(scope.name):]: var for var in trainable_vars } return logits, trainable_vars_by_name def _build_graph(self): """ Creates the Tensorflow graph of the CNN network. Two networks will be used, one for the actor, and one for the critic. """ X_state = tf.placeholder(tf.float32, shape=[None, 20, 20, CHANNELS]) actor_q_values, actor_vars = self.cnn_model(X_state, name="actor") critic_q_values, critic_vars = self.cnn_model(X_state, name="critic") with tf.variable_scope("train"): X_action = tf.placeholder(tf.int32, shape=[None]) y = tf.placeholder(tf.float32, shape=[None, 1]) '''A one hot vector (tf.one_hot) is used to only keep the Q-value corresponding to chosen action in the memory. By multiplying the one-hot vector with the actor_q_values, this will zero out all of the Q-values except for the one corresponding to the memorized action. Then, by making sum along the first axis (axis=1), we obtain the desired Q-value prediction for each memory. 
''' q_value = tf.reduce_sum(actor_q_values * tf.one_hot(X_action, N_OUTPUTS), axis=1, keep_dims=True) error = tf.abs(y - q_value) loss = tf.reduce_mean(clipped_error(error)) global_step = tf.Variable(0, trainable=False, name='global_step') # iteration step optimizer = self.optimizer(self.learning_rate, self.momentum, use_nesterov=True) training_op = optimizer.minimize(loss, global_step=global_step) self.saver = tf.train.Saver() self.X_state = X_state self.X_action = X_action self.y = y self.training_op = training_op self.loss = loss self.actor_q_values, self.actor_vars = actor_q_values, actor_vars self.critic_q_values, self.critic_vars = critic_q_values, critic_vars self.global_step = global_step with tf.variable_scope('summary'): self.loss_summary = tf.summary.scalar('loss', loss) self.mean_score = tf.placeholder(tf.float32, None) self.score_summary = tf.summary.scalar('mean score', self.mean_score) self.summary_merged = tf.summary.merge( [self.loss_summary, self.score_summary]) def start(self, checkpoint_path): """ Intialize the model or restore the model if it already exists. :return: Iteration that we want the model to start training """ if os.path.isfile(checkpoint_path + '.index'): self.saver.restore(self.sess, checkpoint_path) training_start = 1 print('Restoring model...') else: # Make the model warm up before training training_start = 10000 self.init.run() self.make_copy().run() print('New model...') return training_start return training_start def train(self, checkpoint_path, file_writer, mean_score): """ Trains the agent and writes regularly a training summary. :param checkpoint_path: The path where the model will be saved :param file_writer: The file where the training summary will be written for Tensorboard visualization :param mean_score: The mean game score """ copy_steps = 5000 save_steps = 2000 summary_steps = 500 cur_states, actions, rewards, next_states, dones = self.sample_memories( ) next_q_values = self.critic_q_values.eval( feed_dict={self.X_state: next_states}) max_next_q_values = np.max(next_q_values, axis=1, keepdims=True) y_vals = rewards + (1 - dones) * self.discount_rate * max_next_q_values _, loss_val = self.sess.run([self.training_op, self.loss], feed_dict={ self.X_state: cur_states, self.X_action: actions, self.y: y_vals }) step = self.global_step.eval() # Regularly copy the online DQN to the target DQN if step % copy_steps == 0: self.make_copy().run() # Save the model regularly if step % save_steps == 0: self.saver.save(self.sess, checkpoint_path) # Write the training summary regularly if step % summary_steps == 0: summary = self.sess.run(self.summary_merged, feed_dict={ self.X_state: cur_states, self.X_action: actions, self.y: y_vals, self.mean_score: mean_score }) file_writer.add_summary(summary, step) def predict(self, cur_state): """ Makes the actor predict q-values based on the current state of the game. 
:param cur_state: Current state of the game :return The Q-values predicted by the actor """ q_values = self.actor_q_values.eval( feed_dict={self.X_state: [cur_state]}) return q_values def remember(self, cur_state, action, reward, new_state, done): self.memory.append([cur_state, action, reward, new_state, done]) def act(self, cur_state, step): """ :param cur_state: Current state of the game :param step: Training step :return: Action selected by the agent """ eps_max = 1.0 epsilon = max( self.eps_min, eps_max - (eps_max - self.eps_min) * 2 * step / self.eps_decay_steps) if np.random.rand() < epsilon: return np.random.randint(N_OUTPUTS), epsilon # Random action else: q_values = self.predict(cur_state) return np.argmax(q_values), epsilon # Optimal action def make_copy(self): """ Makes regular copies of the training varibales from the critic to the actor. Credits goes to https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb. :return: A copy of the training variables """ copy_ops = [ target_var.assign(self.actor_vars[var_name]) for var_name, target_var in self.critic_vars.items() ] copy_online_to_target = tf.group(*copy_ops) return copy_online_to_target def sample_memories(self, batch_size=32): """ Extracts memories from the agent's memory. Credits goes to https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb. :param batch_size: Size of the batch that we extract form the memory :return: State, action, reward, next_state, and done values as np.arrays """ cols = [[], [], [], [], []] # state, action, reward, next_state, done for memory in self.memory.sample(batch_size): for col, value in zip(cols, memory): col.append(value) cols = [np.array(col) for col in cols] return cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape( -1, 1)
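# The one-hot masking trick described in the _build_graph() comment above
# (keep only the Q-value of the memorized action), shown with plain numpy
# instead of the TF1 graph ops. Values are toy numbers for illustration.
import numpy as np

q_values = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
actions = np.array([2, 0])

one_hot = np.eye(q_values.shape[1])[actions]                    # same role as tf.one_hot
q_of_taken_action = np.sum(q_values * one_hot, axis=1, keepdims=True)
print(q_of_taken_action)                                        # [[3.] [4.]]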
class Model: def __init__(self, device, state_size, action_size, folder, config): self.folder = folder self.config = config self.device = device self.memory = ReplayMemory(self.config["MEMORY_CAPACITY"]) self.state_size = state_size self.action_size = action_size self.critic = Critic(self.state_size, self.action_size, self.device, self.config) self.actor = Actor(self.state_size, self.action_size, self.device, self.config) def select_action(self, state): action = self.actor.select_action(state) return action def optimize(self): if len(self.memory) < self.config["BATCH_SIZE"]: return None, None transitions = self.memory.sample(self.config["BATCH_SIZE"]) batch = list(zip(*transitions)) # Divide memory into different tensors states = torch.FloatTensor(batch[0]).to(self.device) actions = torch.FloatTensor(batch[1]).to(self.device) rewards = torch.FloatTensor(batch[2]).unsqueeze(1).to(self.device) next_states = torch.FloatTensor(batch[3]).to(self.device) done = torch.FloatTensor(batch[4]).unsqueeze(1).to(self.device) # Compute Q(s,a) using critic network current_Q = self.critic(states, actions) # Compute deterministic next state action using actor target network next_actions = self.actor.target(next_states) # Compute next state values at t+1 using target critic network target_Q = self.critic.target(next_states, next_actions).detach() # Compute expected state action values y[i]= r[i] + Q'(s[i+1], a[i+1]) target_Q = rewards + done * self.config["GAMMA"] * target_Q # Critic loss by mean squared error loss_critic = F.mse_loss(current_Q, target_Q) # Optimize the critic network self.critic.update(loss_critic) # Optimize actor loss_actor = -self.critic(states, self.actor(states)).mean() self.actor.update(loss_actor) # Soft parameter update update_targets(self.critic.target_nn, self.critic.nn, self.config["TAU"]) update_targets(self.actor.target_nn, self.actor.nn, self.config["TAU"]) return loss_actor.item(), loss_critic.item() def evaluate(self, environement, n_ep=10): rewards = [] try: for i in range(n_ep): print('Episode number', i + 1, 'out of', n_ep, 'keep waiting...') state = environement.reset() reward = 0 done = False steps = 0 while not done and steps < self.config["MAX_STEPS"]: action = self.select_action(state) state, r, done = environement.step(action) reward += r steps += 1 rewards.append(reward) print('Episode reward:', reward) except KeyboardInterrupt: pass if rewards: score = sum(rewards) / len(rewards) else: score = 0 return score def save(self): self.actor.save(self.folder) self.critic.save(self.folder) def load(self): try: self.actor.load(self.folder) self.critic.load(self.folder) except FileNotFoundError: raise Exception("No model has been saved !") from None
class Agent: """Definition of the Agent that will interact with the environment. Attributes: REPLAY_MEM_SIZE (:obj:`int`): max capacity of Replay Memory BATCH_SIZE (:obj:`int`): Batch size. Default is 40 as specified in the paper. GAMMA (:obj:`float`): The discount, should be a constant between 0 and 1 that ensures the sum converges. It also controls the importance of future expected reward. EPS_START(:obj:`float`): initial value for epsilon of the e-greedy action selection EPS_END(:obj:`float`): final value for epsilon of the e-greedy action selection LEARNING_RATE(:obj:`float`): learning rate of the optimizer (Adam) INPUT_DIM (:obj:`int`): input dimentionality withut considering batch size. HIDDEN_DIM (:obj:`int`): hidden layer dimentionality (for Linear models only) ACTION_NUMBER (:obj:`int`): dimentionality of output layer of the Q network TARGET_UPDATE (:obj:`int`): period of Q target network updates MODEL (:obj:`string`): type of the model. DOUBLE (:obj:`bool`): Type of Q function computation. """ def __init__(self, REPLAY_MEM_SIZE=10000, BATCH_SIZE=40, GAMMA=0.98, EPS_START=1, EPS_END=0.12, EPS_STEPS=300, LEARNING_RATE=0.001, INPUT_DIM=24, HIDDEN_DIM=120, ACTION_NUMBER=3, TARGET_UPDATE=10, MODEL='ddqn', DOUBLE=True): self.REPLAY_MEM_SIZE = REPLAY_MEM_SIZE self.BATCH_SIZE = BATCH_SIZE self.GAMMA = GAMMA self.EPS_START = EPS_START self.EPS_END = EPS_END self.EPS_STEPS = EPS_STEPS self.LEARNING_RATE = LEARNING_RATE self.INPUT_DIM = INPUT_DIM self.HIDDEN_DIM = HIDDEN_DIM self.ACTION_NUMBER = ACTION_NUMBER self.TARGET_UPDATE = TARGET_UPDATE self.MODEL = MODEL # deep q network (dqn) or Dueling deep q network (ddqn) self.DOUBLE = DOUBLE # to understand if use or do not use a 'Double' model (regularization) self.TRAINING = True # to do not pick random actions during testing self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") print("Agent is using device:\t" + str(self.device)) '''elif self.MODEL == 'lin_ddqn': self.policy_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) self.target_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) elif self.MODEL == 'lin_dqn': self.policy_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) self.target_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) ''' if self.MODEL == 'ddqn': self.policy_net = ConvDuelingDQN( self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) self.target_net = ConvDuelingDQN( self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) elif self.MODEL == 'dqn': self.policy_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) self.target_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.LEARNING_RATE) self.memory = ReplayMemory(self.REPLAY_MEM_SIZE) self.steps_done = 0 self.training_cumulative_reward = [] def select_action(self, state): """ the epsilon-greedy action selection""" state = state.unsqueeze(0).unsqueeze(1) sample = random.random() if self.TRAINING: if self.steps_done > self.EPS_STEPS: eps_threshold = self.EPS_END else: eps_threshold = self.EPS_START else: eps_threshold = self.EPS_END self.steps_done += 1 # [Exploitation] pick the best action according to current Q approx. 
if sample > eps_threshold: with torch.no_grad(): # Return the number of the action with highest non normalized probability # TODO: decide if diverge from paper and normalize probabilities with # softmax or at least compare the architectures return torch.tensor([self.policy_net(state).argmax()], device=self.device, dtype=torch.long) # [Exploration] pick a random action from the action space else: return torch.tensor([random.randrange(self.ACTION_NUMBER)], device=self.device, dtype=torch.long) def optimize_model(self): if len(self.memory) < self.BATCH_SIZE: # it will return without doing nothing if we have not enough data to sample return transitions = self.memory.sample(self.BATCH_SIZE) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. # Transition is the named tuple defined above. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) # # non_final_mask is a column vector telling wich state of the sampled is final # non_final_next_states contains all the non-final states sampled non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=self.device, dtype=torch.bool) nfns = [s for s in batch.next_state if s is not None] non_final_next_states = torch.cat(nfns).view(len(nfns), -1) non_final_next_states = non_final_next_states.unsqueeze(1) state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1) state_batch = state_batch.unsqueeze(1) action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1) reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch).gather( 1, action_batch) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. # detach removes the tensor from the graph -> no gradient computation is # required next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() next_state_values = next_state_values.view(self.BATCH_SIZE, -1) # Compute the expected Q values expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch # print("expected_state_action_values.shape:\t%s"%str(expected_state_action_values.shape)) # Compute MSE loss loss = F.mse_loss(state_action_values, expected_state_action_values ) # expected_state_action_values.unsqueeze(1) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() def optimize_double_dqn_model(self): if len(self.memory) < self.BATCH_SIZE: # it will return without doing nothing if we have not enough data to sample return transitions = self.memory.sample(self.BATCH_SIZE) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. 
# Transition is the named tuple defined above. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) # # non_final_mask is a column vector telling wich state of the sampled is final # non_final_next_states contains all the non-final states sampled non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=self.device, dtype=torch.bool) nfns = [s for s in batch.next_state if s is not None] non_final_next_states = torch.cat(nfns).view(len(nfns), -1) non_final_next_states = non_final_next_states.unsqueeze(1) state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1) state_batch = state_batch.unsqueeze(1) action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1) reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1) # print("state_batch shape: %s\nstate_batch[0]:%s\nactionbatch shape: %s\nreward_batch shape: %s"%(str(state_batch.view(40,-1).shape),str(state_batch.view(40,-1)[0]),str(action_batch.shape),str(reward_batch.shape))) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch).gather( 1, action_batch) # ---------- D-DQN Extra Line--------------- _, next_state_action = self.policy_net(state_batch).max(1, keepdim=True) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the actions given by policynet. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. # detach removes the tensor from the graph -> no gradient computation is # required next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device).view( self.BATCH_SIZE, -1) out = self.target_net(non_final_next_states) next_state_values[non_final_mask] = out.gather( 1, next_state_action[non_final_mask]) # next_state_values = next_state_values.view(self.BATCH_SIZE, -1) # Compute the expected Q values expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch # Compute MSE loss loss = F.mse_loss(state_action_values, expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() def train(self, env, path, num_episodes=40): self.TRAINING = True cumulative_reward = [0 for t in range(num_episodes)] print("Training:") for i_episode in tqdm(range(num_episodes)): # Initialize the environment and state env.reset( ) # reset the env st it is set at the beginning of the time serie self.steps_done = 0 state = env.get_state() for t in range(len(env.data)): # while not env.done # Select and perform an action action = self.select_action(state) reward, done, _ = env.step(action) cumulative_reward[i_episode] += reward.item() # Observe new state: it will be None if env.done = True. It is the next # state since env.step() has been called two rows above. 
next_state = env.get_state() # Store the transition in memory self.memory.push(state, action, next_state, reward) # Move to the next state state = next_state # Perform one step of the optimization (on the policy network): note that # it will return without doing nothing if we have not enough data to sample if self.DOUBLE: self.optimize_double_dqn_model() else: self.optimize_model() if done: break # Update the target network, copying all weights and biases of policy_net if i_episode % self.TARGET_UPDATE == 0: self.target_net.load_state_dict(self.policy_net.state_dict()) # save the model if self.DOUBLE: model_name = env.reward_f + '_reward_double_' + self.MODEL + '_model' count = 0 while os.path.exists(path + model_name): # avoid overrinding models count += 1 model_name = model_name + "_" + str(count) else: model_name = env.reward_f + '_reward_' + self.MODEL + '_model' count = 0 while os.path.exists(path + model_name): # avoid overrinding models count += 1 model_name = model_name + "_" + str(count) torch.save(self.policy_net.state_dict(), path + model_name) return cumulative_reward def test(self, env_test, model_name=None, path=None): self.TRAINING = False cumulative_reward = [0 for t in range(len(env_test.data))] reward_list = [0 for t in range(len(env_test.data))] if model_name is None: pass elif path is not None: if re.match(".*_dqn_.*", model_name): self.policy_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) if str(self.device) == "cuda": self.policy_net.load_state_dict( torch.load(path + model_name)) else: self.policy_net.load_state_dict( torch.load(path + model_name, map_location=torch.device('cpu'))) elif re.match(".*_ddqn_.*", model_name): self.policy_net = ConvDuelingDQN( self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) if str(self.device) == "cuda": self.policy_net.load_state_dict( torch.load(path + model_name)) else: self.policy_net.load_state_dict( torch.load(path + model_name, map_location=torch.device('cpu'))) else: raise RuntimeError( "Please Provide a valid model name or valid path.") else: raise RuntimeError( 'Path can not be None if model Name is not None.') env_test.reset( ) # reset the env st it is set at the beginning of the time serie state = env_test.get_state() for t in tqdm(range(len(env_test.data))): # while not env.done # Select and perform an action action = self.select_action(state) reward, done, _ = env_test.step(action) cumulative_reward[t] += reward.item( ) + cumulative_reward[t - 1 if t - 1 > 0 else 0] reward_list[t] = reward # Observe new state: it will be None if env.done = True. It is the next # state since env.step() has been called two rows above. next_state = env_test.get_state() # Move to the next state state = next_state if done: break return cumulative_reward, reward_list
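
# --- Hedged sketch (illustrative, not the Agent's own code): the standard
# Double-DQN target in isolation. The online network selects the greedy action
# on the *next* state and the target network evaluates it. Names (policy_net,
# target_net) mirror the class above; rewards and dones are assumed to be
# 1-D tensors of shape (B,).
import torch

def double_dqn_target(policy_net, target_net, next_states, rewards, dones, gamma):
    with torch.no_grad():
        next_actions = policy_net(next_states).argmax(dim=1, keepdim=True)   # select
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)  # evaluate
        return rewards + gamma * next_q * (1.0 - dones.float())
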
def train(self, config: TrainConfig): # experience replay memory replay_mem = ReplayMemory(config.memmory_capacity) # reward history reward = 0 reward_history = [] reward_avg = [] # learning rate related alpha = config.lrn_rate eps = config.epsilon eps_delta = (config.epsilon - config.epsilon_final) / config.warmup_episodes step = 0 for epi in range(config.total_episodes): obs = self.env.reset() done = False traj = [] reward = 0 while not done: # random choose action with epsilon-greedy action = self.act(obs, eps) obs_next, r, done, info = self.env.step(action) reward += r step += 1 # record trajectories traj.append( Transition(obs.flatten(), action, r, obs_next.flatten(), done)) obs = obs_next if replay_mem.size < self.batch_size: continue # update q networks with mini-batch replay samples batch_data = replay_mem.sample(self.batch_size) feed_dict = { self.learning_rate: alpha, self.states: batch_data['s'], self.actions: batch_data['a'], self.rewards: batch_data['r'], self.next_states: batch_data['s_next'], self.dones: batch_data['done'], self.epi_reward: reward_history[-1] } _, q, q_target, loss, summary = self.session.run([ self.optimizer, self.Q, self.Q_target, self.loss, self.merged_summary ], feed_dict) # update target q networks hardly if step % config.target_update_every_steps == 0: self._update_target_q_net() self.writer.add_summary(summary) replay_mem.add(traj) # one episode done reward_history.append(reward) reward_avg.append(np.mean(reward_history[-10:])) # update training param alpha *= config.lrn_rate_decay if eps > config.epsilon_final: eps -= eps_delta # report progress # if reward_history and config.log_every_episodes and epi % config.log_every_episodes == 0 : print( "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lrn_rate:{:.4f}, eps:{:.4f}" .format(epi, step, np.max(reward_history), np.mean(reward_history[-10:]), reward_history[-5:], alpha, eps)) self.save_checkpoint(step=step) print( "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format( len(reward_history), np.max(reward_history), np.mean(reward_history))) return {'rwd': reward_history, 'rwd_avg': reward_avg}
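
# --- Hedged sketch (illustrative): the linear epsilon annealing implied by
# eps_delta above, where epsilon drops by a fixed amount per episode until it
# reaches epsilon_final after warmup_episodes episodes.
def linear_epsilon(episode, eps_start, eps_final, warmup_episodes):
    delta = (eps_start - eps_final) / warmup_episodes
    return max(eps_final, eps_start - delta * episode)

# Example: linear_epsilon(50, 1.0, 0.05, 100) -> 0.525
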
class Agent(object): def __init__(self, env_name, state_space, n_actions, replay_buffer_size=500000, batch_size=32, hidden_size=64, gamma=0.99): self.env_name = env_name device = 'cuda' if torch.cuda.is_available() else 'cpu' self.train_device = device self.n_actions = n_actions self.state_space_dim = state_space if "CartPole" in self.env_name: self.policy_net = CartpoleDQN(state_space, n_actions, 4) self.target_net = CartpoleDQN(state_space, n_actions, 4) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4) elif "WimblepongVisualSimpleAI-v0" in self.env_name: self.policy_net = Policy(state_space, n_actions, 4) self.target_net = Policy(state_space, n_actions, 4) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.optimizer = optim.Adam(self.policy_net.parameters(), lr=5e-4) else: raise ValueError( "Wrong environment. An agent has not been specified for %s" % env_name) self.memory = ReplayMemory(replay_buffer_size) self.batch_size = batch_size self.gamma = gamma def update_network(self, updates=1): for _ in range(updates): self._do_network_update() def _do_network_update(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8).to( self.train_device) non_final_mask = non_final_mask.type(torch.bool) non_final_next_states = [ s for nonfinal, s in zip(non_final_mask, batch.next_state) if nonfinal > 0 ] non_final_next_states = torch.stack(non_final_next_states).to( self.train_device) state_batch = torch.stack(batch.state).to(self.train_device) action_batch = torch.cat(batch.action).to(self.train_device) reward_batch = torch.cat(batch.reward).to(self.train_device) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch).gather( 1, action_batch).to(self.train_device) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. 
next_state_values = torch.zeros(self.batch_size) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() # Task 4: TODO: Compute the expected Q values expected_state_action_values = reward_batch + self.gamma * next_state_values # Compute Huber loss loss = F.smooth_l1_loss(state_action_values.squeeze(), expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1e-1, 1e-1) self.optimizer.step() def get_action(self, state, epsilon=0.05): #print('initial get action',state.shape) #print('final get action',state.shape) sample = random.random() if sample > epsilon: with torch.no_grad(): #print('a',state) state = torch.from_numpy(state) #print('b',state) state = state.unsqueeze(0) q_values = self.policy_net(state) return torch.argmax(q_values).item() else: return random.randrange(3) def preprocessing(self, observation): """ Preprocess the received information: 1) Grayscaling 2) Reducing quality (resizing) Params: observation: image of pong """ # Grayscaling #img_gray = rgb2gray(observation) img_gray = np.dot(observation, [0.2989, 0.5870, 0.1140]).astype(np.uint8) # Normalize pixel values img_norm = img_gray / 255.0 # Downsampling: we receive squared image (e.g. 200x200) and downsample by x2.5 to (80x80) img_resized = cv2.resize(img_norm, dsize=(80, 80)) #img_resized = img_norm[::2.5,::2.5] return img_resized def stack_images(self, observation, img_collection, timestep): """ Stack up to four frames together """ # image preprocessing img_preprocessed = self.preprocessing(observation) if (timestep == 0): # start of new episode # img_collection get filled with zeros again img_collection = deque( [np.zeros((80, 80), dtype=np.int) for i in range(4)], maxlen=4) # fill img_collection 4x with the first frame img_collection.append(img_preprocessed) img_collection.append(img_preprocessed) img_collection.append(img_preprocessed) img_collection.append(img_preprocessed) # Stack the images in img_collection img_stacked = np.stack(img_collection, axis=2) else: # Delete first/oldest entry and append new image #img_collection.pop(0) img_collection.append(img_preprocessed) # Stack the images in img_collection img_stacked = np.stack(img_collection, axis=2) # TODO: right axis?? return img_stacked, img_collection def update_target_network(self): self.target_net.load_state_dict(self.policy_net.state_dict()) def store_transition(self, state, action, next_state, reward, done): action = torch.Tensor([[action]]).long().to(self.train_device) reward = torch.tensor([reward], dtype=torch.float32).to(self.train_device) next_state = torch.from_numpy(next_state).float().to(self.train_device) state = torch.from_numpy(state).float().to(self.train_device) self.memory.push(state, action, next_state, reward, done) def load_model(self): #load_path = '/home/isaac/codes/autonomous_driving/highway-env/data/2020_09_03/Intersection_egoattention_dqn_ego_attention_1_22:00:25/models' #policy.load_state_dict(torch.load("./model50000ep_WimblepongVisualSimpleAI-v0_0.mdl")) """ Load already created model return: none """ weights = torch.load("FROM2100v2WimblepongVisualSimpleAI-v0_1900.mdl", map_location=self.train_device) self.policy_net.load_state_dict(weights, strict=False) def get_name(self): """ Interface function to retrieve the agents name """ return self.name def reset(self): """ Resets the agent’s state after an episode is finished
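
# --- Hedged sketch (illustrative): grayscale + resize preprocessing and 4-frame
# stacking with a deque, mirroring preprocessing()/stack_images() above. The
# 80x80 target size and history length of 4 follow the code above.
from collections import deque
import cv2
import numpy as np

def preprocess_frame(frame_rgb):
    gray = np.dot(frame_rgb[..., :3], [0.2989, 0.5870, 0.1140]) / 255.0
    return cv2.resize(gray, dsize=(80, 80))

def init_frame_stack(first_frame):
    stack = deque([preprocess_frame(first_frame)] * 4, maxlen=4)
    return np.stack(stack, axis=2), stack          # stacked shape: (80, 80, 4)

def push_frame(stack, new_frame):
    stack.append(preprocess_frame(new_frame))
    return np.stack(stack, axis=2), stack
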
class Agent(nn.Module): def __init__(self, q_models, target_model, hyperbolic, k, gamma, model_params, replay_buffer_size, batch_size, inp_dim, lr, no_models, act_space, hidden_size, loss_type, target_update=False): super(Agent, self).__init__() if hyperbolic: self.q_models = DQN(state_space_dim=inp_dim, action_space_dim=act_space, hidden=hidden_size, no_models=no_models) self.target_models = DQN(state_space_dim=inp_dim, action_space_dim=act_space, hidden=hidden_size, no_models=no_models) self.target_models.load_state_dict(self.q_models.state_dict()) self.target_models.eval() else: self.q_models = q_models self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr) self.hyperbolic = hyperbolic self.n_actions = model_params.act_space self.k = k # self.gammas = torch.tensor(np.linspace(0, 1, self.q_models.no_models + 1), dtype=torch.float)[1:] self.gammas = np.sort( np.random.uniform(0, 1, self.q_models.no_models + 1)) self.gammas = np.append(self.gammas, 0.98) self.gammas = torch.tensor(np.sort(self.gammas)) self.memory = ReplayMemory(replay_buffer_size) self.batch_size = batch_size self.inp_dim = inp_dim self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.target_models.to(self.device) self.q_models.to(self.device) self.gammas = self.gammas.to(self.device) self.loss_type = loss_type self.criterion = nn.MSELoss() self.use_target_network = target_update def update_network(self, updates=1): for _ in range(updates): loss = self._do_network_update() return loss def get_hyperbolic_train_coeffs(self, k, num_models): coeffs = [] for i in range(1, num_models + 1): coeffs.append(((self.gammas[i + 1] - self.gammas[i]) * (1 / k) * self.gammas[i]**((1 / k) - 1))) return torch.tensor(coeffs).to(self.device) / sum(coeffs) def get_action(self, state_batch, epsilon=0.05, get_among_last=False): # epsilon gets smaller as time goes by. 
# (glie_a/(glie_a + eps)) with eps in range(0, no_episodes) take_random_action = random.random() if take_random_action < epsilon: return random.randrange(self.n_actions) elif get_among_last: state_batch = torch.tensor(state_batch, dtype=torch.float32, device=self.device).view( -1, self.inp_dim) model_outputs = self.q_models(state_batch).reshape( 2, self.q_models.no_models) return torch.argmax(model_outputs[:, -10].view(-1)).item() model_outputs = model_outputs * self.get_hyperbolic_train_coeffs( self.k, self.q_models.no_models) actions = torch.argmax(torch.sum(model_outputs, dim=1)) return actions.item() elif self.hyperbolic: with torch.no_grad(): state_batch = torch.tensor(state_batch, dtype=torch.float32, device=self.device).view( -1, self.inp_dim) model_outputs = self.q_models(state_batch.double()).reshape( -1, 2) coeffs = self.get_hyperbolic_train_coeffs( self.k, self.q_models.no_models).reshape(-1, 1) model_outputs = model_outputs * coeffs actions = torch.argmax(torch.sum(model_outputs, dim=0)) return actions.item() def get_state_act_vals(self, state_batch, action_batch=None): if self.hyperbolic: action_batch = action_batch.repeat( 1, self.q_models.no_models).reshape(-1, 1) model_outputs = self.q_models(state_batch.to(self.device).double()) model_outputs = model_outputs.reshape(-1, self.n_actions) model_outputs = model_outputs.gather(1, action_batch) # .reshape(self.q_models.no_models * state_batch.shape[0], # 2).gather(1, action_batch.reshape(-1)) return model_outputs else: model_output = self.q_models(state_batch).gather(1, action_batch) return model_output def get_max_next_state_vals(self, non_final_mask, non_final_next_states): if self.hyperbolic: with torch.no_grad(): next_state_values = torch.zeros(self.batch_size).to( self.device) # doing it like this, the model_no will come first and then the batch_no (b1m1, b1m2, b1m3..., b2m1, # ...b10m1, b10m2... # if False in non_final_mask: # print(non_final_mask) # print(len(non_final_next_states)) non_final_mask = non_final_mask.reshape(-1, 1).repeat( 1, self.q_models.no_models).view(-1) # if False in non_final_mask: # print([nf for nf in non_final_mask]) next_state_values = next_state_values.view(-1, 1).repeat( 1, self.q_models.no_models).view(-1) if self.use_target_network: # [b1m1o1, b1m1o2], -> max -> [b1m1] # [b1m2o1, b1m2o2], [b1m2] # [b1m3o1, b1m3o3], [b1m3] # ... ... 
# next_state_values[non_final_mask] = \ self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0] # if False in non_final_mask: # print("first", self.target_models(non_final_next_states.to(self.device))) # print("after reshaping", self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions)) # print(self.target_models(non_final_next_states.to(self.device)).shape) # print("next_state_values", next_state_values) else: next_state_values[non_final_mask] = \ self.q_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0] target_outptus = next_state_values return target_outptus * self.gammas[2:].repeat(self.batch_size) def _do_network_update(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool) non_final_next_states = [ s for nonfinal, s in zip(non_final_mask, batch.next_state) if nonfinal ] non_final_next_states = torch.stack(non_final_next_states).to( self.device) state_batch = torch.stack(batch.state).to(self.device) action_batch = torch.cat(batch.action).to(self.device) reward_batch = torch.cat(batch.reward).to(self.device) state_action_values = self.get_state_act_vals(state_batch, action_batch).view(-1) next_state_values = self.get_max_next_state_vals( non_final_mask, non_final_next_states) # this should be perfect expected_state_action_values = next_state_values + \ reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1) # print(reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1).shape) if self.loss_type == "weighted_loss": loss = (state_action_values - expected_state_action_values)**2 hyp_coef = self.get_hyperbolic_train_coeffs( self.k, self.q_models.no_models).repeat(self.batch_size) loss = (loss.reshape(-1).view(-1) * hyp_coef).view(-1) loss = torch.mean(loss) elif self.loss_type == "separate_summarized_loss": loss = F.smooth_l1_loss(state_action_values, expected_state_action_values).double() # loss = (state_action_values - expected_state_action_values) ** 2 # loss = torch.sum(loss) elif self.loss_type == "one_output_loss": hyp_coef = self.get_hyperbolic_train_coeffs( self.k, self.q_models.no_models) state_action_values = state_action_values.reshape( self.batch_size, -1) * hyp_coef state_action_values = torch.sum(state_action_values, dim=1) expected_state_action_values = expected_state_action_values.reshape( self.batch_size, -1) * hyp_coef expected_state_action_values = torch.sum( expected_state_action_values, dim=1) loss = self.criterion(state_action_values, expected_state_action_values) loss_item = loss.item() # print(hyp_coef.repeat(self.batch_size).shape) # print(loss.shape) # loss = (state_action_values - expected_state_action_values) ** 2 * self.get_hyperbolic_train_coeffs(self.k, # self.q_models.no_models).repeat( # self.batch_size) # # loss = torch.sum(loss) # loss = F.smooth_l1_loss(stsave_figate_action_values.squeeze(), # expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.q_models.parameters(): param.grad.data.clamp_(-1e-1, 1e-1) self.optimizer.step() return loss_item def update_target_network(self): self.target_models.load_state_dict(self.q_models.state_dict()) def store_transition(self, state, action, next_state, reward, done): action = torch.Tensor([[action]]).long() reward = torch.tensor([reward], dtype=torch.float32) next_state = 
torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
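
# --- Hedged sketch (illustrative): the hyperbolic mixing weights computed in
# get_hyperbolic_train_coeffs() above, restated in NumPy. Each auxiliary Q-head
# is trained with its own discount gamma_i, and the heads are combined with
# weights proportional to (gamma_{i+1} - gamma_i) * (1/k) * gamma_i^((1/k) - 1),
# normalized to sum to one. Assumes a sorted array of strictly positive gammas.
import numpy as np

def hyperbolic_coeffs(gammas, k):
    g = np.asarray(gammas, dtype=np.float64)
    w = (g[2:] - g[1:-1]) * (1.0 / k) * g[1:-1] ** ((1.0 / k) - 1.0)
    return w / w.sum()
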
class Agent(object): def __init__(self, state_space, n_actions, replay_buffer_size=50000, batch_size=32, hidden_size=12, gamma=0.98): self.n_actions = n_actions self.state_space_dim = state_space self.policy_net = DQN(state_space, n_actions, hidden_size) self.target_net = DQN(state_space, n_actions, hidden_size) self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-3) self.memory = ReplayMemory(replay_buffer_size) self.batch_size = batch_size self.gamma = gamma def update_network(self, updates=1): for _ in range(updates): self._do_network_update() def _do_network_update(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8) non_final_next_states = [ s for nonfinal, s in zip(non_final_mask, batch.next_state) if nonfinal > 0 ] non_final_next_states = torch.stack(non_final_next_states) state_batch = torch.stack(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch).gather( 1, action_batch) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. next_state_values = torch.zeros(self.batch_size) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() # Task 4: TODO: Compute the expected Q values expected_state_action_values = reward_batch + self.gamma * next_state_values # Compute Huber loss loss = F.smooth_l1_loss(state_action_values.squeeze(), expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1e-1, 1e-1) self.optimizer.step() def get_action(self, state, epsilon=0.05): sample = random.random() if sample > epsilon: with torch.no_grad(): state = torch.from_numpy(state).float() q_values = self.policy_net(state) return torch.argmax(q_values).item() else: return random.randrange(self.n_actions) def update_target_network(self): self.target_net.load_state_dict(self.policy_net.state_dict()) def store_transition(self, state, action, next_state, reward, done): action = torch.Tensor([[action]]).long() reward = torch.tensor([reward], dtype=torch.float32) next_state = torch.from_numpy(next_state).float() state = torch.from_numpy(state).float() self.memory.push(state, action, next_state, reward, done)
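
# --- Hedged usage sketch (illustrative): a minimal CartPole training loop for
# the Agent above, assuming the classic 4-tuple Gym step API and that
# DQN/ReplayMemory/Transition are defined as in this file. The GLIE-style
# epsilon schedule (glie_a / (glie_a + episode)) is an assumption.
import gym

def run_cartpole(num_episodes=200, glie_a=50, target_update_every=5):
    env = gym.make("CartPole-v0")
    agent = Agent(state_space=env.observation_space.shape[0],
                  n_actions=env.action_space.n)
    for ep in range(num_episodes):
        state, done, ep_reward = env.reset(), False, 0.0
        eps = glie_a / (glie_a + ep)
        while not done:
            action = agent.get_action(state, epsilon=eps)
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, next_state, reward, done)
            agent.update_network()
            state = next_state
            ep_reward += reward
        if ep % target_update_every == 0:
            agent.update_target_network()
        print("episode {}: reward {:.1f}".format(ep, ep_reward))
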
class Execute: def __init__(self, path): self.config = Configuration.construct(path) self.env = Environment(self.config) self.memory = ReplayMemory(self.config) self.model = Model(self.config) self.ep = None def get_epsilon(self, is_play): if is_play: return self.config.play.ep ep_start = self.config.train.ep.start ep_final = self.config.train.ep.final ep_num_frames = self.config.train.ep.num_frames decay = (ep_start - ep_final) / ep_num_frames if self.ep is None: self.ep = ep_start self.ep = max(self.ep - decay, ep_final) return self.ep def log(self, **kawrgs): log = "" for name, value in kawrgs.items(): log += f"{name}: {value}, " print(log) def run_episode(self, episode=1, steps=0, is_play=True, debug=False): config = self.config self.env.reset() action = 1 _, _, curr_state, is_done = self.env.step(action) total_reward = 0 update_net = 0; C = config.train.network_update_freq t = 0; T = config.max_episode_length while not is_done and t < T: if t % config.action_repeat == 0: ep = self.get_epsilon(is_play) action = self.model.choose_action(curr_state, ep) prev_state, reward, curr_state, is_done = self.env.step(action) total_reward += reward t += 1 if is_play: self.env.render("human") if debug and t % config.play.debug.time == 0: self.log(ftype=self.env.get_frame_type(), action=action, reward=total_reward) continue self.memory.add((prev_state, action, reward, curr_state, is_done)) if self.memory.get_size() > config.train.replay_start_size: for i in range(config.train.batch_run): batch = self.memory.sample() self.model.optimize(batch) steps = (steps + 1) % C if steps % C == 0: self.model.update_qhat() update_net += 1 if not is_play and debug and episode % config.train.debug.time == 0: self.log(ftype=self.env.get_frame_type(), total_reward=total_reward, network_update_steps=update_net, episode_time=t, ep=ep) return total_reward, steps def load_model(self): ftype = self.env.get_frame_type() in_size = self.env.get_in_size() num_actions = self.env.get_num_actions() self.model.load_model(ftype, in_size, num_actions) def play(self, debug=False): self.load_model() for ep in range(1): self.run_episode(is_play=True, debug=debug) def train(self, debug=False): self.load_model() optimize_steps = 0 episodes = self.config.train.episodes for episode in range(1, episodes+1): reward, steps = self.run_episode(episode=episode, steps=optimize_steps, is_play=False, debug=debug) optimize_steps += steps if episode % self.config.train.save_model_episode == 0: self.model.save_model() self.model.update_qhat() self.model.save_model() def close(self): self.env.close() self.memory.close()
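
# --- Hedged sketch (illustrative): the action-repeat pattern used in
# Execute.run_episode() above, isolated as a tiny helper. A new action is chosen
# only every `action_repeat` environment steps; in between the previous action
# is repeated. `env_step` and `choose_action` are stand-ins for the env.step()
# and model.choose_action() calls above.
def run_with_action_repeat(env_step, choose_action, first_state,
                           action_repeat=4, max_steps=1000):
    state, action, total_reward, done, t = first_state, None, 0.0, False, 0
    while not done and t < max_steps:
        if t % action_repeat == 0:
            action = choose_action(state)
        _, reward, state, done = env_step(action)   # env.step() returns 4 values above
        total_reward += reward
        t += 1
    return total_reward
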
def train(agent, env, num_episode=50, test_interval=25, num_test=20, num_iteration=200, iteration_cutoff=0, BATCH_SIZE=128, num_sample=50, action_space=[-1,1], debug=True, memory=None, seed=2020, update_mode=UPDATE_PER_ITERATION, reward_mode=FUTURE_REWARD_NO, gamma=0.99, loss_history=[], loss_historyA=[], lr_history=[], lr_historyA=[], reward_mean_var=(0,-1), save_sim_intv=50, save_sim_fnames=[], imdir='screencaps/', useVid=False, save_intm_models=False, not_use_rand_in_action=False, not_use_rand_in_test=True, return_memory=False): test_hists = [] steps = 0 if memory is None: ### UPDate 11/05: Changed memory size based on number of agents memory = ReplayMemory(1000 * env.N) if iteration_cutoff <= 0: iteration_cutoff = num_iteration # Save all iterations into the memory # Values that would be useful N = env.N # Note that the seed only controls the numpy random, which affects the environment. # To affect pytorch, refer to further documentations: https://github.com/pytorch/pytorch/issues/7068 np.random.seed(seed) # torch.manual_seed(seed) test_seeds = np.random.randint(0, 5392644, size=int(num_episode // test_interval)+1) # rmean = 0 # rvar = -1 (rmean, rvar) = reward_mean_var for e in range(num_episode): steps = 0 state = env.reset() if agent.centralized: state = env.state state = torch.from_numpy(state).float() state = Variable(state) if debug: env.render() # Train History state_pool = [] action_pool = [] reward_pool = [] next_state_pool = [] loss_history.append([]) loss_historyA.append([]) for t in range(num_iteration): # agent.net.train() agent.set_train(True) # Try to pick an action, react, and store the resulting behavior in the pool here if agent.centralized: action = agent.select_action(state, **{ 'steps_done':t, 'num_sample':50, 'action_space':action_space, 'rand':not_use_rand_in_action }).T else: actions = [] for i in range(N): action = agent.select_action(state[i], **{ 'steps_done':t, 'num_sample':50, 'action_space':action_space, 'rand':not_use_rand_in_action }) actions.append(action) if torch.is_tensor(action): action = torch.cat(actions).view(-1,env.N)#.T else: action = np.array(actions).T # Shape would become (2,N) if torch.is_tensor(action): next_state, reward, done, _ = env.step(action.detach().numpy()) else: next_state, reward, done, _ = env.step(action) if agent.centralized: next_state = env.state next_state = Variable(torch.from_numpy(next_state).float()) # The float() probably avoids bug in net.forward() action = action.T # Turn shape back to (N,2) if agent.needsExpert: # If we need to use expert input during training, then we consult it and get the best action for this state actions = env.controller() action = actions.T # Shape should already be (2,N), so we turn it into (N,2) if not(agent.centralized): # if reward_mode & FUTURE_REWARD_YES == 0: # # Push everything directly inside if we don't use future discounts # for i in range(N): # memory.push(state[i], action[i], next_state[i], reward[i]) # else: # # Store and push them outside the loop # state_pool.append(state) # action_pool.append(action) # reward_pool.append(reward) # next_state_pool.append(next_state) pass else: # if reward_mode & FUTURE_REWARD_YES == 0: # # Push everything directly inside if we don't use future discounts # memory.push(state, action, next_state, reward) # else: # # Store and push them outside the loop # state_pool.append(state) # action_pool.append(action) # reward_pool.append(reward) # next_state_pool.append(next_state) # Centralized training should directly use the real states, instead of 
observations reward = np.sum(reward) # Update 1028: Moved this training step outside the loop if update_mode == UPDATE_PER_ITERATION: # Added 1214: Push the samples to memory if no need for extra processing if reward_mode & FUTURE_REWARD_YES == 0 and reward_mode & FUTURE_REWARD_NORMALIZE == 0: if agent.centralized: memory.push(state, action, next_state, reward, reward) else: for i in range(N): memory.push(state[i], action[i], next_state[i], reward[i], reward[i]) # Learn if len(memory) >= BATCH_SIZE: transitions = memory.sample(BATCH_SIZE) batch = Transition(*zip(*transitions)) agent.optimize_model(batch, **{'B':BATCH_SIZE}) elif len(memory) > 0: transitions = memory.sample(len(memory)) batch = Transition(*zip(*transitions)) agent.optimize_model(batch, **{'B':len(memory)}) loss_history[-1].append(agent.losses[:]) # print(e,t,agent.losses) agent.losses=[] # Also record scheduler history for learning rate. If the scheduler is a Plateau one, then # we can know from the learning rate if we're in a flatter area. # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2 # The scheduler requires the validation loss - can I just use the average training loss instead? # try: # agent.scheduler.step(np.mean(loss_history[-1])) # lr_history.append(agent.optimizer.param_groups[0]['lr']) # except: # agent.schedulerC.step(np.mean(loss_history[-1])) # lr_history.append(agent.optimizerC.param_groups[0]['lr']) try: loss_historyA[-1].append(agent.lossesA[:]) agent.lossesA=[] # agent.schedulerA.step(np.mean(loss_historyA[-1])) # lr_historyA.append(agent.optimizerA.param_groups[0]['lr']) except: pass elif update_mode == UPDATE_ON_POLICY: # This case would ditch sampling, and just update by the current thing. # Note that methods that use future cumulative reward would be highly incompatible with this... if not(agent.centralized) or reward_mode & FUTURE_REWARD_YES != 0: print("Error: Update-on-policy might be incompatible with decentralized planning or cumulative reward") return None if rvar == -1 and rmean == 0 and reward_mode & FUTURE_REWARD_NORMALIZE != 0: rvar = np.abs(reward) rmean = reward reward = (reward - rmean) / rvar batch = Transition(state, action, next_state, [[reward]], [[reward]]) agent.optimize_model(batch, **{'B':1}) # batch = Transition(state, action, next_state, reward, reward) # # transitions = [batch,batch] # # agent.optimize_model(Transition(*zip(*transitions)), **{'B':2}) # transitions = [batch,batch] # agent.optimize_model(batch, **{'B':1}) loss_history[-1].append(agent.losses[:]) agent.losses=[] try: loss_historyA[-1].append(agent.lossesA[:]) agent.lossesA=[] except: pass else: # Store and push them outside the loop state_pool.append(state) if torch.is_tensor(action): action_pool.append(action.detach().numpy()) else: action_pool.append(action) reward_pool.append(reward) next_state_pool.append(next_state) state = next_state steps += 1 if debug: env.render() if debug and done: print("Took ", t, " steps to converge") break # Now outside the iteration loop - prepare for per-episode trainings if update_mode == UPDATE_ON_POLICY: pass elif update_mode == UPDATE_PER_EPISODE: #se: inst_reward = torch.tensor(reward_pool) if reward_mode & FUTURE_REWARD_YES != 0: for j in range(len(reward_pool)): ### IT was previously miswritten as "reward". 
Retard bug that might had effects if j > 0: reward_pool[-j-1] += gamma * reward_pool[-j] reward_pool = torch.tensor(reward_pool) if reward_mode & FUTURE_REWARD_NORMALIZE != 0: if rvar == -1 and rmean == 0: rmean = reward_pool.mean() rvar = reward_pool.std() print("Updated mean and stdev: {0} and {1}".format(rmean.numpy(), rvar.numpy())) reward_pool = (reward_pool - rmean) / rvar inst_reward = (inst_reward - rmean) / rvar # Update: 0106 added option to only push the first few iterations into the memory. # if agent.centralized: # # print(state_pool[0].shape, action_pool[0].shape) # for j in range(len(reward_pool)): # memory.push(state_pool[-j-1], action_pool[-j-1], # next_state_pool[-j-1], reward_pool[-j-1], inst_reward[-j-1]) # else: # for j in range(len(reward_pool)): # for i in range(N): # memory.push(state_pool[-j-1][i], action_pool[-j-1][i], # next_state_pool[-j-1][i], reward_pool[-j-1][i], inst_reward[-j-1][i]) if agent.centralized: for j in range(iteration_cutoff): print(j, len(reward_pool)) memory.push(state_pool[j], action_pool[j], next_state_pool[j], reward_pool[j], inst_reward[j]) else: for j in range(iteration_cutoff): for i in range(N): memory.push(state_pool[j][i], action_pool[j][i], next_state_pool[j][i], reward_pool[j][i], inst_reward[j][i]) if update_mode == UPDATE_PER_EPISODE: if len(memory) >= BATCH_SIZE: transitions = memory.sample(BATCH_SIZE) batch = Transition(*zip(*transitions)) agent.optimize_model(batch, **{'B':BATCH_SIZE}) elif len(memory) > 0: transitions = memory.sample(len(memory)) batch = Transition(*zip(*transitions)) agent.optimize_model(batch, **{'B':len(memory)}) loss_history[-1].append(agent.losses[:]) agent.losses=[] # Also record scheduler history for learning rate. If the scheduler is a Plateau one, then # we can know from the learning rate if we're in a flatter area. # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2 # try: # agent.scheduler.step(np.mean(loss_history[-1])) # lr_history.append(agent.optimizer.param_groups[0]['lr']) # except: # agent.schedulerC.step(np.mean(loss_history[-1])) # lr_history.append(agent.optimizerC.param_groups[0]['lr']) try: loss_historyA[-1].append(agent.lossesA[:]) agent.lossesA=[] # agent.schedulerA.step(np.mean(loss_historyA[-1])) # lr_historyA.append(agent.optimizerA.param_groups[0]['lr']) except: pass if debug: print("Episode ", e, " finished; t = ", t) if e % test_interval == 0: print("Test result at episode ", e, ": ") test_hist = test(agent, env, num_test, num_iteration, num_sample, action_space, seed=test_seeds[int(e/test_interval)], debug=debug, not_use_rand_in_action=not_use_rand_in_test) test_hists.append(test_hist) # Save demos of simulation if wanted if e % save_sim_intv == (save_sim_intv-1) and e > 0: try: fnames = [f+'_{0}'.format(e) for f in save_sim_fnames] plot_test(agent, env, fnames=fnames, num_iteration=num_iteration, action_space=action_space, imdir=imdir, debug=debug, useVid=useVid, not_use_rand=not_use_rand_in_test) for f in fnames: os.system('ffmpeg -y -pattern_type glob -i "'+imdir+f+'*.jpg" '+f+'.gif') except: print("Failed to save simulation at e={0}".format(e)) if save_intm_models and len(save_sim_fnames) > 0: agent.save_model(save_sim_fnames[0]+'_{0}'.format(e)) if return_memory: return test_hists, memory else: return test_hists
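
# --- Hedged sketch (illustrative): the per-episode reward processing performed
# above (UPDATE_PER_EPISODE branch), as a standalone helper: discounted
# reward-to-go followed by optional normalization with a fixed (mean, std).
import numpy as np

def discounted_rewards_to_go(rewards, gamma, mean=None, std=None):
    out = np.array(rewards, dtype=np.float64)
    for j in range(len(out) - 2, -1, -1):   # fold future rewards backwards
        out[j] += gamma * out[j + 1]
    if mean is not None and std is not None and std > 0:
        out = (out - mean) / std
    return out

# Example: discounted_rewards_to_go([1, 0, 1], gamma=0.5) -> [1.25, 0.5, 1.0]
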
class DqnPolicy(BaseTFModel): def __init__(self, env, training, name=None, model_path=None, gamma=0.99, lr=0.001, lr_decay=1.0, epsilon=1.0, epsilon_final=0.02, batch_size=32, memory_capacity=100000, model_params={}, layer_sizes=[32, 32], target_update_type='hard', target_update_params={}, double_q=True, dueling=True, **kwargs): if name is None: self.name = self.__class__.__name__ else: self.name = name if model_path is None: self.model_path = os.path.join('model', self.name) else: self.model_path = model_path self.env = env self.training = training self.gamma = gamma self.lr = lr self.lr_decay = lr_decay self.epsilon = epsilon self.epsilon_final = epsilon_final self.batch_size = batch_size self.memory_capacity = memory_capacity self.model_params = model_params self.layer_sizes = layer_sizes self.double_q = double_q self.dueling = dueling self.target_update_type = target_update_type self.target_update_every_step = target_update_params.get( 'every_step', 100) self.target_update_tau = target_update_params.get('tau', 0.05) self.memory = ReplayMemory(capacity=memory_capacity) self.action_size = self.env.action_space.n self.state_size = np.prod(list(self.env.observation_space.shape)) print 'action_size: {a}, state_size: {s}'.format(a=self.action_size, s=self.state_size) if self.training: # clear existing model files if os.path.exists(self.model_path): print 'deleting existing model files at {}'.format( self.model_path) if os.path.isdir(self.model_path): shutil.rmtree(self.model_path) else: os.remove(self.model_path) BaseTFModel.__init__(self, self.name, self.model_path, saver_max_to_keep=5) print 'building graph ...' with self.graph.as_default(): self.__build_graph() def act(self, state, epsilon=0.1): """ :param state: 1d np.ndarray :param epsilon: :return: int """ assert isinstance(state, np.ndarray) and state.ndim == 1 if self.training and np.random.random() < epsilon: return self.env.action_space.sample() with self.sess.as_default(): return self.actions_selected_by_q.eval( {self.states: state.reshape((1, -1))})[0] def train(self, n_episodes=500, annealing_episodes=450, every_episode=10, **kwargs): if self.training is False: raise Exception( 'prohibited to call train() for a non-training model') reward_history = [0.0] reward_averaged = [] lr = self.lr eps = self.epsilon annealing_episodes = annealing_episodes or n_episodes eps_drop = (self.epsilon - self.epsilon_final) / annealing_episodes print "eps_drop: {}".format(eps_drop) step = 0 # calling the property method of BaseTFModel to start a session self.sess.run(self.init_vars) self.__init_target_q_net() for n_episode in range(n_episodes): ob = self.env.reset() done = False traj = [] reward = 0. while not done: a = self.act(ob, eps) assert a >= 0 new_ob, r, done, _ = self.env.step(a) step += 1 reward += r traj.append(Transition(ob, a, r, new_ob, done)) ob = new_ob # No enough samples in the buffer yet. 
if self.memory.size < self.batch_size: continue # Training with a mini batch of samples batch_data = self.memory.sample(self.batch_size) feed_dict = { self.learning_rate: lr, self.states: batch_data['s'], self.actions: batch_data['a'], self.rewards: batch_data['r'], self.states_next: batch_data['s_next'], self.done_flags: batch_data['done'] } if self.double_q: actions_next = self.sess.run( self.actions_selected_by_q, {self.states: batch_data['s_next']}) feed_dict.update({self.actions_next: actions_next}) _, q_val, q_target_val, loss, summ_str = self.sess.run( [ self.optimizer, self.q, self.q_target, self.loss, self.merged_summary ], feed_dict=feed_dict) self.writer.add_summary(summ_str, step) # update the target q net if necessary self.__update_target_q_net(step) self.memory.add(traj) reward_history.append(reward) reward_averaged.append(np.mean(reward_history[-10:])) # Annealing the learning and exploration rate after every episode lr *= self.lr_decay if eps > self.epsilon_final: eps -= eps_drop if reward_history and every_episode and n_episode % every_episode == 0: print "[episodes: {}/step: {}], best: {}, avg: {:.2f}:{}, lr: {:.4f}, eps: {:.4f}".format( n_episode, step, np.max(reward_history), np.mean(reward_history[-10:]), reward_history[-5:], lr, eps) self.save_model(step=step) print "[training completed] episodes: {}, Max reward: {}, Average reward: {}".format( len(reward_history), np.max(reward_history), np.mean(reward_history)) fig_path = os.path.join(self.model_path, 'figs') makedirs(fig_path) fig_file = os.path.join( fig_path, '{n}-{t}.png'.format(n=self.name, t=int(time.time()))) plot_learning_curve(fig_file, { 'reward': reward_history, 'reward_avg': reward_averaged }, xlabel='episode') def evaluate(self, n_episodes): if self.training: raise Exception( 'prohibited to call evaluate() for a training model') reward_history = [] for episode in xrange(n_episodes): state = self.env.reset() reward_episode = 0. while True: action = self.act(state) new_state, reward, done, _ = self.env.step(action) reward_episode += reward state = new_state if done: break reward_history.append(reward_episode) return reward_history def __build_graph(self): self.__create_q_networks() # q is the Q(s, a) of the behavior policy self.actions_selected_by_q = tf.argmax(self.q, axis=-1, name='action_selected') action_one_hot = tf.one_hot(self.actions, self.action_size, dtype=tf.float32, name='action_one_hot') pred = tf.reduce_sum(self.q * action_one_hot, axis=-1, name='pred') # q_target is the Q(s, a) of the target policy that is what we learning for. if self.double_q: action_next_one_hot = tf.one_hot(self.actions_next, self.action_size, dtype=tf.float32, name='action_next_one_hot') max_q_next_target = tf.reduce_sum(self.q_target * action_next_one_hot, axis=-1, name='max_q_next_target') else: max_q_next_target = tf.reduce_max(self.q_target, axis=-1) y = self.rewards + (1. 
- self.done_flags) * self.gamma * max_q_next_target self.loss = tf.reduce_mean(tf.square(pred - tf.stop_gradient(y)), name="loss_mse_train") self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize( self.loss, name="adam") self.init_vars = tf.global_variables_initializer() with tf.variable_scope('summary'): q_summ = [] avg_q = tf.reduce_mean(self.q, 0) for idx in range(self.action_size): q_summ.append(tf.summary.histogram('q/%s' % idx, avg_q[idx])) self.q_summ = tf.summary.merge(q_summ, 'q_summary') self.q_y_summ = tf.summary.histogram("batch/y", y) self.q_pred_summ = tf.summary.histogram("batch/pred", pred) self.loss_summ = tf.summary.scalar("loss", self.loss) self.merged_summary = tf.summary.merge_all( key=tf.GraphKeys.SUMMARIES) def __create_q_networks(self): # mini-batch self.states = tf.placeholder(tf.float32, shape=(None, self.state_size), name='state') self.states_next = tf.placeholder(tf.float32, shape=(None, self.state_size), name='state_next') self.actions = tf.placeholder(tf.int32, shape=(None, ), name='action') # actions_next is not the actual actions in the next step; # it is used to predict the action value in the Bellman equation. self.actions_next = tf.placeholder(tf.int32, shape=(None, ), name='action_next') self.rewards = tf.placeholder(tf.float32, shape=(None, ), name='reward') self.done_flags = tf.placeholder(tf.float32, shape=(None, ), name='done') self.learning_rate = tf.placeholder(tf.float32, shape=None, name='learning_rate') if self.dueling: with tf.variable_scope('Q_primary'): self.q_hidden = dense_nn(self.states, self.layer_sizes[:-1], name='q_hidden', training=self.training) # advantage function A(s, a) self.adv = dense_nn(self.q_hidden, [self.layer_sizes[-1], self.action_size], name='adv', training=self.training) # state value function V(s) self.v = dense_nn(self.q_hidden, [self.layer_sizes[-1], 1], name='v', training=self.training) self.q = self.v + (self.adv - tf.reduce_mean( self.adv, reduction_indices=1, keep_dims=True)) with tf.variable_scope('Q_target'): self.q_target_hidden = dense_nn(self.states_next, self.layer_sizes[:-1], name='q_hidden', training=self.training) self.adv_target = dense_nn( self.q_target_hidden, [self.layer_sizes[-1], self.action_size], name='adv', training=self.training) self.v_target = dense_nn(self.q_target_hidden, [self.layer_sizes[-1], 1], name='v', training=self.training) self.q_target = self.v_target + ( self.adv_target - tf.reduce_mean( self.adv_target, reduction_indices=1, keep_dims=True)) else: self.q = dense_nn(self.states, self.layer_sizes + [self.action_size], name='Q_primary', training=self.training) self.q_target = dense_nn(self.states_next, self.layer_sizes + [self.action_size], name='Q_target', training=self.training) self.q_vars = self.scope_vars('Q_primary') self.q_target_vars = self.scope_vars('Q_target') assert len(self.q_vars) == len( self.q_target_vars), "Two Q-networks are not same in structure." def __init_target_q_net(self): self.__update_target_q_net_hard() def __update_target_q_net_hard(self): self.sess.run( [v_t.assign(v) for v_t, v in zip(self.q_target_vars, self.q_vars)]) def __update_target_q_net_soft(self, tau=0.05): self.sess.run([ v_t.assign(v_t * (1. - tau) + v * tau) for v_t, v in zip(self.q_target_vars, self.q_vars) ]) def __update_target_q_net(self, step): if self.target_update_type == 'hard': if step % self.target_update_every_step == 0: self.__update_target_q_net_hard() else: self.__update_target_q_net_soft(self.target_update_tau)
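
# --- Hedged sketch (illustrative): the dueling aggregation built in
# __create_q_networks() above, Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)),
# restated in NumPy so the broadcasting is explicit.
import numpy as np

def dueling_q(values, advantages):
    """values: (B, 1); advantages: (B, A); returns (B, A)."""
    return values + (advantages - advantages.mean(axis=1, keepdims=True))

# Example: dueling_q(np.array([[2.0]]), np.array([[1.0, 3.0]])) -> [[1.0, 3.0]]
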
class DQfDAgent(DQNAgent): def __init__(self, model, env, demo_memory, **kwargs): DQNAgent.__init__(self, model, env, **kwargs) self.EXPERT_MARGIN = kwargs.pop("expert_margin", 0.8) self.DEMO_PER = kwargs.pop("demo_percent", 0.3) self.N_STEP = kwargs.pop("n_step", 5) self.LAMBDA_1 = kwargs.pop("lambda_1", 0.1) self.LAMBDA_2 = kwargs.pop("lambda_2", 0.5) self.LAMBDA_3 = kwargs.pop("lambda_3", 0) self.memory = ReplayMemory(self.REPLAY_CAPACITY, self.N_STEP, self.GAMMA) self.demo_memory = demo_memory self.demo_memory.n_step = self.N_STEP self.demo_memory.gamma = self.GAMMA self.is_pre_train = False def _n_step_loss(self, y_pred, n_returns_batch, non_final_n_mask, non_final_n_states): q_n = Variable(torch.zeros(self.BATCH_SIZE).type(FloatTensor)) target_q_n = self.target_model(non_final_n_states) if self.DOUBLE_DQN: max_act_n = self.model(non_final_n_states).max(1)[1].view(-1, 1) q_n[non_final_n_mask] = target_q_n.gather(1, max_act_n).data.view( target_q_n.gather(1, max_act_n).data.shape[0]) else: q_n[non_final_n_mask] = target_q_n.max(1)[0].data y_n_step = q_n * np.power(self.GAMMA, self.N_STEP) + n_returns_batch return nn.functional.mse_loss(y_pred, y_n_step) def _expert_loss(self, q_pred, action_batch, non_demo_mask): y_pred = q_pred.gather(1, action_batch).squeeze() expert_margin = torch.zeros(self.BATCH_SIZE, self.out_size) expert_margin[:, action_batch.data] = self.EXPERT_MARGIN q_l = q_pred + Variable(expert_margin) j_e = q_l.max(1)[0] - y_pred j_e[non_demo_mask] = 0 return j_e.sum() def _collect_batch(self): non_demo_mask = ByteTensor([False] * self.BATCH_SIZE) if self.is_pre_train: batch, n_returns, n_step_states = self.demo_memory.sample( self.BATCH_SIZE) else: demo_num = int(self.BATCH_SIZE * self.DEMO_PER) replay_demo, n_returns_demo, n_step_states_demo = \ self.demo_memory.sample(demo_num) replay_agent, n_returns_agent, n_step_states_agent = \ self.memory.sample(self.BATCH_SIZE - demo_num) batch = replay_demo.extend(replay_agent) if demo_num != self.BATCH_SIZE: non_demo_mask[demo_num:] = 1 n_returns_demo.extend(n_returns_agent) n_returns = n_returns_demo n_step_states = np.concatenate( [n_step_states_demo, n_step_states_agent]) return batch, n_returns, n_step_states, non_demo_mask def _calc_loss(self): batch, n_returns, n_step_states, non_demo_mask = self._collect_batch() non_final_mask = ByteTensor( tuple([s is not None for s in batch.next_state])) non_final_next_states = Variable( torch.cat([s for s in batch.next_state if s is not None])) non_final_n_mask = ByteTensor( tuple([s is not None for s in n_step_states])) non_final_n_states = Variable( torch.cat([s for s in n_step_states if s is not None])) state_batch = Variable( torch.cat([s for s in batch.state if s is not None])) action_batch = Variable( torch.cat([s for s in batch.action if s is not None])) reward_batch = Variable( torch.cat([s for s in batch.reward if s is not None])) n_returns_batch = Variable(torch.cat(n_returns)) q_pred = self.model(state_batch) y_pred = q_pred.gather(1, action_batch).squeeze() dq_loss = self._DQ_loss(y_pred, reward_batch, non_final_mask, non_final_next_states) n_step_loss = self._n_step_loss(y_pred, n_returns_batch, non_final_n_mask, non_final_n_states) expert_loss = self._expert_loss(q_pred, action_batch, non_demo_mask) loss = dq_loss + self.LAMBDA_1 * n_step_loss + self.LAMBDA_2 * expert_loss self.container.add("dq_loss", torch.mean(dq_loss.data)) self.container.add("expert_loss", torch.mean(expert_loss.data)) self.container.add("y_pred", torch.mean(y_pred.data)) self.container.add("loss", 
torch.mean(loss.data))
        return loss

    def pre_train(self, steps):
        self.i_episode = 0
        self.i_step = 0
        self.is_pre_train = True
        print("Pre training...")
        for i in range(steps):
            if i % 500 == 0:
                print("Pre train steps: {}".format(i))
            self.update_policy()
            self.update_target_network()
        print("Pre train done")
        self.is_pre_train = False
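
# --- Hedged sketch (illustrative): the DQfD large-margin expert loss that
# _expert_loss() above implements, restated per sample. The margin is 0 for the
# demonstrated action and `margin` for every other action, and the loss is
# applied only to demonstration transitions (is_demo mask).
import torch

def large_margin_loss(q_values, expert_actions, is_demo, margin=0.8):
    """q_values: (B, A); expert_actions: (B, 1) long; is_demo: (B,) bool."""
    margins = torch.full_like(q_values, margin)
    margins.scatter_(1, expert_actions, 0.0)                 # no margin on a_E
    q_expert = q_values.gather(1, expert_actions).squeeze(1)
    j_e = (q_values + margins).max(dim=1)[0] - q_expert
    return (j_e * is_demo.float()).sum()
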
class EGreedyAgent(AgentWithWheel): """ This agent use actor critic algorithm optimize model. """ def __init__(self, x, y, r, color, agent_type, features_n, actions_n, discounted_value, memory_capacity=4096, batch_size=512, learning_rate=0.0001, need_restore=False): super(EGreedyAgent, self).__init__(x, y, r, color, agent_type) self.gamma = discounted_value self.features_n = features_n self.actions_n = actions_n self.lr = learning_rate self.save_file_path = 'model/dqn.pkl' self.device = 'cpu' self.policy_net = DQNet(self.features_n, self.actions_n) self.target_net = DQNet(self.features_n, self.actions_n) # let target net has the same params as policy net self.target_net.eval() self.target_net.load_state_dict(self.policy_net.state_dict()) self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=self.lr) self.memory = [] self.eps_start = 0.9 self.eps_end = 0.05 self.eps_decay = 5000 self.steps_count = 0 self.batch_size = batch_size self.memory = ReplayMemory(memory_capacity) self.need_exploit = True if need_restore: self.restore() def act(self, state): """ Chose action with probability. """ state = torch.FloatTensor([state]) sample = random.random() # chose action randomly at the beginning, then slowly chose max Q_value eps_threhold = self.eps_end + (self.eps_start - self.eps_end) * \ math.exp(-1. * self.steps_count / self.eps_decay) \ if self.need_exploit else 0.01 self.steps_count += 1 if sample > eps_threhold: with torch.no_grad(): left_v, right_v = self.policy_net(state) l, r = left_v.max(1)[1].view(1, 1).item(), right_v.max(1)[1].view( 1, 1).item() # print('left: %d\tright: %d' % (l, r)) return l, r else: l, r = random.randrange(self.actions_n), random.randrange( self.actions_n) return l, r def optimize_model(self): """ Train model. """ if len(self.memory) < self.batch_size: return 0.0 transitions = self.memory.sample(self.batch_size) # batch is ([state], [left_v, right_v], [next_state], [reward]) batch = Transition(*zip(*transitions)) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=self.device) non_final_next_states = torch.cat([ torch.tensor([s], dtype=torch.float) for s in batch.next_state if s is not None ]) state_batch = torch.cat( [torch.tensor([s], dtype=torch.float) for s in batch.state]) left_batch = torch.cat( [torch.tensor([[s[0]]], dtype=torch.long) for s in batch.action]) right_batch = torch.cat( [torch.tensor([[s[1]]], dtype=torch.long) for s in batch.action]) reward_batch = torch.cat( [torch.tensor([[s]], dtype=torch.float) for s in batch.reward]) left_eval, right_eval = self.policy_net(state_batch) left_q_eval = left_eval.gather(1, left_batch) right_q_eval = right_eval.gather(1, right_batch) left_q_next, right_q_next = self.target_net(non_final_next_states) left_q_next = left_q_next.max(1)[0].detach() right_q_next = right_q_next.max(1)[0].detach() left_q_target = (left_q_next * self.gamma) + reward_batch.squeeze() right_q_target = (right_q_next * self.gamma) + reward_batch.squeeze() loss = F.mse_loss(left_q_eval, left_q_target.unsqueeze(1)) + F.mse_loss( right_q_eval, right_q_target.unsqueeze(1)) self.optimizer.zero_grad() loss.backward() self.optimizer.step() return loss.item() def save(self): """ Save trained model. """ torch.save(self.policy_net.state_dict(), self.save_file_path) print('Model saved succeed!') def restore(self): """ Restore model from saved file. """ self.policy_net.load_state_dict(torch.load(self.save_file_path))
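
# --- Hedged sketch (illustrative): the exponentially decaying epsilon threshold
# used in EGreedyAgent.act() above, isolated as a helper. Defaults mirror the
# class above (eps_start=0.9, eps_end=0.05, eps_decay=5000).
import math

def eps_threshold(step, eps_start=0.9, eps_end=0.05, eps_decay=5000,
                  need_exploit=True):
    if not need_exploit:
        return 0.01
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)

# Example: eps_threshold(0) -> 0.9; the value decays toward 0.05 as step grows.
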
class Agent(object):
    def __init__(self,
                 num_actions,
                 gamma=0.98,
                 memory_size=5000,
                 batch_size=32):
        self.scaler = None
        self.featurizer = None
        self.q_functions = None
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_actions = num_actions
        self.memory = ReplayMemory(memory_size)
        self.initialize_model()

    def initialize_model(self):
        # Draw some samples from the observation range and initialize the scaler
        obs_limit = np.array([4.8, 5, 0.5, 5])
        samples = np.random.uniform(-obs_limit, obs_limit,
                                    (1000, obs_limit.shape[0]))
        self.scaler = StandardScaler()
        self.scaler.fit(samples)
        # Initialize the RBF featurizer
        self.featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
        ])
        self.featurizer.fit(self.scaler.transform(samples))
        # Create a value approximator for each action
        self.q_functions = [
            SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3)
            for _ in range(self.num_actions)
        ]
        # Initialize it to whatever values; implementation detail
        for q_a in self.q_functions:
            q_a.partial_fit(self.featurize(samples),
                            np.zeros((samples.shape[0], )))

    def featurize(self, state):
        if len(state.shape) == 1:
            state = state.reshape(1, -1)
        # Task 1: TODO: Use (s, abs(s)) as features
        # return np.concatenate((state, np.abs(state)), axis=1)
        # RBF features
        return self.featurizer.transform(self.scaler.transform(state))

    def get_action(self, state, epsilon=0.0):
        if np.random.random() < epsilon:
            a = int(np.random.random() * self.num_actions)
            return a
        else:
            featurized = self.featurize(state)
            qs = [q.predict(featurized)[0] for q in self.q_functions]
            qs = np.array(qs)
            a = np.argmax(qs, axis=0)
            return a

    def single_update(self, state, action, next_state, reward, done):
        # Calculate feature representations of the states
        # Task 1: TODO: Set the feature state and feature next state
        featurized_state = self.featurize(state)
        featurized_next_state = self.featurize(next_state)
        # Task 1: TODO: Get Q(s', a) for the next state
        next_qs = [
            q.predict(featurized_next_state)[0] for q in self.q_functions
        ]
        # Calculate the updated target Q-values
        # Task 1: TODO: Calculate target based on rewards and next_qs
        if done:
            target = reward
        else:
            target = reward + self.gamma * np.max(next_qs)
        # Update Q-value estimation
        self.q_functions[action].partial_fit(featurized_state, [target])

    def update_estimator(self):
        if len(self.memory) < self.batch_size:
            # Use the whole memory
            samples = self.memory.memory
        else:
            # Sample some data
            samples = self.memory.sample(self.batch_size)
        # Task 2: TODO: Reformat data in the minibatch
        states = []
        action = []
        next_states = []
        rewards = []
        dones = []
        for s in samples:
            states.append(s.state)
            action.append(s.action)
            next_states.append(s.next_state)
            rewards.append(s.reward)
            dones.append(s.done)
        states = np.array(states)
        next_states = np.array(next_states)
        action = np.array(action)
        rewards = np.array(rewards)
        dones = np.array(dones)
        # Task 2: TODO: Calculate Q(s', a)
        featurized_next_states = self.featurize(next_states)
        next_qs = np.max(np.array(
            [q.predict(featurized_next_states) for q in self.q_functions]).T,
                         axis=1)
        # Calculate the updated target values
        # Task 2: TODO: Calculate target based on rewards and next_qs
        targets = rewards + self.gamma * next_qs * np.invert(dones)
        # Calculate featurized states
        featurized_states = self.featurize(states)
        # Get new weights for each action separately
        for a in range(self.num_actions):
            # Find states where a was taken
            idx = action == a
            # If a is not present in the batch, skip to the next action
            if np.any(idx):
                act_states = featurized_states[idx]
                act_targets = targets[idx]
                # Perform a single SGD step on the Q-function params
                self.q_functions[a].partial_fit(act_states, act_targets)

    def store_transition(self, *args):
        self.memory.push(*args)
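
# A rough usage sketch for the RBF-feature Agent above, assuming a CartPole-style
# environment (the obs_limit values in initialize_model match CartPole's state
# ranges). The gymnasium API, episode count, and epsilon schedule are assumptions,
# and the stored Transition must carry a `done` field for update_estimator().
import gymnasium as gym

env = gym.make("CartPole-v1")
agent = Agent(num_actions=env.action_space.n)

for episode in range(200):
    state, _ = env.reset()
    done = False
    epsilon = max(0.05, 0.5 * 0.99 ** episode)  # simple decaying exploration
    while not done:
        action = int(agent.get_action(state, epsilon))
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # store (state, action, next_state, reward, done) and train on a mini-batch
        agent.store_transition(state, action, next_state, reward, done)
        agent.update_estimator()  # Task 2 path; single_update() is the per-step Task 1 path
        state = next_state
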
class DQN(object):
    def __init__(self, game_name, gamma, batch_size, eps_start, eps_end,
                 eps_decay, mem_size, device):
        if batch_size > mem_size:
            print(
                "Error: training will crash because the batch size is larger than the memory size."
            )
            return
        self.gamma = gamma
        self.batch_size = batch_size
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.env = Environment(game_name)
        self.step_done = 0
        self.device = device
        self.memory = ReplayMemory(mem_size)
        # define the policy net and target net
        _, _, height, width = self.env.get_screen().shape
        self.policy_net = Net(height, width,
                              self.env.num_action).to(self.device)
        self.target_net = Net(height, width,
                              self.env.num_action).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())

    def select_action(self, state):
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) \
            * np.exp(-1 * self.step_done / self.eps_decay)
        self.step_done += 1
        # decide whether to exploit or explore
        if sample > eps_threshold:
            with torch.no_grad():
                # return the action with the largest expected reward;
                # similar to a classification task, but not the same:
                # both use a scoring mechanism to pick the best option
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.env.num_action)]],
                                device=self.device,
                                dtype=torch.long)

    def optimize(self):
        # see https://stackoverflow.com/a/19343/3343043
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        # create a mask for non-final next states
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # use the policy_net as the behavior network
        # use the target_net as the Q-value fitting network
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        # compute the Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        # clip gradients to stabilize training
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def plot_duration(self):
        pass
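
# A rough sketch of the training loop the DQN class above is built for.
# The Environment methods beyond get_screen() and num_action (reset(),
# step() returning reward and done) are assumptions, as are the game name,
# episode count, and the target-net sync interval; the class never re-syncs
# target_net after construction, so the caller has to do it as shown here.
import torch

TARGET_UPDATE = 10  # assumed sync interval, in episodes

dqn = DQN(game_name="CartPole-v0", gamma=0.99, batch_size=128,
          eps_start=0.9, eps_end=0.05, eps_decay=200,
          mem_size=10000, device="cpu")

for episode in range(300):
    dqn.env.reset()                                 # assumed Environment API
    last_screen = dqn.env.get_screen()
    current_screen = dqn.env.get_screen()
    state = current_screen - last_screen            # frame difference as state
    done = False
    while not done:
        action = dqn.select_action(state)
        reward, done = dqn.env.step(action.item())  # assumed Environment API
        reward = torch.tensor([reward], device=dqn.device)
        last_screen, current_screen = current_screen, dqn.env.get_screen()
        next_state = None if done else current_screen - last_screen
        dqn.memory.push(state, action, next_state, reward)
        state = next_state
        dqn.optimize()
    if episode % TARGET_UPDATE == 0:
        dqn.target_net.load_state_dict(dqn.policy_net.state_dict())
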