class ExploreAgentPytorch(Agent):
    def __init__(self, epsilon=0.1, discount=0.5, rotations=3, pheromones=3):
        super(ExploreAgentPytorch, self).__init__("explore_agent_pytorch")

        self.epsilon = epsilon
        self.discount = discount
        self.rotations = rotations

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with the last n steps for training
        # self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        self.replay_memory = None

        # Used to count when to update the target network with the main network's weights
        self.target_update_counter = 0

        self.state = None

    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(ExploreAgentPytorch, self).setup(rl_api, trained_model)

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_space,
                                          self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] + list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = ExploreModel(self.observation_space, self.agent_space, self.rotations)
        self.target_model = ExploreModel(self.observation_space, self.agent_space, self.rotations)

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, rl_api: RLApi):
        rl_api.ants.activate_all_pheromones(
            np.ones((self.n_ants,
                     len([obj for obj in rl_api.perceived_objects
                          if isinstance(obj, Pheromone)]))) * 10)

    def train(self, done: bool, step: int) -> float:
        # Start training only once a minimum number of samples has been saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0.0

        # Get a minibatch from replay memory
        mem_states, mem_agent_state, mem_actions, mem_rewards, mem_new_states, mem_new_agent_state, mem_done = \
            self.replay_memory.random_access(MINIBATCH_SIZE)

        with torch.no_grad():
            future_qs = self.target_model(mem_new_states)

            # Non-terminal states get the current reward plus the discounted future reward
            max_future_qs = torch.max(future_qs, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            # Terminal states only get the current reward
            # new_qs += mem_rewards * mem_done

            # Overwrite only the Q values of the actions that were actually taken
            # for i in range(MINIBATCH_SIZE):
            #     target_qs[i, mem_actions[i]] = new_qs[i]
            target_qs = self.model(mem_states)
            target_qs[np.arange(len(target_qs)), mem_actions[:, 0].tolist()] = new_qs

        loss = self.criterion(self.model(mem_states), target_qs)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update the target network counter every episode
        if done:
            self.target_update_counter += 1

        # If the counter reaches the set value, update the target network with the weights of the main network
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.target_update_counter = 0

        return loss.item()

    def update_replay_memory(self,
                             states: ndarray,
                             agent_state: ndarray,
                             actions: Tuple[Optional[ndarray], Optional[ndarray]],
                             rewards: ndarray,
                             new_states: ndarray,
                             new_agent_states: ndarray,
                             done: bool):
        self.replay_memory.extend(states,
                                  agent_state,
                                  (actions[0] + self.rotations // 2, actions[1]),
                                  rewards,
                                  new_states,
                                  new_agent_states,
                                  done)

    def get_action(self,
                   state: ndarray,
                   training: bool) -> Tuple[Optional[ndarray], Optional[ndarray]]:
        if random.random() > self.epsilon or not training:
            # Ask the network for the next action
            with torch.no_grad():
                predict = torch.max(self.target_model(torch.Tensor(state)), dim=1).indices.numpy()
            rotation = predict - self.rotations // 2
        else:
            # Random turn
            rotation = np.random.randint(low=0, high=self.rotations, size=self.n_ants) - self.rotations // 2
        return rotation, None

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(torch.load('./agents/models/' + file_name))
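

# --- Illustration only (not part of the agents) ------------------------------
# A minimal, self-contained sketch of the Q-target construction used in
# ExploreAgentPytorch.train() above: the bootstrap term
# discount * max_a Q_target(s', a) is masked out for terminal transitions, and
# only the Q value of the action actually taken is overwritten before the MSE
# loss is computed. The function name, tensor shapes and values are made up
# for demonstration.
def _demo_q_target_update():
    import torch

    discount = 0.5
    rewards = torch.tensor([1.0, 0.0, 2.0])
    done = torch.tensor([False, True, False])
    actions = torch.tensor([0, 2, 1])

    # Stand-ins for target_model(new_states) and model(states), shape (batch, n_actions)
    future_qs = torch.tensor([[0.2, 0.5, 0.1],
                              [0.4, 0.3, 0.9],
                              [0.7, 0.6, 0.8]])
    target_qs = torch.tensor([[0.0, 0.1, 0.2],
                              [0.3, 0.4, 0.5],
                              [0.6, 0.7, 0.8]])

    max_future_qs = torch.max(future_qs, dim=1).values
    # ~done zeroes the discounted future term for terminal transitions.
    new_qs = rewards + discount * max_future_qs * ~done
    # Only the taken action's Q value is replaced; the other entries keep the
    # network's own predictions, so they contribute nothing to the loss.
    target_qs[torch.arange(len(target_qs)), actions] = new_qs
    return target_qs
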
class CollectAgentMemory(Agent):
    def __init__(self, epsilon=0.1, discount=0.5, rotations=3, pheromones=3, learning_rate=1e-4):
        super(CollectAgentMemory, self).__init__("collect_agent_memory")

        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.discount = discount
        self.rotations = rotations
        self.pheromones = pheromones

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with the last n steps for training
        self.replay_memory = None

        # Used to count when to update the target network with the main network's weights
        self.target_update_counter = 0

        self.state = None

        self.mem_size = 20
        self.agent_and_mem_space = None
        self.previous_memory = None

    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(CollectAgentMemory, self).setup(rl_api, trained_model)

        self.previous_memory = torch.zeros((rl_api.ants.n_ants, self.mem_size))
        self.agent_and_mem_space = [2 + self.mem_size]

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_and_mem_space,
                                          self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] + list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = CollectModelMemory(self.observation_space, self.agent_space,
                                        self.mem_size, self.rotations, self.pheromones)
        self.target_model = CollectModelMemory(self.observation_space, self.agent_space,
                                               self.mem_size, self.rotations, self.pheromones)

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, rl_api: RLApi):
        rl_api.ants.activate_all_pheromones(
            np.ones((self.n_ants,
                     len([obj for obj in rl_api.perceived_objects
                          if isinstance(obj, Pheromone)]))) * 10)

    def train(self, done: bool, step: int) -> float:
        # Start training only once a minimum number of samples has been saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0.0

        # Get a minibatch from replay memory
        mem_states, mem_agent_state, mem_actions, mem_rewards, mem_new_states, mem_new_agent_state, mem_done = \
            self.replay_memory.random_access(MINIBATCH_SIZE)

        with torch.no_grad():
            # Predict Q values for the targets (the agent's memory output is not used here)
            future_qs_rotation, future_qs_pheromones, _ = self.target_model(mem_new_states, mem_new_agent_state)
            target_qs_rotation, target_qs_pheromones, _ = self.model(mem_states, mem_agent_state)

            # Update the Q value for the rotation that was taken
            max_future_qs = torch.max(future_qs_rotation, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            target_qs_rotation[np.arange(len(target_qs_rotation)), mem_actions[:, 0].tolist()] = new_qs

            # Update the Q value for the pheromone that was taken
            max_future_qs = torch.max(future_qs_pheromones, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            target_qs_pheromones[np.arange(len(target_qs_pheromones)), mem_actions[:, 1].tolist()] = new_qs

        output = self.model(mem_states, mem_agent_state)
        loss_rotation = self.criterion(output[0], target_qs_rotation)
        loss_pheromones = self.criterion(output[1], target_qs_pheromones)
        loss = loss_rotation + loss_pheromones

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update the target network counter every episode
        if done:
            self.target_update_counter += 1

        # If the counter reaches the set value, update the target network with the weights of the main network
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.target_update_counter = 0

        return loss.item()

    def update_replay_memory(self,
                             states: ndarray,
                             agent_state: ndarray,
                             actions: Tuple[Optional[ndarray], Optional[ndarray], Optional[ndarray]],
                             rewards: ndarray,
                             new_states: ndarray,
                             new_agent_states: ndarray,
                             done: bool):
        self.replay_memory.extend(states,
                                  np.hstack([agent_state, self.previous_memory]),
                                  (actions[0] + self.rotations // 2, actions[1]),
                                  rewards,
                                  new_states,
                                  np.hstack([new_agent_states, actions[2]]),
                                  done)

    def get_action(self,
                   state: ndarray,
                   agent_state: ndarray,
                   training: bool) -> Tuple[Optional[ndarray], Optional[ndarray], Optional[ndarray]]:
        if random.random() > self.epsilon or not training:
            # Ask the network for the next action
            with torch.no_grad():
                # predict = torch.max(self.target_model(torch.Tensor(state)), dim=1).indices.numpy()
                qs_rotation, qs_pheromones, self.previous_memory = self.target_model(
                    torch.Tensor(state),
                    torch.cat([torch.Tensor(agent_state), self.previous_memory], dim=1))
            action_rot = torch.max(qs_rotation, dim=1).indices.numpy()
            action_phero = torch.max(qs_pheromones, dim=1).indices.numpy()
            rotation = action_rot - self.rotations // 2
            pheromone = action_phero
        else:
            # Random turn
            rotation = np.random.randint(low=0, high=self.rotations, size=self.n_ants) - self.rotations // 2
            # Random pheromones
            pheromone = np.random.randint(low=0, high=self.pheromones, size=self.n_ants)
            # We don't reset the memory to zero, we keep its previous value

        return rotation, pheromone, self.previous_memory.numpy()

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(torch.load('./agents/models/' + file_name))
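

# --- Illustration only (not part of the agents) ------------------------------
# A rough sketch of how CollectAgentMemory threads its learned memory vector
# through one action step, using a toy stand-in for CollectModelMemory. Only
# the assumed interface (state, agent_state + memory) ->
# (q_rotation, q_pheromone, new_memory) is reproduced; the function name,
# shapes and the stand-in model are hypothetical.
def _demo_memory_rollout():
    import torch

    n_ants, mem_size, rotations, pheromones = 4, 20, 3, 3
    state = torch.rand(n_ants, 5, 5)        # hypothetical local observation
    agent_state = torch.rand(n_ants, 2)     # hypothetical per-ant features
    memory = torch.zeros(n_ants, mem_size)  # starts at zero, like previous_memory

    def toy_model(obs, agent_and_mem):
        # Stand-in returning random Q values and an "updated" memory slice.
        q_rot = torch.rand(obs.shape[0], rotations)
        q_phero = torch.rand(obs.shape[0], pheromones)
        new_mem = torch.tanh(agent_and_mem[:, -mem_size:])
        return q_rot, q_phero, new_mem

    # The agent state and the memory from the previous step are concatenated,
    # and the model hands back the memory to feed into the next step.
    q_rot, q_phero, memory = toy_model(state, torch.cat([agent_state, memory], dim=1))
    rotation = torch.max(q_rot, dim=1).indices - rotations // 2
    pheromone = torch.max(q_phero, dim=1).indices
    return rotation, pheromone, memory
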
class CollectAgent(Agent):
    def __init__(self, epsilon=0.1, dis=0.5, rotations=3, pheromones=3, lr=1e-4):
        super(CollectAgent, self).__init__("collect_agent")

        self.lr = lr
        self.epsilon = epsilon
        self.dis = dis
        self.rotations = rotations
        self.pheromones = pheromones

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with the last n steps for training
        self.replay_memory = None

        # Used to count when to update the target network with the main network's weights
        self.update_target = 0

        self.state = None

        self.mem_size = 20
        self.agent_and_mem_space = None
        self.previous_memory = None

    def setup(self, base: Base, trained_model: Optional[str] = None):
        super(CollectAgent, self).setup(base, trained_model)

        self.previous_memory = torch.zeros((base.blobs.n_blobs, self.mem_size))
        self.agent_and_mem_space = [2 + self.mem_size]

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_and_mem_space,
                                          self.action_space)
        self.state = torch.zeros([base.blobs.n_blobs] + list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = Model(self.observation_space, self.agent_space,
                           self.mem_size, self.rotations, self.pheromones)
        self.target_model = Model(self.observation_space, self.agent_space,
                                  self.mem_size, self.rotations, self.pheromones)

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, base: Base):
        base.blobs.activate_all_pheromones(
            np.ones((self.n_blobs,
                     len([obj for obj in base.perceived_objects
                          if isinstance(obj, Pheromone)]))) * 10)

    def train(self, itr_done: bool, step: int) -> float:
        # Start training only once a minimum number of samples has been saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0.0

        # Get a minibatch from replay memory
        states, agent_state, actions, rewards, new_states, new_agent_state, done = \
            self.replay_memory.random_access(MINIBATCH_SIZE)

        with torch.no_grad():
            rotation_t, pheromones_t, _ = self.target_model(new_states, new_agent_state)
            rotation, pheromones, _ = self.model(states, agent_state)

            # Update the Q value for the rotation that was taken
            rotation_t = torch.max(rotation_t, dim=1).values
            tmp = rewards + self.dis * rotation_t * ~done
            rotation[np.arange(len(rotation)), actions[:, 0].tolist()] = tmp

            # Update the Q value for the pheromone that was taken
            pheromones_t = torch.max(pheromones_t, dim=1).values
            tmp = rewards + self.dis * pheromones_t * ~done
            pheromones[np.arange(len(pheromones)), actions[:, 1].tolist()] = tmp

        output = self.model(states, agent_state)
        loss_r = self.criterion(output[0], rotation)
        loss_pher = self.criterion(output[1], pheromones)
        loss = loss_r + loss_pher

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update the target network counter every episode
        if itr_done:
            self.update_target += 1

        # If the counter reaches the set value, update the target network with the weights of the main network
        if self.update_target >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.update_target = 0

        return loss.item()

    def update_replay_memory(self,
                             states: ndarray,
                             agent_state: ndarray,
                             actions: Tuple[Optional[ndarray], Optional[ndarray], Optional[ndarray]],
                             rewards: ndarray,
                             new_states: ndarray,
                             new_agent_states: ndarray,
                             done: bool):
        self.replay_memory.extend(states,
                                  np.hstack([agent_state, self.previous_memory]),
                                  (actions[0] + self.rotations // 2, actions[1]),
                                  rewards,
                                  new_states,
                                  np.hstack([new_agent_states, actions[2]]),
                                  done)

    def get_action(self,
                   state: ndarray,
                   agent_state: ndarray,
                   training: bool) -> Tuple[Optional[ndarray], Optional[ndarray], Optional[ndarray]]:
        if random.random() > self.epsilon or not training:
            # Ask the network for the next action
            with torch.no_grad():
                qs_rotation, qs_pheromones, self.previous_memory = self.target_model(
                    torch.Tensor(state),
                    torch.cat([torch.Tensor(agent_state), self.previous_memory], dim=1))
            action_rot = torch.max(qs_rotation, dim=1).indices.numpy()
            action_phero = torch.max(qs_pheromones, dim=1).indices.numpy()
            rotation = action_rot - self.rotations // 2
            pheromone = action_phero
        else:
            # Random turn and random pheromone
            rotation = np.random.randint(low=0, high=self.rotations, size=self.n_blobs) - self.rotations // 2
            pheromone = np.random.randint(low=0, high=self.pheromones, size=self.n_blobs)

        return rotation, pheromone, self.previous_memory.numpy()

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(torch.load('./agents/models/' + file_name))
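

# --- Illustration only (not part of the agents) ------------------------------
# Why update_replay_memory() adds rotations // 2 to the rotation action:
# get_action() returns rotations centred on zero (e.g. -1, 0, +1 for
# rotations=3), while the Q tables are indexed from 0, so the offset maps the
# actions back into column indices. The function name and values below are
# illustrative only.
def _demo_rotation_index_offset():
    import numpy as np

    rotations = 3
    rotation_actions = np.array([-1, 0, 1, -1])          # as returned by get_action()
    column_indices = rotation_actions + rotations // 2   # -> array([0, 1, 2, 0])
    return column_indices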