Code Example #1
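The snippets below are class definitions taken from a larger project; they assume module-level imports and hyperparameter constants roughly like the following. The project-specific classes (Agent, RLApi, Base, ReplayMemory, ExploreModel, CollectModelMemory, Model, Pheromone) come from the project's own modules, and the constant values shown here are placeholders, not the project's actual settings.

import random
from typing import Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
from numpy import ndarray

# Project-specific classes (Agent, RLApi, Base, ReplayMemory, ExploreModel,
# CollectModelMemory, Model, Pheromone) are imported from the project's own
# modules, which are not shown here.

# DQN hyperparameters; the values below are placeholders.
REPLAY_MEMORY_SIZE = 50_000       # max transitions kept in replay memory
MIN_REPLAY_MEMORY_SIZE = 1_000    # minimum transitions before training starts
MINIBATCH_SIZE = 64               # batch size sampled per training step
UPDATE_TARGET_EVERY = 5           # episodes between target-network syncs
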
class ExploreAgentPytorch(Agent):
    def __init__(self, epsilon=0.1, discount=0.5, rotations=3, pheromones=3):
        super(ExploreAgentPytorch, self).__init__("explore_agent_pytorch")

        self.epsilon = epsilon
        self.discount = discount
        self.rotations = rotations

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with last n steps for training
        # self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        self.replay_memory = None

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0
        self.state = None

    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(ExploreAgentPytorch, self).setup(rl_api, trained_model)

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_space, self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] +
                                 list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = ExploreModel(self.observation_space, self.agent_space,
                                  self.rotations)
        self.target_model = ExploreModel(self.observation_space,
                                         self.agent_space, self.rotations)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, rl_api: RLApi):
        # Activate every perceived pheromone channel with an initial value of 10
        n_pheromones = len([obj for obj in rl_api.perceived_objects
                            if isinstance(obj, Pheromone)])
        rl_api.ants.activate_all_pheromones(
            np.ones((self.n_ants, n_pheromones)) * 10)

    def train(self, done: bool, step: int) -> float:
        # Start training only once enough samples have been stored in replay memory
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0

        # Get a minibatch from replay memory
        (mem_states, mem_agent_state, mem_actions, mem_rewards, mem_new_states,
         mem_new_agent_state, mem_done) = self.replay_memory.random_access(
             MINIBATCH_SIZE)

        with torch.no_grad():
            future_qs = self.target_model(mem_new_states)

            # Non-terminal states get current reward plus discounted future reward
            max_future_qs = torch.max(future_qs, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done

            # Terminal states only get the current reward
            # new_qs += mem_rewards * mem_done

            target_qs = self.model(mem_states)

            # for i in range(MINIBATCH_SIZE):
            # 	target_qs[i, mem_actions[i]] = new_qs[i]

            target_qs[np.arange(len(target_qs)),
                      mem_actions[:, 0].tolist()] = new_qs

        loss = self.criterion(self.model(mem_states), target_qs)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network counter every episode
        if done:
            self.target_update_counter += 1

        # If the counter reaches the set value, copy the main network's weights into the target network
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.target_update_counter = 0

        return loss.item()

    def update_replay_memory(self, states: ndarray, agent_state: ndarray,
                             actions: Tuple[Optional[ndarray],
                                            Optional[ndarray]],
                             rewards: ndarray, new_states: ndarray,
                             new_agent_states: ndarray, done: bool):
        self.replay_memory.extend(
            states, agent_state,
            (actions[0] + self.rotations // 2, actions[1]), rewards,
            new_states, new_agent_states, done)

    def get_action(
            self, state: ndarray,
            training: bool) -> Tuple[Optional[ndarray], Optional[ndarray]]:
        if random.random() > self.epsilon or not training:
            # Ask network for next action
            with torch.no_grad():
                predict = torch.max(self.target_model(torch.Tensor(state)),
                                    dim=1).indices.numpy()
            rotation = predict - self.rotations // 2
        else:
            # Random turn
            rotation = np.random.randint(
                low=0, high=self.rotations,
                size=self.n_ants) - self.rotations // 2

        return rotation, None

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(
            torch.load('./agents/models/' + file_name))
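
For context, a driver loop for ExploreAgentPytorch could look roughly like the sketch below. Only the agent-side calls (the constructor, setup, initialize, get_action, update_replay_memory, train, and save_model) come from the class above; the environment-side calls (rl_api.observe() and rl_api.step()) and the loop bounds are hypothetical placeholders for whatever the project's RLApi actually exposes.

def run_training(rl_api, n_episodes=100, max_steps_per_episode=1000):
    """Hypothetical driver; rl_api.observe() / rl_api.step() are placeholder calls."""
    agent = ExploreAgentPytorch(epsilon=0.1, discount=0.5, rotations=3)
    agent.setup(rl_api)
    agent.initialize(rl_api)

    for episode in range(n_episodes):
        for step in range(max_steps_per_episode):
            states, agent_states = rl_api.observe()            # placeholder call
            actions = agent.get_action(states, training=True)  # (rotation, None)
            rewards, done = rl_api.step(actions)               # placeholder call
            new_states, new_agent_states = rl_api.observe()    # placeholder call
            agent.update_replay_memory(states, agent_states, actions, rewards,
                                       new_states, new_agent_states, done)
            loss = agent.train(done, step)  # returned loss can be logged
            if done:
                break

    agent.save_model('explore_agent.pt')
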
Code Example #2
class CollectAgentMemory(Agent):
    def __init__(self,
                 epsilon=0.1,
                 discount=0.5,
                 rotations=3,
                 pheromones=3,
                 learning_rate=1e-4):
        super(CollectAgentMemory, self).__init__("collect_agent_memory")

        self.learning_rate = learning_rate

        self.epsilon = epsilon
        self.discount = discount
        self.rotations = rotations
        self.pheromones = pheromones

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with last n steps for training
        self.replay_memory = None

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0
        self.state = None

        self.mem_size = 20
        self.agent_and_mem_space = None
        self.previous_memory = None

    def setup(self, rl_api: RLApi, trained_model: Optional[str] = None):
        super(CollectAgentMemory, self).setup(rl_api, trained_model)

        self.previous_memory = torch.zeros((rl_api.ants.n_ants, self.mem_size))
        self.agent_and_mem_space = [2 + self.mem_size]

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_and_mem_space,
                                          self.action_space)
        self.state = torch.zeros([rl_api.ants.n_ants] +
                                 list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = CollectModelMemory(self.observation_space,
                                        self.agent_space, self.mem_size,
                                        self.rotations, self.pheromones)
        self.target_model = CollectModelMemory(self.observation_space,
                                               self.agent_space, self.mem_size,
                                               self.rotations, self.pheromones)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.learning_rate)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, rl_api: RLApi):
        # Activate every perceived pheromone channel with an initial value of 10
        n_pheromones = len([obj for obj in rl_api.perceived_objects
                            if isinstance(obj, Pheromone)])
        rl_api.ants.activate_all_pheromones(
            np.ones((self.n_ants, n_pheromones)) * 10)

    def train(self, done: bool, step: int) -> float:
        # Start training only once enough samples have been stored in replay memory
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0

        # Get a minibatch from replay memory
        (mem_states, mem_agent_state, mem_actions, mem_rewards, mem_new_states,
         mem_new_agent_state, mem_done) = self.replay_memory.random_access(
             MINIBATCH_SIZE)

        with torch.no_grad():
            # Predict Q-values for the next and current states (the memory output is not used during training)
            future_qs_rotation, future_qs_pheromones, _ = self.target_model(
                mem_new_states, mem_new_agent_state)
            target_qs_rotation, target_qs_pheromones, _ = self.model(
                mem_states, mem_agent_state)

            # Update Q value for rotation
            max_future_qs = torch.max(future_qs_rotation, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            target_qs_rotation[np.arange(len(target_qs_rotation)),
                               mem_actions[:, 0].tolist()] = new_qs

            # Update Q value for pheromones
            max_future_qs = torch.max(future_qs_pheromones, dim=1).values
            new_qs = mem_rewards + self.discount * max_future_qs * ~mem_done
            target_qs_pheromones[np.arange(len(target_qs_pheromones)),
                                 mem_actions[:, 1].tolist()] = new_qs

        output = self.model(mem_states, mem_agent_state)
        loss_rotation = self.criterion(output[0], target_qs_rotation)
        loss_pheromones = self.criterion(output[1], target_qs_pheromones)
        loss = loss_rotation + loss_pheromones

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network counter every episode
        if done:
            self.target_update_counter += 1

        # If the counter reaches the set value, copy the main network's weights into the target network
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.target_update_counter = 0

        return loss.item()

    def update_replay_memory(self, states: ndarray, agent_state: ndarray,
                             actions: Tuple[Optional[ndarray],
                                            Optional[ndarray], ndarray],
                             rewards: ndarray, new_states: ndarray,
                             new_agent_states: ndarray, done: bool):
        self.replay_memory.extend(
            states, np.hstack([agent_state, self.previous_memory]),
            (actions[0] + self.rotations // 2, actions[1]), rewards,
            new_states, np.hstack([new_agent_states, actions[2]]), done)

    def get_action(
            self, state: ndarray, agent_state: ndarray, training: bool
    ) -> Tuple[Optional[ndarray], Optional[ndarray], ndarray]:
        if random.random() > self.epsilon or not training:
            # Ask network for next action
            with torch.no_grad():
                #predict = torch.max(self.target_model(torch.Tensor(state)), dim=1).indices.numpy()
                qs_rotation, qs_pheromones, self.previous_memory = self.target_model(
                    torch.Tensor(state),
                    torch.cat(
                        [torch.Tensor(agent_state), self.previous_memory],
                        dim=1))
                action_rot = torch.max(qs_rotation, dim=1).indices.numpy()
                action_phero = torch.max(qs_pheromones, dim=1).indices.numpy()
            rotation = action_rot - self.rotations // 2
            pheromone = action_phero
        else:
            # Random turn
            rotation = np.random.randint(
                low=0, high=self.rotations,
                size=self.n_ants) - self.rotations // 2
            # Random pheromones
            pheromone = np.random.randint(low=0,
                                          high=self.pheromones,
                                          size=self.n_ants)
            # We don't reset the memory to zero; we keep its previous value

        return rotation, pheromone, self.previous_memory.numpy()

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(
            torch.load('./agents/models/' + file_name))
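
CollectModelMemory itself is not shown in these snippets. Judging only from how it is constructed and called above, it takes the observation plus the 2-dimensional agent state concatenated with a mem_size memory vector, and returns three tensors: rotation Q-values, pheromone Q-values, and the next memory vector. The sketch below is a hypothetical minimal stand-in with an invented MLP body, included only to make that interface concrete; it is not the project's implementation.

import numpy as np
import torch
import torch.nn as nn

class CollectModelMemorySketch(nn.Module):
    """Hypothetical stand-in with the same call signature as CollectModelMemory."""

    def __init__(self, observation_space, agent_space, mem_size, rotations,
                 pheromones, hidden=128):
        super().__init__()
        obs_dim = int(np.prod(observation_space))
        agent_dim = int(np.prod(agent_space)) + mem_size  # agent state + memory
        self.backbone = nn.Sequential(
            nn.Linear(obs_dim + agent_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU())
        self.rotation_head = nn.Linear(hidden, rotations)    # Q-values per rotation
        self.pheromone_head = nn.Linear(hidden, pheromones)  # Q-values per pheromone
        self.memory_head = nn.Linear(hidden, mem_size)       # next memory vector

    def forward(self, states, agent_and_memory):
        x = torch.cat([states.flatten(start_dim=1), agent_and_memory], dim=1)
        h = self.backbone(x)
        return (self.rotation_head(h), self.pheromone_head(h),
                torch.tanh(self.memory_head(h)))
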
Code Example #3
class CollectAgent(Agent):
    def __init__(self,
                 epsilon=0.1,
                 dis=0.5,
                 rotations=3,
                 pheromones=3,
                 lr=1e-4):
        super(CollectAgent, self).__init__("collect_agent")

        self.lr = lr

        self.epsilon = epsilon
        self.dis = dis
        self.rotations = rotations
        self.pheromones = pheromones

        self.model = None
        self.target_model = None
        self.criterion = None
        self.optimizer = None

        # An array with last n steps for training
        self.replay_memory = None

        # Used to count when to update target network with main network's weights
        self.update_target = 0
        self.state = None

        self.mem_size = 20
        self.agent_and_mem_space = None
        self.previous_memory = None

    def setup(self, base: Base, trained_model: Optional[str] = None):
        super(CollectAgent, self).setup(base, trained_model)

        self.previous_memory = torch.zeros((base.blobs.n_blobs, self.mem_size))
        self.agent_and_mem_space = [2 + self.mem_size]

        self.replay_memory = ReplayMemory(REPLAY_MEMORY_SIZE,
                                          self.observation_space,
                                          self.agent_and_mem_space,
                                          self.action_space)
        self.state = torch.zeros([base.blobs.n_blobs] +
                                 list(self.observation_space),
                                 dtype=torch.float32)

        # Main model
        self.model = Model(self.observation_space, self.agent_space,
                           self.mem_size, self.rotations, self.pheromones)
        self.target_model = Model(self.observation_space, self.agent_space,
                                  self.mem_size, self.rotations,
                                  self.pheromones)
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        if trained_model is not None:
            self.load_model(trained_model)

        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

    def initialize(self, base: Base):
        # Activate every perceived pheromone channel with an initial value of 10
        n_pheromones = len([obj for obj in base.perceived_objects
                            if isinstance(obj, Pheromone)])
        base.blobs.activate_all_pheromones(
            np.ones((self.n_blobs, n_pheromones)) * 10)

    def train(self, itr_done: bool, step: int) -> float:
        # Start training only once enough samples have been stored in replay memory
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return 0

        # Get a minibatch from replay memory
        (states, agent_state, actions, rewards, new_states, new_agent_state,
         done) = self.replay_memory.random_access(MINIBATCH_SIZE)

        with torch.no_grad():
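            # Predict Q-values with the target network (next states) and the online model (current states)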
            rotation_t, pheromones_t, _ = self.target_model(
                new_states, new_agent_state)
            rotation, pheromones, _ = self.model(states, agent_state)

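            # Update the Q-value targets for the rotation head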
            rotation_t = torch.max(rotation_t, dim=1).values
            tmp = rewards + self.dis * rotation_t * ~done
            rotation[np.arange(len(rotation)),
                     actions[:, 0].tolist()] = tmp

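            # Update the Q-value targets for the pheromone head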
            pheromones_t = torch.max(pheromones_t, dim=1).values
            tmp = rewards + self.dis * pheromones_t * ~done
            pheromones[np.arange(len(pheromones)),
                       actions[:, 1].tolist()] = tmp

        output = self.model(states, agent_state)
        loss_r = self.criterion(output[0], rotation)
        loss_pher = self.criterion(output[1], pheromones)
        loss = loss_r + loss_pher

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

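        # Count finished episodes to decide when to sync the target network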
        if itr_done:
            self.update_target += 1

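        # Once the counter reaches the threshold, copy the main network's weights into the target network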
        if self.update_target >= UPDATE_TARGET_EVERY:
            self.target_model.load_state_dict(self.model.state_dict())
            self.target_model.eval()
            self.update_target = 0

        return loss.item()

    def update_replay_memory(self, states: ndarray, agent_state: ndarray,
                             actions: Tuple[Optional[ndarray],
                                            Optional[ndarray], ndarray],
                             rewards: ndarray, new_states: ndarray,
                             new_agent_states: ndarray, done: bool):
        self.replay_memory.extend(
            states, np.hstack([agent_state, self.previous_memory]),
            (actions[0] + self.rotations // 2, actions[1]), rewards,
            new_states, np.hstack([new_agent_states, actions[2]]), done)

    def get_action(
            self, state: ndarray, agent_state: ndarray, training: bool
    ) -> Tuple[Optional[ndarray], Optional[ndarray], ndarray]:
        if random.random() > self.epsilon or not training:
            with torch.no_grad():
                qs_rotation, qs_pheromones, self.previous_memory = self.target_model(
                    torch.Tensor(state),
                    torch.cat(
                        [torch.Tensor(agent_state), self.previous_memory],
                        dim=1))
                action_rot = torch.max(qs_rotation, dim=1).indices.numpy()
                action_phero = torch.max(qs_pheromones, dim=1).indices.numpy()
            rotation = action_rot - self.rotations // 2
            pheromone = action_phero
        else:
            rotation = np.random.randint(
                low=0, high=self.rotations,
                size=self.n_blobs) - self.rotations // 2
            pheromone = np.random.randint(low=0,
                                          high=self.pheromones,
                                          size=self.n_blobs)

        return rotation, pheromone, self.previous_memory.numpy()

    def save_model(self, file_name: str):
        torch.save(self.model.state_dict(), './agents/models/' + file_name)

    def load_model(self, file_name: str):
        self.model.load_state_dict(torch.load('./agents/models/' + file_name))
        self.target_model.load_state_dict(
            torch.load('./agents/models/' + file_name))
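
All three agents depend on a ReplayMemory class that is also not shown. From the calls above it needs a constructor taking (capacity, observation shape, agent-state shape, action shape), an extend() that appends one transition per agent, a random_access() that returns a minibatch as seven tensors (with actions as an integer matrix and done as a boolean tensor, since the training code uses mem_actions[:, 0] and ~mem_done), and __len__. The following is a hypothetical minimal stand-in written against that inferred interface, not the project's implementation.

import random
from collections import deque

import numpy as np
import torch


class ReplayMemorySketch:
    """Hypothetical minimal stand-in for the ReplayMemory used by the agents."""

    def __init__(self, capacity, observation_space, agent_space, action_space):
        # The shape arguments are accepted for interface compatibility but are
        # not needed by this simple deque-backed sketch.
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def extend(self, states, agent_states, actions, rewards, new_states,
               new_agent_states, done):
        # One transition per agent; `actions` is a tuple of per-agent index
        # arrays, where a missing action head is passed as None.
        cols = [a if a is not None else np.zeros(len(states), dtype=np.int64)
                for a in actions]
        action_matrix = np.stack(cols, axis=1)
        for i in range(len(states)):
            self.buffer.append((states[i], agent_states[i], action_matrix[i],
                                rewards[i], new_states[i], new_agent_states[i],
                                done))

    def random_access(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        s, a_s, act, r, ns, n_as, d = zip(*batch)
        return (torch.as_tensor(np.stack(s), dtype=torch.float32),
                torch.as_tensor(np.stack(a_s), dtype=torch.float32),
                torch.as_tensor(np.stack(act), dtype=torch.int64),
                torch.as_tensor(np.array(r), dtype=torch.float32),
                torch.as_tensor(np.stack(ns), dtype=torch.float32),
                torch.as_tensor(np.stack(n_as), dtype=torch.float32),
                torch.as_tensor(np.array(d), dtype=torch.bool))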