コード例 #1
0
class MultiAgentPlanner():
    """Q-learning agent that screens its actions by estimated collision risk.

    The agent keeps three Q-networks (nominal, lower and upper) with matching
    target networks, a Gaussian-process reward model fitted on its own recent
    states, and one (epsilon, tau_explore, tau_exploit) triple per agent that
    is fitted to that agent's observed trajectory and used to bound the
    probability of each of its actions.
    """

    def __init__(self, index, reward_threshold, collision_threshold,
                 world_size, states, num_agents, collision_distance):
        """Build the reward model, replay buffer and Q-networks.

        Args:
            index: Position of this agent inside ``states``.
            reward_threshold: Rewards below this value are counted as unsafe.
            collision_threshold: Minimum no-collision score an action needs
                to stay eligible in ``choose_action``.
            world_size: Initial length scale of the RBF kernel; also passed
                to ``get_action`` when decoding observed movements.
            states: Current states of all agents, indexed by agent.
            num_agents: Number of agents, this one included.
            collision_distance: Two agents closer than this are colliding.
        """
        self.index = index
        self.name = 'multi safe q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        kernel = RBF(length_scale=world_size,
                     length_scale_bounds=[(1e1, 1e5), (1e1, 1e5),
                                          (1e1, 1e5)]) \
            + WhiteKernel(noise_level=1)
        self.reward_gp = GaussianProcessRegressor(kernel=kernel)
        self.reward_threshold = reward_threshold
        self.collision_threshold = collision_threshold
        self.collision_distance = collision_distance
        self.trajs = [[] for _ in range(num_agents)]
        self.my_states = []
        self.action_traj = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1

        # Layer sizes handed to MLP; presumably 3 state inputs and 7 action
        # outputs -- confirm against the MLP definition.
        self.dimensions = [3, 50, 50, 7]
        self.dqn = MLP(self.dimensions).double()
        self.dqn_l = MLP(self.dimensions).double()
        self.dqn_u = MLP(self.dimensions).double()
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.optimizer_l = optim.RMSprop(self.dqn_l.parameters())
        self.optimizer_u = optim.RMSprop(self.dqn_u.parameters())

        self.target = MLP(self.dimensions).double()
        self.target_l = MLP(self.dimensions).double()
        self.target_u = MLP(self.dimensions).double()
        self._sync_targets()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        # Stored only: the optimizers above are created without an explicit
        # learning rate, so this value is not applied to them.
        self.lr = 1e-3
        self.epsilons = [0. for _ in range(num_agents)]
        self.tau_exploits = [1. for _ in range(num_agents)]
        self.tau_explores = [1. for _ in range(num_agents)]
        self.num_collisions = 0
        self.num_unsafe = 0
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore=False):
        """Pick the next action among those judged safe from collisions.

        Every action receives a no-collision score: the product of
        ``1 - p`` over all moves of the other agents that would end within
        ``collision_distance`` of this agent's next state, where ``p`` is the
        bound returned by ``_get_policy``. Actions scoring below
        ``collision_threshold`` are discarded. The remaining ones are sampled
        uniformly when exploring and ranked by the target network otherwise.
        If nothing remains, the action with the highest score is used.

        The reward lower bound is not used for filtering here.

        Args:
            explore: Force a random choice among the eligible actions.

        Returns:
            The selected action, which is also appended to ``action_traj``.
        """
        own_state = self.states[self.index]
        next_states = [(a, self._move_coordinate(own_state, a))
                       for a in Action.SET]
        no_collision_ls = [1.0 for _ in Action.SET]
        best_action = Action.STAY

        # The bound for (agent, agent_action) does not depend on our own
        # action, so evaluate the networks once per pair.
        policy_cache = {}
        for action, next_state in next_states:
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue

                cur_agent_state = self.states[agent]
                for agent_action in Action.SET:
                    possible_next_agent_state = cur_agent_state + get_movement(
                        agent_action)
                    distance = np.linalg.norm(possible_next_agent_state -
                                              next_state)
                    # Only moves that end inside the collision radius reduce
                    # the chance of staying collision free.
                    if distance >= self.collision_distance:
                        continue

                    key = (agent, agent_action)
                    if key not in policy_cache:
                        policy_cache[key] = self._get_policy(
                            agent, agent_action)
                    no_collision_ls[action] *= (1 - policy_cache[key])

        possible_actions = [
            action for action in Action.SET
            if not no_collision_ls[action] < self.collision_threshold
        ]

        if explore or np.random.binomial(1, self.eps) == 1:
            if possible_actions:
                best_action = possible_actions[np.random.choice(
                    len(possible_actions))]
        else:
            q_values = self.target(
                torch.tensor(own_state, dtype=torch.double)).tolist()
            best_q = -math.inf
            for action in possible_actions:
                if q_values[action] > best_q:
                    best_action = action
                    best_q = q_values[action]

        if not possible_actions:
            # Nothing passed the threshold: fall back to the least risky one.
            best_action = np.argmax(no_collision_ls)

        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        """Record the latest transition and refresh the running statistics.

        Args:
            reward: Reward obtained for the last chosen action.
            states: States of all agents after the step.
        """
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))

        # Keep the window used to fit the reward model bounded.
        if len(self.rewards) > 50:
            self.rewards.pop(0)
            self.my_states.pop(0)

        self.rewards.append(reward)
        self.states = states

        if self._has_collision(states):
            self.num_collisions += 1

        if reward < self.reward_threshold:
            self.num_unsafe += 1

        for i in range(self.num_agents):
            self.trajs[i].append(states[i])
        self.my_states.append(states[self.index])

        self.cum_rewards += reward

    def reset(self, states):
        """Start a new episode: sync targets and clear per-episode history.

        The fitted policy parameters, reward window and cumulative reward
        are kept across episodes.
        """
        self._sync_targets()
        self.trajs = [[] for _ in range(self.num_agents)]
        self.action_traj = []
        self.states = states

    def learn_from_buffer(self):
        """Refit the reward model, train the Q-networks and policy fits."""
        self.reward_gp.fit(self.my_states, self.rewards)
        self._value_func_estimate()
        for agent in range(self.num_agents):
            if agent == self.index:
                continue
            self._optimize_parameters(agent)

    def _has_collision(self, states):
        """Return True if any other agent is within collision distance."""
        own_state = states[self.index]
        return any(
            np.linalg.norm(state - own_state) < self.collision_distance
            for i, state in enumerate(states) if i != self.index)

    def _sync_targets(self):
        """Copy every online network into its target and set eval mode."""
        for target, online in ((self.target, self.dqn),
                               (self.target_u, self.dqn_u),
                               (self.target_l, self.dqn_l)):
            target.load_state_dict(online.state_dict())
            target.eval()

    def _td_update(self, net, target_net, optimizer, state_batch,
                   action_batch, next_state_batch, reward_batch):
        """Run one clipped-gradient TD step of ``net`` on a sampled batch."""
        state_action_values = net(state_batch).gather(1, action_batch)
        next_state_values = target_net(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        optimizer.zero_grad()
        loss.backward()
        for param in net.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()

    def _value_func_estimate(self):
        """Train the three Q-networks on one batch of 32 transitions.

        The nominal network uses the observed rewards; the upper and lower
        networks use the reward model's mean plus/minus ``beta`` standard
        deviations. Targets are re-synced every tenth call. Does nothing
        until the buffer holds at least 32 transitions.
        """
        if len(self.buffer) < 32:
            return

        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)

        # One batched prediction instead of one call per sampled state.
        mean, std = self.reward_gp.predict(state_batch.numpy(),
                                           return_std=True)
        reward_l_batch = torch.tensor(mean - self.beta * std,
                                      dtype=torch.double)
        reward_u_batch = torch.tensor(mean + self.beta * std,
                                      dtype=torch.double)

        self._td_update(self.dqn, self.target, self.optimizer, state_batch,
                        action_batch, next_state_batch, reward_batch)
        self._td_update(self.dqn_u, self.target_u, self.optimizer_u,
                        state_batch, action_batch, next_state_batch,
                        reward_u_batch)
        self._td_update(self.dqn_l, self.target_l, self.optimizer_l,
                        state_batch, action_batch, next_state_batch,
                        reward_l_batch)

        if self.target_usage == 10:
            self.target_usage = 0
            self._sync_targets()

    def _move_coordinate(self, state, action):
        """Return ``state`` displaced by the movement of ``action``."""
        movement = get_movement(action)
        return movement + state

    def _get_policy(self, agent, action):
        """Bound the probability that ``agent`` takes ``action``.

        Uses the parameters currently fitted for that agent.
        """
        epsilon = self.epsilons[agent]
        tau_explore = self.tau_explores[agent]
        tau_exploit = self.tau_exploits[agent]
        return self._compute_policy_upperbound(epsilon, tau_explore,
                                               tau_exploit, agent, action)

    def _returnable(self, state):
        """Tell whether some action from ``state`` keeps the reward safe.

        Returns True if, for at least one action, the reward model's lower
        bound at the resulting state reaches ``reward_threshold``.
        """
        for a in Action.SET:
            next_state = self._move_coordinate(state, a)
            reward, std = self.reward_gp.predict(np.array([next_state]), True)
            reward = reward[0]
            std = std[0]
            if reward - self.beta * std >= self.reward_threshold:
                return True

        return False

    def _optimize_parameters(self, agent):
        """Fit (epsilon, tau_explore, tau_exploit) to ``agent``'s last moves.

        Minimises the negative log-likelihood of the actions decoded from
        the last 20 trajectory points with L-BFGS-B. The stored parameters
        are left untouched when the optimiser returns the initial guess.
        """
        traj = self.trajs[agent]
        if len(traj) > 20:
            traj = traj[len(traj) - 20:]

        # The observed actions do not depend on the parameters being fitted,
        # so decode them once instead of on every objective evaluation.
        observed_actions = []
        for prev_state, cur_state in zip(traj, traj[1:]):
            movement = np.rint(cur_state - prev_state)
            action = get_action(movement, self.world_size)
            if action != -1:
                observed_actions.append(action)

        def _negative_log_likelihood(parameters):
            epsilon = parameters[0]
            tau_explore = parameters[1]
            tau_exploit = parameters[2]

            # Sum logs rather than taking the log of a product, which can
            # underflow to zero for long or unlikely trajectories.
            log_likelihood = 0.0
            for action in observed_actions:
                log_likelihood += np.log(self._compute_policy_upperbound(
                    epsilon, tau_explore, tau_exploit, agent, action))

            return -log_likelihood

        initial_guess = np.array([0.5, 1.0, 1.0])
        res = minimize(_negative_log_likelihood,
                       initial_guess,
                       method='L-BFGS-B',
                       bounds=np.array([(1e-6, 1.0), (0.1, 10.0),
                                        (0.1, 10.0)]))

        if not np.all(np.equal(res.x, initial_guess)):
            self.epsilons[agent] = res.x[0]
            self.tau_explores[agent] = res.x[1]
            self.tau_exploits[agent] = res.x[2]

    def _compute_policy_upperbound(self, epsilon, tau_explore, tau_exploit,
                                   agent, action):
        """Mix two softmax-style bounds on ``agent`` choosing ``action``.

        Both terms are evaluated at ``agent``'s current state. The
        exploration term uses the upper Q-value for ``action`` against the
        nominal values of the other actions; the exploitation term uses the
        nominal Q-value for ``action`` against the lower values of the
        others. Each exponent is shifted by a mean Q-value before
        exponentiation.

        Returns:
            ``epsilon * prob_ofu + (1 - epsilon) * prob_boltz``.
        """
        state = torch.tensor(self.states[agent], dtype=torch.double)
        q = self.dqn(state).detach().numpy()
        q_u = self.dqn_u(state).detach().numpy()
        q_l = self.dqn_l(state).detach().numpy()

        ofu_denom = copy.copy(q)
        ofu_denom[action] = q_u[action]

        boltz_denom = copy.copy(q_l)
        boltz_denom[action] = q[action]

        explore_mean_q = np.mean(q_u / tau_explore)
        prob_ofu = np.exp(q_u[action] / tau_explore - explore_mean_q) / np.sum(
            np.exp(ofu_denom / tau_explore - explore_mean_q))

        exploit_mean_q = np.mean(q / tau_exploit)
        prob_boltz = np.exp(q[action] / tau_exploit - exploit_mean_q) / np.sum(
            np.exp(boltz_denom / tau_exploit - exploit_mean_q))

        return epsilon * prob_ofu + (1 - epsilon) * prob_boltz
コード例 #2
0
ファイル: q_agent.py プロジェクト: BillMatrix/multi_safe_mdp
class QLearningAgent():
    """Epsilon-greedy DQN agent with a replay buffer and a target network."""

    def __init__(self, index, world_size, states, num_agents,
                 collision_distance):
        """Store the environment description and create the Q-networks.

        Args:
            index: Position of this agent inside ``states``.
            world_size: Kept on the instance; not read by this class.
            states: Current states of all agents, indexed by agent.
            num_agents: Kept on the instance; not read by this class.
            collision_distance: Two agents closer than this are colliding.
        """
        # Identity and environment description.
        self.name = 'q agent'
        self.index = index
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.collision_distance = collision_distance

        # Hyper-parameters. ``lr`` is only stored: the optimizer below is
        # created without an explicit learning rate.
        self.gamma = 0.9
        self.beta = 1
        self.eps = 0.1
        self.lr = 1e-4

        # Running statistics.
        self.rewards = []
        self.action_traj = []
        self.num_collisions = 0
        self.cum_rewards = 0
        self.target_usage = 0

        # Replay memory, online network and its target copy.
        self.buffer = ReplayMemory(10000)
        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self._refresh_target()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(self.dqn.parameters())

    def _refresh_target(self):
        """Load the online weights into the target net; set eval mode."""
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()

    def choose_action(self, explore):
        """Return the greedy target-network action or a random one.

        A uniformly random action is taken when ``explore`` is truthy or
        with probability ``eps``. The result is appended to ``action_traj``.
        """
        candidates = list(copy.copy(Action.SET))
        own_state = torch.tensor(self.states[self.index], dtype=torch.double)
        q_values = self.target(own_state).tolist()
        chosen = np.argmax(q_values)

        if explore or np.random.binomial(1, self.eps) == 1:
            pick = np.random.choice(len(candidates))
            chosen = candidates[pick]

        self.action_traj.append(chosen)
        return chosen

    def update_buffer(self, reward, states):
        """Store the latest transition and update reward/collision counts.

        Args:
            reward: Reward obtained for the last chosen action.
            states: States of all agents after the step.
        """
        previous = torch.tensor([self.states[self.index]], dtype=torch.double)
        action = torch.tensor([[self.action_traj[-1]]], dtype=torch.long)
        gained = torch.tensor([reward], dtype=torch.double)
        current = torch.tensor([states[self.index]], dtype=torch.double)
        self.buffer.push(previous, action, gained, current)

        self.rewards.append(reward)
        self.states = states

        # At most one collision is counted per step.
        own_state = states[self.index]
        for other, other_state in enumerate(states):
            if other != self.index and np.linalg.norm(
                    other_state - own_state) < self.collision_distance:
                self.num_collisions += 1
                break

        self.cum_rewards += reward

    def learn_from_buffer(self):
        """Run one training step on the replay buffer."""
        self._value_func_estimate()

    def reset(self, states):
        """Begin a new episode: sync the target and clear episode history."""
        self._refresh_target()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        """Do one clipped-gradient TD update on a batch of 32 transitions.

        Returns immediately while the buffer holds fewer than 32 entries.
        The target network is re-synced on every tenth update.
        """
        if len(self.buffer) < 32:
            return

        self.target_usage += 1
        sampled = Transition(*zip(*self.buffer.sample(32)))

        state_batch = torch.cat(sampled.state)
        action_batch = torch.cat(sampled.action)
        reward_batch = torch.cat(sampled.reward)
        next_state_batch = torch.cat(sampled.next_state)

        predicted = self.dqn(state_batch).gather(1, action_batch)
        bootstrap = self.target(next_state_batch).max(1)[0].detach()
        td_target = (bootstrap * self.gamma) + reward_batch

        loss = self.loss_fn(predicted, td_target.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for weight in self.dqn.parameters():
            weight.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self._refresh_target()

    def _move_coordinate(self, state, action):
        """Return ``state`` displaced by the movement of ``action``."""
        return get_movement(action) + state
コード例 #3
0
class YahooDQNAgent():
    """DQN agent that scores feed candidates and picks one per step.

    The network input is a flat vector made of the features of the items
    shown so far, the features of the candidate under evaluation and the
    user features; unused positions hold -1.
    """

    # Softmax temperature applied to the scores for Boltzmann exploration.
    BOLTZMANN_TEMPERATURE = 0.05

    def __init__(
            self,
            initial_feed_candidates,
            user_features,
            feed_counts,
            agent_name: str,
            feed_feature_count = 6,
            user_feature_count = 6,
            model_dims: List[int] = [50, 25],
            lr: float = 1e-3,
            boltzmann: bool = True,
            epsilon: float = 0.05,
            batch_size: int = 128,
    ):
        """Create the online/target networks and the replay memory.

        Args:
            initial_feed_candidates: Candidates offered at the first step;
                each must expose a ``features`` attribute.
            user_features: Feature values written at the end of the input.
            feed_counts: Number of history slots in the input vector.
            agent_name: Label stored on the instance.
            feed_feature_count: Number of features per feed item.
            user_feature_count: Number of user features.
            model_dims: Hidden layer sizes (read only, never mutated).
            lr: Learning rate of the Adam optimizer.
            boltzmann: Sample actions from a softmax over the scores instead
                of epsilon-greedy selection.
            epsilon: Initial epsilon-greedy exploration rate; decayed in
                ``learn_from_buffer``.
            batch_size: Number of transitions per training batch.
        """
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.agent_name = agent_name
        self.interest_level = 0

        self.cum_rewards: float = 0.
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        # History slots + one candidate slot + user features.
        self.num_features = feed_counts * feed_feature_count + feed_feature_count + user_feature_count
        self.training_data: ReplayMemory = ReplayMemory(100000)

        self.model_dims: List[int] = [self.num_features] + model_dims + [1]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)

        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_actions = []
        self.latest_feature = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []

    def _build_candidate_features(self):
        """Return one input row per entry of ``current_feed_candidates``.

        Every row shares the same history and user features and differs only
        in the candidate slot, which follows the ``feed_counts`` history
        slots.
        """
        width = self.feed_feature_count
        base = [-1. for _ in range(self.num_features)]
        for slot, shown in enumerate(self.history_actions):
            base[slot * width:(slot + 1) * width] = shown
        base[-self.user_feature_count:] = self.user_features

        start = self.feed_counts * width
        rows = []
        for candidate in self.current_feed_candidates:
            row = np.copy(base)
            row[start:start + width] = candidate.features
            rows.append(row)
        return np.array(rows)

    def choose_action(self):
        """Select one of the current candidates and record it as shown.

        With ``boltzmann`` set the candidate is sampled from a softmax over
        the scores; otherwise the best-scoring candidate is taken, replaced
        by a uniformly random one with probability ``epsilon``.

        Returns:
            The chosen element of ``current_feed_candidates``.
        """
        candidate_features = self._build_candidate_features()
        num_candidates = len(self.current_feed_candidates)

        with torch.no_grad():
            outcomes = self.model(
                torch.tensor(candidate_features, dtype=torch.double).to(device)
            )

            _, best_index = torch.max(outcomes, 0)
            best_index = best_index.item()

            if self.boltzmann:
                scaled = outcomes / self.BOLTZMANN_TEMPERATURE
                probabilities = torch.nn.functional.softmax(
                    scaled.reshape((num_candidates)), dim=0).cpu().numpy()
                best_index = np.random.choice(num_candidates, p=probabilities)
            elif np.random.rand() < self.epsilon:
                # Use the configured (and decayed) rate rather than a fixed
                # constant, so the ``epsilon`` argument takes effect.
                best_index = np.random.choice(num_candidates)

        best_action = self.current_feed_candidates[best_index]
        self.latest_feature = candidate_features[best_index]
        self.history_actions.append(best_action.features)

        self.current_feed += 1
        return best_action

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
        new_batch
    ):
        """Store the outcome of the last action in the replay memory.

        Args:
            scroll: Whether the session continues. When false the transition
                is stored with ``None`` as its next state.
            reward: Reward obtained for the last action.
            new_batch: Candidates available at the next step.
        """
        self.cum_rewards += reward
        self.current_feed_candidates = new_batch

        state = torch.tensor([self.latest_feature], dtype=torch.double).to(device)
        gained = torch.tensor([reward], dtype=torch.double).to(device)
        if not scroll:
            self.training_data.push(state, gained, None)
            return

        # The next state is the full matrix of inputs for the new candidates.
        candidate_features = self._build_candidate_features()
        self.training_data.push(
            state,
            gained,
            torch.tensor([candidate_features], dtype=torch.double).to(device),
        )

    def learn_from_buffer(self):
        """Run ten clipped-gradient TD updates and decay ``epsilon``.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Transitions without a next state bootstrap from zero.
        """
        if len(self.training_data) < self.batch_size:
            return

        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.training_data.sample(self.batch_size)
            batch = Transition(*zip(*transitions))

            state_batch = torch.cat(batch.state)
            reward_batch = torch.cat(batch.reward)
            state_action_values = self.model(state_batch)

            next_state_values = torch.zeros(self.batch_size, device=device, dtype=torch.double)
            non_final_next_states = [s for s in batch.next_state if s is not None]
            if non_final_next_states:
                non_final_mask = torch.tensor(
                    [s is not None for s in batch.next_state],
                    device=device, dtype=torch.bool)
                next_state_values[non_final_mask] = self.target_net(
                    torch.cat(non_final_next_states)
                ).max(1)[0].reshape((-1)).detach()

            expected_state_action_values = self.gamma * next_state_values + reward_batch

            loss = self.loss_fn(state_action_values, expected_state_action_values.unsqueeze(1))
            loss_ensemble += loss.item()

            self.optimizer.zero_grad()
            loss.backward()

            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
        self.epsilon = 0.999 * self.epsilon

    def reset(self, user_features, initial_feeds, user_embedding):
        """Close the current episode and prepare the next one.

        The episode's cumulative reward is appended to
        ``cum_reward_history`` before it is cleared, and the target network
        is synced with the online network.

        Args:
            user_features: User features for the next episode.
            initial_feeds: Candidates offered at the first step.
            user_embedding: Accepted but not used by this method.
        """
        # Record the finished episode before wiping its total.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.current_feed_candidates = initial_feeds
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_actions = []
        self.current_feed = 0
        self.user_features = user_features
コード例 #4
0
class DQNAgent(Agent):
    """DQN agent taking a binary decision (0 or 1) for each feed unit.

    The state is one value per feed unit: -1 for units not reached yet, 0
    for units already passed and 1 for units where action 1 was taken.
    """

    def __init__(
        self,
        feed_units: List[int],
        agent_name: str,
        model_dims: List[int] = [],
        lr: float = 1e-3,
        boltzmann: bool = False,
        epsilon: float = 0.05,
        batch_size: int = 128,
    ):
        """Create the online/target networks and the replay memory.

        Args:
            feed_units: Feed units; its length fixes the state size.
            agent_name: Label stored on the instance.
            model_dims: Hidden layer sizes (read only, never mutated).
            lr: Learning rate of the Adam optimizer.
            boltzmann: Stored on the instance; not consulted by
                ``choose_action``.
            epsilon: Initial exploration rate, decayed in
                ``learn_from_buffer``.
            batch_size: Number of transitions per training batch.
        """
        self.feed_units = copy.deepcopy(feed_units)
        self.agent_name = agent_name
        self.interest_level = 0

        self.cum_rewards: float = 0.
        self.num_features: int = len(feed_units)
        self.training_data: ReplayMemory = ReplayMemory(100000)

        self.model_dims: List[int] = [self.num_features] + model_dims + [2]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)

        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.latest_feature = None
        self.latest_action = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []
        self.current_loc = [0, 0]

    def _build_features(self) -> List[float]:
        """Encode the current position and past choices as a state vector."""
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.
        return features

    def choose_action(self):
        """Pick action 0 or 1 for the current feed unit.

        The greedy action is replaced by a uniformly random one with
        probability ``epsilon``. The action that is returned is the one
        recorded in ``latest_action`` and ``history_unit_indices``, so the
        replay memory matches what was actually executed.

        Returns:
            The chosen action, 0 or 1.
        """
        available_actions = [0, 1]
        features = self._build_features()

        with torch.no_grad():
            outcomes = self.model(
                torch.tensor(features, dtype=torch.double).to(device))
            _, best_index = torch.max(outcomes, 0)
            action = available_actions[best_index.item()]

        if np.random.rand() < self.epsilon:
            action = int(np.random.randint(2))

        self.latest_feature = features
        self.latest_action = [action]
        if action == 1:
            self.history_unit_indices.append(self.current_feed)

        self.current_feed += 1
        return action

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
    ):
        """Store the outcome of the last action in the replay memory.

        Args:
            scroll: Whether the session continues. When false the transition
                is stored with ``None`` as its next state.
            reward: Reward obtained for the last action.
        """
        self.cum_rewards += reward

        # Keep stored tensors on the same device as the networks.
        state = torch.tensor([self.latest_feature], dtype=torch.double).to(device)
        action = torch.tensor([self.latest_action], dtype=torch.long).to(device)
        gained = torch.tensor([reward], dtype=torch.double).to(device)

        if not scroll:
            self.training_data.push(state, action, gained, None)
            return

        next_state = torch.tensor([self._build_features()],
                                  dtype=torch.double).to(device)
        self.training_data.push(state, action, gained, next_state)

    def learn_from_buffer(self):
        """Run ten TD updates on sampled batches and decay ``epsilon``.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Transitions without a next state bootstrap from zero;
        a batch made only of such transitions is still trained on. Errors
        raised during training propagate to the caller.
        """
        if len(self.training_data) < self.batch_size:
            return

        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.training_data.sample(self.batch_size)
            batch = Transition(*zip(*transitions))

            state_batch = torch.cat(batch.state)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)
            state_action_values = self.model(state_batch).gather(
                1, action_batch)

            next_state_values = torch.zeros(self.batch_size,
                                            device=device,
                                            dtype=torch.double)
            non_final_next_states = [
                s for s in batch.next_state if s is not None
            ]
            # torch.cat rejects an empty list, so only bootstrap when at
            # least one sampled transition has a next state.
            if non_final_next_states:
                non_final_mask = torch.tensor(
                    [s is not None for s in batch.next_state],
                    device=device,
                    dtype=torch.bool)
                next_state_values[non_final_mask] = self.target_net(
                    torch.cat(non_final_next_states)).max(1)[0].detach()

            expected_state_action_values = self.gamma * next_state_values + reward_batch

            loss = self.loss_fn(state_action_values,
                                expected_state_action_values.unsqueeze(1))
            loss_ensemble += loss.item()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
        self.epsilon = 0.999 * self.epsilon

    def reset(self):
        """Close the current episode and prepare the next one.

        The episode's cumulative reward is appended to
        ``cum_reward_history`` before it is cleared, and the target network
        is synced with the online network.
        """
        # Record the finished episode before wiping its total.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_unit_indices = []
        self.current_loc = [0, 0]
        self.current_feed = 0
コード例 #5
0
class NaiveSafeQLearningAgent():
    def __init__(self, index, world_size, states, num_agents,
                 collision_distance, collision_threshold, reward_threshold):
        """Create a DQN agent that filters actions with a collision heuristic.

        Args:
            index: Position of this agent's own entry in ``states``.
            world_size: World extent; its first two entries are handed to
                ``bound_action`` when candidate moves are evaluated.
            states: Current states of all agents, indexed by agent.
            num_agents: Number of agents iterated over when checking for
                nearby agents.
            collision_distance: Distance below which another agent counts as
                a collision in ``update_buffer``.
            collision_threshold: Minimum no-collision score an action needs to
                remain eligible in ``choose_action``.
            reward_threshold: Rewards below this value are counted as unsafe.
        """
        # Identity and view of the environment.
        self.name = 'naive q agent'
        self.index = index
        self.num_agents = num_agents
        self.world_size = world_size
        self.states = states

        # Safety-related settings.
        self.collision_distance = collision_distance
        self.collision_threshold = collision_threshold
        self.reward_threshold = reward_threshold

        # Learning hyper-parameters.
        self.gamma = 0.9
        self.beta = 1
        self.eps = 0.1
        # NOTE(review): `lr` is stored but not passed to the optimizer below.
        self.lr = 1e-4

        # Bookkeeping counters and trajectories.
        self.rewards = []
        self.action_traj = []
        self.cum_rewards = 0
        self.num_collisions = 0
        self.num_unsafe = 0
        self.target_usage = 0

        self.buffer = ReplayMemory(10000)

        # Online network plus a copy of it, kept in eval mode, used as target.
        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(self.dqn.parameters())

    def choose_action(self, explore):
        """Select the next action among those passing the collision filter.

        Every action starts with a no-collision score of 1.0. For each other
        agent that lies within ``1.0 + collision_distance`` of the position the
        action would lead to, and for which ``get_action`` does not return -1
        for the rounded offset, the score is multiplied by 0.75. Actions whose
        score falls below ``collision_threshold`` are discarded.

        Args:
            explore: If truthy, a uniformly random eligible action is taken;
                otherwise that happens only with probability ``self.eps`` and
                the eligible action with the highest target-network Q-value
                is used the rest of the time.

        Returns:
            The chosen action, which is also appended to ``self.action_traj``.
            When no action is eligible, the index of the highest no-collision
            score is returned instead.
        """
        own_state = self.states[self.index]
        candidates = list(copy.copy(Action.SET))
        safety = [1.0] * len(Action.SET)
        max_range = 1.0 + self.collision_distance

        for candidate in Action.SET:
            landing = bound_action(
                self._move_coordinate(own_state, candidate),
                self.world_size[0], self.world_size[1])

            for other in range(self.num_agents):
                if other == self.index:
                    continue

                other_state = self.states[other]
                if np.linalg.norm(other_state - landing) > max_range:
                    continue

                # Penalise the candidate only when the rounded offset maps to
                # a valid action according to get_action.
                offset = np.rint(landing - other_state)
                if get_action(offset, self.world_size) != -1:
                    safety[candidate] *= 0.75

        blocked = [
            idx for idx, score in enumerate(safety)
            if score < self.collision_threshold
        ]
        candidates = [act for act in candidates if act not in blocked]

        own_tensor = torch.tensor(own_state, dtype=torch.double)
        q_values = self.target(own_tensor).tolist()

        if not candidates:
            # Nothing passed the filter: fall back to the least risky action.
            fallback = np.argmax(safety)
            self.action_traj.append(fallback)
            return fallback

        chosen = Action.UP
        top_q = -math.inf
        for act in candidates:
            if q_values[act] > top_q:
                top_q, chosen = q_values[act], act

        if explore or np.random.binomial(1, self.eps) == 1:
            chosen = candidates[np.random.choice(len(candidates))]

        self.action_traj.append(chosen)
        return chosen

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states

        if reward < self.reward_threshold:
            self.num_unsafe += 1

        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break

        self.cum_rewards += reward

    def learn_from_buffer(self):
        """Run one learning update by delegating to ``_value_func_estimate``."""
        self._value_func_estimate()

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return

        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)

        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        """Return ``state`` offset by the movement ``get_movement`` gives for ``action``."""
        return get_movement(action) + state