def __init__(self, value_network, config, writer=None):
        self.config = config

        # Load configs
        self.betas_for_duplication = parse(self.config["betas_for_duplication"])
        self.betas_for_discretisation = parse(self.config["betas_for_discretisation"])
        self.loss_function = loss_function_factory(self.config["loss_function"])
        self.loss_function_c = loss_function_factory(self.config["loss_function_c"])
        self.device = choose_device(self.config["device"])

        # Load network
        self._value_network = value_network
        self._value_network = self._value_network.to(self.device)
        self.n_actions = self._value_network.predict.out_features // 2

        self.writer = writer
        if writer:
            self.writer.add_graph(self._value_network,
                                  input_to_model=torch.tensor(np.zeros((1, 1, self._value_network.size_state + 1),
                                                                       dtype=np.float32)).to(self.device))

        self.memory = ReplayMemory(transition_type=TransitionBFTQ, config=self.config)
        self.optimizer = None
        self.batch = 0
        self.epoch = 0
        self.reset()
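The constructor above only reads a handful of configuration keys. Below is a minimal sketch of a config dictionary it could accept; the keys are collected from this constructor and the rest of the BudgetedFittedQ class shown further down, while the concrete values are purely illustrative and depend on the project's parse, loss_function_factory and optimizer_factory helpers.

# Hypothetical configuration for BudgetedFittedQ; keys follow the class, values are illustrative.
bftq_config = {
    "betas_for_duplication": "np.linspace(0, 1, 10)",      # assumed to be parsed into an array by parse()
    "betas_for_discretisation": "np.linspace(0, 1, 10)",
    "loss_function": "l2",
    "loss_function_c": "l2",
    "device": "cuda:best",
    "gamma": 0.9,                       # discount for rewards
    "gamma_c": 0.9,                     # discount for costs
    "clamp_qc": None,                   # or [min, max] to clamp the cost targets
    "epochs": 5,                        # number of BFTQ epochs in run()
    "regression_epochs": 50,            # gradient steps per _fit() call
    "split_batches": 1,                 # minibatches used in compute_next_values()
    "processes": 1,                     # workers for frontier / mixture computation
    "hull_options": {},
    "weights_losses": [1.0, 1.0],       # (w_r, w_c) weighting of the two losses
    "reset_network_each_epoch": True,
    "memory_capacity": 10000,           # assumed to be consumed by ReplayMemory
    "optimizer": {"type": "ADAM", "learning_rate": 1e-3, "weight_decay": 0},
}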
Example no. 2
def __init__(self, env, config=None):
    super(AbstractDQNAgent, self).__init__(config)
    self.env = env
    assert isinstance(env.action_space, spaces.Discrete) or isinstance(env.action_space, spaces.Tuple), \
        "Only compatible with Discrete or Tuple action spaces."
    self.memory = ReplayMemory(self.config)
    self.exploration_policy = exploration_factory(self.config["exploration"], self.env.action_space)
    self.training = True
    self.previous_state = None
Example no. 3
def __init__(self, env, config=None):
    super(AbstractDQNAgent, self).__init__(config)
    self.env = env
    self.config["num_states"] = env.observation_space.shape[0]
    self.config["num_actions"] = env.action_space.n
    self.config["model"]["all_layers"] = \
        [self.config["num_states"]] + self.config["model"]["layers"] + [self.config["num_actions"]]
    self.memory = ReplayMemory(self.config)
    self.exploration_policy = exploration_factory(self.config["exploration"], self.env.action_space)
    self.training = True
    self.previous_state = None
Example no. 4
def __init__(self, env, config=None):
    super(AbstractDQNAgent, self).__init__(config)
    self.env = env
    assert isinstance(
        env.action_space,
        spaces.Discrete), "Only compatible with Discrete action spaces."
    self.config["model"]["in"] = int(np.prod(env.observation_space.shape))
    self.config["model"]["out"] = env.action_space.n
    self.memory = ReplayMemory(self.config)
    self.exploration_policy = exploration_factory(
        self.config["exploration"], self.env.action_space)
    self.training = True
    self.previous_state = None
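Examples no. 3 and no. 4 above differ mainly in how they derive the network dimensions from the environment. The sketch below shows that mapping on a standard gym environment; the environment name and hidden-layer sizes are illustrative (the latter taken from the default_config in Example no. 6 further down).

import gym
import numpy as np

# Illustrative environment; any env with a Box observation space and a Discrete action space works.
env = gym.make("CartPole-v1")
num_states = int(np.prod(env.observation_space.shape))    # flattened observation size, model "in"
num_actions = env.action_space.n                          # number of discrete actions, model "out"
all_layers = [num_states] + [100, 100] + [num_actions]    # fully-connected sizes, as in Example no. 3
print(all_layers)                                         # [4, 100, 100, 2] for CartPole-v1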
Example no. 5
class BudgetedFittedQ(object):
    def __init__(self, value_network, config, writer=None):
        self.config = config

        # Load configs
        self.betas_for_duplication = parse(self.config["betas_for_duplication"])
        self.betas_for_discretisation = parse(self.config["betas_for_discretisation"])
        self.loss_function = loss_function_factory(self.config["loss_function"])
        self.loss_function_c = loss_function_factory(self.config["loss_function_c"])
        self.device = choose_device(self.config["device"])

        # Load network
        self._value_network = value_network
        self._value_network = self._value_network.to(self.device)
        self.n_actions = self._value_network.predict.out_features // 2

        self.writer = writer
        if writer:
            self.writer.add_graph(self._value_network,
                                  input_to_model=torch.tensor(np.zeros((1, 1, self._value_network.size_state + 1),
                                                                       dtype=np.float32)).to(self.device))

        self.memory = ReplayMemory(transition_type=TransitionBFTQ, config=self.config)
        self.optimizer = None
        self.batch = 0
        self.epoch = 0
        self.reset()

    def push(self, state, action, reward, next_state, terminal, cost, beta=None):
        """
            Push a transition into the replay memory.
        """
        action = torch.tensor([[action]], dtype=torch.long)
        reward = torch.tensor([reward], dtype=torch.float)
        terminal = torch.tensor([terminal], dtype=torch.bool)
        cost = torch.tensor([cost], dtype=torch.float)
        state = torch.tensor([[state]], dtype=torch.float)
        next_state = torch.tensor([[next_state]], dtype=torch.float)

        # Data augmentation for (potentially missing) budget values
        if np.size(self.betas_for_duplication):
            for beta_d in self.betas_for_duplication:
                if beta is not None:  # If the transition already has a beta, augment the data by scaling it.
                    beta_d = torch.tensor([[[beta_d * beta]]], dtype=torch.float)
                else:  # Otherwise, simply set new betas
                    beta_d = torch.tensor([[[beta_d]]], dtype=torch.float)
                self.memory.push(state, action, reward, next_state, terminal, cost, beta_d)
        else:
            beta = torch.tensor([[[beta]]], dtype=torch.float)
            self.memory.push(state, action, reward, next_state, terminal, cost, beta)

    def run(self):
        """
            Run BFTQ on the batch of transitions in memory.

            We fit a model of the optimal reward and cost values Qr and Qc over (state, budget, action) triples.
            The BFTQ epoch is repeated for the configured number of epochs.
        :return: the obtained value network Qr*, Qc*
        """
        logger.info("Run")
        self.batch += 1
        for self.epoch in range(self.config["epochs"]):
            self._epoch()
        return self._value_network

    def _epoch(self):
        """
            Run a single epoch of BFTQ.

            This is similar to a fitted value iteration:
            1. Bootstrap the targets for Qr, Qc using the Budgeted Bellman Optimality operator
            2. Fit the Qr, Qc model to the targets
        """
        logger.debug("Epoch {}/{}".format(self.epoch + 1, self.config["epochs"]))
        states_betas, actions, rewards, costs, next_states, betas, terminals = self._zip_batch()
        target_r, target_c = self.compute_targets(rewards, costs, next_states, betas, terminals)
        self._fit(states_betas, actions, target_r, target_c)
        plot_values_histograms(self._value_network, (target_r, target_c), states_betas, actions,
                               self.writer, self.epoch, self.batch)

    def _zip_batch(self):
        """
            Convert the batch of transitions to several tensors of states, actions, rewards, etc.
        :return: state-beta, action, reward, cost, next_state, beta, terminal batches
        """
        batch = self.memory.memory
        self.size_batch = len(batch)
        zipped = TransitionBFTQ(*zip(*batch))
        actions = torch.cat(zipped.action).to(self.device)
        rewards = torch.cat(zipped.reward).to(self.device)
        terminals = torch.cat(zipped.terminal).to(self.device)
        costs = torch.cat(zipped.cost).to(self.device)

        betas = torch.cat(zipped.beta).to(self.device)
        states = torch.cat(zipped.state).to(self.device)
        next_states = torch.cat(zipped.next_state).to(self.device)
        states_betas = torch.cat((states, betas), dim=2).to(self.device)

        # Input normalization: estimate the mean and std of the state-beta batch
        mean = torch.mean(states_betas, 0).to(self.device)
        std = torch.std(states_betas, 0).to(self.device)
        self._value_network.set_normalization_params(mean, std)

        return states_betas, actions, rewards, costs, next_states, betas, terminals

    def compute_targets(self, rewards, costs, next_states, betas, terminals):
        """
            Compute target values by applying the Budgeted Bellman Optimality operator
        :param rewards: batch of rewards
        :param costs: batch of costs
        :param next_states: batch of next states
        :param betas: batch of budgets
        :param terminals: batch of terminations
        :return: target values
        """
        logger.debug("Compute targets")
        with torch.no_grad():
            next_rewards, next_costs = self.bootstrap_next_values(next_states, betas, terminals)
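            # Budgeted Bellman Optimality targets for the reward and cost critics:
            #   target_r = r + gamma   * Vr(s'),   target_c = c + gamma_c * Vc(s')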
            target_r = rewards + self.config["gamma"] * next_rewards
            target_c = costs + self.config["gamma_c"] * next_costs

            if self.config["clamp_qc"] is not None:
                target_c = torch.clamp(target_c, min=self.config["clamp_qc"][0], max=self.config["clamp_qc"][1])
            torch.cuda.empty_cache()
        return target_r, target_c

    def bootstrap_next_values(self, next_states, betas, terminals):
        """
            Bootstrap the (Vr, Vc) values at next states by following the greedy policy.

            The model is evaluated for optimal one-step mixtures of actions & budgets that fulfill the cost constraints.

        :param next_states: batch of next states
        :param betas: batch of budgets
        :param terminals: batch of terminations
        :return: Vr and Vc at the next states, following optimal mixtures
        """
        # Initialisation
        next_rewards = torch.zeros(len(next_states), device=self.device)
        next_costs = torch.zeros(len(next_states), device=self.device)
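        # At the first epoch the model has not been fitted yet, so next-state values are
        # bootstrapped with zeros and the targets reduce to the immediate rewards and costs.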
        if self.epoch == 0:
            return next_rewards, next_costs

        # Greedy policy computation pi(a'|s')
        # 1. Select non-final next states
        next_states_nf = next_states[~terminals]
        betas_nf = betas[~terminals]
        # 2. Forward pass of the model Qr, Qc
        q_values = self.compute_next_values(next_states_nf)
        # 3. Compute Pareto-optimal frontiers F of {(Qc, Qr)}_AB at all states
        hulls = self.compute_all_frontiers(q_values, len(next_states_nf))
        # 4. Compute optimal mixture policies satisfying budget constraint: max E[Qr] s.t. E[Qc] < beta
        mixtures = self.compute_all_optimal_mixtures(hulls, betas_nf)

        # Expected value Vr,Vc of the greedy policy at s'
        next_rewards_nf = torch.zeros(len(next_states_nf), device=self.device)
        next_costs_nf = torch.zeros(len(next_states_nf), device=self.device)
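        # Each mixture randomises between two frontier points: the "sup" point is played with
        # probability p = probability_sup, the "inf" point with probability 1 - p.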
        for i, mix in enumerate(mixtures):
            next_rewards_nf[i] = (1 - mix.probability_sup) * mix.inf.qr + mix.probability_sup * mix.sup.qr
            next_costs_nf[i] = (1 - mix.probability_sup) * mix.inf.qc + mix.probability_sup * mix.sup.qc
        next_rewards[~terminals] = next_rewards_nf
        next_costs[~terminals] = next_costs_nf

        torch.cuda.empty_cache()
        return next_rewards, next_costs

    def compute_next_values(self, next_states):
        """
            Compute Q(S, B) with a single forward pass.

            S: set of states
            B: set of budgets (discretised)
        :param next_states: batch of next states
        :return: Q values at next states
        """
        logger.debug("-Forward pass")
        # Compute the cartesian product sb of all next states s with all budgets b
        ss = next_states.squeeze().repeat((1, len(self.betas_for_discretisation))) \
            .view((len(next_states) * len(self.betas_for_discretisation), self._value_network.size_state))
        bb = torch.from_numpy(self.betas_for_discretisation).float().unsqueeze(1).to(device=self.device)
        bb = bb.repeat((len(next_states), 1))
        sb = torch.cat((ss, bb), dim=1).unsqueeze(1)
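        # sb has shape (|next_states| * |betas|, 1, size_state + 1): every next state is paired
        # with every discretised budget, so all Q-values can be obtained in batched forward passes.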

        # To avoid memory spikes, split the batch into several minibatches
        batch_sizes = near_split(x=len(sb), num_bins=self.config["split_batches"])
        q_values = []
        for minibatch in range(self.config["split_batches"]):
            mini_batch = sb[sum(batch_sizes[:minibatch]):sum(batch_sizes[:minibatch + 1])]
            q_values.append(self._value_network(mini_batch))
            torch.cuda.empty_cache()
        return torch.cat(q_values).detach().cpu().numpy()

    def compute_all_frontiers(self, q_values, states_count):
        """
            Parallel computation of Pareto-optimal frontiers F
        """
        logger.debug("-Compute frontiers")
        n_beta = len(self.betas_for_discretisation)
        hull_params = [(q_values[state * n_beta: (state + 1) * n_beta],
                        self.betas_for_discretisation,
                        self.config["hull_options"],
                        self.config["clamp_qc"])
                       for state in range(states_count)]
        if self.config["processes"] == 1:
            results = [pareto_frontier(*param) for param in hull_params]
        else:
            with Pool(self.config["processes"]) as p:
                results = p.starmap(pareto_frontier, hull_params)
        frontiers, all_points = zip(*results)

        torch.cuda.empty_cache()
        for s in [0, -1]:
            plot_frontier(frontiers[s], all_points[s], self.writer, self.epoch,
                          title="agent/Hull {} batch {}".format(s, self.batch))
        return frontiers

    def compute_all_optimal_mixtures(self, frontiers, betas):
        """
            Parallel computation of optimal mixtures
        """
        logger.debug("-Compute optimal mixtures")
        params = [(frontiers[i], beta.detach().item()) for i, beta in enumerate(betas)]
        if self.config["processes"] == 1:
            optimal_policies = [optimal_mixture(*param) for param in params]
        else:
            with Pool(self.config["processes"]) as p:
                optimal_policies = p.starmap(optimal_mixture, params)
        return optimal_policies

    def _fit(self, states_betas, actions, target_r, target_c):
        """
            Fit a network Q(state, action, beta) = (Qr, Qc) to target values
        :param states_betas: batch of states and betas
        :param actions: batch of actions
        :param target_r: batch of target reward-values
        :param target_c: batch of target cost-values
        :return: the initial Bellman residual delta between the model and the target values, computed before fitting
        """
        logger.debug("Fit model")
        # Initial Bellman residual
        with torch.no_grad():
            delta = self._compute_loss(states_betas, actions, target_r, target_c).detach().item()
            torch.cuda.empty_cache()

        # Reset network
        if self.config["reset_network_each_epoch"]:
            self.reset_network()

        # Gradient descent
        losses = []
        for nn_epoch in range(self.config["regression_epochs"]):
            loss = self._gradient_step(states_betas, actions, target_r, target_c)
            losses.append(loss)
        torch.cuda.empty_cache()

        return delta

    def _compute_loss(self, states_betas, actions, target_r, target_c):
        """
            Compute the loss between the model values and target values
        :param states_betas: input state-beta batch
        :param actions: input actions batch
        :param target_r: target qr
        :param target_c: target qc
        :return: the weighted loss for expected rewards and costs
        """
        values = self._value_network(states_betas)
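        # The output layer stacks the two heads: Qr for all actions first, then Qc for all
        # actions, hence the offset of n_actions when gathering the cost values.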
        qr = values.gather(1, actions)
        qc = values.gather(1, actions + self.n_actions)
        loss_qc = self.loss_function_c(qc, target_c.unsqueeze(1))
        loss_qr = self.loss_function(qr, target_r.unsqueeze(1))
        w_r, w_c = self.config["weights_losses"]
        loss = w_c * loss_qc + w_r * loss_qr
        return loss

    def _gradient_step(self, states_betas, actions, target_r, target_c):
        loss = self._compute_loss(states_betas, actions, target_r, target_c)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self._value_network.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        return loss.detach().item()

    def save_network(self, path=None):
        path = Path(path) if path else Path("policy.pt")
        torch.save(self._value_network, path)
        return path

    def load_network(self, path=None):
        path = Path(path) if path else Path("policy.pt")
        self._value_network = torch.load(path, map_location=self.device)
        return self._value_network

    def reset_network(self):
        self._value_network.reset()

    def reset(self, reset_weight=True):
        torch.cuda.empty_cache()
        if reset_weight:
            self.reset_network()
        self.optimizer = optimizer_factory(self.config["optimizer"]["type"],
                                           self._value_network.parameters(),
                                           self.config["optimizer"]["learning_rate"],
                                           self.config["optimizer"]["weight_decay"])
        self.epoch = 0
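A typical way to drive the class above is to push a batch of transitions and then fit. A minimal usage sketch follows, assuming a value_network and config compatible with the constructor; network, config and collected_transitions are placeholders built elsewhere.

# Hypothetical driver for BudgetedFittedQ; `network`, `config` and `collected_transitions`
# are placeholders (the network needs a size_state attribute and a `predict` output layer
# of size 2 * n_actions, as assumed by the constructor).
bftq = BudgetedFittedQ(value_network=network, config=config, writer=None)

for state, action, reward, next_state, terminal, cost, beta in collected_transitions:
    bftq.push(state, action, reward, next_state, terminal, cost, beta)

trained_network = bftq.run()       # runs config["epochs"] BFTQ epochs on the stored batch
bftq.save_network("policy.pt")     # serialises the fitted (Qr, Qc) model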
Example no. 6
class AbstractDQNAgent(AbstractStochasticAgent, ABC):
    def __init__(self, env, config=None):
        super(AbstractDQNAgent, self).__init__(config)
        self.env = env
        assert isinstance(
            env.action_space,
            spaces.Discrete), "Only compatible with Discrete action spaces."
        self.config["model"]["in"] = int(np.prod(env.observation_space.shape))
        self.config["model"]["out"] = env.action_space.n
        self.memory = ReplayMemory(self.config)
        self.exploration_policy = exploration_factory(
            self.config["exploration"], self.env.action_space)
        self.training = True
        self.previous_state = None

    @classmethod
    def default_config(cls):
        return dict(model=dict(type="DuelingNetwork", layers=[100, 100]),
                    optimizer=dict(type="ADAM", lr=5e-4, weight_decay=0, k=5),
                    loss_function="smooth_l1",
                    memory_capacity=50000,
                    batch_size=100,
                    gamma=0.99,
                    device="cuda:best",
                    exploration=dict(method="EpsilonGreedy"),
                    target_update=1)

    def record(self, state, action, reward, next_state, done, info):
        """
            Record a transition by performing a Deep Q-Network iteration

            - push the transition into memory
            - sample a minibatch
            - compute the Bellman residual loss over the minibatch
            - perform one gradient descent step
            - slowly track the policy network with the target network
        :param state: a state
        :param action: an action
        :param reward: a reward
        :param next_state: a next state
        :param done: whether the transition is terminal
        :param info: extra information about the transition
        """
        if not self.training:
            return
        self.memory.push(state, action, reward, next_state, done, info)
        batch = self.sample_minibatch()
        if batch:
            loss, _, _ = self.compute_bellman_residual(batch)
            self.step_optimizer(loss)
            self.update_target_network()

    def act(self, state):
        """
            Act according to the state-action value model and an exploration policy
        :param state: current state
        :return: an action
        """
        self.previous_state = state
        values = self.get_state_action_values(state)
        self.exploration_policy.update(values, step_time=True)
        return self.exploration_policy.sample()

    def sample_minibatch(self):
        if len(self.memory) < self.config["batch_size"]:
            return None
        transitions = self.memory.sample(self.config["batch_size"])
        return Transition(*zip(*transitions))

    def update_target_network(self):
        self.steps += 1
        if self.steps % self.config["target_update"] == 0:
            self.target_net.load_state_dict(self.value_net.state_dict())

    @abstractmethod
    def compute_bellman_residual(self, batch, target_state_action_value=None):
        """
            Compute the Bellman Residual Loss over a batch
        :param batch: batch of transitions
        :param target_state_action_value: if provided, acts as a target (s,a)-value
                                          if not, it will be computed from batch and model (Double DQN target)
        :return: the loss over the batch, and the computed target
        """
        raise NotImplementedError

    @abstractmethod
    def get_batch_state_values(self, states):
        """
        Get the state values of several states
        :param states: [s1; ...; sN] an array of states
        :return: values, actions:
                 - [V1; ...; VN] the array of the state values for each state
                 - [a1*; ...; aN*] the array of corresponding optimal action indexes for each state
        """
        raise NotImplementedError

    @abstractmethod
    def get_batch_state_action_values(self, states):
        """
        Get the state-action values of several states
        :param states: [s1; ...; sN] an array of states
        :return: values:[[Q11, ..., Q1n]; ...] the array of all action values for each state
        """
        raise NotImplementedError

    def get_state_value(self, state):
        """
        :param state: s, an environment state
        :return: V, its state-value, and the corresponding optimal action
        """
        values, actions = self.get_batch_state_values([state])
        return values[0], actions[0]

    def get_state_action_values(self, state):
        """
        :param state: s, an environment state
        :return: [Q(a1,s), ..., Q(an,s)], the array of its action-values, one for each action
        """
        return self.get_batch_state_action_values([state])[0]

    def step_optimizer(self, loss):
        raise NotImplementedError

    def seed(self, seed=None):
        return self.exploration_policy.seed(seed)

    def reset(self):
        pass

    def set_writer(self, writer):
        super().set_writer(writer)
        try:
            self.exploration_policy.set_writer(writer)
        except AttributeError:
            pass

    def action_distribution(self, state):
        self.previous_state = state
        values = self.get_state_action_values(state)
        self.exploration_policy.update(values, step_time=False)
        return self.exploration_policy.get_distribution()

    def set_time(self, time):
        self.exploration_policy.set_time(time)

    def eval(self):
        self.training = False
        self.config["exploration"]["method"] = "Greedy"
        self.exploration_policy = exploration_factory(
            self.config["exploration"], self.env.action_space)