Example 1
import os

import numpy as np
import tensorflow as tf  # TF 1.x API (tf.Session, tf.placeholder, tf.summary) is assumed

# Actor, Critic, OUNoise, ReplayBuffer, get_item_emb and item_ids_emb_dict are
# assumed to be provided elsewhere in the surrounding project.


class DDPG_REC:
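    """DDPG agent for list-wise recommendation: the Actor produces ranking
    weights that are mapped to item ids, the Critic scores (state, action)
    pairs, and an Ornstein-Uhlenbeck process supplies exploration noise."""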

    def __init__(self, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr, critic_lr,
                 gamma, buffer_size, item_space, summary_dir):

        self.state_item_num = state_item_num
        self.action_item_num = action_item_num
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.item_space = item_space
        self.summary_dir = summary_dir

        self.sess = tf.Session()

        self.s_dim = emb_dim * state_item_num
        self.a_dim = emb_dim * action_item_num
        self.actor = Actor(self.sess, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr)
        self.critic = Critic(self.sess, state_item_num, action_item_num, emb_dim,
                             self.actor.get_num_trainable_vars(), gamma, tau, critic_lr)
        self.exploration_noise = OUNoise(self.a_dim)

        # set up summary operators
        self.summary_ops, self.summary_vars = self.build_summaries()
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(summary_dir, self.sess.graph)

        # initialize target network weights
        self.actor.hard_update_target_network()
        self.critic.hard_update_target_network()

        # initialize replay memory
        self.replay_buffer = ReplayBuffer(buffer_size)

    def gene_actions(self, weight_batch):
        """use output of actor network to calculate action list
        Args:
            weight_batch: actor network outputs

        Returns:
            recommendation list
        """
        item_ids = list(self.item_space.keys())
        item_weights = list(self.item_space.values())
        max_ids = list()
        for weight in weight_batch:
            score = np.dot(item_weights, np.transpose(weight))
            idx = np.argmax(score, 0)
            max_ids.append([item_ids[_] for _ in idx])
        return max_ids

    # def gene_action(self, weight):
    #     """use output of actor network to calculate action list
    #     Args:
    #         weight: actor network outputs
    #
    #     Returns:
    #         recommendation list
    #     """
    #     item_ids = list(self.item_space.keys())
    #     item_weights = list(self.item_space.values())
    #     score = np.dot(item_weights, np.transpose(weight))
    #     idx = np.argmax(score)
    #     return item_ids[idx]

    @staticmethod
    def build_summaries():
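        # TensorBoard scalar summaries: episode reward, max Q value and critic loss.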
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("reward", episode_reward)
        episode_max_q = tf.Variable(0.)
        tf.summary.scalar("max_q_value", episode_max_q)
        critic_loss = tf.Variable(0.)
        tf.summary.scalar("critic_loss", critic_loss)

        summary_vars = [episode_reward, episode_max_q, critic_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    def _train(self):
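        """Sample a batch from the replay buffer, build TD targets with the
        target networks, train the critic on those targets, train the actor
        with the critic's action gradients, then soft-update both targets."""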
        samples = self.replay_buffer.sample_batch(self.batch_size)
        state_batch = np.asarray([_[0] for _ in samples])
        action_batch = np.asarray([_[1] for _ in samples])
        reward_batch = np.asarray([_[2] for _ in samples])
        n_state_batch = np.asarray([_[3] for _ in samples])
        done_batch = np.asarray([_[4] for _ in samples])

        seq_len_batch = np.asarray([self.state_item_num] * self.batch_size)

        # calculate predicted q value
        action_weights = self.actor.predict_target(state_batch, seq_len_batch)  # [batch_size, action_item_num * emb_dim]
        n_action_batch = self.gene_actions(action_weights.reshape((-1, self.action_item_num, self.emb_dim)))
        n_action_emb_batch = get_item_emb(n_action_batch, item_ids_emb_dict)
        target_q_batch = self.critic.predict_target(n_state_batch.reshape((-1, self.s_dim)),
                                                    n_action_emb_batch.reshape((-1, self.a_dim)), seq_len_batch)
        y_batch = []
        for i in range(self.batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + self.critic.gamma * target_q_batch[i])

        # train critic
        q_value, critic_loss, _ = self.critic.train(state_batch, action_batch,
                                                    np.reshape(y_batch, (self.batch_size, 1)), seq_len_batch)

        # train actor
        action_weight_batch_for_gradients = self.actor.predict(state_batch, seq_len_batch)
        action_batch_for_gradients = self.gene_actions(action_weight_batch_for_gradients)
        action_emb_batch_for_gradients = get_item_emb(action_batch_for_gradients, item_ids_emb_dict)
        a_gradient_batch = self.critic.action_gradients(state_batch,
                                                        action_emb_batch_for_gradients.reshape((-1, self.a_dim)),
                                                        seq_len_batch)
        self.actor.train(state_batch, a_gradient_batch[0], seq_len_batch)

        # update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        return np.amax(q_value), critic_loss

    def action(self, state):
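        # Actor output plus Ornstein-Uhlenbeck exploration noise, mapped to concrete item ids.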
        weight = self.actor.predict(np.reshape(state, [1, self.s_dim]), np.array([self.state_item_num])) + \
                 self.exploration_noise.noise().reshape(
                     (1, self.action_item_num, int(self.a_dim / self.action_item_num)))
        action = self.gene_actions(weight)
        return np.array(action[0])

    def perceive_and_train(self, state, action, reward, n_state, done):
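        # Store the transition (action replaced by its item embeddings) in the
        # replay buffer and run one training step once enough samples exist.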
        action_emb = get_item_emb(action, item_ids_emb_dict)
        self.replay_buffer.add(list(state.reshape((self.s_dim,))),
                               list(action_emb.reshape((self.a_dim,))),
                               [reward],
                               list(n_state.reshape((self.s_dim,))),
                               [done])

        # Store transitions until the buffer holds more than one batch, then start training
        ep_q_value_, critic_loss = 0, 0
        if self.replay_buffer.size() > self.batch_size:
            ep_q_value_, critic_loss = self._train()

        # if self.time_step % 10000 == 0:
        # self.actor_network.save_network(self.time_step)
        # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

        return ep_q_value_, critic_loss

    def write_summary(self, ep_reward, ep_q_value, loss, i):
        summary_str = self.sess.run(self.summary_ops, feed_dict={self.summary_vars[0]: ep_reward,
                                                                 self.summary_vars[1]: ep_q_value,
                                                                 self.summary_vars[2]: loss})
        self.writer.add_summary(summary_str, i)

    def save(self):
        self.writer.close()
        saver = tf.train.Saver()
        ckpt_path = os.path.join(os.path.dirname(__file__), "models")
        saver.save(self.sess, ckpt_path, write_meta_graph=False)
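

# A minimal usage sketch for the class above, assuming the project supplies
# item_space (a dict mapping item ids to embedding weights), the get_item_emb /
# item_ids_emb_dict helpers it references, and a recommendation environment
# with reset()/step(); env, num_episodes and all hyperparameter values below
# are illustrative placeholders, not part of the original code.

agent = DDPG_REC(state_item_num=10, action_item_num=4, emb_dim=100,
                 batch_size=64, tau=0.001, actor_lr=1e-4, critic_lr=1e-3,
                 gamma=0.9, buffer_size=100000, item_space=item_space,
                 summary_dir="./summaries")

for episode in range(num_episodes):
    state = env.reset()
    ep_reward, ep_q, loss, done = 0.0, 0.0, 0.0, False
    while not done:
        action = agent.action(state)                       # recommended item ids
        n_state, reward, done = env.step(action)           # assumed env API
        ep_q, loss = agent.perceive_and_train(state, action, reward, n_state, done)
        state = n_state
        ep_reward += reward
    agent.write_summary(ep_reward, ep_q, loss, episode)

agent.save()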
Example 2
import numpy as np
import tensorflow as tf  # TF 1.x API (tf.placeholder, tf.name_scope) is assumed

# Actor, Critic, Noise and Memory are assumed to be provided elsewhere in the project.


class DDPG:
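    """Goal-conditioned DDPG agent: observations are concatenated with goals,
    transitions are stored in a replay memory, and training applies hindsight
    experience replay before the standard actor/critic updates."""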

    def __init__(self, sess, params):
        self.sess = sess
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects,
                             self.memory_size)

    def create_input_placeholders(self):
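        # One placeholder per entry in self.dimensions, plus "g" for the action
        # gradient fed to the actor and "p" ("pred_q") for the predicted Q value.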
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
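        # Query the actor, optionally add Ornstein-Uhlenbeck exploration noise,
        # and return the chosen action together with the critic's Q estimate.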
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
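        # DDPG update with hindsight experience replay: roughly 80% of sampled
        # transitions get their goal g relabelled to the achieved goal ag (with
        # reward and terminal flag set to 1) before the critic/actor updates.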
        # check if the memory contains enough experiences
        if self.memory.size < 3*self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # for her transitions
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        # print("{} of {} selected for HER transitions".
        # format(len(her_idxs), self.b_size))
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        nu = self.actor.predict_target(nx)
        tq = r + self.gamma*self.critic.predict_target(nx, nu)*(1-t)
        self.critic.train(x, u, tq)
        grad = self.critic.get_action_grads(x, u)
        # print("Grads:\n", g)
        self.actor.train(x, grad)
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()
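

# A sketch of how this goal-conditioned agent might be driven, assuming a
# params dict holding the attributes the constructor reads (dimensions,
# actor_params, critic_params, noise_params, n_mem_objects, memory_size,
# b_size, gamma), that dimensions["x"] covers the concatenated observation and
# goal, and an environment exposing observations, goals and achieved goals;
# env, params, n_episodes and max_steps are illustrative placeholders.

sess = tf.Session()
agent = DDPG(sess, params)
sess.run(tf.global_variables_initializer())  # the class does not initialize variables itself

for episode in range(n_episodes):
    x, g = env.reset()                               # observation and goal (assumed API)
    for _ in range(max_steps):
        u_idx, u, q = agent.step(np.hstack([x, g]), is_u_discrete=True)
        nx, ng, ag, r, done = env.step(u_idx)        # assumed env API
        agent.remember((x, g, ag, u, r, nx, ng, float(done)))
        agent.train()                                # no-op until the memory is warm
        x, g = nx, ng
        if done:
            break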