Example #1
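The snippet below lists the imports and module-level constants this example depends on; Actor, Critic, OUNoise, and ReplayBuffer are classes defined elsewhere in the project, and the hyperparameter values shown are illustrative placeholders, not the original settings.

import numpy as np
import tensorflow as tf
from tensorflow.keras import optimizers
from os.path import join

# Illustrative values for the project constants referenced below.
LR_ACTOR = 1e-4
LR_CRITIC = 1e-3
BUFFER_SIZE = int(1e6)
MIN_MEM_SIZE = 1_000
BATCH_SIZE = 64
UPDATE_STEPS = 1
GAMMA = 0.99
TAU = 1e-3
CKPTS_PATH = './checkpoints'
ACTOR_CKPTS = 'actor'
CRITIC_CKPTS = 'critic'
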
class Agent(object):
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device
        self.actor_local = Actor(state_space.shape, action_space.high.size,
                                 max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size,
                                  max_action)
        self.actor_optimizer = optimizers.Adam(LR_ACTOR)
        # let target be equal to local
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(LR_CRITIC)
        # let target be equal to local
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)

        self.current_steps = 0

    def step(self,
             state,
             action,
             reward,
             done,
             next_state,
             train=True) -> None:
        self.memory.store(state, action, reward, done, next_state)
        if (train and self.memory.count > BATCH_SIZE
                and self.memory.count > MIN_MEM_SIZE):
            if self.current_steps % UPDATE_STEPS == 0:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)
            self.current_steps += 1

    @tf.function
    def critic_train(self, states, actions, rewards, dones, next_states):
        with tf.device(self.device):
            # TD target: yi = r + GAMMA * (1 - done) * Q_target(s', mu_target(s'))
            u_t = self.actor_target(next_states)
            q_t = self.critic_target([next_states, u_t])
            yi = (tf.cast(rewards, dtype=tf.float64)
                  + tf.cast(GAMMA, dtype=tf.float64)
                  * (1.0 - tf.cast(dones, dtype=tf.float64))
                  * tf.cast(q_t, dtype=tf.float64))

            # Update the local critic by minimizing the MSE between its
            # estimate and the TD target
            with tf.GradientTape() as tape:
                q_l = tf.cast(self.critic_local([states, actions]),
                              dtype=tf.float64)
                loss = tf.reduce_mean(tf.square(q_l - yi))
            dloss_dql = tape.gradient(loss,
                                      self.critic_local.trainable_weights)
            self.critic_optimizer.apply_gradients(
                zip(dloss_dql, self.critic_local.trainable_weights))

    @tf.function
    def actor_train(self, states):
        with tf.device(self.device):
            # Maximize Q(s, mu(s)) by descending the gradient of its negated mean
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(self.actor_local.trainable_variables)
                u_l = self.actor_local(states)
                actor_loss = -tf.reduce_mean(self.critic_local([states, u_l]))
            grads = tape.gradient(actor_loss,
                                  self.actor_local.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(grads, self.actor_local.trainable_variables))

    def learn(self, experiences, gamma) -> None:
        states, actions, rewards, dones, next_states = experiences
        # Reshape the sampled batch and convert every component to a tensor
        states = tf.convert_to_tensor(
            np.array(states).reshape(BATCH_SIZE, self.state_size))
        actions = tf.convert_to_tensor(
            np.array(actions).reshape(BATCH_SIZE, self.action_size))
        rewards = tf.convert_to_tensor(
            np.array(rewards).reshape(BATCH_SIZE, 1))
        next_states = tf.convert_to_tensor(
            np.array(next_states).reshape(BATCH_SIZE, self.state_size))
        dones = tf.convert_to_tensor(np.array(dones).reshape(BATCH_SIZE, 1))

        self.critic_train(states, actions, rewards, dones, next_states)
        self.actor_train(states)
        self.update_local()

    def update_local(self):
        def soft_updates(local_model: tf.keras.Model,
                         target_model: tf.keras.Model) -> list:
            local_weights = local_model.get_weights()
            target_weights = target_model.get_weights()

            assert len(local_weights) == len(target_weights)
            # Polyak averaging: theta_target <- TAU * theta_local + (1 - TAU) * theta_target
            return [
                TAU * local + (1 - TAU) * target
                for local, target in zip(local_weights, target_weights)
            ]

        self.actor_target.set_weights(
            soft_updates(self.actor_local, self.actor_target))
        self.critic_target.set_weights(
            soft_updates(self.critic_local, self.critic_target))

    def store_weights(self, episode: int) -> None:
        self.actor_target.save_weights(
            join(CKPTS_PATH, ACTOR_CKPTS, f'cp-{episode}'))
        self.critic_target.save_weights(
            join(CKPTS_PATH, CRITIC_CKPTS, f'cp-{episode}'))
        return

    def act(self, state, add_noise=True) -> tuple:
        state = np.array(state).reshape(1, self.state_size)
        pure_action = self.actor_local.predict(state)[0]
        # Add OU exploration noise to the deterministic policy output if requested
        action = self.noise.get_action(pure_action) if add_noise else pure_action
        return action, pure_action

    def reset(self):
        self.noise.reset()
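
A minimal sketch of how the Agent above might be driven, assuming the classic Gym reset/step API (pre-0.26) and a continuous-control task such as Pendulum-v1; the environment, device string, and episode count are assumptions for illustration only.

import gym

env = gym.make('Pendulum-v1')
agent = Agent(env.observation_space, env.action_space,
              max_action=float(env.action_space.high[0]), device='/CPU:0')

for episode in range(200):
    state = env.reset()
    agent.reset()
    done = False
    while not done:
        action, _ = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, done, next_state)
        state = next_state
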
Example #2
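The imports below are what this example relies on; the project-module paths (actor, critic, embedding, state_representation, replay_buffer) are assumptions about the DRR repository layout rather than confirmed file names.

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import wandb

# Project modules providing the networks and buffer used by DRRAgent
# (module names are assumed for illustration).
from actor import Actor
from critic import Critic
from embedding import UserMovieEmbedding
from state_representation import DRRAveStateRepresentation
from replay_buffer import PriorityExperienceReplay
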
class DRRAgent:
    def __init__(self,
                 env,
                 users_num,
                 items_num,
                 state_size,
                 is_test=False,
                 use_wandb=False):

        self.env = env

        self.users_num = users_num
        self.items_num = items_num

        self.embedding_dim = 100
        self.actor_hidden_dim = 128
        self.actor_learning_rate = 0.001
        self.critic_hidden_dim = 128
        self.critic_learning_rate = 0.001
        self.discount_factor = 0.9
        self.tau = 0.001

        self.replay_memory_size = 1000000
        self.batch_size = 32

        self.actor = Actor(self.embedding_dim, self.actor_hidden_dim,
                           self.actor_learning_rate, state_size, self.tau)
        self.critic = Critic(self.critic_hidden_dim, self.critic_learning_rate,
                             self.embedding_dim, self.tau)

        # self.m_embedding_network = MovieGenreEmbedding(items_num, 19, self.embedding_dim)
        # self.m_embedding_network([np.zeros((1,)),np.zeros((1,))])
        # self.m_embedding_network.load_weights('/home/diominor/Workspace/DRR/save_weights/m_g_model_weights.h5')

        self.embedding_network = UserMovieEmbedding(users_num, items_num,
                                                    self.embedding_dim)
        self.embedding_network([np.zeros((1, )), np.zeros((1, ))])
        # self.embedding_network = UserMovieEmbedding(users_num, self.embedding_dim)
        # self.embedding_network([np.zeros((1)),np.zeros((1,100))])
        self.embedding_network.load_weights(
            '/home/diominor/Workspace/DRR/save_weights/user_movie_embedding_case4.h5'
        )

        self.srm_ave = DRRAveStateRepresentation(self.embedding_dim)
        self.srm_ave([
            np.zeros((1, self.embedding_dim)),
            np.zeros((1, state_size, self.embedding_dim))
        ])

        # PER
        self.buffer = PriorityExperienceReplay(self.replay_memory_size,
                                               self.embedding_dim)
        self.epsilon_for_priority = 1e-6

        # ε-greedy exploration hyperparameters
        self.epsilon = 1.
        self.epsilon_decay = (self.epsilon - 0.1) / 500000
        self.std = 1.5

        self.is_test = is_test

        # wandb
        self.use_wandb = use_wandb
        if use_wandb:
            wandb.init(project="drr",
                       entity="diominor",
                       config={
                           'users_num': users_num,
                           'items_num': items_num,
                           'state_size': state_size,
                           'embedding_dim': self.embedding_dim,
                           'actor_hidden_dim': self.actor_hidden_dim,
                           'actor_learning_rate': self.actor_learning_rate,
                           'critic_hidden_dim': self.critic_hidden_dim,
                           'critic_learning_rate': self.critic_learning_rate,
                           'discount_factor': self.discount_factor,
                           'tau': self.tau,
                           'replay_memory_size': self.replay_memory_size,
                           'batch_size': self.batch_size,
                           'std_for_exploration': self.std
                       })

    def calculate_td_target(self, rewards, q_values, dones):
        y_t = np.copy(q_values)
        for i in range(q_values.shape[0]):
            y_t[i] = rewards[i] + (1 - dones[i]) * (self.discount_factor *
                                                    q_values[i])
        return y_t

    def recommend_item(self,
                       action,
                       recommended_items,
                       top_k=False,
                       items_ids=None):
        if items_ids is None:
            items_ids = np.array(
                list(
                    set(i for i in range(self.items_num)) - recommended_items))

        items_ebs = self.embedding_network.get_layer('movie_embedding')(
            items_ids)
        # items_ebs = self.m_embedding_network.get_layer('movie_embedding')(items_ids)
        action = tf.transpose(action, perm=(1, 0))
        if top_k:
            # Rank candidate items by their dot product with the action and keep the top k
            item_indices = np.argsort(
                tf.transpose(tf.keras.backend.dot(items_ebs, action),
                             perm=(1, 0)))[0][-top_k:]
            return items_ids[item_indices]
        else:
            item_idx = np.argmax(tf.keras.backend.dot(items_ebs, action))
            return items_ids[item_idx]

    def train(self, max_episode_num, top_k=False, load_model=False):
        # Initialize the target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        if load_model:
            self.load_model(
                "/home/diominor/Workspace/DRR/save_weights/actor_50000.h5",
                "/home/diominor/Workspace/DRR/save_weights/critic_50000.h5")
            print('Weights loaded successfully!')

        episodic_precision_history = []

        for episode in range(max_episode_num):
            # Reset per-episode statistics
            episode_reward = 0
            correct_count = 0
            steps = 0
            q_loss = 0
            mean_action = 0
            # Reset the environment
            user_id, items_ids, done = self.env.reset()
            # print(f'user_id : {user_id}, rated_items_length:{len(self.env.user_items)}')
            # print('items : ', self.env.get_items_names(items_ids))
            while not done:

                # Observe current state & Find action
                ## Compute the user and item embeddings
                user_eb = self.embedding_network.get_layer('user_embedding')(
                    np.array(user_id))
                items_eb = self.embedding_network.get_layer('movie_embedding')(
                    np.array(items_ids))
                # items_eb = self.m_embedding_network.get_layer('movie_embedding')(np.array(items_ids))
                ## Produce the state with the state-representation module (SRM)
                state = self.srm_ave([
                    np.expand_dims(user_eb, axis=0),
                    np.expand_dims(items_eb, axis=0)
                ])

                ## Produce the action (ranking scores)
                action = self.actor.network(state)

                ## ε-greedy exploration
                if self.epsilon > np.random.uniform() and not self.is_test:
                    self.epsilon -= self.epsilon_decay
                    action += np.random.normal(0, self.std, size=action.shape)

                ## Recommend an item
                recommended_item = self.recommend_item(
                    action, self.env.recommended_items, top_k=top_k)

                # Calculate reward & observe new state (in env)
                ## Step
                next_items_ids, reward, done, _ = self.env.step(
                    recommended_item, top_k=top_k)
                if top_k:
                    reward = np.sum(reward)

                # get next_state
                next_items_eb = self.embedding_network.get_layer(
                    'movie_embedding')(np.array(next_items_ids))
                # next_items_eb = self.m_embedding_network.get_layer('movie_embedding')(np.array(next_items_ids))
                next_state = self.srm_ave([
                    np.expand_dims(user_eb, axis=0),
                    np.expand_dims(next_items_eb, axis=0)
                ])

                # Store the transition in the replay buffer
                self.buffer.append(state, action, reward, next_state, done)

                if self.buffer.crt_idx > 1 or self.buffer.is_full:
                    # Sample a minibatch
                    (batch_states, batch_actions, batch_rewards,
                     batch_next_states, batch_dones, weight_batch,
                     index_batch) = self.buffer.sample(self.batch_size)

                    # Set TD targets
                    target_next_action = self.actor.target_network(
                        batch_next_states)
                    qs = self.critic.network(
                        [target_next_action, batch_next_states])
                    target_qs = self.critic.target_network(
                        [target_next_action, batch_next_states])
                    # Take the element-wise minimum of the online and target
                    # critic estimates to curb overestimation (double-Q style)
                    min_qs = tf.raw_ops.Min(input=tf.concat([target_qs, qs],
                                                            axis=1),
                                            axis=1,
                                            keep_dims=True)
                    td_targets = self.calculate_td_target(
                        batch_rewards, min_qs, batch_dones)

                    # Update priority
                    for (p, i) in zip(td_targets, index_batch):
                        self.buffer.update_priority(
                            abs(p[0]) + self.epsilon_for_priority, i)

                    # print(weight_batch.shape)
                    # print(td_targets.shape)
                    # raise Exception
                    # Update critic network
                    q_loss += self.critic.train([batch_actions, batch_states],
                                                td_targets, weight_batch)

                    # Update actor network
                    s_grads = self.critic.dq_da([batch_actions, batch_states])
                    self.actor.train(batch_states, s_grads)
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                items_ids = next_items_ids
                episode_reward += reward
                mean_action += np.sum(action[0]) / (len(action[0]))
                steps += 1

                if reward > 0:
                    correct_count += 1

                print(
                    f'recommended items : {len(self.env.recommended_items)},  epsilon : {self.epsilon:0.3f}, reward : {reward:+}',
                    end='\r')

                if done:
                    print()
                    precision = int(correct_count / steps * 100)
                    print(
                        f'{episode}/{max_episode_num}, precision : {precision:2}%, total_reward:{episode_reward}, q_loss : {q_loss/steps}, mean_action : {mean_action/steps}'
                    )
                    if self.use_wandb:
                        wandb.log({
                            'precision': precision,
                            'total_reward': episode_reward,
                            'epsilon': self.epsilon,
                            'q_loss': q_loss / steps,
                            'mean_action': mean_action / steps
                        })
                    episodic_precision_history.append(precision)

            if (episode + 1) % 50 == 0:
                plt.plot(episodic_precision_history)
                plt.savefig(
                    '/home/diominor/Workspace/DRR/images/training_precision_%_top_5.png'
                )

            if (episode + 1) % 1000 == 0:
                self.save_model(
                    f'/home/diominor/Workspace/DRR/save_weights/actor_{episode+1}_fixed.h5',
                    f'/home/diominor/Workspace/DRR/save_weights/critic_{episode+1}_fixed.h5'
                )

    def save_model(self, actor_path, critic_path):
        self.actor.save_weights(actor_path)
        self.critic.save_weights(critic_path)

    def load_model(self, actor_path, critic_path):
        self.actor.load_weights(actor_path)
        self.critic.load_weights(critic_path)
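
A minimal sketch of how DRRAgent might be instantiated and trained, assuming an environment object that exposes the reset()/step() interface used in train(); build_offline_env is a hypothetical factory and the dataset sizes are illustrative (MovieLens-1M-like), not taken from the original script.

# Hypothetical driver; `env` must expose reset() -> (user_id, items_ids, done)
# and step(item, top_k=...) -> (next_items_ids, reward, done, info) as used above.
env = build_offline_env()  # placeholder for the project's environment setup
agent = DRRAgent(env,
                 users_num=6041,   # illustrative MovieLens-1M-sized values
                 items_num=3953,
                 state_size=10,
                 use_wandb=False)
agent.train(max_episode_num=8000, top_k=False)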