from os.path import join

import numpy as np
import tensorflow as tf
from tensorflow.keras import optimizers

# Hyperparameters (BATCH_SIZE, BUFFER_SIZE, GAMMA, LR_ACTOR, LR_CRITIC,
# MIN_MEM_SIZE, TAU, UPDATE_STEPS, CKPTS_PATH, ACTOR_CKPTS, CRITIC_CKPTS)
# and the Actor, Critic, OUNoise and ReplayBuffer definitions are assumed
# to live elsewhere in this module.


class Agent(object):
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device

        self.actor_local = Actor(state_space.shape, action_space.high.size, max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size, max_action)
        self.actor_optimizer = optimizers.Adam(LR_ACTOR)
        # Start the target network equal to the local network.
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(LR_CRITIC)
        # Start the target network equal to the local network.
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.current_steps = 0

    def step(self, state, action, reward, done, next_state, train=True) -> None:
        self.memory.store(state, action, reward, done, next_state)
        if train and self.memory.count > BATCH_SIZE and self.memory.count > MIN_MEM_SIZE:
            if self.current_steps % UPDATE_STEPS == 0:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)
            self.current_steps += 1

    @tf.function
    def critic_train(self, states, actions, rewards, dones, next_states):
        with tf.device(self.device):
            # Compute the TD target y_i = r + GAMMA * (1 - done) * Q'(s', mu'(s')).
            u_t = self.actor_target(next_states)
            q_t = self.critic_target([next_states, u_t])
            yi = tf.cast(rewards, dtype=tf.float64) + \
                tf.cast(GAMMA, dtype=tf.float64) * \
                tf.cast(1 - tf.cast(dones, dtype=tf.int64), dtype=tf.float64) * \
                tf.cast(q_t, dtype=tf.float64)

            # Compute the MSE between the local critic's estimate and the target.
            with tf.GradientTape() as tape:
                q_l = tf.cast(self.critic_local([states, actions]), dtype=tf.float64)
                loss = tf.reduce_mean(tf.square(q_l - yi))

            # Update the critic by minimizing the loss.
            dloss_dql = tape.gradient(loss, self.critic_local.trainable_weights)
            self.critic_optimizer.apply_gradients(
                zip(dloss_dql, self.critic_local.trainable_weights))
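    # The actor update below realizes the deterministic policy gradient,
    #     grad_theta J ~= E[ grad_a Q(s, a)|a=mu(s) . grad_theta mu(s) ],
    # by letting autodiff differentiate -mean(Q(s, mu(s))) with respect to
    # the actor's weights only; the critic's variables are deliberately
    # left unwatched so only the actor moves.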
    @tf.function
    def actor_train(self, states):
        with tf.device(self.device):
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(self.actor_local.trainable_variables)
                u_l = self.actor_local(states)
                # Negate so that minimizing this loss maximizes Q(s, mu(s)).
                q_l = -tf.reduce_mean(self.critic_local([states, u_l]))
            j = tape.gradient(q_l, self.actor_local.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(j, self.actor_local.trainable_variables))

    def learn(self, experiences, gamma) -> None:
        # Note: critic_train reads the module-level GAMMA; the gamma argument
        # is kept for interface compatibility.
        states, actions, rewards, dones, next_states = experiences

        states = tf.convert_to_tensor(np.array(states).reshape(BATCH_SIZE, self.state_size))
        actions = tf.convert_to_tensor(np.array(actions).reshape(BATCH_SIZE, self.action_size))
        rewards = np.array(rewards).reshape(BATCH_SIZE, 1)
        next_states = np.array(next_states).reshape(BATCH_SIZE, self.state_size)
        dones = np.array(dones).reshape(BATCH_SIZE, 1)

        self.critic_train(states, actions, rewards, dones, next_states)
        self.actor_train(states)
        self.update_local()

    def update_local(self):
        # Soft update: theta_target <- TAU * theta_local + (1 - TAU) * theta_target.
        def soft_updates(local_model: tf.keras.Model,
                         target_model: tf.keras.Model) -> list:
            local_weights = local_model.get_weights()
            target_weights = target_model.get_weights()
            assert len(local_weights) == len(target_weights)
            # Interpolate layer by layer; wrapping the weight lists in a single
            # np.array is unreliable because the layers have different shapes.
            return [TAU * lw + (1 - TAU) * tw
                    for lw, tw in zip(local_weights, target_weights)]

        self.actor_target.set_weights(soft_updates(self.actor_local, self.actor_target))
        self.critic_target.set_weights(soft_updates(self.critic_local, self.critic_target))

    def store_weights(self, episode: int) -> None:
        self.actor_target.save_weights(join(CKPTS_PATH, ACTOR_CKPTS, f'cp-{episode}'))
        self.critic_target.save_weights(join(CKPTS_PATH, CRITIC_CKPTS, f'cp-{episode}'))

    def act(self, state, add_noise=True):
        # Returns both the exploratory action and the actor's raw output.
        state = np.array(state).reshape(1, self.state_size)
        pure_action = self.actor_local.predict(state)[0]
        action = self.noise.get_action(pure_action) if add_noise else pure_action
        return action, pure_action

    def reset(self):
        self.noise.reset()
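# OUNoise and ReplayBuffer are not defined in this file. Minimal sketches
# matching the interface Agent uses are given below; the noise parameters
# (mu, theta, sigma) are conventional DDPG defaults, not values taken from
# this repo.
import random
from collections import deque


class OUNoise:
    # Ornstein-Uhlenbeck process for temporally correlated exploration noise.
    def __init__(self, action_size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal noise state to the mean.
        self.state = np.copy(self.mu)

    def get_action(self, action):
        # dx = theta * (mu - x) dt + sigma * dW; perturb the given action.
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return action + self.state


class ReplayBuffer:
    # Uniform experience replay with the store/sample/count interface used above.
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    @property
    def count(self):
        return len(self.buffer)

    def store(self, state, action, reward, done, next_state):
        self.buffer.append((state, action, reward, done, next_state))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Transpose to (states, actions, rewards, dones, next_states),
        # the order learn() unpacks.
        return tuple(zip(*batch))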
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import wandb

# Actor, Critic, UserMovieEmbedding, DRRAveStateRepresentation and
# PriorityExperienceReplay are assumed to come from this repo's own modules.


class DRRAgent:
    def __init__(self, env, users_num, items_num, state_size,
                 is_test=False, use_wandb=False):
        self.env = env
        self.users_num = users_num
        self.items_num = items_num

        self.embedding_dim = 100
        self.actor_hidden_dim = 128
        self.actor_learning_rate = 0.001
        self.critic_hidden_dim = 128
        self.critic_learning_rate = 0.001
        self.discount_factor = 0.9
        self.tau = 0.001
        self.replay_memory_size = 1000000
        self.batch_size = 32

        self.actor = Actor(self.embedding_dim, self.actor_hidden_dim,
                           self.actor_learning_rate, state_size, self.tau)
        self.critic = Critic(self.critic_hidden_dim, self.critic_learning_rate,
                             self.embedding_dim, self.tau)

        # self.m_embedding_network = MovieGenreEmbedding(items_num, 19, self.embedding_dim)
        # self.m_embedding_network([np.zeros((1,)), np.zeros((1,))])
        # self.m_embedding_network.load_weights('/home/diominor/Workspace/DRR/save_weights/m_g_model_weights.h5')

        # Build the embedding network, then load the pretrained weights.
        self.embedding_network = UserMovieEmbedding(users_num, items_num, self.embedding_dim)
        self.embedding_network([np.zeros((1,)), np.zeros((1,))])
        # self.embedding_network = UserMovieEmbedding(users_num, self.embedding_dim)
        # self.embedding_network([np.zeros((1,)), np.zeros((1, 100))])
        self.embedding_network.load_weights(
            '/home/diominor/Workspace/DRR/save_weights/user_movie_embedding_case4.h5')

        # State representation module (DRR-ave).
        self.srm_ave = DRRAveStateRepresentation(self.embedding_dim)
        self.srm_ave([np.zeros((1, 100)), np.zeros((1, state_size, 100))])

        # Prioritized experience replay (PER).
        self.buffer = PriorityExperienceReplay(self.replay_memory_size, self.embedding_dim)
        self.epsilon_for_priority = 1e-6

        # epsilon-greedy exploration hyperparameters: anneal epsilon from
        # 1.0 toward 0.1 over ~500k exploratory steps; std is the scale of
        # the Gaussian action noise.
        self.epsilon = 1.
        self.epsilon_decay = (self.epsilon - 0.1) / 500000
        self.std = 1.5

        self.is_test = is_test

        # wandb logging
        self.use_wandb = use_wandb
        if use_wandb:
            wandb.init(project="drr",
                       entity="diominor",
                       config={'users_num': users_num,
                               'items_num': items_num,
                               'state_size': state_size,
                               'embedding_dim': self.embedding_dim,
                               'actor_hidden_dim': self.actor_hidden_dim,
                               'actor_learning_rate': self.actor_learning_rate,
                               'critic_hidden_dim': self.critic_hidden_dim,
                               'critic_learning_rate': self.critic_learning_rate,
                               'discount_factor': self.discount_factor,
                               'tau': self.tau,
                               'replay_memory_size': self.replay_memory_size,
                               'batch_size': self.batch_size,
                               'std_for_exploration': self.std})

    def calculate_td_target(self, rewards, q_values, dones):
        y_t = np.copy(q_values)
        for i in range(q_values.shape[0]):
            y_t[i] = rewards[i] + (1 - dones[i]) * (self.discount_factor * q_values[i])
        return y_t
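    # calculate_td_target implements the one-step backup
    #     y_i = r_i + (1 - done_i) * gamma * Q_i.
    # For example, with discount_factor = 0.9, rewards = [1, 0],
    # dones = [0, 1] and q_values = [[0.5], [0.7]], the targets are
    # [[1 + 0.9 * 0.5], [0.0]] = [[1.45], [0.0]]. In train() below, Q_i is
    # the element-wise minimum of the local and target critics' estimates,
    # a clipped double-Q target that damps overestimation.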
    def recommend_item(self, action, recommended_items, top_k=False, items_ids=None):
        # Default candidate pool: every item not yet recommended this episode.
        if items_ids is None:
            items_ids = np.array(
                list(set(i for i in range(self.items_num)) - recommended_items))

        items_ebs = self.embedding_network.get_layer('movie_embedding')(items_ids)
        # items_ebs = self.m_embedding_network.get_layer('movie_embedding')(items_ids)
        action = tf.transpose(action, perm=(1, 0))
        if top_k:
            # Rank candidates by their score <item_embedding, action>.
            item_indice = np.argsort(
                tf.transpose(tf.keras.backend.dot(items_ebs, action),
                             perm=(1, 0)))[0][-top_k:]
            return items_ids[item_indice]
        else:
            item_idx = np.argmax(tf.keras.backend.dot(items_ebs, action))
            return items_ids[item_idx]

    def train(self, max_episode_num, top_k=False, load_model=False):
        # Initialize the target networks.
        self.actor.update_target_network()
        self.critic.update_target_network()

        if load_model:
            self.load_model(
                "/home/diominor/Workspace/DRR/save_weights/actor_50000.h5",
                "/home/diominor/Workspace/DRR/save_weights/critic_50000.h5")
            print('Completely load weights!')

        episodic_precision_history = []

        for episode in range(max_episode_num):
            # Reset the episodic statistics.
            episode_reward = 0
            correct_count = 0
            steps = 0
            q_loss = 0
            mean_action = 0
            # Reset the environment.
            user_id, items_ids, done = self.env.reset()
            # print(f'user_id : {user_id}, rated_items_length:{len(self.env.user_items)}')
            # print('items : ', self.env.get_items_names(items_ids))

            while not done:
                # Observe the current state and find an action.
                ## Embed the user and the recently rated items.
                user_eb = self.embedding_network.get_layer('user_embedding')(
                    np.array(user_id))
                items_eb = self.embedding_network.get_layer('movie_embedding')(
                    np.array(items_ids))
                # items_eb = self.m_embedding_network.get_layer('movie_embedding')(np.array(items_ids))

                ## Produce the state with the SRM.
                state = self.srm_ave([np.expand_dims(user_eb, axis=0),
                                      np.expand_dims(items_eb, axis=0)])

                ## Produce the action (ranking score).
                action = self.actor.network(state)

                ## epsilon-greedy exploration: occasionally perturb the
                ## action with Gaussian noise instead of acting greedily.
                if self.epsilon > np.random.uniform() and not self.is_test:
                    self.epsilon -= self.epsilon_decay
                    action += np.random.normal(0, self.std, size=action.shape)

                ## Recommend an item.
                recommended_item = self.recommend_item(
                    action, self.env.recommended_items, top_k=top_k)

                # Calculate the reward and observe the new state (in the env).
                ## Step
                next_items_ids, reward, done, _ = self.env.step(recommended_item,
                                                                top_k=top_k)
                if top_k:
                    reward = np.sum(reward)

                # Build the next state.
                next_items_eb = self.embedding_network.get_layer('movie_embedding')(
                    np.array(next_items_ids))
                # next_items_eb = self.m_embedding_network.get_layer('movie_embedding')(np.array(next_items_ids))
                next_state = self.srm_ave([np.expand_dims(user_eb, axis=0),
                                           np.expand_dims(next_items_eb, axis=0)])

                # Store the transition in the replay buffer.
                self.buffer.append(state, action, reward, next_state, done)

                if self.buffer.crt_idx > 1 or self.buffer.is_full:
                    # Sample a minibatch (with PER importance weights).
                    batch_states, batch_actions, batch_rewards, batch_next_states, \
                        batch_dones, weight_batch, index_batch = self.buffer.sample(self.batch_size)

                    # Set the TD targets from the clipped double-Q estimate.
                    target_next_action = self.actor.target_network(batch_next_states)
                    qs = self.critic.network([target_next_action, batch_next_states])
                    target_qs = self.critic.target_network([target_next_action, batch_next_states])
                    min_qs = tf.reduce_min(tf.concat([target_qs, qs], axis=1),
                                           axis=1, keepdims=True)  # Double Q method
                    td_targets = self.calculate_td_target(batch_rewards, min_qs, batch_dones)

                    # Update the priorities. (Note: canonical PER uses the
                    # absolute TD *error* |y - Q(s, a)|; this code uses the
                    # absolute TD *target* |y| instead.)
                    for (p, i) in zip(td_targets, index_batch):
                        self.buffer.update_priority(abs(p[0]) + self.epsilon_for_priority, i)

                    # Update the critic network.
                    q_loss += self.critic.train([batch_actions, batch_states],
                                                td_targets, weight_batch)

                    # Update the actor network with dQ/da from the critic.
                    s_grads = self.critic.dq_da([batch_actions, batch_states])
                    self.actor.train(batch_states, s_grads)
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                items_ids = next_items_ids
                episode_reward += reward
                mean_action += np.sum(action[0]) / len(action[0])
                steps += 1

                if reward > 0:
                    correct_count += 1

                print(f'recommended items : {len(self.env.recommended_items)}, '
                      f'epsilon : {self.epsilon:0.3f}, reward : {reward:+}', end='\r')

                if done:
                    print()
                    precision = int(correct_count / steps * 100)
                    print(f'{episode}/{max_episode_num}, precision : {precision:2}%, '
                          f'total_reward:{episode_reward}, q_loss : {q_loss/steps}, '
                          f'mean_action : {mean_action/steps}')
                    if self.use_wandb:
                        wandb.log({'precision': precision,
                                   'total_reward': episode_reward,
                                   'epsilon': self.epsilon,
                                   'q_loss': q_loss / steps,
                                   'mean_action': mean_action / steps})
                    episodic_precision_history.append(precision)

            if (episode + 1) % 50 == 0:
                plt.plot(episodic_precision_history)
                plt.savefig('/home/diominor/Workspace/DRR/images/training_precision_%_top_5.png')

            if (episode + 1) % 1000 == 0:
                self.save_model(
                    f'/home/diominor/Workspace/DRR/save_weights/actor_{episode+1}_fixed.h5',
                    f'/home/diominor/Workspace/DRR/save_weights/critic_{episode+1}_fixed.h5')
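    # Environment contract assumed by train() (inferred from the calls above;
    # the env class itself lives elsewhere in the repo):
    #     env.reset()                -> (user_id, items_ids, done)
    #     env.step(item_ids, top_k)  -> (next_items_ids, reward, done, info)
    #     env.recommended_items      -> set of item ids recommended so far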
    def save_model(self, actor_path, critic_path):
        self.actor.save_weights(actor_path)
        self.critic.save_weights(critic_path)

    def load_model(self, actor_path, critic_path):
        self.actor.load_weights(actor_path)
        self.critic.load_weights(critic_path)
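# A minimal driver sketch, assuming an environment implementing the contract
# noted above train(); the env class name, user/item counts and episode
# budget below are illustrative placeholders, not values from this file.
#
#     env = OfflineEnv(...)  # hypothetical offline MovieLens-style env
#     agent = DRRAgent(env, users_num=6041, items_num=3953, state_size=10)
#     agent.train(max_episode_num=8000, top_k=False)
#     agent.save_model('actor_final.h5', 'critic_final.h5')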