Example #1
 def collecting_training_data(self, epoch=1):
     self.data_set = []
     diversitys = []
     for _ in range(64):
         rewards = 0
         trajectory = []
         cur_state = self.env.reset()
         terminal = False
         while not terminal:
             action = self.training_agent.tompson_sampling(
                 cur_state, trajectory,
                 max([
                     self.config.temperature / (epoch + 1e-10),
                     self.config.mini_temperature
                 ]))
             next_state, reward, terminal, div = self.env.step(action)
             diversitys.append(div)
             trajectory.append((cur_state, action, reward))
             cur_state = next_state
             rewards += reward
         self.data_set.append(trajectory)
     log.info(
         "finish collecting training data",
         np.mean([len(item) for item in self.data_set]), "average click",
         np.mean([np.sum([i[2] for i in item])
                  for item in self.data_set]), "average depth",
         np.mean([len(item) for item in self.data_set]), "diversity",
         np.mean(diversitys))
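
The temperature handed to tompson_sampling above decays with the epoch and is clipped from below by config.mini_temperature. A minimal helper that factors out that expression, using only the two config fields already referenced in the example, might look like this:

def annealed_temperature(config, epoch):
    # Decay the exploration temperature roughly as temperature / epoch,
    # but never let it drop below the configured floor. The 1e-10 term
    # only guards against division by zero at epoch == 0.
    return max(config.temperature / (epoch + 1e-10),
               config.mini_temperature)
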
Example #2
 def create_model(cls,
                  config,
                  variable_scope="target",
                  trainable=True,
                  graph_name="DEFAULT"):
     log.info("CREATE MODEL", config.model, "GRAPH", graph_name,
              "VARIABLE SCOPE", variable_scope)
     if graph_name not in cls.GRAPHS:
         log.info("Adding a new tensorflow graph:", graph_name)
         cls.GRAPHS[graph_name] = tf.Graph()
     with cls.GRAPHS[graph_name].as_default():
         model = cls(config,
                     variable_scope=variable_scope,
                     trainable=trainable)
         if graph_name not in cls.SESS:
             cls.SESS[graph_name] = tf.Session(config=tf.ConfigProto(
                 gpu_options=config.GPU_OPTION))
             cls.SAVER[graph_name] = tf.train.Saver(max_to_keep=50)
         cls.SESS[graph_name].run(model.init)
     return {
         "graph": cls.GRAPHS[graph_name],
         "sess": cls.SESS[graph_name],
         "saver": cls.SAVER[graph_name],
         "model": model
     }
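
create_model looks up GRAPHS, SESS, and SAVER on the class, so it presumably sits on a base model class that keeps one graph, session, and saver per graph_name. That class is not shown in these examples; the following is only a sketch of the scaffolding under that assumption (the network body is a placeholder):

import tensorflow as tf

class BaseModel(object):
    # One TensorFlow graph, session and saver per graph_name, shared by
    # every create_model call on the class (matching the lookups above).
    GRAPHS = {}
    SESS = {}
    SAVER = {}

    def __init__(self, config, variable_scope="target", trainable=True):
        with tf.variable_scope(variable_scope):
            # Build the actual network here; create_model only requires
            # that the instance exposes an `init` op it can run once.
            self.init = tf.global_variables_initializer()
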
Example #3
 def evaluate(self, num):
     import tensorflow as tf
     path = os.path.join(self.config.saved_model_path,
                         self.config.model_id + "_" + str(num))
     self.pv_agent['saver'].restore(self.pv_agent['sess'],
                                    tf.train.latest_checkpoint(path))
     r = []
     uidss = np.random.choice(range(1, self.config.user_num),
                              (self.config.evaluate_num, ),
                              replace=False)
     for uid in uidss:
         try:
             self.env.reset4evaluate(uid)
             terminal = False
             reward = 0
             while not terminal:
                 _action_probs, next_state_value = self.pv_agent[
                     "model"].get_actions_probability_model(
                         self.pv_agent["sess"], self.env)
                 act, probability = zip(*_action_probs)
                 action = act[np.argmax(probability)]
                 candidate, node_type, reward, terminal = self.env.step(
                     action)
             r.append((reward, self.env.accuracy, self.env.diversity))
         except Exception:
             # Skip users whose evaluation raises an error.
             pass
     print("####" * 5)
     f1, a, d = zip(*r)
     log.info("evaluate", num, "average_reward", np.mean(f1), "accuracy",
              np.mean(a), "diversity", np.mean(d))
Example #4
 def collecting_training_data(self):
     if os.path.isfile(self.config.save_path):
         with open(self.config.save_path, "r") as f:
             for line in f.readlines():
                 self.data_set.append(eval(line.strip("\n")))
     else:
         pbar = ProgressBar()
         diversitys = []
         for _ in pbar(range(self.config.trajectory_number)):
             trajectory = []
             cur_state = self.env.reset()
             terminal = False
             while not terminal:
                 action = self.env.sampling()
                 next_state, reward, terminal, div = self.env.step(action)
                 trajectory.append((cur_state, action, reward))
                 diversitys.append(div)
                 cur_state = next_state
             self.data_set.append(trajectory)
         log.info(
             "finish collecting training data",
             np.mean([len(item) for item in self.data_set]),
             "average click",
             np.mean([
                 np.sum([i[2] for i in item]) for item in self.data_set
             ]), "diversity", np.mean(diversitys))
         with open(self.config.save_path, "w") as f:
             for item in self.data_set:
                 f.write(str(item) + "\n")
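
Example #4 serializes each trajectory with str() and reads it back with eval(). If the trajectories are plain Python literals (nested tuples, lists, and numbers), ast.literal_eval from the standard library parses them without executing arbitrary code; a small loader sketch under that assumption:

import ast

def load_trajectories(path):
    # Each line was written as str(trajectory); literal_eval only accepts
    # Python literals, so it is a safer way to parse the saved lines.
    trajectories = []
    with open(path, "r") as f:
        for line in f:
            trajectories.append(ast.literal_eval(line.strip("\n")))
    return trajectories
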
Example #5
 def collecting_training_data(self, epoch):
     self.data_set = []
     log.info(
         "temperature is ", epoch,
         max([
             self.config.temperature / (epoch + 1e-10),
             self.config.mini_temperature
         ]))
     for _ in range(64):
         rewards = 0
         trajectory = []
         cur_state = self.env.reset()
         terminal = False
         while not terminal:
             action = self.rec_agent.tompson_sampling(
                 cur_state, trajectory,
                 max([
                     self.config.temperature / (epoch + 1e-10),
                     self.config.mini_temperature
                 ]))
             next_state, reward, terminal = self.env.step(action)
             trajectory.append((cur_state, action, reward))
             cur_state = next_state
             rewards += reward
         self.data_set.append(trajectory)
     log.info(
         "fake environment", self.epoch, "clicks",
         np.mean([np.sum([i[2] for i in item]) for item in self.data_set]),
         "depth", np.mean([len(item) for item in self.data_set]))
Example #6
 def update_simulator(self, data_set=[]):
     action_probability = self.get_action_probability(data_set) * 0.01
     index = 0
     for trajectory in data_set:
         probability = 1.0
         for i, sar in enumerate(trajectory):
             state, action, reward = sar
             probability *= action_probability[
                 index] / self.env.get_probability(state, action)
             pp_a = min([probability, self.config.maximum_weight])
             if reward > 0:
                 if i == len(trajectory) - 1:
                     self.memory.put(
                         (state, action, trajectory[:i], reward, 1.0, pp_a),
                         0)
                 else:
                     self.memory.put(
                         (state, action, trajectory[:i], reward, 0.0, pp_a),
                         1)
             else:
                 if i == len(trajectory) - 1:
                     self.memory.put(
                         (state, action, trajectory[:i], reward, 1.0, pp_a),
                         2)
                 else:
                     self.memory.put(
                         (state, action, trajectory[:i], reward, 0.0, pp_a),
                         3)
             index += 1
     batch = self.memory.sample_batch(self.config.batch_size)
     uid = [i[0] for i in batch]
     iid = [i[1] for i in batch]
     label = [i[3] for i in batch]
     ts = [i[2] for i in batch]
     terminate = [i[4] for i in batch]
     weight = [i[5] for i in batch]
     log.info("training ratio", np.mean(terminate), "reward",
              np.mean(label), "weight", np.mean(weight))
     traj, feedbacks, target_index = self.convert_item_seq2matrix(
         [[ii[1] for ii in item] for item in ts],
         [[ii[2] for ii in item] for item in ts])
     data = {
         "uid": uid,
         "iid": iid,
         "label": label,
         "trajectory": traj,
         "feedback": feedbacks,
         "target_index": target_index,
         "terminate": terminate,
         "weight": weight
     }
     loss = self.simulator["model"].optimize_model(self.simulator["sess"],
                                                   data)
     p_c, p_t = self.simulator["model"].predict(self.simulator["sess"],
                                                data)
     self.env.set_click_terminate_threshold(np.mean(p_c), np.mean(p_t))
     log.info("loss for simulator", loss)
     return loss
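
update_simulator spreads transitions over four buckets with self.memory.put(item, bucket) and later mixes them again through sample_batch; Example #8 builds a comparable buffer via type_memory(2, config.memory_capacity). The buffer class itself is not included in these examples, so the following is only a sketch of such a bucketed replay memory, assuming sample_batch draws roughly evenly from the non-empty buckets:

import random
from collections import deque

class BucketedMemory(object):
    """Sketch of a replay buffer with one bounded deque per bucket."""

    def __init__(self, bucket_number, capacity):
        self.buckets = [deque(maxlen=capacity) for _ in range(bucket_number)]

    def put(self, item, bucket):
        # Transitions are grouped by bucket (e.g. positive/negative reward
        # and terminal/non-terminal, as in the four put() calls above).
        self.buckets[bucket].append(item)

    def sample_batch(self, batch_size):
        # Draw a similar number of samples from every non-empty bucket and
        # shuffle the result so the buckets are interleaved.
        non_empty = [b for b in self.buckets if len(b) > 0]
        if not non_empty:
            return []
        per_bucket = max(1, batch_size // len(non_empty))
        batch = []
        for b in non_empty:
            batch.extend(random.sample(list(b), min(per_bucket, len(b))))
        random.shuffle(batch)
        return batch[:batch_size]
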
Example #7
 def run(self):
     for i in range(self.config.epoch):
         self.collecting_training_data(i)
         loss = self.training_agent.update_model(self.data_set)
         log.info("training epoch", i, 'loss', loss)
         if i % 200 == 0:
             click, length, diversity = self.evaluate()
             log.info("epoch", i, "average click", click, "depth", length,
                      "div", np.mean(diversity))
Example #8
def train(config, env, task_index, gpu_index, lock=Lock()):
    buffer = type_memory(2, config.memory_capacity)
    from function_approximation import rnn_model
    import utils
    np.random.seed(int(time.time()))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_index)
    # lock.acquire()
    pv_agent = rnn_model.create_model(config, task_index=task_index)
    # utils.save_model(pv_agent["saver"],
    #                  utils.create_saved_path(0, config.saved_model_path, config.model_id),
    #                  pv_agent["sess"],
    #                  pv_agent["model"].global_step)
    # lock.release()
    mcts = MCTS(pv_agent, config.c_puct, config.n_playout,
                config.discount_factor)
    for e in range(1, config.epoch + 1):
        average_reward = []
        for i in range(config.update_frequency):
            print(task_index, "collecting data")
            data, reward = collecting_training_samples(
                config, mcts, env, config.temperature / (e + 0.000001))
            average_reward.append(reward)
            for item in data:
                if item[4] == 0:
                    buffer.put(item, 0)
                elif item[4] == 1:
                    buffer.put(item, 1)
        print(task_index, "finish collecting")
        log.info(str(e), "process", str(task_index),
                 "collecting trajectory reward", np.mean(average_reward))
        batch = buffer.sample_batch(config.batch_size)
        lock.acquire()
        try:
            p1, v_1, p2, v_2 = pv_agent["model"].optimize_model_batch(
                pv_agent["sess"], batch)
            log.info("\t".join([
                str(item) for item in [
                    e, "process", task_index, "policy_1", p1, "value_1", v_1,
                    "policy2", p2, "value_2", v_2
                ]
            ]))
            utils.save_model(
                pv_agent["saver"],
                utils.create_saved_path(e, config.saved_model_path,
                                        config.model_id), pv_agent["sess"],
                pv_agent["model"].global_step)
        except Exception:
            # Keep the worker alive if an optimization step or checkpoint save fails.
            pass
        lock.release()
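
train takes a per-worker task_index and gpu_index plus a lock shared across workers. A minimal launcher sketch using the standard multiprocessing module; the worker count, the one-GPU-per-worker assignment, and the environment factory make_env are assumptions, not part of the original code:

from multiprocessing import Lock, Process

def launch_workers(config, make_env, worker_num):
    # One shared lock serializes the optimize/save critical section in train().
    lock = Lock()
    workers = []
    for task_index in range(worker_num):
        # Assumption: one GPU per worker, indexed like the workers themselves.
        p = Process(target=train,
                    args=(config, make_env(), task_index, task_index, lock))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()
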
Example #9
 def run(self):
     for i in range(self.config.epoch):
         if (i % 200 == 0 and i >= self.config.evaluation_num
                 and self.training_agent.evaluate_or_not()):
             click, length, div = self.evaluate()
             log.info("epoch", i, "average click", click, "depth", length,
                      "diversity", div)
         random.shuffle(self.data_set)
         batch = [
             self.data_set[item]
             for item in np.random.choice(len(self.data_set), (
                 self.config.batch_size, ))
         ]
         loss = self.training_agent.update_model(batch)
         log.info("training epoch", i, 'loss', loss)
Example #10
 def update_model(self, data_set=[]):
     self.global_update_time += 1
     if self.global_update_time <= self.config.evaluation_num:
         loss = self.update_simulator(data_set)
         self.update_env()
     else:
         if self.global_update_time % 10 == 0:
             self.collecting_training_data(self.epoch)
             self.update_rec_agent()
             loss_neg = self.update_simulator_negative()
             log.info("simulator negative loss", loss_neg)
         loss = self.update_simulator(data_set)
         log.info("simulator positive loss", loss)
         self.update_env()
     return loss
Example #11
 def update_simulator(self, data_set=[]):
     for trajectory in data_set:
         for i, sar in enumerate(trajectory):
             state, action, reward = sar
             if reward > 0:
                 if i == len(trajectory) - 1:
                     self.memory.put(
                         (state, action, trajectory[:i], reward, 1.0), 0)
                 else:
                     self.memory.put(
                         (state, action, trajectory[:i], reward, 0.0), 1)
             else:
                 if i == len(trajectory) - 1:
                     self.memory.put(
                         (state, action, trajectory[:i], reward, 1.0), 2)
                 else:
                     self.memory.put(
                         (state, action, trajectory[:i], reward, 0.0), 3)
     batch = self.memory.sample_batch(self.config.batch_size)
     uid = [i[0] for i in batch]
     iid = [i[1] for i in batch]
     label = [i[3] for i in batch]
     ts = [i[2] for i in batch]
     terminate = [i[4] for i in batch]
     log.info("training ratio", np.mean(terminate), "reward",
              np.mean(label))
     traj, feedbacks, target_index = self.convert_item_seq2matrix(
         [[ii[1] for ii in item] for item in ts],
         [[ii[2] for ii in item] for item in ts])
     weight = [1.0] * len(label)
     data = {
         "uid": uid,
         "iid": iid,
         "label": label,
         "trajectory": traj,
         "feedback": feedbacks,
         "target_index": target_index,
         "terminate": terminate,
         "weight": weight
     }
     loss = self.simulator["model"].optimize_model(self.simulator["sess"],
                                                   data)
     log.info("loss for simulator", loss)
     return loss
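
Examples #6, #11, and #13 all run the sampled histories through convert_item_seq2matrix to produce the "trajectory", "feedback", and "target_index" fields. Its implementation is not shown, so this is only a guess at its shape: pad the per-sample item-id and feedback sequences to a common length and record where each real sequence ends.

import numpy as np

def convert_item_seq2matrix(item_seqs, feedback_seqs, pad_value=0):
    # item_seqs / feedback_seqs: one (possibly empty) list per batch element;
    # item ids are assumed to be integers and feedbacks to be numeric.
    batch_size = len(item_seqs)
    max_len = max(1, max(len(seq) for seq in item_seqs))
    items = np.full((batch_size, max_len), pad_value, dtype=np.int64)
    feedbacks = np.zeros((batch_size, max_len), dtype=np.float32)
    target_index = []
    for row, (iseq, fseq) in enumerate(zip(item_seqs, feedback_seqs)):
        items[row, :len(iseq)] = iseq
        feedbacks[row, :len(fseq)] = fseq
        # Position of the last real (unpadded) step in this row.
        target_index.append(max(len(iseq) - 1, 0))
    return items, feedbacks, target_index
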
Example #12
 def create_model(cls,
                  config,
                  variable_scope="target",
                  trainable=True,
                  graph_name="DEFAULT",
                  task_index=0):
     jobs = config.jobs
     job = list(jobs.keys())[0]
     log.info("CREATE MODEL", config.model, "GRAPH", graph_name,
              "VARIABLE SCOPE", variable_scope, "jobs", jobs, "job", job,
              "task_index", task_index)
     cls.CLUSTER = tf.train.ClusterSpec(jobs)
     cls.SERVER = tf.train.Server(
         cls.CLUSTER,
         job_name=job,
         task_index=task_index,
         config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))
     if graph_name not in cls.GRAPHS:
         log.info("Adding a new tensorflow graph:", graph_name)
         cls.GRAPHS[graph_name] = tf.Graph()
     with cls.GRAPHS[graph_name].as_default():
         model = cls(config,
                     variable_scope=variable_scope,
                     trainable=trainable)
         if graph_name not in cls.SESS:
             cls.SESS[graph_name] = tf.Session(cls.SERVER.target)
             cls.SAVER[graph_name] = tf.train.Saver(max_to_keep=1000)
         cls.SESS[graph_name].run(model.init)
     return {
         "graph": cls.GRAPHS[graph_name],
         "sess": cls.SESS[graph_name],
         "saver": cls.SAVER[graph_name],
         "model": model,
         "cluster": cls.CLUSTER,
         "server": cls.SERVER
     }
Example #13
 def update_model(self, data_set=[]):
     if np.mean([len(item) for item in data_set]) == 1.0:
         return 0.0
     temp_data = []
     for trajectory in data_set:
         for i, sar in enumerate(trajectory):
             state, action, reward = sar
             if i == len(trajectory) - 1:
                 temp_data.append(
                     (state, action, trajectory[:i], reward, -1))
             else:
                 temp_data.append((state, action, trajectory[:i], reward,
                                   trajectory[:i + 1]))
     temp_data, next_max_q = self.get_next_q_value_tuple(temp_data)
     self.memory.extend(temp_data)
     if len(self.memory) >= self.config.buffer_size:
         self.memory = self.memory[-self.config.buffer_size:]
     batch = [
         self.memory[item]
         for item in np.random.choice(len(self.memory), (
             self.config.batch_size, ))
     ]
     uid = [i[0] for i in batch]
     iid = [i[1] for i in batch]
     label = [i[3] for i in batch]
     ts = [i[2] for i in batch]
     traj, feedbacks, target_index = self.convert_item_seq2matrix(
         [[ii[1] for ii in item] for item in ts],
         [[ii[2] for ii in item] for item in ts])
     data = {
         "uid": uid,
         "iid": iid,
         "label": label,
         "trajectory": traj,
         "feedback": feedbacks,
         "target_index": target_index
     }
     loss = self.agent["model"].optimize_model(self.agent["sess"], data)
     log.info("average max_next_q value", np.mean(next_max_q))
     return loss
Example #14
 def init_training(self):
     log.info("load environment")
     self.training_agent = self.config.training_agent(self.config)
     self.env = diversity_environments(self.config)
     self.data_set = []
     self.collecting_training_data()
Example #15
 def update_rec_agent(self):
     for i in range(1):
         self.collecting_training_data(self.epoch)
         loss = self.rec_agent.update_model(self.data_set)
         log.info("rec agent", i, 'loss', loss)
         self.epoch += 1