Ejemplo n.º 1
0
    def file_parsing(self):
        """
        Parse the input files, then build the word and entity vocabularies.

        Every non-blank line of each file in ``self.input_files`` must consist
        of four tab-separated fields ``user``, ``title``, ``label`` and
        ``entities``; ``entities`` is a ``;``-separated list of
        ``ent_id:ent_name`` pairs.  While reading, the method

        * appends the whitespace-tokenised title to ``self.corpus``;
        * counts every title token in ``self.word2freq``;
        * counts every entity id (converted to ``int``) in ``self.entity2freq``.

        Afterwards words whose count reaches ``self.min_word_count`` and
        entities whose count reaches ``self.min_entity_count`` are numbered
        consecutively from 1 (0 is kept for the dummy entry) in
        ``self.word2index`` and ``self.entity2index``.

        NOTE(review): ``self.word2freq`` / ``self.entity2freq`` are incremented
        without initialisation, so they are presumably ``defaultdict(int)`` or
        ``Counter`` instances -- confirm in ``__init__``.

        :raises ValueError: if a line does not have exactly four fields, an
            entity pair has no ``:`` separator, or an entity id is not an integer.
        """
        with timer("File Parsing", verbose=True):
            print("**** Starting Parsing Input Files! ****")
            for file in self.input_files:
                with open(file, "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        # Only the title and the entities are needed here.
                        _user, title, _label, entities = line.split("\t")

                        content: List[str] = title.split()
                        # collect corpus
                        self.corpus.append(content)

                        # count the words of the title
                        for word in content:
                            self.word2freq[word] += 1

                        # count the entities attached to the title
                        for pair in entities.split(";"):
                            # maxsplit=1: only the first ':' separates the id
                            # from the name, so names may contain ':' themselves.
                            ent_id, _ent_name = pair.split(":", 1)
                            self.entity2freq[int(ent_id)] += 1

            # Build word2index: indices start at 1, 0 is for the dummy word.
            frequent_words = (word for word, freq in self.word2freq.items()
                              if freq >= self.min_word_count)
            for index, word in enumerate(frequent_words, start=1):
                self.word2index[word] = index

            # Build entity2index the same way (keys of entity2freq are ints).
            frequent_entities = (ent_id
                                 for ent_id, freq in self.entity2freq.items()
                                 if freq >= self.min_entity_count)
            for index, ent_id in enumerate(frequent_entities, start=1):
                self.entity2index[ent_id] = index

            print("Succeed in Parsing Input Files!")
            print("Words num: %d.\tEntity num: %d" %
                  (len(self.word2index), len(self.entity2index)))
Ejemplo n.º 2
0
 def transform(self, input_file: str, output_file: str):
     """
     Index-encode the titles of ``input_file`` and write them to ``output_file``.

     Each non-blank input line is split on tabs into ``user``, ``title``,
     ``label`` and ``entities``.  ``self._encoding_title(title, entities)``
     supplies the word encoding and the entity encoding, and the output line is
     ``user``, word encoding, entity encoding, ``label`` joined by tabs.

     :param input_file: path of the data to read
     :param output_file: path of the encoded data to write
     """
     with timer("Transform", True):
         print("**** Starting Transform %s ****" % input_file)
         with open(input_file, "r", encoding="utf-8") as source, \
                 open(output_file, "w", encoding="utf-8") as sink:
             for raw_line in source:
                 record = raw_line.strip()
                 if not record:
                     continue  # blank lines produce no output
                 user, title, label, entities = record.split("\t")
                 # the encodings are derived from the title and its entities
                 encoded = self._encoding_title(title, entities)
                 word_encoding, entity_encoding = encoded
                 fields = [user, word_encoding, entity_encoding, label]
                 sink.write("\t".join(fields) + "\n")
         print("Transformation Done!")
Ejemplo n.º 3
0
        for item in self.user_recall_items[user]:
            score = self.predict(user, item)
            score = 1 / (1 + math.exp(-score))
            result.push(score, item)
        return result.queue()

    def save(self):
        """Pickle the pair ``(self.P, self.Q)`` into the file ``self.model_file``."""
        LOGGER.info(f"Save model to `{self.model_file}`")
        with open(self.model_file, "wb") as sink:
            # Highest protocol: most compact format this interpreter supports.
            pickle.dump(
                (self.P, self.Q),
                sink,
                protocol=pickle.HIGHEST_PROTOCOL,
            )

    def load(self):
        """Restore ``self.P`` and ``self.Q`` from the pickle at ``self.model_file``.

        NOTE(review): unpickling can execute arbitrary code -- only load model
        files that come from a trusted source.
        """
        LOGGER.info(f"Load model from `{self.model_file}`")
        with open(self.model_file, "rb") as source:
            restored = pickle.load(source)
        # The file is expected to hold exactly the (P, Q) pair.
        self.P, self.Q = restored


if __name__ == '__main__':
    # Demo: fit the LFM recommender on a train split of the ratings file,
    # print recommendations for one user, then evaluate on the test split.
    ratings_path = os.path.join(MOVIE_LENS_SRC, "ratings.dat")
    train, test = split(read_file(ratings_path), seed=0, test_size=0.1)

    lfm = LFMRecommend()
    lfm.fit(train)

    n_recall = 10
    with timer("Recommend"):
        recommended = lfm.recommend("6027", n_recall)
        print(recommended)

    precision, recall = evaluate(lfm, test, n_recall_items=n_recall)
    # Output recorded by the author: Precision: 0.188542, Recall: 0.11229.
    print(f"Precision: {precision}, Recall: {recall}.")
        return similarity

    def user_item_score(self, user: str, item: str) -> float:
        """
        Return the recommendation score of `item` for `user`.

        The score is the similarity-weighted sum of the scores that the users
        listed in `self.users_sim[user]` have for `item`, e.g. for user `C`
        and item `a`:
        Score(C, a) = sum([Sim(U, C) * Score(U, a) for U in Users-besides-C])
        """
        neighbours = self.users_sim[user]
        total = 0.0
        for neighbour, weight in neighbours.items():
            neighbour_score = self.user_scores[neighbour][item]
            total += weight * neighbour_score
        return total

    def recommend(self, user: str) -> Dict[str, float]:
        """Score every item in `self.user_non_score_items[user]`; no sorting, no top-k cut."""
        scores = {}
        for candidate in self.user_non_score_items[user]:
            scores[candidate] = self.user_item_score(user, candidate)
        return scores


if __name__ == '__main__':
    # Demo on generated score data.
    # Timings recorded by the author:
    #   user similarity cost: 0.536 sec
    #   recommend cost: 0.015 sec
    score_data = generate_score_data(100, 1000, 0.2, 0)
    ub = UserCF(score_data)

    with timer(name="User-based CF"):
        scores_for_c = ub.recommend("C")
        print(scores_for_c)
Ejemplo n.º 5
0
    def evaluation(self, k: int = 8, n_items: int = 10) -> Tuple[float, float]:
        """Compute precision and recall of `self.recommend` on `self.test_data`.

        The test data is grouped by its "user" column into the set of "item"
        values of each user.  For every such user `self.recommend` is called,
        and a recommended item counts as a hit when it is in that user's set.
        Hits, test items and recommended items are accumulated over all users
        before the two ratios are taken.

        :param k: forwarded to `self.recommend`
        :param n_items: forwarded to `self.recommend`
        :return: `(precision, recall)`; a ratio whose denominator is zero
            (empty test data, or nothing recommended) is reported as 0.0
            instead of raising ZeroDivisionError
        """
        test_user_rated_items: Dict[str, Set[str]] = \
            self.test_data.groupby("user").agg({"item": lambda s: set(s)})["item"].to_dict()

        hit = 0
        test_num = 0
        pred_num = 0
        for test_user, test_items in tqdm(test_user_rated_items.items()):
            pred_items = self.recommend(test_user, k=k, n_items=n_items)
            test_num += len(test_items)
            pred_num += len(pred_items)
            # each prediction is a pair whose second element is the item
            hit += sum(1 for _, pred_item in pred_items if pred_item in test_items)

        recall = hit / test_num if test_num else 0.0
        precision = hit / pred_num if pred_num else 0.0
        return precision, recall


if __name__ == '__main__':
    # Demo: build the item-based recommender from the ratings file, print the
    # recommendations for user "1", then report precision and recall.
    ratings_path = os.path.join(MOVIE_LENS_SRC, "ratings.dat")
    rec = ItemCFRecommend.from_file(ratings_path)

    with timer("Recommend"):  # timing recorded by the author: 0.1356s
        result = rec.recommend("1", k=8, n_items=40)
    print(result)

    precision, recall = rec.evaluation(k=8, n_items=10)
    # Output recorded by the author: Precision: 0.188542, Recall: 0.11229.
    print(f"Precision: {precision}, Recall: {recall}.")
Ejemplo n.º 6
0
        return cls(eval_results)

    def evaluate(self, recommendation: FirstRec):
        """
        Evaluate on recommendation object.

        For every user in `self.eval_results`, the movies returned by
        `recommendation.recommend(user)` are compared with that user's
        evaluation entries; a recommended movie found among the entries is a
        hit.  Per-user recall (hits / number of entries) and precision
        (hits / number of recommendations) are averaged over the evaluated users.

        Users without recommendations are reported and skipped; users with no
        evaluation entries are skipped as well because their recall is undefined.

        :param recommendation: The recommend object, can return items for query user.
        :return: recall, precision -- `(0.0, 0.0)` when no user could be
            evaluated (previously this raised ZeroDivisionError)
        """
        print("Start evaluation.")
        recalls, precisions = [], []
        for user, eval_entries in tqdm.tqdm(self.eval_results.items()):
            rec_results = recommendation.recommend(user)
            if not rec_results:
                print(f"No recommendation for {user}")
                continue  # possibly because `user` not in `train_file`
            n_relevant = len(eval_entries)
            if n_relevant == 0:
                continue  # nothing to recall for this user
            # each recommendation is a pair whose first element is the movie
            hit = sum(1 for movie, _ in rec_results if movie in eval_entries)
            recalls.append(hit / n_relevant)
            precisions.append(hit / len(rec_results))

        if not recalls:
            return 0.0, 0.0
        return sum(recalls) / len(recalls), sum(precisions) / len(precisions)


if __name__ == '__main__':
    # Demo: build the recommender from the train file and score it against
    # the evaluation entries loaded from the test file.
    recommender = FirstRec.from_json_file(train_file, k=k, n=n)
    evaluator = Evaluation.from_json_file(test_file)

    with timer(name="Evaluation on `FirstRec`"):
        scores = evaluator.evaluate(recommender)
        recall, precision = scores

    print(f"Recall: {recall}, Precision: {precision}")
Ejemplo n.º 7
0
    def user_item_score(self, user: str, item: str) -> float:
        """
        Return the recommendation score of `item` for `user`.

        The score is the similarity-weighted sum of the scores `user` has for
        the items listed in `self.items_sim[item]`, e.g. for user `C` and
        item `a`:
            Score(C, a) = sum([Sim(a, x) * Score(C, x) for x in items-similar-to-a])
        """
        related_items = self.items_sim[item]
        total = 0.0
        for related, weight in related_items.items():
            own_score = self.user_scores[user][related]
            total += weight * own_score
        return total

    def recommend(self, user: str) -> Dict[str, float]:
        """Map each item of `self.user_non_score_items[user]` to its score; unsorted, not truncated."""
        candidates = self.user_non_score_items[user]
        scored = ((item, self.user_item_score(user, item)) for item in candidates)
        return dict(scored)


if __name__ == '__main__':
    # Demo on generated score data.
    # Timings recorded by the author (the first is labelled "user similarity"
    # in the original note):
    #   user similarity cost: 3.86 sec
    #   recommend cost: 0.0429 sec
    score_data = generate_score_data(100, 1000, 0.2, 0)
    ub = ItemCF(score_data)

    with timer(name="Item-based CF"):
        scores_for_c = ub.recommend("C")
        print(scores_for_c)
Ejemplo n.º 8
0
        target_entries = self.features[target_user]

        # Select TopK neighbourhood's entries
        neighbour_users = PriorityQueue(maxsize=self.k)
        for user, entries in self.features.items():
            if user == target_user:
                continue
            corr = pearson(entries, target_entries)
            neighbour_users.push(
                corr,
                entries)  # different from source code, push entries not users.

        movies = defaultdict(float)
        for corr, entries in neighbour_users.queue():
            for movie, rate in entries.items():
                movies[movie] += corr * rate  # corr as the weight of user

        # sort movies
        result = sorted(movies.items(), key=lambda k: k[1], reverse=True)

        return result[:self.n]


if __name__ == '__main__':
    from _utils.context import timer

    # Demo: load the training data that sits next to this file and time one
    # recommendation.
    here = os.path.dirname(__file__)
    json_file = os.path.join(here, "data/train.json")
    rec = FirstRec.from_json_file(json_file, k=15, n=20)

    with timer(name="Recommend Test"):  # timing recorded by the author: ~30 ms
        recommended = rec.recommend("436670")
        print(recommended)
Ejemplo n.º 9
0
        recall_artists = self.total_artists if recall_old else self.user_non_rated_artists[user]
        recall_artists_tag_gene = self.artist_tag_gene[recall_artists, :]  # shape: [#recall_artists, max_tag_id + 1]

        scores = user_tag_preference.dot(recall_artists_tag_gene.T).toarray().reshape(-1)  # shape: [#recall_artists, ]
        return heapq.nlargest(n_items, zip(recall_artists, scores), key=lambda pair: pair[1])

    def evaluate(self, user: int) -> Tuple[float, float]:
        """Evaluate recommendation on specific user.

        The number of artists the user has rated is taken as
        `len(self.total_artists)` minus the size of
        `self.user_non_rated_artists[user]`.  That many artists are requested
        from `self.recommend` with `recall_old=True`, and a recommended artist
        counts as a hit when it is not among the user's non-rated artists.

        :param user: id of the user to evaluate
        :return: Tuple of precision and recall; `(0.0, 0.0)` when the user has
            no rated artists or nothing is recommended (previously this raised
            ZeroDivisionError)
        """
        user_non_rated_artists = set(self.user_non_rated_artists[user])
        true_num = len(self.total_artists) - len(user_non_rated_artists)
        if true_num <= 0:
            # No ground truth for this user: both ratios would divide by zero.
            return 0.0, 0.0

        pred = self.recommend(user, n_items=true_num, recall_old=True)
        if not pred:
            return 0.0, 0.0

        hit = sum(1 for artist, _ in pred
                  if artist not in user_non_rated_artists)
        return hit / len(pred), hit / true_num


if __name__ == '__main__':
    # Demo: recommend for user 2, then evaluate on the same user.
    demo_user = 2
    rec = TagBasedRecommend(k=1.0)

    with timer("TagRecommend"):  # timing recorded by the author: 0.009s
        recommended = rec.recommend(demo_user, 20, recall_old=False)
        print(recommended)

    precision, recall = rec.evaluate(demo_user)
    # Values recorded by the author: 0.2, 0.2
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}")
Ejemplo n.º 10
0
    a, b = pair
    low = min(a, b)
    if a % low == 0 and b % low == 0:
        return low
    start = low // 2
    for i in range(start, 0, -1):
        if a % i == 0 and b % i == 0:
            return i


# Pairs of integers handed to `gcd`, one pair per call.
numbers = [
    (1963309, 2265973),
    (1879675, 2493670),
    (2030677, 3814172),
    (1551645, 2229620),
    (1988912, 4736670),
    (2198964, 7876293),
]


if __name__ == '__main__':
    # Run the same workload three ways; the trailing numbers are the timings
    # recorded by the author.
    with _context.timer(name="Single Thread"):  # 0.4558
        result = [gcd(pair) for pair in numbers]

    with _context.timer(name="Multi Thread"):  # 0.4019
        with ThreadPoolExecutor(max_workers=2) as thread_pool:
            result = list(thread_pool.map(gcd, numbers))

    with _context.timer(name="Multi Process"):  # 0.3790
        with ProcessPoolExecutor(max_workers=2) as process_pool:
            result = list(process_pool.map(gcd, numbers))

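# How the multi-process version works
# 1) Every input item of the `numbers` list is passed to map.
# 2) The data is serialized with the pickle module, turning it into binary form.
# 3) The serialized data is sent over a local socket from the process of the main interpreter to the process of a child interpreter.
# 4) In the child process, pickle deserializes the binary data back into Python objects.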
Ejemplo n.º 11
0
    low = min(a, b)
    if a % low == 0 and b % low == 0:
        return low
    start = low // 2
    for i in range(start, 0, -1):
        if a % i == 0 and b % i == 0:
            return i


# Pairs of integers; each pair is passed as the single argument of `gcd`.
numbers = [(1963309, 2265973), (1879675, 2493670), (2030677, 3814172),
           (1551645, 2229620), (1988912, 4736670), (2198964, 7876293)]

# Demo of two ways to hand work to a ProcessPoolExecutor.  The bare string
# literals below are the author's section notes; they are no-op expression
# statements and are left untouched.
if __name__ == "__main__":
    # Section 1 -- Executor.map.  The note below says the returned results
    # list is ordered, in the same order as the `*iterables` inputs.
    """1. map(self, fn, *iterables, **kwargs)"""
    """返回的results列表是有序的,顺序和 `*iterables` 迭代器的顺序一致。"""
    with _context.timer("Map test"):
        with ProcessPoolExecutor(max_workers=2) as pool:
            results = list(pool.map(gcd, numbers))
        print(results)
    # Section 2 -- Executor.submit.  The notes below say that submit hands in
    # one callable to run in parallel and returns a future instance, which
    # stands for the asynchronous work that completes at some later time.
    """2. submit(self, fn, *args, **kwargs)"""
    """用于提交一个可并行的方法,submit方法同时返回一个future实例。"""
    """future对象标识这个线程/进程异步进行,并在未来的某个时间执行完成。future实例表示线程/进程状态的回调。"""
    with _context.timer("Submit test"):
        futures = []
        with ProcessPoolExecutor(max_workers=2) as pool:
            for pair in numbers:
                future = pool.submit(gcd, pair)
                futures.append(future)
        # Results are collected after the executor's `with` block has exited,
        # in submission order.
        results = [future.result() for future in futures]
        print(results)
    # Section 3 heading only; no code follows it in this excerpt.
    """3. future"""
Ejemplo n.º 12
0
        """Get not rated items for each user."""
        rated_summary = rating.groupby("UserID").agg(
            {"MovieID": lambda s: set(s)})
        rated_summary = dict(rated_summary["MovieID"])
        return {
            user: total.difference(rated)
            for user, rated in rated_summary.items()
        }

    def recommend(self, user: int) -> List:
        """Recommend the not-yet-rated items whose profile is most similar to the user's profile.

        Every item in `self.user_non_rating_items[user]` is scored with
        `1 - cosine(item_vec, user_vec)` and pushed into a `PriorityQueue`
        created with `self.k`; the queue content is returned sorted by score,
        highest first.
        """
        LOGGER.info(f"Give recommendation for {user}.")
        user_vec = self.user_profile[user]
        best = PriorityQueue(self.k)
        candidates = self.user_non_rating_items[user]
        LOGGER.info(
            f"Recommend from {len(candidates)} / {len(self.total_items)} non-rated items for `{user}`"
        )
        # Only non-rated movies are candidates.
        for movie in candidates:
            similarity = 1 - cosine(self.item_profile[movie], user_vec)
            best.push(similarity, movie)

        ranked = list(best.queue())
        ranked.sort(key=lambda entry: entry[0], reverse=True)
        return ranked


if __name__ == '__main__':
    # Demo: load the recommender from the "data" directory next to this file
    # and time one recommendation for user 1.
    path = os.path.dirname(__file__)
    data_dir = os.path.join(path, "data")
    rec = ContentBasedRec.from_json_file(data_dir, k=10)

    with timer("CBRecommend"):
        recommended = rec.recommend(1)
        print(recommended)
Ejemplo n.º 13
0
        """Compute precision and recall"""
        test_user_rated_items: Dict[str, Set[str]] =\
            self.test_data.groupby("user").agg({"item": lambda s: set(list(s))})["item"].to_dict()

        hit = 0
        test_num = 0
        pred_num = 0
        for test_user, test_items in tqdm(test_user_rated_items.items()):
            pred_items = self.recommend(test_user, k=k, n_items=n_items)
            test_num += len(test_items)
            pred_num += len(pred_items)
            for _, pred_item in pred_items:
                if pred_item in test_items:
                    hit += 1

        recall = hit / test_num
        precision = hit / pred_num
        return precision, recall


if __name__ == '__main__':
    # Demo: build the user-based recommender from the ratings file, print the
    # recommendations for user "1", then report precision and recall.
    ratings_path = os.path.join(MOVIE_LENS_SRC, "ratings.dat")
    rec = UserCFRecommend.from_file(ratings_path)

    with timer("Recommend"):  # timing recorded by the author: 0.006s
        result = rec.recommend("1", k=8, n_items=40)
    print(result)

    precision, recall = rec.evaluation(k=8, n_items=10)
    # Output recorded by the author: Precision: 0.17529, Recall: 0.10440.
    print(f"Precision: {precision}, Recall: {recall}.")