Beispiel #1
0
    def save_to_es(self):
        """Convert this Zhihu question item into an ES document and save it.

        Reads the list-valued fields ``watch_user_num``, ``topics``,
        ``answer_num``, ``zhihu_id``, ``url``, ``title``, ``content`` and
        ``comments_num`` from the item, copies them onto a
        ``ZhihuQuestionType`` document whose ``meta.id`` is the Zhihu id,
        saves the document and then increments the ``zhihu_count`` key via
        ``redis_cli``.

        ``watch_user_num``/``click_num``, ``topics`` and ``answer_num`` are
        best-effort: when the field is missing or cannot be parsed they fall
        back to ``0``, ``""`` and ``0`` respectively. The remaining fields
        are required, and errors from reading them propagate to the caller.
        """
        # The first entry is stored as the watcher count; when exactly two
        # entries are present the second one is stored as the click count.
        # This block is deliberately independent of "topics" so that a
        # missing topics field cannot reset correctly parsed counts.
        try:
            counts = self["watch_user_num"]
            watch_user_num = int(counts[0])
            click_num = int(counts[1]) if len(counts) == 2 else 0
        except (KeyError, IndexError, TypeError, ValueError):
            watch_user_num = 0
            click_num = 0

        try:
            topics = ",".join(self["topics"])
        except (KeyError, TypeError):
            topics = ""

        # extract_num is defined elsewhere, so keep a broad (but not bare)
        # handler: KeyboardInterrupt/SystemExit must not be swallowed.
        try:
            answer_num = extract_num("".join(self["answer_num"]))
        except Exception:
            answer_num = 0

        zhihu_id = self["zhihu_id"][0]
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        comments_num = extract_num("".join(self["comments_num"]))

        crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)
        article = ZhihuQuestionType()
        article.meta.id = zhihu_id
        article.topics = topics
        article.url = url
        article.title = title
        article.content = content
        article.answer_num = answer_num
        article.comments_num = comments_num
        article.watch_user_num = watch_user_num
        article.click_num = click_num
        article.crawl_time = crawl_time

        article.suggest = gen_suggest(
            ZhihuQuestionType._doc_type.index,
            ((article.title, 10), (article.content, 9)),
        )

        # Save first so the counter only reflects documents that were
        # actually written; a failed save() no longer bumps the count.
        article.save()
        redis_cli.incr("zhihu_count")
Beispiel #2
0
    def save_to_es(self):
        """Build a ``ZhihuQuestionType`` document from this item and save it.

        The Zhihu id is used both as the document's ``meta.id`` and as its
        ``zhihu_id`` field. ``topics``, ``title``, ``content``,
        ``answer_num`` and ``comments_num`` are optional and default to
        ``[""]``; ``zhihu_id``, ``url`` and ``watch_user_num`` are required.
        After saving, the ``zhihuquestion_count`` key is incremented via
        ``redis_cli``.
        """
        def joined_without_commas(parts):
            # Zhihu puts commas inside its numbers, so strip them before
            # the text is handed to the number parsers.
            return "".join([part.replace(",", "") for part in parts])

        question = ZhihuQuestionType()

        zhihu_ids = self['zhihu_id']
        question.meta.id = zhihu_ids[0]
        question.zhihu_id = zhihu_ids[0]
        question.topics = ",".join(self.get('topics', [""]))
        question.url = self['url'][0]
        question.title = "".join(self.get("title", [""]))
        raw_content = "".join(self.get("content", [""]))
        question.content = remove_tags(raw_content)

        answer_num = extract_num(
            joined_without_commas(self.get('answer_num', [""])))
        comments_num = extract_num(
            joined_without_commas(self.get('comments_num', [""])))

        # The first entry is the watcher count; the click count is only
        # taken from the second entry when exactly two entries are present.
        watch_values = self["watch_user_num"]
        has_click_entry = len(watch_values) == 2
        watch_user_num = int(watch_values[0].replace(",", ""))
        click_num = (
            int(watch_values[1].replace(",", "")) if has_click_entry else 0
        )

        question.answer_num = answer_num
        question.comments_num = comments_num
        question.watch_user_num = watch_user_num
        question.click_num = click_num
        question.crawl_time = datetime.datetime.now().date()

        question.suggest = gen_suggests(
            ZhihuQuestionType._index._name,
            (
                (question.title, 10),
                (question.topics, 7),
                (question.content, 5),
            ),
        )

        question.save()
        # Record the number of crawled documents in redis.
        redis_cli.incr("zhihuquestion_count")
Beispiel #3
0
    def save_to_es(self):
        """Convert this item into a ``ZhihuQuestionType`` document and save it.

        ``zhihu_id`` falls back to ``0`` and ``topics`` to an empty string
        when the item lacks them. ``answer_num`` and ``comments_num`` fall
        back to the placeholder text ``"no exsits"``, which is passed to
        ``get_nums`` like any other value. ``url``, ``title``, ``content``
        and ``watch_user_num`` are required. After saving, the
        ``ZhihuQuestion_count`` key is incremented via ``redis_cli``.

        The item itself is only read, never modified.
        """
        zhihuquestion = ZhihuQuestionType()
        zhihuquestion.zhihu_id = (
            self["zhihu_id"][0] if "zhihu_id" in self else 0
        )
        zhihuquestion.topics = ",".join(self.get("topics", ""))
        zhihuquestion.url = self["url"][0]
        zhihuquestion.title = "".join(self["title"])
        zhihuquestion.content = "".join(self["content"])

        # Use get() rather than setdefault(): setdefault() would write the
        # placeholder back into the item as a side effect of serializing it.
        # The text handed to get_nums is the same either way.
        zhihuquestion.answer_num = get_nums("".join(
            str(part) for part in self.get("answer_num", "no exsits")))
        zhihuquestion.comments_num = get_nums("".join(
            str(part) for part in self.get("comments_num", "no exsits")))

        # The first entry is the watcher count; the click count is only
        # taken from the second entry when exactly two entries are present.
        watch_values = self["watch_user_num"]
        has_click_entry = len(watch_values) == 2
        zhihuquestion.watch_user_num = int(watch_values[0])
        zhihuquestion.click_num = int(watch_values[1]) if has_click_entry else 0
        zhihuquestion.crawl_time = datetime.datetime.now().strftime("%Y-%m-%d")

        zhihuquestion.suggest = gen_suggest(ZhihuQuestionType._doc_type.index,
                                            ((zhihuquestion.title, 10),
                                             (zhihuquestion.topics, 7)))

        zhihuquestion.save()
        redis_cli.incr("ZhihuQuestion_count")
    def save_to_es(self):
        """Turn this item into a ``ZhihuQuestionType`` document and save it.

        ``zhihu_id``, ``topics``, ``url``, ``title``, ``answer_num``,
        ``comments_num`` and ``watch_user_num`` are required list-valued
        fields. ``content`` is optional: when it is missing *or* empty the
        document's content is set to ``"No content"``. After saving, the
        ``question_count`` key is incremented via ``redis_cli``.
        """
        question = ZhihuQuestionType()
        question.zhihu_id = self["zhihu_id"][0]
        question.topics = ",".join(self["topics"])
        question.url = self["url"][0]
        question.title = "".join(self["title"])

        # Always assign content. Previously a present-but-empty "content"
        # field left question.content unset, although it is read
        # unconditionally below when the suggestions are built.
        if "content" in self and self["content"]:
            question.content = "".join(self["content"])
        else:
            question.content = "No content"

        question.answer_num = extract_num("".join(self["answer_num"]))
        question.comments_num = extract_num("".join(self["comments_num"]))

        # The first entry is the watcher count; the click count is only
        # taken from the second entry when exactly two entries are present.
        watch_values = self["watch_user_num"]
        has_click_entry = len(watch_values) == 2
        question.watch_user_num = int(watch_values[0])
        question.click_num = int(watch_values[1]) if has_click_entry else 0

        question.crawl_time = datetime.datetime.now().strftime(
            SQL_DATETIME_FORMAT)
        question.suggest = get_suggests(ZhihuQuestionType._doc_type.index,
                                        ((question.title, 10),
                                         (question.topics, 7),
                                         (question.content, 5)))

        question.save()
        redis_cli.incr("question_count")
Beispiel #5
0
    def save_to_es(self):
        """Copy this item's fields onto a ``ZhihuQuestionType`` and save it.

        Each field is read with ``self.get`` and falls back to the default
        listed below when absent. Suggestions are generated from the title,
        topics and content with weights 10, 7 and 4. After saving, the
        ``zhihu_question_count`` key is incremented via ``redis_cli``.
        """
        # (field name, default used when the item lacks the field), in the
        # order the fields are assigned to the document.
        field_defaults = (
            ("question_id", ""),
            ("topics", ""),
            ("url", ""),
            ("title", ""),
            ("answer_num", 0),
            ("comments_num", 0),
            ("attention_num", 0),
            ("click_num", 0),
            ("create_time", "1970-01-01 00:00:00"),
            ("crawl_time", "1970-01-01 00:00:00"),
            ("content", ""),
        )

        article = ZhihuQuestionType()
        for field_name, fallback in field_defaults:
            setattr(article, field_name, self.get(field_name, fallback))

        article.suggest = es_method.gen_suggests(
            article._doc_type.index,
            (
                (article.title, 10),
                (article.topics, 7),
                (article.content, 4),
            ),
        )
        article.save()
        redis_cli.incr("zhihu_question_count")