Ejemplo n.º 1
0
    def save_to_es(self):
        """Build a ZhihuAnswerType document from this item and save it.

        ``create_time`` and ``update_time`` are passed through
        ``datetime.datetime.fromtimestamp``; they and the current time (used
        as ``crawl_time``) are rendered with ``SQL_DATE_FORMAT``. The
        document's ``meta.id`` is taken from ``zhihu_id`` and ``content`` is
        run through ``remove_tags`` before being stored.
        """
        def _render(moment):
            # All three time fields share the same string format.
            return moment.strftime(SQL_DATE_FORMAT)

        crawled_at = _render(datetime.datetime.now())
        created_at = _render(
            datetime.datetime.fromtimestamp(self["create_time"]))
        updated_at = _render(
            datetime.datetime.fromtimestamp(self["update_time"]))

        doc = ZhihuAnswerType()
        doc.meta.id = self['zhihu_id']
        for field in ('url', 'question_id'):
            setattr(doc, field, self[field])
        doc.content = remove_tags(self['content'])
        for field in ('praise_num', 'comments_num'):
            setattr(doc, field, self[field])
        doc.create_time = created_at
        doc.update_time = updated_at
        doc.crawl_time = crawled_at

        doc.save()
Ejemplo n.º 2
0
    def save_to_es(self):
        """Index this item as a ZhihuAnswerType document and bump a counter.

        ``zhihu_id`` is used both as the document's ``meta.id`` and as a
        regular field. ``praise_num`` and ``comments_num`` fall back to 0 when
        absent. ``create_time`` / ``update_time`` are converted with
        ``datetime.datetime.fromtimestamp`` when present on the item;
        otherwise today's date is stored instead. After saving, the
        ``zhihuanswer_count`` key is incremented through ``redis_cli``.
        """
        doc = ZhihuAnswerType()

        doc.meta.id = self['zhihu_id']
        for field in ('zhihu_id', 'url', 'question_id', 'author_id',
                      'content'):
            setattr(doc, field, self[field])
        for field in ('praise_num', 'comments_num'):
            setattr(doc, field, self.get(field, 0))

        for field in ('create_time', 'update_time'):
            if field in self:
                moment = datetime.datetime.fromtimestamp(self[field])
            else:
                # Missing timestamp: store the current date (not a datetime).
                moment = datetime.datetime.now().date()
            setattr(doc, field, moment)
        doc.crawl_time = self['crawl_time']

        # Suggest data is generated from the content with weight 5.
        doc.suggest = gen_suggests(
            ZhihuAnswerType._index._name, [(doc.content, 5)])

        doc.save()
        # Record the number of crawled documents in redis.
        redis_cli.incr("zhihuanswer_count")
    def save_to_es(self):
        """Turn this item into a ZhihuAnswerType document and save it.

        Most fields are copied unchanged. ``create_time`` and ``update_time``
        go through ``datetime.datetime.fromtimestamp`` and are stored as
        strings in ``SQL_DATETIME_FORMAT``; ``crawl_time`` is copied as-is.
        After saving, the ``ans_count`` key is incremented through
        ``redis_cli``.
        """
        def _render(timestamp):
            moment = datetime.datetime.fromtimestamp(timestamp)
            return moment.strftime(SQL_DATETIME_FORMAT)

        doc = ZhihuAnswerType()
        copied_fields = (
            "zhihu_id", "url", "question_id", "author_id",
            "answer_excerpt", "content", "praise_num", "comments_num",
        )
        for field in copied_fields:
            setattr(doc, field, self[field])

        doc.create_time = _render(self["create_time"])
        doc.update_time = _render(self["update_time"])
        doc.crawl_time = self["crawl_time"]

        # Suggest data is generated from the content with weight 5.
        doc.suggest = get_suggests(
            ZhihuAnswerType._doc_type.index, ((doc.content, 5), ))

        doc.save()
        redis_cli.incr("ans_count")
Ejemplo n.º 4
0
    def save_to_es(self):
        """Copy this item into a ZhihuAnswerType document and save it.

        ``create_time`` and ``update_time`` are converted with
        ``datetime.datetime.fromtimestamp`` and stored as strings in
        ``SQL_DATETIME_FORMAT``. ``crawl_time`` is formatted the same way via
        its own ``strftime`` (presumably a datetime object -- confirm against
        the spider that sets it).

        No suggest field is populated and no redis counter is updated here.
        """
        def _render(timestamp):
            moment = datetime.datetime.fromtimestamp(timestamp)
            return moment.strftime(SQL_DATETIME_FORMAT)

        doc = ZhihuAnswerType()
        copied_fields = (
            "zhihu_id", "url", "question_id", "author_id",
            "content", "praise_num", "comments_num",
        )
        for field in copied_fields:
            setattr(doc, field, self[field])

        doc.create_time = _render(self["create_time"])
        doc.update_time = _render(self["update_time"])
        doc.crawl_time = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)

        doc.save()
Ejemplo n.º 5
0
    def save_to_es(self):
        """Convert this item into a ZhihuAnswerType document and save it.

        All three time fields are stored as ``YYYY-MM-DD`` strings:
        ``create_time`` / ``update_time`` are first converted with
        ``datetime.datetime.fromtimestamp``, while ``crawl_time`` is formatted
        through its own ``strftime``. After saving, the ``ZhihuAnswer_count``
        key is incremented through ``redis_cli``.
        """
        day_format = "%Y-%m-%d"
        from_timestamp = datetime.datetime.fromtimestamp

        created_on = from_timestamp(self["create_time"]).strftime(day_format)
        updated_on = from_timestamp(self["update_time"]).strftime(day_format)

        doc = ZhihuAnswerType()
        copied_fields = (
            "zhihu_id", "question_id", "author_id",
            "content", "praise_num", "comments_num",
        )
        for field in copied_fields:
            setattr(doc, field, self[field])
        doc.create_time = created_on
        doc.update_time = updated_on
        doc.crawl_time = self["crawl_time"].strftime(day_format)

        # Suggest data is generated from the content with weight 10.
        doc.suggest = gen_suggest(
            ZhihuAnswerType._doc_type.index, ((doc.content, 10), ))

        doc.save()
        redis_cli.incr("ZhihuAnswer_count")
Ejemplo n.º 6
0
    def save_to_es(self):
        """Copy this item into a ZhihuAnswerType document and save it.

        Every field is read with ``self.get`` so a missing key falls back to
        a default: an empty string for text fields, 0 for the counters and
        ``"1970-01-01 00:00:00"`` for the time fields. No suggest field is
        populated.
        """
        epoch_text = "1970-01-01 00:00:00"
        # (field name, fallback) pairs, in the order they are assigned.
        field_defaults = (
            ("title", ""),
            ("answer_id", ""),
            ("question_id", ""),
            ("url", ""),
            ("author_id", ""),
            ("praise_num", 0),
            ("comments_num", 0),
            ("create_time", epoch_text),
            ("update_time", epoch_text),
            ("crawl_time", epoch_text),
            ("content", ""),
        )

        doc = ZhihuAnswerType()
        for field, fallback in field_defaults:
            setattr(doc, field, self.get(field, fallback))

        doc.save()