Esempio n. 1
0
def parse_blogs(site_dir: str, config: ES_CONFIG) -> None:
    """Run every blog index page found under *site_dir* through the parser.

    Index pages are located with a recursive glob for ``*index.html`` below
    any top-level directory of *site_dir* whose name starts with ``20``.
    Each file is handed to a ``BlogParser`` configured with a fixed chain of
    paragraph middlewares that ends in ``ESMiddleware(config)``. The list of
    files and the statistics gathered by ``ParagraphStatsCollector`` are
    written to the log.

    Args:
        site_dir: Root directory of the generated site.
        config: Configuration object passed to ``ESMiddleware``.
    """
    index_pattern = site_dir + '/20*/**/*index.html'
    index_pages = glob(index_pattern, recursive=True)

    stats_collector = ParagraphStatsCollector()
    # Middlewares are listed in the order they are given to BlogParser; the
    # stats collector and logger sit after the clean-up steps, and the
    # ESMiddleware comes last.
    pipeline: Middlewares = [
        pa_sanitize_ws,
        pa_chunk_long,
        pa_remove_empty,
        pa_cat_short,
        pa_remove_ptag,
        stats_collector,
        pa_log,
        ESMiddleware(config),
    ]
    parser = BlogParser(pipeline)

    # Debugging aid: to process a single post, override ``index_pages`` here,
    # e.g. ['./site/2017/04/11/custom-intellisense-with-monaco-editor/index.html'].
    log.info(pformat(index_pages, indent=2))

    for page_path in index_pages:
        parser.parse_file(page_path)

    log.info(bannerfy(f"Gathered Statistics:\n{stats_collector.formatted()}"))
Esempio n. 2
0
    def _get_blog_data(self, *args, **kwargs):
        """Fetch and export the target account's blog posts, page by page.

        Returns immediately when ``self.can_access`` is falsy. Otherwise
        ``self._get_main_page_data()`` is called first (original author's
        note: this obtains the blog count), after which the blog list is
        requested in pages of at most 100 entries. For every page a
        ``BlogsInfo`` summary is exported; for every blog on the page the
        comment data and the blog content are fetched and a ``BlogParser``
        export is performed. A warning is logged if the number of blogs
        seen differs from ``self._account_info.blog_num``.

        ``*args`` and ``**kwargs`` are accepted but not used.
        """
        if not self.can_access:
            return

        # Original author's note: get the number of blogs.
        self._get_main_page_data()

        account = self._account_info

        # Query parameters for the paged blog-list endpoint; "pos" and "num"
        # are overwritten for every page below.
        list_url = "https://h5.qzone.qq.com/proxy/domain/b.qzone.qq.com/cgi-bin/blognew/get_abs"
        list_params = {
            "inCharset": "utf-8",
            "outCharset": "utf-8",
            "format": "jsonp",
            "hostUin": account.target_uin,
            "uin": account.self_uin,
            "g_tk": account.g_tk,
            "pos": "%d" % 0,
            "num": "%d" % 0,
            "blogType": "0",
            "reqInfo": "1"
        }

        # Query parameters for the single-blog endpoint; "blogid" is
        # overwritten for every blog below.
        # NOTE(review): "outCharset" carries a trailing space here, unlike
        # the list request above -- confirm whether that is intentional.
        detail_url = "https://h5.qzone.qq.com/proxy/domain/b.qzone.qq.com/cgi-bin/blognew/blog_output_data"
        detail_params = {
            "uin": "%s" % account.target_uin,
            "blogid": "%s" % "",
            "numperpage": "15",
            "inCharset": "utf-8",
            "outCharset": "utf-8 ",
            "ref": "qzone"
        }

        # At most 100 blogs are requested per page.
        page_size = 100
        page_count = math.ceil(account.blog_num / page_size)

        fetched_total = 0
        for page_index in range(page_count):
            offset = page_index * page_size
            # Every page is full except the last, which holds the remainder.
            if page_index < page_count - 1:
                page_len = page_size
            else:
                page_len = account.blog_num - page_index * page_size

            list_params.update(pos="%d" % offset, num="%d" % page_len)
            list_response = account.get_url(list_url, params=list_params)
            page_json = get_json_data_from_response(list_response.text)

            page_summary = BlogsInfo(page_json, offset, offset + page_len,
                                     self._directory)
            page_summary.export()

            entries = page_json["data"]["list"]
            for entry in entries:
                title = entry["title"]
                print("process blog:", title)

                blog_id = entry["blogId"]
                category = entry["cate"]
                comment_num = entry["commentNum"]
                blog_info = BlogInfo(category, title, blog_id, comment_num)
                stats_json = self._get_blog_comment_data(blog_info)

                detail_params["blogid"] = "%s" % blog_id
                detail_response = account.get_url(detail_url,
                                                  params=detail_params)

                # The read count is best effort: any failure while digging it
                # out of the statistics payload is reported and 0 is used.
                read_count = 0
                try:
                    current_stats = stats_json["data"][0]["current"]
                    new_data = current_stats["newdata"]
                    if new_data and len(new_data) > 0:
                        read_count = new_data["RZRD"]
                except Exception as err:
                    print("get read num error")
                    print(err)
                    logging.exception(blog_info)
                    logging.exception(err)

                exporter = BlogParser(self._directory, blog_info,
                                      detail_response.text, read_count)
                exporter.export()

            fetched_total += len(entries)
            print("current get %d blog(s)" % fetched_total)

        if fetched_total != account.blog_num:
            mismatch_msg = (
                "qq %s: not get encough blog, get: %d, should get: %d" %
                (account.target_uin, fetched_total, account.blog_num))
            logging.warning(mismatch_msg)