Example 1
    def download_list_page_html(self, url, logger):
        """
        下载列表页的html文件,主要在这个地方需要做一件事:完成存储在同一级目录下的文件之间可以完成首页、上页、下页、末页的切换的功能
        :param url: 列表页的url
        :return: None
        """
        html = requests.get(url, headers=self.headers)
        # The page number is the value of the last query parameter in the URL
        num_str = url.split("&")[-1].split("=")[-1]
        html_text = html.text
        par = Parse()
        total_page = str(
            par.parse_main_page_get_total_pagenum(html_text, configs["test"]))

        # Rewrite the first/previous/next/last page links
        pattern_fpage = "id=\"fpage\" href=\"(.+?)\""
        pattern_upage = "id=\"upage\" href=\"(.+?)\""
        pattern_npage = "id=\"npage\" href=\"(.+?)\""
        pattern_epage = "id=\"epage\" href=\"(.+?)\""

        fpage_str = re.search(pattern_fpage, html_text).group(1)
        upage_str = re.search(pattern_upage, html_text).group(1)
        npage_str = re.search(pattern_npage, html_text).group(1)
        epage_str = re.search(pattern_epage, html_text).group(1)
        if num_str == "1":
            # First page: disable the "first" and "previous" links
            html_text = html_text.replace(fpage_str, "#")
            html_text = html_text.replace(upage_str, "#")
            html_text = html_text.replace(
                npage_str, "./page" + str(int(num_str) + 1).zfill(4) + ".html")
            html_text = html_text.replace(
                epage_str, "./page" + total_page.zfill(4) + ".html")
        elif num_str == total_page:
            # Last page: disable the "next" and "last" links
            html_text = html_text.replace(fpage_str, "./page0001.html")
            html_text = html_text.replace(
                upage_str, "./page" + str(int(num_str) - 1).zfill(4) + ".html")
            html_text = html_text.replace(npage_str, "#")
            html_text = html_text.replace(epage_str, "#")
        else:
            # Middle pages: link out to the first, previous, next and last pages
            html_text = html_text.replace(fpage_str, "./page0001.html")
            html_text = html_text.replace(
                upage_str, "./page" + str(int(num_str) - 1).zfill(4) + ".html")
            html_text = html_text.replace(
                npage_str, "./page" + str(int(num_str) + 1).zfill(4) + ".html")
            html_text = html_text.replace(
                epage_str, "./page" + total_page.zfill(4) + ".html")

        # Rewrite the brand and code data links
        html_text = self.replace_brand_and_code_url(html_text)

        file_name = "page" + num_str.zfill(4) + ".html"
        html_store_dir = make_store_html_dir()
        self.write_file(html_store_dir, file_name, html_text, logger)
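
The branching in download_list_page_html reduces to a small mapping from anchor id to target href. The helper below is a minimal, hypothetical sketch of that rewrite in isolation: the anchor ids (fpage, upage, npage, epage) and the pageNNNN.html naming come from the code above, while the function name and signature are assumptions made only for illustration.

import re

def relink_pagination(html_text, page_num, total_pages):
    """Point the first/previous/next/last links at local pageNNNN.html files.

    Directions that do not apply (first/previous on page 1, next/last on the
    last page) become "#", mirroring the branches in the example above.
    """
    def local(n):
        return "./page" + str(n).zfill(4) + ".html"

    targets = {
        "fpage": local(1) if page_num > 1 else "#",                       # first
        "upage": local(page_num - 1) if page_num > 1 else "#",            # previous
        "npage": local(page_num + 1) if page_num < total_pages else "#",  # next
        "epage": local(total_pages) if page_num < total_pages else "#",   # last
    }
    for anchor_id, target in targets.items():
        match = re.search('id="%s" href="(.+?)"' % anchor_id, html_text)
        if match:
            # Note: like the original, this replaces every occurrence of the
            # matched href string, not just the one inside this anchor tag.
            html_text = html_text.replace(match.group(1), target)
    return html_text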
Example 2
        "--url",
        type=str,
        default="https://www.qcsanbao.cn/webqcba/DVMProducerServlet?method=getWhereList&p=1",
        help="website to crawl")
    args = parser.parse_args()
    url = args.url
    base_url = configs["basic_url"]
    r = get_redis_connect()
    dl = Download()
    par = Parse()

    # Build the url_list of list-page URLs
    make_url_list(
        base_url,
        par.parse_main_page_get_total_pagenum(
            dl.download_first_page(url, logger), configs["test"]))
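
    # Hypothetical sketch of what make_url_list presumably does, inferred from the
    # two-argument call above; the real implementation is not shown in this excerpt.
    # Assumptions: the list-page URLs are built by appending the page number to
    # base_url and queued under the "url_list" Redis key consumed by the threads below.
    def make_url_list_sketch(base_url, total_pagenum):
        queue = get_redis_connect()
        for page in range(1, int(total_pagenum) + 1):
            # base_url is assumed to end just before the page-number value
            queue.rpush("url_list", base_url + str(page))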

    threading_list = []

    # Parse the detail-page data URLs from each list page, store them in Redis,
    # and download the list-page HTML (a sketch of such a worker follows this thread list)
    threading_list.extend([
        Thread(target=download_and_parse_page,
               args=("url_list", r, par.parse_main_page_get_detail_page_url,
                     dl.download_first_page, dl.download_list_page_html, lock,
                     logger)) for _ in range(configs["thread_num"])
    ])
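
    # Hypothetical sketch of the worker loop a download_and_parse_page thread might
    # run; the real function is not shown in this excerpt. The Redis key, parser,
    # downloaders, lock and logger mirror the arguments passed above, but the body
    # and the parser's exact signature are assumptions.
    def download_and_parse_page_sketch(key, r, parse_func, download_func, save_func,
                                       lock, logger):
        while True:
            with lock:              # serialize access to the shared Redis list
                url = r.lpop(key)   # take the next URL off the queue
            if url is None:         # queue drained: this worker is done
                break
            if isinstance(url, bytes):
                url = url.decode("utf-8")
            html = download_func(url, logger)  # fetch the page HTML
            parse_func(html, r)                # assumed: parser pushes found URLs into Redis
            save_func(url, logger)             # download and store the page locally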

    # Parse the code and name data URLs from each detail page, store them in Redis,
    # and download the detail-page HTML
    threading_list.extend([
        Thread(target=download_and_parse_page,
               args=("detail_url_list", r, par.parse_detail_page_get_url,