Example #1
0
    def start(self):
        """Walk the listing pages and process every post found on each page.

        Pages are visited in order for as long as ``self.big_flag`` is truthy.
        For each page, one thread per post URL runs
        ``get_img_src_and_save``; all threads are joined before
        ``finish_check`` is called and the next page is requested.
        After the loop ends, a ``PicSave`` instance built with an empty
        image set is asked to ``re_save_by_log()``.

        NOTE(review): ``finish_check`` is presumably what clears
        ``self.big_flag`` (based on the per-page ``self.flag`` results) --
        confirm against its definition.
        """
        page_offset = 0
        while self.big_flag:
            page_offset += 1
            # The first listing page is the bare URL; later pages use ?page=N.
            if page_offset == 1:
                page_url = self.url
            else:
                page_url = self.url + "?page=" + str(page_offset)
            logging.info(page_url)
            # Fresh per-page result list; each worker appends one bool to it.
            self.flag = []

            post_url_set = get_post_url(page_url)

            workers = [
                threading.Thread(target=self.get_img_src_and_save, args=(post_url,))
                for post_url in post_url_set
            ]
            for worker in workers:
                # Use the ``daemon`` attribute: setDaemon() is deprecated
                # since Python 3.10.
                worker.daemon = True
                worker.start()
            for worker in workers:
                worker.join()

            self.finish_check()
            # Short pause before requesting the next listing page.
            time.sleep(0.5)

        # Final pass driven by the log; presumably retries downloads that
        # failed earlier -- verify against PicSave.re_save_by_log.
        re_save_p = PicSave(set(), " ", path)
        re_save_p.re_save_by_log()
Example #2
0
    def get_img_src_and_save(self, tar_url):
        """Extract the image URLs of a single post and hand them to PicSave.

        The post name is derived from ``tar_url`` via the module-level
        pattern ``pat``. If that name is already in ``post_has_try2save``
        the post is skipped and ``False`` is appended to ``self.flag``;
        otherwise the name is recorded and ``True`` is appended.

        The post page is then fetched. Network failures and non-200
        responses are logged and the method returns without saving.
        Image URLs are read from the ``bigimgsrc`` attribute of the
        matching ``<a>`` elements; URLs already present in
        ``img_src_has_try2save`` are not collected again.

        :param tar_url: URL of the post page to process.
        """
        post_name = str(re.sub(pat, r'\1', tar_url))
        if post_name in post_has_try2save:
            # Already handled once; filling in gaps is left for later,
            # so skip it for now.
            logging.info("done" + post_name)
            self.flag.append(False)
            return
        post_has_try2save.add(post_name)
        self.flag.append(True)

        # Fetch the post page. A timeout keeps a stalled connection from
        # blocking this worker (and the join() in the caller) forever, and
        # network errors are logged instead of killing the thread.
        try:
            r = requests.get(tar_url, timeout=30)
        except requests.RequestException as exc:
            logging.error("打不开:" + tar_url + " " + str(exc))
            return
        if r.status_code != 200:
            logging.error("打不开:" + tar_url + str(r.status_code))
            return

        # Collect the image URLs from the post that have not been seen yet.
        temp_set_of_img_src = set()
        for item in pq(r.content)("div")(x_path_post[2])("a"):
            img_src_item = pq(item).attr("bigimgsrc")
            if not img_src_item:
                # Anchor without a bigimgsrc attribute: attr() yields None,
                # which must not be queued as a download URL.
                continue
            if img_src_item not in img_src_has_try2save:
                temp_set_of_img_src.add(img_src_item)
                img_src_has_try2save.add(img_src_item)

        new_save_p = PicSave(temp_set_of_img_src, post_name, path)
        new_save_p.start_save()