def start(self):
    """Walk the paginated list pages and process every post found on them.

    For each page, one thread per post URL runs
    ``get_img_src_and_save``; all threads are joined before
    ``finish_check`` is called and the next page is requested. The loop
    runs for as long as ``self.big_flag`` is truthy. After the loop, a
    final ``PicSave.re_save_by_log`` pass is run.

    NOTE(review): the original source was collapsed onto one line; the
    placement of the 0.5 s sleep (inside the loop) and of the re-save
    pass (after the loop) is reconstructed from context -- confirm.
    """
    page_offset = 0
    while self.big_flag:
        page_offset += 1
        # The first page is the bare URL; later pages carry "?page=N".
        if page_offset == 1:
            page_url = self.url
        else:
            page_url = self.url + "?page=" + str(page_offset)
        logging.info(page_url)

        # Reset the per-page results; each worker appends exactly one bool.
        self.flag = []
        workers = [
            threading.Thread(target=self.get_img_src_and_save, args=(post_url,))
            for post_url in get_post_url(page_url)
        ]
        for worker in workers:
            # Thread.setDaemon() is deprecated; the attribute is equivalent.
            worker.daemon = True
            worker.start()
        for worker in workers:
            worker.join()

        # Presumably inspects self.flag and updates self.big_flag to decide
        # whether another page is needed -- confirm against finish_check.
        self.finish_check()
        time.sleep(0.5)

    # Final pass over the save log (presumably retries failed downloads;
    # verify against PicSave.re_save_by_log).
    re_save_p = PicSave(set(), " ", path)
    re_save_p.re_save_by_log()
def get_img_src_and_save(self, tar_url, timeout=30):
    """Fetch one post page, collect its new image URLs and save them.

    Appends exactly one entry to ``self.flag``: ``False`` when the post
    name is already in ``post_has_try2save`` (the post is skipped),
    ``True`` otherwise.

    Args:
        tar_url: URL of the post page.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            a stalled connection would block this worker forever, and
            with it the caller that joins all workers.
    """
    post_name = str(re.sub(pat, r'\1', tar_url))
    if post_name in post_has_try2save:
        # Already processed; filling in missing images is left for later,
        # so skip it for now.
        logging.info("done" + post_name)
        self.flag.append(False)
        return
    post_has_try2save.add(post_name)
    self.flag.append(True)

    # Fetch the post page; a network failure is logged like a bad status
    # instead of killing the worker thread with an unhandled exception.
    try:
        r = requests.get(tar_url, timeout=timeout)
    except requests.RequestException as exc:
        logging.error("打不开:" + tar_url + str(exc))
        return
    if r.status_code != 200:
        logging.error("打不开:" + tar_url + str(r.status_code))
        return

    # Extract the image URLs from the post page.
    temp_set_of_img_src = set()
    for item in pq(r.content)("div")(x_path_post[2])("a"):
        img_src_item = pq(item).attr("bigimgsrc")
        if not img_src_item:
            # Anchor without a "bigimgsrc" attribute: nothing to download.
            continue
        if img_src_item not in img_src_has_try2save:
            temp_set_of_img_src.add(img_src_item)
            img_src_has_try2save.add(img_src_item)

    new_save_p = PicSave(temp_set_of_img_src, post_name, path)
    new_save_p.start_save()