def save(img_url):
    """Download one image and append its bytes to "<name>.jpg" in the CWD.

    The filename comes from a fixed slice of the URL (characters -9..-4),
    which assumes mzitu-style image URLs — TODO confirm for other hosts.

    :param img_url: direct URL of the image to fetch via ``request.get``.
    """
    name = img_url[-9:-4]
    print(u"开始保存:", img_url)
    img = request.get(img_url)
    # Context manager guarantees the handle is closed even if write() raises;
    # the original open()/close() pair leaked the handle on error.
    with open(name + ".jpg", "ab") as f:
        f.write(img.content)
def start(url):
    """Collect every theme link from the index page at *url* and push each
    (href, title) pair onto the MongoDB-backed spider queue.

    :param url: URL of the "all themes" index page.
    """
    response = request.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    all_a = soup.find("div", class_="all").find("ul").find_all("a")
    for a in all_a:
        title = a.get_text()
        # Bind to a fresh name instead of clobbering the *url* parameter,
        # which the original reassigned on every iteration.
        href = a["href"]
        # push() writes the URL (with its title) into the MongoDB queue.
        spider_queue.push(href, title)
def html(self, href):
    """Walk every numbered page of one theme at *href* and hand each page
    URL to ``self.img`` together with the page count.

    :param href: URL of the theme's first page.
    """
    print(href)
    html = request.get(href)
    # span[10] on the theme page holds the highest page number —
    # NOTE(review): assumes the site layout is stable; confirm the index.
    max_span = BeautifulSoup(html.text, "lxml").find_all("span")[10].get_text()
    for page in range(1, int(max_span) + 1):
        page_url = href + "/" + str(page)
        # *page* already counts 1..max_span, so the original's misspelled
        # manual counter (``paeg_num``) was redundant and is dropped.
        self.img(page_url, max_span, page)
def pageurl_crawler():
    """Consume theme URLs from the crawl queue forever.

    For each popped URL: create a directory named after the theme title
    under D:\\mzitu, download every image of every page via ``save``, and
    mark the URL complete in the queue.  Loops until externally stopped.
    """
    while True:
        try:
            url = crawl_queue.pop()
            print(url)
        except KeyError:
            # Queue is empty — report and try again.
            print("队列没有数据")
        else:
            img_urls = []
            req = request.get(url, 3).text
            title = crawl_queue.pop_title(url)
            mkdir(title)
            # Raw string + os.path.join replaces the original's reliance on
            # the invalid "\m" escape being passed through unchanged.
            os.chdir(os.path.join(r"D:\mzitu", title))
            # The second-to-last pagination span holds the last page number.
            max_span = BeautifulSoup(req, "lxml").find(
                "div", class_="pagenavi").find_all("span")[-2].get_text()
            for page in range(1, int(max_span) + 1):
                page_url = url + "/" + str(page)
                img_url = BeautifulSoup(request.get(page_url).text, "lxml").find(
                    "div", class_="main-image").find("img")["src"]
                img_urls.append(img_url)
                save(img_url)
            crawl_queue.complete(url)
def img(self, page_url, max_span, page_num):
    """Extract the image URL from one theme page, save the image, and —
    on the final page — persist the whole theme record to MongoDB.

    :param page_url: URL of one numbered page inside the theme.
    :param max_span: total number of pages (string or int).
    :param page_num: 1-based index of this page.
    """
    img_html = request.get(page_url)
    img_url = BeautifulSoup(img_html.text, "lxml").find(
        "div", class_="main-image").find("img")["src"]
    self.img_urls.append(img_url)
    # The original duplicated this call in both branches of the if below;
    # saving once up front is identical in order and effect.
    self.save(img_url)
    if int(max_span) == page_num:
        # Last page of the theme: record everything gathered so far.
        post = {
            "标题": self.title,
            "主题页面": self.url,
            "图片地址": self.img_urls,
            "获取时间": datetime.datetime.now(),
        }
        self.meizitu_collection.save(post)
def all_url(self, url):
    """Iterate every theme link on the index page at *url*, create a
    per-theme folder under D:\\mzitu, and crawl each theme that is not
    already recorded in MongoDB.

    :param url: URL of the "all themes" index page.
    """
    html = request.get(url)
    all_a = BeautifulSoup(html.text, "lxml").find(
        "div", class_="all").find("ul").find_all("a")
    for a in all_a:
        title = a.get_text()
        print("开始保存:", title)
        # "?" is illegal in Windows directory names — replace it.
        path = str(title).replace("?", "_")
        self.mkdir(path)
        # Raw string + os.path.join instead of the "\m" escape passthrough.
        os.chdir(os.path.join(r"D:\mzitu", path))
        href = a["href"]
        self.url = href
        if self.meizitu_collection.find_one({"主题页面": href}):
            print(u"这个页面已经爬取过了")
        else:
            # The original invoked self.html(href) twice here, crawling
            # each new theme twice; one call suffices.
            self.html(href)