def save(img_url):
    # Derive the file name from a fixed slice of the image URL.
    name = img_url[-9:-4]
    print("Saving:", img_url)
    img = request.get(img_url)  # "request" is the project's downloader module (sketched below), not the requests library
    with open(name + ".jpg", "wb") as f:  # "wb" overwrites; the original "ab" would append to stale partial files
        f.write(img.content)


def start(url):
    response = request.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    all_a = soup.find("div", class_="all").find("ul").find_all("a")
    for a in all_a:
        title = a.get_text()
        href = a["href"]  # renamed so the loop does not shadow the url parameter
        spider_queue.push(href, title)
    # The push() call above writes each URL into the MongoDB-backed queue.
Example #3
def html(self, href):
    print(href)
    html = request.get(href)
    # The 11th <span> on the topic page holds the total page count.
    max_span = BeautifulSoup(html.text, "lxml").find_all("span")[10].get_text()
    page_num = 0
    for page in range(1, int(max_span) + 1):
        page_num += 1
        page_url = href + "/" + str(page)
        self.img(page_url, max_span, page_num)
def pageurl_crawler():
    while True:
        try:
            url = crawl_queue.pop()
            print(url)
        except KeyError:
            print("Queue is empty")
        else:
            img_urls = []
            req = request.get(url, 3).text  # retry up to 3 times
            title = crawl_queue.pop_title(url)
            mkdir(title)
            os.chdir("D:\\mzitu\\" + title)  # backslashes escaped in the Windows path
            max_span = BeautifulSoup(req, "lxml").find("div", class_="pagenavi").find_all("span")[-2].get_text()
            for page in range(1, int(max_span) + 1):
                page_url = url + "/" + str(page)
                img_url = BeautifulSoup(request.get(page_url).text, "lxml").find("div", class_="main-image").find("img")["src"]
                img_urls.append(img_url)
                save(img_url)
            crawl_queue.complete(url)
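Note that request here is not the requests library: it is called both as request.get(url) and request.get(url, 3), so it apparently takes an optional retry count. A minimal sketch of such a downloader built on requests; the class name, timeout, and User-Agent header are assumptions:

import requests

class Download:
    # Hypothetical downloader: GET with a small retry budget.
    def get(self, url, retries=3, timeout=10):
        try:
            return requests.get(url, timeout=timeout,
                                headers={"User-Agent": "Mozilla/5.0"})
        except requests.RequestException:
            if retries > 0:
                return self.get(url, retries - 1, timeout)
            raise

request = Download()  # the snippets above then call request.get(...)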
Example #5
def img(self, page_url, max_span, page_num):
    img_html = request.get(page_url)
    img_url = BeautifulSoup(img_html.text, "lxml").find("div", class_="main-image").find("img")["src"]
    self.img_urls.append(img_url)
    if int(max_span) == page_num:
        # Last page of the set: save the image, then record the whole set in MongoDB.
        self.save(img_url)
        post = {
            "标题": self.title,                   # title
            "主题页面": self.url,                 # topic page URL
            "图片地址": self.img_urls,            # image URLs
            "获取时间": datetime.datetime.now()   # fetch time
        }
        self.meizitu_collection.save(post)
    else:
        self.save(img_url)
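Note that Collection.save() was deprecated in PyMongo 3.0 and removed in 4.0; on a current driver the equivalent write would be something like:

self.meizitu_collection.insert_one(post)  # plain insert, or, to keep save()'s upsert behavior:
# self.meizitu_collection.replace_one({"主题页面": self.url}, post, upsert=True)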
Example #6
def all_url(self, url):
    html = request.get(url)
    all_a = BeautifulSoup(html.text, "lxml").find("div", class_="all").find("ul").find_all("a")
    for a in all_a:
        title = a.get_text()
        print("Saving:", title)
        path = str(title).replace("?", "_")  # replace characters Windows forbids in folder names
        self.mkdir(path)
        os.chdir("D:\\mzitu\\" + path)
        href = a["href"]
        self.url = href
        if self.meizitu_collection.find_one({"主题页面": href}):  # already recorded in MongoDB
            print("This page has already been crawled")
        else:
            self.html(href)
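The mkdir helper (called as mkdir(title) and self.mkdir(path) above) is also not shown in these snippets. A minimal sketch that creates the per-title folder under the hard-coded D:\mzitu base the examples use; the function body is an assumption:

import os

def mkdir(path):
    # Hypothetical helper: create D:\mzitu\<path> if it does not exist yet.
    path = path.strip()
    full = os.path.join("D:\\mzitu", path)
    if not os.path.exists(full):
        print("Creating folder:", full)
        os.makedirs(full)
    return full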