def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print("队列没有数据")
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                mkdir(title)
                os.chdir("D:\\mzitu\\" + title)
                max_span = BeautifulSoup(req, "lxml").find("div", class_="pagenavi").find_all("span")[-2].get_text()
                for page in range(1, int(max_span) + 1):
                    page_url = url + "/" + str(page)
                    img_url = BeautifulSoup(request.get(page_url).text, "lxml").find("div", class_="main-image").find("img")["src"]
                    img_urls.append(img_url)
                    save(img_url)
                crawl_queue.complete(url)

    def save(img_url):
        name = img_url[-9:-4]
        print(u"开始保存:", img_url)
        img = request.get(img_url)
        f = open(name + ".jpg", "ab")
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("D:\\mzitu\\", path))
        if not isExists:
            print(u"建了一个名字叫做", path, u"的文件夹!")
            os.makedirs(os.path.join("D:\\mzitu\\", path))
            return True
        else:
            print(u"已存在名字叫做", path)
            return False

    threads = []
    while threads or crawl_queue:
        """
        crawl_queue is used here directly, which is what our __bool__ method is for:
        it is truthy while the MongoDB queue still holds records. As long as either
        threads or crawl_queue is truthy the download is not finished, so the loop keeps running.
        """
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():  # spawn only while below the limit AND work remains
            thread = threading.Thread(target=pageurl_crawler)
            thread.setDaemon(True)  # daemon thread: dies with the main program
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
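# All of the crawlers in this collection depend on MogoQueue from mongodb_queue.py,
# whose interface is only implied by the calls above (push, pop, pop_title, peek,
# complete, clear, and a truthy __bool__ while work remains). Below is a minimal
# sketch of that interface, assuming a local MongoDB instance and the current pymongo
# API; the field names and status values are guesses, not the original implementation.
# Some variants also use a three-argument push, a tuple-returning pop and pop_title_,
# which would need small extensions of this sketch.
from datetime import datetime, timedelta

from pymongo import MongoClient, errors


class MogoQueue(object):
    OUTSTANDING = 1   # waiting to be crawled
    PROCESSING = 2    # claimed by a worker
    COMPLETE = 3      # finished

    def __init__(self, db, collection, timeout=300):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client[db][collection]
        self.timeout = timeout

    def __bool__(self):
        # Truthy while any record is not COMPLETE, so `while crawl_queue:` keeps looping.
        return self.db.find_one({'status': {'$ne': self.COMPLETE}}) is not None

    def push(self, url, title):
        try:
            self.db.insert_one({'_id': url, 'status': self.OUTSTANDING, 'title': title})
        except errors.DuplicateKeyError:
            pass  # the URL is already queued

    def push_imgurl(self, title, urls):
        try:
            self.db.insert_one({'_id': title, 'status': self.OUTSTANDING, 'url': urls})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        # Atomically claim one OUTSTANDING record; raise KeyError when none is left.
        record = self.db.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record:
            return record['_id']
        self.repair()
        raise KeyError

    def pop_title(self, url):
        return self.db.find_one({'_id': url})['title']

    def peek(self):
        record = self.db.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.db.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def repair(self):
        # Re-queue records that have been PROCESSING for longer than the timeout.
        self.db.update_many(
            {'status': self.PROCESSING,
             'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)}},
            {'$set': {'status': self.OUTSTANDING}})

    def clear(self):
        self.db.drop()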
def startDown(url, rule, num, start, decoding=None):
    if not decoding:
        decoding = 'utf8'
    # req = urllib.request.urlopen(url)
    # response = request.get(url, 3)
    # body = response.text
    # req.read().decode(decoding)
    print('file=' + url)
    f = open(url)
    body = f.read()
    f.close()
    debs = body.split('\n')
    # rule = re.compile(rule)
    # debs = rule.findall(body)
    crawl_queue = MogoQueue('cetc15-apt', 'crawl_queue')
    # crawl_queue.clear()  # CCCCC
    for l in debs:
        l = l.strip()
        if (len(l) == 0 or not l.startswith(PREFIX_STR)):
            continue
        print('deb:' + l[start:])
        # TODO: get the sha1
        crawl_queue.push(l[start:], 'a')
    for i in range(num):
        d = download(crawl_queue)
        d.start()
def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('name', 'crawl_queue')
    # img_queue = MogoQueue('name', 'img_queue')

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print url
            except KeyError:
                print '队列没有数据'
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                mkdir(title)
                os.chdir('D:\\mzitu\\' + title)
                max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
                for page in range(1, int(max_span) + 1):
                    page_url = url + '/' + str(page)
                    img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', class_='main-image').find('img')['src']
                    img_urls.append(img_url)
                    save(img_url)
                crawl_queue.complete(url)
                # img_queue.push_imgurl(title, img_urls)
                # print '插入数据库成功'

    def save(img_url):
        name = img_url[-9:-4]
        print u'开始保存:', img_url
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join('D:\\mzitu', path))
        if not isExists:
            print u'建立一个名字叫做', path, u'的文件夹!'
            os.makedirs(os.path.join('D:\\mzitu', path))
            return True
        else:
            print u'名字叫做', path, u'的文件夹已经存在了'
            return False

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():  # is_alive tells whether the thread is still running; finished threads are removed from the pool
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():  # spawn only while below the limit AND the queue still has work
            thread = threading.Thread(target=pageurl_crawler)  # create the worker thread
            thread.setDaemon(True)  # daemon thread
            thread.start()  # start it
            threads.append(thread)  # add it to the pool
        time.sleep(SLEEP_TIME)
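# The worker-pool loop at the bottom of each crawler is the same pattern throughout:
# prune dead threads, then spawn daemon workers while the pool is below max_threads
# AND the queue still has work (several copies originally used `or` here, which lets
# the pool grow without bound). A self-contained sketch of the intended pattern, with
# a plain queue.Queue standing in for the MongoDB-backed MogoQueue:
import queue
import threading
import time

SLEEP_TIME = 1  # pause between pool checks (the value used by the original files is not shown)


def run_pool(work_queue, worker, max_threads=10):
    threads = []
    while threads or not work_queue.empty():
        threads = [t for t in threads if t.is_alive()]  # drop finished workers
        while len(threads) < max_threads and not work_queue.empty():
            t = threading.Thread(target=worker, daemon=True)  # daemon: dies with the main thread
            t.start()
            threads.append(t)
        time.sleep(SLEEP_TIME)


if __name__ == '__main__':
    q = queue.Queue()
    for i in range(20):
        q.put('task-%d' % i)

    def worker():
        while True:
            try:
                item = q.get_nowait()
            except queue.Empty:
                break
            print('processing', item)

    run_pool(q, worker, max_threads=4)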
def E_Hen_crawler(max_threads=5):
    img_queue = MogoQueue('meinv', 'img_queue')

    def pageurl_crawler():
        while True:
            try:
                (url, name) = img_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                title = img_queue.pop_title_(url)
                path = str(title).replace('?', '')
                mkdir(path)
                os.chdir('C:\\Data\\' + path)
                html = request.get(url, 3)
                html_soup = BeautifulSoup(html.text, 'lxml')
                img = html_soup.find('div', id='i3').find('img')
                img_url = img['src']
                print(u'得到图片的链接')
                save(img_url, name)
                img_queue.complete(url)

    def save(img_url, page_name):
        name = page_name
        print(u'开始保存:', img_url, '\n')
        img = request.get(img_url, 15)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("C:\\Data", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("C:\\Data", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or img_queue:
        for thread in threads:
            if not thread.is_alive():  # drop threads that have finished from the pool
                threads.remove(thread)
        while len(threads) < max_threads and img_queue.peek():  # spawn only while below the limit AND the queue still has work
            thread = threading.Thread(target=pageurl_crawler)  # create the worker thread
            thread.setDaemon(True)  # daemon thread
            thread.start()  # start it
            threads.append(thread)  # add it to the pool
        time.sleep(SLEEP_TIME)
def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    title = Soup.find('div', class_='gm').find('h1', id='gj').get_text()
    spider_queue = MogoQueue('meinv', 'img_queue')
    spider_queue.clear()
    print(u'清除集合img_queue')
    spider_queue = MogoQueue('meinv', 'img_queue')
    print(u'新建集合img_queue')
    max_span = Soup.find('table', class_='ptt').find_all('td')[-2].get_text()
    page_url = url
    for i in range(1, int(max_span) + 1):
        html = request.get(page_url, 3)
        Soup = BeautifulSoup(html.text, 'lxml')
        all_a = Soup.find('div', id='gdt').find_all('a')
        for a in all_a:
            href = a['href']
            name = a.img['alt']
            spider_queue.push(href, title, name)
        page_url = url + '?p=' + str(i)
# -*- coding: utf-8 -*-
# import urllib2
# import urllib
from mongodb_queue import MogoQueue
from Download import request
from bs4 import BeautifulSoup

'''
def getHtml():
    page = urllib.urlopen("http://www.chinalawedu.com/falvfagui/")
    html = page.read()
    reg = r'class="fenlei_txt"'
    soup = BeautifulSoup(html, "lxml")
'''

law_queue = MogoQueue('falvfagui', 'title_queue')


def start(url):
    response = request.get(url, 3)
    soup = BeautifulSoup(response.text, 'html.parser')
    # print soup.prettify().encode('utf-8')
    all_div = soup.find_all('div', class_="fenlei_txt")
    # law_queue.push('lianjie', 'ok')
    for div in all_div:
        # print div.prettify()
        all_a = div.find_all('a')
        for a in all_a:
            title = a.get_text()
            url = a['href']
# encoding=utf-8
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('name', 'crawl_queue')


def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    all_a = Soup.find('div', class_='all').find_all('a')
    for a in all_a:
        title = a.get_text()
        url = a['href']
        spider_queue.push(url, title)


if __name__ == '__main__':
    start('http://www.mzitu.com/all')
from downloader import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue("meinvxiezhenji", "crawl_queue")


def start(url):
    response = request.get(url)
    Soup = BeautifulSoup(response.text, "lxml")
    all_a = Soup.find("div", class_="all").find("ul").find_all("a")
    for a in all_a:
        title = a.get_text()
        url = a["href"]
        spider_queue.push(url, title)
    """The push call above is what writes each URL into the MongoDB queue."""


if __name__ == "__main__":
    start("http://www.mzitu.com/all")
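# The seed scripts and crawlers import `request` from Download.py (or downloader.py),
# which is not shown in this collection. Its implied interface is request.get(url, timeout)
# returning a requests.Response, plus an optional referer keyword used by one mzitu
# variant below. A minimal sketch under those assumptions; the retry count, headers and
# back-off here are guesses, not the original implementation:
import random
import time

import requests


class download(object):
    """Thin wrapper around requests: browser User-Agent, optional Referer, simple retries."""

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    ]

    def get(self, url, timeout=3, referer=None, num_retries=3):
        headers = {'User-Agent': random.choice(self.user_agents)}
        if referer:
            headers['Referer'] = referer  # mzitu rejects hotlinked image requests without a Referer
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            if num_retries > 0:
                time.sleep(2)  # brief back-off before retrying
                return self.get(url, timeout, referer, num_retries - 1)
            raise


request = download()  # the module-level instance the other files import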
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('picture', 'jinji')


def start(url):
    response = request.get(url, 3)
    response.encoding = response.apparent_encoding
    Soup = BeautifulSoup(response.text, 'lxml')
    all_a = Soup.find_all('fieldset', id='info')[1].find_all('a')
    for a in all_a:
        title = a.get_text().strip()
        url = 'http://www.cartoonmad.com' + a['href']
        print(title, url)
        spider_queue.push(url, title, 1)


if __name__ == "__main__":
    start('http://www.cartoonmad.com/comic/1221.html')
def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')  # queue of gallery URLs to crawl
    img_queue = MogoQueue('meinvxiezhenji', 'img_queue')      # queue of the actual image URLs
    L = threading.Lock()  # lock shared by the worker threads (presumably module-level in the original file; defined here so the snippet runs)

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                L.acquire()
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                path = str(title).replace('?', '')  # a title with a question mark turned up during testing
                path = re.sub(r'[?\\*|“<>:/]', '', str(path))
                mkdir(path)
                os.chdir('E:\\图片\\mzitu\\' + path)
                max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
                for page in range(1, int(max_span) + 1):
                    page_url = url + '/' + str(page)
                    img_html = request.get(page_url, 3)  # changed a bit here (notice that `self` is gone?)
                    img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
                    name = BeautifulSoup(img_html.text, 'lxml').find('h2', class_='main-title').get_text()
                    name = re.sub(r'[?\\*|“<>:/]', '', str(name))
                    img_urls.append(img_url)
                    print(u'开始保存:', img_url, name)
                    img = request.get(img_url, 3, referer=page_url)
                    f = open(name + '.jpg', 'ab')
                    f.write(img.content)
                    f.close()
                crawl_queue.complete(url)  # mark the gallery URL as complete
                img_queue.push_imgurl(title, img_urls)
                print('插入数据库成功')
                L.release()

    def mkdir(path):
        path = path.strip()
        path = re.sub(r'[?\\*|“<>:/]', '', str(path))
        isExists = os.path.exists(os.path.join("E:\\图片\\mzitu", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("E:\\图片\\mzitu", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        crawl_queue is used directly here via MogoQueue.__bool__: it stays truthy while
        the MongoDB queue still holds records. As long as either threads or crawl_queue
        is truthy the download is not finished and the loop keeps running.
        """
        for thread in threads:
            if not thread.is_alive():  # drop threads that have finished from the pool
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():  # spawn only while below the limit AND work remains
            thread = threading.Thread(target=pageurl_crawler)  # create the worker thread
            thread.setDaemon(True)  # daemon thread
            thread.start()  # start it
            threads.append(thread)  # add it to the pool
        time.sleep(SLEEP_TIME)
def mypics_crawler(max_threads=10):
    crawl_queue = MogoQueue('mypics_db', 'crawl_queue')  # queue of page URLs to crawl
    img_queue = MogoQueue('mypics_db', 'img_queue')      # queue of the actual image URLs
    lock = threading.Lock()  # lock shared by the worker threads (presumably module-level in the original file; defined here so the snippet runs)

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                path = str(title).replace('?', '')  # a title with a question mark turned up during testing
                mkdir(path)
                os.chdir('D:\\my_pics\\' + path)
                # find the highest picture page number
                max_span = BeautifulSoup(req, 'lxml').find('div', class_='pages').find_all('a')[0].get_text()
                max_span = re.findall(r"\d+\.?\d*", max_span)
                print(max_span)
                # save every picture
                for page in range(2, int(max_span[0]) + 1):  # numbering starts at 2; the first page has no number, so skip it
                    page_url = url.rstrip('.html') + '_' + str(page) + '.html'  # URL of each picture page
                    print('图片网址:', page_url)
                    # image address on that page
                    img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', class_='big-pic').find('img')['src']
                    img_urls.append(img_url)
                    lock.acquire()
                    os.chdir('D:\\my_pics\\' + path)
                    save(img_url)
                    lock.release()
                crawl_queue.complete(url)  # mark the URL as complete
                img_queue.push_imgurl(title, img_urls)
                print('合集图片urls插入数据库成功')

    def save(img_url):
        name = img_url[-9:-4]
        print(u'开始保存:', img_url)
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("D:\\my_pics", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("D:\\my_pics", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        crawl_queue is used directly here via MogoQueue.__bool__: it stays truthy while
        the MongoDB queue still holds records; while either threads or crawl_queue is
        truthy the download is not finished and the loop keeps running.
        """
        for thread in threads:
            if not thread.is_alive():  # drop threads that have finished from the pool
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():  # spawn only while below the limit AND work remains
            thread = threading.Thread(target=pageurl_crawler)  # create the worker thread
            thread.setDaemon(True)  # daemon thread
            thread.start()  # start it
            threads.append(thread)  # add it to the pool
        time.sleep(SLEEP_TIME)
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('mypics_db', 'crawl_queue')


def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    all_div = Soup.find_all('div', class_='item masonry_brick masonry-brick')
    for div in all_div:
        a = div.find_all('a')
        title = a[0]['href'][-10:-1]  # a[1].get_text()
        url = a[0]['href']
        spider_queue.push(url, title)
    """The push call above is what writes each URL into the MongoDB queue."""


if __name__ == "__main__":
    start('http://www.mmonly.cc/gqbz/mnbz/')
    # spider_queue.clear()
    """This part needs no explanation; it really is that simple."""
def E_Hen_crawler(max_threads=4):
    jinji = MogoQueue('meinv', 'jinji')
    lock = threading.Lock()  # lock shared by the worker threads (presumably module-level in the original file; defined here so the snippet runs)

    def pageurl_crawler():
        while True:
            try:
                (url, name) = jinji.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                lock.acquire()
                img_urls = []
                html = request.get(url, 10)
                title = jinji.pop_title_(url)
                mkdir(title)
                os.chdir('E:\\E-Hen\\' + title)
                html.encoding = html.apparent_encoding
                html_soup = BeautifulSoup(html.text, 'lxml')
                max_span = html_soup.find_all('table')[1].find_all('option')[-1].get_text()[2:4]
                for page in range(1, int(max_span) + 1):
                    if page < 10:
                        page_url = url[:-6] + str(page) + '.html'
                    else:
                        page_url = url[:-7] + str(page) + '.html'
                    page_html = request.get(page_url, 10)
                    page_html.encoding = page_html.apparent_encoding
                    pattern = re.compile('<a href=".*?"><img src="(.*?)" border="0".*?oncontextmenu=.*?')
                    img_url = re.findall(pattern, page_html.text)[0]
                    img_urls.append(img_url)
                    print(u'得到图片的链接')
                    save(img_url)
                jinji.complete(url)
                lock.release()

    def save(img_url):
        name = img_url[-7:]
        print(u'开始保存:', img_url, '\n')
        img = request.get(img_url, 15)
        f = open(name, 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\\E-Hen", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("E:\\E-Hen", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or jinji:
        for thread in threads:
            if not thread.is_alive():  # drop threads that have finished from the pool
                threads.remove(thread)
        while len(threads) < max_threads and jinji.peek():  # spawn only while below the limit AND the queue still has work
            thread = threading.Thread(target=pageurl_crawler)  # create the worker thread
            thread.setDaemon(True)  # daemon thread
            thread.start()  # start it
            threads.append(thread)  # add it to the pool
        time.sleep(SLEEP_TIME)
def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')
    lock = threading.Lock()  # shared lock handed to each worker (the original snippet never created or passed one)

    def pageurl_crawl(lock):
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                img_urls = []
                req = request.get(url, 3).text
                title = crawl_queue.pop_title(url)
                mkdir(title)
                with lock:
                    os.chdir('F:\\mzitu\\' + title)
                    max_page = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
                    for page in range(1, int(max_page) + 1):
                        page_url = url + '/' + str(page)
                        img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', {'class': 'main-image'}).find('img')['src']
                        img_urls.append(img_url)
                        save(img_url)
                crawl_queue.complete(url)  # mark the URL as complete, as the other crawler variants do, so the queue can drain

    def save(img_url):
        name = img_url[-9:-4]
        print(u'开始保存:', img_url)
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join('F:\\mzitu', path))
        if not isExists:
            print(u'创建一个名为', path, u'的文件夹!')
            os.makedirs(os.path.join('F:\\mzitu', path))
            return True
        else:
            print(u'文件夹已经存在!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        crawl_queue is used here via __bool__: truthy means the MongoDB queue still has data.
        While either threads or crawl_queue is truthy the download is unfinished and the loop keeps running.
        """
        for thread in threads:
            # is_alive tells whether the thread is still running; finished threads are removed
            if not thread.is_alive():
                threads.remove(thread)
        # spawn only while the pool is below max_threads AND crawl_queue still has work
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=pageurl_crawl, args=(lock,))  # create the worker thread and hand it the lock
            thread.setDaemon(True)  # daemon thread
            thread.start()  # start it
            threads.append(thread)  # add it to the pool
        time.sleep(SLEEP_TIME)
def E_Hen_crawler(max_threads=5):
    img_queue = MogoQueue('meinv', 'xiumm')
    s = 'http://www.xiumm.org/'  # site prefix; defined at module level in the matching seed script
    lock = threading.Lock()  # lock shared by the worker threads (presumably module-level in the original file; defined here so the snippet runs)

    def pageurl_crawler():
        while True:
            try:
                (url, name) = img_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                lock.acquire()
                title = img_queue.pop_title_(url)
                path = str(title).replace('?', '')
                mkdir(path)
                os.chdir('E:\\E-Hen\\' + path)
                response = request.get(url, 3)
                response.encoding = 'utf-8'
                Soup = BeautifulSoup(response.text, 'lxml')
                all_url = Soup.find('div', class_='gallary_wrap').find_all('td')
                max_span = Soup.find('div', class_='paginator').find_all('a')
                for td in all_url:
                    href = s + td.img['src']
                    name = td.img['alt'].strip()[-3:]
                    save(href, name)
                for page in max_span:
                    page_url = s + page['href']
                    html = request.get(page_url, 3)
                    Soup = BeautifulSoup(html.text, 'lxml')
                    all_td = Soup.find('div', class_='gallary_wrap').find_all('td')
                    for td2 in all_td:
                        href2 = s + td2.img['src']
                        name2 = td2.img['alt'].strip()[-3:]
                        save(href2, name2)
                img_queue.complete(url)
                lock.release()

    def save(img_url, page_name):
        name = page_name
        print(u'开始保存:', img_url, '\n')
        img = request.get(img_url, 15)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\\E-Hen", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("E:\\E-Hen", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or img_queue:
        for thread in threads:
            if not thread.is_alive():  # drop threads that have finished from the pool
                threads.remove(thread)
        while len(threads) < max_threads and img_queue.peek():  # spawn only while below the limit AND the queue still has work
            thread = threading.Thread(target=pageurl_crawler)  # create the worker thread
            thread.setDaemon(True)  # daemon thread
            thread.start()  # start it
            threads.append(thread)  # add it to the pool
        time.sleep(SLEEP_TIME)
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import os
from mongodb_queue import MogoQueue

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
}
start_url = input('请输入您要获取的漫画的首地址:')
P_url = start_url.rsplit('/', 1)[0] + '/'  # URL prefix

spider_queue = MogoQueue('acg', 'sex_cool')


def mkdir(title):
    isExists = os.path.exists(os.path.join("/home/virgil/图片/adult_only", title))
    if not isExists:
        print(u'建立了一个名字叫做', title, u'的文件夹')
        os.makedirs(os.path.join("/home/virgil/图片/adult_only", title))
        os.chdir("/home/virgil/图片/adult_only/" + title)
        return True
    else:
        print(u'名字叫做', title, u'的文件夹已经存在了!')
        os.chdir("/home/virgil/图片/adult_only/" + title)
        return False


def acg():
    global headers
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('meinv', 'xiumm')
# spider_queue.clear()
# print(u'清除集合xiumm')
# spider_queue = MogoQueue('meinv', 'xiumm')
# print(u'新建集合xiumm')
s = 'http://www.xiumm.org/'
i = 1


def start(url):
    global i  # the page counter is module-level; declare it so the assignment below does not shadow it
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    max_span = Soup.find('div', class_='paginator').find_all('a')
    page_url = s
    for page in max_span[0:1]:
        html = request.get(page_url, 3)
        html.encoding = 'utf-8'
        Soup = BeautifulSoup(html.text, 'lxml')
        all_td = Soup.find('div', class_='gallary_wrap').find_all('td')
        for td in all_td:
            address = td.a['href']
            title = td.a.img['alt']
            spider_queue.push(address, title, i)
        page_url = s + page['href']
        i = i + 1

# def get_url(address):
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')


def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    all_a = Soup.find('div', class_='all').find_all('a')
    for a in all_a[1:]:
        title = a.get_text()
        url = a['href']
        spider_queue.push(url, title)
    """The push call above is what writes each URL into the MongoDB queue."""


if __name__ == "__main__":
    start('http://www.mzitu.com/all')
from Download import request
from mongodb_queue import MogoQueue
from bs4 import BeautifulSoup

spider_queue = MogoQueue('meinv', 'crawl_queue')


def start(url):
    response = request.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    title = Soup.find('div', class_='gm').find('h1', id='gj').get_text()
    max_span = Soup.find('table', class_='ptt').find_all('td')
    i = 1
    for page in max_span[1:-1]:
        page_url = page.a['href']
        i += 1
        spider_queue.push(page_url, title)


if __name__ == "__main__":
    start('https://e-hentai.org/g/358581/e6db8cb4b9/')
def mzitu_crawler(max_threads=10):
    crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')  # MogoQueue (from the mongodb_queue module); this is the queue of gallery URLs
    img_queue = MogoQueue('meinvxiezhenji', 'img_queue')      # queue of the actual image URLs
    lock = threading.Lock()  # one lock shared by all worker threads

    # worker: crawl the picture pages of each gallery pulled from the queue
    def pageurl_crawler():
        while True:
            """
            try...except...else (used to catch exceptions) works like this:
            try:
                <statements>          # run some code
            except <name>:
                <statements>          # runs if the 'name' exception was raised inside try
            except <name> as <data>:
                <statements>          # runs if 'name' was raised, with the exception bound to <data>
            else:
                <statements>          # runs if no exception occurred
            """
            try:
                url = crawl_queue.pop()  # MogoQueue.pop: fetch a waiting record from the queue and mark it as being processed
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                lock.acquire()
                img_urls = []  # list that will collect the image addresses
                req = request.get(url, 3).text  # request the gallery page
                max_page = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()  # highest page number
                title = crawl_queue.pop_title(url)  # fetch the gallery title
                mkdir(title)  # create a folder named after the title (mkdir is defined below)
                os.chdir('F:\\image\\mzitu\\' + title)  # switch into that folder
                for page in range(1, int(max_page) + 1):
                    page_url = url + '/' + str(page)  # build the URL of every picture page in the gallery
                    img_dict = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', {'class': 'main-image'}).find('img')
                    # print(img_dict)
                    if img_dict is not None:
                        img_url = img_dict['src']  # real address of the image, used for the download below
                    else:
                        print(u'没有获取到img_url*******************')
                        continue  # assumed fix: skip pages without an image so img_url is never used unassigned
                    img_url_reg = re.compile(r'http://.*?\.jpg', re.S)  # regex that accepts only valid image links
                    if re.match(img_url_reg, img_url):
                        img_urls.append(img_url)  # keep valid image addresses
                    else:
                        print(u'图片不是有效地链接地址!!!!!!!!!!!!!!')
                    save(img_url)  # save the image (save is defined below)
                lock.release()  # release the lock
                crawl_queue.complete(url)  # mark the gallery URL as complete
                img_queue.push_imgurl(title, img_urls)  # push the collected image addresses into the image queue
                print('插入数据库成功')

    def save(img_url):
        name = img_url[-9:-4]  # derive the file name from the image URL
        print(u'开始保存:', img_url)
        img = request.get(img_url, 3)  # request the image itself
        f = open(name + '.jpg', 'ab')  # create the jpg file in binary append mode
        f.write(img.content)  # write the bytes
        f.close()  # close the file

    def mkdir(path):
        path = path.strip()  # strip whitespace
        isExists = os.path.exists(os.path.join('F:\\image\\mzitu', path))  # check whether the path already exists
        if not isExists:
            print(u'创建一个名为', path, u'的文件夹!')
            os.makedirs(os.path.join('F:\\image\\mzitu', path))  # create the folder
            return True
        else:
            print(u'文件夹已经存在!')
            return False

    threads = []  # thread pool
    while threads or crawl_queue:
        """
        crawl_queue is used here via __bool__: truthy means the MongoDB queue still has data.
        While either threads or crawl_queue is truthy the download is unfinished and the loop keeps running.
        """
        for thread in threads:
            # is_alive tells whether the thread is still running; dead threads are removed from the pool
            if not thread.is_alive():
                threads.remove(thread)
        # spawn while the pool is below max_threads AND crawl_queue still holds OUTSTANDING records
        # (peek returns the _id / URL of an OUTSTANDING record, if any)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=pageurl_crawler)  # create a worker running pageurl_crawler
            thread.setDaemon(True)  # daemon thread
            thread.start()  # start it
            threads.append(thread)  # add it to the pool
        time.sleep(SLEEP_TIME)
def E_Hen_crawler(max_threads=5):
    crawl_queue = MogoQueue('meinv', 'crawl_queue')  # queue of gallery page URLs to crawl
    img_queue = MogoQueue('meinv', 'img_queue')

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.pop()
                print(url)
            except KeyError:
                print('队列没有数据')
                break
            else:
                title = crawl_queue.pop_title(url)
                path = str(title).replace('?', '')  # a title with a question mark turned up during testing
                mkdir(path)
                os.chdir('E:\\Cover\\' + path)
                html = request.get(url, 3)
                Soup = BeautifulSoup(html.text, 'lxml')
                all_a = Soup.find('div', id='gdt').find_all('a')
                for a in all_a:
                    href = a['href']
                    name = a.img['alt']
                    html = request.get(href, 3)
                    html_soup = BeautifulSoup(html.text, 'lxml')
                    img = html_soup.find('div', id='i3').find('img')
                    img_url = img['src']
                    img_queue.push_imgurl(title, img_url)
                    print(u'得到图片的链接')
                    save(img_url, name)
                    img_queue.complete(img_url)
                crawl_queue.complete(url)

    def save(img_url, page_name):
        name = page_name
        print(u'开始保存:', img_url, '\n')
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'ab')
        f.write(img.content)
        f.close()

    def mkdir(path):
        path = path.strip()
        isExists = os.path.exists(os.path.join("E:\\Cover", path))
        if not isExists:
            print(u'建了一个名字叫做', path, u'的文件夹!')
            os.makedirs(os.path.join("E:\\Cover", path))
            return True
        else:
            print(u'名字叫做', path, u'的文件夹已经存在了!')
            return False

    threads = []
    while threads or crawl_queue:
        """
        crawl_queue is used here via MogoQueue.__bool__: it stays truthy while the MongoDB
        queue still holds records; while either threads or crawl_queue is truthy the
        download is unfinished and the loop keeps running.
        """
        for thread in threads:
            if not thread.is_alive():  # drop threads that have finished from the pool
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():  # spawn only while below the limit AND the queue still has work
            thread = threading.Thread(target=pageurl_crawler)  # create the worker thread
            thread.setDaemon(True)  # daemon thread
            thread.start()  # start it
            threads.append(thread)  # add it to the pool
        time.sleep(SLEEP_TIME)
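# All of the crawlers above share the same two-stage pipeline: a seed script fills a
# MongoDB collection with URLs, then a crawler function drains it with a pool of daemon
# threads. A hypothetical driver for the mzitu pair, assuming the seed script is saved
# as spider.py and the threaded crawler as mzitu_threading.py (both module names are
# illustrative, not taken from the original files):
import spider            # defines start(url) and pushes gallery URLs into 'crawl_queue'
import mzitu_threading   # defines mzitu_crawler(max_threads)

if __name__ == '__main__':
    spider.start('http://www.mzitu.com/all')        # stage 1: seed the queue
    mzitu_threading.mzitu_crawler(max_threads=10)   # stage 2: drain it with worker threads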