import os
import re
import time
from collections import deque
from urlparse import urljoin

import requests
from bs4 import BeautifulSoup

# ThreadPool, PicFile and DownloadWeb are the project's own helpers;
# sketches of the interfaces they are assumed to expose follow below.


class Crawler(object):
    def __init__(self, threadnum, pathname, limit):
        '''limit sets the number of pictures, pathname the directory they are saved to'''
        super(Crawler, self).__init__()
        self.threadPool = ThreadPool(threadnum)
        self.file = PicFile('imgfile', 'r')
        self.urlqueue = deque()
        self.count = 1
        self._makePath(pathname)
        self.savePath = os.getcwd() + '/' + pathname
        self._getUrl(limit)

    def _makePath(self, pathname):
        '''create the target directory under the current directory if it is missing'''
        if not os.path.isdir(os.getcwd() + '/' + pathname):
            os.mkdir(os.getcwd() + '/' + pathname)

    def _getUrl(self, num):
        '''read URLs from the file into the deque'''
        while len(self.urlqueue) < num:
            self.urlqueue.append(self.file.getData().rstrip('\n'))
        self.file.close()

    def start(self):
        print '---start downloading picture---'
        self.threadPool.startThreads()
        while self.urlqueue:  # keep assigning tasks until the queue is empty
            self.threadPool.putTask(self._handleTask, self.urlqueue.popleft())
        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        print '---end downloading picture---'

    def _handleTask(self, url):
        '''task handler'''
        self._download(url)

    def _download(self, url, retry=2):
        '''download one picture; files are named with ascending numbers'''
        try:
            r = requests.get(url)
            with open(self.savePath + '/' + str(self.count) + '.jpg', 'wb') as jpg:
                jpg.write(r.content)
            self.count += 1  # note: shared by the worker threads without a lock
            print url
        except Exception:
            if retry > 0:
                # pass the counter down: the original reset retry = 2 on every
                # call, so a persistently failing URL would recurse forever
                self._download(url, retry - 1)
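# PicFile is not shown in this post. The following is only a minimal sketch
# of the interface the two classes assume, guessed from the calls made on it
# (getData, saveData, close): one picture URL per line in a plain text file.

class PicFile(object):
    '''sketch of the assumed file wrapper, not the original implementation'''
    def __init__(self, filename, mode):
        self.file = open(filename, mode)

    def getData(self):
        '''return the next stored picture URL, including its trailing newline'''
        return self.file.readline()

    def saveData(self, data):
        '''append one picture URL on its own line'''
        self.file.write(data + '\n')

    def close(self):
        self.file.close()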
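# ThreadPool is also not shown. This sketch assumes a conventional design,
# N daemon worker threads consuming (func, args) tasks from a Queue, which
# matches the calls used above (startThreads, putTask, stopThreads); the
# real implementation may differ.

import threading
import Queue


class ThreadPool(object):
    '''sketch of the assumed thread pool, not the original implementation'''
    def __init__(self, threadnum):
        self.taskQueue = Queue.Queue()
        self.threadnum = threadnum
        self.running = False

    def startThreads(self):
        self.running = True
        for _ in range(self.threadnum):
            t = threading.Thread(target=self._work)
            t.setDaemon(True)
            t.start()

    def putTask(self, func, *args):
        self.taskQueue.put((func, args))

    def stopThreads(self):
        self.taskQueue.join()  # wait until every queued task is done
        self.running = False

    def _work(self):
        while self.running:
            try:
                func, args = self.taskQueue.get(timeout=1)
            except Queue.Empty:
                continue
            try:
                func(*args)
            finally:
                self.taskQueue.task_done()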
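# DownloadWeb is the third unshown helper. Fetch only calls download() and
# getdata(), so a minimal stand-in that fetches a page and keeps (url, html)
# could look like this; again a guess, not the original.

class DownloadWeb(object):
    '''sketch of the assumed page downloader, not the original implementation'''
    def __init__(self, url):
        self.url = url
        self.content = None

    def download(self):
        '''fetch the page; return True on success so Fetch can parse it'''
        try:
            r = requests.get(self.url, timeout=10)
            self.content = r.content
            return True
        except Exception:
            return False

    def getdata(self):
        return self.url, self.content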
class Fetch(object):
    def __init__(self, url, threadnum, limit):
        # self.database = Database('pichref.sql')
        self.file = PicFile('imgfile', 'a')
        self.threadPool = ThreadPool(threadnum)
        self.unaccesshref = deque()     # deque of links still to visit
        self.accessedhref = set()       # set of links already visited
        self.unaccesshref.append(url)   # seed link
        self.limit = limit
        self.picUrlCount = 1

    def start(self):
        print '--start downloading url--'
        self.threadPool.startThreads()
        while self.unaccesshref:  # while non-empty, keep assigning tasks
            self._organise()
            print '---'
        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        self.file.close()
        print '--Stop downloading url--'

    def _organise(self):
        '''hand tasks to the thread pool'''
        while self.unaccesshref:
            url = self.unaccesshref.popleft()  # take a URL from the left of the deque
            self.threadPool.putTask(self._handle_task, url)  # assign the task
            self.accessedhref.add(url)         # mark it as visited
            time.sleep(2)  # pause so the workers can refill unaccesshref in time
        print 'accessedhref', self.accessedhref
        print 'unaccesshref', self.unaccesshref

    def _handle_task(self, url):
        '''task handler'''
        webpage = DownloadWeb(url)
        if webpage.download():
            self._addUrlToUnaccesshref(webpage)

    def _addUrlToUnaccesshref(self, webpage):
        '''append ordinary links to the not-yet-visited queue'''
        url, webpagecontent = webpage.getdata()
        hrefs = self._getLinkFromPage(url, webpagecontent)
        for href in hrefs:
            if not self._isUsedhref(href):
                self.unaccesshref.append(href)

    def _getLinkFromPage(self, url, source_code):
        '''parse the page source, collect ordinary links, and save valid picture links to the file'''
        hrefs = []
        soup = BeautifulSoup(source_code)
        href_res = soup.find_all('a', href=True)                          # ordinary links
        pic_link_res = soup.find_all(src=re.compile(r'http://.*?\.jpg'))  # picture links
        for h in href_res:
            href = h.get('href').encode('utf-8')
            # only relative links are followed, so the crawl stays on the start site
            if not href.startswith('http') and href != '/' and href != '#' and href.find(';') == -1:
                href = urljoin(url, href)
                hrefs.append(href)
        for pic_link in pic_link_res:
            pic_link = pic_link.get('src').encode('utf-8')
            self.file.saveData(pic_link)  # save the picture link to the file
            self.picUrlCount += 1
            if self.picUrlCount >= self.limit:
                # the thread pool only finishes once unaccesshref is empty, so
                # empty it here to end the work when enough links are collected
                self.unaccesshref = deque()
                return []
        return hrefs

    def _isUsedhref(self, href):
        '''check whether a URL has already been seen'''
        return href in self.accessedhref or href in self.unaccesshref
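# A hypothetical driver tying the two classes together: Fetch first harvests
# picture links from a seed page into 'imgfile', then Crawler reads them back
# and downloads the pictures. The seed URL, thread count (5) and limit (100)
# are placeholder values, not values from the original post.

if __name__ == '__main__':
    fetcher = Fetch('http://www.example.com', 5, 100)
    fetcher.start()                 # collects up to 100 picture URLs into 'imgfile'
    crawler = Crawler(5, 'pic', 100)
    crawler.start()                 # downloads them into ./pic as 1.jpg, 2.jpg, ...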