Example #1
import os
import requests
from collections import deque
# ThreadPool and PicFile are project-local helpers (a sketch follows this example).

class Crawler(object):
    
    def __init__(self,threadnum,pathname,limit):
        '''limit sets the number of images to download, pathname the directory to save them in'''
        super(Crawler, self).__init__()
        self.threadPool = ThreadPool(threadnum)
        self.file = PicFile('imgfile','r')
        self.urlqueue = deque()
        self.count = 1
        self._makePath(pathname)
        self.savaPath = os.getcwd()+'/'+pathname
        self._getUrl(limit)

    '''Create the target directory under the current working directory if it does not exist'''
    def _makePath(self,pathname):
        if not os.path.isdir(os.getcwd()+'/'+pathname):
            os.mkdir(os.getcwd()+'/'+pathname)

    '''Pull URLs from the file into the deque'''
    def _getUrl(self,num):
        while len(self.urlqueue) < num:
            self.urlqueue.append(self.file.getData().rstrip('\n'))
        self.file.close()
        
    def start(self):
        print '---start downloading picture---'
        self.threadPool.startThreads()
        while self.urlqueue:
            self.threadPool.putTask(self._handleTask,self.urlqueue.popleft())
        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        print '---end downloading picture---'

    '''Task handler passed to the thread pool'''
    def _handleTask(self,url):
        self._download(url)
    
    '''Download an image and save it under an ascending numeric filename'''
    def _download(self,url,retry=2):
        try:
            r = requests.get(url)
            with open(self.savaPath +'/'+str(self.count)+'.jpg','wb') as jpg:
                jpg.write(r.content)
                self.count+=1   # note: shared counter, not protected by a lock
            print url
        except Exception:
            # retry the same URL at most twice before giving up
            if retry > 0:
                self._download(url,retry-1)
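
Example #1 relies on two helpers that are not shown, ThreadPool and PicFile. Below is a minimal sketch of what they might look like, assuming a simple queue-of-callables pool and a line-per-URL file store; the names and signatures are inferred from the calls above, not taken from the original project.

import threading
import Queue   # 'queue' on Python 3

class ThreadPool(object):
    '''Hypothetical minimal pool: worker threads pull (func, args) tasks from a queue.'''
    def __init__(self, threadnum):
        self.tasks = Queue.Queue()
        self.workers = [threading.Thread(target=self._work) for _ in range(threadnum)]

    def startThreads(self):
        for t in self.workers:
            t.daemon = True
            t.start()

    def putTask(self, func, *args):
        self.tasks.put((func, args))

    def stopThreads(self):
        self.tasks.join()          # block until every queued task is done

    def _work(self):
        while True:
            func, args = self.tasks.get()
            try:
                func(*args)
            finally:
                self.tasks.task_done()

class PicFile(object):
    '''Hypothetical line-oriented URL store shared by Fetch (writer) and Crawler (reader).'''
    def __init__(self, name, mode):
        self.fd = open(name, mode)

    def saveData(self, url):       # append one URL per line
        self.fd.write(url + '\n')

    def getData(self):             # return the next URL, newline included
        return self.fd.readline()

    def close(self):
        self.fd.close()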
Example #2
 def __init__(self,url,threadnum,limit):
     #self.database = Database('pichref.sql')
     self.file = PicFile('imgfile','a')
     self.threadPool = ThreadPool(threadnum)
     self.unaccesshref = deque()# deque of links not yet visited
     self.accessedhref = set()# set of links already visited
     self.unaccesshref.append(url)# seed the queue with the start URL
     self.limit = limit
     self.picUrlCount = 1
Example #3
 def __init__(self,threadnum,pathname,limit):
     '''limit sets the number of images to download, pathname the directory to save them in'''
     super(Crawler, self).__init__()
     self.threadPool = ThreadPool(threadnum)
     self.file = PicFile('imgfile','r')
     self.urlqueue = deque()
     self.count = 1
     self._makePath(pathname)
     self.savaPath = os.getcwd()+'/'+pathname
     self._getUrl(limit)
Example #4
import re
import time
from collections import deque
from urlparse import urljoin   # urllib.parse on Python 3
from bs4 import BeautifulSoup
# ThreadPool, PicFile and DownloadWeb are project-local helpers.

class Fetch(object):

    def __init__(self,url,threadnum,limit):
        #self.database = Database('pichref.sql')
        self.file = PicFile('imgfile','a')
        self.threadPool = ThreadPool(threadnum)
        self.unaccesshref = deque()# deque of links not yet visited
        self.accessedhref = set()# set of links already visited
        self.unaccesshref.append(url)# seed the queue with the start URL
        self.limit = limit
        self.picUrlCount = 1
        


    def start(self):
        print '--start downloading url--'
        self.threadPool.startThreads()
        while self.unaccesshref:# keep assigning tasks while there are unvisited links
            self._organise()
            print '---'

        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        self.file.close()
        print '--Stop downloading url--'

    # dispatch tasks to the thread pool
    def _organise(self):
        while self.unaccesshref:
            url = self.unaccesshref.popleft()# take a URL from the left of the deque
            #print 'popleft sucess'
            self.threadPool.putTask(self._handle_task,url)# hand the task to the pool
            self.accessedhref.add(url)# mark as visited
            time.sleep(2)# brief pause so the workers can refill unaccesshref in time

        print 'accessedhref',self.accessedhref
        print 'unaccesshref',self.unaccesshref


    # handle one URL: download the page, then harvest its links
    def _handle_task(self,url):
        webpage = DownloadWeb(url)
        if webpage.download():
            self._addUrlToUnaccesshref(webpage)



    # add ordinary links found on the page to the unvisited queue
    def _addUrlToUnaccesshref(self,webpage):
        url, webpagecontent = webpage.getdata()
#        pic_links, hrefs = self._getLinkFromPage(url,webpagecontent)
        hrefs = self._getLinkFromPage(url,webpagecontent)


        for href in hrefs:
            if not self._isUsedhref(href):
                self.unaccesshref.append(href)
#        print 'self.unaccesshref',len(self.unaccesshref),self.unaccesshref,'\n'

    
    # parse the page source, collect ordinary links and image links, and save valid image links to the file
    def _getLinkFromPage(self,url,source_code):
        pic_links, hrefs = [], []
        soup = BeautifulSoup(source_code, 'html.parser')
        href_res = soup.find_all('a',href=True)# all ordinary <a href> links
        pic_link_res = soup.find_all(src=re.compile(r'http://.*?\.jpg'))# all image links ending in .jpg
        for h in href_res:
            href = h.get('href').encode('utf-8')
            if not href.startswith('http') and href!='/' and href!='#' and href.find(';')==-1:
                href = urljoin(url, href)
                hrefs.append(href)

        for pic_link in pic_link_res:
            pic_link = pic_link.get('src').encode('utf-8')
            self.file.saveData(pic_link)# save the image link to the file
            
            self.picUrlCount+=1
            if self.picUrlCount >= self.limit:
#                print self.limit,'limit ------'
#                self.stop()
                ## the thread pool only stops once self.unaccesshref is empty,
                ## so clear the queue when enough image links have been collected
                self.unaccesshref=deque()
                return []# stop collecting further links from this page
        return hrefs

    '''Check whether the URL has already been visited or queued'''
    def _isUsedhref(self,href):
        return href in self.accessedhref or href in self.unaccesshref
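
Example #4 also depends on a DownloadWeb helper that is not shown. A rough sketch of the interface it would need, inferred only from the calls in _handle_task and _addUrlToUnaccesshref (this is an assumption, not the original class):

import requests

class DownloadWeb(object):
    '''Hypothetical page fetcher: download() reports success, getdata() returns (url, html).'''
    def __init__(self, url):
        self.url = url
        self.content = None

    def download(self):            # return True on success, False on any failure
        try:
            r = requests.get(self.url, timeout=10)
            self.content = r.content
            return True
        except Exception:
            return False

    def getdata(self):             # consumed by _addUrlToUnaccesshref
        return self.url, self.content

With those pieces in place, a plausible driver runs Fetch first to harvest image URLs and then Crawler to download them; the seed URL, thread counts, and directory name below are placeholders, not values from the original project:

# Hypothetical driver script
seed = 'http://example.com/gallery'          # placeholder start page

fetcher = Fetch(seed, 5, 100)                # 5 worker threads, stop after ~100 image links
fetcher.start()                              # appends image URLs to 'imgfile'

crawler = Crawler(5, 'pics', 100)            # reads 'imgfile', saves into ./pics/
crawler.start()                              # writes 1.jpg, 2.jpg, ... in download order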