Example #1
import hashlib
import os
import traceback


class ImageDown:
    """Downloads a set of image URLs into a directory via a TaskPool."""

    def __init__(self, urls, location, urlCache):
        self._urls = list(set(urls))    # de-duplicate the initial URL list
        self._location = location

        if not os.path.exists(location):
            os.mkdir(location)

        self._cache = urlCache          # maps url -> path of the saved file

        self._taskpool = TaskPool(10)   # at most 10 concurrent downloads
        
    def addUrls(self, urls):
        # De-duplicates within the new batch only, not against self._urls
        self._urls.extend(set(urls))

    def run(self):
        # Take ownership of the pending URLs, leaving self._urls empty so
        # addUrls() can queue a fresh batch for a later run()
        urls, self._urls = self._urls, []

        for url in urls:
            self._down(url)

        self._taskpool.run()
        
    def _down(self, url):
        # Skip anything already downloaded on a previous run
        if self._cache.get(url):
            return

        def callback(response):
            if response.error:
                print('Error', response.error, url)
            else:
                data = response.body
                self._writeImage(url, data)

        self._taskpool.spawn(url, callback)
        
    def _writeImage(self, url, data):
        try:
            fileName = url.split('/')[-1]
            fileExt = os.path.splitext(fileName)[-1]

            # Name the file by content hash so identical images are stored once
            fileName = hashlib.md5(data).hexdigest() + fileExt

            fullName = os.path.join(self._location, fileName)

            if not os.path.exists(fullName):
                with open(fullName, 'wb') as f:
                    f.write(data)

            self._cache.set(url, fullName)

        except Exception:
            print('write image %s error %s' % (url, traceback.format_exc()))
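
Both examples depend on a TaskPool class that is not shown on this page. Judging from the calls above (TaskPool(10), spawn(url, callback), run(), and the Tornado-style response.error / response.body attributes), a minimal sketch on top of Tornado's AsyncHTTPClient might look like the following; the asyncio semaphore and the queue-draining loop are assumptions, not the original implementation:

import asyncio

from tornado.httpclient import AsyncHTTPClient
from tornado.ioloop import IOLoop


class TaskPool:
    # Hypothetical stand-in for the TaskPool used by both examples
    def __init__(self, size):
        self._size = size    # max concurrent fetches
        self._tasks = []     # queued (url, callback) pairs

    def spawn(self, url, callback):
        self._tasks.append((url, callback))

    def run(self):
        IOLoop.current().run_sync(self._run_all)

    async def _run_all(self):
        client = AsyncHTTPClient()
        semaphore = asyncio.Semaphore(self._size)

        async def fetch(url, callback):
            async with semaphore:
                # raise_error=False reports failures via response.error,
                # matching the `if response.error:` checks above
                response = await client.fetch(url, raise_error=False)
            callback(response)

        # Callbacks may spawn new tasks (Crawler._crawl does), so keep
        # draining the queue until no new work appears
        while self._tasks:
            batch, self._tasks = self._tasks, []
            await asyncio.gather(*(fetch(u, cb) for u, cb in batch))
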
Example #2
class Crawler:
    """Depth-limited crawler that collects image URLs via a TaskPool."""

    def __init__(self, beginUrl):
        self._beginUrl = beginUrl
        self._urlTasks = []      # (unused in this snippet)
        self._urlMarked = {}     # URLs already queued, to avoid revisiting
        self._imgs = []          # image URLs collected so far
        self._taskpool = TaskPool(10)
        
    def run(self, depth):
        # Queue the starting page, then drive the pool until all fetches finish
        self._crawl(self._beginUrl, depth)
        self._taskpool.run()

    def getImgs(self):
        return self._imgs

    def _crawl(self, url, depth):
        if depth == 0:
            return

        # Visit each URL at most once
        if url in self._urlMarked:
            return

        self._urlMarked[url] = True

        def callback(response):
            if response.error:
                print('Error', response.error, url)
            else:
                # Decode bytes to text for the HTML parser (encoding assumed UTF-8)
                data = response.body.decode('utf-8', 'ignore')

                lister = URLLister()
                lister.feed(data)

                urls = lister.getUrls()
                imgs = lister.getImgs()

                self._imgs.extend(imgs)

                # Recurse one level deeper on every linked page
                for newUrl in urls:
                    self._crawl(newUrl, depth - 1)

        self._taskpool.spawn(url, callback)
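
URLLister is likewise external to these snippets. From feed(), getUrls(), and getImgs() above, it is evidently an HTML parser that collects anchor hrefs and image srcs; a plausible sketch on the standard library's html.parser would be:

from html.parser import HTMLParser


class URLLister(HTMLParser):
    # Hypothetical stand-in: records <a href> and <img src> targets
    def __init__(self):
        super().__init__()
        self._urls = []
        self._imgs = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'a' and 'href' in attrs:
            self._urls.append(attrs['href'])
        elif tag == 'img' and 'src' in attrs:
            self._imgs.append(attrs['src'])

    def getUrls(self):
        return self._urls

    def getImgs(self):
        return self._imgs

Purely for illustration, the two classes could then be wired together like this; DictCache is a hypothetical stand-in for whatever urlCache object ImageDown expects (a get/set interface):

class DictCache:
    # In-memory url -> saved-path cache (hypothetical)
    def __init__(self):
        self._d = {}

    def get(self, url):
        return self._d.get(url)

    def set(self, url, path):
        self._d[url] = path


crawler = Crawler('http://example.com/')   # placeholder start URL
crawler.run(depth=2)                       # follow links two levels deep

ImageDown(crawler.getImgs(), './images', DictCache()).run()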