import os
import hashlib
import traceback


class ImageDown:
    def __init__(self, urls, location, urlCache):
        # Deduplicate the initial URL list and make sure the target directory exists.
        self._urls = list(set(urls))
        self._location = location
        if not os.path.exists(location):
            os.mkdir(location)
        self._cache = urlCache
        self._taskpool = TaskPool(10)

    def addUrls(self, urls):
        self._urls.extend(list(set(urls)))

    def run(self):
        # Take ownership of the pending URLs, schedule a download task for each,
        # then run the task pool until every callback has fired.
        urls = []
        urls, self._urls = self._urls, urls
        for url in urls:
            self._down(url)
        self._taskpool.run()

    def _down(self, url):
        # Skip URLs that were already downloaded in a previous run.
        if self._cache.get(url):
            return

        def callback(response):
            if response.error:
                print 'Error', response.error, url
            else:
                data = response.body
                self._writeImage(url, data)

        self._taskpool.spawn(url, callback)

    def _writeImage(self, url, data):
        try:
            # Name the file after the MD5 of its content (keeping the original
            # extension) so identical images fetched from different URLs are
            # stored only once.
            fileName = url.split('/')[-1]
            fileExt = os.path.splitext(fileName)[-1]
            fileName = hashlib.md5(data).hexdigest() + fileExt
            fullName = os.path.join(self._location, fileName)
            if not os.path.exists(fullName):
                with open(fullName, 'wb') as f:
                    f.write(data)
            self._cache.set(url, fullName)
        except Exception:
            print 'write image %s error %s' % (url, traceback.format_exc())
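ImageDown only assumes the cache object passed in as urlCache exposes get(url) and set(url, fileName). A minimal in-memory sketch of that interface is shown below; the class name UrlCache is hypothetical and a real cache would likely persist entries (to a file or a database) so downloads survive across runs.

# Minimal sketch of the cache interface ImageDown relies on.
# UrlCache is a hypothetical name, not part of the original code.
class UrlCache:
    def __init__(self):
        self._store = {}

    def get(self, url):
        # Return the saved file path for a URL, or None if it was never downloaded.
        return self._store.get(url)

    def set(self, url, fileName):
        self._store[url] = fileName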
class Crawler:
    def __init__(self, beginUrl):
        self._beginUrl = beginUrl
        self._urlTasks = []
        self._urlMarked = {}
        self._imgs = []
        self._taskpool = TaskPool(10)

    def run(self, depth):
        # Seed the crawl with the start URL, then drive the task pool
        # until every page fetch has completed.
        self._crawl(self._beginUrl, depth)
        self._taskpool.run()

    def getImgs(self):
        return self._imgs

    def _crawl(self, url, depth):
        # Stop at the depth limit and never visit the same page twice.
        if depth == 0:
            return
        if url in self._urlMarked:
            return
        self._urlMarked[url] = True

        def callback(response):
            if response.error:
                print 'Error', response.error, url
            else:
                data = response.body
                # Parse the page for links and image URLs, collect the images,
                # and recurse into the discovered links with one less level of depth.
                lister = URLLister()
                lister.feed(data)
                urls = lister.getUrls()
                imgs = lister.getImgs()
                self._imgs.extend(imgs)
                for newUrl in urls:
                    self._crawl(newUrl, depth - 1)

        self._taskpool.spawn(url, callback)
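A possible way to wire the two classes together is sketched below: crawl from a start page, then hand the collected image URLs to ImageDown. The start URL, crawl depth, download directory, and the UrlCache from the earlier sketch are all placeholder assumptions, not values from the original code.

# Usage sketch: crawl a site for image URLs, then download the images.
# 'http://example.com' and './images' are placeholders.
if __name__ == '__main__':
    crawler = Crawler('http://example.com')
    crawler.run(2)                      # follow links two levels deep

    cache = UrlCache()                  # hypothetical cache from the sketch above
    downloader = ImageDown(crawler.getImgs(), './images', cache)
    downloader.run()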