def run(limit=-1):
    """Drive the crawl loop until the URL store reaches `limit` entries.

    Pops URLs from swap.current, fetches each one, records its child URLs
    via store.process(), and pushes the children onto swap.next; when
    current runs dry the two queues are exchanged. With limit == -1 the
    crawl only stops when both queues are empty.

    Calls sys.exit() when the store count exceeds `limit`.
    """
    import sys

    # Iterative loop replaces the original tail recursion
    # (return run(limit)), which could exhaust the interpreter stack
    # on a long crawl.
    while True:
        if limit != -1 and store.count() > limit:
            print('Limit: ' + str(limit) + ' Count: ' + str(store.count()))
            print('Voila, We are done !!')
            sys.exit()

        if swap.current.is_empty():
            if swap.next.is_empty():
                # Nothing left anywhere: stop instead of popping an
                # empty queue.
                print('Somehow, We are done')
                return
            # Exchange current and next, then keep crawling.
            swap.exchange()
            continue

        # Get the next url to crawl from the current list.
        url = swap.current.pop()
        if not url or not valid_url(url):
            # str() so a None url prints instead of raising TypeError
            # on concatenation.
            print('"' + str(url) + '" is not a valid url, giving up on it')
            continue

        f = Fetcher(url)
        next_urls = store.process(f.url(), f.child_urls())
        for child in next_urls:
            swap.next.push(child)
def __init__(self, url):
    """Initialize the fetcher for a single URL.

    Raises:
        Exception: if `url` does not pass valid_url().
    """
    if not valid_url(url):
        raise Exception('Invalid url passed to fetcher: ' + url)
    self._url = url
    # Lazily-populated caches all start unset.
    for lazy_attr in ('_parsed', '_origin', '_response', '_soup'):
        setattr(self, lazy_attr, None)
    self._child_urls = []
def process_href(self, href):
    """Normalize an href from the fetched page into an absolute URL.

    Returns None for hrefs that point back at the page itself (empty,
    '/', or the page's own URL) or at an in-page anchor; returns the
    href unchanged when it is already a complete URL; otherwise treats
    it as a path segment, prepends self.origin(), and re-processes.
    """
    # Self-references: nothing to crawl.
    if not href or href == '/' or href == self.url():
        return None
    # Already a complete url: return it as-is.
    if valid_url(href):
        return href
    # In-page anchors ('#...' or '/#...') would just duplicate this
    # page in the children, so drop them. startswith replaces the
    # original index()/except ValueError control flow.
    if href.startswith('#') or href.startswith('/#'):
        return None
    # href is a segment fetched from the page on self.url(): anchor it
    # at the origin and run it through the same checks again.
    if not href.startswith('/'):
        href = '/' + href
    return self.process_href(self.origin() + href)
def get_main_image_from_urls(urls, title=''):
    """Choose the main image URL for a page from candidate (url, alt) pairs.

    First pass: return the first valid URL whose alt text is similar
    enough to `title`. Second pass: fetch each valid URL and return the
    first image that is large enough (Content-Length > IMAGE_SIZE) and
    not too elongated (ratio < IMAGE_SIZE_RATIO). Returns '' when no
    candidate qualifies.
    """
    if not urls:
        return ''

    valid_urls = []
    for u in urls:
        # u is (image_url, alt_text); validate/normalize the URL first.
        url = util.valid_url(u[0])
        if url != '':
            valid_urls.append(url)
            # Alt text closely matching the title is the strongest
            # signal -- take it immediately. (The original had an
            # unreachable `break` after this return.)
            if similarity.get_similarity(u[1], title) > ALT_SIMILARITY:
                return url

    # Fall back to the first sufficiently large, well-proportioned image.
    for url in valid_urls:
        try:
            result = urlfetch.fetch(url)
            if result.status_code != 200:
                continue
            # `buf` instead of `file` -- avoid shadowing the builtin.
            buf = cStringIO.StringIO(result.content)
            im = Image.open(buf)
            size = result.headers["Content-Length"]
            # NOTE(review): PIL's Image.size is (width, height); the
            # original unpacked it as (height, width). Preserved as-is --
            # confirm get_size_ratio() is orientation-insensitive.
            height, width = im.size
            if not size:
                size = 0
            if int(size) > IMAGE_SIZE and get_size_ratio(height, width) < IMAGE_SIZE_RATIO:
                return url
        except Exception as ex:
            # A failure on one candidate must not abort the rest.
            # str(ex) instead of the deprecated ex.message.
            logging.error('get_main_image_from_urls: %s' % ex)
            continue
    return ''