def qualify_urls(ref_url, urls, rule, newqueue, wb):
    """Sort candidate urls into dump / fetch / spider work according to rule.

    Each url is tested with recipe.apply_mask against the rule's "dump",
    "fetch" and "spider" entries; a spider match additionally has to pass
    recipe.apply_hostfilter with the rule's "host_filter" entry.

    Urls that are not yet in wb are written out through io.write_out when
    they match "dump", and are appended to newqueue as
    {"url": ..., "mode": ...} records when they match "fetch" and/or
    "spider". Every url that matched at least one category is then
    registered with wb.add_url(ref_url, [url]), including urls that were
    already in wb.

    Args:
        ref_url: referring url, passed through to wb.add_url (presumably
            the page the candidate urls were found on).
        urls: iterable of candidate urls.
        rule: object with a dict-style get() holding the patterns.
        newqueue: list receiving the new records; mutated in place.
        wb: object supporting `in` and add_url(); mutated in place.

    Returns:
        The (newqueue, wb) pair, i.e. the very objects that were passed in.
    """
    for candidate in urls:
        # classify the url against each pattern of the rule
        wants_dump = bool(recipe.apply_mask(rule.get("dump"), candidate))
        wants_fetch = bool(recipe.apply_mask(rule.get("fetch"), candidate))
        wants_spider = bool(
            recipe.apply_mask(rule.get("spider"), candidate)
            and recipe.apply_hostfilter(rule.get("host_filter"), candidate)
        )

        # only urls the web has not seen yet produce output or queue entries
        if candidate not in wb:
            if wants_dump:
                io.write_out("%s\n" % candidate)

            if wants_fetch or wants_spider:
                if not wants_spider:
                    mode = fetch.Fetcher.FETCH
                elif not wants_fetch:
                    mode = fetch.Fetcher.SPIDER
                else:
                    mode = fetch.Fetcher.SPIDER_FETCH
                newqueue.append({"url": candidate, "mode": mode})

        # anything that matched is recorded in the web, seen before or not
        if wants_dump or wants_fetch or wants_spider:
            wb.add_url(ref_url, [candidate])

    return newqueue, wb
def _match_flags(rule, url):
    """Return the (dump, fetch, spider) match flags of url under rule."""
    dump_hit = bool(recipe.apply_mask(rule.get("dump"), url))
    fetch_hit = bool(recipe.apply_mask(rule.get("fetch"), url))
    # the host filter is only consulted once the spider mask has matched
    spider_hit = False
    if recipe.apply_mask(rule.get("spider"), url):
        spider_hit = bool(recipe.apply_hostfilter(rule.get("host_filter"), url))
    return dump_hit, fetch_hit, spider_hit


def qualify_urls(ref_url, urls, rule, newqueue, wb):
    """Qualify urls against rule, queueing and recording the ones that match.

    For every url the rule's "dump", "fetch" and "spider" patterns are
    applied (the spider pattern only counts if the rule's "host_filter"
    accepts the url as well). A url that is not in wb yet is written out
    with io.write_out if it matched "dump", and queued on newqueue as a
    {"url", "mode"} record if it matched "fetch" and/or "spider", with mode
    set to fetch.Fetcher.SPIDER_FETCH, fetch.Fetcher.FETCH or
    fetch.Fetcher.SPIDER accordingly. Any url that matched at all is
    passed to wb.add_url(ref_url, [url]), whether or not wb already
    contained it.

    Args:
        ref_url: referring url handed to wb.add_url.
        urls: iterable of urls to qualify.
        rule: object exposing get() for the pattern entries.
        newqueue: list the new records are appended to (in place).
        wb: object supporting membership tests and add_url() (in place).

    Returns:
        Tuple (newqueue, wb) holding the same objects that were passed in.
    """
    for url in urls:
        dump_hit, fetch_hit, spider_hit = _match_flags(rule, url)

        unseen = url not in wb
        if unseen and dump_hit:
            io.write_out("%s\n" % url)
        if unseen and (fetch_hit or spider_hit):
            if fetch_hit and spider_hit:
                mode = fetch.Fetcher.SPIDER_FETCH
            else:
                mode = fetch.Fetcher.FETCH if fetch_hit else fetch.Fetcher.SPIDER
            newqueue.append({"url": url, "mode": mode})

        # register the url in the web if any pattern matched it
        if dump_hit or fetch_hit or spider_hit:
            wb.add_url(ref_url, [url])

    return newqueue, wb
def get_url(fetcher, wb, host_filter=False):
    """Launch fetcher, following url changes until a launch goes through.

    http 30x redirects produce new urls that may or may not have been seen
    before. Each time fetcher.launch_w_tries() raises
    fetch.ChangedUrlWarning, the new url is rewritten relative to the
    current fetcher.url, checked, recorded with
    wb.add_ref(old_url, new_url), stored on fetcher.url, and the launch is
    retried.

    Args:
        fetcher: object with a url attribute and a launch_w_tries() method;
            its url attribute is updated in place on every accepted
            redirect.
        wb: object supporting `in` and add_ref(); used to detect urls that
            are already known and to record the redirect.
        host_filter: value handed to recipe.apply_hostfilter together with
            the redirect target (defaults to False).

    Raises:
        fetch.DuplicateUrlWarning: the redirect target is already in wb.
        fetch.UrlRedirectsOffHost: recipe.apply_hostfilter rejected the
            redirect target.

    Returns:
        None. The url that was finally launched is left on fetcher.url.
    """
    while True:
        try:
            fetcher.launch_w_tries()
            break
        # "as" and the next() builtin are valid on Python 2.6+ and on
        # Python 3, unlike the old "except X, e" / ".next()" spellings
        except fetch.ChangedUrlWarning as e:
            url = next(urlrewrite.rewrite_urls(fetcher.url, [e.new_url]))
            if url in wb:
                raise fetch.DuplicateUrlWarning
            if not recipe.apply_hostfilter(host_filter, url):
                raise fetch.UrlRedirectsOffHost
            wb.add_ref(fetcher.url, url)
            fetcher.url = url