def get_url(fetcher, wb, host_filter=False):
    """Run the fetch, following redirects, and return the final url.

    http 30x redirects produce a recursion with new urls that may or may not
    have been seen before. Each time the fetcher signals a changed url, the
    new url is vetted before the fetch is retried against it.

    Parameters:
        fetcher: object exposing a mutable ``url`` attribute and a
            ``launch_w_tries()`` method that may raise
            ``fetch.ChangedUrlWarning`` (carrying ``new_url``).
        wb: container of known urls; must support ``url in wb`` and
            ``add_ref(old_url, new_url)``.
        host_filter: passed through unchanged to ``recipe.apply_hostfilter``.

    Returns:
        The url the fetcher ended up on (``fetcher.url``) once
        ``launch_w_tries()`` completed without signalling a redirect.

    Raises:
        fetch.DuplicateUrlWarning: the redirect target is already in ``wb``.
        fetch.UrlRedirectsOffHost: ``recipe.apply_hostfilter`` rejected the
            redirect target.
    """
    while True:
        try:
            fetcher.launch_w_tries()
            break
        except fetch.ChangedUrlWarning as e:
            # Rewrite the redirect target against the url we were fetching;
            # rewrite_urls yields results lazily, we only need the first one.
            url = next(urlrewrite.rewrite_urls(fetcher.url, [e.new_url]))
            if url in wb:
                raise fetch.DuplicateUrlWarning
            if not recipe.apply_hostfilter(host_filter, url):
                raise fetch.UrlRedirectsOffHost
            # Record the old -> new relation, then retry on the new url.
            wb.add_ref(fetcher.url, url)
            fetcher.url = url
    # Callers assign the result to their working url, so hand back the url
    # the fetch actually completed on instead of falling off the end (None).
    return fetcher.url
def process_records(queue, rule, wb): newqueue = [] for record in queue: maybesave(wb, queue) url = record.get("url") try: (fp, filename) = io.get_tempfile() f = fetch.Fetcher(mode=record.get("mode"), url=url, filename=filename) url = get_url(f, wb, host_filter=rule.get("host_filter")) filename = f.filename # consider retrying the fetch if it failed if f.error and fetch.err.is_temporal(f.error): if not record.get("retry"): record["retry"] = True queue.append(record) if record.get("mode") == fetch.Fetcher.SPIDER: data = open(filename, 'r').read() urls = spider.unbox_it_to_ss(spider.findall(data, url)) urls = urlrewrite.rewrite_urls(url, urls) (newqueue, wb) = qualify_urls(url, urls, rule, newqueue, wb) if record.get("mode") == fetch.Fetcher.FETCH: shutil.move(filename, io.safe_filename(urlrewrite.url_to_filename(url))) except (fetch.DuplicateUrlWarning, fetch.UrlRedirectsOffHost): pass except KeyboardInterrupt: q = queue[queue.index(record):] q.extend(newqueue) save_session(wb, queue=q) sys.exit(1) except Exception, exc: log_exc(exc, url, wb) finally: