import re

# spiderfetch's own modules, assumed importable from the project directory
import fetch
import spider
import urlrewrite


def fetch_gracefully(url_byte, filename):
    """Simplified adaptation of spiderfetch.py:get_url()"""
    getter = fetch.Fetcher(url=url_byte, filename=filename)
    getter.write_progress = lambda *args, **kw: None  # no output or logging
    while True:
        try:
            getter.launch_w_tries()
            break
        except fetch.ChangedUrlWarning, e:
            # the fetch was redirected; rewrite the url and retry
            url = urlrewrite.rewrite_urls(getter.url, [e.new_url]).next()
            getter.url = url
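For illustration, a minimal usage sketch (the url and filename here are hypothetical, not from the original code):

    # fetch a page to a local file, retrying through any redirects
    fetch_gracefully("http://example.com/", "/tmp/page.html")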
def find_urls_in_page(web, txt_byte, url_u, url_byte):
    # extract every url found in the page, then dedupe and sort
    urls_byte = sorted(set(spider.unbox_it_to_ss(spider.findall(txt_byte))))
    filter_regex = get_regex_filter(url_u)
    candidates_byte = []
    for u_b in urlrewrite.rewrite_urls(url_byte, urls_byte):
        # keep urls that match the filter, skipping the page's own url
        if re.match(filter_regex, u_b) and url_byte != u_b:
            if u_b not in web:
                web.add_url(u_b, [])
            candidates_byte.append(u_b)
    # if no candidate links are found, fall back on visited urls
    if not candidates_byte:
        candidates_byte = web.urls()
    return candidates_byte
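Together, the two functions make up one step of a crawl: fetch a page, then harvest the candidate urls to visit next. A hedged sketch of such a step, assuming a web object exposing the add_url/urls/membership interface used above (the url and file path are illustrative):

    url_byte = "http://example.com/"
    url_u = url_byte.decode("utf-8")  # unicode form, per the _u/_byte naming
    fetch_gracefully(url_byte, "/tmp/page.html")
    txt_byte = open("/tmp/page.html").read()
    candidates_byte = find_urls_in_page(web, txt_byte, url_u, url_byte)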