def find_urls_in_page(web, txt_byte, url_u, url_byte):
    """Extract links from a fetched page, register new ones in the web and
    return the list of candidate urls to follow next."""
    # collect every link found in the page body
    urls_byte = []
    for u_b in spider.unbox_it_to_ss(spider.findall(txt_byte)):
        urls_byte.append(u_b)
    urls_byte = sorted(list(set(urls_byte)))

    filter_regex = get_regex_filter(url_u)

    # keep only rewritten urls that match the filter and are not the current page
    candidates_byte = []
    for u_b in urlrewrite.rewrite_urls(url_byte, urls_byte):
        if re.match(filter_regex, u_b) and url_byte != u_b:
            if u_b not in web:
                web.add_url(u_b, [])
            candidates_byte.append(u_b)

    # if no candidate links are found, fall back on visited urls
    if len(candidates_byte) == 0:
        candidates_byte = web.urls()

    return candidates_byte
    txt_byte = get_page(url_byte)
    candidates_byte = find_urls_in_page(web, txt_byte, url_u, url_byte)

    encoding = decoder.detect_encoding(txt_byte)
    chosen_u = pick_url(candidates_byte, encoding=encoding)

    return chosen_u


if __name__ == '__main__':
    # pick a starting page (only the last assignment takes effect)
    url_byte = 'http://en.wikipedia.org/wiki/Main_Page'
    url_byte = 'http://ar.wikipedia.org/wiki/الصفحة_الرئيسية'
    url_byte = 'http://pt.wikipedia.org/wiki/Casa_da_Cascata'
    url_byte = 'http://it.wikipedia.org/wiki/Special:Random'

    web = web.Web()
    web.add_url(url_byte, [])
    url_u = decoder.decode(url_byte, 'utf-8')

    depth = -1
    while depth != 0:  # easy way to set depth as infinite
        depth -= 1
        try:
            url_u = find_next(url_u, web, handler=url_handler)
        except Exception:
            io.output("Recovering from exception:")
            io.output(traceback.format_exc())
            url_u = pick_url(web.urls())
        pause()  # less hammer