# Assumed available: urlutils is the project's helper module exposing
# get_page(url); URL_BASE and write_file are expected to be defined
# elsewhere in this project.
import urlutils


def crawl_url(seed):
    # Returns the list of links crawled from the seed page.
    content = urlutils.get_page(seed)
    crawled = get_target_urls(content)
    return crawled
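# get_target_urls is called above but was not shown in the original; a
# minimal sketch, assuming it simply walks the page with get_next_target
# (defined below) and collects every URL until no further target is found.
def get_target_urls(page):
    urls = []
    while True:
        url, end_pos = get_next_target(page)
        if url is None:
            break
        urls.append(url)
        page = page[end_pos:]  # continue scanning after the last match
    return urls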
def get_next_target(page):
    # Finds the next link cell in the page; returns (None, 0) when no
    # further target exists.
    start_link = page.find('<td><a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    # Links in the page are relative, so prefix the site's base URL.
    url = URL_BASE + page[start_quote + 1:end_quote]
    print(url)
    # Side effect: fetch the target page and persist it via the
    # write_file helper before handing the URL back to the caller.
    content = urlutils.get_page(url)
    write_file(content, page, end_quote)
    return url, end_quote
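# A usage sketch, assuming URL_BASE and write_file have been filled in
# for the target site (both the seed path and the loop below are
# illustrative placeholders, not part of the original code):
#
#     links = crawl_url(URL_BASE + '/index.html')
#     for link in links:
#         print(link)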