import re


def crawl_sitemap(url):
    # Download the sitemap file
    sitemap = download(url)
    # Extract the sitemap links
    links = re.findall("<loc>(.*?)</loc>", sitemap)
    # Download each link
    for link in links:
        html = download(link)
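crawl_sitemap() relies on a download() helper that is defined elsewhere. A minimal sketch of what such a helper might look like, assuming urllib from the standard library and a small retry budget for 5xx responses (the user agent and retry count are illustrative, not taken from the original):

import urllib.error
import urllib.request


def download(url, user_agent='wswp', num_retries=2):
    """Return the page at url as text, or None if the download fails."""
    print('Downloading:', url)
    request = urllib.request.Request(url, headers={'User-agent': user_agent})
    try:
        html = urllib.request.urlopen(request).read().decode('utf-8')
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # Retry a limited number of times on 5xx server errors
            html = download(url, user_agent, num_retries - 1)
    return html

Decoding the response to text matters here, because re.findall() with a string pattern expects a string rather than bytes.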
def main():
    args = sys.argv[1:]
    # Build search criteria from the command-line arguments
    crit = input.criteria()
    input.run(args, crit)
    storage = input.criteria()
    storage.data["department"] = "asd-CHEM"
    with requests.Session() as s:
        # Log in, save the search page, run the search, then parse the results
        scraper.login(s)
        scraper.download(s.get(GOLD_SEARCH_URL), DEFAULT_GOLD_FILE_PATH,
                         "search")
        scraper.post_search(crit, s, "chem3")
        html_extraction.parse_to_file("chem3", pretty=True)
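The input, scraper, and html_extraction modules and the GOLD_* constants belong to the surrounding project and are not shown here. As a rough illustration of the session-based login-then-search pattern that main() uses, here is a hedged sketch built only on the requests API; the URLs, form field names, and the function name login_and_search are placeholders, not the project's real values:

import requests

LOGIN_URL = "https://example.org/login"      # placeholder
SEARCH_URL = "https://example.org/search"    # placeholder


def login_and_search(username, password, department):
    """Log in once, then reuse the authenticated session for the search."""
    with requests.Session() as session:
        # The session stores cookies, so the POSTed credentials
        # authenticate every later request made through it.
        session.post(LOGIN_URL, data={"user": username, "pass": password})
        response = session.get(SEARCH_URL, params={"department": department})
        response.raise_for_status()
        return response.text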
import re
import urllib.parse as urlparse


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex."""
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # Filter for links that match the regex
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                # Only queue links we have not seen before,
                # otherwise the crawler can loop forever
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
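link_crawler() also assumes a get_links() helper. A plausible minimal sketch, under the assumption that links are extracted with a regular expression over href attributes (the exact pattern is illustrative):

import re


def get_links(html):
    """Return a list of links found in the given HTML string."""
    # Capture the value of every href attribute in an anchor tag
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

With download() and get_links() in place, a call such as link_crawler('http://example.webscraping.com', '/(index|view)') would follow only links whose paths match the given pattern (the regex here is just an example).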
def main(argv):
    """Run scraper.py in an infinite loop."""
    args = parse_cmdline(argv[1:])
    rsync_url, status, destination, storage_service = scraper.init(args)
    prometheus_client.start_http_server(args.metrics_port)
    # First, clear out any existing cache that can be cleared.
    with UPLOAD_RUNS.time():
        # Upload except for the most recent day on disk.
        retry.api.retry_call(
            scraper.upload_stale_disk,
            (args, status, destination, storage_service),
            exceptions=scraper.RecoverableScraperException)
    # Now, download then upload until we run out of num_runs
    while args.num_runs > 0:
        try:
            logging.info('Scraping %s', rsync_url)
            with RSYNC_RUNS.time():
                scraper.download(args, rsync_url, status, destination)
            with UPLOAD_RUNS.time():
                scraper.upload_if_allowed(args, status, destination,
                                          storage_service)
            SCRAPER_SUCCESS.labels(message='success').inc()
        except scraper.RecoverableScraperException as error:
            logging.error('Scrape and upload failed: %s', error.message)
            SCRAPER_SUCCESS.labels(message=str(error.prometheus_label)).inc()
        # In order to prevent a thundering herd of rsync jobs, we spread the
        # jobs around in a memoryless way. By choosing our inter-job sleep
        # time from an exponential distribution, we ensure that the resulting
        # time distribution of jobs is Poisson, the one and only memoryless
        # distribution. The denominator of the fraction in the code below is
        # the mean sleep time in seconds.
        #
        # That said, don't sleep for more than an hour.
        sleep_time = min(random.expovariate(1.0 / args.expected_wait_time),
                         3600)
        logging.info('Sleeping for %g seconds', sleep_time)
        with SLEEPS.time():
            time.sleep(sleep_time)
        args.num_runs -= 1
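The metrics objects referenced above (RSYNC_RUNS, UPLOAD_RUNS, SLEEPS, SCRAPER_SUCCESS) are module-level globals defined elsewhere in the project. A hedged sketch of how they could be declared with prometheus_client; the metric names and help strings are assumptions, and only the 'message' label name is taken from the code above:

import prometheus_client

# Summaries expose a .time() context manager that records how long the
# wrapped block took; Counters are incremented once per observed outcome.
RSYNC_RUNS = prometheus_client.Summary(
    'scraper_rsync_runtime_seconds', 'Time spent running rsync downloads')
UPLOAD_RUNS = prometheus_client.Summary(
    'scraper_upload_runtime_seconds', 'Time spent uploading finished data')
SLEEPS = prometheus_client.Summary(
    'scraper_sleep_seconds', 'Time spent sleeping between runs')
SCRAPER_SUCCESS = prometheus_client.Counter(
    'scraper_success_total', 'Count of scrape attempts by outcome',
    ['message'])

Summary.time() works both as a decorator and as a context manager, which is why the with ... .time(): blocks in main() record the duration of each phase.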
import itertools

from scraper import download

# Maximum number of consecutive download errors allowed
maximum_errors = 5
# Current count of consecutive download errors
number_of_errors = 0

for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)
    if html is None:
        # Error while trying to download this webpage
        number_of_errors += 1
        if number_of_errors == maximum_errors:
            # Too many consecutive failures: assume no more pages and stop
            break
    else:
        # Successful download, so the result can be scraped;
        # reset the consecutive error counter
        number_of_errors = 0
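The same consecutive-error logic can be packaged as a reusable generator, which keeps the scraping code separate from the stop condition. A hedged sketch that reuses the download() helper imported above; the function name iter_pages is illustrative, not part of the original project:

import itertools

from scraper import download


def iter_pages(url_template, max_errors=5):
    """Yield (page_number, html) until max_errors consecutive downloads fail."""
    consecutive_errors = 0
    for page in itertools.count(1):
        html = download(url_template % page)
        if html is None:
            consecutive_errors += 1
            if consecutive_errors == max_errors:
                # Assume we have run past the last page and stop
                return
        else:
            consecutive_errors = 0
            yield page, html


# Example usage:
# for page, html in iter_pages("http://example.webscraping.com/view/-%d"):
#     ...  # scrape each successfully downloaded page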