def test_periodic_heartbeat(self):
    controller = self.make_fresh_controller()
    controller.sync_loop_timing = 1
    controller.healthy_service_ids = {
        'trough-read:test01:id0', 'trough-read:test01:id1'}
    assert set(self.rethinker.table('services')['id'].run()) == set()

    # first time it inserts individual services
    heartbeats_after = doublethink.utcnow()
    healthy_service_ids = controller.periodic_heartbeat()
    assert set(healthy_service_ids) == {
        'trough-read:test01:id0', 'trough-read:test01:id1'}
    assert set(self.rethinker.table('services')['id'].run()) == {
        'trough-nodes:test01:None',
        'trough-read:test01:id0',
        'trough-read:test01:id1'}
    for svc in self.rethinker.table('services').run():
        assert svc['last_heartbeat'] > heartbeats_after

    # subsequently updates existing services in one bulk query
    heartbeats_after = doublethink.utcnow()
    healthy_service_ids = controller.periodic_heartbeat()
    assert set(healthy_service_ids) == {
        'trough-read:test01:id0', 'trough-read:test01:id1'}
    assert set(self.rethinker.table('services')['id'].run()) == {
        'trough-nodes:test01:None',
        'trough-read:test01:id0',
        'trough-read:test01:id1'}
    for svc in self.rethinker.table('services').run():
        assert svc['last_heartbeat'] > heartbeats_after
def finished(self, site, status):
    self.logger.info("%s %s", status, site)
    site.status = status
    site.claimed = False
    site.last_disclaimed = doublethink.utcnow()
    site.starts_and_stops[-1]["stop"] = doublethink.utcnow()
    site.save()
    if site.job_id:
        self._maybe_finish_job(site.job_id)
def resume_job(self, job):
    job.status = "ACTIVE"
    job.starts_and_stops.append(
        {"start": doublethink.utcnow(), "stop": None})
    job.save()
    for site in self.job_sites(job.id):
        site.status = "ACTIVE"
        site.starts_and_stops.append(
            {"start": doublethink.utcnow(), "stop": None})
        site.save()
def resume_job(self, job):
    job.status = "ACTIVE"
    job.stop_requested = None
    job.starts_and_stops.append(
        {"start": doublethink.utcnow(), "stop": None})
    job.save()
    for site in self.job_sites(job.id):
        site.status = "ACTIVE"
        site.starts_and_stops.append(
            {"start": doublethink.utcnow(), "stop": None})
        site.save()
def resume_site(self, site):
    if site.job_id:
        # can't call resume_job since that would resume the job's other sites
        job = brozzler.Job.load(self.rr, site.job_id)
        job.status = "ACTIVE"
        job.starts_and_stops.append(
            {"start": doublethink.utcnow(), "stop": None})
        job.save()
    site.status = "ACTIVE"
    site.starts_and_stops.append(
        {"start": doublethink.utcnow(), "stop": None})
    site.save()
def _proxy_request(self):
    warcprox_meta = None
    raw_warcprox_meta = self.headers.get('Warcprox-Meta')
    self.logger.trace(
        'request for %s Warcprox-Meta header: %s', self.url,
        raw_warcprox_meta)
    if raw_warcprox_meta:
        warcprox_meta = json.loads(raw_warcprox_meta)
        del self.headers['Warcprox-Meta']

    remote_ip = self._remote_server_conn.sock.getpeername()[0]
    timestamp = doublethink.utcnow()
    extra_response_headers = {}
    if warcprox_meta and 'accept' in warcprox_meta and \
            'capture-metadata' in warcprox_meta['accept']:
        rmeta = {'capture-metadata': {
            'timestamp': timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')}}
        extra_response_headers['Warcprox-Meta'] = json.dumps(
            rmeta, separators=',:')

    req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
        self, extra_response_headers=extra_response_headers)

    content_type = None
    try:
        content_type = prox_rec_res.headers.get('content-type')
    except AttributeError:  # py2
        raw = prox_rec_res.msg.getrawheader('content-type')
        if raw:
            content_type = raw.strip()

    recorded_url = RecordedUrl(
        url=self.url, request_data=req,
        response_recorder=prox_rec_res.recorder, remote_ip=remote_ip,
        warcprox_meta=warcprox_meta, status=prox_rec_res.status,
        size=prox_rec_res.recorder.len, client_ip=self.client_address[0],
        content_type=content_type, method=self.command,
        timestamp=timestamp, host=self.hostname,
        duration=doublethink.utcnow() - timestamp,
        referer=self.headers.get('referer'),
        payload_digest=prox_rec_res.payload_digest,
        truncated=prox_rec_res.truncated)
    self.server.recorded_url_q.put(recorded_url)

    return recorded_url
def resume_site(self, site):
    if site.job_id:
        # can't call resume_job since that would resume the job's other sites
        job = brozzler.Job.load(self.rr, site.job_id)
        job.status = "ACTIVE"
        site.stop_requested = None
        job.starts_and_stops.append(
            {"start": doublethink.utcnow(), "stop": None})
        job.save()
    site.status = "ACTIVE"
    site.starts_and_stops.append(
        {"start": doublethink.utcnow(), "stop": None})
    site.save()
def ydl_progress(*args, **kwargs):
    # in case youtube-dl takes a long time, heartbeat site.last_claimed
    # to prevent another brozzler-worker from claiming the site
    try:
        if site.rr and doublethink.utcnow() - site.last_claimed \
                > datetime.timedelta(minutes=7):
            self.logger.debug(
                'heartbeating site.last_claimed to prevent another '
                'brozzler-worker claiming this site id=%r', site.id)
            site.last_claimed = doublethink.utcnow()
            site.save()
    except:
        self.logger.debug(
            'problem heartbeating site.last_claimed site id=%r',
            site.id, exc_info=True)
def maybe_heartbeat_site_last_claimed(*args, **kwargs):
    # in case yt-dlp takes a long time, heartbeat site.last_claimed
    # to prevent another brozzler-worker from claiming the site
    try:
        if site.rr and doublethink.utcnow() - site.last_claimed \
                > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
            worker.logger.debug(
                'heartbeating site.last_claimed to prevent another '
                'brozzler-worker claiming this site id=%r', site.id)
            site.last_claimed = doublethink.utcnow()
            site.save()
    except:
        worker.logger.debug(
            'problem heartbeating site.last_claimed site id=%r',
            site.id, exc_info=True)
def maybe_heartbeat_site_last_claimed(*args, **kwargs):
    # in case youtube-dl takes a long time, heartbeat site.last_claimed
    # to prevent another brozzler-worker from claiming the site
    try:
        if site.rr and doublethink.utcnow() - site.last_claimed \
                > datetime.timedelta(minutes=worker.SITE_SESSION_MINUTES):
            worker.logger.debug(
                'heartbeating site.last_claimed to prevent another '
                'brozzler-worker claiming this site id=%r', site.id)
            site.last_claimed = doublethink.utcnow()
            site.save()
    except:
        worker.logger.debug(
            'problem heartbeating site.last_claimed site id=%r',
            site.id, exc_info=True)
def honor_stop_request(self, site):
    """Raises brozzler.CrawlStopped if stop has been requested."""
    site.refresh()
    if (site.stop_requested
            and site.stop_requested <= doublethink.utcnow()):
        self.logger.info("stop requested for site %s", site.id)
        raise brozzler.CrawlStopped

    if site.job_id:
        job = brozzler.Job.load(self.rr, site.job_id)
        if (job and job.stop_requested
                and job.stop_requested <= doublethink.utcnow()):
            self.logger.info("stop requested for job %s", site.job_id)
            raise brozzler.CrawlStopped
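# A hypothetical usage sketch (not brozzler's actual worker loop): a caller
# could poll honor_stop_request() while working through a site and treat
# CrawlStopped as the cue to disclaim it. next_page() and brozzle_page() are
# made-up placeholders; honor_stop_request() and disclaim_site() are the
# frontier methods shown in this section.
def example_brozzle_until_stopped(frontier, site):
    try:
        while True:
            frontier.honor_stop_request(site)   # raises CrawlStopped if requested
            page = next_page(frontier, site)    # placeholder
            brozzle_page(site, page)            # placeholder
    except brozzler.CrawlStopped:
        frontier.disclaim_site(site)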
def send_error(self, code, message=None, explain=None, exception=None):
    super().send_error(
        code, message=message, explain=explain, exception=exception)

    # If an error happens during CONNECT handling and before the inner
    # request, self.url is unset, and self.path is something like
    # 'example.com:443'
    urlish = self.url or self.path

    warcprox_meta = self._parse_warcprox_meta()
    self._swallow_hop_by_hop_headers()
    request_data = self._build_request()

    failed_url = FailedUrl(
        url=urlish, request_data=request_data,
        warcprox_meta=warcprox_meta, status=code,
        client_ip=self.client_address[0], method=self.command,
        timestamp=doublethink.utcnow(), host=self.hostname,
        duration=None, referer=self.headers.get('referer'),
        do_not_archive=True, message=message, exception=exception)
    self.server.recorded_url_q.put(failed_url)
def new_job(frontier, job_conf):
    '''Returns new Job.'''
    validate_conf(job_conf)
    job = Job(frontier.rr, {
        "conf": job_conf, "status": "ACTIVE",
        "started": doublethink.utcnow()})
    if "id" in job_conf:
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
    job.save()

    sites = []
    pages = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        merged_conf.pop("seeds")
        merged_conf["job_id"] = job.id
        merged_conf["seed"] = merged_conf.pop("url")
        site = brozzler.Site(frontier.rr, merged_conf)
        site.id = str(uuid.uuid4())
        sites.append(site)
        pages.append(new_seed_page(frontier, site))

    # insert in batches to avoid this error
    # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
    for batch in (pages[i:i+500] for i in range(0, len(pages), 500)):
        logging.info('inserting batch of %s pages', len(batch))
        result = frontier.rr.table('pages').insert(batch).run()
    for batch in (sites[i:i+100] for i in range(0, len(sites), 100)):
        logging.info('inserting batch of %s sites', len(batch))
        result = frontier.rr.table('sites').insert(batch).run()

    logging.info('job %s fully started', job.id)
    return job
def new_job(frontier, job_conf):
    '''Returns new Job.'''
    validate_conf(job_conf)
    job = Job(frontier.rr, {
        "conf": job_conf, "status": "ACTIVE",
        "started": doublethink.utcnow()})
    if "id" in job_conf:
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
    job.save()

    sites = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        merged_conf.pop("seeds")
        merged_conf["job_id"] = job.id
        merged_conf["seed"] = merged_conf.pop("url")
        site = brozzler.Site(frontier.rr, merged_conf)
        sites.append(site)

    for site in sites:
        new_site(frontier, site)

    return job
def populate_defaults(self):
    if not "status" in self:
        self.status = "ACTIVE"
    if not "claimed" in self:
        self.claimed = False
    if not "last_disclaimed" in self:
        self.last_disclaimed = brozzler.EPOCH_UTC
    if not "last_claimed" in self:
        self.last_claimed = brozzler.EPOCH_UTC
    if not "scope" in self:
        self.scope = {}
    if not "surt" in self.scope and self.seed:
        self.scope["surt"] = brozzler.site_surt_canon(
            self.seed).surt().decode('ascii')

    if not "starts_and_stops" in self:
        if self.get("start_time"):   # backward compatibility
            self.starts_and_stops = [{
                "start": self.get("start_time"), "stop": None}]
            if self.get("status") != "ACTIVE":
                self.starts_and_stops[0]["stop"] = self.last_disclaimed
            del self["start_time"]
        else:
            self.starts_and_stops = [
                {"start": doublethink.utcnow(), "stop": None}]
def postfetch_status(self):
    earliest = self.earliest_still_active_fetch_start()
    if earliest:
        seconds_behind = (doublethink.utcnow() - earliest).total_seconds()
    else:
        seconds_behind = 0
    result = {
        'earliest_still_active_fetch_start': earliest,
        'seconds_behind': seconds_behind,
        'postfetch_chain': []
    }
    for processor in self._postfetch_chain:
        if processor.__class__ == warcprox.ListenerPostfetchProcessor:
            name = processor.listener.__class__.__name__
        else:
            name = processor.__class__.__name__

        queued = len(processor.inq.queue)
        if hasattr(processor, 'batch'):
            queued += len(processor.batch)

        result['postfetch_chain'].append({
            'processor': name,
            'queued_urls': queued})
    return result
def finish(self):
    if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
        self.logger.error(
            "job is already finished status=%s "
            "starts_and_stops[-1]['stop']=%s",
            self.status, self.starts_and_stops[-1]["stop"])
    self.status = "FINISHED"
    self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
def _proxy_request(self):
    warcprox_meta = None
    raw_warcprox_meta = self.headers.get('Warcprox-Meta')
    self.logger.trace(
        'request for %s Warcprox-Meta header: %s', self.url,
        raw_warcprox_meta)
    if raw_warcprox_meta:
        warcprox_meta = json.loads(raw_warcprox_meta)
        del self.headers['Warcprox-Meta']

    remote_ip = self._remote_server_conn.sock.getpeername()[0]
    timestamp = doublethink.utcnow()
    extra_response_headers = {}
    if warcprox_meta and 'accept' in warcprox_meta and \
            'capture-metadata' in warcprox_meta['accept']:
        rmeta = {'capture-metadata': {
            'timestamp': timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')}}
        extra_response_headers['Warcprox-Meta'] = json.dumps(
            rmeta, separators=',:')

    req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
        self, extra_response_headers=extra_response_headers)

    content_type = None
    try:
        content_type = prox_rec_res.headers.get('content-type')
    except AttributeError:  # py2
        raw = prox_rec_res.msg.getrawheader('content-type')
        if raw:
            content_type = raw.strip()

    recorded_url = RecordedUrl(
        url=self.url, request_data=req,
        response_recorder=prox_rec_res.recorder, remote_ip=remote_ip,
        warcprox_meta=warcprox_meta, status=prox_rec_res.status,
        size=prox_rec_res.recorder.len, client_ip=self.client_address[0],
        content_type=content_type, method=self.command,
        timestamp=timestamp, host=self.hostname,
        duration=doublethink.utcnow() - timestamp,
        referer=self.headers.get('referer'),
        payload_digest=prox_rec_res.payload_digest,
        truncated=prox_rec_res.truncated)
    self.server.recorded_url_q.put(recorded_url)

    return recorded_url
def brozzler_stop_crawl(argv=None):
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    group = arg_parser.add_mutually_exclusive_group(required=True)
    add_rethinkdb_options(arg_parser)
    group.add_argument(
        '--job', dest='job_id', metavar='JOB_ID',
        help='request crawl stop for the specified job')
    group.add_argument(
        '--site', dest='site_id', metavar='SITE_ID',
        help='request crawl stop for the specified site')
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    if args.job_id:
        try:
            job_id = int(args.job_id)
        except ValueError:
            job_id = args.job_id
        job = brozzler.Job.load(rr, job_id)
        if not job:
            logging.fatal('job not found with id=%r', job_id)
            sys.exit(1)
        job.stop_requested = doublethink.utcnow()
        job.save()
    elif args.site_id:
        try:
            site_id = int(args.site_id)
        except ValueError:
            site_id = args.site_id
        site = brozzler.Site.load(rr, site_id)
        if not site:
            logging.fatal('site not found with id=%r', site_id)
            sys.exit(1)
        site.stop_requested = doublethink.utcnow()
        site.save()
def process_request(self, request, client_address):
    self.active_requests[request] = doublethink.utcnow()
    future = self.pool.submit(
        self.process_request_thread, request, client_address)
    future.add_done_callback(
        lambda f: self.active_requests.pop(request, None))
    if future.done():
        # avoid theoretical timing issue, in case process_request_thread
        # managed to finish before future.add_done_callback() ran
        self.active_requests.pop(request, None)
def test_utcnow():
    now_notz = datetime.datetime.utcnow()   # has no timezone :(
    assert not now_notz.tzinfo

    now_tz = doublethink.utcnow()   # solution to that problem
    assert now_tz.tzinfo

    ## .timestamp() was added in python 3.3
    if hasattr(now_tz, 'timestamp'):
        assert now_tz.timestamp() - now_notz.timestamp() < 0.1
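# A minimal illustration (not from the source) of why the timezone-aware
# doublethink.utcnow() is used consistently in the snippets here: arithmetic
# that mixes aware and naive datetimes raises TypeError, while aware-minus-
# aware just yields a timedelta.
def example_aware_vs_naive_datetimes():
    import datetime
    import doublethink

    aware = doublethink.utcnow()          # tzinfo is set
    naive = datetime.datetime.utcnow()    # tzinfo is None

    try:
        aware - naive                     # mixing the two is an error
    except TypeError as e:
        print('expected error:', e)

    # subtracting two aware datetimes works as usual
    age = doublethink.utcnow() - aware
    print('elapsed seconds:', age.total_seconds())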
def process_request(self, request, client_address):
    self.active_requests[request] = doublethink.utcnow()
    future = self.pool.submit(
        self.process_request_thread, request, client_address)
    future.add_done_callback(
        lambda f: self.active_requests.pop(request, None))
    if future.done():
        # avoid theoretical timing issue, in case process_request_thread
        # managed to finish before future.add_done_callback() ran
        self.active_requests.pop(request, None)
def disclaim_site(self, site, page=None):
    self.logger.info("disclaiming %s", site)
    site.claimed = False
    site.last_disclaimed = doublethink.utcnow()
    if not page and not self.has_outstanding_pages(site):
        self.finished(site, "FINISHED")
    else:
        site.save()
    if page:
        page.claimed = False
        page.save()
def elapsed(self):
    '''Returns elapsed crawl time as a float in seconds.'''
    dt = 0
    for ss in self.starts_and_stops[:-1]:
        dt += (ss['stop'] - ss['start']).total_seconds()
    ss = self.starts_and_stops[-1]
    if ss['stop']:
        dt += (ss['stop'] - ss['start']).total_seconds()
    else:  # crawl is active
        dt += (doublethink.utcnow() - ss['start']).total_seconds()
    return dt
def brozzler_stop_crawl(argv=None):
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    group = arg_parser.add_mutually_exclusive_group(required=True)
    add_rethinkdb_options(arg_parser)
    group.add_argument(
        '--job', dest='job_id', metavar='JOB_ID',
        help='request crawl stop for the specified job')
    group.add_argument(
        '--site', dest='site_id', metavar='SITE_ID',
        help='request crawl stop for the specified site')
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    if args.job_id:
        try:
            job_id = int(args.job_id)
        except ValueError:
            job_id = args.job_id
        job = brozzler.Job.load(rr, job_id)
        if not job:
            logging.fatal('job not found with id=%r', job_id)
            sys.exit(1)
        job.stop_requested = doublethink.utcnow()
        job.save()
    elif args.site_id:
        try:
            site_id = int(args.site_id)
        except ValueError:
            site_id = args.site_id
        site = brozzler.Site.load(rr, site_id)
        if not site:
            logging.fatal('site not found with id=%r', site_id)
            sys.exit(1)
        site.stop_requested = doublethink.utcnow()
        site.save()
def test_claim_site():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    rr.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id
    assert claimed_site.claimed
    assert claimed_site.last_claimed >= doublethink.utcnow() - datetime.timedelta(minutes=1)
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=55)
    claimed_site.save()
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed more than 1 hour ago can be reclaimed
    site = claimed_site
    claimed_site = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id

    # clean up
    rr.table('sites').get(claimed_site.id).delete().run()
def _service_heartbeat_if_due(self):
    '''Sends service registry heartbeat if due'''
    due = False
    if self._service_registry:
        if not hasattr(self, "status_info"):
            due = True
        else:
            d = doublethink.utcnow() - self.status_info["last_heartbeat"]
            due = d.total_seconds() > self.HEARTBEAT_INTERVAL

    if due:
        self._service_heartbeat()
def __init__(
        self, stats_db=None, status_callback=None,
        options=warcprox.Options()):
    self.start_time = doublethink.utcnow()
    warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
        self, WarcProxyHandler, options)
    self.status_callback = status_callback
    self.stats_db = stats_db
    self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
    self.running_stats = warcprox.stats.RunningStats()
def test_claim_site():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    rr.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id
    assert claimed_site.claimed
    assert claimed_site.last_claimed >= doublethink.utcnow() - datetime.timedelta(minutes=1)
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=55)
    claimed_site.save()
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed more than 1 hour ago can be reclaimed
    site = claimed_site
    claimed_site = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    claimed_sites = frontier.claim_sites()
    assert len(claimed_sites) == 1
    claimed_site = claimed_sites[0]
    assert claimed_site.id == site.id

    # clean up
    rr.table('sites').get(claimed_site.id).delete().run()
def populate_defaults(self):
    if not "status" in self:
        self.status = "ACTIVE"
    if not "starts_and_stops" in self:
        if self.get("started"):   # backward compatibility
            self.starts_and_stops = [{
                "start": self.get("started"),
                "stop": self.get("finished")}]
            del self["started"]
            if "finished" in self:
                del self["finished"]
        else:
            self.starts_and_stops = [
                {"start": doublethink.utcnow(), "stop": None}]
def elapsed(self):
    '''
    Returns elapsed crawl time as a float in seconds.

    This metric includes all the time that a site was in active rotation,
    including any time it spent waiting for its turn to be brozzled.

    In contrast `Site.active_brozzling_time` only counts time when a
    brozzler worker claimed the site and was actively brozzling it.
    '''
    dt = 0
    for ss in self.starts_and_stops[:-1]:
        if ss['stop']:
            dt += (ss['stop'] - ss['start']).total_seconds()
        else:
            self.logger.warning("missing expected ss['stop']")
            dt += (doublethink.utcnow() - ss['start']).total_seconds()
    ss = self.starts_and_stops[-1]
    if ss['stop']:
        dt += (ss['stop'] - ss['start']).total_seconds()
    else:  # crawl is active
        dt += (doublethink.utcnow() - ss['start']).total_seconds()
    return dt
def __init__(self, stats_db=None, status_callback=None,
             options=warcprox.Options()):
    self.start_time = doublethink.utcnow()
    self.status_callback = status_callback
    self.stats_db = stats_db
    self.options = options
    self.remote_connection_pool = PoolManager(
        num_pools=max(round(options.max_threads / 6), 200)
        if options.max_threads else 200)
    server_address = (
        options.address or 'localhost',
        options.port if options.port is not None else 8000)

    if options.onion_tor_socks_proxy:
        try:
            host, port = options.onion_tor_socks_proxy.split(':')
            WarcProxyHandler.onion_tor_socks_proxy_host = host
            WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
        except ValueError:
            WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
            WarcProxyHandler.onion_tor_socks_proxy_port = None

    if options.socket_timeout:
        WarcProxyHandler._socket_timeout = options.socket_timeout
    if options.max_resource_size:
        WarcProxyHandler._max_resource_size = options.max_resource_size
    if options.tmp_file_max_memory_size:
        WarcProxyHandler._tmp_file_max_memory_size = \
            options.tmp_file_max_memory_size

    http_server.HTTPServer.__init__(
        self, server_address, WarcProxyHandler, bind_and_activate=True)

    self.digest_algorithm = options.digest_algorithm or 'sha1'

    ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
    self.ca = CertificateAuthority(
        ca_file=options.cacert or 'warcprox-ca.pem',
        certs_dir=options.certs_dir or './warcprox-ca',
        ca_name=ca_name)

    self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
    self.running_stats = warcprox.stats.RunningStats()
def populate_defaults(self):
    if not "status" in self:
        self.status = "ACTIVE"
    if not "claimed" in self:
        self.claimed = False
    if not "last_disclaimed" in self:
        self.last_disclaimed = brozzler.EPOCH_UTC
    if not "last_claimed" in self:
        self.last_claimed = brozzler.EPOCH_UTC
    if not "scope" in self:
        self.scope = {}

    # backward compatibility
    if "surt" in self.scope:
        if not "accepts" in self.scope:
            self.scope["accepts"] = []
        self.scope["accepts"].append({"surt": self.scope["surt"]})
        del self.scope["surt"]

    # backward compatibility
    if ("max_hops_off_surt" in self.scope
            and not "max_hops_off" in self.scope):
        self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
    if "max_hops_off_surt" in self.scope:
        del self.scope["max_hops_off_surt"]

    if self.seed:
        self._accept_ssurt_if_not_redundant(
            brozzler.site_surt_canon(self.seed).ssurt().decode('ascii'))

    if not "starts_and_stops" in self:
        if self.get("start_time"):   # backward compatibility
            self.starts_and_stops = [{
                "start": self.get("start_time"), "stop": None}]
            if self.get("status") != "ACTIVE":
                self.starts_and_stops[0]["stop"] = self.last_disclaimed
            del self["start_time"]
        else:
            self.starts_and_stops = [{
                "start": doublethink.utcnow(), "stop": None}]
def elapsed(self):
    '''
    Returns elapsed crawl time as a float in seconds.

    This metric includes all the time that a site was in active rotation,
    including any time it spent waiting for its turn to be brozzled.

    In contrast `Site.active_brozzling_time` only counts time when a
    brozzler worker claimed the site and was actively brozzling it.
    '''
    dt = 0
    for ss in self.starts_and_stops[:-1]:
        dt += (ss['stop'] - ss['start']).total_seconds()
    ss = self.starts_and_stops[-1]
    if ss['stop']:
        dt += (ss['stop'] - ss['start']).total_seconds()
    else:  # crawl is active
        dt += (doublethink.utcnow() - ss['start']).total_seconds()
    return dt
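# A made-up illustration of the starts_and_stops bookkeeping that elapsed()
# sums over; the interval lengths are arbitrary and the loop mirrors the
# logic of the method above.
def example_starts_and_stops_arithmetic():
    import datetime
    import doublethink

    now = doublethink.utcnow()
    starts_and_stops = [
        # a completed interval of roughly 60 seconds
        {'start': now - datetime.timedelta(seconds=300),
         'stop': now - datetime.timedelta(seconds=240)},
        # the current, still-active interval is open-ended
        {'start': now - datetime.timedelta(seconds=10), 'stop': None},
    ]

    dt = 0
    for ss in starts_and_stops[:-1]:
        dt += (ss['stop'] - ss['start']).total_seconds()
    ss = starts_and_stops[-1]
    if ss['stop']:
        dt += (ss['stop'] - ss['start']).total_seconds()
    else:  # still active, count up to "now"
        dt += (doublethink.utcnow() - ss['start']).total_seconds()
    print('elapsed:', dt)   # roughly 60 + 10 = 70 seconds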
def test_honor_stop_request():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # 1. test stop request on job
    job_conf = {'seeds': [{'url': 'http://example.com'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]
    assert site.job_id == job.id

    # does not raise exception
    frontier.honor_stop_request(site)

    # set job.stop_requested
    job.stop_requested = datetime.datetime.utcnow().replace(
        tzinfo=doublethink.UTC)
    job.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)

    # 2. test stop request on site
    job_conf = {'seeds': [{'url': 'http://example.com'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]
    assert site.job_id == job.id

    # does not raise exception
    frontier.honor_stop_request(site)

    # set site.stop_requested
    site.stop_requested = doublethink.utcnow()
    site.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)
def populate_defaults(self):
    if not "status" in self:
        self.status = "ACTIVE"
    if not "claimed" in self:
        self.claimed = False
    if not "last_disclaimed" in self:
        self.last_disclaimed = brozzler.EPOCH_UTC
    if not "last_claimed" in self:
        self.last_claimed = brozzler.EPOCH_UTC
    if not "scope" in self:
        self.scope = {}

    # backward compatibility
    if "surt" in self.scope:
        if not "accepts" in self.scope:
            self.scope["accepts"] = []
        self.scope["accepts"].append({"surt": self.scope["surt"]})
        del self.scope["surt"]

    # backward compatibility
    if ("max_hops_off_surt" in self.scope
            and not "max_hops_off" in self.scope):
        self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
    if "max_hops_off_surt" in self.scope:
        del self.scope["max_hops_off_surt"]

    if self.seed:
        self._accept_ssurt_if_not_redundant(
            brozzler.site_surt_canon(self.seed).ssurt().decode('ascii'))

    if not "starts_and_stops" in self:
        if self.get("start_time"):   # backward compatibility
            self.starts_and_stops = [{
                "start": self.get("start_time"), "stop": None}]
            if self.get("status") != "ACTIVE":
                self.starts_and_stops[0]["stop"] = self.last_disclaimed
            del self["start_time"]
        else:
            self.starts_and_stops = [
                {"start": doublethink.utcnow(), "stop": None}]
def claim_site(self, worker_id):
    # XXX keep track of aggregate priority and prioritize sites accordingly?
    while True:
        result = (
            self.rr.table("sites", read_mode="majority")
            .between(
                ["ACTIVE", r.minval], ["ACTIVE", r.maxval],
                index="sites_last_disclaimed")
            .order_by(index="sites_last_disclaimed")
            .filter(
                (r.row["claimed"] != True)
                | (r.row["last_claimed"] < r.now() - 2 * 60 * 60))
            .limit(1)
            .update(
                # try to avoid a race condition resulting in multiple
                # brozzler-workers claiming the same site
                # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                r.branch(
                    (r.row["claimed"] != True)
                    | (r.row["last_claimed"] < r.now() - 2 * 60 * 60),
                    {
                        "claimed": True,
                        "last_claimed_by": worker_id,
                        "last_claimed": doublethink.utcnow()
                    },
                    {}),
                return_changes=True)).run()
        self._vet_result(result, replaced=[0, 1], unchanged=[0, 1])
        if result["replaced"] == 1:
            if result["changes"][0]["old_val"]["claimed"]:
                self.logger.warn(
                    "re-claimed site that was still marked 'claimed' "
                    "because it was last claimed a long time ago "
                    "at %s, and presumably some error stopped it from "
                    "being disclaimed",
                    result["changes"][0]["old_val"]["last_claimed"])
            site = brozzler.Site(self.rr, result["changes"][0]["new_val"])
        else:
            raise brozzler.NothingToClaim
        # XXX This is the only place we enforce time limit for now. Worker
        # loop should probably check time limit. Maybe frontier needs a
        # housekeeping thread to ensure that time limits get enforced in a
        # timely fashion.
        if not self._enforce_time_limit(site):
            return site
def postfetch_status(self):
    earliest = self.earliest_still_active_fetch_start()
    if earliest:
        seconds_behind = (doublethink.utcnow() - earliest).total_seconds()
    else:
        seconds_behind = 0
    result = {
        'earliest_still_active_fetch_start': earliest,
        'seconds_behind': seconds_behind,
        'postfetch_chain': []
    }
    for processor in self._postfetch_chain:
        if processor.__class__ == warcprox.ListenerPostfetchProcessor:
            name = processor.listener.__class__.__name__
        else:
            name = processor.__class__.__name__

        queued = len(processor.inq.queue)
        if hasattr(processor, 'batch'):
            queued += len(processor.batch)

        result['postfetch_chain'].append({
            'processor': name,
            'queued_urls': queued})
    return result
def unique_service(self, role, candidate=None):
    '''
    Retrieve a unique service, possibly setting or heartbeating it first.

    A "unique service" is a service with only one instance for a given
    role. Uniqueness is enforced by using the role name as the primary key
    `{'id':role, ...}`.

    Args:
        role (str): role name
        candidate (dict): if supplied, candidate info for the unique
            service, explained below

    `candidate` normally represents "myself, this instance of the
    service". When a service supplies `candidate`, it is nominating itself
    for selection as the unique service, or retaining its claim to the
    role (heartbeating).

    If `candidate` is supplied:

    First, atomically in a single rethinkdb query, checks if there is
    already a unique healthy instance of this service in rethinkdb, and if
    not, sets `candidate` as the unique service.

    Looks at the result of that query to determine if `candidate` is the
    unique service or not. If it is, updates 'last_heartbeat' in
    rethinkdb.

    To determine whether `candidate` is the unique service, checks that
    all the fields other than 'first_heartbeat' and 'last_heartbeat' have
    the same value in `candidate` as in the value returned from rethinkdb.
    ***Important***: this means that the caller must ensure that none of
    the fields of the unique service ever change. Don't store things like
    'load' or any other volatile value in there. If you try to do that,
    heartbeats will end up not being sent, and the unique service will
    flap among the candidates.

    Finally, retrieves the service from rethinkdb and returns it, if it is
    healthy.

    Returns:
        the unique service, if there is one and it is healthy, otherwise
        None
    '''
    # use the same concept of 'now' for all queries
    now = doublethink.utcnow()

    if candidate is not None:
        candidate['id'] = role

        if not 'ttl' in candidate:
            raise Exception("candidate is missing required field 'ttl'")
        val = candidate['ttl']
        if not (isinstance(val, float) or isinstance(val, int)) or val <= 0:
            raise Exception("'ttl' must be a number > 0")

        candidate['first_heartbeat'] = now
        candidate['last_heartbeat'] = now
        if not 'host' in candidate:
            candidate['host'] = socket.gethostname()
        if not 'pid' in candidate:
            candidate['pid'] = os.getpid()

        result = self.rr.table(
            'services', read_mode='majority').get(role).replace(
                lambda row: r.branch(
                    r.branch(
                        row,
                        row['last_heartbeat'] > now - row['ttl'],
                        False),
                    row, candidate),
                return_changes='always').run()

        new_val = result['changes'][0]['new_val']
        if all([new_val.get(k) == candidate[k] for k in candidate
                if k not in ('first_heartbeat', 'last_heartbeat')]):
            # candidate is the unique_service, send a heartbeat
            del candidate['first_heartbeat']  # don't touch first_heartbeat
            self.rr.table('services').get(role).update(candidate).run()

    results = list(
        self.rr.table('services', read_mode='majority').get_all(role)
        .filter(lambda row: row['last_heartbeat'] > now - row['ttl'])
        .run())
    if results:
        return results[0]
    else:
        return None
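# A hypothetical caller of unique_service(), based only on the docstring and
# on the fields the method itself fills in ('ttl' is required; 'host' and
# 'pid' default to this process). It assumes unique_service() above is a
# method of doublethink.ServiceRegistry; the role name is made up.
def example_unique_service_heartbeat():
    import os
    import socket
    import doublethink

    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)

    # nominate this process for the role, or heartbeat if it already holds
    # it; per the docstring, only stable fields belong in the candidate
    # dict (never a volatile value like 'load')
    leader = svcreg.unique_service(
        'example-singleton-role', candidate={'ttl': 60})

    if (leader and leader.get('host') == socket.gethostname()
            and leader.get('pid') == os.getpid()):
        print('this process currently holds the unique role')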
def do_WARCPROX_WRITE_RECORD(self, warc_type=None):
    '''
    Handles a request with http method WARCPROX_WRITE_RECORD, a special
    type of request which tells warcprox to construct a warc record from
    the request more or less verbatim, and write it to a warc.

    To honor the request, this method creates a RecordedUrl and queues it
    for the WarcWriterThread to process. The warc record headers
    Content-Type and WARC-Type are taken from the request headers, as is
    the payload.

    Example request:

        WARCPROX_WRITE_RECORD screenshot:https://example.com/ HTTP/1.1
        WARC-Type: metadata
        Content-Type: image/png
        Content-Length: 12345
        Connection: close

        <png image data>
    '''
    try:
        self.url = self.path
        self._enforce_limits_and_blocks()

        if ('Content-Length' in self.headers
                and 'Content-Type' in self.headers
                and (warc_type or 'WARC-Type' in self.headers)):
            timestamp = doublethink.utcnow()

            request_data = tempfile.SpooledTemporaryFile(
                max_size=self._tmp_file_max_memory_size)
            payload_digest = hashlib.new(self.server.digest_algorithm)

            # XXX we don't support chunked uploads for now
            length = int(self.headers['Content-Length'])
            buf = self.rfile.read(min(65536, length - request_data.tell()))
            while buf != b'':
                request_data.write(buf)
                payload_digest.update(buf)
                buf = self.rfile.read(
                    min(65536, length - request_data.tell()))

            warcprox_meta = None
            raw_warcprox_meta = self.headers.get('Warcprox-Meta')
            if raw_warcprox_meta:
                warcprox_meta = json.loads(raw_warcprox_meta)

            rec_custom = RecordedUrl(
                url=self.url,
                request_data=request_data,
                response_recorder=None,
                remote_ip=b'',
                warcprox_meta=warcprox_meta,
                content_type=self.headers['Content-Type'],
                custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
                status=204,
                size=request_data.tell(),
                client_ip=self.client_address[0],
                method=self.command,
                timestamp=timestamp,
                duration=doublethink.utcnow() - timestamp,
                payload_digest=payload_digest)

            request_data.seek(0)
            self.server.recorded_url_q.put(rec_custom)
            self.send_response(204, 'OK')
        else:
            self.send_error(400, message='Bad request', explain=(
                'Bad request. WARC-Type, Content-Length, and Content-Type '
                'request headers required for WARCPROX_WRITE_RECORD '
                'request.'))

        self.end_headers()
    except warcprox.RequestBlockedByRule as e:
        # limit enforcers have already sent the appropriate response
        self.logger.info("%r: %r", self.requestline, e)
        return
    except:
        self.logger.error(
            "uncaught exception in do_WARCPROX_WRITE_RECORD",
            exc_info=True)
        raise
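# A hypothetical client-side sketch (not part of warcprox itself) that sends
# a WARCPROX_WRITE_RECORD request shaped like the docstring's example; the
# proxy address and the payload are assumptions for illustration.
def example_warcprox_write_record_request():
    import http.client

    payload = b'<png image data>'
    conn = http.client.HTTPConnection('localhost', 8000)   # assumed warcprox address
    conn.putrequest(
        'WARCPROX_WRITE_RECORD', 'screenshot:https://example.com/',
        skip_accept_encoding=True)
    conn.putheader('WARC-Type', 'metadata')
    conn.putheader('Content-Type', 'image/png')
    conn.putheader('Content-Length', str(len(payload)))
    conn.endheaders(payload)

    response = conn.getresponse()
    print(response.status)   # warcprox acknowledges with "204 OK" on success
    conn.close()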
def test_choose_warcprox():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    # avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    rr.table('sites').wait().run()
    rr.table('services').wait().run()
    rr.table('sites').index_wait().run()
    rr.table('services').index_wait().run()

    # clean slate
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    assert worker._choose_warcprox() is None

    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host1', 'port': 8000, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8000, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8001, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host3', 'port': 8000, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host4', 'port': 8000, 'load': 1, 'ttl': 60}).run()

    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8001', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()

    instance = worker._choose_warcprox()
    assert instance['host'] == 'host3'
    assert instance['port'] == 8000

    rr.table('sites').insert({
        'proxy': 'host3:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    instance = worker._choose_warcprox()
    assert instance['host'] == 'host4'
    assert instance['port'] == 8000

    # clean up
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
def test_choose_warcprox():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    # avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    rr.table('sites').wait().run()
    rr.table('services').wait().run()
    rr.table('sites').index_wait().run()
    rr.table('services').index_wait().run()

    # clean slate
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    assert worker._choose_warcprox() is None

    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host1', 'port': 8000, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8000, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8001, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host3', 'port': 8000, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host4', 'port': 8000, 'load': 1, 'ttl': 60}).run()

    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8001', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()

    instance = worker._choose_warcprox()
    assert instance['host'] == 'host3'
    assert instance['port'] == 8000

    rr.table('sites').insert({
        'proxy': 'host3:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    instance = worker._choose_warcprox()
    assert instance['host'] == 'host4'
    assert instance['port'] == 8000

    # clean up
    rr.table('sites').delete().run()
    rr.table('services').delete().run()