def _enforce_time_limit(self, site):
    """
    Finish `site` with status FINISHED_TIME_LIMIT if it has a positive
    time limit and has been running longer than that.

    Returns:
        bool: True if the site was finished here because the time limit
        was exceeded, False otherwise.
    """
    if not (site.time_limit and site.time_limit > 0):
        return False
    # read the clock once so the compared value and the logged value
    # agree (previously utcnow() was called twice, so the elapsed time
    # in the log could differ slightly from the one actually compared)
    elapsed = rethinkstuff.utcnow() - site.start_time
    if elapsed.total_seconds() > site.time_limit:
        self.logger.debug(
                "site FINISHED_TIME_LIMIT! time_limit=%s "
                "start_time=%s elapsed=%s %s",
                site.time_limit, site.start_time, elapsed, site)
        self.finished(site, "FINISHED_TIME_LIMIT")
        return True
    return False
def new_job(frontier, job_conf):
    """Create a Job and one Site per seed from `job_conf`, then persist
    the sites and the job via `frontier`."""
    job = Job(
            id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
            started=rethinkstuff.utcnow())

    sites = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        # XXX check for unknown settings, invalid url, etc
        sites.append(brozzler.Site(
                job_id=job.id,
                seed=merged_conf["url"],
                scope=merged_conf.get("scope"),
                time_limit=merged_conf.get("time_limit"),
                proxy=merged_conf.get("proxy"),
                ignore_robots=merged_conf.get("ignore_robots"),
                enable_warcprox_features=merged_conf.get(
                    "enable_warcprox_features"),
                warcprox_meta=merged_conf.get("warcprox_meta"),
                metadata=merged_conf.get("metadata")))

    # insert all the sites into database before the job
    for site in sites:
        new_site(frontier, site)
    frontier.new_job(job)
def __init__(
        self, seed, id=None, job_id=None, scope=None, proxy=None,
        ignore_robots=False, time_limit=None, warcprox_meta=None,
        enable_warcprox_features=False, reached_limit=None,
        status="ACTIVE", claimed=False, start_time=None,
        last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
        last_claimed=_EPOCH_UTC, metadata=None, remember_outlinks=None,
        cookie_db=None, user_agent=None, behavior_parameters=None):
    """
    Initialize a Site.

    `metadata` now defaults to None instead of the mutable default `{}`:
    the old default was a single dict object shared by every Site
    constructed without an explicit metadata argument, so mutating one
    site's metadata would silently leak into all the others. Callers
    that pass nothing still end up with an (instance-private) empty
    dict, so the change is backward compatible.
    """
    self.seed = seed
    self.id = id
    self.job_id = job_id
    self.proxy = proxy
    self.ignore_robots = ignore_robots
    self.enable_warcprox_features = bool(enable_warcprox_features)
    self.warcprox_meta = warcprox_meta
    self.time_limit = time_limit
    self.reached_limit = reached_limit
    self.status = status
    self.claimed = bool(claimed)
    self.last_claimed_by = last_claimed_by
    # brand new sites get "now" as their start time
    self.start_time = start_time or rethinkstuff.utcnow()
    self.last_disclaimed = last_disclaimed
    self.last_claimed = last_claimed
    # fresh dict per instance; see docstring
    self.metadata = metadata if metadata is not None else {}
    self.remember_outlinks = remember_outlinks
    self.cookie_db = cookie_db
    self.user_agent = user_agent
    self.behavior_parameters = behavior_parameters
    self.scope = scope or {}
    if "surt" not in self.scope:
        # compute and cache the seed's surt for scope checks
        self.scope["surt"] = Url(seed).surt
def new_job(frontier, job_conf):
    """
    Validate `job_conf`, create the Job and one Site per seed, and
    persist them via `frontier`.
    """
    validate_conf(job_conf)
    job = Job(
            id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
            started=rethinkstuff.utcnow())

    sites = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        if "login" in merged_conf:
            # stash login config in the site's metadata; previously this
            # only happened when "metadata" was already present, so the
            # login section was silently dropped for seeds without a
            # metadata section
            merged_conf.setdefault("metadata", {})["login"] = \
                    merged_conf["login"]
        site = brozzler.Site(
                job_id=job.id,
                seed=merged_conf["url"],
                scope=merged_conf.get("scope"),
                time_limit=merged_conf.get("time_limit"),
                proxy=merged_conf.get("proxy"),
                ignore_robots=merged_conf.get("ignore_robots"),
                enable_warcprox_features=merged_conf.get(
                    "enable_warcprox_features"),
                warcprox_meta=merged_conf.get("warcprox_meta"),
                metadata=merged_conf.get("metadata"),
                remember_outlinks=merged_conf.get("remember_outlinks"),
                user_agent=merged_conf.get("user_agent"),
                behavior_parameters=merged_conf.get("behavior_parameters"))
        sites.append(site)

    # insert all the sites into database before the job so the job never
    # exists without its sites
    for site in sites:
        new_site(frontier, site)
    frontier.new_job(job)
def new_job(frontier, job_conf):
    """Build a Job plus a Site for each seed in `job_conf`, storing the
    sites first, then the job, via `frontier`."""
    job = Job(
            id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
            started=rethinkstuff.utcnow())

    sites = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        # XXX check for unknown settings, invalid url, etc
        site_kwargs = {
            "job_id": job.id,
            "seed": merged_conf["url"],
            "scope": merged_conf.get("scope"),
            "time_limit": merged_conf.get("time_limit"),
            "proxy": merged_conf.get("proxy"),
            "ignore_robots": merged_conf.get("ignore_robots"),
            "enable_warcprox_features": merged_conf.get(
                "enable_warcprox_features"),
            "warcprox_meta": merged_conf.get("warcprox_meta"),
            "metadata": merged_conf.get("metadata"),
            "remember_outlinks": merged_conf.get("remember_outlinks"),
        }
        sites.append(brozzler.Site(**site_kwargs))

    # insert all the sites into database before the job
    for site in sites:
        new_site(frontier, site)
    frontier.new_job(job)
def _enforce_time_limit(self, site):
    """Finish `site` with FINISHED_TIME_LIMIT if its configured time
    limit has run out.

    Returns True when the site was finished here, False otherwise.
    """
    limit = site.time_limit
    # no limit configured (or non-positive): nothing to enforce
    if not limit or limit <= 0:
        return False
    if (rethinkstuff.utcnow() - site.start_time).total_seconds() <= limit:
        return False
    self.logger.debug(
            "site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s "
            "elapsed=%s %s", site.time_limit, site.start_time,
            rethinkstuff.utcnow() - site.start_time, site)
    self.finished(site, "FINISHED_TIME_LIMIT")
    return True
def test_utcnow():
    """rethinkstuff.utcnow() should be timezone-aware and agree with the
    naive datetime.utcnow() once both are interpreted as UTC."""
    now_notz = datetime.datetime.utcnow()  # naive: no timezone attached
    assert not now_notz.tzinfo

    now_tz = rethinkstuff.utcnow()  # timezone-aware
    assert now_tz.tzinfo

    # .timestamp() was added in python 3.3
    if hasattr(now_tz, "timestamp"):
        # naive datetimes are interpreted as *local* time by
        # .timestamp(), so pin the naive utcnow() result to UTC before
        # comparing; the unpinned comparison only held on machines whose
        # local timezone is UTC (and the original non-abs "< 0.1" test
        # passed trivially for any negative difference)
        now_notz_utc = now_notz.replace(tzinfo=datetime.timezone.utc)
        assert abs(now_tz.timestamp() - now_notz_utc.timestamp()) < 0.1
def disclaim_site(self, site, page=None):
    """Release this worker's claim on `site` (and on `page`, if given),
    finishing the site when no pages remain outstanding."""
    self.logger.info("disclaiming %s", site)
    site.claimed = False
    site.last_disclaimed = rethinkstuff.utcnow()

    # a site with no page in hand and no outstanding pages is done
    site_is_done = not page and not self.has_outstanding_pages(site)
    if site_is_done:
        self.finished(site, "FINISHED")
    else:
        self.update_site(site)

    if page:
        page.claimed = False
        self.update_page(page)
def _service_heartbeat_if_due(self):
    '''Sends service registry heartbeat if due'''
    if not self._service_registry:
        return
    if hasattr(self, "status_info"):
        elapsed = rethinkstuff.utcnow() - self.status_info["last_heartbeat"]
        due = elapsed.total_seconds() > self.HEARTBEAT_INTERVAL
    else:
        # no heartbeat has ever been sent; the first one is always due
        due = True
    if due:
        self._service_heartbeat()
def run(self):
    """Main worker loop: heartbeat to the service registry, claim sites
    from the frontier, and brozzle each in its own thread, until
    shutdown is requested."""
    try:
        latest_state = None
        while not self._shutdown_requested.is_set():
            # heartbeat if we never have, or if the last one is older
            # than HEARTBEAT_INTERVAL
            if self._service_registry and (
                    not hasattr(self, "status_info") or
                    (rethinkstuff.utcnow() -
                        self.status_info["last_heartbeat"]).total_seconds()
                    > self.HEARTBEAT_INTERVAL):
                self._service_heartbeat()
            try:
                browser = self._browser_pool.acquire()
                try:
                    site = self._frontier.claim_site("{}:{}".format(
                        socket.gethostname(), browser.chrome_port))
                    self.logger.info(
                            "brozzling site (proxy=%s) %s",
                            repr(self._proxy(site)), site)
                    # brozzle in a separate thread so this loop can keep
                    # dispatching the remaining browsers
                    th = threading.Thread(
                            target=lambda: self._brozzle_site(
                                browser, site),
                            name="BrozzlingThread:%s" % site.seed)
                    th.start()
                    self._browsing_threads.add(th)
                except:
                    # claim_site raised (e.g. NothingToClaim) or thread
                    # setup failed: return the browser to the pool so it
                    # isn't leaked, then re-raise
                    self._browser_pool.release(browser)
                    raise
            except brozzler.browser.NoBrowsersAvailable:
                # log only on state transition to avoid spamming the log
                if latest_state != "browsers-busy":
                    self.logger.info(
                            "all %s browsers are busy", self._max_browsers)
                    latest_state = "browsers-busy"
            except brozzler.NothingToClaim:
                if latest_state != "no-unclaimed-sites":
                    self.logger.info("no unclaimed sites to browse")
                    latest_state = "no-unclaimed-sites"
            time.sleep(0.5)
    except:
        self.logger.critical(
                "thread exiting due to unexpected exception",
                exc_info=True)
    finally:
        # best-effort deregistration so the service registry doesn't
        # keep a stale entry for this worker
        if self._service_registry and hasattr(self, "status_info"):
            try:
                self._service_registry.unregister(self.status_info["id"])
            except:
                self.logger.error(
                        "failed to unregister from service registry",
                        exc_info=True)
def claim_site(self, worker_id):
    """Atomically claim one ACTIVE site for `worker_id` and return it.

    A site is claimable if it is not marked claimed, or if its claim is
    stale (last_claimed more than 2 hours ago — presumably a worker
    died without disclaiming). Raises brozzler.NothingToClaim when no
    site is available. Sites whose time limit has expired are finished
    instead of returned, and the claim loop continues.
    """
    # XXX keep track of aggregate priority and prioritize sites accordingly?
    while True:
        result = (
                self.r.table("sites", read_mode="majority")
                .between(
                    ["ACTIVE", rethinkdb.minval],
                    ["ACTIVE", rethinkdb.maxval],
                    index="sites_last_disclaimed")
                .order_by(index="sites_last_disclaimed")
                .filter(
                    (rethinkdb.row["claimed"] != True) |
                    (rethinkdb.row["last_claimed"]
                        < rethinkdb.now() - 2*60*60))
                .limit(1)
                .update(
                    # try to avoid a race condition resulting in multiple
                    # brozzler-workers claiming the same site
                    # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                    rethinkdb.branch(
                        (rethinkdb.row["claimed"] != True) |
                        (rethinkdb.row["last_claimed"]
                            < rethinkdb.now() - 2*60*60),
                        {
                            "claimed": True,
                            "last_claimed_by": worker_id,
                            "last_claimed": rethinkstuff.utcnow()
                        },
                        {}),
                    return_changes=True)).run()
        self._vet_result(result, replaced=[0, 1], unchanged=[0, 1])
        if result["replaced"] == 1:
            if result["changes"][0]["old_val"]["claimed"]:
                self.logger.warn(
                        "re-claimed site that was still marked 'claimed' "
                        "because it was last claimed a long time ago "
                        "at %s, and presumably some error stopped it from "
                        "being disclaimed",
                        result["changes"][0]["old_val"]["last_claimed"])
            site = brozzler.Site(**result["changes"][0]["new_val"])
        else:
            raise brozzler.NothingToClaim
        # XXX This is the only place we enforce time limit for now. Worker
        # loop should probably check time limit. Maybe frontier needs a
        # housekeeping thread to ensure that time limits get enforced in a
        # timely fashion.
        if not self._enforce_time_limit(site):
            return site
def run(self):
    """Main worker loop: heartbeat to the service registry, claim sites
    from the frontier, and brozzle each in its own thread, until
    shutdown is requested."""
    try:
        latest_state = None
        while not self._shutdown_requested.is_set():
            # heartbeat if we never have, or if the last one is older
            # than HEARTBEAT_INTERVAL
            if self._service_registry and (
                    not hasattr(self, "status_info") or
                    (rethinkstuff.utcnow() -
                        self.status_info["last_heartbeat"]
                        ).total_seconds() > self.HEARTBEAT_INTERVAL):
                self._service_heartbeat()
            try:
                browser = self._browser_pool.acquire()
                try:
                    site = self._frontier.claim_site("{}:{}".format(
                        socket.gethostname(), browser.chrome_port))
                    self.logger.info("brozzling site %s", site)
                    # brozzle in a separate thread so this loop can keep
                    # dispatching the remaining browsers
                    th = threading.Thread(
                            target=lambda: self._brozzle_site(
                                browser, site),
                            name="BrozzlingThread:%s" % site.seed)
                    th.start()
                    self._browsing_threads.add(th)
                except:
                    # claim_site raised (e.g. NothingToClaim) or thread
                    # setup failed: return the browser to the pool so it
                    # isn't leaked, then re-raise
                    self._browser_pool.release(browser)
                    raise
            except brozzler.browser.NoBrowsersAvailable:
                # log only on state transition to avoid spamming the log
                if latest_state != "browsers-busy":
                    self.logger.info(
                            "all %s browsers are busy",
                            self._max_browsers)
                    latest_state = "browsers-busy"
            except brozzler.NothingToClaim:
                if latest_state != "no-unclaimed-sites":
                    self.logger.info("no unclaimed sites to browse")
                    latest_state = "no-unclaimed-sites"
            time.sleep(0.5)
    except:
        self.logger.critical(
                "thread exiting due to unexpected exception",
                exc_info=True)
    finally:
        # best-effort deregistration so the service registry doesn't
        # keep a stale entry for this worker
        if self._service_registry and hasattr(self, "status_info"):
            try:
                self._service_registry.unregister(self.status_info["id"])
            except:
                self.logger.error(
                        "failed to unregister from service registry",
                        exc_info=True)
def __init__(
        self, seed, id=None, job_id=None, scope=None, proxy=None,
        ignore_robots=False, time_limit=None, warcprox_meta=None,
        enable_warcprox_features=False, reached_limit=None,
        status="ACTIVE", claimed=False, start_time=None,
        last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
        last_claimed=_EPOCH_UTC, metadata=None, remember_outlinks=None,
        cookie_db=None):
    """
    Initialize a Site.

    `metadata` now defaults to None instead of the mutable default `{}`:
    the old default was a single dict object shared by every Site
    constructed without an explicit metadata argument, so mutating one
    site's metadata would silently leak into all the others. Callers
    that pass nothing still end up with an (instance-private) empty
    dict, so the change is backward compatible.
    """
    self.seed = seed
    self.id = id
    self.job_id = job_id
    self.proxy = proxy
    self.ignore_robots = ignore_robots
    self.enable_warcprox_features = bool(enable_warcprox_features)
    self.warcprox_meta = warcprox_meta
    self.time_limit = time_limit
    self.reached_limit = reached_limit
    self.status = status
    self.claimed = bool(claimed)
    self.last_claimed_by = last_claimed_by
    # brand new sites get "now" as their start time
    self.start_time = start_time or rethinkstuff.utcnow()
    self.last_disclaimed = last_disclaimed
    self.last_claimed = last_claimed
    # fresh dict per instance; see docstring
    self.metadata = metadata if metadata is not None else {}
    self.remember_outlinks = remember_outlinks
    self.cookie_db = cookie_db
    self.scope = scope or {}
    if "surt" not in self.scope:
        # compute and cache the seed's surt for scope checks
        self.scope["surt"] = Url(seed).surt
def _maybe_finish_job(self, job_id):
    """Returns True if job is finished."""
    job = self.job(job_id)
    if not job:
        return False
    if job.status.startswith("FINISH"):
        self.logger.warn("%s is already %s", job, job.status)
        return True

    cursor = self.r.table("sites").get_all(job_id, index="job_id").run()
    site_count = 0
    for row in cursor:
        if not brozzler.Site(**row).status.startswith("FINISH"):
            # at least one site is still going; release the cursor
            # before the early return
            cursor.close()
            return False
        site_count += 1

    self.logger.info(
            "all %s sites finished, job %s is FINISHED!",
            site_count, job.id)
    job.status = "FINISHED"
    job.finished = rethinkstuff.utcnow()
    self.update_job(job)
    return True
def claim_site(self, worker_id):
    """Atomically claim one ACTIVE site for `worker_id` and return it.

    A site is claimable if it is not marked claimed, or if its claim is
    stale (last_claimed more than 2 hours ago — presumably a worker
    died without disclaiming). Raises brozzler.NothingToClaim when no
    site is available. Sites whose time limit has expired are finished
    instead of returned, and the claim loop continues.
    """
    # XXX keep track of aggregate priority and prioritize sites accordingly?
    while True:
        result = (
            self.r.table("sites", read_mode="majority")
            .between(
                ["ACTIVE", rethinkdb.minval],
                ["ACTIVE", rethinkdb.maxval],
                index="sites_last_disclaimed")
            .order_by(index="sites_last_disclaimed")
            .filter(
                (rethinkdb.row["claimed"] != True)
                | (rethinkdb.row["last_claimed"]
                    < rethinkdb.now() - 2 * 60 * 60)
            )
            .limit(1)
            .update(
                # try to avoid a race condition resulting in multiple
                # brozzler-workers claiming the same site
                # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                rethinkdb.branch(
                    (rethinkdb.row["claimed"] != True)
                    | (rethinkdb.row["last_claimed"]
                        < rethinkdb.now() - 2 * 60 * 60),
                    {"claimed": True, "last_claimed_by": worker_id,
                        "last_claimed": rethinkstuff.utcnow()},
                    {},
                ),
                return_changes=True,
            )
        ).run()
        self._vet_result(result, replaced=[0, 1], unchanged=[0, 1])
        if result["replaced"] == 1:
            if result["changes"][0]["old_val"]["claimed"]:
                self.logger.warn(
                    "re-claimed site that was still marked 'claimed' "
                    "because it was last claimed a long time ago "
                    "at %s, and presumably some error stopped it from "
                    "being disclaimed",
                    result["changes"][0]["old_val"]["last_claimed"],
                )
            site = brozzler.Site(**result["changes"][0]["new_val"])
        else:
            raise brozzler.NothingToClaim
        # XXX This is the only place we enforce time limit for now. Worker
        # loop should probably check time limit. Maybe frontier needs a
        # housekeeping thread to ensure that time limits get enforced in a
        # timely fashion.
        if not self._enforce_time_limit(site):
            return site