Example #1
0
 def _enforce_time_limit(self, site):
     if (site.time_limit and site.time_limit > 0
             and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit):
         self.logger.debug("site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
                 site.time_limit, site.start_time, rethinkstuff.utcnow() - site.start_time, site)
         self.finished(site, "FINISHED_TIME_LIMIT")
         return True
     else:
         return False
Example #2
0
File: job.py Project: ato/brozzler
def new_job(frontier, job_conf):
    """Create a job and one site per seed from `job_conf`.

    Each entry of job_conf["seeds"] is combined with the job-level settings
    via `merge` and turned into a `brozzler.Site`. Every site is handed to
    `new_site` first; only then is the job passed to `frontier.new_job`.
    """
    job = Job(
        id=job_conf.get("id"),
        conf=job_conf,
        status="ACTIVE",
        started=rethinkstuff.utcnow(),
    )

    def site_from(seed_conf):
        # Build the site described by one seed entry.
        settings = merge(seed_conf, job_conf)
        # XXX check for unknown settings, invalid url, etc
        return brozzler.Site(
            job_id=job.id,
            seed=settings["url"],
            scope=settings.get("scope"),
            time_limit=settings.get("time_limit"),
            proxy=settings.get("proxy"),
            ignore_robots=settings.get("ignore_robots"),
            enable_warcprox_features=settings.get("enable_warcprox_features"),
            warcprox_meta=settings.get("warcprox_meta"),
            metadata=settings.get("metadata"),
        )

    sites = [site_from(seed_conf) for seed_conf in job_conf["seeds"]]

    # insert all the sites into database before the job
    for site in sites:
        new_site(frontier, site)

    frontier.new_job(job)
Example #3
0
    def __init__(
            self, seed, id=None, job_id=None, scope=None, proxy=None,
            ignore_robots=False, time_limit=None, warcprox_meta=None,
            enable_warcprox_features=False, reached_limit=None,
            status="ACTIVE", claimed=False, start_time=None,
            last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
            last_claimed=_EPOCH_UTC, metadata=None, remember_outlinks=None,
            cookie_db=None, user_agent=None, behavior_parameters=None):
        """Initialize a site to be crawled starting from `seed`.

        Most arguments are stored unchanged as same-named attributes. The
        exceptions:

        - `enable_warcprox_features` and `claimed` are coerced to bool.
        - `start_time` falls back to `rethinkstuff.utcnow()` when falsy.
        - `metadata` defaults to a new empty dict for each instance
          (passing None explicitly also yields an empty dict).
        - `scope` is shallow-copied, so the caller's dict is never
          modified; if it has no "surt" key, one is added from
          `Url(seed).surt`.
        """
        self.seed = seed
        self.id = id
        self.job_id = job_id
        self.proxy = proxy
        self.ignore_robots = ignore_robots
        self.enable_warcprox_features = bool(enable_warcprox_features)
        self.warcprox_meta = warcprox_meta
        self.time_limit = time_limit
        self.reached_limit = reached_limit
        self.status = status
        self.claimed = bool(claimed)
        self.last_claimed_by = last_claimed_by
        self.start_time = start_time or rethinkstuff.utcnow()
        self.last_disclaimed = last_disclaimed
        self.last_claimed = last_claimed
        # A `{}` default in the signature would be one dict shared by every
        # instance created without metadata; build a fresh one instead.
        self.metadata = {} if metadata is None else metadata
        self.remember_outlinks = remember_outlinks
        self.cookie_db = cookie_db
        self.user_agent = user_agent
        self.behavior_parameters = behavior_parameters

        # Copy before adding "surt" so a scope dict that the caller reuses
        # for several sites does not pick up this site's surt.
        self.scope = dict(scope) if scope else {}
        if "surt" not in self.scope:
            self.scope["surt"] = Url(seed).surt
Example #4
0
def new_job(frontier, job_conf):
    """Validate `job_conf`, then create its job and one site per seed.

    Each seed entry is combined with the job-level settings via `merge`.
    When the merged settings contain both "login" and "metadata", the login
    value is copied into the metadata under the "login" key. Sites are
    handed to `new_site` before the job is passed to `frontier.new_job`.
    """
    validate_conf(job_conf)
    job = Job(
        id=job_conf.get("id"),
        conf=job_conf,
        status="ACTIVE",
        started=rethinkstuff.utcnow(),
    )

    # Settings forwarded to brozzler.Site as-is (None when absent).
    optional_keys = (
        "scope",
        "time_limit",
        "proxy",
        "ignore_robots",
        "enable_warcprox_features",
        "warcprox_meta",
        "metadata",
        "remember_outlinks",
        "user_agent",
        "behavior_parameters",
    )

    sites = []
    for seed_conf in job_conf["seeds"]:
        settings = merge(seed_conf, job_conf)
        if "login" in settings and "metadata" in settings:
            settings["metadata"]["login"] = settings["login"]
        sites.append(
            brozzler.Site(
                job_id=job.id,
                seed=settings["url"],
                **{key: settings.get(key) for key in optional_keys}
            )
        )

    # insert all the sites into database before the job
    for site in sites:
        new_site(frontier, site)

    frontier.new_job(job)
Example #5
0
def new_job(frontier, job_conf):
    """Create a job from `job_conf` along with a site for each seed.

    Seed-level settings are combined with the job-level ones via `merge`
    before the `brozzler.Site` is built. All sites go through `new_site`
    before `frontier.new_job` receives the job.
    """
    job = Job(
        id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
        started=rethinkstuff.utcnow())

    new_sites = []
    for seed in job_conf["seeds"]:
        conf = merge(seed, job_conf)
        # XXX check for unknown settings, invalid url, etc
        new_sites.append(brozzler.Site(
            job_id=job.id, seed=conf["url"],
            scope=conf.get("scope"),
            time_limit=conf.get("time_limit"),
            proxy=conf.get("proxy"),
            ignore_robots=conf.get("ignore_robots"),
            enable_warcprox_features=conf.get("enable_warcprox_features"),
            warcprox_meta=conf.get("warcprox_meta"),
            metadata=conf.get("metadata"),
            remember_outlinks=conf.get("remember_outlinks")))

    # insert all the sites into database before the job
    for new in new_sites:
        new_site(frontier, new)

    frontier.new_job(job)
Example #6
0
 def _enforce_time_limit(self, site):
     if (
         site.time_limit
         and site.time_limit > 0
         and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit
     ):
         self.logger.debug(
             "site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
             site.time_limit,
             site.start_time,
             rethinkstuff.utcnow() - site.start_time,
             site,
         )
         self.finished(site, "FINISHED_TIME_LIMIT")
         return True
     else:
         return False
Example #7
0
def test_utcnow():
    """rethinkstuff.utcnow() is timezone-aware and agrees with the UTC clock."""
    now_notz = datetime.datetime.utcnow()  # has no timezone :(
    assert not now_notz.tzinfo

    now_tz = rethinkstuff.utcnow()  # solution to that problem
    assert now_tz.tzinfo

    # Compare both values as naive UTC. Calling .timestamp() on the naive
    # datetime would interpret it as *local* time, which makes the result
    # depend on the timezone of the machine running the test. This form
    # also needs no Python 3.3+ guard.
    naive_utc_from_tz = now_tz.replace(tzinfo=None) - now_tz.utcoffset()
    assert abs((naive_utc_from_tz - now_notz).total_seconds()) < 0.1
Example #8
0
 def disclaim_site(self, site, page=None):
     """Release the claim on `site` (and on `page`, if one is given).

     Clears `site.claimed` and sets `site.last_disclaimed` to
     `rethinkstuff.utcnow()`. With no page and no outstanding pages the site
     is passed to `self.finished` with "FINISHED"; otherwise it is saved
     with `self.update_site`. A given page has `claimed` cleared and is
     saved with `self.update_page`.
     """
     self.logger.info("disclaiming %s", site)
     site.claimed = False
     site.last_disclaimed = rethinkstuff.utcnow()

     if page:
         # A page is still in play, so the site cannot be finished yet.
         self.update_site(site)
         page.claimed = False
         self.update_page(page)
     elif self.has_outstanding_pages(site):
         self.update_site(site)
     else:
         self.finished(site, "FINISHED")
Example #9
0
 def disclaim_site(self, site, page=None):
     """Give up the claim on `site`, and on `page` when one is passed.

     The site's `claimed` flag is cleared and `last_disclaimed` is set from
     `rethinkstuff.utcnow()`. The site is finished ("FINISHED") only when no
     page is passed and `self.has_outstanding_pages(site)` is false; in
     every other case it is saved through `self.update_site`.
     """
     self.logger.info("disclaiming %s", site)
     site.claimed = False
     site.last_disclaimed = rethinkstuff.utcnow()

     # Short-circuits: outstanding pages are only checked without a page.
     site_is_done = not page and not self.has_outstanding_pages(site)
     if site_is_done:
         self.finished(site, "FINISHED")
     else:
         self.update_site(site)

     if not page:
         return
     page.claimed = False
     self.update_page(page)
Example #10
0
    def _service_heartbeat_if_due(self):
        '''Sends service registry heartbeat if due'''
        due = False
        if self._service_registry:
            if not hasattr(self, "status_info"):
                due = True
            else:
                d = rethinkstuff.utcnow() - self.status_info["last_heartbeat"]
                due = d.total_seconds() > self.HEARTBEAT_INTERVAL

        if due:
            self._service_heartbeat()
Example #11
0
    def run(self):
        """Main worker loop: claim sites and brozzle each in its own thread.

        Until `self._shutdown_requested` is set, each pass sends a service
        registry heartbeat if one is due, acquires a browser from the pool,
        claims a site from the frontier and starts a thread running
        `self._brozzle_site(browser, site)`, then sleeps for half a second.
        Running out of browsers or of claimable sites is logged once per
        state change and otherwise ignored. On exit the worker unregisters
        from the service registry if it had registered.
        """
        try:
            latest_state = None
            while not self._shutdown_requested.is_set():
                if self._service_registry and (
                        not hasattr(self, "status_info")
                        or (rethinkstuff.utcnow() -
                            self.status_info["last_heartbeat"]).total_seconds()
                        > self.HEARTBEAT_INTERVAL):
                    self._service_heartbeat()

                try:
                    browser = self._browser_pool.acquire()
                    try:
                        site = self._frontier.claim_site("{}:{}".format(
                            socket.gethostname(), browser.chrome_port))
                        self.logger.info(
                                "brozzling site (proxy=%s) %s",
                                repr(self._proxy(site)), site)
                        # Hand browser and site to the thread as arguments.
                        # A lambda would look the loop variables up only
                        # when the thread runs it, by which time a later
                        # iteration may have rebound them.
                        th = threading.Thread(
                                target=self._brozzle_site,
                                args=(browser, site),
                                name="BrozzlingThread:%s" % site.seed)
                        th.start()
                        self._browsing_threads.add(th)
                    except:
                        # no thread took ownership of the browser, so give
                        # it back to the pool before propagating
                        self._browser_pool.release(browser)
                        raise
                except brozzler.browser.NoBrowsersAvailable:
                    if latest_state != "browsers-busy":
                        self.logger.info(
                                "all %s browsers are busy", self._max_browsers)
                        latest_state = "browsers-busy"
                except brozzler.NothingToClaim:
                    if latest_state != "no-unclaimed-sites":
                        self.logger.info("no unclaimed sites to browse")
                        latest_state = "no-unclaimed-sites"
                time.sleep(0.5)
        except:
            # deliberately catches everything: log it rather than let the
            # exception escape this method
            self.logger.critical(
                    "thread exiting due to unexpected exception",
                    exc_info=True)
        finally:
            if self._service_registry and hasattr(self, "status_info"):
                try:
                    self._service_registry.unregister(self.status_info["id"])
                except:
                    # best effort; a failed unregister must not mask the exit
                    self.logger.error(
                            "failed to unregister from service registry",
                            exc_info=True)
Example #12
0
 def claim_site(self, worker_id):
     """Claim one ACTIVE site for `worker_id` and return it.

     The query selects the first ACTIVE site, ordered by the
     "sites_last_disclaimed" index, that is either not claimed or whose
     `last_claimed` is older than `rethinkdb.now() - 2*60*60` (presumably
     seconds, i.e. two hours -- confirm against the RethinkDB docs), and
     marks it claimed by `worker_id`.

     A claimed site that has exceeded its time limit is finished by
     `self._enforce_time_limit` and the loop tries the next one.

     Raises brozzler.NothingToClaim when the update replaced no row.
     """
     # XXX keep track of aggregate priority and prioritize sites accordingly?
     while True:
         result = (
                 self.r.table("sites", read_mode="majority")
                 .between(
                     ["ACTIVE",rethinkdb.minval],
                     ["ACTIVE",rethinkdb.maxval],
                     index="sites_last_disclaimed")
                 .order_by(index="sites_last_disclaimed")
                 .filter(
                     (rethinkdb.row["claimed"] != True) |
                     (rethinkdb.row["last_claimed"]
                         < rethinkdb.now() - 2*60*60))
                 .limit(1)
                 .update(
                     # try to avoid a race condition resulting in multiple
                     # brozzler-workers claiming the same site
                     # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                     rethinkdb.branch(
                         (rethinkdb.row["claimed"] != True) |
                         (rethinkdb.row["last_claimed"]
                             < rethinkdb.now() - 2*60*60), {
                                 "claimed": True,
                                 "last_claimed_by": worker_id,
                                 "last_claimed": rethinkstuff.utcnow()
                             }, {}), return_changes=True)).run()
         self._vet_result(result, replaced=[0,1], unchanged=[0,1])
         if result["replaced"] == 1:
             if result["changes"][0]["old_val"]["claimed"]:
                 # Logger.warn is a deprecated alias of Logger.warning
                 self.logger.warning(
                         "re-claimed site that was still marked 'claimed' "
                         "because it was last claimed a long time ago "
                         "at %s, and presumably some error stopped it from "
                         "being disclaimed",
                         result["changes"][0]["old_val"]["last_claimed"])
             site = brozzler.Site(**result["changes"][0]["new_val"])
         else:
             raise brozzler.NothingToClaim
         # XXX This is the only place we enforce time limit for now. Worker
         # loop should probably check time limit. Maybe frontier needs a
         # housekeeping thread to ensure that time limits get enforced in a
         # timely fashion.
         if not self._enforce_time_limit(site):
             return site
Example #13
0
    def run(self):
        """Worker loop: keep claiming sites and brozzling them in threads.

        Runs until `self._shutdown_requested` is set. Every pass sends a
        service registry heartbeat when one is due, takes a browser from
        the pool, claims a site from the frontier and starts a thread that
        calls `self._brozzle_site(browser, site)`, then sleeps 0.5 seconds.
        "No browsers available" and "nothing to claim" are logged once per
        state change. On the way out the worker unregisters from the
        service registry if it had registered.
        """
        try:
            latest_state = None
            while not self._shutdown_requested.is_set():
                if self._service_registry and (
                        not hasattr(self, "status_info") or
                    (rethinkstuff.utcnow() - self.status_info["last_heartbeat"]
                     ).total_seconds() > self.HEARTBEAT_INTERVAL):
                    self._service_heartbeat()

                try:
                    browser = self._browser_pool.acquire()
                    try:
                        site = self._frontier.claim_site("{}:{}".format(
                            socket.gethostname(), browser.chrome_port))
                        self.logger.info("brozzling site %s", site)
                        # Bind browser and site now via args=. A lambda
                        # would read the loop variables only when the new
                        # thread executes it, after they may have changed.
                        th = threading.Thread(
                            target=self._brozzle_site,
                            args=(browser, site),
                            name="BrozzlingThread:%s" % site.seed)
                        th.start()
                        self._browsing_threads.add(th)
                    except:
                        # the browser was not handed to a thread; return it
                        # to the pool and let the error propagate
                        self._browser_pool.release(browser)
                        raise
                except brozzler.browser.NoBrowsersAvailable:
                    if latest_state != "browsers-busy":
                        self.logger.info("all %s browsers are busy",
                                         self._max_browsers)
                        latest_state = "browsers-busy"
                except brozzler.NothingToClaim:
                    if latest_state != "no-unclaimed-sites":
                        self.logger.info("no unclaimed sites to browse")
                        latest_state = "no-unclaimed-sites"
                time.sleep(0.5)
        except:
            # catch-all on purpose: record why the loop died
            self.logger.critical("thread exiting due to unexpected exception",
                                 exc_info=True)
        finally:
            if self._service_registry and hasattr(self, "status_info"):
                try:
                    self._service_registry.unregister(self.status_info["id"])
                except:
                    # best effort only
                    self.logger.error(
                        "failed to unregister from service registry",
                        exc_info=True)
Example #14
0
    def __init__(self,
                 seed,
                 id=None,
                 job_id=None,
                 scope=None,
                 proxy=None,
                 ignore_robots=False,
                 time_limit=None,
                 warcprox_meta=None,
                 enable_warcprox_features=False,
                 reached_limit=None,
                 status="ACTIVE",
                 claimed=False,
                 start_time=None,
                 last_disclaimed=_EPOCH_UTC,
                 last_claimed_by=None,
                 last_claimed=_EPOCH_UTC,
                 metadata=None,
                 remember_outlinks=None,
                 cookie_db=None):
        """Set up a site whose crawl starts at `seed`.

        Arguments are stored as same-named attributes, with these
        adjustments:

        - `enable_warcprox_features` and `claimed` are converted with bool().
        - a falsy `start_time` is replaced by `rethinkstuff.utcnow()`.
        - `metadata` left out (or None) becomes a new empty dict that is
          private to this instance.
        - `scope` is shallow-copied rather than modified in place, and gets
          a "surt" entry from `Url(seed).surt` when it has none.
        """
        self.seed = seed
        self.id = id
        self.job_id = job_id
        self.proxy = proxy
        self.ignore_robots = ignore_robots
        self.enable_warcprox_features = bool(enable_warcprox_features)
        self.warcprox_meta = warcprox_meta
        self.time_limit = time_limit
        self.reached_limit = reached_limit
        self.status = status
        self.claimed = bool(claimed)
        self.last_claimed_by = last_claimed_by
        self.start_time = start_time or rethinkstuff.utcnow()
        self.last_disclaimed = last_disclaimed
        self.last_claimed = last_claimed
        # Never use `{}` as the signature default: that single dict would be
        # shared by all sites constructed without metadata.
        self.metadata = {} if metadata is None else metadata
        self.remember_outlinks = remember_outlinks
        self.cookie_db = cookie_db

        # Work on a copy so adding "surt" does not alter the caller's dict.
        self.scope = dict(scope) if scope else {}
        if "surt" not in self.scope:
            self.scope["surt"] = Url(seed).surt
Example #15
0
    def _maybe_finish_job(self, job_id):
        """Returns True if job is finished."""
        job = self.job(job_id)
        if not job:
            return False
        if job.status.startswith("FINISH"):
            self.logger.warn("%s is already %s", job, job.status)
            return True

        results = self.r.table("sites").get_all(job_id, index="job_id").run()
        n = 0
        for result in results:
            site = brozzler.Site(**result)
            if not site.status.startswith("FINISH"):
                results.close()
                return False
            n += 1

        self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
        job.status = "FINISHED"
        job.finished = rethinkstuff.utcnow()
        self.update_job(job)
        return True
Example #16
0
    def _maybe_finish_job(self, job_id):
        """Returns True if job is finished."""
        job = self.job(job_id)
        if not job:
            return False
        if job.status.startswith("FINISH"):
            self.logger.warn("%s is already %s", job, job.status)
            return True

        results = self.r.table("sites").get_all(job_id, index="job_id").run()
        n = 0
        for result in results:
            site = brozzler.Site(**result)
            if not site.status.startswith("FINISH"):
                results.close()
                return False
            n += 1

        self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
        job.status = "FINISHED"
        job.finished = rethinkstuff.utcnow()
        self.update_job(job)
        return True
Example #17
0
 def claim_site(self, worker_id):
     """Claim an ACTIVE site on behalf of `worker_id` and return it.

     Picks the first ACTIVE site in "sites_last_disclaimed" index order that
     is unclaimed, or whose `last_claimed` is older than
     `rethinkdb.now() - 2 * 60 * 60` (presumably seconds -- confirm), and
     updates it to be claimed by `worker_id`. If the claimed site turns out
     to be over its time limit, `self._enforce_time_limit` finishes it and
     another claim is attempted.

     Raises brozzler.NothingToClaim when the update replaced no row.
     """
     # XXX keep track of aggregate priority and prioritize sites accordingly?
     while True:
         result = (
             self.r.table("sites", read_mode="majority")
             .between(["ACTIVE", rethinkdb.minval], ["ACTIVE", rethinkdb.maxval], index="sites_last_disclaimed")
             .order_by(index="sites_last_disclaimed")
             .filter(
                 (rethinkdb.row["claimed"] != True) | (rethinkdb.row["last_claimed"] < rethinkdb.now() - 2 * 60 * 60)
             )
             .limit(1)
             .update(
                 # try to avoid a race condition resulting in multiple
                 # brozzler-workers claiming the same site
                 # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                 rethinkdb.branch(
                     (rethinkdb.row["claimed"] != True)
                     | (rethinkdb.row["last_claimed"] < rethinkdb.now() - 2 * 60 * 60),
                     {"claimed": True, "last_claimed_by": worker_id, "last_claimed": rethinkstuff.utcnow()},
                     {},
                 ),
                 return_changes=True,
             )
         ).run()
         self._vet_result(result, replaced=[0, 1], unchanged=[0, 1])
         if result["replaced"] == 1:
             if result["changes"][0]["old_val"]["claimed"]:
                 # Logger.warning replaces the deprecated Logger.warn alias
                 self.logger.warning(
                     "re-claimed site that was still marked 'claimed' "
                     "because it was last claimed a long time ago "
                     "at %s, and presumably some error stopped it from "
                     "being disclaimed",
                     result["changes"][0]["old_val"]["last_claimed"],
                 )
             site = brozzler.Site(**result["changes"][0]["new_val"])
         else:
             raise brozzler.NothingToClaim
         # XXX This is the only place we enforce time limit for now. Worker
         # loop should probably check time limit. Maybe frontier needs a
         # housekeeping thread to ensure that time limits get enforced in a
         # timely fashion.
         if not self._enforce_time_limit(site):
             return site