Example #1
    def __call__(self, mon, thr):
        self.mon = mon
        self.status_queue = queue.PriorityQueue()
        self.mon.register_event_queue(self.status_queue,
                                      (self._MON_SAYS_STOP, -1))

        self.mon.set_status_prefix("d")
        self.mon.report_status("loading...")

        self.proxies = ProxySet(self, self.mon, self.args,
                                self.proxy_sort_key)
        self.mon.report_status("loading... (proxies OK)")

        self.db = url_database.ensure_database(self.args)
        self.prepare_database()

        for _ in range(self.args.total_workers):
            wt = CaptureWorker(self)
            self.mon.add_work_thread(wt)
            self.idle_workers.add(wt)

        self.dispatcher_loop()
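
The worker objects handed to mon.add_work_thread in these examples are themselves callables run on their own threads, mirroring the dispatcher's own __call__(mon, thr) entry point, and they report back through the dispatcher's complete_batch/fail_batch methods shown in Example #4. The sketch below is only an assumption of what that interface looks like; CaptureWorker's real internals are not part of these examples, and process() is a hypothetical placeholder.

import queue
import sys

class SketchWorker:
    # Assumed worker shape: queue_batch()/finished() are called from the
    # dispatcher thread, __call__() runs on the thread created by
    # mon.add_work_thread(), and results flow back via the dispatcher's
    # complete_batch()/fail_batch() methods (see Example #4).
    def __init__(self, dispatcher):
        self.dispatcher = dispatcher
        self.batch_queue = queue.Queue()

    def queue_batch(self, state, batch):
        self.batch_queue.put((state, batch))

    def finished(self):
        self.batch_queue.put(None)          # sentinel: no more work

    def __call__(self, mon, thr):
        while True:
            item = self.batch_queue.get()
            if item is None:
                break
            state, batch = item
            try:
                result = self.process(batch)        # hypothetical helper
                self.dispatcher.complete_batch(self, result)
            except Exception:
                self.dispatcher.fail_batch(self, sys.exc_info())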
Example #2
    def __call__(self, mon, thr):
        self.mon = mon
        self.status_queue = queue.Queue()
        self.mon.register_event_queue(self.status_queue, (self._MON_SAYS_STOP,))

        self.mon.set_status_prefix("d")
        self.mon.report_status("loading...")

        self.proxies = ProxySet(self, mon, self.args)
        self.mon.report_status("loading... (proxies OK)")

        self.locations = {
            loc: LocationState(loc, self.args.destinations, self.output_dir)
            for loc in self.proxies.locations.keys()
        }
        self.mon.report_status("loading... (locations OK)")

        # We only need one worker thread per proxy, because scamper
        # parallelizes work internally.
        for _ in range(self.args.max_simultaneous_proxies):
            wt = TracerouteWorker(self)
            self.mon.add_work_thread(wt)
            self.idle_workers.add(wt)
        self.mon.report_status("loading... (work threads OK)")

        # kick things off by starting one proxy
        (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
        self.mon.report_status(
            "{}/{}/{} locations active, {} started, "
            "{} till next".format(
                len(self.proxies.active_proxies),
                n_locations,
                len(self.locations),
                proxy.label() if proxy else None,
                until_next,
            )
        )

        while n_locations:
            time_now = time.monotonic()
            # Technically, until_next being None means "wait for a proxy
            # to exit", but use an hour as a backstop.  (When a proxy does
            # exit, this will get knocked down to zero below.)
            if until_next is None:
                until_next = 3600
            time_next = time_now + until_next
            pending_stop = False
            while time_now < time_next:
                for msg in queue_iter(self.status_queue, until_next):
                    if msg[0] == self._PROXY_ONLINE:
                        self.proxies.note_proxy_online(msg[1])
                        self.mon.report_status("proxy {} online".format(msg[1].label()))
                        self.mon.idle(1)

                    elif msg[0] == self._PROXY_OFFLINE:
                        self.mon.report_status("proxy {} offline".format(msg[1].label()))
                        self.proxies.note_proxy_offline(msg[1])
                        # Wait no more than 5 minutes before trying to
                        # start another proxy.  (XXX This hardwires a
                        # specific provider's policy.)
                        time_now = time.monotonic()
                        time_next = min(time_next, time_now + 300)
                        until_next = time_next - time_now

                    elif msg[0] == self._BATCH_COMPLETE:
                        locstate = self.active_workers[msg[1]]
                        del self.active_workers[msg[1]]
                        self.idle_workers.add(msg[1])
                        locstate.complete_job()
                        self.mon.report_status("{} batch complete".format(locstate.location))

                    elif msg[0] == self._BATCH_FAILED:
                        locstate = self.active_workers[msg[1]]
                        del self.active_workers[msg[1]]
                        self.idle_workers.add(msg[1])
                        locstate.fail_job()
                        self.mon.report_status("{} batch failed".format(locstate.location))

                    elif msg[0] == self._DROP_WORKER:
                        worker = msg[1]
                        self.idle_workers.discard(worker)
                        if worker in self.active_workers:
                            self.active_workers[worker].fail_job()
                            del self.active_workers[worker]

                    elif msg[0] == self._MON_SAYS_STOP:
                        self.mon.report_status("interrupt pending")
                        pending_stop = True

                    else:
                        self.mon.report_error("bogus message: {!r}".format(msg))

                for loc, state in self.locations.items():
                    if state.next_task is None:
                        self.mon.report_status("{} finished".format(loc))
                        if loc in self.proxies.locations:
                            self.proxies.locations[loc].finished()

                if pending_stop:
                    self.mon.report_status("interrupted")
                    self.mon.maybe_pause_or_stop()
                    # don't start new work yet, the set of proxies
                    # available may be totally different now

                else:
                    for proxy in self.proxies.active_proxies:
                        if not self.idle_workers:
                            break
                        if not proxy.online:
                            continue
                        state = self.locations[proxy.loc]
                        if not state.active_task and state.next_task is not None:
                            worker = self.idle_workers.pop()
                            self.active_workers[worker] = state
                            state.queue_job(worker, proxy)
                            self.mon.report_status("queuing job for {}".format(proxy.label()))

                time_now = time.monotonic()
                until_next = time_next - time_now

            # when we get to this point, it's time to start another proxy
            (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
            self.mon.report_status(
                "{}/{}/{} locations active, {} started, "
                "{} till next".format(
                    len(self.proxies.active_proxies),
                    n_locations,
                    len(self.locations),
                    proxy.label() if proxy else None,
                    until_next,
                )
            )

        # done, kill off all the workers
        self.mon.report_status("finished")
        assert not self.active_workers
        for w in self.idle_workers:
            w.finished()
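
Both this loop and the ones in Examples #3 and #4 drain the status queue through a queue_iter() helper that is not shown here. A minimal sketch, assuming it blocks up to `timeout` seconds for the first message and then yields whatever else is already queued without blocking, so the caller can recompute its deadline between drains:

import queue

def queue_iter(q, timeout=None):
    # Sketch only (assumption about the missing helper): wait up to
    # `timeout` seconds for one message, then drain the rest non-blocking.
    if timeout is not None and timeout < 0:
        timeout = 0
    try:
        yield q.get(timeout=timeout)
        while True:
            yield q.get_nowait()
    except queue.Empty:
        return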
Example #3
    def __call__(self, mon, thr):
        self.mon = mon
        self.status_queue = queue.Queue()
        self.mon.register_event_queue(self.status_queue,
                                      (self._MON_SAYS_STOP, ))

        self.mon.set_status_prefix("d")
        self.mon.report_status("loading...")

        self.proxies = ProxySet(self,
                                mon,
                                self.args,
                                include_locations=self.dns_servers)
        self.mon.report_status("loading... (proxies OK)")

        for loc in list(self.dns_servers.keys()):
            if loc not in self.proxies.locations:
                del self.dns_servers[loc]

        assert list(self.dns_servers.keys()) == \
               list(self.proxies.locations.keys())

        self.locations = {
            loc: LocationState(loc, self.dns_servers[loc], self.hostnames,
                               self.output_dir)
            for loc in self.dns_servers.keys()
        }
        self.mon.report_status("loading... (locations OK)")

        # One work thread per active proxy.
        for _ in range(self.args.max_simultaneous_proxies):
            wt = DNSWorker(self)
            self.mon.add_work_thread(wt)
            self.idle_workers.add(wt)
        self.mon.report_status("loading... (work threads OK)")

        # kick things off by starting one proxy
        (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
        self.mon.report_status("{}/{}/{} locations active, {} started, "
                               "{} till next".format(
                                   len(self.proxies.active_proxies),
                                   n_locations, len(self.locations),
                                   proxy.label() if proxy else None,
                                   until_next))

        while n_locations:
            time_now = time.monotonic()
            # Technically, until_next being None means "wait for a proxy
            # to exit", but use an hour as a backstop.  (When a proxy does
            # exit, this will get knocked down to zero below.)
            if until_next is None:
                until_next = 3600
            time_next = time_now + until_next
            pending_stop = False
            while time_now < time_next:
                for msg in queue_iter(self.status_queue, until_next):
                    if msg[0] == self._PROXY_ONLINE:
                        self.proxies.note_proxy_online(msg[1])
                        self.mon.report_status("proxy {} online".format(
                            msg[1].label()))
                        self.mon.idle(1)

                    elif msg[0] == self._PROXY_OFFLINE:
                        self.mon.report_status("proxy {} offline".format(
                            msg[1].label()))
                        self.proxies.note_proxy_offline(msg[1])
                        # Wait no more than 5 minutes before trying to
                        # start another proxy.  (XXX This hardwires a
                        # specific provider's policy.)
                        time_now = time.monotonic()
                        time_next = min(time_next, time_now + 300)
                        until_next = time_next - time_now

                    elif msg[0] == self._BATCH_COMPLETE:
                        locstate = self.active_workers[msg[1]]
                        del self.active_workers[msg[1]]
                        self.idle_workers.add(msg[1])
                        locstate.complete_job()
                        self.mon.report_status("{} batch complete".format(
                            locstate.location))

                    elif msg[0] == self._BATCH_FAILED:
                        locstate = self.active_workers[msg[1]]
                        del self.active_workers[msg[1]]
                        self.idle_workers.add(msg[1])
                        locstate.fail_job()
                        self.mon.report_status("{} batch failed".format(
                            locstate.location))

                    elif msg[0] == self._DROP_WORKER:
                        worker = msg[1]
                        self.idle_workers.discard(worker)
                        if worker in self.active_workers:
                            self.active_workers[worker].fail_job()
                            del self.active_workers[worker]

                    elif msg[0] == self._MON_SAYS_STOP:
                        self.mon.report_status("interrupt pending")
                        pending_stop = True

                    else:
                        self.mon.report_error(
                            "bogus message: {!r}".format(msg))

                for loc, state in self.locations.items():
                    if state.finished_p():
                        self.mon.report_status("{} finished".format(loc))
                        if loc in self.proxies.locations:
                            self.proxies.locations[loc].finished()

                if pending_stop:
                    self.mon.report_status("interrupted")
                    self.mon.maybe_pause_or_stop()
                    # don't start new work yet, the set of proxies
                    # available may be totally different now

                else:
                    for proxy in self.proxies.active_proxies:
                        if not self.idle_workers:
                            break
                        if not proxy.online:
                            continue
                        state = self.locations[proxy.loc]
                        if state.idle_p():
                            worker = self.idle_workers.pop()
                            self.active_workers[worker] = state
                            state.queue_job(worker, proxy)
                            self.mon.report_status("queuing job for {}".format(
                                proxy.label()))

                time_now = time.monotonic()
                until_next = time_next - time_now

            # when we get to this point, it's time to start another proxy
            (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
            self.mon.report_status("{}/{}/{} locations active, {} started, "
                                   "{} till next".format(
                                       len(self.proxies.active_proxies),
                                       n_locations, len(self.locations),
                                       proxy.label() if proxy else None,
                                       until_next))

        # done, kill off all the workers
        self.mon.report_status("finished")
        assert not self.active_workers
        for w in self.idle_workers:
            w.finished()
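
Example #2 inspects LocationState.next_task and active_task directly, while this example asks the state object via idle_p() and finished_p(). A hedged sketch of what those predicates amount to, using the attribute names from Example #2 (the real LocationState is not shown):

class LocationState:
    # Sketch only: predicate bodies are assumptions inferred from how
    # Example #2 tests the same conditions inline.
    def idle_p(self):
        # No batch running and at least one batch left to schedule.
        return self.active_task is None and self.next_task is not None

    def finished_p(self):
        # Nothing left to schedule (Example #2 tests next_task is None).
        return self.next_task is None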
Example #4
class CaptureDispatcher:
    def __init__(self, args):
        # complete initialization deferred till we're on the right thread
        self.args                    = args
        self.idle_workers            = set()
        self.active_workers          = {}
        self.locations               = {}
        self.overall_jobsize         = 0
        self.proxies                 = None
        self.mon                     = None
        self.db                      = None
        self.status_queue            = None
        self.status_queue_serializer = 0

    def __call__(self, mon, thr):
        self.mon = mon
        self.status_queue = queue.PriorityQueue()
        self.mon.register_event_queue(self.status_queue,
                                      (self._MON_SAYS_STOP, -1))

        self.mon.set_status_prefix("d")
        self.mon.report_status("loading...")

        self.proxies = ProxySet(self, self.mon, self.args,
                                self.proxy_sort_key)
        self.mon.report_status("loading... (proxies OK)")

        self.db = url_database.ensure_database(self.args)
        self.prepare_database()

        for _ in range(self.args.total_workers):
            wt = CaptureWorker(self)
            self.mon.add_work_thread(wt)
            self.idle_workers.add(wt)

        self.dispatcher_loop()

    # Status queue helper constants and methods.
    _PROXY_OFFLINE  = 1
    _PROXY_ONLINE   = 2
    _BATCH_COMPLETE = 3
    _BATCH_FAILED   = 4
    _DROP_WORKER    = 5
    _MON_SAYS_STOP  = 6 # Stop after handling all incoming work

    # Entries in a PriorityQueue must be totally ordered.  We just
    # want to service all COMPLETE messages ahead of all others, and
    # STOP messages after all others, so we give them all a serial
    # number which goes in the tuple right after the command code,
    # before the data.  This also means we don't have to worry about
    # unsortable data.
    def oq(self):
        self.status_queue_serializer += 1
        return self.status_queue_serializer

    # worker-to-dispatcher API
    def complete_batch(self, worker, result):
        self.status_queue.put((self._BATCH_COMPLETE, self.oq(),
                               worker, result))

    def fail_batch(self, worker, exc_info):
        self.status_queue.put((self._BATCH_FAILED, self.oq(), worker))

    def drop_worker(self, worker):
        self.status_queue.put((self._DROP_WORKER, self.oq(), worker))

    # proxy-to-dispatcher API
    def proxy_online(self, proxy):
        self.status_queue.put((self._PROXY_ONLINE, self.oq(), proxy))

    def proxy_offline(self, proxy):
        self.status_queue.put((self._PROXY_OFFLINE, self.oq(), proxy))

    def _invalid_message(self, *args):
        self.mon.report_error("invalid status queue message {!r}"
                              .format(args))

    def dispatcher_loop(self):

        # Kick things off by starting one proxy.
        (proxy, until_next, n_locations) = self.proxies.start_a_proxy()

        while n_locations:
            time_now = time.monotonic()
            # Technically, until_next being None means "wait for a proxy
            # to exit", but use an hour as a backstop.  (When a proxy does
            # exit, this will get knocked down to zero below.)
            if until_next is None:
                until_next = 3600
            time_next = time_now + until_next
            pending_stop = False

            while time_now < time_next:
                self.update_progress_statistics(n_locations, until_next)

                for msg in queue_iter(self.status_queue, until_next):
                    if msg[0] == self._PROXY_ONLINE:
                        self.proxies.note_proxy_online(msg[2])

                    elif msg[0] == self._PROXY_OFFLINE:
                        self.proxies.note_proxy_offline(msg[2])
                        # Wait no more than 5 minutes before trying to
                        # start another proxy.  (XXX This hardwires a
                        # specific provider's policy.)
                        time_now = time.monotonic()
                        time_next = min(time_next, time_now + 300)
                        until_next = time_next - time_now

                    elif msg[0] == self._BATCH_COMPLETE:
                        worker, result = msg[2], msg[3]
                        locstate, _ = self.active_workers[worker]
                        del self.active_workers[worker]
                        self.idle_workers.add(worker)
                        self.record_batch(locstate, *result)

                    elif msg[0] == self._BATCH_FAILED:
                        worker = msg[2]
                        # We might've already gotten a COMPLETE message
                        # with more precision.
                        if worker in self.active_workers:
                            locstate, batch = self.active_workers[worker]
                            del self.active_workers[worker]
                            self.idle_workers.add(worker)
                            self.record_batch(locstate, [], batch)

                    elif msg[0] == self._DROP_WORKER:
                        worker = msg[2]
                        self.idle_workers.discard(worker)
                        if worker in self.active_workers:
                            # active_workers maps each worker to a
                            # (state, batch) pair here, so treat the dropped
                            # worker's batch as failed, as in _BATCH_FAILED.
                            locstate, batch = self.active_workers[worker]
                            del self.active_workers[worker]
                            self.record_batch(locstate, [], batch)

                    elif msg[0] == self._MON_SAYS_STOP:
                        self.mon.report_status("interrupt pending")
                        pending_stop = True

                    else:
                        self.mon.report_error("bogus message: {!r}"
                                              .format(msg))

                for loc, state in self.locations.items():
                    if state.todo == 0 and loc in self.proxies.locations:
                        self.proxies.locations[loc].finished()

                if pending_stop:
                    self.mon.report_status("interrupted")
                    self.mon.maybe_pause_or_stop()
                    # don't start new work yet, the set of proxies
                    # available may be totally different now

                else:
                    # One-second delay before starting new work, because
                    # proxies aren't always 100% up when they say they are.
                    self.mon.idle(1)

                    while self.idle_workers:
                        assigned_work = False
                        for proxy in self.proxies.active_proxies:
                            if not proxy.online:
                                continue
                            state = self.locations[proxy.loc]
                            if state.n_workers >= self.args.workers_per_loc:
                                continue
                            batch = self.select_batch(state)
                            if not batch:
                                # All work for this location is
                                # assigned to other workers already.
                                continue

                            state.n_workers += 1
                            state.in_progress.update(row[0] for row in batch)
                            worker = self.idle_workers.pop()
                            self.active_workers[worker] = (state, batch)
                            worker.queue_batch(state, batch)
                            assigned_work = True
                            if not self.idle_workers:
                                break

                        if not assigned_work:
                            break

                time_now = time.monotonic()
                until_next = time_next - time_now

            # when we get to this point, it's time to start another proxy
            (proxy, until_next, n_locations) = self.proxies.start_a_proxy()

        # done, kill off all the workers
        self.mon.report_status("finished")
        assert not self.active_workers
        for w in self.idle_workers:
            w.finished()

    def proxy_sort_key(self, loc, method):
        # Consider locales that currently have no workers at all first.
        # Consider locales with more work to do first.
        # Consider locales whose proxy is 'direct' first.
        # Consider locales named 'us' first.
        # As a final tie breaker use alphabetical order of locale name.
        state = self.locations[loc]
        return (state.n_workers != 0,
                -state.todo,
                method != 'direct',
                loc != 'us',
                loc)

    def select_batch(self, loc):
        with self.db, self.db.cursor() as cr:

            query = ('SELECT c.url as uid, s.url as url'
                     '  FROM capture_progress c, url_strings s'
                     ' WHERE c.url = s.id')

            query += ' AND NOT c."l_{0}"'.format(loc.locale)

            if loc.in_progress:
                query += ' AND c.url NOT IN ('
                query += ','.join(str(u) for u in loc.in_progress)
                query += ')'

            query += ' LIMIT {0}'.format(self.args.batch_size)
            cr.execute(query)
            return cr.fetchall()

    def record_batch(self, loc, successes, failures):
        locale = loc.locale
        loc.n_workers -= 1
        for r in failures:
            loc.in_progress.remove(r[0])

        if not successes:
            return

        with self.db, self.db.cursor() as cr:
            for s in successes:
                url_id = s[0]
                r      = s[1]
                loc.in_progress.remove(url_id)

                redir_url = None
                redir_url_id = None
                if r['canon']:
                    redir_url = r['canon']
                    if redir_url == r['ourl']:
                        redir_url_id = url_id
                    elif redir_url is not None:
                        try:
                            (redir_url_id, _) = \
                                url_database.add_url_string(cr, redir_url)
                        except (ValueError, UnicodeError):
                            addendum = "invalid redir url: " + redir_url
                            if ('detail' not in r or r['detail'] is None):
                                r['detail'] = addendum
                            else:
                                r['detail'] += " | " + addendum

                detail_id = self.capture_detail.get(r['detail'])
                if detail_id is None:
                    cr.execute("INSERT INTO capture_detail(id, detail) "
                               "  VALUES(DEFAULT, %s)"
                               "  RETURNING id", (r['detail'],))
                    detail_id = cr.fetchone()[0]
                    self.capture_detail[r['detail']] = detail_id

                result = url_database.categorize_result(r['status'],
                                                        r['detail'],
                                                        url_id,
                                                        redir_url_id)

                to_insert = {
                    "locale":       locale,
                    "url":          url_id,
                    "result":       result,
                    "detail":       detail_id,
                    "redir_url":    redir_url_id,
                    "log":          r['log'],
                    "html_content": r['content'],
                    "screenshot":   r['render']
                }
                cr.execute("INSERT INTO captured_pages"
                           "(locale, url, access_time, result, detail,"
                           " redir_url, capture_log, html_content,"
                           " screenshot)"
                           "VALUES ("
                           "  %(locale)s,"
                           "  %(url)s,"
                           "  TIMESTAMP 'now',"
                           "  %(result)s,"
                           "  %(detail)s,"
                           "  %(redir_url)s,"
                           "  %(log)s,"
                           "  %(html_content)s,"
                           "  %(screenshot)s)",
                           to_insert)
                cr.execute('UPDATE capture_progress SET "l_{0}" = TRUE '
                           ' WHERE url = {1}'.format(locale, url_id))
                loc.todo -= 1

    def update_progress_statistics(self, n_locations, until_next):
        jobsize = 0
        plreport = []
        for plstate in self.locations.values():
            jobsize = max(jobsize, plstate.todo)
            plreport.append((-plstate.todo, plstate.locale))

        plreport.sort()
        plreport = " ".join("{}:{}".format(pl[1], -pl[0]) for pl in plreport)

        self.mon.report_status("Processing {}/{} URLs | {}/{}/{} active, {} till next | {}"
                               .format(jobsize, self.overall_jobsize,
                                       len(self.proxies.active_proxies),
                                       n_locations,
                                       len(self.locations),
                                       until_next,
                                       plreport))

    def prepare_database(self):
        self.locations = { loc: PerLocaleState(loc, proxy)
                           for loc, proxy in self.proxies.locations.items() }
        with self.db, self.db.cursor() as cr:
            # Cache the status table in memory; it's reasonably small.
            self.mon.report_status("Preparing database... (capture detail)")
            cr.execute("SELECT detail, id FROM capture_detail;")
            self.capture_detail = { row.detail: row.id for row in cr }

            # The capture_progress table tracks what we've done so far.
            # It is regenerated from scratch each time this program is run,
            # based on the contents of the urls_* and captured_pages tables.
            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... "
                                   "(capture progress)")

            l_columns = ",\n  ".join(
                "\"l_{0}\" BOOLEAN NOT NULL DEFAULT FALSE"
                .format(loc) for loc in self.locations.keys())

            cr.execute("CREATE TEMPORARY TABLE capture_progress ("
                       "  url INTEGER PRIMARY KEY,"
                       + l_columns + ");")

            # Determine the set of URLs yet to be captured from the selected
            # tables.
            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... "
                                   "(capture progress rows)")

            cr.execute("SELECT table_name FROM information_schema.tables"
                       " WHERE table_schema = %s"
                       "   AND table_type = 'BASE TABLE'"
                       "   AND table_name LIKE 'urls_%%'",
                       (self.args.schema,))
            all_url_tables = set(row[0] for row in cr)

            if self.args.tables is None:
                want_url_tables = all_url_tables
            else:
                want_url_tables = set("urls_"+t.strip()
                                      for t in self.args.tables.split(","))
                if not want_url_tables.issubset(all_url_tables):
                    raise RuntimeError("Requested URL tables do not exist: "
                                       + ", ".join(
                                           t[5:] for t in
                                           want_url_tables - all_url_tables))

            for tbl in want_url_tables:
                self.mon.maybe_pause_or_stop()
                self.mon.report_status("Preparing database... "
                                       "(capture progress rows: {})"
                                       .format(tbl))

                # Only one row per URL, even if it appears in more than one
                # source table.
                cr.execute("INSERT INTO capture_progress (url) "
                           "        SELECT url FROM "+tbl+
                           " EXCEPT SELECT url FROM capture_progress")

            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... (analyzing)")
            cr.execute("ANALYZE captured_pages")

            for loc in self.locations.keys():
                self.mon.maybe_pause_or_stop()
                self.mon.report_status("Preparing database... "
                                       "(capture progress values: {})"
                                       .format(loc))

                cr.execute('UPDATE capture_progress c SET "l_{0}" = TRUE'
                           '  FROM captured_pages p'
                           ' WHERE c.url = p.url AND p.locale = \'{0}\''
                           .format(loc))

                self.mon.maybe_pause_or_stop()
                self.mon.report_status("Preparing database... (indexing: {})"
                                       .format(loc))
                cr.execute("CREATE INDEX \"capture_progress_l_{0}_idx\""
                           "  ON capture_progress(\"l_{0}\");"
                           .format(loc))

            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... (analyzing)")
            cr.execute("ANALYZE capture_progress")

            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... (statistics)")

            query = "SELECT COUNT(*)"
            for loc in self.locations.keys():
                query += ', SUM("l_{0}"::INTEGER) AS "l_{0}"'.format(loc)
            query += " FROM capture_progress"
            cr.execute(query)

            # Compute the number of unvisited URLs for each locale,
            # and remove locales where that number is zero from the
            # working set.

            counts = cr.fetchone()
            self.overall_jobsize = counts[0]
            for loc, done in zip(self.locations.keys(), counts[1:]):
                todo = self.overall_jobsize - done
                assert todo >= 0
                if todo:
                    self.locations[loc].todo = todo
                else:
                    self.locations[loc].proxy.finished()

            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Database prepared.")