Example #1
    def __call__(self, mon, thr):
        self.mon = mon
        self.db = url_database.ensure_database(self.args)
        self.prepared_batches = {}
        self.processing = {}
        self.per_locale = [0]*(len(self.locales)+1)
        self.status_queue = queue.PriorityQueue()
        self.status_queue_serializer = 0
        self.mon.register_event_queue(self.status_queue,
                                      (self._MON_SAYS_STOP, -1))
        self.prepare_database()

        try:
            self.sshsockdir = tempfile.mkdtemp(prefix="capture-control")

            for tld, workers in self.locales.items():
                self.prepared_batches[tld] = collections.deque()
                self.processing[tld] = set()
                for w in workers:
                    self.mon.add_work_thread(CaptureWorker(self, tld, w))

            self.dispatcher_loop()

        finally:
            if self.sshsockdir is not None:
                shutil.rmtree(self.sshsockdir)
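The finally clause here relies on self.sshsockdir already existing as an attribute (presumably initialized to None on the class) so that cleanup is safe even if tempfile.mkdtemp itself raises. A minimal, self-contained sketch of that cleanup pattern with assumed names (Controller and run are illustrative, not from the project):

import shutil
import tempfile

class Controller:
    sshsockdir = None  # pre-set so the finally clause below is always safe

    def run(self):
        try:
            self.sshsockdir = tempfile.mkdtemp(prefix="capture-control")
            # ... start worker threads and run the dispatcher loop ...
        finally:
            if self.sshsockdir is not None:
                shutil.rmtree(self.sshsockdir)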
Example #2
def rescan(args):
    db = url_database.ensure_database(args)
    cur = db.cursor()
    cur.execute("SET search_path TO ts_run_4")
    with open(args.to_rescan, "rt") as f:
        rd = csv.DictReader(f)
        process_urls(db, rd)
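The hard-coded "SET search_path TO ts_run_4" pins this rescan to a single PostgreSQL run schema. A sketch of factoring that out, reusing csv, url_database and process_urls from the example above; the set_run_schema helper and the args.run attribute are assumptions, not part of the project:

def set_run_schema(db, run_number):
    # Schema names cannot be bound as query parameters, so build the
    # identifier from an integer to keep it injection-safe.
    cur = db.cursor()
    cur.execute("SET search_path TO ts_run_%d" % int(run_number))

def rescan_run(args):
    db = url_database.ensure_database(args)
    set_run_schema(db, args.run)  # e.g. args.run == 4
    with open(args.to_rescan, "rt") as f:
        process_urls(db, csv.DictReader(f))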
Example #3
    def __call__(self):
        db = url_database.ensure_database(self.args)
        with open(self.args.file) as fp:
            self.load_metadata(db, fp)
            self.load_urls(db, fp)

        if self.delayed_failure:
            raise SystemExit(1)
Example #4
    def __call__(self):
        datestamp = time.strftime("%Y-%m-%d", time.gmtime())
        db = url_database.ensure_database(self.args)
        to_import = self.update_srcdir(db, self.args.source, self.args.repo)
        self.ensure_category_codes(db, self.args.source)
        self.process_imports(db, datestamp, to_import)
        if self.delayed_failure:
            raise SystemExit(1)
Example #5
    def __call__(self):
        datestamp = time.strftime("%Y-%m-%d", time.gmtime())
        db        = url_database.ensure_database(self.args)
        to_import = self.update_srcdir(db, self.args.source, self.args.repo)
        self.process_imports(db, datestamp, to_import)
        self.update_canon_queue(db)
        if self.delayed_failure:
            raise SystemExit(1)
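Examples 3 through 5 share a delayed-failure idiom: individual records are allowed to fail, the loader keeps going, and the whole run exits non-zero at the end. A minimal sketch of that idiom with assumed names (Loader, insert_one and the print-based logging are illustrative, not the project's code):

class Loader:
    def __init__(self, args):
        self.args = args
        self.delayed_failure = False

    def load_urls(self, db, fp):
        for lineno, line in enumerate(fp, 1):
            try:
                self.insert_one(db, line.strip())
            except Exception as exc:
                # Keep going, but remember the failure so __call__ can
                # raise SystemExit(1) once the whole file is processed.
                print("line %d: %s" % (lineno, exc))
                self.delayed_failure = True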
Example #7
    def load_database(self):
        self.report_progress("Loading database...")
        self.db = url_database.ensure_database(self.args)

        cr = self.db.cursor()
        # Cache the status table in memory; it's reasonably small.
        self.report_progress("Loading database... (canon statuses)")
        cr.execute("SELECT id, status FROM canon_statuses;")
        self.canon_statuses = { row[1]: row[0]
                                for row in url_database.fetch_iter(cr) }
Example #8
def make_new_run(args):
    global quiet
    quiet = args.quiet

    progress(None)
    db = url_database.ensure_database(args)
    if args.copy_from is not None:
        old_run = find_old_run(db, args)

    new_run = initialize_new_schema(db, args)
    if args.copy_from is not None:
        copy_sources(db, args, old_run, new_run)
Example #10
def extract_from_twitter(args):
    extractors = {
        'single':   SingleExtractor,
        'snowball': SnowballExtractor,
        'frontier': FrontierExtractor,
        'firehose': FirehoseExtractor,
        'urls':     UrlsOnlyExtractor,
        'resume':   resume_extraction
    }
    args.seed = " ".join(args.seed)
    db = url_database.ensure_database(args)
    twi = connect_to_twitter_api()
    extractor = extractors[args.mode](args, db, twi)
    extractor.run()
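The extractors dict above is a dispatch table keyed on args.mode. A hedged sketch of how the matching command-line options might be declared (the option names and defaults are assumptions; the examples do not show the project's real argument parser):

import argparse

def make_twitter_argparser():
    ap = argparse.ArgumentParser()
    ap.add_argument("--mode",
                    choices=["single", "snowball", "frontier",
                             "firehose", "urls", "resume"],
                    default="single",
                    help="selects the extractor class from the dispatch table")
    ap.add_argument("--seed", nargs="*", default=[],
                    help="joined with spaces before use, as in the example")
    ap.add_argument("--database",
                    help="passed through to url_database.ensure_database")
    return ap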
Example #12
    def prepare_database(self):
        db = url_database.ensure_database(self.args)
        cur = db.cursor()
        # Find the latest date already in the table.  We don't
        # need to process dates before that point.
        cur.execute("SELECT coalesce(max(timestamp), 0) "
                    "FROM urls_herdict")
        start_date = cur.fetchone()[0]
        if start_date == 0:
            start_date = None
        else:
            start_date = (datetime.date.fromtimestamp(start_date)
                          .strftime("%Y-%m-%d"))

        end_date = datetime.datetime.utcnow().strftime("%Y-%m-%d")
        return db, start_date, end_date
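Note that the snippet above converts the stored timestamp with datetime.date.fromtimestamp, which interprets it in local time, while the end date comes from datetime.datetime.utcnow(). A timezone-consistent sketch of the same computation, for illustration only:

import datetime

def date_range_from(max_timestamp):
    # max_timestamp is the coalesce(max(timestamp), 0) result, assumed to
    # be Unix epoch seconds.
    if max_timestamp == 0:
        start_date = None
    else:
        start_date = (datetime.datetime
                      .fromtimestamp(max_timestamp, datetime.timezone.utc)
                      .strftime("%Y-%m-%d"))
    end_date = (datetime.datetime.now(datetime.timezone.utc)
                .strftime("%Y-%m-%d"))
    return start_date, end_date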
Example #13
    def __call__(self, mon, thr):
        self.mon = mon

        srcdbs = { src: url_database.reconnect_to_database(argshim(src))
                   for src in self.args.sources }

        destdb = url_database.ensure_database(self.args)

        self.uidmap = self.merge_url_strings(destdb, srcdbs)
        self.oidmap = self.merge_origins(destdb, srcdbs)
        self.cidmap = self.merge_canon_statuses(destdb, srcdbs)

        self.merge_urls(destdb, srcdbs)
        self.merge_canon_urls(destdb, srcdbs)
        self.merge_anomalies(destdb, srcdbs)
        self.merge_ancillary(destdb, srcdbs)
Example #15
    def __call__(self, mon, thr):
        self.mon = mon
        self.status_queue = queue.PriorityQueue()
        self.mon.register_event_queue(self.status_queue,
                                      (self._MON_SAYS_STOP, -1))

        self.mon.set_status_prefix("d")
        self.mon.report_status("loading...")

        self.proxies = ProxySet(self, self.mon, self.args,
                                self.proxy_sort_key)
        self.mon.report_status("loading... (proxies OK)")

        self.db = url_database.ensure_database(self.args)
        self.prepare_database()

        for _ in range(self.args.total_workers):
            wt = CaptureWorker(self)
            self.mon.add_work_thread(wt)
            self.idle_workers.add(wt)

        self.dispatcher_loop()
Example #16
    def load_database(self):
        self.report_progress("Loading database...")
        self.db = url_database.ensure_database(self.args)

        cr = self.db.cursor()
        # Cache the status table in memory; it's reasonably small.
        self.report_progress("Loading database... (canon statuses)")
        cr.execute("SELECT id, status FROM canon_statuses;")
        self.canon_statuses = { row[1]: row[0]
                                for row in url_database.fetch_iter(cr) }

        if self.args.work_queue:
            self.todo = open(self.args.work_queue, "rb")
        else:
            self.report_progress("Loading database... (work queue)")
            self.todo = tempfile.TemporaryFile("w+b")
            subprocess.check_call(["sqlite3", self.args.database,
                   "SELECT DISTINCT u.url, v.url"
                   "  FROM urls as u"
                   "  LEFT JOIN url_strings as v on u.url = v.id"
                   "  WHERE u.url NOT IN (SELECT url FROM canon_urls)"],
                                  stdout=self.todo)
            self.todo.seek(0)
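The shell-out to the sqlite3 command-line tool above streams every URL that still lacks a canonical form into a temporary work-queue file. For comparison, a sketch of the same query run through the already-open connection; the pipe-separated lines mirror the sqlite3 CLI's default output, and this helper is illustrative rather than part of the project:

def dump_work_queue(db, todo):
    cur = db.cursor()
    cur.execute("SELECT DISTINCT u.url, v.url"
                "  FROM urls AS u"
                "  LEFT JOIN url_strings AS v ON u.url = v.id"
                "  WHERE u.url NOT IN (SELECT url FROM canon_urls)")
    for uid, url in cur:
        todo.write(("%s|%s\n" % (uid, url)).encode("utf-8"))
    todo.seek(0)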
Example #17
    def __call__(self, mon, thr):
        datestamp = time.strftime("%Y-%m-%d", time.gmtime())
        db        = url_database.ensure_database(self.args)
        sitelist  = self.download_sitelist(mon, datestamp)
        self.process_sitelist(mon, db, sitelist, datestamp)
Example #19
    def prepare_database(self):
        # Herdict reports have several more keys than this, but none
        # of them appear to be terribly trustworthy.
        herdict_schema = """\
CREATE TABLE herdict_reports (
    uid         INTEGER PRIMARY KEY,
    timestamp   INTEGER,
    accessible  INTEGER, -- (boolean)
    country     TEXT
);
CREATE INDEX herdict_reports__timestamp ON herdict_reports(timestamp);
"""
        db = ensure_database(self.args)
        with db:
            # FIXME: More sophisticated way of detecting presence of our
            # ancillary schema.
            s_tables = frozenset(re.findall("(?m)(?<=^CREATE TABLE )[a-z_]+",
                                            herdict_schema))
            s_indices = frozenset(re.findall("(?m)(?<=^CREATE INDEX )[a-z_]+",
                                             herdict_schema))
            d_tables = frozenset(r[0] for r in db.execute(
                    "SELECT name FROM sqlite_master WHERE "
                    "  type = 'table' AND name LIKE 'herdict_%'"))
            d_indices = frozenset(r[0] for r in db.execute(
                    "SELECT name FROM sqlite_master WHERE "
                    "  type = 'index' AND name LIKE 'herdict_%'"))

            if not d_tables and not d_indices:
                db.executescript(herdict_schema)
                db.commit()
            elif d_tables != s_tables or d_indices != s_indices:
                raise RuntimeError("ancillary schema mismatch - "
                                   "migration needed")

            oid = db.execute("SELECT id FROM origins"
                             "  WHERE label = 'herdict'").fetchone()
            if oid is None:
                oid = db.execute("INSERT INTO origins"
                                 "  VALUES(NULL, 'herdict')").lastrowid
            else:
                oid = oid[0]

            # Find the latest date already in the table.  We don't
            # need to process dates before that point.  Note that
            # Herdict's fsd= and fed= parameters are both inclusive, so
            # we need to step to the next day.
            db.execute("ANALYZE")
            start_date = db.execute("SELECT COALESCE(MAX(timestamp), 0) "
                                    "FROM herdict_reports").fetchone()[0]
            if start_date == 0:
                start_date = None
            else:
                start_date = ((datetime.date.fromtimestamp(start_date)
                               + datetime.timedelta(days=1))
                              .strftime("%Y-%m-%d"))

        # Herdict raw reports do not have serial numbers, and the API
        # only lets you ask for reports up to a certain _date_, not a
        # date and time.  So, to avoid ever getting duplicate dates
        # upon requerying the API, ask for reports up to and including
        # yesterday (UTC).  Note that datetime.date seems to be
        # unaware that "today (local)" and "today (UTC)" are not the
        # same thing, feh.
        end_date = ((datetime.datetime.utcnow()
                     - datetime.timedelta(days=1))
                    .strftime("%Y-%m-%d"))

        return db, oid, start_date, end_date
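Across these examples, url_database.ensure_database takes the parsed argument namespace and returns a DB-API connection: Example 16 shells out to the sqlite3 tool on args.database and Example 19 calls executescript, which point at SQLite, while Example 2's SET search_path implies a PostgreSQL backend. A minimal sketch of the SQLite case only, stated as an assumption about the interface rather than the project's actual implementation:

import sqlite3

def ensure_database(args):
    # Assumed behaviour: open (creating if necessary) the database file
    # named by args.database and return the connection, ready for the
    # cursor()/execute()/executescript() calls seen above.
    db = sqlite3.connect(args.database)
    db.execute("PRAGMA foreign_keys = ON")  # illustrative setup only
    return db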