def __call__(self, mon, thr):
    self.mon = mon
    self.db = url_database.ensure_database(self.args)
    self.prepared_batches = {}
    self.processing = {}
    self.per_locale = [0]*(len(self.locales)+1)
    self.status_queue = queue.PriorityQueue()
    self.status_queue_serializer = 0
    self.mon.register_event_queue(self.status_queue,
                                  (self._MON_SAYS_STOP, -1))
    self.prepare_database()
    try:
        self.sshsockdir = tempfile.mkdtemp(prefix="capture-control")
        # One batch queue and one in-flight set per locale, and one
        # worker thread per (locale, worker) pair.
        for tld, workers in self.locales.items():
            self.prepared_batches[tld] = collections.deque()
            self.processing[tld] = set()
            for w in workers:
                self.mon.add_work_thread(CaptureWorker(self, tld, w))
        self.dispatcher_loop()
    finally:
        # Always clean up the ssh control-socket directory.
        if self.sshsockdir is not None:
            shutil.rmtree(self.sshsockdir)

def rescan(args):
    db = url_database.ensure_database(args)
    cur = db.cursor()
    cur.execute("SET search_path TO ts_run_4")
    with open(args.to_rescan, "rt") as f:
        rd = csv.DictReader(f)
        process_urls(db, rd)

def __call__(self):
    db = url_database.ensure_database(self.args)
    with open(self.args.file) as fp:
        self.load_metadata(db, fp)
        self.load_urls(db, fp)
    if self.delayed_failure:
        raise SystemExit(1)

def __call__(self):
    datestamp = time.strftime("%Y-%m-%d", time.gmtime())
    db = url_database.ensure_database(self.args)
    to_import = self.update_srcdir(db, self.args.source, self.args.repo)
    self.ensure_category_codes(db, self.args.source)
    self.process_imports(db, datestamp, to_import)
    if self.delayed_failure:
        raise SystemExit(1)

def __call__(self):
    datestamp = time.strftime("%Y-%m-%d", time.gmtime())
    db = url_database.ensure_database(self.args)
    to_import = self.update_srcdir(db, self.args.source, self.args.repo)
    self.process_imports(db, datestamp, to_import)
    self.update_canon_queue(db)
    if self.delayed_failure:
        raise SystemExit(1)

def load_database(self):
    self.report_progress("Loading database...")
    self.db = url_database.ensure_database(self.args)
    cr = self.db.cursor()

    # Cache the status table in memory; it's reasonably small.
    self.report_progress("Loading database... (canon statuses)")
    cr.execute("SELECT id, status FROM canon_statuses;")
    self.canon_statuses = { row[1]: row[0]
                            for row in url_database.fetch_iter(cr) }

def make_new_run(args):
    global quiet
    quiet = args.quiet
    progress(None)
    db = url_database.ensure_database(args)
    if args.copy_from is not None:
        old_run = find_old_run(db, args)
    new_run = initialize_new_schema(db, args)
    if args.copy_from is not None:
        copy_sources(db, args, old_run, new_run)

def extract_from_twitter(args):
    extractors = {
        'single':   SingleExtractor,
        'snowball': SnowballExtractor,
        'frontier': FrontierExtractor,
        'firehose': FirehoseExtractor,
        'urls':     UrlsOnlyExtractor,
        'resume':   resume_extraction
    }
    args.seed = " ".join(args.seed)
    db = url_database.ensure_database(args)
    twi = connect_to_twitter_api()
    extractor = extractors[args.mode](args, db, twi)
    extractor.run()

def prepare_database(self):
    db = url_database.ensure_database(self.args)
    cur = db.cursor()

    # Find the latest date already in the table.  We don't
    # need to process dates before that point.
    cur.execute("SELECT coalesce(max(timestamp), 0) "
                "FROM urls_herdict")
    start_date = cur.fetchone()[0]
    if start_date == 0:
        start_date = None
    else:
        start_date = (datetime.date.fromtimestamp(start_date)
                      .strftime("%Y-%m-%d"))

    end_date = datetime.datetime.utcnow().strftime("%Y-%m-%d")
    return db, start_date, end_date

def __call__(self, mon, thr):
    self.mon = mon
    srcdbs = { src: url_database.reconnect_to_database(argshim(src))
               for src in self.args.sources }
    destdb = url_database.ensure_database(self.args)
    self.uidmap = self.merge_url_strings(destdb, srcdbs)
    self.oidmap = self.merge_origins(destdb, srcdbs)
    self.cidmap = self.merge_canon_statuses(destdb, srcdbs)
    self.merge_urls(destdb, srcdbs)
    self.merge_canon_urls(destdb, srcdbs)
    self.merge_anomalies(destdb, srcdbs)
    self.merge_ancillary(destdb, srcdbs)

def __call__(self, mon, thr):
    self.mon = mon
    self.status_queue = queue.PriorityQueue()
    self.mon.register_event_queue(self.status_queue,
                                  (self._MON_SAYS_STOP, -1))
    self.mon.set_status_prefix("d")
    self.mon.report_status("loading...")
    self.proxies = ProxySet(self, self.mon, self.args, self.proxy_sort_key)
    self.mon.report_status("loading... (proxies OK)")
    self.db = url_database.ensure_database(self.args)
    self.prepare_database()
    # Start the requested number of capture workers; all begin idle and
    # are handed work by the dispatcher loop.
    for _ in range(self.args.total_workers):
        wt = CaptureWorker(self)
        self.mon.add_work_thread(wt)
        self.idle_workers.add(wt)
    self.dispatcher_loop()

def load_database(self):
    self.report_progress("Loading database...")
    self.db = url_database.ensure_database(self.args)
    cr = self.db.cursor()

    # Cache the status table in memory; it's reasonably small.
    self.report_progress("Loading database... (canon statuses)")
    cr.execute("SELECT id, status FROM canon_statuses;")
    self.canon_statuses = { row[1]: row[0]
                            for row in url_database.fetch_iter(cr) }

    if self.args.work_queue:
        self.todo = open(self.args.work_queue, "rb")
    else:
        self.report_progress("Loading database... (work queue)")
        self.todo = tempfile.TemporaryFile("w+b")
        subprocess.check_call(
            ["sqlite3", self.args.database,
             "SELECT DISTINCT u.url, v.url"
             " FROM urls as u"
             " LEFT JOIN url_strings as v on u.url = v.id"
             " WHERE u.url NOT IN (SELECT url FROM canon_urls)"],
            stdout=self.todo)
        self.todo.seek(0)

def __call__(self, mon, thr):
    datestamp = time.strftime("%Y-%m-%d", time.gmtime())
    db = url_database.ensure_database(self.args)
    sitelist = self.download_sitelist(mon, datestamp)
    self.process_sitelist(mon, db, sitelist, datestamp)

def prepare_database(self):
    # Herdict reports have several more keys than this, but none
    # of them appear to be terribly trustworthy.
    herdict_schema = """\
CREATE TABLE herdict_reports (
  uid        INTEGER PRIMARY KEY,
  timestamp  INTEGER,
  accessible INTEGER, -- (boolean)
  country    TEXT
);
CREATE INDEX herdict_reports__timestamp ON herdict_reports(timestamp);
"""

    db = ensure_database(self.args)
    with db:
        # FIXME: More sophisticated way of detecting presence of our
        # ancillary schema.
        s_tables = frozenset(re.findall("(?m)(?<=^CREATE TABLE )[a-z_]+",
                                        herdict_schema))
        s_indices = frozenset(re.findall("(?m)(?<=^CREATE INDEX )[a-z_]+",
                                         herdict_schema))
        d_tables = frozenset(r[0] for r in db.execute(
            "SELECT name FROM sqlite_master WHERE "
            " type = 'table' AND name LIKE 'herdict_%'"))
        d_indices = frozenset(r[0] for r in db.execute(
            "SELECT name FROM sqlite_master WHERE "
            " type = 'index' AND name LIKE 'herdict_%'"))
        if not d_tables and not d_indices:
            db.executescript(herdict_schema)
            db.commit()
        elif d_tables != s_tables or d_indices != s_indices:
            raise RuntimeError("ancillary schema mismatch - "
                               "migration needed")

        oid = db.execute("SELECT id FROM origins"
                         " WHERE label = 'herdict'").fetchone()
        if oid is None:
            oid = db.execute("INSERT INTO origins"
                             " VALUES(NULL, 'herdict')").lastrowid
        else:
            oid = oid[0]

        # Find the latest date already in the table.  We don't
        # need to process dates before that point.  Note that
        # Herdict's fsd= and fed= parameters are both inclusive, so
        # we need to step to the next day.
        db.execute("ANALYZE")
        start_date = db.execute("SELECT COALESCE(MAX(timestamp), 0) "
                                "FROM herdict_reports").fetchone()[0]
        if start_date == 0:
            start_date = None
        else:
            start_date = ((datetime.date.fromtimestamp(start_date)
                           + datetime.timedelta(days=1))
                          .strftime("%Y-%m-%d"))

        # Herdict raw reports do not have serial numbers, and the API
        # only lets you ask for reports up to a certain _date_, not a
        # date and time.  So, to avoid ever getting duplicate dates
        # upon requerying the API, ask for reports up to and including
        # yesterday (UTC).  Note that datetime.date seems to be
        # unaware that "today (local)" and "today (UTC)" are not the
        # same thing, feh.
        end_date = ((datetime.datetime.utcnow()
                     - datetime.timedelta(days=1))
                    .strftime("%Y-%m-%d"))

    return db, oid, start_date, end_date
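
# Not from any of the sources above: a minimal, hypothetical sketch of the
# interface these call sites appear to assume of url_database.ensure_database().
# Every snippet passes the parsed command-line arguments and gets back an open
# DB-API connection (some callers treat it as sqlite3, others issue
# PostgreSQL-only statements such as SET search_path, so the backend evidently
# varies by tool).  The attribute name "args.database" and the sqlite3 fallback
# below are assumptions for illustration only; the real ensure_database() lives
# in url_database and is not reproduced here.
import sqlite3

def ensure_database_sketch(args):
    """Hypothetical stand-in for url_database.ensure_database(): open the
    database named by args.database, creating it if it does not exist, and
    return the connection."""
    db = sqlite3.connect(args.database)
    # The snippets above rely only on cursors, execute()/executescript(),
    # commit(), and "with db:" transaction scoping, all of which a plain
    # sqlite3.Connection provides.
    return db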