def handle_complete_batch(self, cmd, serial, locale, raw_results):
    with self.db, self.db.cursor() as cr:
        results = []
        finished_urls = set()
        for r in raw_results:
            (url_id, surl) = url_database.add_url_string(cr, r['ourl'])

            redir_url_id = None
            if 'canon' in r:
                redir_url = r['canon']
                if redir_url == surl or redir_url == r['ourl']:
                    redir_url_id = url_id
                elif redir_url is not None:
                    (redir_url_id, _) = \
                        url_database.add_url_string(cr, r['canon'])

            detail_id = self.canon_statuses.get(r['detail'])
            if detail_id is None and r['detail'] is not None:
                cr.execute("INSERT INTO canon_statuses(id, detail) "
                           "  VALUES(DEFAULT, %s)"
                           "  RETURNING id",
                           (r['detail'],))
                detail_id = cr.fetchone()[0]
                self.canon_statuses[r['detail']] = detail_id

            (_, result) = url_database.categorize_result(r['status'],
                                                         url_id,
                                                         redir_url_id)

            results.append({
                "locale": locale,
                "url": url_id,
                "result": result,
                "detail": detail_id,
                "redir_url": redir_url_id,
                "html_content": r.get('content'),
                "screenshot": r.get('render')
            })
            finished_urls.add(url_id)

        cr.executemany("UPDATE captured_urls "
                       "   SET access_time = TIMESTAMP 'now',"
                       "       result = %(result)s,"
                       "       detail = %(detail)s,"
                       "       redir_url = %(redir_url)s,"
                       "       html_content = %(html_content)s,"
                       "       screenshot = %(screenshot)s "
                       " WHERE locale = %(locale)s "
                       "   AND url = %(url)s",
                       results)

        self.processing[locale] -= finished_urls

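# Illustrative sketch (not from the source): the shape of one entry in
# `raw_results` as implied by the keys handle_complete_batch reads.  Only the
# key names are grounded in the code above; the example values are invented.
example_raw_result = {
    "ourl":    "http://example.com/",        # URL as originally submitted
    "canon":   "https://www.example.com/",   # redirect target, may be absent
    "status":  "ok",                         # capture status string
    "detail":  None,                         # detail text, interned via canon_statuses
    "content": "<html>...</html>",           # page HTML, optional
    "render":  b"...",                       # screenshot bytes, optional
}
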
def record_canonized(self, result):
    try:
        self.processed += 1
        cr = self.db.cursor()
        status_id = self.canon_statuses.get(result.status)
        if status_id is None:
            cr.execute("INSERT INTO canon_statuses VALUES(NULL, ?)",
                       (result.status,))
            status_id = cr.lastrowid
            self.canon_statuses[result.status] = status_id

        if result.anomaly is not None:
            cr.execute("INSERT INTO anomalies VALUES(?, ?, ?)",
                       (result.original_uid, status_id, result.anomaly))
            self.anomalies += 1

        if result.canon_url is None:
            canon_id = None
            self.failures += 1
            self.report_result(result, result.status)
        else:
            (canon_id, curl) = \
                url_database.add_url_string(cr, result.canon_url)
            self.successes += 1
            self.report_result(result, curl)

        cr.execute("INSERT INTO canon_urls VALUES (?, ?, ?)",
                   (result.original_uid, canon_id, status_id))

        if self.processed % 1000 == 0:
            self.db.commit()

    except Exception as e:
        raise type(e)("Bogus result: "
                      "{{ status: {!r} canon: {!r} anomaly: {!r} }}"
                      .format(result.status, result.canon_url,
                              result.anomaly)) from e

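# Minimal sketch, assuming a simple record type: record_canonized only touches
# the four attributes accessed above.  The field names come from the code; the
# namedtuple container and the sample values are assumptions for illustration.
import collections

CanonResult = collections.namedtuple(
    "CanonResult", ["original_uid", "status", "canon_url", "anomaly"])

ok_result = CanonResult(original_uid=42, status="ok",
                        canon_url="https://example.com/", anomaly=None)
bad_result = CanonResult(original_uid=43, status="dns error",
                         canon_url=None, anomaly=None)
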
def process_urls(db, rd):
    batch = []
    for row in rd:
        (uid, _) = url_database.add_url_string(db, row['url'])
        batch.append((uid, row['result'], row['locales']))

    with db, db.cursor() as cur:
        batch_str = b",".join(cur.mogrify("(%s,%s,%s)", row)
                              for row in batch)
        cur.execute(b"INSERT INTO urls_rescan (url, result, locales) "
                    b"VALUES " + batch_str)

    db.commit()

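# Usage sketch.  Assumptions: a psycopg2-style connection `db` and a CSV file
# with `url`, `result`, and `locales` columns; neither the filename nor the
# wrapper function below appears in the source.
import csv

def rescan_from_csv(db, path):
    # Feed each CSV row to process_urls, which batches the INSERT itself.
    with open(path, newline="") as fp:
        process_urls(db, csv.DictReader(fp))
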
def load_urls(self, db, fp):
    to_insert = set()
    uname = self.args.user
    sys.stderr.write("Importing {} ({})... /"
                     .format(uname, self.args.file))
    sys.stderr.flush()
    spinner = "/-\\|"
    c = 0
    with db, db.cursor() as cur:
        for entry in json.load(fp):
            try:
                url = url_database.add_url_string(cur, entry['href'])[0]
                atime = entry['time']
                title = entry['description']
                annot = entry['extended']
                tags = entry['tags']
            except Exception as e:
                sys.stderr.write("\nInvalid entry: {}\n"
                                 .format(json.dumps(entry)))
                for l in traceback.format_exception_only(type(e), e):
                    sys.stderr.write(l)
                continue  # skip entries that could not be parsed

            to_insert.add(cur.mogrify("(%s,%s,TIMESTAMP %s,%s,%s,%s)",
                                      (uname, url, atime, title,
                                       annot, tags)))

            sys.stderr.write("\b" + spinner[c % 4])
            sys.stderr.flush()
            c += 1

        sys.stderr.write(" (insert)")
        sys.stderr.flush()
        cur.execute(b"INSERT INTO urls_pinboard"
                    b"(username, url, access_time, title, annotation, tags)"
                    b"VALUES" + b",".join(sorted(to_insert)))

        sys.stderr.write(" (commit)")
        sys.stderr.flush()

    sys.stderr.write("\n")

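# Illustrative sketch: one entry of the Pinboard JSON export as implied by the
# fields load_urls reads.  Key names are taken from the code above; the values
# are invented.
example_pinboard_entry = {
    "href":        "http://example.com/article",
    "time":        "2014-01-01T12:00:00Z",
    "description": "Example article",
    "extended":    "Longer annotation text",
    "tags":        "example reading",
}
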
def record_canonized(self, result):
    try:
        self.processed += 1
        cr = self.db.cursor()
        status_id = self.canon_statuses.get(result.status)
        if status_id is None:
            cr.execute("INSERT INTO canon_statuses VALUES(NULL, ?)",
                       (result.status,))
            status_id = cr.lastrowid
            self.canon_statuses[result.status] = status_id

        if result.anomaly is not None:
            cr.execute("INSERT INTO anomalies VALUES(?, ?, ?)",
                       (result.original_uid, status_id,
                        json.dumps(result.anomaly)))
            self.anomalies += 1

        if result.canon_url is None:
            canon_id = None
            self.failures += 1
            self.report_result(result, result.status)
        else:
            (canon_id, curl) = \
                url_database.add_url_string(cr, result.canon_url)
            self.successes += 1
            self.report_result(result, curl)

        cr.execute("INSERT OR REPLACE INTO canon_urls VALUES (?, ?, ?)",
                   (result.original_uid, canon_id, status_id))

        if self.processed % 1000 == 0:
            self.db.commit()

    except Exception as e:
        self.anomalies += 1
        self.report_result(result, "bogus")
        self.bogus_results.write("{}\n".format(json.dumps({
            "exception": repr(e),
            "canon": result.canon_url,
            "status": result.status,
            "anomaly": result.anomaly
        })))

def load_urls(self, db, fp):
    to_insert = set()
    uname = self.args.user
    sys.stderr.write("Importing {} ({})... /"
                     .format(uname, self.args.file))
    sys.stderr.flush()
    spinner = "/-\\|"
    c = 0
    with db, db.cursor() as cur:
        for entry in json.load(fp):
            try:
                url = url_database.add_url_string(cur, entry['href'])[0]
                atime = entry['time']
                title = entry['description']
                annot = entry['extended']
                tags = entry['tags']
            except Exception as e:
                sys.stderr.write("\nInvalid entry: {}\n"
                                 .format(json.dumps(entry)))
                for l in traceback.format_exception_only(type(e), e):
                    sys.stderr.write(l)
                continue  # skip entries that could not be parsed

            to_insert.add(cur.mogrify("(%s,%s,TIMESTAMP %s,%s,%s,%s)",
                                      (uname, url, atime, title,
                                       annot, tags)))

            sys.stderr.write("\b" + spinner[c % 4])
            sys.stderr.flush()
            c += 1

        sys.stderr.write(" (insert)")
        sys.stderr.flush()
        cur.execute(b"INSERT INTO urls_pinboard"
                    b"(username, url, access_time, title, annotation, tags)"
                    b"VALUES" + b",".join(sorted(to_insert)))

        sys.stderr.write(" (commit)")
        sys.stderr.flush()

    sys.stderr.write("\n")

def process_one_import(self, cur, datestamp, country_code, reader):
    sys.stderr.write("Importing {}...".format(country_code))
    sys.stderr.flush()

    values = []
    for row in reader:
        category_code = row["category_code"]
        uid, url = url_database.add_url_string(cur, row["url"])
        values.append(cur.mogrify("(%s,%s,%s,%s)",
                                  (uid, country_code, category_code,
                                   datestamp)))

    if not values:
        return

    sys.stderr.write(" (insert)")
    sys.stderr.flush()
    cur.execute(
        b"INSERT INTO urls_citizenlab "
        b"(url, country, category, retrieval_date) "
        b"VALUES " + b",".join(values)
    )

    sys.stderr.write(" (commit)")
    sys.stderr.flush()

def process_one_import(self, cur, datestamp, country_code, reader):
    sys.stderr.write("Importing {}...".format(country_code))
    sys.stderr.flush()

    values = []
    for row in reader:
        category_code = row['category_code']
        uid, url = url_database.add_url_string(cur, row['url'])
        values.append(cur.mogrify("(%s,%s,%s,%s)",
                                  (uid, country_code, category_code,
                                   datestamp)))

    if not values:
        return

    sys.stderr.write(" (insert)")
    sys.stderr.flush()
    cur.execute(b"INSERT INTO urls_citizenlab "
                b"(url, country, category, retrieval_date) "
                b"VALUES " + b",".join(values))

    sys.stderr.write(" (commit)")
    sys.stderr.flush()

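# Usage sketch.  Assumptions: the surrounding importer object, the CSV path,
# and today's date as the retrieval datestamp are all hypothetical; only the
# `url` and `category_code` columns are taken from the code above.
import csv
import datetime

def import_citizenlab_csv(importer, cur, path, country_code):
    datestamp = datetime.date.today()
    with open(path, newline="", encoding="utf-8") as fp:
        importer.process_one_import(cur, datestamp, country_code,
                                    csv.DictReader(fp))
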
def __call__(self, mon, thr):
    db, start_date, end_date = self.prepare_database()
    self.db = db
    cur = db.cursor()

    pageq = queue.Queue()
    mon.add_work_thread(HerdictReader(pageq, start_date, end_date))

    n_accessible = 0
    n_inaccessible = 0
    n_total = 0
    lo_timestamp = time.time() + 86400
    hi_timestamp = 0

    while True:
        page = pageq.get()
        if not page:
            break

        batch = []
        for row in page:
            if ("url" not in row or
                    "reportDate" not in row or
                    "reportType" not in row):
                continue

            timestamp = (datetime.datetime
                         .strptime(row["reportDate"],
                                   "%Y-%m-%dT%H:%M:%S %z")
                         .timestamp())
            lo_timestamp = min(timestamp, lo_timestamp)
            hi_timestamp = max(timestamp, hi_timestamp)

            url = row["url"]
            if "/" not in url:
                url = url + "/"
            if "protocol" not in row:
                url = "HTTP://" + url
            else:
                url = row["protocol"] + "://" + url

            # Herdict reports have several more keys than this,
            # but none of them appear to be terribly trustworthy.
            accessible = (row["reportType"] != "INACCESSIBLE")
            if "country" in row and "shortName" in row["country"]:
                country = row["country"]["shortName"]
            else:
                country = "??"

            (uid, url) = url_database.add_url_string(cur, url)
            batch.append((uid, timestamp, accessible, country))

            n_total += 1
            if accessible:
                n_accessible += 1
            else:
                n_inaccessible += 1

            mon.report_status("Processed {} URLs; "
                              "{} accessible, {} inaccessible"
                              .format(n_total, n_accessible,
                                      n_inaccessible))

        mon.report_status("Processed {} URLs; "
                          "{} accessible, {} inaccessible; checkpointing"
                          .format(n_total, n_accessible, n_inaccessible))
        cur.execute(b"INSERT INTO urls_herdict "
                    b"(url, \"timestamp\", accessible, country) VALUES " +
                    b",".join(cur.mogrify("(%s,%s,%s,%s)", row)
                              for row in batch))
        db.commit()
        mon.maybe_pause_or_stop()

    mon.report_status("Flushing duplicates...")
    # The urls_herdict table doesn't have any uniquifier.
    # Flush any duplicate rows that may have occurred.
    cur.execute(
        'DELETE FROM urls_herdict WHERE ctid IN (SELECT ctid FROM ('
        '  SELECT ctid, row_number() OVER ('
        '    PARTITION BY url,"timestamp",accessible,country'
        '    ORDER BY ctid) AS rnum FROM urls_herdict) t'
        '  WHERE t.rnum > 1)')
    db.commit()

    mon.report_status("Adding URLs to be canonicalized...")
    cur.execute("INSERT INTO canon_urls (url) "
                "  SELECT DISTINCT url FROM urls_herdict"
                "  EXCEPT SELECT url FROM canon_urls")
    db.commit()

    self.summary = (lo_timestamp, hi_timestamp,
                    n_total, n_accessible, n_inaccessible)

def __call__(self, mon, thr):
    db, start_date, end_date = self.prepare_database()
    self.db = db
    cur = db.cursor()

    pageq = queue.Queue()
    mon.add_work_thread(HerdictReader(pageq, start_date, end_date))

    n_accessible = 0
    n_inaccessible = 0
    n_total = 0
    lo_timestamp = time.time() + 86400
    hi_timestamp = 0

    while True:
        page = pageq.get()
        if not page:
            break

        batch = []
        for row in page:
            if ("url" not in row or
                    "reportDate" not in row or
                    "reportType" not in row):
                continue

            timestamp = (datetime.datetime
                         .strptime(row["reportDate"],
                                   "%Y-%m-%dT%H:%M:%S %z")
                         .timestamp())
            lo_timestamp = min(timestamp, lo_timestamp)
            hi_timestamp = max(timestamp, hi_timestamp)

            url = row["url"]
            if "/" not in url:
                url = url + "/"
            if "protocol" not in row:
                url = "HTTP://" + url
            else:
                url = row["protocol"] + "://" + url

            # Herdict reports have several more keys than this,
            # but none of them appear to be terribly trustworthy.
            accessible = (row["reportType"] != "INACCESSIBLE")
            if "country" in row and "shortName" in row["country"]:
                country = row["country"]["shortName"]
            else:
                country = "??"

            (uid, url) = url_database.add_url_string(cur, url)
            batch.append((uid, timestamp, accessible, country))

            n_total += 1
            if accessible:
                n_accessible += 1
            else:
                n_inaccessible += 1

            mon.report_status("Processed {} URLs; "
                              "{} accessible, {} inaccessible"
                              .format(n_total, n_accessible,
                                      n_inaccessible))

        mon.report_status("Processed {} URLs; "
                          "{} accessible, {} inaccessible; checkpointing"
                          .format(n_total, n_accessible, n_inaccessible))
        cur.execute(b"INSERT INTO urls_herdict "
                    b"(url, \"timestamp\", accessible, country) VALUES " +
                    b",".join(cur.mogrify("(%s,%s,%s,%s)", row)
                              for row in batch))
        db.commit()
        mon.maybe_pause_or_stop()

    self.summary = (lo_timestamp, hi_timestamp,
                    n_total, n_accessible, n_inaccessible)

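# Illustrative sketch: one Herdict report as implied by the keys the loops
# above inspect.  Only the key names and the strptime format are grounded in
# the code; the values, and the convention that HerdictReader signals
# end-of-input by queueing a falsy page (`if not page: break`), are inferred.
example_herdict_report = {
    "url":        "example.com/blocked/page",
    "protocol":   "http",                       # optional; "HTTP://" assumed if absent
    "reportDate": "2014-06-01T12:34:56 +0000",  # parsed with "%Y-%m-%dT%H:%M:%S %z"
    "reportType": "INACCESSIBLE",               # anything else counts as accessible
    "country":    {"shortName": "TR"},          # optional; "??" used if absent
}
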
def load_urls(self, db, fp):
    to_insert = set()
    sys.stderr.write("Importing {}...".format(self.source_label))
    sys.stderr.flush()

    with db, db.cursor() as cur:
        for line in fp:
            line = line.strip()
            self.lineno += 1
            if line == "" or line[0] == "#":
                continue

            if self._has_scheme.match(line):
                if (not line.startswith("http://") and
                        not line.startswith("https://")):
                    sys.stderr.write("{}:{}: non-HTTP(S) URL: {!r}\n"
                                     .format(self.args.file,
                                             self.lineno, line))
                    self.delayed_failure = True
                    continue

                try:
                    (url_id, _) = url_database.add_url_string(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n"
                                     .format(self.args.file,
                                             self.lineno, str(e)))
                    self.delayed_failure = True
                    continue

                to_insert.add(cur.mogrify("(%s, %s)",
                                          (url_id, self.import_id)))

            else:
                try:
                    urls = url_database.add_site(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n"
                                     .format(self.args.file,
                                             self.lineno, str(e)))
                    self.delayed_failure = True
                    continue

                for pair in urls:
                    to_insert.add(cur.mogrify("(%s, %s)",
                                              (pair[0], self.import_id)))

        if self.delayed_failure:
            raise SystemExit(1)

        sys.stderr.write(" (insert)")
        sys.stderr.flush()
        cur.execute(b"INSERT INTO urls_staticlist "
                    b"(url, listid) VALUES " +
                    b",".join(sorted(to_insert)))

        sys.stderr.write(" (commit)")
        sys.stderr.flush()

    sys.stderr.write("\n")

def load_urls(self, db, fp):
    to_insert = set()
    sys.stderr.write("Importing {}...".format(self.source_label))
    sys.stderr.flush()

    with db, db.cursor() as cur:
        for line in fp:
            line = line.strip()
            self.lineno += 1
            if line == "" or line[0] == "#":
                continue

            if self._has_scheme.match(line):
                if (not line.startswith("http://") and
                        not line.startswith("https://")):
                    sys.stderr.write(
                        "{}:{}: non-HTTP(S) URL: {!r}\n".format(
                            self.args.file, self.lineno, line))
                    self.delayed_failure = True
                    continue

                try:
                    (url_id, _) = url_database.add_url_string(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n".format(
                        self.args.file, self.lineno, str(e)))
                    self.delayed_failure = True
                    continue

                to_insert.add(
                    cur.mogrify("(%s, %s)", (url_id, self.import_id)))

            else:
                try:
                    urls = url_database.add_site(cur, line)
                except Exception as e:
                    sys.stderr.write("{}:{}: {}\n".format(
                        self.args.file, self.lineno, str(e)))
                    self.delayed_failure = True
                    continue

                for pair in urls:
                    to_insert.add(
                        cur.mogrify("(%s, %s)", (pair[0], self.import_id)))

        if self.delayed_failure:
            raise SystemExit(1)

        sys.stderr.write(" (insert)")
        sys.stderr.flush()
        cur.execute(b"INSERT INTO urls_staticlist "
                    b"(url, listid) VALUES " +
                    b",".join(sorted(to_insert)))

        sys.stderr.write(" (commit)")
        sys.stderr.flush()

    sys.stderr.write("\n")

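# The two loaders above rely on a `_has_scheme` class attribute that is not
# shown here.  A plausible definition, judging from how it is used (lines with
# a URL scheme go through add_url_string, bare site names through add_site),
# might look like the following; treat it as an assumption, not the original.
import re

_has_scheme = re.compile(r"^[A-Za-z][A-Za-z0-9+.-]*:")
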
def record_batch(self, loc, successes, failures):
    locale = loc.locale
    loc.n_workers -= 1

    for r in failures:
        loc.in_progress.remove(r[0])

    if not successes:
        return

    with self.db, self.db.cursor() as cr:
        for s in successes:
            url_id = s[0]
            r = s[1]
            loc.in_progress.remove(url_id)

            redir_url = None
            redir_url_id = None
            if r['canon']:
                redir_url = r['canon']
                if redir_url == r['ourl']:
                    redir_url_id = url_id
                elif redir_url is not None:
                    try:
                        (redir_url_id, _) = \
                            url_database.add_url_string(cr, redir_url)
                    except (ValueError, UnicodeError):
                        addendum = "invalid redir url: " + redir_url
                        if 'detail' not in r or r['detail'] is None:
                            r['detail'] = addendum
                        else:
                            r['detail'] += " | " + addendum

            detail_id = self.capture_detail.get(r['detail'])
            if detail_id is None:
                cr.execute("INSERT INTO capture_detail(id, detail) "
                           "  VALUES(DEFAULT, %s)"
                           "  RETURNING id",
                           (r['detail'],))
                detail_id = cr.fetchone()[0]
                self.capture_detail[r['detail']] = detail_id

            result = url_database.categorize_result(r['status'],
                                                    r['detail'],
                                                    url_id,
                                                    redir_url_id)

            to_insert = {
                "locale": locale,
                "url": url_id,
                "result": result,
                "detail": detail_id,
                "redir_url": redir_url_id,
                "log": r['log'],
                "html_content": r['content'],
                "screenshot": r['render']
            }
            cr.execute("INSERT INTO captured_pages"
                       "(locale, url, access_time, result, detail,"
                       " redir_url, capture_log, html_content,"
                       " screenshot) "
                       "VALUES ("
                       "  %(locale)s,"
                       "  %(url)s,"
                       "  TIMESTAMP 'now',"
                       "  %(result)s,"
                       "  %(detail)s,"
                       "  %(redir_url)s,"
                       "  %(log)s,"
                       "  %(html_content)s,"
                       "  %(screenshot)s)",
                       to_insert)
            cr.execute('UPDATE capture_progress SET "l_{0}" = TRUE '
                       '  WHERE url = {1}'.format(locale, url_id))
            loc.todo -= 1

def add_urls_from_site(cur, site, ordinal, oid, already_seen):
    # Subroutine of process_sitelist.
    #
    # Alexa's "site" list has two different kinds of
    # addresses on it: with and without a URL path.
    # Also, most but not all of the sites are second-level
    # domains: any third-level piece (such as "www.") has
    # been stripped.  In no case is there a scheme; in
    # particular we have no idea whether the site prefers
    # http: or https:.  So we expand each entry to four:
    #
    #   http://       site
    #   https://      site
    #   http://  www. site
    #   https:// www. site
    #
    # If there was a path, we include all of the above
    # both with and without the path.  This scheme won't
    # do us any good if the actual content people are
    # loading is neither at the name in the list nor at
    # www. the name in the list; for instance,
    # akamaihd.net is site #68, but neither akamaihd.net
    # nor www.akamaihd.net has any A records, because,
    # being a CDN, all of the actual content is on servers
    # named SOMETHINGELSE.akamaihd.net, and you're not
    # expected to notice that the domain even exists.
    # But there's nothing we can do about that.
    #
    # Because the database schema requires the ordinal+oid to be
    # unique, we shift the ordinal left three bits to make room
    # for a prefix index and an indication of whether or not there
    # was a path component.
    #
    # It does not make sense to prepend 'www.' if 'site' already
    # starts with 'www.' or if it is an IP address.
    parsed = url_database.canon_url_syntax(
        urllib.parse.urlsplit("http://" + site))

    assert parsed.path != ""
    if parsed.path != "/":
        root = to_siteroot(parsed)
        need_path = True
    else:
        root = parsed
        need_path = False

    urls = [(0, root.geturl()),
            (1, to_https(root).geturl())]

    host = root.hostname
    if no_www_re.match(host):
        need_www = False
    else:
        need_www = True
        with_www = add_www(root)
        urls.extend([(2, with_www.geturl()),
                     (3, to_https(with_www).geturl())])

    if need_path:
        urls.extend([(4, parsed.geturl()),
                     (5, to_https(parsed).geturl())])
        if need_www:
            with_www = add_www(parsed)
            urls.extend([(6, with_www.geturl()),
                         (7, to_https(with_www).geturl())])

    ordinal = int(ordinal) * 8

    nnew = 0
    for tag, url in urls:
        (uid, url) = url_database.add_url_string(cur, url)
        if url in already_seen:
            continue
        already_seen.add(url)
        # We want to add an url-table entry for this URL even if it's
        # already there from some other source; we only drop them if
        # they are redundant within this data set.  However, in case
        # the database-loading operation got interrupted midway,
        # do an INSERT OR IGNORE.
        cur.execute("INSERT OR IGNORE INTO urls VALUES(?, ?, ?)",
                    (oid, ordinal + tag, uid))
        nnew += 1

    return nnew

def add_urls_from_site(cur, site, rank, datestamp, batch, already_seen):
    # Subroutine of process_sitelist.
    #
    # Alexa's "site" list has two different kinds of
    # addresses on it: with and without a URL path.
    # Also, most but not all of the sites are second-level
    # domains: any third-level piece (such as "www.") has
    # been stripped.  In no case is there a scheme; in
    # particular we have no idea whether the site prefers
    # http: or https:.  So we expand each entry to four:
    #
    #   http://       site
    #   https://      site
    #   http://  www. site
    #   https:// www. site
    #
    # If there was a path, we include all of the above
    # both with and without the path.  This scheme won't
    # do us any good if the actual content people are
    # loading is neither at the name in the list nor at
    # www. the name in the list; for instance,
    # akamaihd.net is site #68, but neither akamaihd.net
    # nor www.akamaihd.net has any A records, because,
    # being a CDN, all of the actual content is on servers
    # named SOMETHINGELSE.akamaihd.net, and you're not
    # expected to notice that the domain even exists.
    # But there's nothing we can do about that.
    #
    # It does not make sense to prepend 'www.' if 'site' already
    # starts with 'www.' or if it is an IP address.
    parsed = url_database.canon_url_syntax("http://" + site,
                                           want_splitresult=True)

    assert parsed.path != ""
    if parsed.path != "/":
        root = to_siteroot(parsed)
        need_path = True
    else:
        root = parsed
        need_path = False

    urls = [root.geturl(),
            to_https(root).geturl()]

    host = root.hostname
    if no_www_re.match(host):
        need_www = False
    else:
        need_www = True
        with_www = add_www(root)
        urls.extend([with_www.geturl(),
                     to_https(with_www).geturl()])

    if need_path:
        urls.extend([parsed.geturl(),
                     to_https(parsed).geturl()])
        if need_www:
            with_www = add_www(parsed)
            urls.extend([with_www.geturl(),
                         to_https(with_www).geturl()])

    for url in urls:
        (uid, url) = url_database.add_url_string(cur, url)
        if url in already_seen:
            continue
        batch.append((uid, rank, datestamp))
        already_seen.add(url)

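# The two variants above reference helpers that are not shown: to_siteroot,
# to_https, add_www, and no_www_re.  The sketches below are assumptions based
# only on how they are used (all operate on urllib.parse.SplitResult values);
# they are not the original implementations.
import re
import urllib.parse

# Hostnames that should not get a "www." prefix: names that already start
# with "www.", dotted-quad IPv4 addresses, and bracketed IPv6 literals.
no_www_re = re.compile(r"^(?:www\.|\d{1,3}(?:\.\d{1,3}){3}$|\[)",
                       re.IGNORECASE)

def to_siteroot(parsed):
    # Drop path, query, and fragment, keeping only scheme://host/.
    return parsed._replace(path="/", query="", fragment="")

def to_https(parsed):
    # Same URL with the scheme forced to https.
    return parsed._replace(scheme="https")

def add_www(parsed):
    # Prepend "www." to the hostname, preserving any explicit port.
    netloc = "www." + parsed.hostname
    if parsed.port is not None:
        netloc += ":{}".format(parsed.port)
    return parsed._replace(netloc=netloc)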