def api_private_country_overview() -> Response:
    """Country-specific overview
    ---
    responses:
      '200':
        description: {
          "first_bucket_date":"2012-12-01",
          "measurement_count":6659891,
          "network_count":333}
    """
    # TODO: add circumvention_tools_blocked im_apps_blocked
    # middlebox_detected_networks websites_confirmed_blocked
    probe_cc = validate_probe_cc_query_param()

    s = """SELECT
        toDate(MIN(measurement_start_time)) AS first_bucket_date,
        COUNT() AS measurement_count,
        COUNT(DISTINCT probe_asn) AS network_count
    FROM fastpath
    WHERE probe_cc = :probe_cc
    AND measurement_start_time > '2012-12-01'
    """
    r = query_click_one_row(sql.text(s), {"probe_cc": probe_cc})
    assert r
    # FIXME: websites_confirmed_blocked
    return cachedjson("1d", **r)
def update_url_priority_click(old: dict, new: dict):
    # The url_priorities table is CollapsingMergeTree
    # Both old and new might be set
    ins_sql = """INSERT INTO url_priorities
        (sign, category_code, cc, domain, url, priority)
        VALUES
    """
    if old:
        rule = old.copy()
        rule["sign"] = -1
        log.info(f"Deleting prioritization rule {rule}")
        r = insert_click(ins_sql, [rule])
        log.debug(f"Result: {r}")

    if new:
        q = """SELECT count() AS cnt FROM url_priorities FINAL
            WHERE sign = 1 AND category_code = :category_code
            AND cc = :cc AND domain = :domain AND url = :url"""
        cnt = query_click_one_row(sql.text(q), new)
        if cnt and cnt["cnt"] > 0:
            log.info(f"Rejecting duplicate rule {new}")
            raise DuplicateRuleError(err_args=new)

        rule = new.copy()
        rule["sign"] = 1
        log.info(f"Creating prioritization rule {rule}")
        r = insert_click(ins_sql, [rule])
        log.debug(f"Result: {r}")
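# Illustrative sketch (not part of the API): how a priority edit maps onto
# CollapsingMergeTree rows. Replacing a rule inserts a cancel row (sign = -1)
# for the old values plus a state row (sign = 1) for the new ones, and
# ClickHouse collapses the pair at merge time. The rule values below are
# made up for the example.
def _example_update_rule_priority():
    # Replace the global NEWS rule with a country-specific one
    old_rule = dict(category_code="NEWS", cc="*", domain="*", url="*", priority=100)
    new_rule = dict(category_code="NEWS", cc="IT", domain="*", url="*", priority=200)
    update_url_priority_click(old_rule, new_rule)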
def _get_measurement_meta_clickhouse(report_id: str, input_) -> dict:
    # Given report_id + input, fetch measurement data from fastpath table
    query = "SELECT * FROM fastpath "
    if input_ is None:
        # fastpath uses input = '' for empty values
        query += "WHERE report_id = :report_id AND input = '' "
    else:
        query += """
        LEFT OUTER JOIN citizenlab ON citizenlab.url = fastpath.input
        WHERE fastpath.input = :input
        AND fastpath.report_id = :report_id
        """
    query_params = dict(input=input_, report_id=report_id)

    # Limit the time range where we scan for the measurement to improve
    # performance
    try:
        rid_t = datetime.strptime(report_id[:8], "%Y%m%d")
        query_params["begin"] = rid_t - timedelta(days=30)
        query_params["end"] = rid_t + timedelta(days=30)
        query += """
        AND fastpath.measurement_start_time > :begin
        AND fastpath.measurement_start_time < :end
        """
    except ValueError:
        pass  # legacy report_id format without date
    except Exception as e:
        log.error(e, exc_info=True)

    query += "LIMIT 1"
    msmt_meta = query_click_one_row(sql.text(query), query_params)
    if not msmt_meta:
        return {}  # measurement not found
    if msmt_meta["probe_asn"] == 0:
        # https://ooni.org/post/2020-ooni-probe-asn-incident-report/
        # https://github.com/ooni/explorer/issues/495
        return {}  # unwanted

    keys = (
        "input",
        "measurement_start_time",
        "measurement_uid",
        "report_id",
        "test_name",
        "test_start_time",
        "probe_asn",
        "probe_cc",
        "scores",
    )
    out = {k: msmt_meta[k] for k in keys}
    out["category_code"] = msmt_meta.get("category_code", None)
    out["anomaly"] = msmt_meta["anomaly"] == "t"
    out["confirmed"] = msmt_meta["confirmed"] == "t"
    out["failure"] = msmt_meta["msm_failure"] == "t"
    return out
def initialize_url_priorities_if_needed():
    cntq = "SELECT count() AS cnt FROM url_priorities"
    cnt = query_click_one_row(sql.text(cntq), {})
    if cnt["cnt"] > 0:
        return

    rules = [
        ("NEWS", 100),
        ("POLR", 100),
        ("HUMR", 100),
        ("LGBT", 100),
        ("ANON", 100),
        ("MMED", 80),
        ("SRCH", 80),
        ("PUBH", 80),
        ("REL", 60),
        ("XED", 60),
        ("HOST", 60),
        ("ENV", 60),
        ("FILE", 40),
        ("CULTR", 40),
        ("IGO", 40),
        ("GOVT", 40),
        ("DATE", 30),
        ("HATE", 30),
        ("MILX", 30),
        ("PROV", 30),
        ("PORN", 30),
        ("GMB", 30),
        ("ALDR", 30),
        ("GAME", 20),
        ("MISC", 20),
        ("HACK", 20),
        ("ECON", 20),
        ("COMM", 20),
        ("CTRL", 20),
        ("COMT", 100),
        ("GRP", 100),
    ]
    rows = [
        {
            "sign": 1,
            "category_code": ccode,
            "cc": "*",
            "domain": "*",
            "url": "*",
            "priority": prio,
        }
        for ccode, prio in rules
    ]
    # The url_priorities table is CollapsingMergeTree
    query = """INSERT INTO url_priorities
        (sign, category_code, cc, domain, url, priority)
        VALUES
    """
    log.info("Populating url_priorities")
    r = insert_click(query, rows)
    return r
def _fetch_jsonl_measurement_body_clickhouse(
    report_id, input: str, measurement_uid
) -> Optional[bytes]:
    """Fetch jsonl from S3, decompress it, extract msmt"""
    inp = input or ""  # NULL/None input is stored as ''
    try:
        # FIXME temporary hack to test reprocessing
        query = """SELECT s3path, linenum FROM new_jsonl
            PREWHERE report_id = :report_id AND input = :inp
            LIMIT 1"""
        query_params = dict(inp=inp, report_id=report_id)
        lookup = query_click_one_row(sql.text(query), query_params)
        assert lookup is not None
        log.info("Found in new_jsonl")
        metrics.incr("msmt_found_in_new_jsonl")
    except Exception:
        log.info("Not found in new_jsonl")
        metrics.incr("msmt_not_found_in_new_jsonl")
        query = """SELECT s3path, linenum FROM jsonl
            PREWHERE report_id = :report_id AND input = :inp
            LIMIT 1"""
        query_params = dict(inp=inp, report_id=report_id)
        lookup = query_click_one_row(sql.text(query), query_params)

    if lookup is None:
        m = f"Missing row in jsonl table: {report_id} {input} {measurement_uid}"
        log.error(m)
        metrics.incr("msmt_not_found_in_jsonl")
        return None

    s3path = lookup["s3path"]
    linenum = lookup["linenum"]
    log.debug(f"Fetching file {s3path} from S3")
    try:
        return _fetch_jsonl_measurement_body_inner(s3path, linenum)
    except Exception:  # pragma: no cover
        log.error(f"Failed to fetch file {s3path} from S3")
        return None
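# Illustrative sketch of the S3 helper called above, under assumptions not
# taken from this file: the jsonl files are gzip-compressed, boto3 is
# available, and "BUCKET_NAME" is a placeholder for the real bucket.
def _fetch_jsonl_measurement_body_inner_sketch(s3path: str, linenum: int) -> bytes:
    import gzip

    import boto3

    s3 = boto3.client("s3")
    obj = s3.get_object(Bucket="BUCKET_NAME", Key=s3path)
    # Decompress the jsonl file and return the requested line: one raw JSON
    # measurement document per line.
    with gzip.GzipFile(fileobj=obj["Body"]) as f:
        for n, line in enumerate(f):
            if n == linenum:
                return line
    raise ValueError(f"Line {linenum} not found in {s3path}")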
def _remove_from_session_expunge(email_address: str) -> None:
    # Used by integ test
    log = current_app.logger
    account_id = hash_email_address(email_address)
    query_params: Dict[str, Any] = dict(account_id=account_id)
    # 'session_expunge' is on RocksDB (ACID key-value database)
    q1 = "SELECT * FROM session_expunge WHERE account_id = :account_id"
    row = query_click_one_row(sql.text(q1), query_params)
    # https://github.com/ClickHouse/ClickHouse/issues/20546
    if row:
        log.info("Resetting expunge in Clickhouse session_expunge")
        query = "INSERT INTO session_expunge (account_id, threshold) VALUES"
        query_params["threshold"] = 0
        insert_click(query, [query_params])
def api_private_global_overview() -> Response:
    """Provide global summary of measurements
    Sources: global_stats db table
    ---
    responses:
      '200':
        description: JSON struct TODO
    """
    q = """SELECT
        COUNT(DISTINCT(probe_asn)) AS network_count,
        COUNT(DISTINCT probe_cc) AS country_count,
        COUNT(*) AS measurement_count
    FROM fastpath
    """
    r = query_click_one_row(q, {})
    assert r
    return cachedjson("1d", **r)
def wrapper(*args, **kwargs):
    token = request.cookies.get("ooni", "")
    try:
        tok = decode_jwt(token, audience="user_auth")
        del token
        if tok["role"] not in roles:
            return jerror("Role not authorized", 401)
    except Exception:
        return jerror("Authentication required", 401)

    # check for session expunge
    # TODO: cache query?
    query = """SELECT threshold
        FROM session_expunge
        WHERE account_id = :account_id """
    account_id = tok["account_id"]
    query_params = dict(account_id=account_id)
    row = query_click_one_row(sql.text(query), query_params)
    if row:
        threshold = row["threshold"]
        iat = datetime.utcfromtimestamp(tok["iat"])
        log.debug(f"Token iat {iat} expunge threshold {threshold}")
        if iat < threshold:
            return jerror("Authentication token expired", 401)

    # attach account_id to request
    request._account_id = account_id
    # run the HTTP route method
    resp = func(*args, **kwargs)
    # Prevent an authenticated page from being cached and served to
    # unauthorized users
    resp.cache_control.no_cache = True
    token_age = time.time() - tok["iat"]
    if token_age > 600:  # refresh token if needed
        newtoken = _create_session_token(
            tok["account_id"], tok["role"], tok["login_time"]
        )
        set_JWT_cookie(resp, newtoken)

    return resp
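# Illustrative sketch (names are hypothetical): `roles` and `func` above are
# closure variables, so the wrapper is presumably produced by a role-checking
# decorator factory roughly like this one.
from functools import wraps


def role_required_sketch(roles):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            ...  # body as above: decode JWT, check role and expunge, call func
        return wrapper
    return decorator


# Hypothetical usage on a Flask route:
#
#   @app.route("/api/_/example_admin_endpoint")
#   @role_required_sketch(["admin"])
#   def example_admin_endpoint():
#       ...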
def _get_account_role(account_id: str) -> Optional[str]:
    """Get account role from database, or None"""
    query = "SELECT role FROM accounts WHERE account_id = :account_id"
    query_params = dict(account_id=account_id)
    r = query_click_one_row(sql.text(query), query_params)
    return r["role"] if r else None
def _clickhouse_aggregation(
    resp_format: str,
    download: bool,
    since,
    until,
    inp: ostr,
    domain: ostr,
    category_code: ostr,
    probe_cc: ostr,
    probe_asn: Optional[int],
    test_name: ostr,
    axis_x: ostr,
    axis_y: ostr,
):
    log = current_app.logger
    dimension_cnt = int(bool(axis_x)) + int(bool(axis_y))
    cacheable = until and until < datetime.now() - timedelta(hours=72)
    cacheable = False  # FIXME

    # Assemble query
    colnames = [
        "anomaly_count",
        "confirmed_count",
        "failure_count",
        "ok_count",
        "measurement_count",
    ]
    cols = [
        sql.text(
            "countIf(anomaly = 't' AND confirmed = 'f' AND msm_failure = 'f') AS anomaly_count"
        ),
        sql.text("countIf(confirmed = 't' AND msm_failure = 'f') AS confirmed_count"),
        sql.text("countIf(msm_failure = 't') AS failure_count"),
        sql.text(
            "countIf(anomaly = 'f' AND confirmed = 'f' AND msm_failure = 'f') AS ok_count"
        ),
        sql.text("COUNT(*) AS measurement_count"),
    ]
    table = sql.table("fastpath")
    where = []
    query_params = {}

    if domain:
        where.append(sql.text("domain = :domain"))
        query_params["domain"] = domain

    if inp:
        where.append(sql.text("input = :input"))
        query_params["input"] = inp

    if category_code:
        where.append(sql.text("citizenlab.category_code = :category_code"))
        query_params["category_code"] = category_code
        if probe_cc:
            where.append(sql.text("(citizenlab.cc = :lcc OR citizenlab.cc = 'ZZ')"))
            query_params["lcc"] = probe_cc.lower()
        else:
            where.append(sql.text("citizenlab.cc = 'ZZ'"))

    if probe_cc:
        where.append(sql.text("probe_cc = :probe_cc"))
        query_params["probe_cc"] = probe_cc

    if probe_asn is not None:
        where.append(sql.text("probe_asn = :probe_asn"))
        query_params["probe_asn"] = str(probe_asn)

    if since:
        where.append(sql.text("measurement_start_time > :since"))
        query_params["since"] = since

    if until:
        where.append(sql.text("measurement_start_time <= :until"))
        query_params["until"] = until

    if test_name:
        where.append(sql.text("test_name = :test_name"))
        query_params["test_name"] = test_name

    if axis_x == axis_y and axis_x is not None:
        raise ValueError("Axis X and Y cannot be the same")

    def group_by_date(since, until, cols, colnames, group_by):
        if since and until:
            delta = abs((until - since).days)
        else:
            delta = 0
        # TODO: add a granularity parameter and a check around query weight
        # / response size. Also add support in CSV format.
        granularity = ("day", "toDate")
        tcol, fun = granularity
        tcol = "measurement_start_day"  # TODO: support dynamic axis names
        cols.append(sql.text(f"{fun}(measurement_start_time) AS {tcol}"))
        colnames.append(tcol)
        group_by.append(column(tcol))

    def add_axis(axis, cols, colnames, group_by):
        if axis == "blocking_type":
            t = "JSONExtractString(scores, 'analysis', 'blocking_type') AS blocking_type"
            cols.append(sql.text(t))
        else:
            validate_axis_name(axis)
            cols.append(sql.text(axis))
        colnames.append(axis)
        group_by.append(column(axis))

    group_by: List = []
    if axis_x == "measurement_start_day":
        group_by_date(since, until, cols, colnames, group_by)
    elif axis_x:
        add_axis(axis_x, cols, colnames, group_by)

    if axis_y == "measurement_start_day":
        group_by_date(since, until, cols, colnames, group_by)
    elif axis_y:
        add_axis(axis_y, cols, colnames, group_by)

    # Assemble query
    if category_code or axis_x == "category_code" or axis_y == "category_code":
        # Join in the citizenlab table if we need to filter on category_code
        # or perform group-by on it
        table = table.join(
            sql.table("citizenlab"),
            sql.text("citizenlab.url = fastpath.input"),
        )

    where_expr = and_(*where)
    query = select(cols).where(where_expr).select_from(table)

    # Add group-by
    for g in group_by:
        query = query.group_by(g).order_by(g)

    try:
        if dimension_cnt > 0:
            r: Any = list(query_click(query, query_params))
        else:
            r = query_click_one_row(query, query_params)

        pq = current_app.click.last_query
        msg = f"Stats: {pq.progress.rows} {pq.progress.bytes} {pq.progress.total_rows} {pq.elapsed}"
        log.info(msg)

        if resp_format == "CSV":
            csv_data = _convert_to_csv(r)
            response = make_response(csv_data)
            response.headers["Content-Type"] = "text/csv"
            if download:
                set_dload(response, "ooni-aggregate-data.csv")
        else:
            resp_d = {
                "v": 0,
                "dimension_count": dimension_cnt,
                "result": r,
                "db_stats": {
                    "row_count": pq.progress.rows,
                    "bytes": pq.progress.bytes,
                    "total_row_count": pq.progress.total_rows,
                    "elapsed_seconds": pq.elapsed,
                },
            }
            response = jsonify(resp_d)
            if download:
                set_dload(response, "ooni-aggregate-data.json")

        if cacheable:
            response.cache_control.max_age = 3600 * 24
        return response

    except Exception as e:
        return jsonify({"v": 0, "error": str(e)})
def api_private_website_test_urls() -> Response:
    """List URLs tested in a given country / ASN over the last 31 days,
    with anomaly / confirmed / failure counts per URL.
    ---
    responses:
      '200':
        description: URLs with measurement counters and pagination metadata
    """
    limit = int(request.args.get("limit", 10))
    if limit <= 0:
        limit = 10
    offset = int(request.args.get("offset", 0))
    probe_cc = validate_probe_cc_query_param()
    probe_asn = validate_probe_asn_query_param()
    probe_asn = int(probe_asn.replace("AS", ""))

    # Count how many distinct inputs we have in this CC / ASN / period
    s = """
        SELECT COUNT(DISTINCT(input)) AS input_count
        FROM fastpath
        WHERE measurement_start_time >= today() - interval '31 day'
        AND measurement_start_time < today()
        AND test_name = 'web_connectivity'
        AND probe_cc = :probe_cc
        AND probe_asn = :probe_asn
    """
    q = query_click_one_row(sql.text(s), dict(probe_cc=probe_cc, probe_asn=probe_asn))
    total_count = q["input_count"] if q else 0

    # Group msmts by CC / ASN / period with LIMIT and OFFSET
    s = """SELECT input,
        countIf(anomaly = 't') AS anomaly_count,
        countIf(confirmed = 't') AS confirmed_count,
        countIf(msm_failure = 't') AS failure_count,
        COUNT() AS total_count
        FROM fastpath
        WHERE measurement_start_time >= today() - interval '31 day'
        AND measurement_start_time < today()
        AND test_name = 'web_connectivity'
        AND probe_cc = :probe_cc
        AND probe_asn = :probe_asn
        GROUP BY input
        ORDER BY confirmed_count DESC, total_count DESC,
                 anomaly_count DESC, input ASC
        LIMIT :limit
        OFFSET :offset
    """
    d = {
        "probe_cc": probe_cc,
        "probe_asn": probe_asn,
        "limit": limit,
        "offset": offset,
    }
    results = query_click(sql.text(s), d)
    current_page = math.ceil(offset / limit) + 1
    metadata = {
        "offset": offset,
        "limit": limit,
        "current_page": current_page,
        "total_count": total_count,
        "next_url": None,
    }

    # Create next_url
    if len(results) >= limit:
        args = dict(
            limit=limit,
            offset=offset + limit,
            probe_asn=probe_asn,
            probe_cc=probe_cc,
        )
        # TODO: remove BASE_URL?
        next_url = urljoin(
            current_app.config["BASE_URL"],
            "/api/_/website_urls?%s" % urlencode(args),
        )
        metadata["next_url"] = next_url

    return cachedjson("1h", metadata=metadata, results=results)  # TODO caching
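# Example of the response shape assembled above (values and the base URL are
# illustrative only):
#
#   {
#     "metadata": {
#       "offset": 0,
#       "limit": 10,
#       "current_page": 1,
#       "total_count": 123,
#       "next_url": "https://example.org/api/_/website_urls?limit=10&offset=10&probe_asn=12345&probe_cc=IT"
#     },
#     "results": [
#       {"input": "https://example.com/", "anomaly_count": 3,
#        "confirmed_count": 0, "failure_count": 1, "total_count": 42}
#     ]
#   }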