Example #1
def api_private_country_overview() -> Response:
    """Country-specific overview
    ---
    responses:
      '200':
        description: {
        "first_bucket_date":"2012-12-01",
        "measurement_count":6659891,
        "network_count":333}
    """
    # TODO: add circumvention_tools_blocked, im_apps_blocked,
    # middlebox_detected_networks, websites_confirmed_blocked
    probe_cc = validate_probe_cc_query_param()
    s = """SELECT
        toDate(MIN(measurement_start_time)) AS first_bucket_date,
        COUNT() AS measurement_count,
        COUNT(DISTINCT probe_asn) AS network_count
        FROM fastpath
        WHERE probe_cc = :probe_cc
        AND measurement_start_time > '2012-12-01'
    """
    r = query_click_one_row(sql.text(s), {"probe_cc": probe_cc})
    assert r
    # FIXME: websites_confirmed_blocked
    return cachedjson("1d", **r)
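Every example on this page passes a sqlalchemy sql.text() fragment plus a dict of named parameters to query_click_one_row. A minimal sketch of such a helper on top of clickhouse-driver, assuming the :name placeholders are first compiled to the driver's %(name)s format; the client setup is also an assumption, not the ooni/api implementation:

from typing import Optional
from clickhouse_driver import Client

client = Client(host="localhost")  # assumed connection settings

def query_click_one_row(query: str, query_params: dict) -> Optional[dict]:
    # Execute the query and return the first row as a column->value dict,
    # or None when the result set is empty.
    rows, coltypes = client.execute(query, query_params, with_column_types=True)
    if not rows:
        return None
    colnames = [name for name, _type in coltypes]
    return dict(zip(colnames, rows[0]))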
Example #2
def update_url_priority_click(old: dict, new: dict):
    # The url_priorities table is CollapsingMergeTree
    # Both old and new might be set
    ins_sql = """INSERT INTO url_priorities
        (sign, category_code, cc, domain, url, priority) VALUES
    """
    if old:
        rule = old.copy()
        rule["sign"] = -1
        log.info(f"Deleting prioritization rule {rule}")
        r = insert_click(ins_sql, [rule])
        log.debug(f"Result: {r}")

    if new:
        q = """SELECT count() AS cnt FROM url_priorities FINAL WHERE sign = 1 AND
        category_code = :category_code AND cc = :cc AND domain = :domain
        AND url = :url"""
        cnt = query_click_one_row(sql.text(q), new)
        if cnt and cnt["cnt"] > 0:
            log.info(f"Rejecting duplicate rule {new}")
            raise DuplicateRuleError(err_args=new)

        rule = new.copy()
        rule["sign"] = 1
        log.info(f"Creating prioritization rule {rule}")
        r = insert_click(ins_sql, [rule])
        log.debug(f"Result: {r}")
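Hypothetical usage, to make the CollapsingMergeTree semantics concrete (the rule values are made up):

# Replace an existing wildcard rule with a higher priority
old = {"category_code": "NEWS", "cc": "*", "domain": "*", "url": "*", "priority": 100}
new = dict(old, priority=200)
update_url_priority_click(old, new)
# This inserts {**old, "sign": -1} and {**new, "sign": 1}. ClickHouse
# collapses the -1/+1 pair for `old` during background merges; reads that
# need the collapsed state must use SELECT ... FINAL, as the duplicate
# check above does.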
Example #3
def _get_measurement_meta_clickhouse(report_id: str, input_) -> dict:
    # Given report_id + input, fetch measurement data from fastpath table
    query = "SELECT * FROM fastpath "
    if input_ is None:
        # fastpath uses input = '' for empty values
        query += "WHERE report_id = :report_id AND input = '' "
    else:
        query += """
        LEFT OUTER JOIN citizenlab ON citizenlab.url = fastpath.input
        WHERE fastpath.input = :input
        AND fastpath.report_id = :report_id
        """
    query_params = dict(input=input_, report_id=report_id)
    # Limit the time range where we scan for the measurement to improve
    # performance
    try:
        rid_t = datetime.strptime(report_id[:8], "%Y%m%d")
        query_params["begin"] = rid_t - timedelta(days=30)
        query_params["end"] = rid_t + timedelta(days=30)
        query += """
        AND fastpath.measurement_start_time > :begin
        AND fastpath.measurement_start_time < :end
        """
    except ValueError:
        pass  # legacy report_id format without date
    except Exception as e:
        log.error(e, exc_info=True)

    query += "LIMIT 1"
    msmt_meta = query_click_one_row(sql.text(query), query_params)
    if not msmt_meta:
        return {}  # measurement not found
    if msmt_meta["probe_asn"] == 0:
        # https://ooni.org/post/2020-ooni-probe-asn-incident-report/
        # https://github.com/ooni/explorer/issues/495
        return {}  # unwanted

    keys = (
        "input",
        "measurement_start_time",
        "measurement_uid",
        "report_id",
        "test_name",
        "test_start_time",
        "probe_asn",
        "probe_cc",
        "scores",
    )
    out = {k: msmt_meta[k] for k in keys}
    out["category_code"] = msmt_meta.get("category_code", None)
    out["anomaly"] = msmt_meta["anomaly"] == "t"
    out["confirmed"] = msmt_meta["confirmed"] == "t"
    out["failure"] = msmt_meta["msm_failure"] == "t"
    return out
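The 30-day scan window works because modern report IDs start with a YYYYMMDD timestamp; a quick illustration (the report_id value is made up):

rid = "20210208T162755Z_webconnectivity_IT_30722_n1_0000000000000000"
datetime.strptime(rid[:8], "%Y%m%d")  # -> datetime(2021, 2, 8, 0, 0)
# Legacy report_ids without a leading date raise ValueError and fall
# through to a scan without the time-range filter.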
Example #4
def initialize_url_priorities_if_needed():
    cntq = "SELECT count() AS cnt FROM url_priorities"
    cnt = query_click_one_row(sql.text(cntq), {})
    if cnt["cnt"] > 0:
        return

    rules = [
        ("NEWS", 100),
        ("POLR", 100),
        ("HUMR", 100),
        ("LGBT", 100),
        ("ANON", 100),
        ("MMED", 80),
        ("SRCH", 80),
        ("PUBH", 80),
        ("REL", 60),
        ("XED", 60),
        ("HOST", 60),
        ("ENV", 60),
        ("FILE", 40),
        ("CULTR", 40),
        ("IGO", 40),
        ("GOVT", 40),
        ("DATE", 30),
        ("HATE", 30),
        ("MILX", 30),
        ("PROV", 30),
        ("P**N", 30),
        ("GMB", 30),
        ("ALDR", 30),
        ("GAME", 20),
        ("MISC", 20),
        ("HACK", 20),
        ("ECON", 20),
        ("COMM", 20),
        ("CTRL", 20),
        ("COMT", 100),
        ("GRP", 100),
    ]
    rows = [{
        "sign": 1,
        "category_code": ccode,
        "cc": "*",
        "domain": "*",
        "url": "*",
        "priority": prio,
    } for ccode, prio in rules]
    # The url_priorities table is CollapsingMergeTree
    query = """INSERT INTO url_priorities
        (sign, category_code, cc, domain, url, priority) VALUES
    """
    log.info("Populating url_priorities")
    r = insert_click(query, rows)
    return r
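The early return makes initialization idempotent: the rules are inserted only into an empty table. For illustration, the first dict produced by the comprehension:

# rows[0] == {"sign": 1, "category_code": "NEWS", "cc": "*",
#             "domain": "*", "url": "*", "priority": 100}
# Every bootstrap rule is a sign=1 wildcard row; later deletions insert
# matching sign=-1 rows (see update_url_priority_click above).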
Example #5
def _fetch_jsonl_measurement_body_clickhouse(
        report_id, input: str, measurement_uid) -> Optional[bytes]:
    """Fetch jsonl from S3, decompress it, extract msmt"""
    inp = input or ""  # NULL/None input is stored as ''
    try:
        # FIXME temporary hack to test reprocessing
        query = """SELECT s3path, linenum FROM new_jsonl
            PREWHERE report_id = :report_id AND input = :inp
            LIMIT 1"""
        query_params = dict(inp=inp, report_id=report_id)
        lookup = query_click_one_row(sql.text(query), query_params)
        assert lookup is not None
        log.info("Found in new_jsonl")
        metrics.incr("msmt_found_in_new_jsonl")

    except Exception:
        log.info("Not found in new_jsonl")
        metrics.incr("msmt_not_found_in_new_jsonl")
        query = """SELECT s3path, linenum FROM jsonl
            PREWHERE report_id = :report_id AND input = :inp
            LIMIT 1"""
        query_params = dict(inp=inp, report_id=report_id)
        lookup = query_click_one_row(sql.text(query), query_params)

    if lookup is None:
        m = f"Missing row in jsonl table: {report_id} {input} {measurement_uid}"
        log.error(m)
        metrics.incr("msmt_not_found_in_jsonl")
        return None

    s3path = lookup["s3path"]
    linenum = lookup["linenum"]
    log.debug(f"Fetching file {s3path} from S3")
    try:
        return _fetch_jsonl_measurement_body_inner(s3path, linenum)
    except Exception:  # pragma: no cover
        log.error(f"Failed to fetch file {s3path} from S3")
        return None
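_fetch_jsonl_measurement_body_inner is not shown on this page. A minimal sketch, assuming s3path names a gzip-compressed JSONL object in a public bucket; the bucket name and boto3 setup are assumptions:

import gzip
import boto3

s3 = boto3.client("s3")
S3_BUCKET = "ooni-data-eu-fra"  # assumption

def _fetch_jsonl_measurement_body_inner(s3path: str, linenum: int) -> bytes:
    # Download the object, decompress it, and return the line holding
    # the serialized measurement.
    obj = s3.get_object(Bucket=S3_BUCKET, Key=s3path)
    raw = gzip.decompress(obj["Body"].read())
    return raw.splitlines()[linenum]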
Example #6
File: auth.py Project: ooni/api
def _remove_from_session_expunge(email_address: str) -> None:
    # Used by integ test
    log = current_app.logger
    account_id = hash_email_address(email_address)
    query_params: Dict[str, Any] = dict(account_id=account_id)
    # 'session_expunge' is on RocksDB (ACID key-value database)
    q1 = "SELECT * FROM session_expunge WHERE account_id = :account_id"
    row = query_click_one_row(sql.text(q1), query_params)
    # https://github.com/ClickHouse/ClickHouse/issues/20546
    if row:
        log.info("Resetting expunge in Clickhouse session_expunge")
        query = "INSERT INTO session_expunge (account_id, threshold) VALUES"
        query_params["threshold"] = 0
        insert_click(query, [query_params])
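Hypothetical integration-test usage. Assuming the RocksDB-backed table is keyed on account_id, inserting an existing key overwrites the stored value, so a plain INSERT acts as a reset and no DELETE is needed:

_remove_from_session_expunge("user@example.com")  # made-up address
# If a row exists for hash_email_address("user@example.com"), its
# threshold is overwritten with 0, un-expunging the session.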
Example #7
def api_private_global_overview() -> Response:
    """Provide global summary of measurements
    Sources: fastpath db table
    ---
    responses:
      '200':
        description: JSON struct TODO
    """
    q = """SELECT
        COUNT(DISTINCT(probe_asn)) AS network_count,
        COUNT(DISTINCT probe_cc) AS country_count,
        COUNT(*) AS measurement_count
    FROM fastpath
    """
    r = query_click_one_row(q, {})
    assert r
    return cachedjson("1d", **r)
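The row maps straight into the cached JSON payload; an illustrative response (values made up):

# {"network_count": 21000, "country_count": 241, "measurement_count": 1400000000}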
Example #8
File: auth.py Project: ooni/api
        def wrapper(*args, **kwargs):
            token = request.cookies.get("ooni", "")
            try:
                tok = decode_jwt(token, audience="user_auth")
                del token
                if tok["role"] not in roles:
                    return jerror("Role not authorized", 401)
            except Exception:
                return jerror("Authentication required", 401)

            # check for session expunge
            # TODO: cache query?
            query = """SELECT threshold
                FROM session_expunge
                WHERE account_id = :account_id """
            account_id = tok["account_id"]
            query_params = dict(account_id=account_id)
            row = query_click_one_row(sql.text(query), query_params)
            if row:
                threshold = row["threshold"]
                iat = datetime.utcfromtimestamp(tok["iat"])
                if iat < threshold:
                    return jerror("Authentication token expired", 401)

            # attach account_id to request
            request._account_id = account_id
            # run the HTTP route method
            resp = func(*args, **kwargs)
            # Prevent an authenticated page from being cached and served
            # to unauthorized users
            resp.cache_control.no_cache = True

            token_age = time.time() - tok["iat"]
            if token_age > 600:  # refresh token if needed
                newtoken = _create_session_token(tok["account_id"],
                                                 tok["role"],
                                                 tok["login_time"])
                set_JWT_cookie(resp, newtoken)

            return resp
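This fragment is the inner wrapper of a role-checking decorator; a minimal sketch of the enclosing scaffolding (names and structure are assumptions, not the exact ooni/api code):

from functools import wraps

def role_required(roles):
    # Decorator factory: `roles` is the list that wrapper() above checks
    # tok["role"] against.
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            ...  # body as shown above
        return wrapper
    return decorator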
Example #9
File: auth.py Project: ooni/api
def _get_account_role(account_id: str) -> Optional[str]:
    """Get account role from database, or None"""
    query = "SELECT role FROM accounts WHERE account_id = :account_id"
    query_params = dict(account_id=account_id)
    r = query_click_one_row(sql.text(query), query_params)
    return r["role"] if r else None
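Hypothetical usage, reusing jerror from the wrapper above:

if _get_account_role(account_id) != "admin":  # account_id from the JWT
    return jerror("Role not authorized", 401)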
Example #10
def _clickhouse_aggregation(
    resp_format: str,
    download: bool,
    since,
    until,
    inp: ostr,
    domain: ostr,
    category_code: ostr,
    probe_cc: ostr,
    probe_asn: Optional[int],
    test_name: ostr,
    axis_x: ostr,
    axis_y: ostr,
):
    log = current_app.logger
    dimension_cnt = int(bool(axis_x)) + int(bool(axis_y))
    cacheable = until and until < datetime.now() - timedelta(hours=72)
    cacheable = False  # FIXME

    # Assemble query
    colnames = [
        "anomaly_count",
        "confirmed_count",
        "failure_count",
        "ok_count",
        "measurement_count",
    ]
    cols = [
        sql.text(
            "countIf(anomaly = 't' AND confirmed = 'f' AND msm_failure = 'f') AS anomaly_count"
        ),
        sql.text(
            "countIf(confirmed = 't' AND msm_failure = 'f') AS confirmed_count"
        ),
        sql.text("countIf(msm_failure = 't') AS failure_count"),
        sql.text(
            "countIf(anomaly = 'f' AND confirmed = 'f' AND msm_failure = 'f') AS ok_count"
        ),
        sql.text("COUNT(*) AS measurement_count"),
    ]
    table = sql.table("fastpath")
    where = []
    query_params = {}

    if domain:
        where.append(sql.text("domain = :domain"))
        query_params["domain"] = domain

    if inp:
        where.append(sql.text("input = :input"))
        query_params["input"] = inp

    if category_code:
        where.append(sql.text("citizenlab.category_code = :category_code"))
        query_params["category_code"] = category_code
        if probe_cc:
            where.append(
                sql.text("(citizenlab.cc = :lcc OR citizenlab.cc = 'ZZ')"))
            query_params["lcc"] = probe_cc.lower()
        else:
            where.append(sql.text("citizenlab.cc = 'ZZ'"))

    if probe_cc:
        where.append(sql.text("probe_cc = :probe_cc"))
        query_params["probe_cc"] = probe_cc

    if probe_asn is not None:
        where.append(sql.text("probe_asn = :probe_asn"))
        query_params["probe_asn"] = str(probe_asn)

    if since:
        where.append(sql.text("measurement_start_time > :since"))
        query_params["since"] = since

    if until:
        where.append(sql.text("measurement_start_time <= :until"))
        query_params["until"] = until

    if test_name:
        where.append(sql.text("test_name = :test_name"))
        query_params["test_name"] = test_name

    if axis_x == axis_y and axis_x is not None:
        raise ValueError("Axis X and Y cannot be the same")

    def group_by_date(since, until, cols, colnames, group_by):
        if since and until:
            delta = abs((until - since).days)
        else:
            delta = 0

        # TODO: add a granularity parameter and a check around query weight /
        # response size. Also add support in CSV format.
        granularity = ("day", "toDate")
        tcol, fun = granularity
        tcol = "measurement_start_day"  # TODO: support dynamic axis names
        cols.append(sql.text(f"{fun}(measurement_start_time) AS {tcol}"))
        colnames.append(tcol)
        group_by.append(column(tcol))

    def add_axis(axis, cols, colnames, group_by):
        if axis == "blocking_type":
            t = "JSONExtractString(scores, 'analysis', 'blocking_type') AS blocking_type"
            cols.append(sql.text(t))
        else:
            validate_axis_name(axis)
            cols.append(sql.text(axis))
        colnames.append(axis)
        group_by.append(column(axis))

    group_by: List = []
    if axis_x == "measurement_start_day":
        group_by_date(since, until, cols, colnames, group_by)
    elif axis_x:
        add_axis(axis_x, cols, colnames, group_by)

    if axis_y == "measurement_start_day":
        group_by_date(since, until, cols, colnames, group_by)
    elif axis_y:
        add_axis(axis_y, cols, colnames, group_by)

    # Assemble query
    if category_code or axis_x == "category_code" or axis_y == "category_code":
        # Join in the citizenlab table if we need to filter on category_code
        # or perform group-by on it
        table = table.join(
            sql.table("citizenlab"),
            sql.text("citizenlab.url = fastpath.input"),
        )

    where_expr = and_(*where)
    query = select(cols).where(where_expr).select_from(table)

    # Add group-by
    for g in group_by:
        query = query.group_by(g).order_by(g)

    try:
        if dimension_cnt > 0:
            r: Any = list(query_click(query, query_params))
        else:
            r = query_click_one_row(query, query_params)

        pq = current_app.click.last_query
        msg = f"Stats: {pq.progress.rows} {pq.progress.bytes} {pq.progress.total_rows} {pq.elapsed}"
        log.info(msg)

        if resp_format == "CSV":
            csv_data = _convert_to_csv(r)
            response = make_response(csv_data)
            response.headers["Content-Type"] = "text/csv"
            if download:
                set_dload(response, "ooni-aggregate-data.csv")

        else:
            resp_d = {
                "v": 0,
                "dimension_count": dimension_cnt,
                "result": r,
                "db_stats": {
                    "row_count": pq.progress.rows,
                    "bytes": pq.progress.bytes,
                    "total_row_count": pq.progress.total_rows,
                    "elapsed_seconds": pq.elapsed,
                },
            }
            response = jsonify(resp_d)
            if download:
                set_dload(response, "ooni-aggregate-data.json")

        if cacheable:
            response.cache_control.max_age = 3600 * 24
        return response

    except Exception as e:
        return jsonify({"v": 0, "error": str(e)})
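For orientation, the (assumed) shape of the SQL this assembles for a call with probe_cc set and axis_x="measurement_start_day":

# SELECT countIf(anomaly = 't' AND confirmed = 'f' AND msm_failure = 'f') AS anomaly_count,
#        ...,
#        COUNT(*) AS measurement_count,
#        toDate(measurement_start_time) AS measurement_start_day
# FROM fastpath
# WHERE probe_cc = :probe_cc
#   AND measurement_start_time > :since
#   AND measurement_start_time <= :until
# GROUP BY measurement_start_day ORDER BY measurement_start_day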
Example #11
def api_private_website_test_urls() -> Response:
    """TODO
    ---
    responses:
      '200':
        description: TODO
    """
    limit = int(request.args.get("limit", 10))
    if limit <= 0:
        limit = 10
    offset = int(request.args.get("offset", 0))

    probe_cc = validate_probe_cc_query_param()
    probe_asn = validate_probe_asn_query_param()
    probe_asn = int(probe_asn.replace("AS", ""))

    # Count how many distinct inputs we have in this CC / ASN / period
    s = """
        SELECT COUNT(DISTINCT(input)) as input_count
        FROM fastpath
        WHERE measurement_start_time >= today() - interval '31 day'
        AND measurement_start_time < today()
        AND test_name = 'web_connectivity'
        AND probe_cc = :probe_cc
        AND probe_asn = :probe_asn
    """
    q = query_click_one_row(
        sql.text(s), dict(probe_cc=probe_cc, probe_asn=probe_asn)
    )
    total_count = q["input_count"] if q else 0

    # Group msmts by CC / ASN / period with LIMIT and OFFSET
    s = """SELECT input,
        countIf(anomaly = 't') as anomaly_count,
        countIf(confirmed = 't') as confirmed_count,
        countIf(msm_failure = 't') as failure_count,
        COUNT() AS total_count
        FROM fastpath
        WHERE measurement_start_time >= today() - interval '31 day'
        AND measurement_start_time < today()
        AND test_name = 'web_connectivity'
        AND probe_cc = :probe_cc
        AND probe_asn = :probe_asn
        GROUP BY input
        ORDER BY confirmed_count DESC, total_count DESC,
                anomaly_count DESC, input ASC
        LIMIT :limit
        OFFSET :offset
        """
    d = {
        "probe_cc": probe_cc,
        "probe_asn": probe_asn,
        "limit": limit,
        "offset": offset,
    }
    results = query_click(sql.text(s), d)
    current_page = math.ceil(offset / limit) + 1
    metadata = {
        "offset": offset,
        "limit": limit,
        "current_page": current_page,
        "total_count": total_count,
        "next_url": None,
    }

    # Create next_url
    if len(results) >= limit:
        args = dict(
            limit=limit,
            offset=offset + limit,
            probe_asn=probe_asn,
            probe_cc=probe_cc,
        )
        # TODO: remove BASE_URL?
        next_url = urljoin(
            current_app.config["BASE_URL"],
            "/api/_/website_urls?%s" % urlencode(args),
        )
        metadata["next_url"] = next_url

    return cachedjson("1h", metadata=metadata, results=results)  # TODO caching
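Pagination illustration (the BASE_URL value is assumed):

# With limit=10, offset=0, probe_cc="IT", probe_asn=30722 the metadata
# would carry:
# next_url = "https://api.ooni.io/api/_/website_urls?limit=10&offset=10&probe_asn=30722&probe_cc=IT"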