Example 1
def minicans(test_name, start_date: date, end_date: date, end=None):
    """Fetches minicans from S3 and iterates over measurements.
    Detects broken downloads by comparing cached file size with S3.
    The optional "end" argument stops iteration after that many minicans.
    """
    s3 = s3feeder.create_s3_client()
    day = start_date
    file_cnt = 0
    while day <= end_date:
        tn_filter = {test_name.replace("_", "")}  # S3 paths use test names without underscores
        log.info(day)
        li = s3feeder.list_minicans_on_s3_for_a_day(s3, day, None, tn_filter)
        day += timedelta(days=1)  # advance, otherwise the loop never terminates
        for s3fname, s3size in li:
            # s3fname: raw/20210426/23/YE/ndt/2021042623_YE_ndt.n0.0.tar.gz
            local_file = Path("testdata") / "mini" / s3fname
            in_cache = local_file.is_file() and (local_file.stat().st_size
                                                 == s3size)
            if not in_cache:
                # Download minican
                log.debug("Downloading can %s of size %d KB", s3fname,
                          s3size / 1024)
                local_file.parent.mkdir(parents=True, exist_ok=True)
                with local_file.open("wb") as f:
                    s3.download_fileobj(s3feeder.MC_BUCKET_NAME, s3fname, f)
                assert s3size == local_file.stat().st_size

            log.debug("Loading %s", s3fname)
            for msm_jstr, msm, _ in s3feeder.load_multiple(
                    local_file.as_posix()):
                msm = msm or ujson.loads(msm_jstr)
                yield local_file.as_posix(), msm

            file_cnt += 1
            if end is not None and file_cnt == end:
                return
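A minimal usage sketch for the generator above, assuming the module defining it is importable and logging is configured; the date and test name are illustrative:

from datetime import date

# Iterate measurements from at most two NDT minicans of a single day
for fname, msm in minicans("ndt", date(2021, 4, 26), date(2021, 4, 26), end=2):
    print(fname, msm.get("report_id"))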
Example 2
def list_cans_on_s3_for_a_day(day, name_filter=None, bysize=False):
    """Lists cans on S3 for a given day, optionally filtered by a
    substring of the filename and sorted by size instead of by name.
    """
    s3 = s3feeder.create_s3_client()
    fns = s3feeder.list_cans_on_s3_for_a_day(s3, day)
    if bysize:
        fns = sorted(fns, key=lambda i: i[1])
    else:
        fns = sorted(fns)

    for fn, size in fns:
        size_mb = size / 2**20  # bytes to MB; "/" is float division in Python 3
        if name_filter is None or (name_filter in fn):
            print(f"{fn:<160} {size_mb:.1f} MB")
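A usage sketch for the helper above; the day and the substring are illustrative:

from datetime import date

# Print the telegram cans of one day, smallest first
list_cans_on_s3_for_a_day(date(2019, 8, 29), name_filter="telegram", bysize=True)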
Example 3
def cans():
    """
    Download interesting cans from S3 to a local directory

    Uses credentials from ~/.aws/config in the block:
    [ooni-data-private]
    aws_access_key_id = ...
    aws_secret_access_key = ...

    Explore bucket from CLI:
    AWS_PROFILE=ooni-data-private aws s3 ls s3://ooni-data-private/canned/2019-07-16/
    """
    _cans = dict(
        # "2013-05-05/20130505T065438Z-VN-AS24173-captive_portal-no_report_id-0.1.0-probe.yaml.lz4",
        # "2013-09-12/20130912T150305Z-MD-AS1547-http_requests-no_report_id-0.1.0-probe.yaml.lz4",
        vn=
        "2013-05-05/20130505T103213Z-VN-AS24173-http_requests-no_report_id-0.1.0-probe.yaml.lz4",
        yaml16=
        "2016-07-07/20160706T000046Z-GB-AS9105-http_requests-TYXZLcFg4yUp9Io2LrOMM7CjLk0QcIdsMPiCZtVgkxUrTxnFM0GiMbr8iGDl3OEe-0.1.0-probe.yaml.lz4",
        yaml17=
        "2017-12-21/20171220T153044Z-BE-AS5432-dns_consistency-mnKRlHuqk8Eo6XMJt5ZkVQrgReaEXPEWaO9NafgXxSVIhAswTXT7QJc6zhsuttpK-0.1.0-probe.yaml.lz4",
        yaml18=
        "2018-03-21/20180320T211810Z-NL-AS1103-dns_consistency-yiCRUmXy6MndqnV3g5QYBKGich5OwP9cQQfOiYnxYAfZatgQZlStuWIT30yu586R-0.1.0-probe.yaml.lz4",
        # yaml2014hr1="2014-02-20/http_requests.1.tar.lz4",
        yaml2014dns="2014-02-20/dns_consistency.0.tar.lz4",
        # yaml2014mpt="2014-02-20/multi_protocol_traceroute.0.tar.lz4",
        yaml2014hr0="2014-02-20/http_requests.0.tar.lz4",
        yaml2014hh="2014-02-20/http_host.0.tar.lz4",
        yaml2014hfm="2014-02-20/http_header_field_manipulation.0.tar.lz4",
    )
    # Prepend the local testdata directory to every entry
    _cans = {k: Path("testdata") / v for k, v in _cans.items()}

    to_dload = sorted(f for f in _cans.values() if not f.is_file())
    if not to_dload:
        return _cans

    bname = "ooni-data"
    s3 = create_s3_client()
    for fn in to_dload:
        s3fname = fn.as_posix().replace("testdata", "canned")
        r = s3.list_objects_v2(Bucket=bname, Prefix=s3fname)
        assert r["KeyCount"] == 1, r
        filedesc = r["Contents"][0]
        size = filedesc["Size"]
        print("Downloading can %s size %d MB" % (fn, size / 1024 / 1024))

        os.makedirs(os.path.dirname(fn), exist_ok=True)
        with open(fn, "wb") as f:
            s3.download_fileobj(bname, s3fname, f)
        assert size == os.path.getsize(fn)

    return _cans
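None of the examples define create_s3_client. A minimal sketch of what such a helper could look like, based on the docstring above and on the signed/unsigned split in Example 4 below; the signed parameter is an assumption, not the project's actual interface:

import boto3
from botocore import UNSIGNED
from botocore.config import Config

def create_s3_client(signed=False):
    if signed:
        # Signed client for private buckets, using the [ooni-data-private]
        # profile from ~/.aws/config (see the docstring above)
        return boto3.Session(profile_name="ooni-data-private").client("s3")
    # Unsigned client for anonymous reads from public buckets
    return boto3.client("s3", config=Config(signature_version=UNSIGNED))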
Example 4
def main():
    conf = parse_args()
    format_char = "n"
    collector_id = "L"
    identity = f"{format_char}{collector_id}"
    log.info(f"From bucket {conf.src_bucket} to {conf.dst_bucket}")
    s3sig = create_s3_client(conf)  # signed client for writing
    db_conn = psycopg2.connect(conf.db_uri)
    db.setup(conf)  # setup db conn inside db module
    setup_fingerprints()

    # Fetch msmts for one day

    buf = {}  # "<cc> <testname>" -> jsonlf / fd / jsonl_s3path
    seen_uids = set()  # Avoid uploading duplicates

    # raw/20210601/00/SA/webconnectivity/2021060100_SA_webconnectivity.n0.0.jsonl.gz
    # jsonl_s3path = f"raw/{ts}/00/{cc}/{testname}/{jsonlf.name}"

    s3uns = s3f.create_s3_client()  # unsigned client for reading
    cans_fns = s3f.list_cans_on_s3_for_a_day(s3uns, conf.day)
    cans_fns = sorted(cans_fns)  # this is not enough to sort by time
    tot_size = sum(size for _, size in cans_fns)
    processed_size = 0
    log.info(f"{tot_size / 2**30:.1f} GB to process")
    log.info(f"{len(cans_fns)} cans to process")
    #  TODO make assertions on msmt
    #  TODO add consistency check on trivial id found in fastpath table
    for can in cans_fns:
        can_fn, size = can
        log.info(f"Processed percentage: {100 * processed_size / tot_size:.1f}")
        log.info(f"Opening can {can_fn}")
        Path(can_fn).parent.mkdir(parents=True, exist_ok=True)
        s3uns.download_file(conf.src_bucket, can_fn, can_fn)
        for msm_tup in s3f.load_multiple(can_fn):
            process_measurement(msm_tup, buf, seen_uids, conf, s3sig, db_conn)
        processed_size += size
        Path(can_fn).unlink()

    log.info("Finalizing JSONL files still open")
    for json_entities in buf.values():
        for e in json_entities:
            if e.fd.closed:
                continue
            finalize_jsonl(s3sig, db_conn, conf, e)

    log.info("Exiting")
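The commented-out template near the top of main() documents the JSONL output layout. As a sketch, a hypothetical helper assembling that path could read:

def jsonl_s3path(ts: str, cc: str, testname: str, jsonlf_name: str) -> str:
    # e.g. raw/20210601/00/SA/webconnectivity/2021060100_SA_webconnectivity.n0.0.jsonl.gz
    return f"raw/{ts}/00/{cc}/{testname}/{jsonlf_name}"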
Example 5
def s3msmts(test_name, start_date=date(2018, 1, 1), end_date=date(2019, 11, 4)):
    """Fetches cans from S3 and iterates over measurements.
    Detects broken downloads by comparing cached file size with S3.
    """
    s3 = s3feeder.create_s3_client()
    can_date = start_date
    tpl = "{}/{}.00.tar.lz4" if test_name == "web_connectivity" else "{}/{}.0.tar.lz4"
    while can_date <= end_date:
        # e.g. 2019-10-30/psiphon.0.tar.lz4
        can_fname = tpl.format(can_date.strftime("%Y-%m-%d"), test_name)
        can_date += timedelta(days=1)
        can_local_file = Path("testdata") / can_fname

        s3fname = "canned/" + can_fname
        r = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=s3fname)
        if r["KeyCount"] != 1:
            log.info("Can %s not found. Skipping." % s3fname)
            continue

        s3size = r["Contents"][0]["Size"]
        assert s3size > 0
        ready = can_local_file.is_file() and (can_local_file.stat().st_size
                                              == s3size)
        if not ready:
            # Download can
            log.debug("Downloading can %s of size %d MB", can_fname,
                      s3size / 1024 / 1024)
            can_local_file.parent.mkdir(parents=True, exist_ok=True)
            with can_local_file.open("wb") as f:
                s3.download_fileobj(BUCKET_NAME, s3fname, f)
            assert s3size == can_local_file.stat().st_size

        log.debug("Loading %s", s3fname)
        for msm_jstr, msm, _ in s3feeder.load_multiple(
                can_local_file.as_posix()):
            msm = msm or ujson.loads(msm_jstr)
            if not msm.get("report_id"):
                # Missing or empty report_id
                # https://github.com/ooni/probe-engine/pull/104
                continue
            yield can_fname, msm
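A usage sketch for s3msmts, with an illustrative date range; thanks to the report_id check above, every yielded measurement carries a non-empty report_id:

from datetime import date

for can_fname, msm in s3msmts("telegram", date(2019, 8, 29), date(2019, 8, 29)):
    print(can_fname, msm["report_id"])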
Example 6
def cans():
    """Download interesting cans from S3 to a local directory
    """
    # TODO: move to the more flexible s3msmts where possible
    _cans = dict(
        web_conn_it=
        "2018-05-07/20180501T071932Z-IT-AS198471-web_connectivity-20180506T090836Z_AS198471_gKqEpbg0Ny30ldGCQockbZMJSg9HhFiSizjey5e6JxSEHvzm7j-0.2.0-probe.json.lz4",
        web_conn_cn=
        "2018-05-07/20180506T014008Z-CN-AS4134-web_connectivity-20180506T014010Z_AS4134_ZpxhAVt3iqCjT5bW5CfJspbqUcfO4oZfzDVjCWAu2UuVkibFsv-0.2.0-probe.json.lz4",
        web_conn_30="2019-10-30/web_connectivity.00.tar.lz4",
        telegram="2019-08-29/telegram.0.tar.lz4",
        whatsapp="2019-08-29/whatsapp.0.tar.lz4",
        facebook_messenger="2019-08-29/facebook_messenger.0.tar.lz4",
        facebook_messenger2="2019-10-29/facebook_messenger.0.tar.lz4",
        # telegram="2019-08-29/20190829T105210Z-IR-AS31549-telegram-20190829T105214Z_AS31549_t32ZZ5av3B6yNruRIFhCnuT1dHTnwPk7vwIa9F0TAe064HG4tk-0.2.0-probe.json",
        # fb="2019-06-27/20190627T214121Z-ET-AS24757-facebook_messenger-20190627T214126Z_AS24757_h8g9P5kTmmzyX1VyOjqcVonIbFNujm84l2leMCwC2gX3BI78fI-0.2.0-probe.json",
        hhfm_2019_10_26="2019-10-26/http_header_field_manipulation.0.tar.lz4",
        hhfm_2019_10_27="2019-10-27/http_header_field_manipulation.0.tar.lz4",
        hhfm_2019_10_28="2019-10-28/http_header_field_manipulation.0.tar.lz4",
        hhfm_2019_10_29="2019-10-29/http_header_field_manipulation.0.tar.lz4",
        tor_2018_10_26="2018-10-26/vanilla_tor.0.tar.lz4",
        tor_2019_10_26="2019-10-26/vanilla_tor.0.tar.lz4",
        tor_2019_10_27="2019-10-27/vanilla_tor.0.tar.lz4",
        tor_2019_10_28="2019-10-28/vanilla_tor.0.tar.lz4",
        tor_2019_10_29="2019-10-29/vanilla_tor.0.tar.lz4",
        ndt_2018_10_26="2018-10-26/ndt.0.tar.lz4",
        tcp_connect_2018_10_26="2018-10-26/tcp_connect.0.tar.lz4",
        dash_2019_10_26="2019-10-26/dash.0.tar.lz4",
        dash_2019_10_27="2019-10-27/dash.0.tar.lz4",
        dash_2019_10_28="2019-10-28/dash.0.tar.lz4",
        dash_2019_10_29="2019-10-29/dash.0.tar.lz4",
        meek_2019_10_26="2019-10-26/meek_fronted_requests_test.0.tar.lz4",
        meek_2019_10_27="2019-10-27/meek_fronted_requests_test.0.tar.lz4",
        meek_2019_10_28="2019-10-28/meek_fronted_requests_test.0.tar.lz4",
        meek_2019_10_29="2019-10-29/meek_fronted_requests_test.0.tar.lz4",
        big2858=
        "2019-10-30/20191030T032301Z-BR-AS28573-web_connectivity-20191030T032303Z_AS28573_VzW6UrXrs21YjYWvlk1hyzRqnKlmKNsSntSBGqFCnzFVxVSLQf-0.2.0-probe.json.lz4",
    )
    # Prepend the local testdata directory to every entry
    _cans = {k: Path("testdata") / v for k, v in _cans.items()}

    to_dload = sorted(f for f in _cans.values() if not f.is_file())
    if not to_dload:
        return _cans

    s3 = s3feeder.create_s3_client()

    for fn in to_dload:
        s3fname = fn.as_posix().replace("testdata", "canned")
        r = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=s3fname)
        assert r["KeyCount"] == 1, (fn, r)
        filedesc = r["Contents"][0]
        size = filedesc["Size"]
        print("Downloading can %s size %d MB" % (fn, size / 1024 / 1024))

        os.makedirs(os.path.dirname(fn), exist_ok=True)
        with open(fn, "wb") as f:
            s3.download_fileobj(BUCKET_NAME, s3fname, f)
        assert size == os.path.getsize(fn)

    return _cans
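Typical usage, as a sketch: resolve the local paths once; anything missing is downloaded on the first call.

paths = cans()
print(paths["telegram"])  # testdata/2019-08-29/telegram.0.tar.lz4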