def test_compare_normalized_with_autoclaved(autoclaved_io):
    """Compare freshly normalized YAML measurements with autoclaved ones.

    For each fixture item the YAML bytes are normalized, then checked against
    the autoclaved measurements: same number of measurements, same sequence
    of "measurement_start_time" values, and finally the same value for every
    key of the autoclaved measurement (dict values are compared one level
    deep, key by key).
    """
    for fname, bucket_tstamp, yaml_bytes, autoclaved_msmts in autoclaved_io:
        log.info("Normalizing %s", fname)
        log.info("len autoclaved msmts %d", len(autoclaved_msmts))
        normalized_msmts = list(
            norm.iter_yaml_msmt_normalized(yaml_bytes, bucket_tstamp)
        )
        assert len(normalized_msmts) == len(autoclaved_msmts)

        autoclaved_start_times = [
            msmt["measurement_start_time"] for msmt in autoclaved_msmts
        ]
        normalized_start_times = [
            msmt["measurement_start_time"] for msmt in normalized_msmts
        ]
        assert autoclaved_start_times == normalized_start_times

        # Lengths were asserted equal above, so zip() drops nothing.
        pairs = zip(autoclaved_msmts, normalized_msmts)
        for idx, (autoclaved, normalized) in enumerate(pairs):
            log.info(idx)
            for key, expected in sorted(autoclaved.items()):
                log.info("K %s", key)
                if not isinstance(expected, dict):
                    assert expected == normalized[key]
                    continue

                # Compare nested dicts key by key to pinpoint the mismatch.
                for subkey, subvalue in sorted(expected.items()):
                    log.info("K1 %s->%s", key, subkey)
                    assert subvalue == normalized[key][subkey]
def test_normalize_yaml_dns_consistency_2018(cans):
    """Normalize every entry of the "yaml18" can and check that each
    normalized entry can be serialized by ujson."""
    can = cans["yaml18"]
    path = can.as_posix()
    components = path.split("/")
    day = components[1]
    # drop the leading "testdata/" component and the trailing ".lz4"
    rfn = "/".join(components[1:])[:-4]
    with lz4frame.open(can) as can_file:
        for entry in norm.iter_yaml_msmt_normalized(can_file, day, rfn):
            ujson.dumps(entry)  # ensure it's serializable
Exemple #3
0
def load_multiple(fn, touch=True) -> tuple:
    """Load contents of cans. Decompress tar archives if found.

    This is a generator: measurements are yielded one by one as
    ``(line, None)`` tuples, where ``line`` is one line read from the
    decompressed JSON file. The ``(None, msmt dict)`` form is reserved for
    YAML input, which is not supported yet.

    :param fn: path of the can; its suffix selects how it is decoded
    :param touch: when true, call ``os.utime(fn)`` before reading
    :raises Exception: "Unsupported format: YAML" for ``.yaml.lz4`` files
    :raises RuntimeError: with ``fn`` as argument for any other suffix
    """
    if touch:
        os.utime(fn)  # update access time - used for cache cleanup

    # TODO: handle:
    # RuntimeError: LZ4F_decompress failed with code: ERROR_decompressionFailed
    if fn.endswith(".tar.lz4"):
        # Both handles are closed even if the consumer abandons the generator.
        with lz4frame.open(fn) as f, tarfile.TarFile(fileobj=f) as tf:
            # tf.next() returns None at the end of the tarball
            for m in iter(tf.next, None):
                log.debug("Loading nested %s", m.name)
                if m.name.endswith(".json"):
                    for line in tf.extractfile(m):
                        yield (line, None)

                # FIXME: nested ".yaml" members are skipped: YAML
                # normalization needs a bucket timestamp that is not
                # available here. Other members are ignored as well.

    elif fn.endswith(".json.lz4"):
        with lz4frame.open(fn) as f:
            for line in f:
                yield (line, None)

    elif fn.endswith(".yaml.lz4"):
        # Fail before opening the file: nothing would be read from it anyway.
        raise Exception("Unsupported format: YAML")

    else:
        raise RuntimeError(fn)
def test_normalize_yaml_2016(cans):
    """Normalize the "yaml16" can and compare its first entry with the
    stored expectation.

    Every inspected entry must be serializable by ujson. The entry at index 0
    must equal the content of fastpath/tests/data/yaml16_0.json. Iteration
    stops once the index exceeds 20.
    """
    can = cans["yaml16"]
    path = can.as_posix()
    assert path.startswith("testdata/2016-07-07/20160706T000046Z-GB")
    components = path.split("/")
    day = components[1]
    # drop the leading "testdata/" component and the trailing ".lz4"
    rfn = "/".join(components[1:])[:-4]
    with lz4frame.open(can) as can_file:
        entries = norm.iter_yaml_msmt_normalized(can_file, day, rfn)
        for idx, entry in enumerate(entries):
            ujson.dumps(entry)  # ensure it's serializable
            if idx > 20:
                break
            if idx == 0:
                with open("fastpath/tests/data/yaml16_0.json") as expected_file:
                    expected = ujson.load(expected_file)
                assert entry == expected
def test_normalize_yaml_dns_consistency_2017(cans):
    """Normalize the "yaml17" can and compare its first entry with the
    stored expectation.

    Every inspected entry must be serializable by ujson. The entry at index 0
    must equal the content of fastpath/tests/data/yaml17_0.json. Iteration
    stops once the index exceeds 20.
    """
    can = cans["yaml17"]
    path = can.as_posix()
    components = path.split("/")
    day = components[1]
    # drop the leading "testdata/" component and the trailing ".lz4"
    rfn = "/".join(components[1:])[:-4]
    # s3://ooni-data/autoclaved/jsonl.tar.lz4/2017-12-21/20171220T153044Z-BE-AS5432-dns_consistency-mnKRlHuqk8Eo6XMJt5ZkVQrgReaEXPEWaO9NafgXxSVIhAswTXT7QJc6zhsuttpK-0.1.0-probe.yaml.lz4
    # lz4cat <fn> | head -n1 | jq -S . > fastpath/tests/data/yaml17_0.json
    with lz4frame.open(can) as can_file:
        entries = norm.iter_yaml_msmt_normalized(can_file, day, rfn)
        for idx, entry in enumerate(entries):
            ujson.dumps(entry)  # ensure it's serializable
            if idx > 20:
                break
            if idx == 0:
                with open("fastpath/tests/data/yaml17_0.json") as expected_file:
                    expected = ujson.load(expected_file)
                assert entry == expected
    def _fetch_measurement(self, fn):
        """Fetch one measurement file from a collector using SSH/SFTP.

        The whole file is downloaded into memory with a single blocking call
        and then handed out line by line.

        :param fn: file name, joined onto ``self._archive_dir``
        :yields: ``(line, None)`` for every line of the fetched file

        Exceptions are not propagated: any failure, including the one raised
        for unsupported ".yaml" files, is logged and counted under the
        "unhandled_exception" metric, and the generator simply ends.
        """
        try:
            log.debug("Fetching %s", fn)
            fn = os.path.join(self._archive_dir, fn)
            with io.BytesIO() as data:
                metrics.gauge("fetching", 1)
                t = metrics.timer("fetch").start()
                # Fetch all data in a blocking call
                self.sftp.getfo(fn, data)
                metrics.gauge("fetching", 0)
                t.stop()
                data_len = data.tell()
                data.seek(0)
                metrics.incr("fetched.count")
                metrics.incr("fetched.data", data_len)
                # Fall back to a tiny divisor when the timer reports 0 ms
                metrics.gauge("fetching_bw_KBps",
                              data_len / (t.ms or 0.000_000_001))

                if fn.endswith(".yaml"):
                    # Handled by the except clause below like any other error
                    raise Exception("Unsupported format: YAML")

                # JSON documents, one per line
                for line in data:
                    yield (line, None)

        except Exception as e:
            # Reset the gauge in case the failure happened mid-download
            metrics.gauge("fetching", 0)
            log.exception(e)
            metrics.incr("unhandled_exception")
Exemple #7
0
def load_multiple(fn: str) -> Generator[MsmtTup, None, None]:
    """Load contents of legacy cans and minicans.
    Decompress tar archives if found.

    Yields measurements one by one. Every tuple produced here has the form
        (None, msmt dict, uid)
    The uid is either taken from the filename (minicans) or generated by
    trivial_id (legacy cans).

    :param fn: path of the can; its suffix selects how it is decoded
    :raises RuntimeError: when the filename suffix is not recognised
    """
    # TODO: split this and handle legacy cans and post/minicans independently
    if fn.endswith(".tar.lz4"):
        # Legacy lz4 cans
        # Both handles are closed even if the consumer abandons the generator.
        with lz4frame.open(fn) as f, tarfile.TarFile(fileobj=f) as tf:
            # tf.next() returns None at the end of the tarball
            for m in iter(tf.next, None):
                log.debug("Loading nested %s", m.name)
                k = tf.extractfile(m)
                assert k is not None
                if m.name.endswith(".json"):
                    for line in k:
                        msm = ujson.loads(line)
                        msmt_uid = trivial_id(msm)
                        yield (None, msm, msmt_uid)

                elif m.name.endswith(".yaml"):
                    bucket_tstamp = fn.split("/")[-2]
                    rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
                    for msm in iter_yaml_msmt_normalized(
                            k, bucket_tstamp, rfn):
                        metrics.incr("yaml_normalization")
                        msmt_uid = trivial_id(msm)
                        yield (None, msm, msmt_uid)

    elif fn.endswith(".json.lz4"):
        # Legacy lz4 json files
        with lz4frame.open(fn) as f:
            for line in f:
                msm = ujson.loads(line)
                msmt_uid = trivial_id(msm)
                yield (None, msm, msmt_uid)

    elif fn.endswith(".yaml.lz4"):
        # Legacy lz4 yaml files
        with lz4frame.open(fn) as f:
            bucket_tstamp = fn.split("/")[-2]
            rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
            for msm in iter_yaml_msmt_normalized(f, bucket_tstamp, rfn):
                metrics.incr("yaml_normalization")
                msmt_uid = trivial_id(msm)
                yield (None, msm, msmt_uid)

    elif fn.endswith(".tar.gz"):
        # minican with missing gzipping :(
        with tarfile.open(fn) as tf:
            # tf.next() returns None at the end of the tarball
            for m in iter(tf.next, None):
                log.debug("Loading %s", m.name)
                k = tf.extractfile(m)
                assert k is not None
                if not m.name.endswith(".post"):
                    log.error("Unexpected filename")
                    continue

                # Keep the raw bytes so they can be logged on parse failure;
                # the file object itself cannot be sliced.
                raw = k.read()
                try:
                    j = ujson.loads(raw)
                except Exception:
                    log.error(repr(raw[:100]), exc_info=True)
                    continue

                fmt = j.get("format", "")
                if fmt == "json":
                    msm = j.get("content", {})
                    # extract msmt_uid from filename e.g:
                    # ... /20210614004521.999962_JO_signal_68eb19b439326d60.post
                    msmt_uid = m.name.rsplit("/", 1)[1]
                    msmt_uid = msmt_uid[:-5]  # strip ".post"
                    yield (None, msm, msmt_uid)

                elif fmt == "yaml":
                    log.info("Skipping YAML")

                else:
                    log.info("Ignoring invalid post")

    elif fn.endswith("/index.json.gz"):
        # Index files carry no measurements: nothing to yield
        pass

    else:
        raise RuntimeError(f"Unexpected [mini]can filename '{fn}'")
Exemple #8
0
def load_multiple(fn: str) -> Generator[MsmtTup, None, None]:
    """Load contents of cans. Decompress tar archives if found.
    Yields measurements one by one as:
        (string of JSON, None, None) or (None, msmt dict, None)

    JSON lines from legacy cans are passed through unparsed; YAML
    measurements and minican posts are yielded as dicts.

    :param fn: path of the can; its suffix selects how it is decoded
    :raises RuntimeError: when the filename suffix is not recognised
    """
    # TODO: handle:
    # RuntimeError: LZ4F_decompress failed with code: ERROR_decompressionFailed
    if fn.endswith(".tar.lz4"):
        # Both handles are closed even if the consumer abandons the generator.
        with lz4frame.open(fn) as f, tarfile.TarFile(fileobj=f) as tf:
            # tf.next() returns None at the end of the tarball
            for m in iter(tf.next, None):
                log.debug("Loading nested %s", m.name)
                k = tf.extractfile(m)
                assert k is not None
                if m.name.endswith(".json"):
                    for line in k:
                        yield (line, None, None)

                elif m.name.endswith(".yaml"):
                    bucket_tstamp = fn.split("/")[-2]
                    rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
                    for msm in iter_yaml_msmt_normalized(
                            k, bucket_tstamp, rfn):
                        metrics.incr("yaml_normalization")
                        yield (None, msm, None)

    elif fn.endswith(".json.lz4"):
        with lz4frame.open(fn) as f:
            for line in f:
                yield (line, None, None)

    elif fn.endswith(".yaml.lz4"):
        with lz4frame.open(fn) as f:
            bucket_tstamp = fn.split("/")[-2]
            rfn = f"{bucket_tstamp}/" + fn.split("/")[-1]
            for msm in iter_yaml_msmt_normalized(f, bucket_tstamp, rfn):
                metrics.incr("yaml_normalization")
                yield (None, msm, None)

    elif fn.endswith(".tar.gz"):
        # minican with missing gzipping :(
        with tarfile.open(fn) as tf:
            # tf.next() returns None at the end of the tarball
            for m in iter(tf.next, None):
                log.debug("Loading %s", m.name)
                k = tf.extractfile(m)
                assert k is not None
                if not m.name.endswith(".post"):
                    log.error("Unexpected filename")
                    continue

                # Keep the raw bytes so they can be logged on parse failure;
                # the file object itself cannot be sliced.
                raw = k.read()
                try:
                    j = ujson.loads(raw)
                except Exception:
                    log.error(repr(raw[:100]), exc_info=True)
                    # Skip this post: without this, `j` would be unbound or
                    # still hold the previous member's content.
                    continue

                fmt = j.get("format", "")
                if fmt == "json":
                    msm = j.get("content", {})
                    yield (None, msm, None)

                elif fmt == "yaml":
                    log.info("Skipping YAML")

                else:
                    log.info("Ignoring invalid post")

    elif fn.endswith("/index.json.gz"):
        # Index files carry no measurements: nothing to yield
        pass

    else:
        raise RuntimeError(f"Unexpected [mini]can filename '{fn}'")