def test_replay_content(kafka_server, kafka_prefix, kafka_consumer_group):
    objstorage1 = get_objstorage(cls="memory")
    objstorage2 = get_objstorage(cls="memory")

    writer = get_journal_writer(
        cls="kafka",
        brokers=[kafka_server],
        client_id="kafka_writer",
        prefix=kafka_prefix,
        anonymize=False,
    )

    for content in CONTENTS:
        objstorage1.add(content.data)
        writer.write_addition("content", content)

    replayer = JournalClient(
        brokers=kafka_server,
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        # stop_after_objects=len(objects),
    )

    worker_fn = functools.partial(process_replay_objects_content,
                                  src=objstorage1,
                                  dst=objstorage2)
    replayer.process(worker_fn)
    # only contents with status "visible" are copied to objstorage2
    expected_objstorage_state = {
        c.sha1: c.data
        for c in CONTENTS if c.status == "visible"
    }

    assert expected_objstorage_state == objstorage2.state
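The `CONTENTS` constant used above is defined elsewhere in the original test module and is not shown here. A plausible stand-in, following the `Content.from_data` pattern used in Example #7 below (the exact payloads and statuses are assumptions), would be:

from swh.model.model import Content

# Hypothetical replacement for the CONTENTS fixture used in Example #1:
# a mix of "visible" and "hidden" contents, so that only the visible ones
# end up in objstorage2.
CONTENTS = [
    Content.from_data(f"content-{i}".encode(),
                      status="hidden" if i % 2 else "visible")
    for i in range(4)
]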
Example #2
def __init__(self, journal_writer):
    if journal_writer:
        if get_journal_writer is None:
            raise EnvironmentError(
                "You need the swh.journal package to use the "
                "journal_writer feature")
        self.journal = get_journal_writer(
            value_sanitizer=model_object_dict_sanitizer, **journal_writer)
    else:
        self.journal = None
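The `if get_journal_writer is None` check above only makes sense when the import of `get_journal_writer` is optional. A minimal sketch of that optional-import guard, as one would typically write it at module level (the exact surrounding module is an assumption), looks like this:

try:
    from swh.journal.writer import get_journal_writer
except ImportError:
    # swh.journal is an optional dependency; the journal_writer feature is
    # simply disabled when it is not installed.
    get_journal_writer = None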
Example #3
def test_stream_journal_writer_filename(tmp_path):
    out_fname = str(tmp_path / "journal.msgpack")

    writer = get_journal_writer(
        cls="stream",
        value_sanitizer=model_object_dict_sanitizer,
        output_stream=out_fname,
    )
    expected = fill_writer(writer)

    with open(out_fname, "rb") as outs:
        unpacker = kafka_stream_to_value(outs)
        for i, (objtype, objd) in enumerate(unpacker, start=1):
            assert (objtype, objd) in expected
        assert len(expected) == i
Example #4
def test_stream_journal_writer_stream():
    outs = io.BytesIO()

    writer = get_journal_writer(
        cls="stream",
        value_sanitizer=model_object_dict_sanitizer,
        output_stream=outs,
    )
    expected = fill_writer(writer)

    outs.seek(0, 0)
    unpacker = kafka_stream_to_value(outs)
    for i, (objtype, objd) in enumerate(unpacker, start=1):
        assert (objtype, objd) in expected
    assert len(expected) == i
Example #5
def test_stream_journal_writer_stdout(capfdbinary):
    writer = get_journal_writer(
        cls="stream",
        value_sanitizer=model_object_dict_sanitizer,
        output_stream="-",
    )
    expected = fill_writer(writer)

    captured = capfdbinary.readouterr()
    assert captured.err == b""
    outs = io.BytesIO(captured.out)

    unpacker = kafka_stream_to_value(outs)
    for i, (objtype, objd) in enumerate(unpacker, start=1):
        assert (objtype, objd) in expected
    assert len(expected) == i
Example #6
def __init__(self, tool_getter: Callable[[int], Dict[str, Any]],
             journal_writer):
    """
    Args:
        tool_getter: a callable that takes a tool_id and returns a dict
                     representing a tool object
        journal_writer: configuration passed to
                        `swh.journal.writer.get_journal_writer`
    """
    self._tool_getter = tool_getter
    if journal_writer:
        if get_journal_writer is None:
            raise EnvironmentError(
                "You need the swh.journal package to use the "
                "journal_writer feature")
        self.journal = get_journal_writer(
            **journal_writer,
            value_sanitizer=lambda object_type, value_dict: value_dict,
        )
    else:
        self.journal = None
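The `journal_writer` argument above is a plain configuration dict forwarded to `swh.journal.writer.get_journal_writer`. Based on the keyword arguments appearing in the other examples, two plausible configurations (broker address and topic prefix are placeholders) are:

# In-memory journal writer, convenient for unit tests.
journal_writer = {"cls": "memory"}

# Kafka-backed journal writer, mirroring the kwargs used in Examples #1 and #7.
journal_writer = {
    "cls": "kafka",
    "brokers": ["kafka.example.org:9092"],  # placeholder broker address
    "prefix": "swh.journal.objects",        # placeholder topic prefix
    "client_id": "my_writer",
    "anonymize": False,
}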
Example #7
def test_replay_statsd(kafka_server, kafka_prefix, kafka_consumer_group,
                       statsd):
    objstorage1 = get_objstorage(cls="memory")
    objstorage2 = get_objstorage(cls="memory")

    writer = get_journal_writer(
        cls="kafka",
        brokers=[kafka_server],
        client_id="kafka_writer",
        prefix=kafka_prefix,
        anonymize=False,
    )

    # Fill the source objstorage with a bunch of content objects. In the end,
    # there should be 2 content objects for each possible replaying decision
    # (i.e. skipped, excluded, in_dst, not_in_src, failed and copied):
    # contents[0:2] are properly copied
    # contents[2:4] are excluded
    # contents[4:6] are in dst
    # contents[6:8] are hidden
    contents = [
        Content.from_data(f"foo{i}".encode(),
                          status="hidden" if 6 <= i < 8 else "visible")
        for i in range(8)
    ]

    for content in contents:
        objstorage1.add(content.data)
        writer.write_addition("content", content)
    excluded = [c.sha1 for c in contents[2:4]]

    def exclude_fn(cnt_d):
        return cnt_d["sha1"] in excluded

    for content in contents[4:6]:
        objstorage2.add(content.data)

    replayer = JournalClient(
        brokers=kafka_server,
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        # stop_after_objects=len(objects),
    )

    worker_fn = functools.partial(
        process_replay_objects_content,
        src=objstorage1,
        dst=objstorage2,
        exclude_fn=exclude_fn,
    )
    replayer.process(worker_fn)

    # Replayed objects can arrive in any order, so the statsd reports are not
    # sorted according to contents; instead we just count the expected number
    # of occurrences of each statsd message.
    prefix = "swh_content_replayer"
    expected_reports = {
        # 4 because 2 for the copied objects + 2 for the in_dst ones
        f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:obj_in_objstorage$":
        4,
        f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:get_object$":
        2,
        f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:put_object$":
        2,
        f"^{prefix}_duration_seconds:[0-9]+[.][0-9]+[|]ms[|]#request:get$":
        2,
        f"^{prefix}_duration_seconds:[0-9]+[.][0-9]+[|]ms[|]#request:put$":
        2,
        f"^{prefix}_bytes:4[|]c$":
        2,
    }
    decisions = ("copied", "skipped", "excluded", "in_dst", "not_in_src",
                 "failed")
    decision_re = (
        "^swh_content_replayer_operations_total:1[|]c[|]#decision:(?P<decision>"
        + "|".join(decisions) + ")(?P<extras>,.+)?$")

    operations = dict.fromkeys(decisions, 0)
    reports = dict.fromkeys(expected_reports, 0)

    for report in (r.decode() for r in statsd.socket.payloads):
        m = re.match(decision_re, report)
        if m:
            operations[m.group("decision")] += 1
        else:
            for expected in expected_reports:
                m = re.match(expected, report)
                if m:
                    reports[expected] += 1

    assert reports == expected_reports

    assert operations["skipped"] == 2
    assert operations["excluded"] == 2
    assert operations["in_dst"] == 2
    assert operations["copied"] == 2
    # TODO:
    assert operations["not_in_src"] == 0
    assert operations["failed"] == 0
Example #8
def test_cli_journal_client(
    cli_runner,
    swh_config,
    indexer_scheduler,
    kafka_prefix: str,
    kafka_server,
    consumer: Consumer,
):
    """Test the 'swh indexer journal-client' cli tool."""
    journal_writer = get_journal_writer(
        "kafka",
        brokers=[kafka_server],
        prefix=kafka_prefix,
        client_id="test producer",
        value_sanitizer=lambda object_type, value: value,
        flush_timeout=3,  # fail early if something goes wrong
    )

    visit_statuses = [
        OriginVisitStatus(
            origin="file:///dev/zero",
            visit=1,
            date=now(),
            status="full",
            snapshot=None,
        ),
        OriginVisitStatus(
            origin="file:///dev/foobar",
            visit=2,
            date=now(),
            status="full",
            snapshot=None,
        ),
        OriginVisitStatus(
            origin="file:///tmp/spamegg",
            visit=3,
            date=now(),
            status="full",
            snapshot=None,
        ),
        OriginVisitStatus(
            origin="file:///dev/0002",
            visit=6,
            date=now(),
            status="full",
            snapshot=None,
        ),
        OriginVisitStatus(  # will be filtered out due to its 'partial' status
            origin="file:///dev/0000",
            visit=4,
            date=now(),
            status="partial",
            snapshot=None,
        ),
        OriginVisitStatus(  # will be filtered out due to its 'ongoing' status
            origin="file:///dev/0001",
            visit=5,
            date=now(),
            status="ongoing",
            snapshot=None,
        ),
    ]

    journal_writer.write_additions("origin_visit_status", visit_statuses)
    visit_statuses_full = [vs for vs in visit_statuses if vs.status == "full"]

    result = cli_runner.invoke(
        indexer_cli_group,
        [
            "-C",
            swh_config,
            "journal-client",
            "--broker",
            kafka_server,
            "--prefix",
            kafka_prefix,
            "--group-id",
            "test-consumer",
            "--stop-after-objects",
            str(len(visit_statuses)),
            "--origin-metadata-task-type",
            "index-origin-metadata",
        ],
        catch_exceptions=False,
    )

    # Check the output
    expected_output = "Done.\n"
    assert result.exit_code == 0, result.output
    assert result.output == expected_output

    # Check scheduled tasks
    tasks = indexer_scheduler.search_tasks(task_type="index-origin-metadata")

    # The origins may be split across multiple tasks, but there cannot be more
    # tasks than origin-visit-statuses written to the journal
    assert len(tasks) <= len(visit_statuses_full)

    actual_origins = []
    for task in tasks:
        actual_task = dict(task)
        assert actual_task["type"] == "index-origin-metadata"
        scheduled_origins = actual_task["arguments"]["args"][0]
        actual_origins.extend(scheduled_origins)

    assert set(actual_origins) == {vs.origin for vs in visit_statuses_full}