Example #1
    def send(self, topic: str, key: KeyType, value):
        kafka_key = key_to_kafka(key)
        max_attempts = 5
        last_exception: Optional[Exception] = None
        for attempt in range(max_attempts):
            try:
                self.producer.produce(
                    topic=topic,
                    key=kafka_key,
                    value=value_to_kafka(value),
                )
            except BufferError as e:
                last_exception = e
                wait = 1 + 3 * attempt

                # pprint_key is expensive
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(
                        "BufferError producing %s %s; waiting for %ss",
                        get_object_type(topic),
                        pprint_key(kafka_key),
                        wait,
                    )
                self.producer.poll(wait)
            else:
                self.deliveries_pending[DeliveryTag(topic, kafka_key)] = key
                return

        # We reach this point if all delivery attempts have failed
        self.delivery_failures.append(
            DeliveryFailureInfo(get_object_type(topic), key,
                                str(last_exception), "SWH_BUFFER_ERROR"))
Example #2
def test_cli_journal_client_origin_visit_status(
    swh_scheduler_cfg, swh_scheduler_cfg_path,
):
    kafka_server = swh_scheduler_cfg["journal"]["brokers"][0]
    swh_scheduler = get_scheduler(**swh_scheduler_cfg["scheduler"])
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test visit-stats producer",
            "acks": "all",
        }
    )
    visit_status = VISIT_STATUSES_1[0]

    value = value_to_kafka(visit_status)
    topic = "swh.journal.objects.origin_visit_status"
    producer.produce(topic=topic, key=b"bogus-origin", value=value)
    producer.flush()

    result = invoke(
        ["journal-client", "--stop-after-objects", "1",], swh_scheduler_cfg_path,
    )

    # Check the output
    expected_output = "Processed 1 message(s).\nDone.\n"
    assert result.exit_code == 0, result.output
    assert result.output == expected_output

    actual_visit_stats = swh_scheduler.origin_visit_stats_get(
        [(visit_status["origin"], visit_status["type"])]
    )

    assert actual_visit_stats
    assert len(actual_visit_stats) == 1
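The `invoke` helper is a test utility not shown here; with click's test runner it might look roughly like this (the `cli` entry point and the `-C` config-file option are assumptions about the swh-scheduler CLI):

from click.testing import CliRunner

def invoke(args, config_path):
    # Hypothetical stand-in for the helper used above: run the scheduler
    # CLI with a config file and return click's Result object.
    from swh.scheduler.cli import cli  # assumed entry point

    return CliRunner().invoke(cli, ["-C", config_path, *args])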
Example #3
def test_client(kafka_prefix: str, kafka_consumer_group: str, kafka_server: str):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    producer.produce(
        topic=kafka_prefix + ".revision",
        key=REV["id"],
        value=value_to_kafka(REV),
    )
    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
    )
    worker_fn = MagicMock()
    client.process(worker_fn)

    worker_fn.assert_called_once_with({"revision": [REV]})
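As the final assertion shows, `JournalClient.process` calls the worker function with a dict mapping each object type to the list of deserialized values consumed. A plain (non-mock) worker function therefore looks like:

def worker_fn(all_objects):
    # all_objects maps an object type (e.g. "revision") to the list of
    # deserialized values consumed since the previous call.
    for object_type, objects in all_objects.items():
        print(f"received {len(objects)} {object_type} objects")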
Example #4
def test_client_subscriptions_with_anonymized_topics(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server_base: str
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server_base,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka with a revision object on both the regular prefix (which
    # normally carries anonymized objects in this setup) and the privileged one
    producer.produce(
        topic=kafka_prefix + ".revision",
        key=REV["id"],
        value=value_to_kafka(REV),
    )
    producer.produce(
        topic=kafka_prefix + "_privileged.revision",
        key=REV["id"],
        value=value_to_kafka(REV),
    )
    producer.flush()

    # without privileged "channels" activated on the client side
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        privileged=False,
    )
    # we only subscribed to "standard" topics
    assert client.subscription == [kafka_prefix + ".revision"]

    # with privileged "channels" activated on the client side
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        privileged=True,
    )
    # we only subscribed to "privileged" topics
    assert client.subscription == [kafka_prefix + "_privileged.revision"]
Example #5
def test_replay(
    swh_storage,
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
):
    kafka_prefix += ".swh.journal.objects"

    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test-producer",
            "acks": "all",
        }
    )

    snapshot = Snapshot(
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.REVISION,
                target=b"\x01" * 20,
            )
        },
    )
    snapshot_dict = snapshot.to_dict()

    producer.produce(
        topic=kafka_prefix + ".snapshot",
        key=key_to_kafka(snapshot.id),
        value=value_to_kafka(snapshot_dict),
    )
    producer.flush()

    logger.debug("Flushed producer")

    result = invoke(
        "replay",
        "--stop-after-objects",
        "1",
        journal_config={
            "brokers": [kafka_server],
            "group_id": kafka_consumer_group,
            "prefix": kafka_prefix,
        },
    )

    expected = r"Done.\n"
    assert result.exit_code == 0, result.output
    assert re.fullmatch(expected, result.output, re.MULTILINE), result.output

    assert swh_storage.snapshot_get(snapshot.id) == {
        **snapshot_dict,
        "next_branch": None,
    }
Example #6
def kafka_producer(kafka_prefix: str, kafka_server_base: str):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server_base,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    producer.produce(
        topic=kafka_prefix + ".something",
        key=key_to_kafka(b"key1"),
        value=value_to_kafka("value1"),
    )
    producer.produce(
        topic=kafka_prefix + ".else",
        key=key_to_kafka(b"key1"),
        value=value_to_kafka("value2"),
    )
    producer.flush()
    return producer
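This helper reads like a pytest fixture whose `@pytest.fixture` decorator was lost in extraction; a test would then request the pre-filled producer by parameter name (a sketch; the test name and assertion are placeholders):

# Assuming the helper above is decorated with @pytest.fixture, a test
# requests the pre-filled producer by parameter name:
def test_prefilled_topics(kafka_producer, kafka_prefix):
    # By this point the fixture has already flushed one message each to the
    # ".something" and ".else" topics under kafka_prefix.
    assert kafka_producer is not None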
Example #7
def test_client_batch_size(
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
    batch_size: int,
):
    num_objects = 2 * batch_size + 1
    assert num_objects < 256, "Too many objects, generation will fail"

    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    contents = [Content.from_data(bytes([i])) for i in range(num_objects)]

    # Fill Kafka
    for content in contents:
        producer.produce(
            topic=kafka_prefix + ".content",
            key=key_to_kafka(content.sha1),
            value=value_to_kafka(content.to_dict()),
        )

    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        batch_size=batch_size,
    )

    collected_output: List[Dict] = []

    def worker_fn(objects):
        received = objects["content"]
        assert len(received) <= batch_size
        collected_output.extend(received)

    client.process(worker_fn)

    expected_output = [content.to_dict() for content in contents]
    assert len(collected_output) == len(expected_output)

    for output in collected_output:
        assert output in expected_output
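`batch_size` is not a standard fixture, so it is presumably supplied by a pytest parametrization along these lines (the concrete values are an assumption):

import pytest

@pytest.mark.parametrize("batch_size", [1, 5, 100])  # assumed values
def test_client_batch_size(
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
    batch_size: int,
):
    ...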
Example #8
def test_client_with_deserializer(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server: str
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    revisions = cast(List[Revision], TEST_OBJECTS["revision"])
    for rev in revisions:
        producer.produce(
            topic=kafka_prefix + ".revision",
            key=rev.id,
            value=value_to_kafka(rev.to_dict()),
        )
    producer.flush()

    def custom_deserializer(object_type, msg):
        assert object_type == "revision"
        obj = kafka_to_value(msg)
        # filter the first revision
        if obj["id"] == revisions[0].id:
            return None
        return Revision.from_dict(obj)

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        value_deserializer=custom_deserializer,
    )
    worker_fn = MagicMock()
    client.process(worker_fn)

    # a commit seems to be needed to prevent a race condition in which
    # worker_fn has not yet been called at this point (the exact cause is unclear)
    client.consumer.commit()

    # Check the first revision has not been passed to worker_fn
    processed_revisions = set(worker_fn.call_args[0][0]["revision"])
    assert revisions[0] not in processed_revisions
    assert all(rev in processed_revisions for rev in revisions[1:])
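Returning None from `value_deserializer` drops the message before it reaches the worker function, which is how the first revision is filtered out above. The same hook supports any predicate; for instance, a hypothetical allow-list filter:

ALLOWED_IDS = {rev.id for rev in revisions[1:]}  # hypothetical allow-list

def allowlist_deserializer(object_type, msg):
    obj = kafka_to_value(msg)
    # Returning None tells JournalClient to discard the message entirely.
    if obj["id"] not in ALLOWED_IDS:
        return None
    return Revision.from_dict(obj)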
Example #9
def test_client_stop_after_objects(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server: str, count: int
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    revisions = cast(List[Revision], TEST_OBJECTS["revision"])
    for rev in revisions:
        producer.produce(
            topic=kafka_prefix + ".revision",
            key=rev.id,
            value=value_to_kafka(rev.to_dict()),
        )
    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=False,
        stop_after_objects=count,
    )

    worker_fn = MagicMock()
    client.process(worker_fn)

    # The code below is not pretty, but it is needed: we have to deal with
    # dicts (so no set), whose values can be lists vs tuples, and we do not
    # know for sure how many calls to worker_fn will happen while the topic
    # is consumed... (a tidier variant is sketched after this example)
    worker_fn.assert_called()
    revs = []  # list of (unique) rev dicts we got from the client
    for call in worker_fn.call_args_list:
        callrevs = call[0][0]["revision"]
        for rev in callrevs:
            assert Revision.from_dict(rev) in revisions
            if rev not in revs:
                revs.append(rev)
    assert len(revs) == count
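A tidier deduplication is possible by keying on the revision id, which is hashable even when other values are lists (a sketch of an equivalent check):

seen_revs = {}
for call in worker_fn.call_args_list:
    for rev in call[0][0]["revision"]:
        seen_revs[rev["id"]] = rev  # rev["id"] is bytes, hence hashable
assert len(seen_revs) == count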
Example #10
def test_client_subscriptions_without_anonymized_topics(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server_base: str
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server_base,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka with revision objects only on the standard prefix
    producer.produce(
        topic=kafka_prefix + ".revision",
        key=REV["id"],
        value=value_to_kafka(REV),
    )
    producer.flush()

    # without privileged channel activated on the client side
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        privileged=False,
    )
    # we only subscribed to the standard prefix
    assert client.subscription == [kafka_prefix + ".revision"]

    # with privileged channel activated on the client side
    client = JournalClient(
        brokers=[kafka_server_base],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        privileged=True,
    )
    # we also subscribed only to the standard prefix, since there is no
    # privileged prefix on the kafka broker
    assert client.subscription == [kafka_prefix + ".revision"]
Example #11
def _check_replay_skipped_content(storage, replayer, topic):
    skipped_contents = _gen_skipped_contents(100)
    nb_sent = len(skipped_contents)
    producer = storage.journal_writer.journal.producer
    prefix = storage.journal_writer.journal._prefix

    for i, obj in enumerate(skipped_contents):
        producer.produce(
            topic=f"{prefix}.{topic}",
            key=key_to_kafka({"sha1": obj["sha1"]}),
            value=value_to_kafka(obj),
        )
    producer.flush()

    dst_storage = get_storage(cls="memory")
    worker_fn = functools.partial(process_replay_objects, storage=dst_storage)
    nb_inserted = replayer.process(worker_fn)

    assert nb_sent == nb_inserted
    for content in skipped_contents:
        assert not storage.content_find({"sha1": content["sha1"]})

    # no skipped_content_find API endpoint, so use this instead
    assert not list(dst_storage.skipped_content_missing(skipped_contents))
Example #12
    def write_addition(self, object_type: str, object_: ValueProtocol) -> None:
        # Check this does not error, to mimic the kafka writer
        object_.unique_key()
        dict_ = self.value_sanitizer(object_type, object_.to_dict())
        self.output.write(value_to_kafka((object_type, dict_)))
Example #13
def test_encode_int(value):
    assert serializers.kafka_to_value(
        serializers.value_to_kafka(value)) == value
Example #14
def test_encode_datetime_bw(value):
    bwdate = {b"swhtype": "datetime", b"d": value.isoformat()}
    assert serializers.kafka_to_value(
        serializers.value_to_kafka(bwdate)) == value
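Examples #13 and #14 exercise the round-trip property of the msgpack-based codecs in swh.journal.serializers; the same check can be run standalone (a sketch, assuming a timezone-aware datetime as the serializers expect):

import datetime

from swh.journal import serializers

value = {
    "id": b"\x00" * 20,
    "date": datetime.datetime.now(tz=datetime.timezone.utc),
}
# value_to_kafka and kafka_to_value are inverses for supported types
assert serializers.kafka_to_value(serializers.value_to_kafka(value)) == value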
Example #15
def test_storage_replay_with_collision(replayer_storage_and_client, caplog):
    """Another replayer scenario with collisions.

    This:
    - writes objects to the topic, including colliding contents
    - replayer consumes objects from the topic and replay them
    - This drops the colliding contents from the replay when detected

    """
    src, replayer = replayer_storage_and_client

    # Fill Kafka using a source storage
    nb_sent = 0
    for object_type, objects in TEST_OBJECTS.items():
        method = getattr(src, object_type + "_add")
        method(objects)
        if object_type == "origin_visit":
            nb_sent += len(
                objects)  # origin-visit-add adds origin-visit-status as well
        nb_sent += len(objects)

    # Create collision in input data
    # These should not be written in the destination
    producer = src.journal_writer.journal.producer
    prefix = src.journal_writer.journal._prefix
    for content in DUPLICATE_CONTENTS:
        topic = f"{prefix}.content"
        key = content.sha1
        now = datetime.datetime.now(tz=UTC)
        content = attr.evolve(content, ctime=now)
        producer.produce(
            topic=topic,
            key=key_to_kafka(key),
            value=value_to_kafka(content.to_dict()),
        )
        nb_sent += 1

    producer.flush()

    caplog.set_level(logging.ERROR, "swh.journal.replay")

    # Fill the destination storage from Kafka
    dst = get_storage(cls="memory")
    worker_fn = functools.partial(process_replay_objects, storage=dst)
    nb_inserted = replayer.process(worker_fn)
    assert nb_sent == nb_inserted

    # check the logs for the collision being properly detected
    nb_collisions = 0
    actual_collision: Dict
    for record in caplog.records:
        logtext = record.getMessage()
        if "Collision detected:" in logtext:
            nb_collisions += 1
            actual_collision = record.args["collision"]

    assert nb_collisions == 1, "1 collision should be detected"

    algo = "sha1"
    assert actual_collision["algo"] == algo
    expected_colliding_hash = hash_to_hex(DUPLICATE_CONTENTS[0].get_hash(algo))
    assert actual_collision["hash"] == expected_colliding_hash

    actual_colliding_hashes = actual_collision["objects"]
    assert len(actual_colliding_hashes) == len(DUPLICATE_CONTENTS)
    for content in DUPLICATE_CONTENTS:
        expected_content_hashes = {
            k: hash_to_hex(v)
            for k, v in content.hashes().items()
        }
        assert expected_content_hashes in actual_colliding_hashes

    # all objects from the src should exist in the dst storage
    assert isinstance(src, InMemoryStorage)  # needed to help mypy
    assert isinstance(dst, InMemoryStorage)  # needed to help mypy
    check_replayed(src, dst, exclude=["contents"])
    # but the dst has one more content (one of the 2 colliding ones)
    assert len(list(src._cql_runner._contents.iter_all())) == (
        len(list(dst._cql_runner._contents.iter_all())) - 1
    )