Example #1
    def send(self, topic: str, key: KeyType, value):
        kafka_key = key_to_kafka(key)
        max_attempts = 5
        last_exception: Optional[Exception] = None
        for attempt in range(max_attempts):
            try:
                self.producer.produce(
                    topic=topic,
                    key=kafka_key,
                    value=value_to_kafka(value),
                )
            except BufferError as e:
                last_exception = e
                wait = 1 + 3 * attempt

                if logger.isEnabledFor(logging.DEBUG):
                    # pprint_key is expensive
                    logger.debug(
                        "BufferError producing %s %s; waiting for %ss",
                        get_object_type(topic),
                        pprint_key(kafka_key),
                        wait,
                    )
                self.producer.poll(wait)
            else:
                self.deliveries_pending[DeliveryTag(topic, kafka_key)] = key
                return

        # We reach this point if all delivery attempts have failed
        self.delivery_failures.append(
            DeliveryFailureInfo(
                get_object_type(topic), key, str(last_exception), "SWH_BUFFER_ERROR"
            )
        )
Example #2
def test_key_to_kafka_repeatable():
    """Check the kafka key encoding is repeatable"""
    base_dict = {
        "a": "foo",
        "b": "bar",
        "c": "baz",
    }

    key = serializers.key_to_kafka(base_dict)

    for dict_keys in itertools.permutations(base_dict):
        d = OrderedDict()
        for k in dict_keys:
            d[k] = base_dict[k]

        assert key == serializers.key_to_kafka(d)
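The permutation test passes because key_to_kafka encodes dict keys in a deterministic order. A hedged sketch of one way such an order-insensitive encoding can be built with msgpack (this illustrates the property under test, not necessarily the exact swh.journal implementation):

import msgpack

def deterministic_key_bytes(key) -> bytes:
    packer = msgpack.Packer(use_bin_type=True)
    if isinstance(key, dict):
        # Packing the items in sorted order makes the result independent
        # of the dict's insertion order.
        return b"".join(packer.pack(item) for item in sorted(key.items()))
    return packer.pack(key)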
Example #3
def test_replay(
    swh_storage,
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
):
    kafka_prefix += ".swh.journal.objects"

    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test-producer",
            "acks": "all",
        }
    )

    snapshot = Snapshot(
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.REVISION,
                target=b"\x01" * 20,
            )
        },
    )
    snapshot_dict = snapshot.to_dict()

    producer.produce(
        topic=kafka_prefix + ".snapshot",
        key=key_to_kafka(snapshot.id),
        value=value_to_kafka(snapshot_dict),
    )
    producer.flush()

    logger.debug("Flushed producer")

    result = invoke(
        "replay",
        "--stop-after-objects",
        "1",
        journal_config={
            "brokers": [kafka_server],
            "group_id": kafka_consumer_group,
            "prefix": kafka_prefix,
        },
    )

    expected = r"Done.\n"
    assert result.exit_code == 0, result.output
    assert re.fullmatch(expected, result.output, re.MULTILINE), result.output

    assert swh_storage.snapshot_get(snapshot.id) == {
        **snapshot_dict,
        "next_branch": None,
    }
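Outside of the test, the message produced above can be read back with a plain confluent_kafka Consumer to check the raw key and value bytes. A minimal sketch, assuming kafka_server, kafka_prefix, snapshot and snapshot_dict are still in scope:

from confluent_kafka import Consumer

consumer = Consumer({
    "bootstrap.servers": kafka_server,
    "group.id": "debug-consumer",  # illustrative group id
    "auto.offset.reset": "earliest",
})
consumer.subscribe([kafka_prefix + ".snapshot"])

msg = consumer.poll(timeout=10.0)
if msg is not None and msg.error() is None:
    assert msg.key() == key_to_kafka(snapshot.id)
    assert msg.value() == value_to_kafka(snapshot_dict)
consumer.close()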
Example #4
def kafka_producer(kafka_prefix: str, kafka_server_base: str):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server_base,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    producer.produce(
        topic=kafka_prefix + ".something",
        key=key_to_kafka(b"key1"),
        value=value_to_kafka("value1"),
    )
    producer.produce(
        topic=kafka_prefix + ".else",
        key=key_to_kafka(b"key1"),
        value=value_to_kafka("value2"),
    )
    producer.flush()
    return producer
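The snippets here rely on flush() and assume delivery succeeds; if per-message delivery errors matter, produce() also accepts an on_delivery callback that is invoked from poll()/flush(). A hedged sketch that collects failures, reusing the producer and kafka_prefix from above:

delivery_errors = []

def on_delivery(err, msg):
    # err is a KafkaError if librdkafka or the broker rejected the message.
    if err is not None:
        delivery_errors.append((msg.topic(), err))

producer.produce(
    topic=kafka_prefix + ".something",
    key=key_to_kafka(b"key1"),
    value=value_to_kafka("value1"),
    on_delivery=on_delivery,
)
producer.flush()
assert not delivery_errors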
Example #5
def test_client_batch_size(
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
    batch_size: int,
):
    num_objects = 2 * batch_size + 1
    assert num_objects < 256, "Too many objects, generation will fail"

    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    contents = [Content.from_data(bytes([i])) for i in range(num_objects)]

    # Fill Kafka
    for content in contents:
        producer.produce(
            topic=kafka_prefix + ".content",
            key=key_to_kafka(content.sha1),
            value=value_to_kafka(content.to_dict()),
        )

    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        batch_size=batch_size,
    )

    collected_output: List[Dict] = []

    def worker_fn(objects):
        received = objects["content"]
        assert len(received) <= batch_size
        collected_output.extend(received)

    client.process(worker_fn)

    expected_output = [content.to_dict() for content in contents]
    assert len(collected_output) == len(expected_output)

    for output in collected_output:
        assert output in expected_output
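batch_size here is a test parameter rather than one of the Kafka fixtures; presumably it is supplied by a pytest parametrization along these lines (the values are illustrative):

import pytest

# Illustrative values; the actual parametrization may differ.
@pytest.mark.parametrize("batch_size", [1, 5, 100])
def test_client_batch_size(kafka_prefix, kafka_consumer_group, kafka_server, batch_size):
    ...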
Example #6
def _fill_objstorage_and_kafka(kafka_server, kafka_prefix, objstorage):
    producer = Producer({
        "bootstrap.servers": kafka_server,
        "client.id": "test-producer",
        "acks": "all",
    })

    contents = {}
    for i in range(NUM_CONTENTS):
        content = b"\x00" * 19 + bytes([i])
        sha1 = objstorage.add(content)
        contents[sha1] = content
        producer.produce(
            topic=kafka_prefix + ".content",
            key=key_to_kafka(sha1),
            value=value_to_kafka({
                "sha1": sha1,
                "status": "visible",
            }),
        )

    producer.flush()

    return contents
Example #7
def test_kafka_to_key():
    """Standard back and forth serialization with keys"""
    # All KeyType(s)
    keys: List[serializers.KeyType] = [
        {
            "a": "foo",
            "b": "bar",
            "c": "baz",
        },
        {
            "a": b"foobarbaz",
        },
        b"foo",
    ]
    for object_type, objects in TEST_OBJECTS.items():
        for obj in objects:
            key = obj.unique_key()
            keys.append(key)

    for key in keys:
        ktk = serializers.key_to_kafka(key)
        v = serializers.kafka_to_key(ktk)

        assert v == key
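A companion round trip presumably also holds for values via value_to_kafka/kafka_to_value; a hedged sketch, assuming plain msgpack-representable values:

def test_kafka_to_value_roundtrip():
    values = [
        {"a": "foo", "b": b"bar"},
        ["baz", 42],
        b"raw bytes",
    ]
    for value in values:
        assert serializers.kafka_to_value(serializers.value_to_kafka(value)) == value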
Example #8
def _check_replay_skipped_content(storage, replayer, topic):
    skipped_contents = _gen_skipped_contents(100)
    nb_sent = len(skipped_contents)
    producer = storage.journal_writer.journal.producer
    prefix = storage.journal_writer.journal._prefix

    for i, obj in enumerate(skipped_contents):
        producer.produce(
            topic=f"{prefix}.{topic}",
            key=key_to_kafka({"sha1": obj["sha1"]}),
            value=value_to_kafka(obj),
        )
    producer.flush()

    dst_storage = get_storage(cls="memory")
    worker_fn = functools.partial(process_replay_objects, storage=dst_storage)
    nb_inserted = replayer.process(worker_fn)

    assert nb_sent == nb_inserted
    for content in skipped_contents:
        assert not storage.content_find({"sha1": content["sha1"]})

    # no skipped_content_find API endpoint, so use this instead
    assert not list(dst_storage.skipped_content_missing(skipped_contents))
Example #9
def test_storage_replay_with_collision(replayer_storage_and_client, caplog):
    """Another replayer scenario with collisions.

    This:
    - writes objects to the topic, including colliding contents
    - replayer consumes objects from the topic and replay them
    - This drops the colliding contents from the replay when detected

    """
    src, replayer = replayer_storage_and_client

    # Fill Kafka using a source storage
    nb_sent = 0
    for object_type, objects in TEST_OBJECTS.items():
        method = getattr(src, object_type + "_add")
        method(objects)
        if object_type == "origin_visit":
            nb_sent += len(
                objects)  # origin-visit-add adds origin-visit-status as well
        nb_sent += len(objects)

    # Create collision in input data
    # These should not be written in the destination
    producer = src.journal_writer.journal.producer
    prefix = src.journal_writer.journal._prefix
    for content in DUPLICATE_CONTENTS:
        topic = f"{prefix}.content"
        key = content.sha1
        now = datetime.datetime.now(tz=UTC)
        content = attr.evolve(content, ctime=now)
        producer.produce(
            topic=topic,
            key=key_to_kafka(key),
            value=value_to_kafka(content.to_dict()),
        )
        nb_sent += 1

    producer.flush()

    caplog.set_level(logging.ERROR, "swh.journal.replay")

    # Fill the destination storage from Kafka
    dst = get_storage(cls="memory")
    worker_fn = functools.partial(process_replay_objects, storage=dst)
    nb_inserted = replayer.process(worker_fn)
    assert nb_sent == nb_inserted

    # check the logs for the collision being properly detected
    nb_collisions = 0
    actual_collision: Dict
    for record in caplog.records:
        logtext = record.getMessage()
        if "Collision detected:" in logtext:
            nb_collisions += 1
            actual_collision = record.args["collision"]

    assert nb_collisions == 1, "1 collision should be detected"

    algo = "sha1"
    assert actual_collision["algo"] == algo
    expected_colliding_hash = hash_to_hex(DUPLICATE_CONTENTS[0].get_hash(algo))
    assert actual_collision["hash"] == expected_colliding_hash

    actual_colliding_hashes = actual_collision["objects"]
    assert len(actual_colliding_hashes) == len(DUPLICATE_CONTENTS)
    for content in DUPLICATE_CONTENTS:
        expected_content_hashes = {
            k: hash_to_hex(v) for k, v in content.hashes().items()
        }
        assert expected_content_hashes in actual_colliding_hashes

    # all objects from the src should exist in the dst storage
    assert isinstance(src, InMemoryStorage)  # needed to help mypy
    assert isinstance(dst, InMemoryStorage)  # needed to help mypy
    check_replayed(src, dst, exclude=["contents"])
    # but the dst has one content more (one of the 2 colliding ones)
    assert len(list(src._cql_runner._contents.iter_all())) == (
        len(list(dst._cql_runner._contents.iter_all())) - 1
    )