Code example #1
    def __init__(self, test_context):
        super(CopycatDistributedFileTest, self).__init__(test_context, num_zk=1, num_brokers=1, topics={
            'test' : { 'partitions': 1, 'replication-factor': 1 }
        })

        self.source = CopycatDistributedService(test_context, 2, self.kafka, [self.INPUT_FILE])
        self.sink = CopycatDistributedService(test_context, 2, self.kafka, [self.OUTPUT_FILE])
Code example #2
    def __init__(self, test_context):
        super(CopycatDistributedFileTest, self).__init__(
            test_context,
            num_zk=1,
            num_brokers=1,
            topics={'test': {
                'partitions': 1,
                'replication-factor': 1
            }})

        # FIXME these should have multiple nodes. However, currently the connectors are submitted via command line,
        # which means we would get duplicates. Both would run, but they would have conflicting keys for offsets and
        # configs. Until we have real distributed coordination of workers with unified connector submission, we need
        # to restrict each of these to a single node.
        self.num_nodes = 1
        self.source = CopycatDistributedService(test_context, self.num_nodes,
                                                self.kafka, [self.INPUT_FILE])
        self.sink = CopycatDistributedService(test_context, self.num_nodes,
                                              self.kafka, [self.OUTPUT_FILE])
Code example #3
class CopycatDistributedFileTest(KafkaTest):
    """
    Simple test of Copycat in distributed mode, producing data from files on one Copycat cluster and consuming it on
    another, validating that the total output is identical to the input.
    """

    INPUT_FILE = "/mnt/copycat.input"
    OUTPUT_FILE = "/mnt/copycat.output"

    TOPIC = "test"
    OFFSETS_TOPIC = "copycat-offsets"
    CONFIG_TOPIC = "copycat-configs"

    # Since tasks can be assigned to any node and we're testing with files, we need to make sure the content is the same
    # across all nodes.
    FIRST_INPUT_LIST = ["foo", "bar", "baz"]
    FIRST_INPUTS = "\n".join(FIRST_INPUT_LIST) + "\n"
    SECOND_INPUT_LIST = ["razz", "ma", "tazz"]
    SECOND_INPUTS = "\n".join(SECOND_INPUT_LIST) + "\n"

    SCHEMA = {"type": "string", "optional": False}

    def __init__(self, test_context):
        super(CopycatDistributedFileTest, self).__init__(
            test_context,
            num_zk=1,
            num_brokers=1,
            topics={'test': {
                'partitions': 1,
                'replication-factor': 1
            }})

        self.cc = CopycatDistributedService(
            test_context, 2, self.kafka, [self.INPUT_FILE, self.OUTPUT_FILE])

    def test_file_source_and_sink(
            self,
            converter="org.apache.kafka.copycat.json.JsonConverter",
            schemas=True):
        assert converter is not None, "converter type must be set"
        # Template parameters
        self.key_converter = converter
        self.value_converter = converter
        self.schemas = schemas

        self.cc.set_configs(lambda node: self.render(
            "copycat-distributed.properties", node=node))

        self.cc.start()

        self.logger.info("Creating connectors")
        for connector_props in [
                self.render("copycat-file-source.properties"),
                self.render("copycat-file-sink.properties")
        ]:
            connector_config = dict([
                line.strip().split('=', 1)
                for line in connector_props.split('\n')
                if line.strip() and not line.strip().startswith('#')
            ])
            self.cc.create_connector(connector_config)

        # Generating data on the source node should generate new records and create new output on the sink node. Timeouts
        # here need to be more generous than they are for standalone mode because a) it takes longer to write configs,
        # rebalance the group, etc., and b) without explicit leave-group support, rebalancing takes a while
        for node in self.cc.nodes:
            node.account.ssh("echo -e -n " + repr(self.FIRST_INPUTS) + " >> " +
                             self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT_LIST),
            timeout_sec=120,
            err_msg="Data added to input file was not seen in the output file in a reasonable amount of time."
        )

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.cc.restart()

        for node in self.cc.nodes:
            node.account.ssh("echo -e -n " + repr(self.SECOND_INPUTS) +
                             " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST),
            timeout_sec=120,
            err_msg="Sink output file never converged to the same state as the input file"
        )

    def validate_output(self, input):
        input_set = set(input)
        # Output needs to be collected from all nodes because we can't be sure where the tasks will be scheduled.
        # Between the first and second rounds, we might even end up with half the data on each node.
        output_set = set(
            itertools.chain(*[[
                line.strip()
                for line in self.file_contents(node, self.OUTPUT_FILE)
            ] for node in self.cc.nodes]))
        return input_set == output_set

    def file_contents(self, node, file):
        try:
            # Convert to a list here or the CalledProcessError may be returned during a call to the generator instead of
            # immediately
            return list(node.account.ssh_capture("cat " + file))
        except subprocess.CalledProcessError:
            return []
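
The inline dict comprehension in test_file_source_and_sink above turns a rendered .properties template into the config dict passed to create_connector (the same logic appears as _config_dict_from_props in example #5 below). A standalone sketch of that parsing step, using hypothetical property values rather than the actual templates:

def config_dict_from_props(connector_props):
    # Keep non-blank, non-comment lines and split each on the first '=' only,
    # so values that themselves contain '=' are preserved.
    return dict(
        line.strip().split('=', 1)
        for line in connector_props.split('\n')
        if line.strip() and not line.strip().startswith('#')
    )

# Example with hypothetical values:
props = "name=local-file-source\n# a comment\nfile=/mnt/copycat.input\ntopic=test\n"
assert config_dict_from_props(props) == {
    'name': 'local-file-source',
    'file': '/mnt/copycat.input',
    'topic': 'test',
}
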
Code example #4
class CopycatDistributedFileTest(KafkaTest):
    """
    Simple test of Copycat in distributed mode, producing data from files on one Copycat cluster and consuming it on
    another, validating that the total output is identical to the input.
    """

    INPUT_FILE = "/mnt/copycat.input"
    OUTPUT_FILE = "/mnt/copycat.output"

    TOPIC = "test"
    OFFSETS_TOPIC = "copycat-offsets"

    FIRST_INPUT_LISTS = [["foo", "bar", "baz"], ["foo2", "bar2", "baz2"]]
    FIRST_INPUTS = ["\n".join(input_list) + "\n" for input_list in FIRST_INPUT_LISTS]
    SECOND_INPUT_LISTS = [["razz", "ma", "tazz"], ["razz2", "ma2", "tazz2"]]
    SECOND_INPUTS = ["\n".join(input_list) + "\n" for input_list in SECOND_INPUT_LISTS]

    SCHEMA = { "type": "string", "optional": False }

    def __init__(self, test_context):
        super(CopycatDistributedFileTest, self).__init__(test_context, num_zk=1, num_brokers=1, topics={
            'test' : { 'partitions': 1, 'replication-factor': 1 }
        })

        self.source = CopycatDistributedService(test_context, 2, self.kafka, [self.INPUT_FILE])
        self.sink = CopycatDistributedService(test_context, 2, self.kafka, [self.OUTPUT_FILE])

    def test_file_source_and_sink(self, converter="org.apache.kafka.copycat.json.JsonConverter", schemas=True):
        assert converter is not None, "converter type must be set"
        # Template parameters
        self.key_converter = converter
        self.value_converter = converter
        self.schemas = schemas

        # The worker and connector configs need to be set before starting the services
        self.source.set_configs(self.render("copycat-distributed.properties"), self.render("copycat-file-source.properties"))
        self.sink.set_configs(self.render("copycat-distributed.properties"), self.render("copycat-file-sink.properties"))

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        for node, input in zip(self.source.nodes, self.FIRST_INPUTS):
            node.account.ssh("echo -e -n " + repr(input) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT_LISTS), timeout_sec=60, err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        for node, input in zip(self.source.nodes, self.SECOND_INPUTS):
            node.account.ssh("echo -e -n " + repr(input) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT_LISTS + self.SECOND_INPUT_LISTS), timeout_sec=60, err_msg="Sink output file never converged to the same state as the input file")

    def validate_output(self, inputs):
        try:
            input_set = set(itertools.chain(*inputs))
            output_set = set(itertools.chain(*[
                [line.strip() for line in node.account.ssh_capture("cat " + self.OUTPUT_FILE)] for node in self.sink.nodes
            ]))
            return input_set == output_set
        except subprocess.CalledProcessError:
            return False
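
Both versions of the test poll with ducktape's wait_until until the sink output converges to the input set. As a rough, minimal stand-in for that polling pattern (not ducktape's actual implementation), a helper with the same shape could look like:

import time

def wait_until(condition, timeout_sec, backoff_sec=0.5, err_msg=""):
    # Re-evaluate the condition until it returns truthy or the timeout elapses.
    end = time.time() + timeout_sec
    while time.time() < end:
        if condition():
            return
        time.sleep(backoff_sec)
    raise RuntimeError(err_msg)
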
Code example #5
class CopycatRestApiTest(KafkaTest):
    """
    Test of Copycat's REST API endpoints.
    """

    INPUT_FILE = "/mnt/copycat.input"
    INPUT_FILE2 = "/mnt/copycat.input2"
    OUTPUT_FILE = "/mnt/copycat.output"

    TOPIC = "test"
    OFFSETS_TOPIC = "copycat-offsets"
    CONFIG_TOPIC = "copycat-configs"

    # Since tasks can be assigned to any node and we're testing with files, we need to make sure the content is the same
    # across all nodes.
    INPUT_LIST = ["foo", "bar", "baz"]
    INPUTS = "\n".join(INPUT_LIST) + "\n"
    LONGER_INPUT_LIST = ["foo", "bar", "baz", "razz", "ma", "tazz"]
    LONGER_INPUTS = "\n".join(LONGER_INPUT_LIST) + "\n"

    SCHEMA = { "type": "string", "optional": False }

    def __init__(self, test_context):
        super(CopycatRestApiTest, self).__init__(test_context, num_zk=1, num_brokers=1, topics={
            'test' : { 'partitions': 1, 'replication-factor': 1 }
        })

        self.cc = CopycatDistributedService(test_context, 2, self.kafka, [self.INPUT_FILE, self.INPUT_FILE2, self.OUTPUT_FILE])

    def test_rest_api(self):
        # Template parameters
        self.key_converter = "org.apache.kafka.copycat.json.JsonConverter"
        self.value_converter = "org.apache.kafka.copycat.json.JsonConverter"
        self.schemas = True

        self.cc.set_configs(lambda node: self.render("copycat-distributed.properties", node=node))

        self.cc.start()

        assert self.cc.list_connectors() == []

        self.logger.info("Creating connectors")
        source_connector_props = self.render("copycat-file-source.properties")
        sink_connector_props = self.render("copycat-file-sink.properties")
        for connector_props in [source_connector_props, sink_connector_props]:
            connector_config = self._config_dict_from_props(connector_props)
            self.cc.create_connector(connector_config)

        # We should see the connectors appear
        wait_until(lambda: set(self.cc.list_connectors()) == set(["local-file-source", "local-file-sink"]),
                   timeout_sec=10, err_msg="Connectors that were just created did not appear in connector listing")

        # We'll only do very simple validation that the connectors and tasks really ran.
        for node in self.cc.nodes:
            node.account.ssh("echo -e -n " + repr(self.INPUTS) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.INPUT_LIST), timeout_sec=120, err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")


        # Trying to create the same connector again should cause an error
        try:
            self.cc.create_connector(self._config_dict_from_props(source_connector_props))
            assert False, "creating the same connector should have caused a conflict"
        except CopycatRestError:
            pass # expected

        # Validate that we can get info about connectors
        expected_source_info = {
            'name': 'local-file-source',
            'config': self._config_dict_from_props(source_connector_props),
            'tasks': [{ 'connector': 'local-file-source', 'task': 0 }]
        }
        source_info = self.cc.get_connector("local-file-source")
        assert expected_source_info == source_info, "Incorrect info: " + json.dumps(source_info)
        source_config = self.cc.get_connector_config("local-file-source")
        assert expected_source_info['config'] == source_config, "Incorrect config: " + json.dumps(source_config)
        expected_sink_info = {
            'name': 'local-file-sink',
            'config': self._config_dict_from_props(sink_connector_props),
            'tasks': [{ 'connector': 'local-file-sink', 'task': 0 }]
        }
        sink_info = self.cc.get_connector("local-file-sink")
        assert expected_sink_info == sink_info, "Incorrect info: " + json.dumps(sink_info)
        sink_config = self.cc.get_connector_config("local-file-sink")
        assert expected_sink_info['config'] == sink_config, "Incorrect config: " + json.dumps(sink_config)


        # Validate that we can get info about tasks. This info should definitely be available now without waiting since
        # we've already seen data appear in files.
        # TODO: It would be nice to validate a complete listing, but that doesn't make sense for the file connectors
        expected_source_task_info = [{
            'id': { 'connector': 'local-file-source', 'task': 0 },
            'config': {
                'task.class': 'org.apache.kafka.copycat.file.FileStreamSourceTask',
                'file': self.INPUT_FILE,
                'topic': self.TOPIC
            }
        }]
        source_task_info = self.cc.get_connector_tasks("local-file-source")
        assert expected_source_task_info == source_task_info, "Incorrect info: " + json.dumps(source_task_info)
        expected_sink_task_info = [{
            'id': { 'connector': 'local-file-sink', 'task': 0 },
            'config': {
                'task.class': 'org.apache.kafka.copycat.file.FileStreamSinkTask',
                'file': self.OUTPUT_FILE,
                'topics': self.TOPIC
            }
        }]
        sink_task_info = self.cc.get_connector_tasks("local-file-sink")
        assert expected_sink_task_info == sink_task_info, "Incorrect info: " + json.dumps(sink_task_info)

        file_source_config = self._config_dict_from_props(source_connector_props)
        file_source_config['file'] = self.INPUT_FILE2
        self.cc.set_connector_config("local-file-source", file_source_config)

        # We should also be able to verify that the modified configs caused the tasks to move to the new file and pick up
        # more data.
        for node in self.cc.nodes:
            node.account.ssh("echo -e -n " + repr(self.LONER_INPUTS) + " >> " + self.INPUT_FILE2)
        wait_until(lambda: self.validate_output(self.LONGER_INPUT_LIST), timeout_sec=120, err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        self.cc.delete_connector("local-file-source")
        self.cc.delete_connector("local-file-sink")
        wait_until(lambda: len(self.cc.list_connectors()) == 0, timeout_sec=10, err_msg="Deleted connectors did not disappear from REST listing")

    def validate_output(self, input):
        input_set = set(input)
        # Output needs to be collected from all nodes because we can't be sure where the tasks will be scheduled.
        output_set = set(itertools.chain(*[
            [line.strip() for line in self.file_contents(node, self.OUTPUT_FILE)] for node in self.cc.nodes
            ]))
        return input_set == output_set


    def file_contents(self, node, file):
        try:
            # Convert to a list here or the CalledProcessError may be returned during a call to the generator instead of
            # immediately
            return list(node.account.ssh_capture("cat " + file))
        except subprocess.CalledProcessError:
            return []

    def _config_dict_from_props(self, connector_props):
        return dict([line.strip().split('=', 1) for line in connector_props.split('\n') if line.strip() and not line.strip().startswith('#')])
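
The comment in file_contents points at a generator pitfall: if ssh_capture returns its output lazily, a failing cat only raises once the lines are consumed, which would happen inside validate_output's set comprehension rather than inside this try/except. A self-contained illustration of that behavior with a toy capture function (an assumption for illustration, not the real ssh_capture):

import subprocess

def capture(cmd):
    # Lazily yield stdout lines; only check the exit status after the output is exhausted.
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
    for line in proc.stdout:
        yield line
    if proc.wait() != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)

def contents_or_empty(path):
    try:
        # list() forces the generator here, so the CalledProcessError is caught in this frame.
        return list(capture("cat " + path))
    except subprocess.CalledProcessError:
        return []

print(contents_or_empty("/no/such/file"))  # prints [] instead of raising later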