from datetime import datetime
from typing import Generator

from kafka import KafkaConsumer, KafkaProducer, TopicPartition
from kafka.structs import OffsetAndTimestamp

# project-internal imports; module paths assumed from the docstring reference to
# :class:`connectors.base.DataConnector` - adjust to match the actual package layout
from connectors.base import AccessMode, DataConnector
from pinnate import Pinnate


class KafkaConnector(DataConnector):
    engine_type = "kafka://"

    def __init__(self, *args, **kwargs):
        """
        Connector to Apache Kafka.

        Args:
            @see :class:`connectors.base.DataConnector`

            additional args for KafkaConnector
                None

        Connection information:
            engine_url format is
            kafka://bootstrap_server/topic=<topic>;[start params;][end params;]

            start and end params can be partitions with offsets or '@' notation to use dates, e.g.
            kafka://bionic/topic=foobar;start=@(2019-05-15 08:00:00);end=@(2019-05-15 18:00:00);
        """
        super().__init__(*args, **kwargs)

        # set by :method:`connect`
        self.bootstrap_server = self.topic = self.start_params = self.end_params = None
        self.start_p_offsets = self.end_p_offsets = None
        self.available_topics = None
        self.client = None

        # publicly readable
        self.stats = Pinnate({"added": 0})

        # used during read
        self.approx_position = None
        self.items_to_fetch = None

    def close_connection(self):
        if self.access == AccessMode.WRITE and self.client is not None:
            self.flush()

    def connect(self):
        if self.client is None:
            self.bootstrap_server, self.topic, self.start_params, self.end_params = self._decode_engine_url()

            if self.access == AccessMode.READ:
                self.client = KafkaConsumer(bootstrap_servers=self.bootstrap_server)
                self._setup_consumer()
            elif self.access == AccessMode.WRITE:
                if self.start_params is not None or self.end_params is not None:
                    raise ValueError("Start and end offsets can't be set when writing")
                self.client = KafkaProducer(bootstrap_servers=self.bootstrap_server)
            else:
                raise NotImplementedError("Unknown access mode")

    def _setup_consumer(self):
        """
        Prepare offset numbers etc. for reading from the topic.
        """
        # <WTF> https://github.com/dpkp/kafka-python/issues/601
        self.available_topics = self.client.topics()
        # </WTF>
        # might as well use it
        assert self.topic in self.available_topics

        if (self.start_params is None) != (self.end_params is None):
            raise ValueError("Both start and end params must be set or both must be None")

        if self.start_params is None:
            # Set up partitions to read through.
            # TODO not checked with multiple partitions since inheriting from foxglove
            # An offset is assigned to make repeatability (via a locking file) possible
            # later on, and it's easier to terminate the fetch loop this way.
            p_id = self.client.partitions_for_topic(self.topic)
            topic_partitions = [TopicPartition(topic=self.topic, partition=p) for p in list(p_id)]
            starts = self.client.beginning_offsets(topic_partitions)
            ends = self.client.end_offsets(topic_partitions)
            # end_offsets() gives the offset of the *next* message to be written, so the
            # last available message is one before that.
            self.start_p_offsets = {
                tp: OffsetAndTimestamp(offset=offset, timestamp=None) for tp, offset in starts.items()
            }
            self.end_p_offsets = {
                tp: OffsetAndTimestamp(offset=offset - 1, timestamp=None) for tp, offset in ends.items()
            }
        else:
            # TODO - this code was inherited from Foxglove and hasn't been checked through
            # Set up start and end partitions and offsets.
            # self.client.seek_to_beginning()

            # datetime is the only start/end type implemented
            assert isinstance(self.start_params, datetime) and isinstance(self.end_params, datetime)

            # Kafka expects timestamps in milliseconds
            start = int(self.start_params.timestamp() * 1000)
            end = int(self.end_params.timestamp() * 1000)
            partitions = self.client.partitions_for_topic(self.topic)
            tx = {TopicPartition(topic=self.topic, partition=p): start for p in list(partitions)}
            self.start_p_offsets = self.client.offsets_for_times(tx)

            # if you give a timestamp after the last record it returns None
            for tp, offset_details in self.start_p_offsets.items():
                if offset_details is None:
                    raise ValueError("Start date outside of available messages")

            tx = {TopicPartition(topic=self.topic, partition=p): end for p in list(partitions)}
            self.end_p_offsets = self.client.offsets_for_times(tx)

            # as above - out of range, so for the end offset fall back to something useful
            for tp, offset_details in self.end_p_offsets.items():
                if offset_details is None:
                    # go to last message. I'm not 100% sure this is correct
                    end_offsets = self.client.end_offsets([tp])
                    offset = end_offsets[tp] - 1
                    self.end_p_offsets[tp] = OffsetAndTimestamp(offset=offset, timestamp=None)
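    # Illustration only (not executed): offsets_for_times() in _setup_consumer above
    # maps each TopicPartition to an OffsetAndTimestamp namedtuple, e.g.
    #   {TopicPartition(topic='foobar', partition=0):
    #        OffsetAndTimestamp(offset=42, timestamp=1557907200000)}
    # (offset and timestamp values here are made up). It maps a partition to None when
    # the requested timestamp falls after the last available message, which is why both
    # for-loops above check for None.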
    def _decode_engine_url(self):
        """
        Returns:
            bootstrap_server, topic, start_params, end_params
            bootstrap_server and topic are (str)
            start_params and end_params are (None) or (datetime); partition+offset
            pairs are planned but not yet implemented.
        """
        date_format = "%Y-%m-%d %H:%M:%S"
        r = dict(topic=None, start=None, end=None)
        s_url = self.engine_url[len(self.__class__.engine_type):]
        bootstrap_server, r_url = s_url.split("/", 1)
        for param_section in r_url.split(";"):
            if len(param_section) == 0:
                continue
            k, v = param_section.split("=", 1)
            if k in r:
                r[k] = v

        # Resolve to dates if needed. Partition+offset isn't implemented, so start and
        # end must be None or start with '@' to resolve to a datetime.
        for position in ("start", "end"):
            p_marker = r[position]
            if p_marker is not None:
                assert p_marker.startswith("@(") and p_marker.endswith(")")
                date_str = p_marker[2:-1]
                r[position] = datetime.strptime(date_str, date_format)

        return bootstrap_server, r["topic"], r["start"], r["end"]
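    # Illustration only (not executed): for the example engine_url in the class
    # docstring,
    #   kafka://bionic/topic=foobar;start=@(2019-05-15 08:00:00);end=@(2019-05-15 18:00:00);
    # _decode_engine_url() returns
    #   ("bionic", "foobar", datetime(2019, 5, 15, 8, 0), datetime(2019, 5, 15, 18, 0))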
    def __len__(self):
        raise NotImplementedError("TODO")

    def __getitem__(self, key):
        raise NotImplementedError("TODO")

    def _partition_ranges(self) -> Generator:
        """
        Yield partition (int), start_offset (int), end_offset (int) for the range
        given in self.engine_url.
        """
        self.connect()
        for topic_partition, start_offset_time in self.start_p_offsets.items():
            end_offset_time = self.end_p_offsets[topic_partition]
            yield topic_partition.partition, start_offset_time.offset, end_offset_time.offset

    @property
    def data(self) -> Generator:
        """
        Generator yielding just the value of each record from Kafka. Each value is
        wrapped in a Pinnate object.

        See https://kafka-python.readthedocs.io/en/master/apidoc/KafkaConsumer.html
        useful attribs include m.offset, m.partition, m.timestamp, m.key, m.value
        """
        # Not using a consumer group and setting partitions manually so it's a smaller
        # jump to make this deterministic/repeatable with multiple workers later on.
        self.connect()
        self.approx_position = 0
        for partition_id, start_offset, end_offset in self._partition_ranges():
            # TODO - confirm this can never jump to another partition
            tp = TopicPartition(topic=self.topic, partition=partition_id)
            self.client.assign([tp])
            self.items_to_fetch = end_offset - start_offset
            self.client.seek(tp, start_offset)

            if self.items_to_fetch <= 0:
                msg = f"Invalid offsets {start_offset}:{end_offset} for partition {partition_id}"
                raise ValueError(msg)

            # consume until the (inclusive) end offset for this partition is reached
            for m in self.client:
                self.approx_position += 1
                yield Pinnate(data=m.value)
                if end_offset is not None and m.offset >= end_offset:
                    break

    def add(self, data, partition=None):
        """
        Write message to topic.

        @param data: (str)
        @param partition: (int) Kafka partition. Not yet implemented.
        """
        # TODO expand data to include binary and instances of :class:`Pinnate`, but that
        # needs a way of de-serialising on retrieval.
        if self.access != AccessMode.WRITE:
            raise ValueError("Write attempted on dataset opened in READ mode.")

        if partition is not None:
            raise NotImplementedError("Placeholder value, not implemented yet")

        if not isinstance(data, str):
            raise ValueError("data isn't an accepted type. Only (str) is accepted.")

        self.connect()
        # TODO use futures
        self.client.send(self.topic, value=bytes(data, "utf-8"))
        self.stats.added += 1

    def flush(self):
        """
        Ensure all messages have been sent to Kafka.
        """
        if self.access != AccessMode.WRITE:
            raise ValueError("Flush attempted on dataset opened in READ mode.")
        # TODO futures and performance stats
        self.client.flush()

    @property
    def progress(self):
        if self.access != AccessMode.READ or self.items_to_fetch is None or self.approx_position is None:
            return None
        return self.approx_position / self.items_to_fetch
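
# ---------------------------------------------------------------------------
# Usage sketch - illustrative only, not part of the connector. It assumes the
# base class accepts `engine_url` and `access` keyword arguments (see
# :class:`connectors.base.DataConnector`) and that a broker is reachable at
# localhost:9092 with a topic named 'foobar'; adjust to the real signatures.
if __name__ == "__main__":
    # write a few messages, then flush to make sure they reach the broker
    writer = KafkaConnector(
        engine_url="kafka://localhost:9092/topic=foobar;",
        access=AccessMode.WRITE,
    )
    for i in range(3):
        writer.add(f"message {i}")
    writer.flush()

    # read back a bounded time window; `data` yields Pinnate-wrapped values
    reader = KafkaConnector(
        engine_url="kafka://localhost:9092/topic=foobar;"
        "start=@(2019-05-15 08:00:00);end=@(2019-05-15 18:00:00);",
        access=AccessMode.READ,
    )
    for message in reader.data:
        print(message)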