def _process_single_chunk(self, offset_ranges, msg_rdd_processor):
    # Read the given offset ranges from Kafka as (key, value) pairs, keeping
    # the raw message bytes, then hand the values-only RDD to the processor.
    keyed_msg_rdd = KafkaUtils.createRDD(self._sc, self._kafka_param,
                                         offset_ranges,
                                         valueDecoder=lambda x: x)
    msg_rdd = keyed_msg_rdd.values()
    msg_rdd_processor(msg_rdd)

def _fetch_single_chunk(self, offset_ranges):
    # Same read as above, but return the values-only RDD to the caller
    # instead of processing it in place.
    keyed_msg_rdd = KafkaUtils.createRDD(self._sc, self._kafka_param,
                                         offset_ranges,
                                         valueDecoder=lambda x: x)
    msg_rdd = keyed_msg_rdd.values()
    return msg_rdd

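# The two chunk helpers above imply a driver that splits a large set of
# offsets into bounded chunks, so a single KafkaUtils.createRDD call never
# pulls too many messages at once. A minimal sketch of such a driver follows;
# the method name _process_in_chunks and the chunk_size parameter are
# assumptions for illustration, not part of the original class.
def _process_in_chunks(self, offset_ranges, msg_rdd_processor,
                       chunk_size=100000):
    # Hypothetical: slice each OffsetRange into pieces of at most chunk_size
    # messages and process them one at a time via _process_single_chunk.
    for o in offset_ranges:
        start = o.fromOffset
        while start < o.untilOffset:
            end = min(start + chunk_size, o.untilOffset)
            chunk = [OffsetRange(o.topic, o.partition, start, end)]
            self._process_single_chunk(chunk, msg_rdd_processor)
            start = end
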
def fetch_pre_hourly_data(spark_context, offset_range_list):
    """get metrics pre hourly data from offset range list."""
    for o in offset_range_list:
        log.debug(
            "fetch_pre_hourly: offset_range_list:"
            " OffSetRanges: %s %s %s %s" % (
                o.topic, o.partition, o.fromOffset, o.untilOffset))

    effective_offset_list = PreHourlyProcessor.\
        get_effective_offset_range_list(offset_range_list)
    for o in effective_offset_list:
        log.debug(
            "fetch_pre_hourly: effective_offset_range_list:"
            " OffSetRanges: %s %s %s %s" % (
                o.topic, o.partition, o.fromOffset, o.untilOffset))

    # get kafka stream over the same offsets
    pre_hourly_rdd = KafkaUtils.createRDD(
        spark_context,
        {"metadata.broker.list": cfg.CONF.messaging.brokers},
        effective_offset_list)
    return pre_hourly_rdd

def kafka_rdd(spark_context, kafka_brokers='192.168.1.106:9092'):
    return KafkaUtils.createRDD(
        sc=spark_context,
        kafkaParams={'metadata.broker.list': kafka_brokers},
        offsetRanges=[
            OffsetRange(topic='flights', partition=0,
                        fromOffset=0, untilOffset=49)
        ])

def fetch_pre_hourly_data(spark_context, offset_range_list):
    """get metrics pre hourly data from offset range list."""
    # get kafka stream over the same offsets
    pre_hourly_rdd = KafkaUtils.createRDD(
        spark_context,
        {"metadata.broker.list": cfg.CONF.messaging.brokers},
        offset_range_list)
    return pre_hourly_rdd

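# For context, fetch_pre_hourly_data expects a list of OffsetRange objects
# covering the pre-hourly metrics topic. A minimal calling sketch, assuming
# the function is exposed as a static method on PreHourlyProcessor (as the
# earlier snippet suggests); the topic name and offsets below are illustrative
# only, the real values come from the processor's stored offsets.
from pyspark import SparkContext
from pyspark.streaming.kafka import OffsetRange

sc = SparkContext(appName="pre_hourly_fetch")
offset_range_list = [
    OffsetRange("metrics_pre_hourly", 0, 1000, 2000),  # hypothetical offsets
    OffsetRange("metrics_pre_hourly", 1, 980, 1950),   # hypothetical offsets
]
pre_hourly_rdd = PreHourlyProcessor.fetch_pre_hourly_data(sc, offset_range_list)
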
def test_kafka_rdd_get_offsetRanges(self):
    """Test Python direct Kafka RDD get OffsetRanges."""
    topic = self._randomTopic()
    sendData = {"a": 3, "b": 4, "c": 5}
    offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
    self.assertEqual(offsetRanges, rdd.offsetRanges())

def test_kafka_rdd(self):
    """Test the Python direct Kafka RDD API."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2}
    offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
    self._validateRddResult(sendData, rdd)

def test_kafka_rdd_with_leaders(self):
    """Test the Python direct Kafka RDD API with leaders."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
    address = self._kafkaTestUtils.brokerAddress().split(":")
    leaders = {TopicAndPartition(topic, 0): Broker(address[0], int(address[1]))}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders)
    self._validateRddResult(sendData, rdd)

def collect_results(sc, brokers, receive_record, offsets_start, offsets_end, run_id):
    """
    Parameters
    ----------
    sc : pyspark.SparkContext
    brokers : list of str
    receive_record : callable
        Callable receiving a json decoded record from kafka. It must return
        either an empty list on error, or a 3 item tuple containing
        hit_page_id as int, query as str, and features as DenseVector
    offsets_start : list of int
        Per-partition offsets to start reading at
    offsets_end : list of int
        Per-partition offsets to end reading at
    run_id : str
        unique identifier for this run

    Returns
    -------
    pyspark.RDD
        RDD containing results of receive_record
    """
    offset_ranges = []
    if offsets_start is None:
        offsets_start = get_offset_start(brokers, mjolnir.kafka.TOPIC_RESULT)
    if offsets_start is None:
        raise RuntimeError("Cannot fetch offset_start, topic %s should have been created"
                           % mjolnir.kafka.TOPIC_RESULT)
    for partition, (start, end) in enumerate(zip(offsets_start, offsets_end)):
        offset_ranges.append(OffsetRange(mjolnir.kafka.TOPIC_RESULT, partition, start, end))

    assert not isinstance(brokers, basestring)
    # TODO: how can we force the kafka api_version here?
    kafka_params = {
        'metadata.broker.list': ','.join(brokers),
        # Set high fetch size values so we don't fail because of large messages
        'max.partition.fetch.bytes': '40000000',
        'fetch.message.max.bytes': '40000000'
    }

    # If this ends up being too much data from kafka, blowing up memory in the
    # spark executors, we could chunk the offsets and union together multiple RDD's.
    return (
        KafkaUtils.createRDD(sc, kafka_params, offset_ranges)
        .map(lambda (k, v): json.loads(v))
        .filter(lambda rec: 'run_id' in rec and rec['run_id'] == run_id)
        .flatMap(receive_record))

def collect_results(sc, brokers, receive_record, start, end, run_id):
    """
    Parameters
    ----------
    sc : pyspark.SparkContext
    brokers : list of str
    receive_record : callable
        Callable receiving a json decoded record from kafka. It should return
        a list and the resulting rdd will have a record per result returned.
    start : int
        Timestamp, in seconds since unix epoch, at which to start looking for
        records
    end : int
        Timestamp at which to stop looking for records.
    run_id : str
        unique identifier for this run

    Returns
    -------
    pyspark.RDD
        RDD containing results of receive_record
    """
    # Decide what offsets we need.
    offset_ranges = offset_range_for_timestamp_range(brokers, start, end)
    if offset_ranges is None:
        raise RuntimeError(
            'Could not retrieve offset ranges for result topic. Does it exist?')

    kafka_params = {
        'metadata.broker.list': ','.join(brokers),
        # Set high fetch size values so we don't fail because of large messages
        'max.partition.fetch.bytes': '40000000',
        'fetch.message.max.bytes': '40000000'
    }

    # If this ends up being too much data from kafka, blowing up memory in the
    # spark executors, we could chunk the offsets and union together multiple RDD's.
    return (
        KafkaUtils.createRDD(sc, kafka_params, offset_ranges)
        .map(lambda x: json.loads(x[1]))
        .filter(lambda rec: 'run_id' in rec and rec['run_id'] == run_id)
        .flatMap(receive_record))

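# The docstring above only describes the receive_record contract in prose: it
# takes one JSON-decoded record and returns a list of output rows (empty when
# nothing usable can be extracted). A minimal sketch of such a callable; the
# field names 'query' and 'hits' are invented for illustration and are not the
# real record layout.
def example_receive_record(rec):
    try:
        # One output row per result carried in the record.
        return [(rec['query'], hit) for hit in rec['hits']]
    except (KeyError, TypeError):
        # Malformed record: contribute nothing to the flatMap'd RDD.
        return []
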
sc = SparkContext('local[*]', 'hands on PySpark')

kafkaParams = {"metadata.broker.list": "localhost:9092"}
start = 1  # skip the first line
until = 500000
partition = 0
topic = 'csvtopic'

offset1 = OffsetRange(topic, partition, start, until)
# offset2 = OffsetRange('csvtopic', 0, 500001, 1000000)
offsets = [offset1]

print(" >>>>>>>> CONSUMING KAFKA <<<<<<<<")
rdd = KafkaUtils.createRDD(sc, kafkaParams, offsets)
linhas = rdd.map(lambda x: x[1])
# linhas.foreach(printer)

arr = linhas.map(criarPoints)\
    .map(setDistance)\
    .sortBy(lambda x: x.distance)\
    .map(pegarVal)
# valores = arr.map(pegarVal)

# take the first k values -> already in list form
lista = arr.take(k)

def kafka(self, host, topic, offset_ranges=None, key_deserializer=None,
          value_deserializer=None, schema=None, port=9092,
          parallelism=None, options=None):
    """Creates dataframe from specified set of messages from Kafka topic.

    Defining ranges:
        - If `offset_ranges` is specified it defines which specific range to read.
        - If `offset_ranges` is omitted it will auto-discover its partitions.

    The `schema` parameter, if specified, should contain two top level fields:
    `key` and `value`.

    Parameters `key_deserializer` and `value_deserializer` are callables
    which get bytes as input and should return python structures as output.

    Args:
        host (str): Kafka host.
        topic (str|None): Kafka topic to read from.
        offset_ranges (list[(int, int, int)]|None): List of partition ranges
            [(partition, start_offset, end_offset)].
        key_deserializer (function): Function used to deserialize the key.
        value_deserializer (function): Function used to deserialize the value.
        schema (pyspark.sql.types.StructType): Schema to apply to create a Dataframe.
        port (int): Kafka port.
        parallelism (int|None): The max number of parallel tasks that could be
            executed during the read stage (see :ref:`controlling-the-load`).
        options (dict|None): Additional kafka parameters, see KafkaUtils.createRDD docs.

    Returns:
        pyspark.sql.DataFrame

    Raises:
        InvalidArgumentError
    """
    assert self._spark.has_package('org.apache.spark:spark-streaming-kafka')

    if not key_deserializer or not value_deserializer or not schema:
        raise InvalidArgumentError('You should specify all of parameters:'
                                   '`key_deserializer`, `value_deserializer` and `schema`')

    kafka_params = {
        'metadata.broker.list': '{}:{}'.format(host, port),
    }
    if options:
        kafka_params.update(options)

    if not offset_ranges:
        offset_ranges = kafka_get_topics_offsets(host, topic, port)

    offset_ranges = [OffsetRange(topic, partition, start_offset, end_offset)
                     for partition, start_offset, end_offset in offset_ranges]

    rdd = KafkaUtils.createRDD(self._spark.sparkContext,
                               kafkaParams=kafka_params,
                               offsetRanges=offset_ranges or [],
                               keyDecoder=key_deserializer,
                               valueDecoder=value_deserializer,
                               )

    if parallelism:
        rdd = rdd.coalesce(parallelism)

    return self._spark.createDataFrame(rdd, schema=schema)

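# A minimal usage sketch for the reader above, assuming it is exposed on a
# session's read_ext attribute (an assumption) and that both key and value are
# JSON-encoded; the host, topic, and schema below are illustrative only.
import json

from pyspark.sql.types import MapType, StringType, StructField, StructType


def json_decoder(raw_bytes):
    # Deserializers receive raw bytes and must return python structures.
    return json.loads(raw_bytes.decode('utf-8')) if raw_bytes is not None else None


schema = StructType([
    StructField('key', MapType(StringType(), StringType())),
    StructField('value', MapType(StringType(), StringType())),
])

df = spark.read_ext.kafka(
    host='kafka.example.com',        # illustrative host
    topic='events',                  # illustrative topic
    key_deserializer=json_decoder,
    value_deserializer=json_decoder,
    schema=schema,
)
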
def handler(rdd_mapped):
    """
    Handle prepared RDD.

    Append each RDD item's 'payload' field to a string, create a json object
    from the string, and filter out the 'fields' field. Then pass the result
    to sendToBroker.
    """
    records = rdd_mapped.collect()
    records_str = ""
    for record in records:
        records_str = records_str + str(record['payload']) + "\n"
    json_records = json.loads(records_str)
    # filter out "fields" field
    json_records.pop('fields', None)
    sendToBroker(json.dumps(json_records, indent=2))


if __name__ == "__main__":
    # Create Spark context, create KafkaRDD, prepare RDD for filtering.
    sc = SparkContext(appName="Kafka")
    sc.setLogLevel("WARN")
    offset = OffsetRange(TOPIC_IN, 0, 0, 16)
    rdd = KafkaUtils.createRDD(sc, {"metadata.broker.list": BROKER}, [offset])
    rdd_mapped = rdd.map(lambda v: json.loads(v[1]))
    handler(rdd_mapped)