Example #1
    def _process_single_chunk(self, offset_ranges, msg_rdd_processor):
        keyed_msg_rdd = KafkaUtils.createRDD(self._sc,
                                             self._kafka_param,
                                             offset_ranges,
                                             valueDecoder=lambda x: x)
        msg_rdd = keyed_msg_rdd.values()
        msg_rdd_processor(msg_rdd)
Example #2
    def _fetch_single_chunk(self, offset_ranges):
        keyed_msg_rdd = KafkaUtils.createRDD(self._sc,
                                             self._kafka_param,
                                             offset_ranges,
                                             valueDecoder=lambda x: x)
        msg_rdd = keyed_msg_rdd.values()
        return msg_rdd
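Both helpers above take offset_ranges, a list of pyspark.streaming.kafka.OffsetRange objects. A minimal sketch of building such a list follows; the topic name, partitions, and offset values are placeholder assumptions:

from pyspark.streaming.kafka import OffsetRange

# Hypothetical values; a real caller would derive these from saved
# checkpoints or by querying the Kafka cluster.
offset_ranges = [
    OffsetRange(topic='metrics', partition=0, fromOffset=0, untilOffset=1000),
    OffsetRange(topic='metrics', partition=1, fromOffset=0, untilOffset=1000),
]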
Example #3
    def fetch_pre_hourly_data(spark_context,
                              offset_range_list):
        """get metrics pre hourly data from offset range list."""

        for o in offset_range_list:
            log.debug(
                "fetch_pre_hourly: offset_range_list:"
                " OffSetRanges: %s %s %s %s" % (
                    o.topic, o.partition, o.fromOffset, o.untilOffset))

        effective_offset_list = PreHourlyProcessor.\
            get_effective_offset_range_list(offset_range_list)

        for o in effective_offset_list:
            log.debug(
                "fetch_pre_hourly: effective_offset_range_list:"
                " OffSetRanges: %s %s %s %s" % (
                    o.topic, o.partition, o.fromOffset, o.untilOffset))

        # get kafka stream over the same offsets
        pre_hourly_rdd = KafkaUtils.createRDD(spark_context,
                                              {"metadata.broker.list":
                                                  cfg.CONF.messaging.brokers},
                                              effective_offset_list)
        return pre_hourly_rdd
Example #4
def kafka_rdd(spark_context, kafka_brokers='192.168.1.106:9092'):
    return KafkaUtils.createRDD(
        sc=spark_context,
        kafkaParams={'metadata.broker.list': kafka_brokers},
        offsetRanges=[
            OffsetRange(topic='flights',
                        partition=0,
                        fromOffset=0,
                        untilOffset=49)
        ])
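A hypothetical way to consume the RDD returned by this helper; the SparkContext, app name, and broker address are placeholders. KafkaUtils.createRDD yields (key, value) pairs, so the message bodies are in the second element of each pair:

from pyspark import SparkContext

sc = SparkContext('local[*]', 'kafka-rdd-demo')  # assumed local context
rdd = kafka_rdd(sc, kafka_brokers='localhost:9092')
messages = rdd.map(lambda kv: kv[1])  # keep only the message values
print(messages.take(5))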
Example #5
    def fetch_pre_hourly_data(spark_context,
                              offset_range_list):
        """get metrics pre hourly data from offset range list."""

        # get kafka stream over the same offsets
        pre_hourly_rdd = KafkaUtils.createRDD(spark_context,
                                              {"metadata.broker.list":
                                                  cfg.CONF.messaging.brokers},
                                              offset_range_list)
        return pre_hourly_rdd
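A caller would typically keep only the message values of pre_hourly_rdd and decode them. A sketch, assuming the pre-hourly metric payloads are JSON strings:

import json

# Hypothetical follow-up step; assumes each Kafka message value is a JSON document.
metrics_rdd = pre_hourly_rdd.map(lambda kv: json.loads(kv[1]))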
Example #6
    def test_kafka_rdd_get_offsetRanges(self):
        """Test Python direct Kafka RDD get OffsetRanges."""
        topic = self._randomTopic()
        sendData = {"a": 3, "b": 4, "c": 5}
        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)
        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
        self.assertEqual(offsetRanges, rdd.offsetRanges())
Example #7
    def test_kafka_rdd(self):
        """Test the Python direct Kafka RDD API."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2}
        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)
        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
        self._validateRddResult(sendData, rdd)
Example #10
    def test_kafka_rdd_with_leaders(self):
        """Test the Python direct Kafka RDD API with leaders."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
        address = self._kafkaTestUtils.brokerAddress().split(":")
        leaders = {TopicAndPartition(topic, 0): Broker(address[0], int(address[1]))}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)
        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders)
        self._validateRddResult(sendData, rdd)
Example #12
def collect_results(sc, brokers, receive_record, offsets_start, offsets_end, run_id):
    """
    Parameters
    ----------
    sc : pyspark.SparkContext
    brokers : list of str
    receive_record : callable
        Callable receiving a json decoded record from kafka. It must return
        either an empty list on error, or a 3 item tuple containing
        hit_page_id as int, query as str, and features as DenseVector
    offsets_start : list of int
        Per-partition offsets to start reading at
    offsets_end : list of int
        Per-partition offsets to end reading at
    run_id : str
        unique identifier for this run

    Returns
    -------
    pyspark.RDD
        RDD containing results of receive_record
    """

    offset_ranges = []
    if offsets_start is None:
        offsets_start = get_offset_start(brokers, mjolnir.kafka.TOPIC_RESULT)

    if offsets_start is None:
        raise RuntimeError("Cannot fetch offset_start, topic %s should have been created" % mjolnir.kafka.TOPIC_RESULT)
    for partition, (start, end) in enumerate(zip(offsets_start, offsets_end)):
        offset_ranges.append(OffsetRange(mjolnir.kafka.TOPIC_RESULT, partition, start, end))
    assert not isinstance(brokers, basestring)
    # TODO: how can we force the kafka api_version here?
    kafka_params = {
        'metadata.broker.list': ','.join(brokers),
        # Set high fetch size values so we don't fail because of large messages
        'max.partition.fetch.bytes': '40000000',
        'fetch.message.max.bytes': '40000000'
    }

    # If this ends up being too much data from kafka, blowing up memory in the
    # spark executors, we could chunk the offsets and union together multiple RDD's.
    return (
        KafkaUtils.createRDD(sc, kafka_params, offset_ranges)
        .map(lambda (k, v): json.loads(v))
        .filter(lambda rec: 'run_id' in rec and rec['run_id'] == run_id)
        .flatMap(receive_record))
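The receive_record callable is project-specific. A hypothetical sketch that matches the contract described in the docstring above; the (hit_page_id, query, features) result is wrapped in a single-element list here so it composes with the flatMap, and an empty list signals an error. The record keys are assumptions for illustration:

from pyspark.mllib.linalg import DenseVector

def receive_record(rec):
    # Hypothetical handler: extract the three documented fields from the
    # JSON-decoded record, or return an empty list on error.
    try:
        return [(int(rec['hit_page_id']),
                 str(rec['query']),
                 DenseVector(rec['features']))]
    except (KeyError, TypeError, ValueError):
        return []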
Example #13
def collect_results(sc, brokers, receive_record, start, end, run_id):
    """
    Parameters
    ----------
    sc : pyspark.SparkContext
    brokers : list of str
    receive_record : callable
        Callable receiving a json decoded record from kafka. It should return
        a list and the resulting rdd will have a record per result returned.
    start : int
        Timestamp, in seconds since unix epoch, at which to start looking for records
    end : int
        Timestamp at which to stop looking for records.
    run_id : str
        unique identifier for this run

    Returns
    -------
    pyspark.RDD
        RDD containing results of receive_record
    """
    # Decide what offsets we need.
    offset_ranges = offset_range_for_timestamp_range(brokers, start, end)
    if offset_ranges is None:
        raise RuntimeError(
            'Could not retrieve offset ranges for result topic. Does it exist?'
        )

    kafka_params = {
        'metadata.broker.list': ','.join(brokers),
        # Set high fetch size values so we don't fail because of large messages
        'max.partition.fetch.bytes': '40000000',
        'fetch.message.max.bytes': '40000000'
    }

    # If this ends up being too much data from kafka, blowing up memory in the
    # spark executors, we could chunk the offsets and union together multiple RDD's.
    return (KafkaUtils.createRDD(
        sc, kafka_params,
        offset_ranges).map(lambda x: json.loads(x[1])).filter(
            lambda rec: 'run_id' in rec and rec['run_id'] == run_id).flatMap(
                receive_record))
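A hypothetical invocation of this variant; the broker addresses, timestamps, run_id, and record handler are all placeholders:

results = collect_results(
    sc,
    brokers=['kafka1001:9092', 'kafka1002:9092'],
    receive_record=lambda rec: rec.get('results', []),
    start=1514764800,  # 2018-01-01 00:00:00 UTC, seconds since the unix epoch
    end=1514768400,    # one hour later
    run_id='example-run-001',
)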
Example #14
    sc = SparkContext('local[*]', 'hands on PySpark')

    kafkaParams = {"metadata.broker.list": "localhost:9092"}

    start = 1  # skip the first line
    until = 500000
    partition = 0
    topic = 'csvtopic'
    offset1 = OffsetRange(topic, partition, start, until)
    # offset2 = OffsetRange('csvtopic', 0, 500001, 1000000)
    offsets = [offset1]

    print(" >>>>>>>> CONSUMING KAFKA <<<<<<<<")

    rdd = KafkaUtils.createRDD(sc, kafkaParams, offsets)

    linhas = rdd.map(lambda x: x[1])

    # linhas.foreach(printer)



    arr = linhas.map(criarPoints)\
        .map(setDistance)\
        .sortBy(lambda x: x.distance).map(pegarVal)

    # valores = arr.map(pegarVal)

    # take the first k values -> already in list format
    lista = arr.take(k)
Example #15
    def kafka(self,
              host,
              topic,
              offset_ranges=None,
              key_deserializer=None,
              value_deserializer=None,
              schema=None,
              port=9092,
              parallelism=None,
              options=None):
        """Creates dataframe from specified set of messages from Kafka topic.

        Defining ranges:
            - If `offset_ranges` is specified it defines which specific range to read.
            - If `offset_ranges` is omitted it will auto-discover its partitions.

        The `schema` parameter, if specified, should contain two top level fields:
        `key` and `value`.

        Parameters `key_deserializer` and `value_deserializer` are callables
        which get bytes as input and should return python structures as output.

        Args:
            host (str): Kafka host.
            topic (str|None): Kafka topic to read from.
            offset_ranges (list[(int, int, int)]|None): List of partition ranges
                [(partition, start_offset, end_offset)].
            key_deserializer (function): Function used to deserialize the key.
            value_deserializer (function): Function used to deserialize the value.
            schema (pyspark.sql.types.StructType): Schema to apply to create a Dataframe.
            port (int): Kafka port.
            parallelism (int|None): The max number of parallel tasks that could be executed
                during the read stage (see :ref:`controlling-the-load`).
            options (dict|None): Additional kafka parameters, see KafkaUtils.createRDD docs.

        Returns:
            pyspark.sql.DataFrame

        Raises:
            InvalidArgumentError
        """
        assert self._spark.has_package('org.apache.spark:spark-streaming-kafka')

        if not key_deserializer or not value_deserializer or not schema:
            raise InvalidArgumentError('You should specify all of the parameters: '
                                       '`key_deserializer`, `value_deserializer` and `schema`')

        kafka_params = {
            'metadata.broker.list': '{}:{}'.format(host, port),
        }

        if options:
            kafka_params.update(options)

        if not offset_ranges:
            offset_ranges = kafka_get_topics_offsets(host, topic, port)

        offset_ranges = [OffsetRange(topic, partition, start_offset, end_offset)
                         for partition, start_offset, end_offset in offset_ranges]

        rdd = KafkaUtils.createRDD(self._spark.sparkContext,
                                   kafkaParams=kafka_params,
                                   offsetRanges=offset_ranges or [],
                                   keyDecoder=key_deserializer,
                                   valueDecoder=value_deserializer,
                                   )

        if parallelism:
            rdd = rdd.coalesce(parallelism)

        return self._spark.createDataFrame(rdd, schema=schema)
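A sketch of calling this reader, assuming it is exposed on a `reader` object and that both keys and values are JSON; per the docstring, the schema must contain the two top-level fields `key` and `value`:

import json
from pyspark.sql import types as T

# Schema and deserializers are illustrative assumptions.
schema = T.StructType([
    T.StructField('key', T.MapType(T.StringType(), T.StringType())),
    T.StructField('value', T.MapType(T.StringType(), T.StringType())),
])

df = reader.kafka(  # `reader` is a hypothetical instance of the class above
    host='kafka.example.com',
    topic='events',
    key_deserializer=lambda raw: json.loads(raw.decode('utf-8')),
    value_deserializer=lambda raw: json.loads(raw.decode('utf-8')),
    schema=schema,
)
df.show()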
Example #16
def handler(rdd_mapped):
    """
    Handle the prepared RDD. Append each RDD item's 'payload' field to a string,
    build a JSON object from that string, filter out the 'fields' field, and then
    send the result to the broker.
    """
    records = rdd_mapped.collect()
    records_str = ""

    for record in records:
        records_str = records_str + str(record['payload']) + "\n"

    json_records = json.loads(records_str)

    # filter out "fields" field
    json_records.pop('fields', None)

    sendToBroker(json.dumps(json_records, indent=2))


if __name__ == "__main__":
    """Create Spark context, create KafkaRDD, prepare RDD for filtering."""
    sc = SparkContext(appName="Kafka")
    sc.setLogLevel("WARN")

    offset = OffsetRange(TOPIC_IN, 0, 0, 16)
    rdd = KafkaUtils.createRDD(sc, {"metadata.broker.list": BROKER}, [offset])
    rdd_mapped = rdd.map(lambda v: json.loads(v[1]))
    handler(rdd_mapped)