def offset_range_for_timestamp_range(brokers, start, end, topic=mjolnir.kafka.TOPIC_RESULT):
    """Determine OffsetRange for a given timestamp range

    Parameters
    ----------
    brokers : list of str
        List of kafka broker hostport to bootstrap kafka connection with
    start : number
        Unix timestamp in seconds
    end : number
        Unix timestamp in seconds
    topic : str
        Kafka topic to retrieve offsets for

    Returns
    -------
    list of pyspark.streaming.kafka.OffsetRange or None
        Per-partition ranges of offsets to read
    """
    consumer = kafka.KafkaConsumer(bootstrap_servers=brokers,
                                   api_version=mjolnir.kafka.BROKER_VERSION)
    partitions = consumer.partitions_for_topic(topic)
    if partitions is None:
        # Topic does not exist.
        return None
    partitions = [kafka.TopicPartition(topic, p) for p in partitions]
    o_start = offsets_for_times(consumer, partitions, start)
    o_end = offsets_for_times(consumer, partitions, end)
    return [
        OffsetRange(tp.topic, tp.partition, o_start[tp], o_end[tp])
        for tp in partitions
    ]
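
# A minimal usage sketch for offset_range_for_timestamp_range, assuming the
# surrounding mjolnir module is importable and the brokers below (which are
# hypothetical) are reachable; it requests offsets covering the last hour.
import time

brokers = ['kafka1001:9092', 'kafka1002:9092']  # hypothetical broker list
end_ts = time.time()
start_ts = end_ts - 3600  # one hour ago
ranges = offset_range_for_timestamp_range(brokers, start_ts, end_ts)
if ranges is None:
    print('topic does not exist yet')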
def test_kafka_direct_stream_foreach_get_offsetRanges(self):
    """Test the Python direct Kafka stream foreachRDD get offsetRanges."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

    offsetRanges = []

    def getOffsetRanges(_, rdd):
        for o in rdd.offsetRanges():
            offsetRanges.append(o)

    stream.foreachRDD(getOffsetRanges)

    self.ssc.start()
    self.wait_for(offsetRanges, 1)

    self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
def _get_new_offset_range_list(brokers, topic):
    """get offset range from earliest to latest."""
    offset_range_list = []

    # https://cwiki.apache.org/confluence/display/KAFKA/
    # A+Guide+To+The+Kafka+Protocol#
    # AGuideToTheKafkaProtocol-OffsetRequest
    GET_LATEST_OFFSETS = -1
    latest_dict = PreHourlyProcessor._get_offsets_from_kafka(
        brokers, topic, GET_LATEST_OFFSETS)

    GET_EARLIEST_OFFSETS = -2
    earliest_dict = PreHourlyProcessor._get_offsets_from_kafka(
        brokers, topic, GET_EARLIEST_OFFSETS)

    for item in latest_dict:
        until_offset = latest_dict[item].offsets[0]
        from_offset = earliest_dict[item].offsets[0]
        partition = latest_dict[item].partition
        topic = latest_dict[item].topic
        offset_range_list.append(OffsetRange(topic,
                                             partition,
                                             from_offset,
                                             until_offset))

    return offset_range_list
def emit(self):
    peachbox.Spark.Instanz = peachbox.Spark.Instance()
    sc = peachbox.Spark.Instanz.context()

    kafka_client = kafka.KafkaClient('localhost:9092')
    reqs = [OffsetRequest(self.topic, 0, -1, 10)]
    until_offset = kafka_client.send_offset_request(reqs)[0].offsets[0]

    offset_ranges = [OffsetRange(topic=self.topic, partition=0,
                                 fromOffset=self.from_offset,
                                 untilOffset=until_offset)]

    print 'offset range: ' + str(self.from_offset) + ':' + str(until_offset)

    # TODO: This is kinda hacky, resolve it
    if self.from_offset > until_offset:
        self.from_offset = until_offset

    result = pyspark.streaming.kafka.KafkaUtils.createRDD(
        sc, self.kafka_params, offset_ranges)
    self.latest_offset = until_offset
    result = result.map(lambda x: self.read_json(x[1]))
    return {'data': result}
def test_kafka_direct_stream_transform_get_offsetRanges(self):
    """Test the Python direct Kafka stream transform get offsetRanges."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

    offsetRanges = []

    def transformWithOffsetRanges(rdd):
        for o in rdd.offsetRanges():
            offsetRanges.append(o)
        return rdd

    # Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream
    # together, only the TransformedDstreams can be folded together.
    stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint()

    self.ssc.start()
    self.wait_for(offsetRanges, 1)

    self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
def _compose_chunk_offset_ranges(self, chunk):
    split_chunks = split_chunks_by_parallelism(chunk, self._parallelism)
    offset_ranges = [
        OffsetRange(self._kafka_topic, partition=p, fromOffset=s, untilOffset=e)
        for (p, s, e) in split_chunks if s < e
    ]
    return offset_ranges
def kafka_rdd(spark_context, kafka_brokers='192.168.1.106:9092'):
    return KafkaUtils.createRDD(
        sc=spark_context,
        kafkaParams={'metadata.broker.list': kafka_brokers},
        offsetRanges=[
            OffsetRange(topic='flights', partition=0, fromOffset=0, untilOffset=49)
        ])
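
# A minimal usage sketch for kafka_rdd above, assuming a local Kafka broker
# that already holds at least 49 messages in partition 0 of the 'flights'
# topic; the broker address and app name are illustrative only.
from pyspark import SparkContext

sc = SparkContext('local[*]', 'kafka-rdd-example')
flights_rdd = kafka_rdd(sc, kafka_brokers='localhost:9092')
print(flights_rdd.count())  # expect 49 messages for offsets 0..49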
def test_kafka_rdd_get_offsetRanges(self):
    """Test Python direct Kafka RDD get OffsetRanges."""
    topic = self._randomTopic()
    sendData = {"a": 3, "b": 4, "c": 5}
    offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)
    rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
    self.assertEqual(offsetRanges, rdd.offsetRanges())
def test_kafka_rdd(self):
    """Test the Python direct Kafka RDD API."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2}
    offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)
    rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
    self._validateRddResult(sendData, rdd)
def test_kafka_rdd_with_leaders(self):
    """Test the Python direct Kafka RDD API with leaders."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
    address = self._kafkaTestUtils.brokerAddress().split(":")
    leaders = {TopicAndPartition(topic, 0): Broker(address[0], int(address[1]))}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)
    rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders)
    self._validateRddResult(sendData, rdd)
def test_fetch_quantity_sum(self,
                            usage_manager,
                            setter_manager,
                            insert_manager,
                            data_driven_specs_repo):

    # test operation
    test_operation = "sum"

    # load components
    usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
    setter_manager.return_value = \
        MockComponentManager.get_setter_cmpt_mgr()
    insert_manager.return_value = \
        MockComponentManager.get_insert_cmpt_mgr()

    # init mock driver tables
    data_driven_specs_repo.return_value = \
        MockDataDrivenSpecsRepo(self.spark_context,
                                self.get_pre_transform_specs_json(),
                                self.get_transform_specs_json_by_operation(
                                    test_operation))

    # Create an emulated set of Kafka messages (these were gathered
    # by extracting Monasca messages from the Metrics queue on mini-mon).

    # Create an RDD out of the mocked Monasca metrics
    with open(DataProvider.kafka_data_path) as f:
        raw_lines = f.read().splitlines()
    raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

    rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

    # decorate mocked RDD with dummy kafka offsets
    myOffsetRanges = [
        OffsetRange("metrics", 1, 10, 20)]  # mimic rdd.offsetRanges()

    transform_context = TransformContextUtils.get_context(
        offset_info=myOffsetRanges,
        batch_time_info=self.get_dummy_batch_time())

    rdd_monasca_with_offsets = rdd_monasca.map(
        lambda x: RddTransformContext(x, transform_context))

    try:
        # Call the primary method in mon_metrics_kafka
        MonMetricsKafkaProcessor.rdd_to_recordstore(
            rdd_monasca_with_offsets)
        self.assertTrue(False)
    except FetchQuantityUtilException as e:
        self.assertTrue("Operation sum is not supported" in e.value)
def test_missing_field_to_filter(self,
                                 usage_manager,
                                 setter_manager,
                                 insert_manager,
                                 data_driven_specs_repo):

    # load components
    usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
    setter_manager.return_value = \
        MockComponentManager.get_setter_cmpt_mgr()
    insert_manager.return_value = \
        MockComponentManager.get_insert_cmpt_mgr()

    # init mock driver tables
    data_driven_specs_repo.return_value = \
        MockDataDrivenSpecsRepo(
            self.spark_context,
            self.get_pre_transform_specs_json(),
            self.get_invalid_filter_transform_specs_json("", "-mgmt$", "exclude"))

    # Create an emulated set of Kafka messages (these were gathered
    # by extracting Monasca messages from the Metrics queue on mini-mon).

    # Create an RDD out of the mocked Monasca metrics
    with open(DataProvider.fetch_quantity_data_path) as f:
        raw_lines = f.read().splitlines()
    raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

    rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

    # decorate mocked RDD with dummy kafka offsets
    myOffsetRanges = [
        OffsetRange("metrics", 1, 10, 20)]  # mimic rdd.offsetRanges()

    transform_context = TransformContextUtils.get_context(
        offset_info=myOffsetRanges,
        batch_time_info=self.get_dummy_batch_time())

    rdd_monasca_with_offsets = rdd_monasca.map(
        lambda x: RddTransformContext(x, transform_context))

    try:
        # Call the primary method in mon_metrics_kafka
        MonMetricsKafkaProcessor.rdd_to_recordstore(
            rdd_monasca_with_offsets)
        # In this case, it's an error if no exception is caught
        self.assertTrue(False)
    except FetchQuantityException as e:
        self.assertTrue("Encountered invalid filter details:" in e.value)
        self.assertTrue("field to filter = ," in e.value)
def collect_results(sc, brokers, receive_record, offsets_start, offsets_end, run_id):
    """
    Parameters
    ----------
    sc : pyspark.SparkContext
    brokers : list of str
    receive_record : callable
        Callable receiving a json decoded record from kafka. It must return
        either an empty list on error, or a 3 item tuple containing
        hit_page_id as int, query as str, and features as DenseVector
    offsets_start : list of int
        Per-partition offsets to start reading at
    offsets_end : list of int
        Per-partition offsets to end reading at
    run_id : str
        unique identifier for this run

    Returns
    -------
    pyspark.RDD
        RDD containing results of receive_record
    """
    offset_ranges = []
    if offsets_start is None:
        offsets_start = get_offset_start(brokers, mjolnir.kafka.TOPIC_RESULT)

    if offsets_start is None:
        raise RuntimeError("Cannot fetch offset_start, topic %s should have been created"
                           % mjolnir.kafka.TOPIC_RESULT)
    for partition, (start, end) in enumerate(zip(offsets_start, offsets_end)):
        offset_ranges.append(OffsetRange(mjolnir.kafka.TOPIC_RESULT, partition, start, end))

    assert not isinstance(brokers, basestring)
    # TODO: how can we force the kafka api_version here?
    kafka_params = {
        'metadata.broker.list': ','.join(brokers),
        # Set high fetch size values so we don't fail because of large messages
        'max.partition.fetch.bytes': '40000000',
        'fetch.message.max.bytes': '40000000',
    }

    # If this ends up being too much data from kafka, blowing up memory in the
    # spark executors, we could chunk the offsets and union together multiple RDD's.
    return (
        KafkaUtils.createRDD(sc, kafka_params, offset_ranges)
        .map(lambda (k, v): json.loads(v))
        .filter(lambda rec: 'run_id' in rec and rec['run_id'] == run_id)
        .flatMap(receive_record))
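
# A hypothetical sketch of calling collect_results, assuming an existing
# SparkContext `sc`, reachable brokers, and a single-partition result topic;
# the broker, run_id, and end offset below are placeholders, and this
# receive_record simply drops malformed records as the docstring requires.
from pyspark.mllib.linalg import Vectors

def receive_record(rec):
    try:
        return [(int(rec['hit_page_id']), str(rec['query']),
                 Vectors.dense(rec['features']))]
    except (KeyError, TypeError, ValueError):
        return []

results = collect_results(
    sc, ['kafka1001:9092'], receive_record,
    offsets_start=None, offsets_end=[100], run_id='example-run-id')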
def test_invalid_aggregated_metric_name(self,
                                        usage_manager,
                                        setter_manager,
                                        insert_manager,
                                        data_driven_specs_repo):

    # load components
    usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
    setter_manager.return_value = \
        MockComponentManager.get_setter_cmpt_mgr()
    insert_manager.return_value = \
        MockComponentManager.get_insert_cmpt_mgr()

    # init mock driver tables
    data_driven_specs_repo.return_value = \
        MockDataDrivenSpecsRepo(
            self.spark_context,
            self.get_pre_transform_specs_json(),
            self.get_transform_specs_json_invalid_name())

    # Create an emulated set of Kafka messages (these were gathered
    # by extracting Monasca messages from the Metrics queue on mini-mon).

    # Create an RDD out of the mocked Monasca metrics
    with open(DataProvider.fetch_quantity_data_path) as f:
        raw_lines = f.read().splitlines()
    raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

    rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

    # decorate mocked RDD with dummy kafka offsets
    myOffsetRanges = [
        OffsetRange("metrics", 1, 10, 20)]  # mimic rdd.offsetRanges()

    transform_context = TransformContextUtils.get_context(
        offset_info=myOffsetRanges,
        batch_time_info=self.get_dummy_batch_time())

    rdd_monasca_with_offsets = rdd_monasca.map(
        lambda x: RddTransformContext(x, transform_context))

    # Call the primary method in mon_metrics_kafka
    MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

    # get the metrics that have been submitted to the dummy message adapter
    metrics = DummyAdapter.adapter_impl.metric_list

    # metrics should be empty
    self.assertFalse(metrics)
def _get_offset_range_list(brokers, topic, app_name, saved_offset_spec):
    """get offset range from saved offset to latest."""
    offset_range_list = []

    # https://cwiki.apache.org/confluence/display/KAFKA/
    # A+Guide+To+The+Kafka+Protocol#
    # AGuideToTheKafkaProtocol-OffsetRequest
    GET_LATEST_OFFSETS = -1
    latest_dict = PreHourlyProcessor._get_offsets_from_kafka(
        brokers, topic, GET_LATEST_OFFSETS)

    GET_EARLIEST_OFFSETS = -2
    earliest_dict = PreHourlyProcessor._get_offsets_from_kafka(
        brokers, topic, GET_EARLIEST_OFFSETS)

    saved_dict = PreHourlyProcessor._parse_saved_offsets(
        app_name, topic, saved_offset_spec)

    for item in latest_dict:
        # saved spec
        (spec_app_name,
         spec_topic_name,
         spec_partition,
         spec_from_offset,
         spec_until_offset) = saved_dict[item]

        # until
        until_offset = latest_dict[item].offsets[0]

        # from
        if spec_until_offset is not None and int(spec_until_offset) >= 0:
            from_offset = spec_until_offset
        else:
            from_offset = earliest_dict[item].offsets[0]

        partition = latest_dict[item].partition
        topic = latest_dict[item].topic
        offset_range_list.append(OffsetRange(topic,
                                             partition,
                                             from_offset,
                                             until_offset))

    return offset_range_list
        media += float(i)
    return media / len(lista)


if __name__ == "__main__":
    sc = SparkContext('local[*]', 'hands on PySpark')
    kafkaParams = {"metadata.broker.list": "localhost:9092"}

    start = 1  # skip the first (header) line
    until = 500000
    partition = 0
    topic = 'csvtopic'

    offset1 = OffsetRange(topic, partition, start, until)
    # offset2 = OffsetRange('csvtopic', 0, 500001, 1000000)
    offsets = [offset1]

    print(" >>>>>>>> CONSUMING KAFKA <<<<<<<<")
    rdd = KafkaUtils.createRDD(sc, kafkaParams, offsets)
    linhas = rdd.map(lambda x: x[1])
    # linhas.foreach(printer)
    arr = linhas.map(criarPoints)\
        .map(setDistance)\
def kafka(self,
          host,
          topic,
          offset_ranges=None,
          key_deserializer=None,
          value_deserializer=None,
          schema=None,
          port=9092,
          parallelism=None,
          options=None):
    """Creates dataframe from specified set of messages from Kafka topic.

    Defining ranges:
        - If `offset_ranges` is specified it defines which specific range to read.
        - If `offset_ranges` is omitted it will auto-discover its partitions.

    The `schema` parameter, if specified, should contain two top level fields:
    `key` and `value`.

    Parameters `key_deserializer` and `value_deserializer` are callables
    which get bytes as input and should return python structures as output.

    Args:
        host (str): Kafka host.
        topic (str|None): Kafka topic to read from.
        offset_ranges (list[(int, int, int)]|None): List of partition ranges
            [(partition, start_offset, end_offset)].
        key_deserializer (function): Function used to deserialize the key.
        value_deserializer (function): Function used to deserialize the value.
        schema (pyspark.sql.types.StructType): Schema to apply to create a Dataframe.
        port (int): Kafka port.
        parallelism (int|None): The max number of parallel tasks that could be
            executed during the read stage (see :ref:`controlling-the-load`).
        options (dict|None): Additional kafka parameters, see KafkaUtils.createRDD docs.

    Returns:
        pyspark.sql.DataFrame

    Raises:
        InvalidArgumentError
    """
    assert self._spark.has_package('org.apache.spark:spark-streaming-kafka')

    if not key_deserializer or not value_deserializer or not schema:
        raise InvalidArgumentError('You should specify all of parameters:'
                                   '`key_deserializer`, `value_deserializer` and `schema`')

    kafka_params = {
        'metadata.broker.list': '{}:{}'.format(host, port),
    }

    if options:
        kafka_params.update(options)

    if not offset_ranges:
        offset_ranges = kafka_get_topics_offsets(host, topic, port)

    offset_ranges = [OffsetRange(topic, partition, start_offset, end_offset)
                     for partition, start_offset, end_offset in offset_ranges]

    rdd = KafkaUtils.createRDD(self._spark.sparkContext,
                               kafkaParams=kafka_params,
                               offsetRanges=offset_ranges or [],
                               keyDecoder=key_deserializer,
                               valueDecoder=value_deserializer,
                               )

    if parallelism:
        rdd = rdd.coalesce(parallelism)

    return self._spark.createDataFrame(rdd, schema=schema)
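
# A usage sketch for the kafka() reader above; the `spark.read_ext.kafka`
# accessor, host, topic, and schema here are assumptions for illustration
# only. Both deserializers turn the raw message into a plain dict so the
# resulting DataFrame has `key` and `value` map columns.
import json
from pyspark.sql import types as T

def json_deserializer(raw):
    return json.loads(raw.decode('utf-8')) if isinstance(raw, bytes) else json.loads(raw)

schema = T.StructType([
    T.StructField('key', T.MapType(T.StringType(), T.StringType())),
    T.StructField('value', T.MapType(T.StringType(), T.StringType())),
])

df = spark.read_ext.kafka(
    host='kafka.example.com',
    topic='events',
    key_deserializer=json_deserializer,
    value_deserializer=json_deserializer,
    schema=schema,
)
df.show()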
def get_effective_offset_range_list(offset_range_list):
    """Get effective batch offset range.

    Effective batch offset range covers offsets starting from the effective
    batch revision (defined by the effective_batch_revision config property).
    By default this method will set the pyspark Offset.fromOffset for each
    partition to a value older than the latest revision (defaults to latest - 1)
    so that the prehourly processor has access to the entire data for the hour.
    This will also account for and cover any early arriving data (data that
    arrives before the start hour).
    """
    offset_specifications = PreHourlyProcessor.get_offset_specs()
    app_name = PreHourlyProcessor.get_app_name()
    topic = PreHourlyProcessor.get_kafka_topic()

    # start offset revision
    effective_batch_revision = cfg.CONF.pre_hourly_processor.\
        effective_batch_revision

    effective_batch_spec = offset_specifications\
        .get_kafka_offsets_by_revision(app_name,
                                       effective_batch_revision)

    # get latest revision, if penultimate is unavailable
    if not effective_batch_spec:
        log.debug("effective batch spec: offsets: revision %s unavailable,"
                  " getting the latest revision instead..." % (
                      effective_batch_revision))
        # not available
        effective_batch_spec = offset_specifications.get_kafka_offsets(
            app_name)

    effective_batch_offsets = PreHourlyProcessor._parse_saved_offsets(
        app_name, topic, effective_batch_spec)

    # for debugging
    for effective_key in effective_batch_offsets.keys():
        effective_offset = effective_batch_offsets.get(effective_key, None)
        (effect_app_name,
         effect_topic_name,
         effect_partition,
         effect_from_offset,
         effect_until_offset) = effective_offset
        log.debug(
            "effective batch offsets (from db):"
            " OffSetRanges: %s %s %s %s" % (
                effect_topic_name, effect_partition,
                effect_from_offset, effect_until_offset))

    # effective batch revision
    effective_offset_range_list = []
    for offset_range in offset_range_list:
        part_topic_key = "_".join((offset_range.topic,
                                   str(offset_range.partition)))
        effective_offset = effective_batch_offsets.get(part_topic_key, None)
        if effective_offset:
            (effect_app_name,
             effect_topic_name,
             effect_partition,
             effect_from_offset,
             effect_until_offset) = effective_offset
            log.debug(
                "Extending effective offset range:"
                " OffSetRanges: %s %s %s-->%s %s" % (
                    effect_topic_name, effect_partition,
                    offset_range.fromOffset, effect_from_offset,
                    effect_until_offset))
            effective_offset_range_list.append(
                OffsetRange(offset_range.topic,
                            offset_range.partition,
                            effect_from_offset,
                            offset_range.untilOffset))
        else:
            effective_offset_range_list.append(
                OffsetRange(offset_range.topic,
                            offset_range.partition,
                            offset_range.fromOffset,
                            offset_range.untilOffset))

    # return effective offset range list
    return effective_offset_range_list
def handler(rdd_mapped):
    """Handle a prepared RDD.

    Append each RDD item's 'payload' field to a string, build a json object
    from that string, filter out the 'fields' field, then forward the result
    via sendToBroker.
    """
    records = rdd_mapped.collect()
    records_str = ""
    for record in records:
        records_str = records_str + str(record['payload']) + "\n"
    json_records = json.loads(records_str)
    # filter out "fields" field
    json_records.pop('fields', None)
    sendToBroker(json.dumps(json_records, indent=2))


if __name__ == "__main__":
    """Create Spark context, create KafkaRDD, prepare RDD for filtering."""
    sc = SparkContext(appName="Kafka")
    sc.setLogLevel("WARN")

    offset = OffsetRange(TOPIC_IN, 0, 0, 16)
    rdd = KafkaUtils.createRDD(sc, {"metadata.broker.list": BROKER}, [offset])
    rdd_mapped = rdd.map(lambda v: json.loads(v[1]))
    handler(rdd_mapped)
def test_pod_net_in_usage_app(self,
                              usage_manager,
                              setter_manager,
                              insert_manager,
                              data_driven_specs_repo):

    # load components
    usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
    setter_manager.return_value = \
        MockComponentManager.get_setter_cmpt_mgr()
    insert_manager.return_value = \
        MockComponentManager.get_insert_cmpt_mgr()

    # init mock driver tables
    data_driven_specs_repo.return_value = \
        MockDataDrivenSpecsRepo(self.spark_context,
                                self.get_pre_transform_specs_json_app(),
                                self.get_transform_specs_json_app())

    # Create an emulated set of Kafka messages (these were gathered
    # by extracting Monasca messages from the Metrics queue on mini-mon).

    # Create an RDD out of the mocked Monasca metrics
    with open(DataProvider.fetch_quantity_data_path) as f:
        raw_lines = f.read().splitlines()
    raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

    rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

    # decorate mocked RDD with dummy kafka offsets
    myOffsetRanges = [
        OffsetRange("metrics", 1, 10, 20)]  # mimic rdd.offsetRanges()

    transform_context = TransformContextUtils.get_context(
        offset_info=myOffsetRanges,
        batch_time_info=self.get_dummy_batch_time())

    rdd_monasca_with_offsets = rdd_monasca.map(
        lambda x: RddTransformContext(x, transform_context))

    # Call the primary method in mon_metrics_kafka
    MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

    # get the metrics that have been submitted to the dummy message adapter
    metrics = DummyAdapter.adapter_impl.metric_list

    pod_net_usage_agg_metric = [
        value for value in metrics
        if value.get('metric').get('name') == 'pod.net.in_bytes_sec_agg' and
        value.get('metric').get('dimensions').get('app') == 'junk' and
        value.get('metric').get('dimensions').get('namespace') == 'all' and
        value.get('metric').get('dimensions').get('pod_name') == 'all'
    ][0]

    self.assertTrue(pod_net_usage_agg_metric is not None)

    self.assertEqual('pod.net.in_bytes_sec_agg',
                     pod_net_usage_agg_metric.get('metric').get('name'))
    self.assertEqual(
        'junk',
        pod_net_usage_agg_metric.get("metric").get('dimensions').get('app'))
    self.assertEqual(
        'all',
        pod_net_usage_agg_metric.get("metric").get('dimensions').get('namespace'))
    self.assertEqual(
        'all',
        pod_net_usage_agg_metric.get("metric").get('dimensions').get('pod_name'))
    self.assertEqual(122.94,
                     pod_net_usage_agg_metric.get('metric').get('value'))
    self.assertEqual('useast',
                     pod_net_usage_agg_metric.get('meta').get('region'))
    self.assertEqual(cfg.CONF.messaging.publish_kafka_project_id,
                     pod_net_usage_agg_metric.get('meta').get('tenantId'))
    self.assertEqual(
        'hourly',
        pod_net_usage_agg_metric.get('metric').get('dimensions').get(
            'aggregation_period'))
    self.assertEqual(
        3.0,
        pod_net_usage_agg_metric.get('metric').get('value_meta').get(
            'record_count'))
    self.assertEqual(
        '2017-01-24 20:14:47',
        pod_net_usage_agg_metric.get('metric').get('value_meta').get(
            'firstrecord_timestamp_string'))
    self.assertEqual(
        '2017-01-24 20:15:47',
        pod_net_usage_agg_metric.get('metric').get('value_meta').get(
            'lastrecord_timestamp_string'))
def test_vcpus_by_all(self,
                      usage_manager,
                      setter_manager,
                      insert_manager,
                      data_driven_specs_repo):

    # load components
    usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
    setter_manager.return_value = \
        MockComponentManager.get_setter_cmpt_mgr()
    insert_manager.return_value = \
        MockComponentManager.get_insert_pre_hourly_cmpt_mgr()

    # init mock driver tables
    data_driven_specs_repo.return_value = \
        MockDataDrivenSpecsRepo(
            self.spark_context,
            self.get_pre_transform_specs_json_by_all(),
            self.get_transform_specs_json_by_all())

    # Create an RDD out of the mocked Monasca metrics
    with open(DataProvider.kafka_data_path) as f:
        raw_lines = f.read().splitlines()
    raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

    rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

    # decorate mocked RDD with dummy kafka offsets
    myOffsetRanges = [
        OffsetRange("metrics", 1, 10, 20)]  # mimic rdd.offsetRanges()

    transform_context = TransformContextUtils.get_context(
        offset_info=myOffsetRanges,
        batch_time_info=self.get_dummy_batch_time())

    rdd_monasca_with_offsets = rdd_monasca.map(
        lambda x: RddTransformContext(x, transform_context))

    # Call the primary method in mon_metrics_kafka
    MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

    # get the metrics that have been submitted to the dummy message adapter
    metrics = DummyAdapter.adapter_impl.metric_list
    vm_cpu_list = map(dump_as_ascii_string, metrics)
    DummyAdapter.adapter_impl.metric_list = []

    vm_cpu_rdd = self.spark_context.parallelize(vm_cpu_list)
    sql_context = SQLContext(self.spark_context)
    vm_cpu_df = sql_context.read.json(vm_cpu_rdd)
    PreHourlyProcessor.do_transform(vm_cpu_df)

    metrics = DummyAdapter.adapter_impl.metric_list
    vcpus_agg_metric = [
        value for value in metrics
        if value.get('metric').get('name') == 'vcpus_agg' and
        value.get('metric').get('dimensions').get('project_id') == 'all'
    ][0]

    self.assertTrue(vcpus_agg_metric is not None)

    self.assertEqual(7.0, vcpus_agg_metric.get('metric').get('value'))
    self.assertEqual('useast', vcpus_agg_metric.get('meta').get('region'))
    self.assertEqual(cfg.CONF.messaging.publish_kafka_project_id,
                     vcpus_agg_metric.get('meta').get('tenantId'))
    self.assertEqual(
        'all',
        vcpus_agg_metric.get('metric').get('dimensions').get('host'))
    self.assertEqual(
        'hourly',
        vcpus_agg_metric.get('metric').get('dimensions').get(
            'aggregation_period'))
    self.assertEqual(
        14.0,
        vcpus_agg_metric.get('metric').get('value_meta').get(
            'record_count'))
    self.assertEqual(
        '2016-01-20 16:40:00',
        vcpus_agg_metric.get('metric').get('value_meta').get(
            'firstrecord_timestamp_string'))
    self.assertEqual(
        '2016-01-20 16:40:46',
        vcpus_agg_metric.get('metric').get('value_meta').get(
            'lastrecord_timestamp_string'))
def test_fetch_quantity_avg(self,
                            usage_manager,
                            setter_manager,
                            insert_manager,
                            data_driven_specs_repo):

    # test operation
    test_operation = "avg"

    # load components
    usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
    setter_manager.return_value = \
        MockComponentManager.get_setter_cmpt_mgr()
    insert_manager.return_value = \
        MockComponentManager.get_insert_cmpt_mgr()

    # init mock driver tables
    data_driven_specs_repo.return_value = \
        MockDataDrivenSpecsRepo(self.spark_context,
                                self.get_pre_transform_specs_json(),
                                self.get_transform_specs_json_by_operation(
                                    test_operation))

    # Create an emulated set of Kafka messages (these were gathered
    # by extracting Monasca messages from the Metrics queue on mini-mon).

    # Create an RDD out of the mocked Monasca metrics
    with open(DataProvider.kafka_data_path) as f:
        raw_lines = f.read().splitlines()
    raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

    rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

    # decorate mocked RDD with dummy kafka offsets
    myOffsetRanges = [
        OffsetRange("metrics", 1, 10, 20)]  # mimic rdd.offsetRanges()

    transform_context = TransformContextUtils.get_context(
        offset_info=myOffsetRanges,
        batch_time_info=self.get_dummy_batch_time())

    rdd_monasca_with_offsets = rdd_monasca.map(
        lambda x: RddTransformContext(x, transform_context))

    # Call the primary method in mon_metrics_kafka
    MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

    # get the metrics that have been submitted to the dummy message adapter
    metrics = DummyAdapter.adapter_impl.metric_list

    utilized_cpu_logical_agg_metric = [
        value for value in metrics
        if value.get('metric').get('name') ==
        'cpu.utilized_logical_cores_agg'
    ][0]

    self.assertEqual(
        7.134214285714285,
        utilized_cpu_logical_agg_metric.get('metric').get('value'))
    self.assertEqual(
        'useast',
        utilized_cpu_logical_agg_metric.get('meta').get('region'))
    self.assertEqual(
        cfg.CONF.messaging.publish_kafka_project_id,
        utilized_cpu_logical_agg_metric.get('meta').get('tenantId'))
    self.assertEqual(
        'all',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'dimensions').get('host'))
    self.assertEqual(
        'all',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'dimensions').get('project_id'))
    self.assertEqual(
        'hourly',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'dimensions').get('aggregation_period'))
    self.assertEqual(
        13.0,
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('record_count'))
    self.assertEqual(
        '2016-03-07 16:09:23',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('firstrecord_timestamp_string'))
    self.assertEqual(
        '2016-03-07 16:10:38',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('lastrecord_timestamp_string'))
def test_rdd_to_recordstore(self,
                            usage_manager,
                            setter_manager,
                            insert_manager):

    usage_manager.return_value = MockComponentManager.get_usage_cmpt_mgr()
    setter_manager.return_value = \
        MockComponentManager.get_setter_cmpt_mgr()
    insert_manager.return_value = \
        MockComponentManager.get_insert_pre_hourly_cmpt_mgr()

    # Create an RDD out of the mocked Monasca metrics
    with open(DataProvider.kafka_data_path) as f:
        raw_lines = f.read().splitlines()
    raw_tuple_list = [eval(raw_line) for raw_line in raw_lines]

    rdd_monasca = self.spark_context.parallelize(raw_tuple_list)

    # decorate mocked RDD with dummy kafka offsets
    myOffsetRanges = [
        OffsetRange("metrics", 1, 10, 20)]  # mimic rdd.offsetRanges()

    transform_context = TransformContextUtils.get_context(
        offset_info=myOffsetRanges,
        batch_time_info=self.get_dummy_batch_time())

    rdd_monasca_with_offsets = rdd_monasca.map(
        lambda x: RddTransformContext(x, transform_context))

    # Call the primary method in mon_metrics_kafka
    MonMetricsKafkaProcessor.rdd_to_recordstore(rdd_monasca_with_offsets)

    host_usage_list = DummyAdapter.adapter_impl.metric_list
    host_usage_list = map(dump_as_ascii_string, host_usage_list)
    DummyAdapter.adapter_impl.metric_list = []
    host_usage_rdd = self.spark_context.parallelize(host_usage_list)
    sql_context = SQLContext(self.spark_context)
    host_usage_df = sql_context.read.json(host_usage_rdd)
    PreHourlyProcessor.do_transform(host_usage_df)

    # get the metrics that have been submitted to the dummy message adapter
    metrics = DummyAdapter.adapter_impl.metric_list

    # Verify cpu.total_logical_cores_agg for all hosts
    total_cpu_logical_agg_metric = [
        value for value in metrics
        if value.get('metric').get('name') ==
        'cpu.total_logical_cores_agg' and
        value.get('metric').get('dimensions').get('host') == 'all'
    ][0]

    self.assertEqual(
        15.0,
        total_cpu_logical_agg_metric.get('metric').get('value'))
    self.assertEqual(
        'useast',
        total_cpu_logical_agg_metric.get('meta').get('region'))
    self.assertEqual(
        cfg.CONF.messaging.publish_kafka_project_id,
        total_cpu_logical_agg_metric.get('meta').get('tenantId'))
    self.assertEqual(
        'all',
        total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
            'project_id'))
    self.assertEqual(
        'hourly',
        total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
            'aggregation_period'))
    self.assertEqual(
        13.0,
        total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
            'record_count'))
    self.assertEqual(
        '2016-03-07 16:09:23',
        total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
            'firstrecord_timestamp_string'))
    self.assertEqual(
        '2016-03-07 16:10:38',
        total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
            'lastrecord_timestamp_string'))

    # Verify cpu.total_logical_cores_agg for mini-mon host
    total_cpu_logical_agg_metric = [
        value for value in metrics
        if value.get('metric').get('name') ==
        'cpu.total_logical_cores_agg' and
        value.get('metric').get('dimensions').get(
            'host') == 'test-cp1-comp0333-mgmt'
    ][0]

    self.assertEqual(
        9.0,
        total_cpu_logical_agg_metric.get('metric').get('value'))
    self.assertEqual(
        'useast',
        total_cpu_logical_agg_metric.get('meta').get('region'))
    self.assertEqual(
        cfg.CONF.messaging.publish_kafka_project_id,
        total_cpu_logical_agg_metric.get('meta').get('tenantId'))
    self.assertEqual(
        'all',
        total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
            'project_id'))
    self.assertEqual(
        'hourly',
        total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
            'aggregation_period'))
    self.assertEqual(
        6.0,
        total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
            'record_count'))
    self.assertEqual(
        '2016-03-07 16:09:23',
        total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
            'firstrecord_timestamp_string'))
    self.assertEqual(
        '2016-03-07 16:10:38',
        total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
            'lastrecord_timestamp_string'))

    # Verify cpu.total_logical_cores_agg for devstack host
    total_cpu_logical_agg_metric = [
        value for value in metrics
        if value.get('metric').get('name') ==
        'cpu.total_logical_cores_agg' and
        value.get('metric').get('dimensions').get(
            'host') == 'test-cp1-comp0027-mgmt'
    ][0]

    self.assertEqual(
        6.0,
        total_cpu_logical_agg_metric.get('metric').get('value'))
    self.assertEqual(
        'useast',
        total_cpu_logical_agg_metric.get('meta').get('region'))
    self.assertEqual(
        cfg.CONF.messaging.publish_kafka_project_id,
        total_cpu_logical_agg_metric.get('meta').get('tenantId'))
    self.assertEqual(
        'all',
        total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
            'project_id'))
    self.assertEqual(
        'hourly',
        total_cpu_logical_agg_metric.get('metric').get('dimensions').get(
            'aggregation_period'))
    self.assertEqual(
        7.0,
        total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
            'record_count'))
    self.assertEqual(
        '2016-03-07 16:09:23',
        total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
            'firstrecord_timestamp_string'))
    self.assertEqual(
        '2016-03-07 16:10:38',
        total_cpu_logical_agg_metric.get('metric').get('value_meta').get(
            'lastrecord_timestamp_string'))

    # Verify cpu.utilized_logical_cores_agg for all hosts
    utilized_cpu_logical_agg_metric = [
        value for value in metrics
        if value.get('metric').get('name') ==
        'cpu.utilized_logical_cores_agg' and
        value.get('metric').get('dimensions').get('host') == 'all'
    ][0]

    self.assertEqual(
        7.134214285714285,
        utilized_cpu_logical_agg_metric.get('metric').get('value'))
    self.assertEqual(
        'useast',
        utilized_cpu_logical_agg_metric.get('meta').get('region'))
    self.assertEqual(
        cfg.CONF.messaging.publish_kafka_project_id,
        utilized_cpu_logical_agg_metric.get('meta').get('tenantId'))
    self.assertEqual(
        'all',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'dimensions').get('project_id'))
    self.assertEqual(
        'hourly',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'dimensions').get('aggregation_period'))
    self.assertEqual(
        13.0,
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('record_count'))
    self.assertEqual(
        '2016-03-07 16:09:23',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('firstrecord_timestamp_string'))
    self.assertEqual(
        '2016-03-07 16:10:38',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('lastrecord_timestamp_string'))

    # Verify cpu.utilized_logical_cores_agg for the mini-mon host
    utilized_cpu_logical_agg_metric = [
        value for value in metrics
        if value.get('metric').get('name') ==
        'cpu.utilized_logical_cores_agg' and
        value.get('metric').get(
            'dimensions').get('host') == 'test-cp1-comp0333-mgmt'
    ][0]

    self.assertEqual(
        4.9665,
        utilized_cpu_logical_agg_metric.get('metric').get('value'))
    self.assertEqual(
        'useast',
        utilized_cpu_logical_agg_metric.get('meta').get('region'))
    self.assertEqual(
        cfg.CONF.messaging.publish_kafka_project_id,
        utilized_cpu_logical_agg_metric.get('meta').get('tenantId'))
    self.assertEqual(
        'all',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'dimensions').get('project_id'))
    self.assertEqual(
        'hourly',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'dimensions').get('aggregation_period'))
    self.assertEqual(
        6.0,
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('record_count'))
    self.assertEqual(
        '2016-03-07 16:09:23',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('firstrecord_timestamp_string'))
    self.assertEqual(
        '2016-03-07 16:10:38',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('lastrecord_timestamp_string'))

    # Verify cpu.utilized_logical_cores_agg for the devstack host
    utilized_cpu_logical_agg_metric = [
        value for value in metrics
        if value.get('metric').get('name') ==
        'cpu.utilized_logical_cores_agg' and
        value.get('metric').get(
            'dimensions').get('host') == 'test-cp1-comp0027-mgmt'
    ][0]

    self.assertEqual(
        2.1677142857142853,
        utilized_cpu_logical_agg_metric.get('metric').get('value'))
    self.assertEqual(
        'useast',
        utilized_cpu_logical_agg_metric.get('meta').get('region'))
    self.assertEqual(
        cfg.CONF.messaging.publish_kafka_project_id,
        utilized_cpu_logical_agg_metric.get('meta').get('tenantId'))
    self.assertEqual(
        'all',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'dimensions').get('project_id'))
    self.assertEqual(
        'hourly',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'dimensions').get('aggregation_period'))
    self.assertEqual(
        7.0,
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('record_count'))
    self.assertEqual(
        '2016-03-07 16:09:23',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('firstrecord_timestamp_string'))
    self.assertEqual(
        '2016-03-07 16:10:38',
        utilized_cpu_logical_agg_metric.get('metric').get(
            'value_meta').get('lastrecord_timestamp_string'))
#dstream_time_interval = 5
#ssc = pyspark.streaming.StreamingContext(sc, dstream_time_interval)

#@staticmethod
#def createRDD(sc, kafkaParams, offsetRanges, leaders={},
#              keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):

kafka_params = {
    "zookeeper.connect": "localhost:2182",
    "metadata.broker.list": "localhost:9092",
    "group.id": "TutorialGroup1",
    "zookeeper.connection.timeout.ms": "10000"
}

tutorial1 = OffsetRange(topic='movie_reviews', partition=0, fromOffset=0, untilOffset=2)
offset_ranges = [tutorial1]

kafka = pyspark.streaming.kafka.KafkaUtils.createRDD(sc, kafka_params, offset_ranges)
kafka = kafka.map(lambda x: x[1])
print kafka.collect()

#kafka = pyspark.streaming.kafka.KafkaUtils.createStream(ssc, 'localhost:2181', 'TutorialGroup1', {'t1': 1})
#kafka.pprint()
#ssc.start()
#ssc.awaitTermination()