def _test_pipeline(self, configuration_path, processor_creator, input_dir,
                   expected_result_file, print_result=False):
    """
    Checks that the pipeline (driver) result matches a manually generated expected-result file.

    :param configuration_path: path to the config file
    :param processor_creator: factory that builds the processor (event creators) from the configuration
    :param input_dir: path to the input messages
    :param expected_result_file: path to the expected result
    :param print_result: if True, only print the results to the console (for debugging)
    :return:
    """
    table_uuid_postfix = "_" + str(uuid.uuid1()).replace("-", "_")
    configuration = Utils.load_config(configuration_path)
    pipeline = TestPipeline(
        configuration,
        processor_creator(configuration),
        input_dir,
        "test_result" + table_uuid_postfix)
    pipeline.process_all_available()

    # Collect the parsed rows from the result table of every active streaming query.
    result_tables_list = [
        [json.loads(row.value)
         for row in pipeline.spark.sql("select value from " + query.name).collect()]
        for query in pipeline.spark.streams.active]
    result = [table for results in result_tables_list for table in results]
    pipeline.terminate_active_streams()

    if print_result:
        for row in result:
            print(row)
    else:
        expected_result = self.__read_expected_result(expected_result_file)
        self.maxDiff = None
        self.assertItemsEqual(expected_result, result)
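
# A minimal sketch of how a concrete test could invoke _test_pipeline. The config path,
# input directory and expected-result file below are hypothetical placeholders, and the
# processor factory assumes the LogParsingProcessor/create_event_creators pair used by
# the production entry point; adjust all of them to the component under test.
def test_example_component(self):
    self._test_pipeline(
        "configurations/example/example_test.yml",                           # hypothetical config path
        lambda conf: LogParsingProcessor(conf, create_event_creators(conf)),  # hypothetical processor factory
        "test/it/resources/example/input",                                    # hypothetical input directory
        "test/it/resources/example/expected_result.txt"                       # hypothetical expected result file
    )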
r"(?P<finished_time>\w+?\s+?\w+?\s+?\d{1,2}\s+?\d{2}:\d{2}:\d{2}\s+?\w+?\s+?\d{4}).*" ))).add_intermediate_result_parser( duration_event_creator), Utils.get_output_topic(configuration, "reingest")) }) def duration_update(started_script, finished_script, finished_time, timestamp): """ if started script equals finished script duration is calculated :param started_script :param finished_script :param finished_time :param timestamp :return: duration ":exception: ParsingException """ if started_script == finished_script: return abs(finished_time - timestamp).seconds else: raise ParsingException( "Message contains different started and finished scripts") if __name__ == "__main__": configuration = Utils.load_config(sys.argv[:]) KafkaPipeline( configuration, LogParsingProcessor(configuration, create_event_creators(configuration))).start()
import sys
import uuid
from datetime import datetime, timedelta

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

from applications.vspp_unique_activities.recording_state import RecordingState
from common.kafka_helper import KafkaHelper
from util.utils import Utils

if __name__ == "__main__":
    config = Utils.load_config(sys.argv[:])

    sc = SparkContext(appName=config.property('spark.appName'),
                      master=config.property('spark.master'))
    ssc = StreamingContext(sc, config.property('spark.batchInterval'))
    ssc.checkpoint(config.property('spark.checkpointLocation'))

    # Kafka input configs
    options = config.kafka_input_options()
    input_stream = KafkaUtils.createDirectStream(ssc, config.kafka_input_topics(), options)

    def parse_message(message):