def __init__(self, view_function):
    self._queue = MultiprocessingQueue()
    self._paused = False
    self._terminated = False
    self._coroutine = None
    self._proc = None
    # The updater is implementation-specific and must be provided by
    # the subclass by calling self._set_updater
    self._updater = None
    self.view_function = view_function
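
# Hypothetical sketch only: one way a subclass might supply the
# implementation-specific updater mentioned in the comment above.
# `BaseViewer` and the exact `_set_updater` signature are assumptions,
# not taken from the original code.
class PollingViewer(BaseViewer):
    def __init__(self, view_function, poll_interval=0.1):
        super().__init__(view_function)
        self.poll_interval = poll_interval
        # register this subclass's updater with the base class
        self._set_updater(self._drain_queue)

    def _drain_queue(self):
        # pull any pending items off the shared queue and hand them to the view
        while not self._queue.empty():
            self.view_function(self._queue.get())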
def main():
    server_queue = MultiprocessingQueue()
    server = start_server_process(server_queue)
    client = start_client()
    testers = [
        algorithm_tester('async coalescing tester', test_asyncio_coalesce,
                         use_tcp_nodelay=True),
        algorithm_tester('async no_delay tester', test_asyncio_no_coalesce,
                         use_tcp_nodelay=True),
        algorithm_tester('async nagle tester', test_asyncio_no_coalesce,
                         use_tcp_nodelay=False),
        algorithm_tester('threaded coalescing tester', test_threaded_coalesce,
                         use_tcp_nodelay=True),
        algorithm_tester('threaded no_delay tester', test_threaded_no_coalesce,
                         use_tcp_nodelay=True),
        algorithm_tester('threaded_nagle_tester', test_threaded_no_coalesce,
                         use_tcp_nodelay=False),
        algorithm_tester('simple coalescing', test_simple_coalescing,
                         use_tcp_nodelay=True),
        algorithm_tester('simple no_delay', test_simple,
                         use_tcp_nodelay=True),
        algorithm_tester('simple nagle', test_simple,
                         use_tcp_nodelay=False),
    ]
    stats = []
    try:
        for tester in testers:
            assert server_queue.empty()
            tester_stats = execute_test(tester, client, server_queue)
            stats.append(tester_stats)
    finally:
        client.close()
        server.kill()
    display_stats(stats)
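
# Minimal sketch of the record type the testers list above appears to assume.
# The real `algorithm_tester` may be defined differently; this namedtuple only
# captures the fields used here: a display name, the test callable, and the
# TCP_NODELAY flag.
from collections import namedtuple

algorithm_tester = namedtuple(
    'algorithm_tester', ['name', 'test_function', 'use_tcp_nodelay'])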
def __init__(self,
             project_id: str,
             subscription: str,
             agg_function: AggregationFunction,
             agg_window_millis: int,
             auth_file: str = None,
             spark_opts: dict = {},
             multiprocessing=False) -> None:
    """
    SparkPubSubStreamConsumer constructor
    :param project_id: the project id
    :param subscription: the subscription name
    :param agg_function: aggregation function to apply
    :param agg_window_millis: aggregation window in milliseconds
    :param auth_file: path to credentials json file
    :param spark_opts: spark options dict
    :param multiprocessing: use multiprocessing instead of threading
    """
    super().__init__(agg_function, agg_window_millis)
    self.project_id = project_id
    self.subscription = subscription
    self.spark_opts = spark_opts
    self.subscribed = True
    self.multiprocessing = multiprocessing
    if self.multiprocessing:
        self.queue = MultiprocessingQueue()
    else:
        self.queue = Queue()
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = auth_file

    def run_spark_job(queue: Queue,
                      _agg_function: AggregationFunction,
                      _agg_window_millis: int,
                      _spark_opts: dict = {},
                      _environment: dict = {}):
        os.environ.update(_environment)
        try:
            try:
                import findspark
                findspark.init()
            except Exception as ex:
                self.logger.warn("Cannot import Spark pyspark with"
                                 " findspark. Message: {}".format(str(ex)))

            from pyspark.sql import SparkSession
            from pyspark.streaming import StreamingContext
            from pyspark.sql.functions import expr, window
            from pyspark.serializers import NoOpSerializer
            from pyspark.streaming import DStream
            from pyspark.streaming.kafka import utf8_decoder

            spark_builder = SparkSession.builder
            for k in _spark_opts:
                spark_builder = spark_builder.config(k, _spark_opts[k])
            spark_builder = spark_builder \
                .appName(str(self)) \
                .config("spark.jars.packages",
                        "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                        "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
                .config("spark.jars",
                        BASE_PATH
                        + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")
            spark = spark_builder.getOrCreate()
            spark.sparkContext.setLogLevel("WARN")
            ssc = StreamingContext(spark.sparkContext,
                                   (_agg_window_millis / 1000))

            agg = expr("value")
            if _agg_function == AggregationFunction.AVG:
                agg = expr("avg(value)")
            elif _agg_function == AggregationFunction.SUM:
                agg = expr("sum(value)")
            elif _agg_function == AggregationFunction.COUNT:
                agg = expr("count(value)")
            elif _agg_function == AggregationFunction.P50:
                agg = expr("percentile(value, 0.5)")
            elif _agg_function == AggregationFunction.P75:
                agg = expr("percentile(value, 0.75)")
            elif _agg_function == AggregationFunction.P95:
                agg = expr("percentile(value, 0.95)")
            elif _agg_function == AggregationFunction.P99:
                agg = expr("percentile(value, 0.99)")

            deserializer = \
                ssc._jvm.org.apache.spark.streaming.pubsub.SparkPubsubMessageSerializer()  # noqa: E501
            pubsub_utils = \
                ssc._jvm.org.apache.spark.streaming.pubsub.PubsubUtils
            credentials = \
                ssc._jvm.org.apache.spark.streaming.pubsub.SparkGCPCredentials
            storage_level = \
                ssc._jvm.org.apache.spark.storage.StorageLevel

            _pubsub_stream = pubsub_utils \
                .createStream(ssc._jssc,
                              project_id,
                              subscription,
                              credentials.Builder().build(),
                              storage_level.DISK_ONLY())
            _pubsub_stream_des = _pubsub_stream.transform(deserializer)
            ser = NoOpSerializer()
            pubsub_stream = DStream(_pubsub_stream_des,
                                    ssc,
                                    ser).map(utf8_decoder)

            def aggregate_rdd(_queue, _agg, df, ts):
                secs = int(self.agg_window_millis / 1000)
                win = window("ts", "{} seconds".format(secs))
                if df.first():
                    aggs = df \
                        .groupBy("application", win) \
                        .agg(_agg.alias("value")) \
                        .collect()
                    for row in aggs:
                        message = InputMessage(row["application"],
                                               value=row["value"],
                                               ts=ts)
                        self.logger.debug("Enqueue: {}".format(
                            message.to_json()))
                        try:
                            _queue.put(message.to_json())
                        except AssertionError as ex:
                            self.logger.warn(str(ex))
                else:
                    self.logger.warn("Empty RDD")

            # Create pubsub stream
            pubsub_stream \
                .foreachRDD(lambda ts, rdd:
                            aggregate_rdd(queue, agg,
                                          spark.read.json(rdd), ts))

            # Run
            ssc.start()
            if "timeout" in _spark_opts:
                ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
                ssc.stop()
                spark.stop()
            else:
                ssc.awaitTermination()
                ssc.stop()
                spark.stop()

        except Exception as e:
            raise e

    # Run in multiprocessing, each aggregation runs a spark driver.
    runner = Concurrency.run_process \
        if self.multiprocessing \
        else Concurrency.run_thread
    Concurrency.get_lock("spark").acquire()
    pid = runner(target=run_spark_job,
                 args=(self.queue,
                       self.agg_function,
                       self.agg_window_millis,
                       self.spark_opts,
                       os.environ.copy()),
                 name="PySpark {}".format(str(self)))
    Concurrency.schedule_release("spark", 30)
    self.pid = pid
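
# Hypothetical usage sketch: instantiate the consumer and drain aggregated
# InputMessage JSON strings from its queue. The project, subscription, and
# credentials values below are illustrative placeholders, not real settings.
consumer = SparkPubSubStreamConsumer(
    project_id="my-gcp-project",          # assumed example value
    subscription="metrics-subscription",  # assumed example value
    agg_function=AggregationFunction.AVG,
    agg_window_millis=10000,
    auth_file="/path/to/credentials.json",
    spark_opts={"timeout": 60},
    multiprocessing=True,
)
while consumer.subscribed:
    # blocks until the Spark job enqueues the next aggregated window
    print(consumer.queue.get())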
def __init__(self,
             broker_servers: str,
             input_topic: str,
             group_id: str,
             agg_function: AggregationFunction,
             agg_window_millis: int,
             spark_opts: dict = {},
             multiprocessing=True) -> None:
    """
    SparkKafkaStreamConsumer constructor
    :param broker_servers: broker servers
    :param input_topic: input topic
    :param group_id: consumer group id
    :param agg_function: aggregation function to apply
    :param agg_window_millis: aggregation window in milliseconds
    :param spark_opts: spark options dict
    :param multiprocessing: use multiprocessing instead of threading
    """
    super().__init__(agg_function, agg_window_millis)
    self.broker_servers = broker_servers.split(",")
    self.input_topic = input_topic
    self.group_id = group_id
    self.spark_opts = spark_opts
    self.subscribed = True
    self.multiprocessing = multiprocessing
    if self.multiprocessing:
        self.queue = MultiprocessingQueue()
    else:
        self.queue = Queue()

    def run_spark_job(queue: Queue,
                      _agg_function: AggregationFunction,
                      _agg_window_millis: int,
                      _spark_opts: dict = {},
                      _environment: dict = {}):
        os.environ.update(_environment)
        try:
            try:
                import findspark
                findspark.init()
            except Exception as ex:
                self.logger.warn("Cannot import Spark pyspark with"
                                 " findspark. Message: {}".format(str(ex)))

            from pyspark.sql import SparkSession
            from pyspark.streaming import StreamingContext
            from pyspark.streaming.kafka import KafkaUtils
            from pyspark.sql.functions import expr, window

            spark_builder = SparkSession.builder
            for k in _spark_opts:
                spark_builder = spark_builder.config(k, _spark_opts[k])
            spark_builder = spark_builder \
                .appName(str(self)) \
                .config("spark.jars.packages",
                        "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                        "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
                .config("spark.jars",
                        BASE_PATH
                        + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")
            spark = spark_builder.getOrCreate()
            spark.sparkContext.setLogLevel("WARN")
            ssc = StreamingContext(spark.sparkContext,
                                   (_agg_window_millis / 1000))

            agg = expr("value")
            if _agg_function == AggregationFunction.AVG:
                agg = expr("avg(value)")
            elif _agg_function == AggregationFunction.SUM:
                agg = expr("sum(value)")
            elif _agg_function == AggregationFunction.COUNT:
                agg = expr("count(value)")
            elif _agg_function == AggregationFunction.P50:
                agg = expr("percentile(value, 0.5)")
            elif _agg_function == AggregationFunction.P75:
                agg = expr("percentile(value, 0.75)")
            elif _agg_function == AggregationFunction.P95:
                agg = expr("percentile(value, 0.95)")
            elif _agg_function == AggregationFunction.P99:
                agg = expr("percentile(value, 0.99)")

            kafka_stream = KafkaUtils.createDirectStream(
                ssc,
                [self.input_topic],
                {"metadata.broker.list": ",".join(self.broker_servers)})

            def aggregate_rdd(_queue, _agg, df, ts):
                secs = int(self.agg_window_millis / 1000)
                win = window("ts", "{} seconds".format(secs))
                if df.first():
                    aggs = df \
                        .groupBy("application", win) \
                        .agg(_agg.alias("value")) \
                        .collect()
                    for row in aggs:
                        message = InputMessage(row["application"],
                                               value=row["value"],
                                               ts=ts)
                        self.logger.debug("Enqueue: {}".format(
                            message.to_json()))
                        try:
                            _queue.put(message.to_json())
                        except AssertionError as ex:
                            self.logger.warn(str(ex))
                else:
                    warnings.warn("Empty RDD")

            # Create kafka stream
            kafka_stream \
                .map(lambda x: x[1]) \
                .foreachRDD(lambda ts, rdd:
                            aggregate_rdd(queue, agg,
                                          spark.read.json(rdd), ts))

            # Run
            ssc.start()
            if "timeout" in _spark_opts:
                ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
                ssc.stop()
                spark.stop()
            else:
                ssc.awaitTermination()
                ssc.stop()
                spark.stop()

        except Exception as e:
            raise e

    # Run in multiprocessing, each aggregation runs a spark driver.
    runner = Concurrency.run_process \
        if self.multiprocessing \
        else Concurrency.run_thread
    Concurrency.get_lock("spark").acquire()
    pid = runner(target=run_spark_job,
                 args=(self.queue,
                       self.agg_function,
                       self.agg_window_millis,
                       self.spark_opts,
                       os.environ.copy()),
                 name="PySpark {}".format(str(self)))
    Concurrency.schedule_release("spark", 30)
    self.pid = pid
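
# Hypothetical usage sketch: consume aggregated windows from a Kafka topic.
# Broker, topic, and group id values below are illustrative placeholders.
consumer = SparkKafkaStreamConsumer(
    broker_servers="localhost:9092",  # assumed example value
    input_topic="metrics",            # assumed example value
    group_id="anomaly-detector",      # assumed example value
    agg_function=AggregationFunction.P95,
    agg_window_millis=30000,
    spark_opts={"timeout": 120},
    multiprocessing=True,
)
while consumer.subscribed:
    # blocks until the Spark job enqueues the next aggregated window
    print(consumer.queue.get())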
def main(args):
    n_processes = get_num_processes(min_free_cpu_cores=args.n_free_cpus)
    start_time = datetime.now()

    (
        soma_diameter,
        max_cluster_size,
        ball_xy_size,
        ball_z_size,
    ) = calculate_parameters_in_pixels(
        args.x_pixel_um,
        args.y_pixel_um,
        args.z_pixel_um,
        args.soma_diameter,
        args.max_cluster_size,
        args.ball_xy_size,
        args.ball_z_size,
    )

    # file extension only used if a directory is passed
    img_paths = get_sorted_file_paths(args.signal_planes_paths[0],
                                      file_extension="tif")

    if args.end_plane == -1:
        args.end_plane = len(img_paths)
    planes_paths_range = img_paths[args.start_plane:args.end_plane]

    workers_queue = MultiprocessingQueue(maxsize=n_processes)
    # WARNING: needs to be AT LEAST ball_z_size
    mp_3d_filter_queue = MultiprocessingQueue(maxsize=ball_z_size)
    for plane_id in range(n_processes):
        # placeholder for the queue to have the right size on first run
        workers_queue.put(None)

    clipping_val, threshold_value, ball_filter, cell_detector = setup(
        img_paths[0],
        soma_diameter,
        ball_xy_size,
        ball_z_size,
        ball_overlap_fraction=args.ball_overlap_fraction,
        z_offset=args.start_plane,
    )

    progress_bar = tqdm(total=len(planes_paths_range),
                        desc="Processing planes")

    mp_3d_filter = Mp3DFilter(
        mp_3d_filter_queue,
        ball_filter,
        cell_detector,
        soma_diameter,
        args.output_dir,
        soma_size_spread_factor=args.soma_spread_factor,
        progress_bar=progress_bar,
        save_planes=args.save_planes,
        plane_directory=args.plane_directory,
        start_plane=args.start_plane,
        max_cluster_size=max_cluster_size,
        outlier_keep=args.outlier_keep,
        artifact_keep=args.artifact_keep,
        save_csv=args.save_csv,
    )

    # start 3D analysis (waits for planes in queue)
    bf_process = multiprocessing.Process(target=mp_3d_filter.process, args=())
    bf_process.start()  # needs to be started before the loop

    mp_tile_processor = MpTileProcessor(workers_queue, mp_3d_filter_queue)
    prev_lock = Lock()
    processes = []

    # start 2D tile filter (output goes into queue for 3D analysis)
    for plane_id, path in enumerate(planes_paths_range):
        workers_queue.get()
        lock = Lock()
        lock.acquire()
        p = multiprocessing.Process(
            target=mp_tile_processor.process,
            args=(
                plane_id,
                path,
                prev_lock,
                lock,
                clipping_val,
                threshold_value,
                soma_diameter,
                args.log_sigma_size,
                args.n_sds_above_mean_thresh,
            ),
        )
        prev_lock = lock
        processes.append(p)
        p.start()

    processes[-1].join()
    mp_3d_filter_queue.put((None, None, None))  # Signal the end
    bf_process.join()

    logging.info(
        "Detection complete - all planes done in : {}".format(
            datetime.now() - start_time
        )
    )
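
# Isolated sketch of the lock-chaining pattern used above, with hypothetical
# names: each worker is given the previous worker's lock and its own (already
# acquired) lock, waits on the former before forwarding its result, and
# releases the latter when done, so results are enqueued in plane order.
import multiprocessing
from multiprocessing import Lock, Queue as MultiprocessingQueue


def ordered_worker(plane_id, prev_lock, self_lock, out_queue):
    result = plane_id * plane_id  # stand-in for the real 2D filtering work
    prev_lock.acquire()           # wait until the previous plane was forwarded
    out_queue.put((plane_id, result))
    self_lock.release()           # unblock the next plane's worker


if __name__ == "__main__":
    out_queue = MultiprocessingQueue()
    prev_lock = Lock()
    for plane_id in range(4):
        lock = Lock()
        lock.acquire()  # held until this plane's worker releases it
        multiprocessing.Process(
            target=ordered_worker,
            args=(plane_id, prev_lock, lock, out_queue),
        ).start()
        prev_lock = lock
    for _ in range(4):
        # hand-off to the queue was serialized by the lock chain
        print(out_queue.get())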