Example #1
    def __init__(self, view_function):
        self._queue = MultiprocessingQueue()
        self._paused = False
        self._terminated = False
        self._coroutine = None
        self._proc = None
        # The updater is implementation-specific and must be provided by
        # the subclass by calling self._set_updater
        self._updater = None
        self.view_function = view_function
def main():
    server_queue = MultiprocessingQueue()
    server = start_server_process(server_queue)
    client = start_client()

    testers = [
        algorithm_tester('async coalescing tester',
                         test_asyncio_coalesce,
                         use_tcp_nodelay=True),
        algorithm_tester('async no_delay tester',
                         test_asyncio_no_coalesce,
                         use_tcp_nodelay=True),
        algorithm_tester('async nagle tester',
                         test_asyncio_no_coalesce,
                         use_tcp_nodelay=False),
        algorithm_tester('threaded coalescing tester',
                         test_threaded_coalesce,
                         use_tcp_nodelay=True),
        algorithm_tester('threaded no_delay tester',
                         test_threaded_no_coalesce,
                         use_tcp_nodelay=True),
        algorithm_tester('threaded nagle tester',
                         test_threaded_no_coalesce,
                         use_tcp_nodelay=False),
        algorithm_tester('simple coalescing',
                         test_simple_coalescing,
                         use_tcp_nodelay=True),
        algorithm_tester('simple no_delay', test_simple, use_tcp_nodelay=True),
        algorithm_tester('simple nagle', test_simple, use_tcp_nodelay=False),
    ]

    stats = []
    try:
        for tester in testers:
            assert server_queue.empty()
            tester_stats = execute_test(tester, client, server_queue)
            stats.append(tester_stats)
    finally:
        client.close()
        server.kill()
    display_stats(stats)
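
Note: the harness above hands one MultiprocessingQueue to the server process and asserts it is empty before each tester run. Below is a minimal standard-library sketch of that cross-process hand-off; the project helpers (start_server_process, start_client, algorithm_tester, execute_test, display_stats) are not reproduced, and MultiprocessingQueue is assumed to be multiprocessing.Queue, as the import alias suggests.

import multiprocessing
from multiprocessing import Queue as MultiprocessingQueue


def _server(queue):
    # Hypothetical stand-in for the server process: it reports a result
    # back to the parent through the shared queue.
    queue.put("stats-for-one-run")


if __name__ == "__main__":
    server_queue = MultiprocessingQueue()
    server = multiprocessing.Process(target=_server, args=(server_queue,))
    server.start()
    print(server_queue.get())    # blocks until the child has put its result
    server.join()
    assert server_queue.empty()  # mirrors the per-tester check in main() above
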
Example #3
    def __init__(self,
                 project_id: str,
                 subscription: str,
                 agg_function: AggregationFunction,
                 agg_window_millis: int,
                 auth_file: str = None,
                 spark_opts: dict = {},
                 multiprocessing=False) -> None:
        """
        SparkPubSubStreamConsumer constructor

        :param project_id:          the project id
        :param subscription:        the subscription name
        :param agg_function:        aggregation function to apply
        :param agg_window_millis:   aggregation window in milliseconds
        :param auth_file:           path to credentials json file
        :param spark_opts:          spark options dict
        :param multiprocessing:     use multiprocessing instead of threading
        """
        super().__init__(agg_function, agg_window_millis)
        self.project_id = project_id
        self.subscription = subscription
        self.spark_opts = spark_opts
        self.subscribed = True
        self.multiprocessing = multiprocessing
        if self.multiprocessing:
            self.queue = MultiprocessingQueue()
        else:
            self.queue = Queue()

        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = auth_file

        def run_spark_job(queue: Queue,
                          _agg_function: AggregationFunction,
                          _agg_window_millis: int,
                          _spark_opts: dict = {},
                          _environment: dict = {}):
            os.environ.update(_environment)
            try:
                try:
                    import findspark
                    findspark.init()
                except Exception as ex:
                    self.logger.warn("Cannot initialize pyspark with"
                                     " findspark. Message: {}".format(str(ex)))

                from pyspark.sql import SparkSession
                from pyspark.streaming import StreamingContext
                from pyspark.sql.functions import expr, window
                from pyspark.serializers import NoOpSerializer
                from pyspark.streaming import DStream
                from pyspark.streaming.kafka import utf8_decoder

                spark_builder = SparkSession.builder

                for k in _spark_opts:
                    spark_builder = spark_builder.config(k, _spark_opts[k])

                spark_builder = spark_builder \
                    .appName(str(self)) \
                    .config("spark.jars.packages",
                            "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                            "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
                    .config("spark.jars",
                            BASE_PATH + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")

                spark = spark_builder.getOrCreate()
                spark.sparkContext.setLogLevel("WARN")
                ssc = StreamingContext(spark.sparkContext,
                                       _agg_window_millis / 1000)

                agg = expr("value")
                if _agg_function == AggregationFunction.AVG:
                    agg = expr("avg(value)")
                elif _agg_function == AggregationFunction.SUM:
                    agg = expr("sum(value)")
                elif _agg_function == AggregationFunction.COUNT:
                    agg = expr("count(value)")
                elif _agg_function == AggregationFunction.P50:
                    agg = expr("percentile(value, 0.5)")
                elif _agg_function == AggregationFunction.P75:
                    agg = expr("percentile(value, 0.75)")
                elif _agg_function == AggregationFunction.P95:
                    agg = expr("percentile(value, 0.95)")
                elif _agg_function == AggregationFunction.P99:
                    agg = expr("percentile(value, 0.99)")

                deserializer = \
                    ssc._jvm.org.apache.spark.streaming.pubsub.SparkPubsubMessageSerializer()  # noqa: E501
                pubsub_utils = \
                    ssc._jvm.org.apache.spark.streaming.pubsub.PubsubUtils
                credentials = \
                    ssc._jvm.org.apache.spark.streaming.pubsub.SparkGCPCredentials
                storage_level = \
                    ssc._jvm.org.apache.spark.storage.StorageLevel

                _pubsub_stream = pubsub_utils \
                    .createStream(ssc._jssc,
                                  project_id,
                                  subscription,
                                  credentials.Builder().build(),
                                  storage_level.DISK_ONLY())
                _pubsub_stream_des = _pubsub_stream.transform(deserializer)
                ser = NoOpSerializer()
                pubsub_stream = DStream(_pubsub_stream_des, ssc,
                                        ser).map(utf8_decoder)

                def aggregate_rdd(_queue, _agg, df, ts):

                    secs = int(self.agg_window_millis / 1000)
                    win = window("ts", "{} seconds".format(secs))
                    if df.first():
                        aggs = df \
                            .groupBy("application", win) \
                            .agg(_agg.alias("value")) \
                            .collect()

                        for row in aggs:
                            message = InputMessage(row["application"],
                                                   value=row["value"],
                                                   ts=ts)
                            self.logger.debug("Enqueue: {}".format(
                                message.to_json()))
                            try:
                                _queue.put(message.to_json())
                            except AssertionError as ex:
                                self.logger.warn(str(ex))
                    else:
                        self.logger.warn("Empty RDD")

                # Process the Pub/Sub stream
                pubsub_stream \
                    .foreachRDD(lambda ts, rdd:
                                aggregate_rdd(queue, agg,
                                              spark.read.json(rdd), ts))

                # Run
                ssc.start()
                if "timeout" in _spark_opts:
                    ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
                    ssc.stop()
                    spark.stop()
                else:
                    ssc.awaitTermination()
                    ssc.stop()
                    spark.stop()

            except Exception as e:
                raise e

        # Run in multiprocessing, each aggregation runs a spark driver.
        runner = Concurrency.run_process \
            if self.multiprocessing \
            else Concurrency.run_thread

        Concurrency.get_lock("spark").acquire()
        pid = runner(target=run_spark_job,
                     args=(self.queue, self.agg_function,
                           self.agg_window_millis, self.spark_opts,
                           os.environ.copy()),
                     name="PySpark {}".format(str(self)))
        Concurrency.schedule_release("spark", 30)
        self.pid = pid
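
Note: the constructor above selects MultiprocessingQueue when multiprocessing=True and a plain Queue otherwise, because run_spark_job may execute in a separate process (via the project-specific Concurrency.run_process, not shown here). A threading queue.Queue cannot carry items back across a process boundary, while multiprocessing.Queue can; a minimal standard-library sketch of that difference, assuming nothing from the project itself:

import multiprocessing
import queue
import threading


def producer(q):
    q.put("aggregated-window")


if __name__ == "__main__":
    # Same-process consumer: a plain queue.Queue is enough for a thread.
    q1 = queue.Queue()
    t = threading.Thread(target=producer, args=(q1,))
    t.start()
    t.join()
    print(q1.get_nowait())    # -> "aggregated-window"

    # Cross-process consumer: only multiprocessing.Queue moves the item
    # from the child process back into the parent.
    q2 = multiprocessing.Queue()
    p = multiprocessing.Process(target=producer, args=(q2,))
    p.start()
    print(q2.get(timeout=5))  # -> "aggregated-window"
    p.join()
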
Example #4
    def __init__(self,
                 broker_servers: str,
                 input_topic: str,
                 group_id: str,
                 agg_function: AggregationFunction,
                 agg_window_millis: int,
                 spark_opts: dict = {},
                 multiprocessing=True) -> None:
        """
        SparkKafkaStreamConsumer constructor

        :param broker_servers:      broker servers
        :param input_topic:         input topic
        :param group_id:            consumer group id
        :param agg_function:        aggregation function to apply
        :param agg_window_millis:   aggregation window in milliseconds
        :param spark_opts:          spark options dict
        :param multiprocessing:     use multiprocessing instead of threading
        """
        super().__init__(agg_function, agg_window_millis)
        self.broker_servers = broker_servers.split(",")
        self.input_topic = input_topic
        self.group_id = group_id
        self.spark_opts = spark_opts
        self.subscribed = True
        self.multiprocessing = multiprocessing
        if self.multiprocessing:
            self.queue = MultiprocessingQueue()
        else:
            self.queue = Queue()

        def run_spark_job(queue: Queue,
                          _agg_function: AggregationFunction,
                          _agg_window_millis: int,
                          _spark_opts: dict = {},
                          _environment: dict = {}):
            os.environ.update(_environment)
            try:
                try:
                    import findspark
                    findspark.init()
                except Exception as ex:
                    self.logger.warn("Cannot initialize pyspark with"
                                     " findspark. Message: {}".format(str(ex)))

                from pyspark.sql import SparkSession
                from pyspark.streaming import StreamingContext
                from pyspark.streaming.kafka import KafkaUtils
                from pyspark.sql.functions import expr, window

                spark_builder = SparkSession.builder

                for k in _spark_opts:
                    spark_builder = spark_builder.config(k, _spark_opts[k])

                spark_builder = spark_builder \
                    .appName(str(self)) \
                    .config("spark.jars.packages",
                            "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                            "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
                    .config("spark.jars",
                            BASE_PATH + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")

                spark = spark_builder.getOrCreate()
                spark.sparkContext.setLogLevel("WARN")
                ssc = StreamingContext(spark.sparkContext,
                                       _agg_window_millis / 1000)

                agg = expr("value")
                if _agg_function == AggregationFunction.AVG:
                    agg = expr("avg(value)")
                elif _agg_function == AggregationFunction.SUM:
                    agg = expr("sum(value)")
                elif _agg_function == AggregationFunction.COUNT:
                    agg = expr("count(value)")
                elif _agg_function == AggregationFunction.P50:
                    agg = expr("percentile(value, 0.5)")
                elif _agg_function == AggregationFunction.P75:
                    agg = expr("percentile(value, 0.75)")
                elif _agg_function == AggregationFunction.P95:
                    agg = expr("percentile(value, 0.95)")
                elif _agg_function == AggregationFunction.P99:
                    agg = expr("percentile(value, 0.99)")

                kafka_stream = KafkaUtils.createDirectStream(
                    ssc, [self.input_topic],
                    {"metadata.broker.list": ",".join(self.broker_servers)})

                def aggregate_rdd(_queue, _agg, df, ts):

                    secs = int(self.agg_window_millis / 1000)
                    win = window("ts", "{} seconds".format(secs))
                    if df.first():
                        aggs = df \
                            .groupBy("application", win) \
                            .agg(_agg.alias("value")) \
                            .collect()

                        for row in aggs:
                            message = InputMessage(row["application"],
                                                   value=row["value"],
                                                   ts=ts)
                            self.logger.debug("Enqueue: {}".format(
                                message.to_json()))
                            try:
                                _queue.put(message.to_json())
                            except AssertionError as ex:
                                self.logger.warn(str(ex))
                    else:
                        warnings.warn("Empty RDD")

                # Process the Kafka stream
                kafka_stream \
                    .map(lambda x: x[1]) \
                    .foreachRDD(lambda ts, rdd:
                                aggregate_rdd(queue, agg,
                                              spark.read.json(rdd), ts))

                # Run
                ssc.start()
                if "timeout" in _spark_opts:
                    ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
                    ssc.stop()
                    spark.stop()
                else:
                    ssc.awaitTermination()
                    ssc.stop()
                    spark.stop()

            except Exception as e:
                raise e

        # Run in multiprocessing, each aggregation runs a spark driver.
        runner = Concurrency.run_process \
            if self.multiprocessing \
            else Concurrency.run_thread

        Concurrency.get_lock("spark").acquire()
        pid = runner(target=run_spark_job,
                     args=(self.queue, self.agg_function,
                           self.agg_window_millis, self.spark_opts,
                           os.environ.copy()),
                     name="PySpark {}".format(str(self)))
        Concurrency.schedule_release("spark", 30)
        self.pid = pid
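
Note: both Spark consumers above translate AggregationFunction into a Spark SQL expression with the same if/elif chain. The sketch below is a table-driven equivalent; it uses only the enum members that appear in the examples (AggregationFunction is the project enum referenced in both constructors) and falls back to the raw value expression, matching the default above.

from pyspark.sql.functions import expr

# Mapping mirrors the if/elif chain in the two consumers above.
AGG_EXPRESSIONS = {
    AggregationFunction.AVG: "avg(value)",
    AggregationFunction.SUM: "sum(value)",
    AggregationFunction.COUNT: "count(value)",
    AggregationFunction.P50: "percentile(value, 0.5)",
    AggregationFunction.P75: "percentile(value, 0.75)",
    AggregationFunction.P95: "percentile(value, 0.95)",
    AggregationFunction.P99: "percentile(value, 0.99)",
}


def build_agg_expr(agg_function):
    # Unmapped members fall back to the plain value column, as in the examples.
    return expr(AGG_EXPRESSIONS.get(agg_function, "value"))
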
Example #5
def main(args):
    n_processes = get_num_processes(min_free_cpu_cores=args.n_free_cpus)
    start_time = datetime.now()

    (
        soma_diameter,
        max_cluster_size,
        ball_xy_size,
        ball_z_size,
    ) = calculate_parameters_in_pixels(
        args.x_pixel_um,
        args.y_pixel_um,
        args.z_pixel_um,
        args.soma_diameter,
        args.max_cluster_size,
        args.ball_xy_size,
        args.ball_z_size,
    )

    # file extension only used if a directory is passed
    img_paths = get_sorted_file_paths(args.signal_planes_paths[0],
                                      file_extension="tif")

    if args.end_plane == -1:
        args.end_plane = len(img_paths)
    planes_paths_range = img_paths[args.start_plane:args.end_plane]

    workers_queue = MultiprocessingQueue(maxsize=n_processes)
    # WARNING: needs to be AT LEAST ball_z_size
    mp_3d_filter_queue = MultiprocessingQueue(maxsize=ball_z_size)
    for plane_id in range(n_processes):
        # placeholder so the queue has the right size on the first run
        workers_queue.put(None)

    clipping_val, threshold_value, ball_filter, cell_detector = setup(
        img_paths[0],
        soma_diameter,
        ball_xy_size,
        ball_z_size,
        ball_overlap_fraction=args.ball_overlap_fraction,
        z_offset=args.start_plane,
    )

    progress_bar = tqdm(total=len(planes_paths_range),
                        desc="Processing planes")
    mp_3d_filter = Mp3DFilter(
        mp_3d_filter_queue,
        ball_filter,
        cell_detector,
        soma_diameter,
        args.output_dir,
        soma_size_spread_factor=args.soma_spread_factor,
        progress_bar=progress_bar,
        save_planes=args.save_planes,
        plane_directory=args.plane_directory,
        start_plane=args.start_plane,
        max_cluster_size=max_cluster_size,
        outlier_keep=args.outlier_keep,
        artifact_keep=args.artifact_keep,
        save_csv=args.save_csv,
    )

    # start 3D analysis (waits for planes in queue)
    bf_process = multiprocessing.Process(target=mp_3d_filter.process, args=())
    bf_process.start()  # needs to be started before the loop

    mp_tile_processor = MpTileProcessor(workers_queue, mp_3d_filter_queue)
    prev_lock = Lock()
    processes = []

    # start 2D tile filter (output goes into queue for 3D analysis)
    for plane_id, path in enumerate(planes_paths_range):
        workers_queue.get()
        lock = Lock()
        lock.acquire()
        p = multiprocessing.Process(
            target=mp_tile_processor.process,
            args=(
                plane_id,
                path,
                prev_lock,
                lock,
                clipping_val,
                threshold_value,
                soma_diameter,
                args.log_sigma_size,
                args.n_sds_above_mean_thresh,
            ),
        )
        prev_lock = lock
        processes.append(p)
        p.start()

    processes[-1].join()
    mp_3d_filter_queue.put((None, None, None))  # Signal the end
    bf_process.join()

    logging.info("Detection complete - all planes done in: {}".format(
        datetime.now() - start_time))
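
Note: in the tile-filter loop above, every worker process receives the previous plane's lock together with its own lock, which the parent acquires before starting it. The apparent intent is that each worker waits on the previous lock before pushing its plane into mp_3d_filter_queue and then releases its own, so planes reach the 3D filter in order. MpTileProcessor.process is not shown in the example, so the worker body below is an illustrative sketch of that chained-lock pattern rather than the project's code.

import multiprocessing
from multiprocessing import Lock, Queue as MultiprocessingQueue


def worker(plane_id, prev_lock, self_lock, out_queue):
    result = plane_id * plane_id       # stand-in for the real 2D filtering
    prev_lock.acquire()                # wait until the previous plane is queued
    out_queue.put((plane_id, result))  # results therefore arrive in plane order
    self_lock.release()                # unblock the next plane's worker


if __name__ == "__main__":
    out_queue = MultiprocessingQueue()
    prev_lock = Lock()                 # unlocked, so plane 0 proceeds at once
    processes = []
    for plane_id in range(4):
        lock = Lock()
        lock.acquire()                 # the next worker will block on this
        p = multiprocessing.Process(
            target=worker, args=(plane_id, prev_lock, lock, out_queue))
        p.start()
        processes.append(p)
        prev_lock = lock
    for _ in processes:
        print(out_queue.get())         # (0, 0), (1, 1), (2, 4), (3, 9)
    for p in processes:
        p.join()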