コード例 #1
0
ファイル: stage.py プロジェクト: ondrocks/pypeln
 def __init__(
     self,
     f,
     workers,
     maxsize,
     on_start,
     on_done,
     dependencies,
     timeout,
 ):
     """
     Record the stage configuration and reset every build-time field.

     The constructor arguments are stored unchanged on the instance. The
     argument names accepted by `f`, `on_start` and `on_done` are looked up
     once via `pypeln_utils.function_args`; a falsy callable yields an
     empty set instead.
     """

     def accepted_names(fn):
         # A missing (falsy) callable accepts no injectable arguments.
         if fn:
             return pypeln_utils.function_args(fn)
         return set()

     # Configuration handed in by the caller.
     self.f, self.workers, self.maxsize = f, workers, maxsize
     self.on_start, self.on_done = on_start, on_done
     self.timeout, self.dependencies = timeout, dependencies

     # Fresh output queue collection plus the cached argument names.
     self.output_queues = utils.MultiQueue()
     self.f_args = accepted_names(self.f)
     self.on_start_args = accepted_names(self.on_start)
     self.on_done_args = accepted_names(self.on_done)

     # Build fields: all start out as None here.
     self.input_queue = None
     self.stage_namespace = None
     self.stage_lock = None
     self.pipeline_namespace = None
     self.pipeline_error_queue = None
     self.pipeline_stages = None
     self.loop = None
コード例 #2
0
ファイル: worker.py プロジェクト: maybeee18/pypeln
    def __call__(self):
        """
        Run the worker lifecycle: `on_start`, the processing function, then
        `on_done`.

        Keyword arguments are injected by name: each callable only receives
        the entries whose names it declares. `StopThreadException` is
        swallowed, any other exception is forwarded to `main_queue`, and the
        worker is always marked as done afterwards.
        """
        info = WorkerInfo(index=self.index)

        start_names: tp.List[str] = []
        if self.on_start:
            start_names = pypeln_utils.function_args(self.on_start)

        done_names: tp.List[str] = []
        if self.on_done:
            done_names = pypeln_utils.function_args(self.on_done)

        def pick(available, wanted):
            # Keep only the entries whose name the target callable accepts.
            return {
                name: item
                for name, item in available.items() if name in wanted
            }

        try:
            if self.on_start is None:
                kwargs = {}
            else:
                kwargs = self.on_start(
                    **pick(dict(worker_info=info), start_names))

            # `on_start` may return nothing at all.
            if kwargs is None:
                kwargs = {}

            kwargs.setdefault("worker_info", info)

            self.process_fn(self, **pick(kwargs, self.f_args))

            self.stage_params.worker_done()

            if self.on_done is not None:
                status = StageStatus(
                    namespace=self.stage_params.namespace,
                    lock=self.stage_params.lock,
                )
                kwargs.setdefault("stage_status", status)

                self.on_done(**pick(kwargs, done_names))

        except pypeln_utils.StopThreadException:
            pass
        except BaseException as e:
            self.main_queue.raise_exception(e)
            time.sleep(0.01)
        finally:
            self.namespace.done = True
            self.stage_params.output_queues.done()
コード例 #3
0
    def __init__(self, f, on_start, on_done, dependencies, timeout):
        """
        Store the stage callables and settings, and cache the argument names
        each callable accepts (an empty set when the callable is falsy).
        """
        self.f, self.on_start, self.on_done = f, on_start, on_done
        self.timeout, self.dependencies = timeout, dependencies

        # Argument names of the main function.
        if self.f:
            self.f_args = pypeln_utils.function_args(self.f)
        else:
            self.f_args = set()

        # Argument names of the start hook.
        if self.on_start:
            self.on_start_args = pypeln_utils.function_args(self.on_start)
        else:
            self.on_start_args = set()

        # Argument names of the done hook.
        if self.on_done:
            self.on_done_args = pypeln_utils.function_args(self.on_done)
        else:
            self.on_done_args = set()
コード例 #4
0
    def run(self) -> tp.Iterable:
        """
        Generator that drives the stage: call `on_start`, yield everything
        produced by `process_fn`, then call `on_done`.

        Keyword arguments are injected by name, so each callable only
        receives the entries whose names it declares.
        """
        info = WorkerInfo(index=0)

        start_names: tp.List[str] = []
        if self.on_start:
            start_names = pypeln_utils.function_args(self.on_start)

        done_names: tp.List[str] = []
        if self.on_done:
            done_names = pypeln_utils.function_args(self.on_done)

        def pick(available, wanted):
            # Keep only the entries whose name the target callable accepts.
            return {
                name: item
                for name, item in available.items() if name in wanted
            }

        if self.on_start is None:
            kwargs = {}
        else:
            kwargs = self.on_start(**pick(dict(worker_info=info), start_names))

        # `on_start` may return nothing at all.
        if kwargs is None:
            kwargs = {}

        kwargs.setdefault("worker_info", info)

        yield from self.process_fn(self, **pick(kwargs, self.f_args))

        if self.on_done is not None:
            kwargs.setdefault("stage_status", StageStatus())
            self.on_done(**pick(kwargs, done_names))
コード例 #5
0
def flat_map(
    f: FlatMapFn,
    stage: tp.Union[Stage[A], tp.Iterable[A],
                    pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
) -> tp.Union[Stage[B], pypeln_utils.Partial[Stage[B]]]:
    """
    Creates a stage that applies `f` to every element of the data, where `f` returns an iterable instead of a single value (unlike `pypeln.process.map`). The iterables are flattened, so the resulting stage contains their individual elements.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_integer_pair(x):
        time.sleep(random()) # <= some slow computation

        if x == 0:
            yield x
        else:
            yield x
            yield -x

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.thread.flat_map(slow_integer_pair, data, workers=3, maxsize=4)

    list(stage) # e.g. [2, -2, 3, -3, 0, 1, -1, 6, -6, 4, -4, ...]
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    `flat_map` is the more general operation: `pypeln.process.map` and `pypeln.process.filter` can both be expressed with it, for example:

    ```python
    import pypeln as pl

    # pl.thread.map(f, stage) is equivalent to
    pl.thread.flat_map(lambda x: [f(x)], stage)

    # pl.thread.filter(f, stage) is equivalent to
    pl.thread.flat_map(lambda x: [x] if f(x) else [], stage)
    ```

    Passing a generator function to `flat_map` is handy when unwanted elements must be dropped, e.g. on exceptions or missing data.

    Arguments:
        f: A function with signature `f(x) -> iterable`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold simultaneously, if set to `0` (default) then the stage can grow unbounded.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    Returns:
        A `Stage` if the `stage` parameter is given, otherwise a `Partial`.
    """

    # Options shared by the deferred call and by the Stage constructor.
    options = dict(
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        on_start=on_start,
        on_done=on_done,
    )

    if isinstance(stage, pypeln_utils.Undefined):
        # No input yet: defer until a stage/iterable is supplied.
        return pypeln_utils.Partial(
            lambda stage: flat_map(f, stage=stage, **options))

    upstream = to_stage(stage, maxsize=maxsize)

    return Stage(
        process_fn=FlatMap(f),
        total_sources=upstream.workers,
        dependencies=[upstream],
        f_args=pypeln_utils.function_args(f),
        **options,
    )
コード例 #6
0
def each(
    f: EachFn,
    stage: tp.Union[
        Stage[A], tp.Iterable[A], pypeln_utils.Undefined
    ] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
    run: bool = False,
) -> tp.Union[tp.Optional[Stage[None]], pypeln_utils.Partial[tp.Optional[Stage[None]]]]:
    """
    Creates a stage that runs the function `f` for each element in the data but the stage itself yields no elements. It is useful for sink stages that perform certain actions such as writing to disk, saving to a database, etc., and don't produce any results. For example:

    ```python
    import pypeln as pl

    def process_image(image_path):
        image = load_image(image_path)
        image = transform_image(image)
        save_image(image_path, image)

    file_paths = get_file_paths()
    stage = pl.sync.each(process_image, file_paths, workers=4)
    pl.sync.run(stage)

    ```
    or alternatively

    ```python
    file_paths = get_file_paths()
    pl.sync.each(process_image, file_paths, workers=4, run=True)
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `f(x) -> None`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: Not passed on to the created stage; only kept for API compatibility with the other modules.
        maxsize: Only forwarded to `to_stage` when wrapping the input; otherwise kept for API compatibility with the other modules.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        run: Whether or not to execute the stage immediately. The flag is also honoured when the call is deferred through a `Partial`.

    !!! warning
        To implement `timeout` we use `stopit.ThreadingTimeout` which has some limitations.

    Returns:
        If the `stage` parameter is not given then this function returns a `Partial`, else if `run=False` (default) it returns a new stage, if `run=True` then it runs the stage and returns `None`.
    """

    if isinstance(stage, pypeln_utils.Undefined):
        return pypeln_utils.Partial(
            lambda stage: each(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
                # Forward `run`; it used to be dropped here, so the deferred
                # form silently ignored `run=True`.
                run=run,
            )
        )

    stage_ = to_stage(stage, maxsize=maxsize)

    stage_ = Stage(
        process_fn=Each(f),
        timeout=timeout,
        dependencies=[stage_],
        on_start=on_start,
        on_done=on_done,
        f_args=pypeln_utils.function_args(f),
    )

    if not run:
        return stage_

    # Drain the stage so that `f` is applied to every element.
    for _ in stage_:
        pass

    return None
コード例 #7
0
ファイル: filter.py プロジェクト: maybeee18/pypeln
def filter(
    f: FilterFn,
    stage: tp.Union[Stage[A], tp.Iterable[A], tp.Iterable[A],
                    pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
) -> tp.Union[Stage[B], pypeln_utils.Partial[Stage[B]]]:
    """
    Creates a stage that keeps only the elements for which the predicate `f` holds. It is meant to behave like python's built-in `filter` function, with concurrency added.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_gt3(x):
        time.sleep(random()) # <= some slow computation
        return x > 3

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.thread.filter(slow_gt3, data, workers=3, maxsize=4)

    data = list(stage) # e.g. [5, 6, 4, 7, 8, 9]
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `f(x) -> bool`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold simultaneously, if set to `0` (default) then the stage can grow unbounded.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    Returns:
        A `Stage` if the `stage` parameter is given, otherwise a `Partial`.
    """

    # Options shared by the deferred call and by the Stage constructor.
    options = dict(
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        on_start=on_start,
        on_done=on_done,
    )

    if isinstance(stage, pypeln_utils.Undefined):
        # No input yet: defer until a stage/iterable is supplied.
        return pypeln_utils.Partial(
            lambda stage: filter(f, stage=stage, **options))

    upstream = to_stage(stage)

    return Stage(
        process_fn=Filter(f),
        total_sources=upstream.workers,
        dependencies=[upstream],
        f_args=pypeln_utils.function_args(f),
        **options,
    )
コード例 #8
0
def filter(
    f: FilterFn,
    stage: tp.Union[
        Stage[A], tp.Iterable[A], tp.Iterable[A], pypeln_utils.Undefined
    ] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
) -> tp.Union[Stage[B], pypeln_utils.Partial[Stage[B]]]:
    """
    Creates a stage that keeps only the elements for which the predicate `f` holds, exactly like python's built-in `filter` function.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_gt3(x):
        time.sleep(random()) # <= some slow computation
        return x > 3

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.sync.filter(slow_gt3, data, workers=3, maxsize=4)

    data = list(stage) # [4, 5, 6, ..., 9]
    ```

    Arguments:
        f: A function with signature `f(x) -> bool`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: This parameter is not used and only kept for API compatibility with the other modules.
        maxsize: This parameter is not used and only kept for API compatibility with the other modules.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    !!! warning
        To implement `timeout` we use `stopit.ThreadingTimeout` which has some limitations.

    Returns:
        A `Stage` if the `stage` parameter is given, otherwise a `Partial`.
    """

    if not isinstance(stage, pypeln_utils.Undefined):
        # Input is available: wrap it and build the filtering stage now.
        upstream = to_stage(stage)

        return Stage(
            process_fn=Filter(f),
            timeout=timeout,
            dependencies=[upstream],
            on_start=on_start,
            on_done=on_done,
            f_args=pypeln_utils.function_args(f),
        )

    def bind(stage):
        # Deferred call: same options, with the input supplied later.
        return filter(
            f,
            stage=stage,
            workers=workers,
            maxsize=maxsize,
            timeout=timeout,
            on_start=on_start,
            on_done=on_done,
        )

    return pypeln_utils.Partial(bind)
コード例 #9
0
def each(
    f: EachFn,
    stage: tp.Union[Stage[A], tp.Iterable[A], tp.AsyncIterable[A],
                    pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
    run: bool = False,
) -> tp.Union[tp.Optional[Stage[B]],
              pypeln_utils.Partial[tp.Optional[Stage[B]]]]:
    """
    Creates a stage that runs the function `f` for each element in the data but the stage itself yields no elements. It is useful for sink stages that perform certain actions such as writing to disk, saving to a database, etc., and don't produce any results. For example:

    ```python
    import pypeln as pl

    def process_image(image_path):
        image = load_image(image_path)
        image = transform_image(image)
        save_image(image_path, image)

    file_paths = get_file_paths()
    stage = pl.process.each(process_image, file_paths, workers=4)
    pl.process.run(stage)

    ```

    or alternatively

    ```python
    file_paths = get_file_paths()
    pl.process.each(process_image, file_paths, workers=4, run=True)
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `f(x) -> None`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage, Iterable or AsyncIterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold simultaneously, if set to `0` (default) then the stage can grow unbounded.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        run: Whether or not to execute the stage immediately. The flag is also honoured when the call is deferred through a `Partial`. If each is running inside another coroutine / task then avoid using `run=True` since it will block the event loop, use `await pl.task.each(...)` instead.

    Returns:
        If the `stage` parameter is not given then this function returns a `Partial`, else if `run=False` (default) it returns a new stage, if `run=True` then it runs the stage and returns `None`.
    """

    if isinstance(stage, pypeln_utils.Undefined):
        return pypeln_utils.Partial(lambda stage: each(
            f,
            stage=stage,
            workers=workers,
            maxsize=maxsize,
            timeout=timeout,
            on_start=on_start,
            on_done=on_done,
            # Forward `run`; it used to be dropped here, so the deferred
            # form silently ignored `run=True`.
            run=run,
        ))

    stage = to_stage(stage, maxsize=maxsize)

    stage = Stage(
        process_fn=Each(f),
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        total_sources=1,
        dependencies=[stage],
        on_start=on_start,
        on_done=on_done,
        f_args=pypeln_utils.function_args(f),
    )

    if not run:
        return stage

    # Drain the stage so that `f` is applied to every element.
    for _ in stage:
        pass

    return None
コード例 #10
0
    async def __call__(self):
        """
        Run the asynchronous worker lifecycle: `on_start`, the processing
        function, then `on_done`.

        `on_start` and `on_done` may be plain callables or return awaitables;
        awaitable results are awaited. Keyword arguments are injected by
        name, so each callable only receives the entries whose names it
        declares. `asyncio.CancelledError` is swallowed, any other exception
        is forwarded to `main_queue`, and the worker is always flagged as
        done with its task pool stopped.
        """
        info = WorkerInfo(index=0)

        start_names: tp.List[str] = []
        if self.on_start:
            start_names = pypeln_utils.function_args(self.on_start)

        done_names: tp.List[str] = []
        if self.on_done:
            done_names = pypeln_utils.function_args(self.on_done)

        def pick(available, wanted):
            # Keep only the entries whose name the target callable accepts.
            return {
                name: item
                for name, item in available.items() if name in wanted
            }

        try:
            if self.on_start is None:
                kwargs = {}
            else:
                kwargs = self.on_start(
                    **pick(dict(worker_info=info), start_names))
                if isinstance(kwargs, tp.Awaitable):
                    kwargs = await kwargs

            # `on_start` may return nothing at all.
            if kwargs is None:
                kwargs = {}

            kwargs.setdefault("worker_info", info)

            async with self.tasks:
                await self.process_fn(self, **pick(kwargs, self.f_args))

            self.stage_params.worker_done()

            if self.on_done is not None:
                kwargs.setdefault("stage_status", StageStatus())

                outcome = self.on_done(**pick(kwargs, done_names))

                if isinstance(outcome, tp.Awaitable):
                    await outcome

            await self.stage_params.output_queues.worker_done()

        except asyncio.CancelledError:
            pass
        except BaseException as e:
            await self.main_queue.raise_exception(e)
        finally:
            self.is_done = True
            self.tasks.stop()