def from_iterable(
    iterable: typing.Iterable = pypeln_utils.UNDEFINED,
    maxsize: int = None,
    worker_constructor: typing.Type = Thread,
) -> Stage:
    """
    Creates a stage from an iterable.

    The `worker_constructor` parameter is meant to give more control over how
    the producer stage is created; it can be either:

    * `threading.Thread`: (default) is efficient for iterables that already have
        the data in memory like lists or numpy arrays because threads can share
        memory so no serialization is needed.
    * `multiprocessing.Process`: is efficient for iterables whose data is not in
        memory like arbitrary generators and benefit from escaping the GIL. This
        is inefficient for iterables which have data in memory because they have
        to be serialized when sent to the background process.

    Arguments:
        iterable: a source iterable.
        maxsize: this parameter is not used and only kept for API compatibility
            with the other modules.
        worker_constructor: defines the worker type for the producer stage.

    Returns:
        If the `iterable` parameter is given then this function returns a new
        stage, else it returns a `Partial`.
    """
    if pypeln_utils.is_undefined(iterable):
        # Forward `worker_constructor` so the deferred (pipe-style) call gets
        # the same arguments as a direct call would.
        return pypeln_utils.Partial(
            lambda iterable: from_iterable(
                iterable, maxsize=None, worker_constructor=worker_constructor
            )
        )

    # NOTE(review): `worker_constructor` is not handed to `FromIterable` here;
    # confirm whether `FromIterable` accepts it and should receive it.
    return FromIterable(
        iterable=iterable,
        f=None,
        workers=1,
        maxsize=0,
        timeout=0,
        on_start=None,
        on_done=None,
        dependencies=[],
    )
def from_iterable(
    iterable: tp.Union[tp.Iterable[T], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    use_thread: bool = True,
    maxsize: int = 0,
) -> tp.Union[Stage[T], pypeln_utils.Partial[Stage[T]]]:
    """
    Creates a stage from an iterable. This function gives you more control of
    how the iterable is consumed.

    Arguments:
        iterable: A source Iterable.
        use_thread: This parameter is not used and only kept for API
            compatibility with the other modules.
        maxsize: Passed to both `FromIterable` and the resulting `Stage`.
            Defaults to `0`.

    Returns:
        Returns a `Stage` if the `iterable` parameter is given, else it returns
        a `Partial`.
    """
    if isinstance(iterable, pypeln_utils.Undefined):
        # Forward every option so the deferred call behaves exactly like a
        # direct call (previously `maxsize` was dropped here).
        return pypeln_utils.Partial(
            lambda iterable: from_iterable(
                iterable, use_thread=use_thread, maxsize=maxsize
            )
        )

    return Stage(
        process_fn=FromIterable(iterable, maxsize=maxsize),
        workers=1,
        maxsize=maxsize,
        timeout=0,
        total_sources=1,
        dependencies=[],
        on_start=None,
        on_done=None,
        f_args=[],
    )
def to_iterable(
    stage: Stage = pypeln_utils.UNDEFINED, maxsize: int = 0, return_index: bool = False
) -> typing.Iterable:
    """
    Turn a stage into a plain iterable.

    Arguments:
        stage: A stage object. Anything that is not a `Stage` is handed back
            untouched.
        maxsize: The maximum number of objects the stage can hold
            simultaneously, if set to `0` (default) then the stage can grow
            unbounded.
        return_index: Forwarded to `Stage.to_iterable`.

    Returns:
        An iterable when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if pypeln_utils.is_undefined(stage):

        def _deferred(stage):
            return to_iterable(stage, maxsize=maxsize, return_index=return_index)

        return pypeln_utils.Partial(_deferred)

    # Non-stage inputs are returned as they came in.
    if not isinstance(stage, Stage):
        return stage

    return stage.to_iterable(maxsize=maxsize, return_index=return_index)
def from_iterable(
    iterable: typing.Iterable = pypeln_utils.UNDEFINED,
    maxsize: int = None,
    worker_constructor: typing.Type = None,
) -> Stage:
    """
    Build a source stage out of an iterable.

    Arguments:
        iterable: a source iterable.
        maxsize: this parameter is not used and only kept for API compatibility
            with the other modules.
        worker_constructor: this parameter is not used and only kept for API
            compatibility with the other modules.

    Returns:
        A new stage when `iterable` is supplied, otherwise a `Partial` that
        waits for the iterable.
    """
    if pypeln_utils.is_undefined(iterable):

        def _deferred(iterable):
            return from_iterable(
                iterable, maxsize=None, worker_constructor=worker_constructor
            )

        return pypeln_utils.Partial(_deferred)

    stage_options = dict(
        iterable=iterable,
        f=None,
        timeout=0,
        on_start=None,
        on_done=None,
        dependencies=[],
    )
    return FromIterable(**stage_options)
def to_iterable(
    stage: tp.Union[Stage[A], tp.Iterable[A], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    maxsize: int = 0,
    return_index: bool = False,
) -> tp.Union[tp.Iterable[A], pypeln_utils.Partial[tp.Iterable[A]]]:
    """
    Turn a stage into a plain iterable.

    Useful when you want more control over how the output stage is consumed.

    Arguments:
        stage: A Stage or Iterable. Anything that is not a `Stage` is handed
            back untouched.
        maxsize: Kept for API compatibility with the other modules; the value
            is forwarded to `Stage.to_iterable`.
        return_index: When set to `True` the resulting iterable yields element
            objects holding both the value and the `index` that records the
            order of creation of the elements at the source.

    Returns:
        An iterable when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if isinstance(stage, pypeln_utils.Undefined):

        def _deferred(stage):
            return to_iterable(stage, maxsize=maxsize, return_index=return_index)

        return pypeln_utils.Partial(_deferred)

    # Non-stage inputs are returned as they came in.
    if not isinstance(stage, Stage):
        return stage

    return stage.to_iterable(maxsize=maxsize, return_index=return_index)
def from_iterable(
    iterable: tp.Union[tp.Iterable[T], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    use_thread: bool = True,
    maxsize: int = 0,
) -> tp.Union[Stage[T], pypeln_utils.Partial[Stage[T]]]:
    """
    Creates a stage from an iterable.

    Arguments:
        iterable: A source Iterable.
        use_thread: If set to `True` (default) it will use a thread instead of
            a process to consume the iterable. Threads start faster and share
            memory so the iterable is not serialized; however, if the iterable
            is going to perform slow computations it is better to use a
            process.
        maxsize: Passed to both `FromIterable` and the resulting `Stage`.
            Defaults to `0`.

    Returns:
        Returns a `Stage` if the `iterable` parameter is given, else it returns
        a `Partial`.
    """
    if isinstance(iterable, pypeln_utils.Undefined):
        # Forward every option so the deferred call behaves exactly like a
        # direct call (previously `maxsize` was dropped here).
        return pypeln_utils.Partial(
            lambda iterable: from_iterable(
                iterable, use_thread=use_thread, maxsize=maxsize
            )
        )

    return Stage(
        process_fn=FromIterable(iterable, maxsize=maxsize),
        workers=1,
        maxsize=maxsize,
        timeout=0,
        total_sources=1,
        dependencies=[],
        on_start=None,
        on_done=None,
        use_threads=use_thread,
        f_args=[],
    )
def to_iterable(
    stage: tp.Union[Stage[A], tp.Iterable[A], tp.AsyncIterable[A], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    maxsize: int = 0,
    return_index: bool = False,
) -> tp.Union[tp.Iterable[A], pypeln_utils.Partial[tp.Iterable[A]]]:
    """
    Creates an iterable from a stage. Use this function when you want to have
    more control over how the output stage is consumed; specifically, setting
    the `maxsize` argument can help you avoid OOM errors if the consumer is
    slow.

    Arguments:
        stage: A Stage, Iterable, or AsyncIterable.
        maxsize: The maximum number of objects the stage can hold
            simultaneously, if set to `0` (default) then the stage can grow
            unbounded.
        return_index: When set to `True` the resulting iterable yields element
            objects holding both the value and the `index` that records the
            order of creation of the elements at the source.

    Returns:
        If the `stage` parameter is given then this function returns an
        iterable, else it returns a `Partial`.
    """
    if isinstance(stage, pypeln_utils.Undefined):
        # Forward `return_index` too; it used to be lost in the deferred form.
        return pypeln_utils.Partial(
            lambda stage: to_iterable(
                stage, maxsize=maxsize, return_index=return_index
            )
        )

    if isinstance(stage, Stage):
        return stage.to_iterable(maxsize=maxsize, return_index=return_index)

    # `isinstance` rejects subscripted generics such as `tp.Iterable[A]` with a
    # TypeError, so the check must use the bare ABC alias.
    if isinstance(stage, tp.Iterable):
        return stage

    # Remaining inputs (e.g. async iterables) are wrapped in a source stage
    # first; `maxsize` is forwarded as `to_async_iterable` does.
    return from_iterable(stage, maxsize=maxsize).to_iterable(
        maxsize=maxsize, return_index=return_index
    )
def to_iterable(
    stage: Stage = pypeln_utils.UNDEFINED, maxsize: int = 0, return_index=False,
) -> typing.Iterable:
    """
    Turn a stage into a plain iterable.

    Arguments:
        stage: A stage object. Anything that is not a `Stage` is handed back
            untouched.
        maxsize: Kept for API compatibility with the other modules; the value
            is forwarded to `Stage.to_iterable`.
        return_index: Forwarded to `Stage.to_iterable`.

    Returns:
        An iterable when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if pypeln_utils.is_undefined(stage):

        def _deferred(stage):
            return to_iterable(stage, maxsize=maxsize, return_index=return_index)

        return pypeln_utils.Partial(_deferred)

    # Non-stage inputs are returned as they came in.
    if not isinstance(stage, Stage):
        return stage

    return stage.to_iterable(maxsize=maxsize, return_index=return_index)
def to_async_iterable(
    stage: tp.Union[Stage[A], tp.Iterable[A], tp.AsyncIterable[A], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    maxsize: int = 0,
    return_index: bool = False,
) -> tp.Union[tp.AsyncIterable[A], pypeln_utils.Partial[tp.AsyncIterable[A]]]:
    """
    Creates an async iterable from a stage.

    Arguments:
        stage: A Stage, Iterable, or AsyncIterable.
        maxsize: The maximum number of objects the stage can hold
            simultaneously, if set to `0` (default) then the stage can grow
            unbounded.
        return_index: Forwarded to `Stage.to_async_iterable`.

    Returns:
        If the `stage` parameter is given then this function returns an async
        iterable, else it returns a `Partial`.
    """
    if isinstance(stage, pypeln_utils.Undefined):
        # Forward `return_index` too; it used to be lost in the deferred form.
        return pypeln_utils.Partial(
            lambda stage: to_async_iterable(
                stage, maxsize=maxsize, return_index=return_index
            )
        )

    if isinstance(stage, Stage):
        return stage.to_async_iterable(maxsize=maxsize, return_index=return_index)

    # `isinstance` rejects subscripted generics such as `tp.AsyncIterable[A]`
    # with a TypeError, so the check must use the bare ABC alias.
    if isinstance(stage, tp.AsyncIterable):
        return stage

    # Remaining inputs (e.g. plain iterables) are wrapped in a source stage.
    return from_iterable(stage, maxsize=maxsize).to_async_iterable(
        maxsize=maxsize, return_index=return_index
    )
def ordered(
    stage: tp.Union[
        Stage[A], tp.Iterable[A], tp.AsyncIterable[A], pypeln_utils.Undefined
    ] = pypeln_utils.UNDEFINED,
    maxsize: int = 0,
) -> tp.Union[Stage[A], pypeln_utils.Partial[Stage[A]]]:
    """
    Creates a stage that sorts its elements based on their order of creation on
    the source iterable(s) of the pipeline.

    ```python
    import pypeln as pl
    import random
    import time

    def slow_squared(x):
        time.sleep(random.random())

        return x ** 2

    stage = range(5)
    stage = pl.process.map(slow_squared, stage, workers = 2)
    stage = pl.process.ordered(stage)

    print(list(stage)) # [0, 1, 4, 9, 16]
    ```

    !!! note
        `ordered` will work even if the previous stages are from different
        `pypeln` modules, but it may not work if you introduce an intermediate
        external iterable stage.

    !!! warning
        This stage will not yield until it accumulates all of the elements from
        the previous stage, use this only if all elements fit in memory.

    Arguments:
        stage: A Stage, Iterable, or AsyncIterable.
        maxsize: Forwarded to `to_stage` when converting the input into a
            stage. The stage returned here is created with `maxsize=0`.

    Returns:
        If the `stage` parameter is given then this function returns a new
        stage, else it returns a `Partial`.
    """
    if isinstance(stage, pypeln_utils.Undefined):
        # Forward `maxsize` so the deferred call matches a direct call
        # (previously it silently fell back to the default).
        return pypeln_utils.Partial(lambda stage: ordered(stage, maxsize=maxsize))

    stage = to_stage(stage, maxsize=maxsize)

    return Stage(
        process_fn=Ordered(),
        workers=1,
        maxsize=0,
        timeout=0,
        total_sources=1,
        dependencies=[stage],
        on_start=None,
        on_done=None,
        f_args=[],
    )
def ordered(
    stage: tp.Union[
        Stage[A], tp.Iterable[A], pypeln_utils.Undefined
    ] = pypeln_utils.UNDEFINED,
) -> tp.Union[Stage[A], pypeln_utils.Partial[Stage[A]]]:
    """
    Build a stage that re-sorts elements by the order in which they were
    created at the pipeline's source iterable(s).

    ```python
    import pypeln as pl
    import random
    import time

    def slow_squared(x):
        time.sleep(random.random())

        return x ** 2

    stage = range(5)
    stage = pl.thread.map(slow_squared, stage, workers = 2)
    stage = pl.sync.ordered(stage)

    print(list(stage)) # [0, 1, 4, 9, 16]
    ```

    `thread.map` is used in the example because `sync.map` already preserves
    order, which would make the example pointless.

    !!! note
        `ordered` will work even if the previous stages are from different
        `pypeln` modules, but it may not work if you introduce an intermediate
        external iterable stage.

    !!! warning
        This stage will not yield until it accumulates all of the elements from
        the previous stage, use this only if all elements fit in memory.

    Arguments:
        stage: A Stage or Iterable.

    Returns:
        A new stage when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if isinstance(stage, pypeln_utils.Undefined):

        def _deferred(stage):
            return ordered(stage)

        return pypeln_utils.Partial(_deferred)

    upstream = to_stage(stage)

    stage_options = dict(
        process_fn=Ordered(),
        timeout=0,
        dependencies=[upstream],
        on_start=None,
        on_done=None,
        f_args=[],
    )
    return Stage(**stage_options)
def ordered(stage: Stage = pypeln_utils.UNDEFINED, maxsize: int = 0) -> Stage:
    """
    Build a stage that re-sorts elements by the order in which they were
    created at the pipeline's source iterable(s).

    ```python
    import pypeln as pl
    import random
    import time

    def slow_squared(x):
        time.sleep(random.random())

        return x ** 2

    stage = range(5)
    stage = pl.thread.map(slow_squared, stage, workers = 2)
    stage = pl.sync.ordered(stage)

    print(list(stage)) # [0, 1, 4, 9, 16]
    ```

    `thread.map` is used in the example because `sync.map` already preserves
    order, which would make the example pointless.

    !!! note
        `ordered` will work even if the previous stages are from different
        `pypeln` modules, but it may not work if you introduce an intermediate
        external iterable stage.

    !!! warning
        This stage will not yield until it accumulates all of the elements from
        the previous stage, use this only if all elements fit in memory.

    Arguments:
        stage: A stage object.
        maxsize: The maximum number of objects the stage can hold
            simultaneously, if set to `0` (default) then the stage can grow
            unbounded. Here it is only carried over to the deferred call.

    Returns:
        A new stage when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if pypeln_utils.is_undefined(stage):

        def _deferred(stage):
            return ordered(stage, maxsize=maxsize)

        return pypeln_utils.Partial(_deferred)

    upstream = to_stage(stage)

    stage_options = dict(
        f=None,
        timeout=0,
        on_start=None,
        on_done=None,
        dependencies=[upstream],
    )
    return Ordered(**stage_options)
def map(
    f: MapFn,
    stage: tp.Union[Stage[A], tp.Iterable[A], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
) -> tp.Union[Stage[B], pypeln_utils.Partial[Stage[B]]]:
    """
    Build a stage that applies `f` to every element of the data. It mirrors
    python's built-in `map` but adds concurrency.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_add1(x):
        time.sleep(random()) # <= some slow computation
        return x + 1

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.process.map(slow_add1, data, workers=3, maxsize=4)

    data = list(stage) # e.g. [2, 1, 5, 6, 3, 4, 7, 8, 9, 10]
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with the signature `f(x) -> y`. `f` can accept special
            additional arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold
            simultaneously, if set to `0` (default) then the stage can grow
            unbounded.
        timeout: Seconds before stopping the worker if its current task is not
            yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`,
            where `kwargs` can be a `dict` of keyword arguments that can be
            consumed by `f` and `on_done`. `on_start` can accept additional
            arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This
            function is executed once per worker when the worker finishes.
            `on_done` can accept additional arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    Returns:
        A `Stage` when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if isinstance(stage, pypeln_utils.Undefined):

        def _deferred(stage):
            return map(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
            )

        return pypeln_utils.Partial(_deferred)

    upstream = to_stage(stage, maxsize=maxsize)

    stage_options = dict(
        process_fn=Map(f),
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        total_sources=upstream.workers,
        dependencies=[upstream],
        on_start=on_start,
        on_done=on_done,
        use_threads=False,
        f_args=pypeln_utils.function_args(f),
    )
    return Stage(**stage_options)
def flat_map(
    f: FlatMapFn,
    stage: tp.Union[Stage[A], tp.Iterable[A], pypeln_utils.Undefined] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
) -> tp.Union[Stage[B], pypeln_utils.Partial[Stage[B]]]:
    """
    Build a stage that applies `f` to every element of the data where, unlike
    `pypeln.process.map`, `f` returns an iterable. As the name implies,
    `flat_map` flattens those iterables so the resulting stage contains just
    their elements.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_integer_pair(x):
        time.sleep(random()) # <= some slow computation

        if x == 0:
            yield x
        else:
            yield x
            yield -x

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.thread.flat_map(slow_integer_pair, data, workers=3, maxsize=4)

    list(stage) # e.g. [2, -2, 3, -3, 0, 1, -1, 6, -6, 4, -4, ...]
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    `flat_map` is the more general operation; `pypeln.process.map` and
    `pypeln.process.filter` can be expressed with it, for example:

    ```python
    import pypeln as pl

    pl.thread.map(f, stage) = pl.thread.flat_map(lambda x: [f(x)], stage)
    pl.thread.filter(f, stage) = pl.thread.flat_map(lambda x: [x] if f(x) else [], stage)
    ```

    Using `flat_map` with a generator function is very useful as e.g. you are
    able to filter out unwanted elements when there are exceptions, missing
    data, etc.

    Arguments:
        f: A function with signature `f(x) -> iterable`. `f` can accept
            additional arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold
            simultaneously, if set to `0` (default) then the stage can grow
            unbounded.
        timeout: Seconds before stopping the worker if its current task is not
            yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`,
            where `kwargs` can be a `dict` of keyword arguments that can be
            consumed by `f` and `on_done`. `on_start` can accept additional
            arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This
            function is executed once per worker when the worker finishes.
            `on_done` can accept additional arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    Returns:
        A `Stage` when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if isinstance(stage, pypeln_utils.Undefined):

        def _deferred(stage):
            return flat_map(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
            )

        return pypeln_utils.Partial(_deferred)

    upstream = to_stage(stage, maxsize=maxsize)

    stage_options = dict(
        process_fn=FlatMap(f),
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        total_sources=upstream.workers,
        dependencies=[upstream],
        on_start=on_start,
        on_done=on_done,
        f_args=pypeln_utils.function_args(f),
    )
    return Stage(**stage_options)
def map(
    f: typing.Callable,
    stage: Stage = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: typing.Callable = None,
    on_done: typing.Callable = None,
) -> Stage:
    """
    Build a stage that applies `f` to every element of the data. It mirrors
    python's built-in `map` but adds concurrency.

    ```python
    import pypeln as pl
    import asyncio
    from random import random

    async def slow_add1(x):
        await asyncio.sleep(random()) # <= some slow computation
        return x + 1

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.task.map(slow_add1, data, workers=3, maxsize=4)

    data = list(stage) # e.g. [2, 1, 5, 6, 3, 4, 7, 8, 9, 10]
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `f(x, **kwargs) -> y`, where `kwargs` is
            the return of `on_start` if present.
        stage: A stage or iterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold
            simultaneously, if set to `0` (default) then the stage can grow
            unbounded.
        timeout: Seconds before stopping the worker if its current task is not
            yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs`,
            where `kwargs` can be a `dict` of keyword arguments that will be
            passed to `f` and `on_done`. If you define a `worker_info` argument
            an object with information about the worker will be passed. This
            function is executed once per worker at the beginning.
        on_done: A function with signature `on_done(stage_status?, **kwargs)`,
            where `kwargs` is the return of `on_start` if present. If you
            define a `stage_status` argument an object with information about
            the stage will be passed. This function is executed once per worker
            when the worker finishes.

    Returns:
        A new stage when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if pypeln_utils.is_undefined(stage):

        def _deferred(stage):
            return map(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
            )

        return pypeln_utils.Partial(_deferred)

    upstream = to_stage(stage)

    stage_options = dict(
        f=f,
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        on_start=on_start,
        on_done=on_done,
        dependencies=[upstream],
    )
    return Map(**stage_options)
def flat_map(
    f: typing.Callable,
    stage: Stage = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: typing.Callable = None,
    on_done: typing.Callable = None,
) -> Stage:
    """
    Build a stage that applies `f` to every element of the data where, unlike
    `pypeln.sync.map`, `f` returns an iterable. As the name implies, `flat_map`
    flattens those iterables so the resulting stage contains just their
    elements.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_integer_pair(x):
        time.sleep(random()) # <= some slow computation

        if x == 0:
            yield x
        else:
            yield x
            yield -x

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.sync.flat_map(slow_integer_pair, data, workers=3, maxsize=4)

    list(stage) # [0, 1, -1, 2, -2, ..., 9, -9]
    ```

    `flat_map` is the more general operation; `pypeln.sync.map` and
    `pypeln.sync.filter` can be expressed with it, for example:

    ```python
    import pypeln as pl

    pl.sync.map(f, stage) = pl.sync.flat_map(lambda x: [f(x)], stage)
    pl.sync.filter(f, stage) = pl.sync.flat_map(lambda x: [x] if f(x) else [], stage)
    ```

    Using `flat_map` with a generator function is very useful as e.g. you are
    able to filter out unwanted elements when there are exceptions, missing
    data, etc.

    Arguments:
        f: A function with signature `f(x, **kwargs) -> Iterable`, where
            `kwargs` is the return of `on_start` if present.
        stage: A stage or iterable.
        workers: This parameter is not used and only kept for API compatibility
            with the other modules.
        maxsize: This parameter is not used and only kept for API compatibility
            with the other modules.
        timeout: Seconds before stopping the worker if its current task is not
            yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs`,
            where `kwargs` can be a `dict` of keyword arguments that will be
            passed to `f` and `on_done`. If you define a `worker_info` argument
            an object with information about the worker will be passed. This
            function is executed once per worker at the beginning.
        on_done: A function with signature `on_done(stage_status?, **kwargs)`,
            where `kwargs` is the return of `on_start` if present. If you
            define a `stage_status` argument an object with information about
            the stage will be passed. This function is executed once per worker
            when the worker finishes.

    !!! warning
        To implement `timeout` we use `stopit.async_raise` which has some
        limitations for stopping threads.

    Returns:
        A new stage when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if pypeln_utils.is_undefined(stage):

        def _deferred(stage):
            return flat_map(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
            )

        return pypeln_utils.Partial(_deferred)

    upstream = to_stage(stage)

    # `workers` and `maxsize` are intentionally not handed to `FlatMap`.
    stage_options = dict(
        f=f,
        on_start=on_start,
        on_done=on_done,
        timeout=timeout,
        dependencies=[upstream],
    )
    return FlatMap(**stage_options)
def each(
    f: EachFn,
    stage: tp.Union[
        Stage[A], tp.Iterable[A], pypeln_utils.Undefined
    ] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
    run: bool = False,
) -> tp.Union[tp.Optional[Stage[None]], pypeln_utils.Partial[tp.Optional[Stage[None]]]]:
    """
    Creates a stage that runs the function `f` for each element in the data but
    the stage itself yields no elements. It's useful for sink stages that
    perform certain actions such as writing to disk, saving to a database, etc,
    and don't produce any results. For example:

    ```python
    import pypeln as pl

    def process_image(image_path):
        image = load_image(image_path)
        image = transform_image(image)
        save_image(image_path, image)

    file_paths = get_file_paths()
    stage = pl.sync.each(process_image, file_paths, workers=4)
    pl.sync.run(stage)
    ```

    or alternatively

    ```python
    file_paths = get_file_paths()
    pl.sync.each(process_image, file_paths, workers=4, run=True)
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `f(x) -> None`. `f` can accept additional
            arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: This parameter is not used and only kept for API compatibility
            with the other modules.
        maxsize: Kept for API compatibility with the other modules; the value
            is forwarded to `to_stage` when converting the input.
        timeout: Seconds before stopping the worker if its current task is not
            yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`,
            where `kwargs` can be a `dict` of keyword arguments that can be
            consumed by `f` and `on_done`. `on_start` can accept additional
            arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This
            function is executed once per worker when the worker finishes.
            `on_done` can accept additional arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        run: Whether or not to execute the stage immediately.

    !!! warning
        To implement `timeout` we use `stopit.ThreadingTimeout` which has some
        limitations.

    Returns:
        If the `stage` parameter is not given then this function returns a
        `Partial`, else if `run=False` (default) it returns a new stage, if
        `run=True` then it runs the stage and returns `None`.
    """
    if isinstance(stage, pypeln_utils.Undefined):
        # `run` must be forwarded as well, otherwise `run=True` is silently
        # ignored when `each` is used in its deferred (pipe) form.
        return pypeln_utils.Partial(
            lambda stage: each(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
                run=run,
            )
        )

    stage_ = to_stage(stage, maxsize=maxsize)

    stage_ = Stage(
        process_fn=Each(f),
        timeout=timeout,
        dependencies=[stage_],
        on_start=on_start,
        on_done=on_done,
        f_args=pypeln_utils.function_args(f),
    )

    if not run:
        return stage_

    # Drain the stage so `f` is applied to every element; nothing is returned.
    for _ in stage_:
        pass
def each(
    f: typing.Callable,
    stage: Stage = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: typing.Callable = None,
    on_done: typing.Callable = None,
    run: bool = False,
) -> Stage:
    """
    Creates a stage that runs the function `f` for each element in the data but
    the stage itself yields no elements. It's useful for sink stages that
    perform certain actions such as writing to disk, saving to a database, etc,
    and don't produce any results. For example:

    ```python
    import pypeln as pl

    async def process_image(image_path):
        image = await load_image(image_path)
        image = await transform_image(image)
        await save_image(image_path, image)

    file_paths = get_file_paths()
    stage = pl.task.each(process_image, file_paths, workers=4)
    pl.task.run(stage)
    ```

    or alternatively

    ```python
    file_paths = get_file_paths()
    pl.task.each(process_image, file_paths, workers=4, run=True)
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `async? f(x) -> None`. `f` can accept
            additional arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A stage or iterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold
            simultaneously, if set to `0` (default) then the stage can grow
            unbounded.
        timeout: Seconds before stopping the worker if its current task is not
            yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`,
            where `kwargs` can be a `dict` of keyword arguments that can be
            consumed by `f` and `on_done`. `on_start` can accept additional
            arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This
            function is executed once per worker when the worker finishes.
            `on_done` can accept additional arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        run: Whether or not to execute the stage immediately.

    Returns:
        If the `stage` parameter is not given then this function returns a
        `Partial`, else if `run=False` (default) it returns a new stage, if
        `run=True` then it runs the stage and returns `None`.
    """
    if pypeln_utils.is_undefined(stage):
        # `run` must be forwarded as well, otherwise `run=True` is silently
        # ignored when `each` is used in its deferred (pipe) form.
        return pypeln_utils.Partial(
            lambda stage: each(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
                run=run,
            )
        )

    stage = to_stage(stage)

    stage = Each(
        f=f,
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        on_start=on_start,
        on_done=on_done,
        dependencies=[stage],
    )

    if not run:
        return stage

    # Drain the stage so `f` is applied to every element; nothing is returned.
    for _ in stage:
        pass
def filter(
    f: typing.Callable,
    stage: Stage = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: typing.Callable = None,
    on_done: typing.Callable = None,
) -> Stage:
    """
    Build a stage that keeps only the elements for which the predicate `f`
    holds. It mirrors python's built-in `filter` but adds concurrency.

    ```python
    import pypeln as pl
    import asyncio
    from random import random

    async def slow_gt3(x):
        await asyncio.sleep(random()) # <= some slow computation
        return x > 3

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.task.filter(slow_gt3, data, workers=3, maxsize=4)

    data = list(stage) # e.g. [5, 6, 3, 4, 7, 8, 9]
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `async? f(x) -> bool`. `f` can accept
            additional arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A stage or iterable.
        workers: The number of workers the stage should contain.
        maxsize: The maximum number of objects the stage can hold
            simultaneously, if set to `0` (default) then the stage can grow
            unbounded.
        timeout: Seconds before stopping the worker if its current task is not
            yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`,
            where `kwargs` can be a `dict` of keyword arguments that can be
            consumed by `f` and `on_done`. `on_start` can accept additional
            arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This
            function is executed once per worker when the worker finishes.
            `on_done` can accept additional arguments by name as described in
            [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    Returns:
        A new stage when `stage` is supplied, otherwise a `Partial` that waits
        for the stage.
    """
    if pypeln_utils.is_undefined(stage):

        def _deferred(stage):
            return filter(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
            )

        return pypeln_utils.Partial(_deferred)

    upstream = to_stage(stage)

    stage_options = dict(
        f=f,
        workers=workers,
        maxsize=maxsize,
        timeout=timeout,
        on_start=on_start,
        on_done=on_done,
        dependencies=[upstream],
    )
    return Filter(**stage_options)
def each(
    f: typing.Callable,
    stage: Stage = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: typing.Callable = None,
    on_done: typing.Callable = None,
    run: bool = False,
) -> Stage:
    """
    Creates a stage that runs the function `f` for each element in the data but the stage itself yields no elements. It is useful for sink stages that perform certain actions such as writing to disk, saving to a database, etc, and don't produce any results. For example:

    ```python
    import pypeln as pl

    def process_image(image_path):
        image = load_image(image_path)
        image = transform_image(image)
        save_image(image_path, image)

    file_paths = get_file_paths()
    stage = pl.sync.each(process_image, file_paths, workers=4)
    pl.sync.run(stage)
    ```

    or alternatively

    ```python
    file_paths = get_file_paths()
    pl.sync.each(process_image, file_paths, workers=4, run=True)
    ```

    !!! note
        Because of concurrency order is not guaranteed.

    Arguments:
        f: A function with signature `f(x, **kwargs) -> None`, where `kwargs` is the return of `on_start` if present.
        stage: A stage or iterable.
        workers: This parameter is not used and only kept for API compatibility with the other modules.
        maxsize: This parameter is not used and only kept for API compatibility with the other modules.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs`, where `kwargs` can be a `dict` of keyword arguments that will be passed to `f` and `on_done`. If you define a `worker_info` argument an object with information about the worker will be passed. This function is executed once per worker at the beginning.
        on_done: A function with signature `on_done(stage_status?, **kwargs)`, where `kwargs` is the return of `on_start` if present. If you define a `stage_status` argument an object with information about the stage will be passed. This function is executed once per worker when the worker finishes.
        run: Whether or not to execute the stage immediately. It is also honoured when the stage is supplied later through the returned `Partial`.

    !!! warning
        To implement `timeout` we use `stopit.async_raise` which has some limitations for stopping threads.

    Returns:
        If the `stage` parameter is not given then this function returns a `Partial`, else if `run=False` (default) it returns a new stage, if `run=True` then it runs the stage and returns `None`.
    """
    if pypeln_utils.is_undefined(stage):
        return pypeln_utils.Partial(
            lambda stage: each(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
                # Forward `run` so that a deferred `each(f, run=True)` still
                # executes once the stage is provided (it was dropped before).
                run=run,
            )
        )

    stage = to_stage(stage)

    stage = Each(
        f=f,
        on_start=on_start,
        on_done=on_done,
        timeout=timeout,
        dependencies=[stage],
    )

    if not run:
        return stage

    # Drain the stage purely for its side effects; `each` yields no elements.
    for _ in stage:
        pass
def filter(
    f: FilterFn,
    stage: tp.Union[
        Stage[A], tp.Iterable[A], tp.Iterable[A], pypeln_utils.Undefined
    ] = pypeln_utils.UNDEFINED,
    workers: int = 1,
    maxsize: int = 0,
    timeout: float = 0,
    on_start: tp.Callable = None,
    on_done: tp.Callable = None,
) -> tp.Union[Stage[B], pypeln_utils.Partial[Stage[B]]]:
    """
    Creates a stage that filters the data given a predicate function `f`, exactly like Python's built-in `filter` function.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_gt3(x):
        time.sleep(random()) # <= some slow computation
        return x > 3

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.sync.filter(slow_gt3, data, workers=3, maxsize=4)

    data = list(stage) # [4, 5, 6, ..., 9]
    ```

    Arguments:
        f: A function with signature `f(x) -> bool`. `f` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        stage: A Stage or Iterable.
        workers: This parameter is not used and only kept for API compatibility with the other modules.
        maxsize: This parameter is not used and only kept for API compatibility with the other modules.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs?`, where `kwargs` can be a `dict` of keyword arguments that can be consumed by `f` and `on_done`. `on_start` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).
        on_done: A function with signature `on_done(stage_status?)`. This function is executed once per worker when the worker finishes. `on_done` can accept additional arguments by name as described in [Advanced Usage](https://cgarciae.github.io/pypeln/advanced/#dependency-injection).

    !!! warning
        To implement `timeout` we use `stopit.ThreadingTimeout` which has some limitations.

    Returns:
        Returns a `Stage` if the `stage` parameter is given, else it returns a `Partial`.
    """
    if isinstance(stage, pypeln_utils.Undefined):

        def apply_to(stage):
            # Re-enter `filter` with the original options once a source is supplied.
            return filter(
                f,
                stage=stage,
                workers=workers,
                maxsize=maxsize,
                timeout=timeout,
                on_start=on_start,
                on_done=on_done,
            )

        return pypeln_utils.Partial(apply_to)

    upstream = to_stage(stage)
    predicate_fn = Filter(f)
    injected_args = pypeln_utils.function_args(f)

    # `workers` and `maxsize` are intentionally not forwarded (see docstring).
    return Stage(
        process_fn=predicate_fn,
        timeout=timeout,
        dependencies=[upstream],
        on_start=on_start,
        on_done=on_done,
        f_args=injected_args,
    )
def map(
    f: typing.Callable,
    stage: Stage = pypeln_utils.UNDEFINED,
    workers: int = None,
    maxsize: int = None,
    timeout: float = 0,
    on_start: typing.Callable = None,
    on_done: typing.Callable = None,
) -> Stage:
    """
    Creates a stage that maps a function `f` over the data. It should behave exactly like Python's built-in `map` function.

    ```python
    import pypeln as pl
    import time
    from random import random

    def slow_add1(x):
        time.sleep(random()) # <= some slow computation
        return x + 1

    data = range(10) # [0, 1, 2, ..., 9]
    stage = pl.sync.map(slow_add1, data, workers=3, maxsize=4)

    data = list(stage) # [1, 2, 3, ..., 10]
    ```

    Arguments:
        f: A function with signature `f(x, **kwargs) -> y`, where `kwargs` is the return of `on_start` if present.
        stage: A stage or iterable.
        workers: This parameter is not used and only kept for API compatibility with the other modules.
        maxsize: This parameter is not used and only kept for API compatibility with the other modules.
        timeout: Seconds before stopping the worker if its current task is not yet completed. Defaults to `0` which means it is unbounded.
        on_start: A function with signature `on_start(worker_info?) -> kwargs`, where `kwargs` can be a `dict` of keyword arguments that will be passed to `f` and `on_done`. If you define a `worker_info` argument an object with information about the worker will be passed. This function is executed once per worker at the beginning.
        on_done: A function with signature `on_done(stage_status?, **kwargs)`, where `kwargs` is the return of `on_start` if present. If you define a `stage_status` argument an object with information about the stage will be passed. This function is executed once per worker when the worker finishes.

    !!! warning
        To implement `timeout` we use `stopit.async_raise` which has some limitations for stopping threads.

    Returns:
        If the `stage` parameter is given then this function returns a new stage, else it returns a `Partial`.
    """
    if not pypeln_utils.is_undefined(stage):
        # Concrete source given: wrap it and build the Map stage right away.
        upstream = to_stage(stage)
        return Map(
            f=f,
            timeout=timeout,
            on_start=on_start,
            on_done=on_done,
            dependencies=[upstream],
        )

    def apply_to(stage):
        # Deferred path: re-enter `map` once the source is supplied.
        return map(
            f,
            stage=stage,
            workers=workers,
            maxsize=maxsize,
            timeout=timeout,
            on_start=on_start,
            on_done=on_done,
        )

    return pypeln_utils.Partial(apply_to)