Example 1
    def map_reduce(self,
                   map_function,
                   map_iterdata,
                   reduce_function,
                   chunksize=None,
                   worker_processes=None,
                   extra_args=None,
                   extra_env=None,
                   map_runtime_memory=None,
                   obj_chunk_size=None,
                   obj_chunk_number=None,
                   reduce_runtime_memory=None,
                   chunk_size=None,
                   chunk_n=None,
                   timeout=None,
                   invoke_pool_threads=None,
                   reducer_one_per_object=False,
                   reducer_wait_local=False,
                   include_modules=[],
                   exclude_modules=[]):
        """
        Map the map_function over the data and apply the reduce_function across all futures.
        This method is executed entirely within CF.

        :param map_function: The function to map over the data
        :param map_iterdata: An iterable of input data
        :param chunksize: Split map_iterdata into chunks of this size.
                          Lithops spawns 1 worker per resulting chunk. Default 1
        :param worker_processes: Number of concurrent/parallel processes in each worker. Default 1
        :param reduce_function: The function to reduce over the futures
        :param extra_env: Additional environment variables for action environment. Default None.
        :param extra_args: Additional arguments to pass to function activation. Default None.
        :param map_runtime_memory: Memory to use to run the map function. Default None (loaded from config).
        :param reduce_runtime_memory: Memory to use to run the reduce function. Default None (loaded from config).
        :param obj_chunk_size: The size of the data chunks used to split each object. 'None' for processing
                               the whole file in one function activation.
        :param obj_chunk_number: Number of chunks to split each object into. 'None' for processing the whole
                                 file in one function activation.
        :param timeout: Time that the functions have to complete their execution before raising a timeout.
        :param reducer_one_per_object: Set one reducer per object after running the partitioner
        :param reducer_wait_local: Wait locally for the map results before invoking the reduce function
        :param invoke_pool_threads: Number of threads to use for invoking the function activations.
        :param include_modules: Explicitly pickle these dependencies.
        :param exclude_modules: Explicitly keep these modules from pickled dependencies.

        :return: A list with the futures of both the map and the reduce activations.
        """
        self.last_call = 'map_reduce'
        map_job_id = self._create_job_id('M')

        runtime_meta = self.invoker.select_runtime(map_job_id,
                                                   map_runtime_memory)

        map_job = create_map_job(self.config,
                                 self.internal_storage,
                                 self.executor_id,
                                 map_job_id,
                                 map_function=map_function,
                                 iterdata=map_iterdata,
                                 chunksize=chunksize,
                                 worker_processes=worker_processes,
                                 runtime_meta=runtime_meta,
                                 runtime_memory=map_runtime_memory,
                                 extra_args=extra_args,
                                 extra_env=extra_env,
                                 chunk_size=chunk_size,
                                 chunk_n=chunk_n,
                                 obj_chunk_size=obj_chunk_size,
                                 obj_chunk_number=obj_chunk_number,
                                 include_modules=include_modules,
                                 exclude_modules=exclude_modules,
                                 execution_timeout=timeout,
                                 invoke_pool_threads=invoke_pool_threads)

        map_futures = self.invoker.run_job(map_job)
        self.futures.extend(map_futures)

        if reducer_wait_local:
            wait(fs=map_futures,
                 internal_storage=self.internal_storage,
                 job_monitor=self.job_monitor)

        reduce_job_id = map_job_id.replace('M', 'R')

        runtime_meta = self.invoker.select_runtime(reduce_job_id,
                                                   reduce_runtime_memory)

        reduce_job = create_reduce_job(
            self.config,
            self.internal_storage,
            self.executor_id,
            reduce_job_id,
            reduce_function,
            map_job,
            map_futures,
            runtime_meta=runtime_meta,
            runtime_memory=reduce_runtime_memory,
            reducer_one_per_object=reducer_one_per_object,
            extra_env=extra_env,
            include_modules=include_modules,
            exclude_modules=exclude_modules)

        reduce_futures = self.invoker.run_job(reduce_job)
        self.futures.extend(reduce_futures)

        for f in map_futures:
            f._produce_output = False

        return map_futures + reduce_futures
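
A minimal usage sketch for this version of the API, assuming the standard lithops.FunctionExecutor entry point; my_map_function, my_reduce_function and the input list below are illustrative only:

import lithops

def my_map_function(x):
    # Executed once per element of map_iterdata
    return x * 2

def my_reduce_function(results):
    # Receives the list of map results
    return sum(results)

fexec = lithops.FunctionExecutor()
futures = fexec.map_reduce(my_map_function, [1, 2, 3, 4], my_reduce_function)
# Only the reduce result is returned by get_result(), since the map futures
# are marked with _produce_output = False at the end of the method above.
print(fexec.get_result(futures))  # expected: 20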
Example 2
    def map_reduce(self,
                   map_function: Callable,
                   map_iterdata: List[Union[List[Any], Tuple[Any, ...],
                                            Dict[str, Any]]],
                   reduce_function: Callable,
                   chunksize: Optional[int] = None,
                   extra_args: Optional[Union[List[Any], Tuple[Any, ...],
                                              Dict[str, Any]]] = None,
                   extra_env: Optional[Dict[str, str]] = None,
                   map_runtime_memory: Optional[int] = None,
                   reduce_runtime_memory: Optional[int] = None,
                   obj_chunk_size: Optional[int] = None,
                   obj_chunk_number: Optional[int] = None,
                   timeout: Optional[int] = None,
                   reducer_one_per_object: Optional[bool] = False,
                   spawn_reducer: Optional[int] = 20,
                   include_modules: Optional[List[str]] = [],
                   exclude_modules: Optional[List[str]] = []) -> FuturesList:
        """
        Map the map_function over the data and apply the reduce_function across all futures.

        :param map_function: The function to map over the data
        :param map_iterdata: An iterable of input data
        :param reduce_function: The function to reduce over the futures
        :param chunksize: Split map_iterdata into chunks of this size. Lithops spawns 1 worker per resulting chunk. Default 1
        :param extra_args: Additional arguments to pass to function activation. Default None
        :param extra_env: Additional environment variables for action environment. Default None
        :param map_runtime_memory: Memory to use to run the map function. Default None (loaded from config)
        :param reduce_runtime_memory: Memory to use to run the reduce function. Default None (loaded from config)
        :param obj_chunk_size: The size of the data chunks used to split each object. 'None' for processing the whole file in one function activation
        :param obj_chunk_number: Number of chunks to split each object into. 'None' for processing the whole file in one function activation
        :param timeout: Time that the functions have to complete their execution before raising a timeout
        :param reducer_one_per_object: Set one reducer per object after running the partitioner
        :param spawn_reducer: Percentage of map activations that must be completed before spawning the reduce function. Default 20
        :param include_modules: Explicitly pickle these dependencies.
        :param exclude_modules: Explicitly keep these modules from pickled dependencies.

        :return: A FuturesList with the futures of both the map and the reduce activations.
        """
        self.last_call = 'map_reduce'
        map_job_id = self._create_job_id('M')

        runtime_meta = self.invoker.select_runtime(map_job_id,
                                                   map_runtime_memory)

        map_job = create_map_job(config=self.config,
                                 internal_storage=self.internal_storage,
                                 executor_id=self.executor_id,
                                 job_id=map_job_id,
                                 map_function=map_function,
                                 iterdata=map_iterdata,
                                 chunksize=chunksize,
                                 runtime_meta=runtime_meta,
                                 runtime_memory=map_runtime_memory,
                                 extra_args=extra_args,
                                 extra_env=extra_env,
                                 obj_chunk_size=obj_chunk_size,
                                 obj_chunk_number=obj_chunk_number,
                                 include_modules=include_modules,
                                 exclude_modules=exclude_modules,
                                 execution_timeout=timeout)

        map_futures = self.invoker.run_job(map_job)
        self.futures.extend(map_futures)

        if isinstance(map_iterdata, FuturesList):
            for fut in map_iterdata:
                fut._produce_output = False

        if spawn_reducer != ALWAYS:
            self.wait(map_futures, return_when=spawn_reducer)
            logger.debug(
                f'ExecutorID {self.executor_id} | JobID {map_job_id} - '
                f'{spawn_reducer}% of map activations done. Spawning reduce stage'
            )

        reduce_job_id = map_job_id.replace('M', 'R')

        runtime_meta = self.invoker.select_runtime(reduce_job_id,
                                                   reduce_runtime_memory)

        reduce_job = create_reduce_job(
            config=self.config,
            internal_storage=self.internal_storage,
            executor_id=self.executor_id,
            reduce_job_id=reduce_job_id,
            reduce_function=reduce_function,
            map_job=map_job,
            map_futures=map_futures,
            runtime_meta=runtime_meta,
            runtime_memory=reduce_runtime_memory,
            reducer_one_per_object=reducer_one_per_object,
            extra_env=extra_env,
            include_modules=include_modules,
            exclude_modules=exclude_modules)

        reduce_futures = self.invoker.run_job(reduce_job)
        self.futures.extend(reduce_futures)

        for f in map_futures:
            f._produce_output = False

        return create_futures_list(map_futures + reduce_futures, self)
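
A similar sketch against this newer, typed version; the function names and inputs are illustrative. spawn_reducer=100 relies on the documented behaviour that the value is a percentage of completed map activations, so the reduce stage is only spawned once the whole map stage has finished:

import lithops

def square(x):
    return x * x

def total(results):
    # Receives the list of map results
    return sum(results)

fexec = lithops.FunctionExecutor()
futures = fexec.map_reduce(square, [1, 2, 3, 4], total,
                           spawn_reducer=100)  # wait for 100% of the map activations
print(fexec.get_result(futures))  # expected: 30 (1 + 4 + 9 + 16)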