Example #1
0
 def _wait_futures(self, data):
     logger.info('Reduce function: waiting for map results')
     fut_list = data['results']
     wait(fut_list, self.internal_storage, download_results=True)
     results = [f.result() for f in fut_list if f.done and not f.futures]
     fut_list.clear()
     data['results'] = results
Example #2
0
    def map_reduce(self,
                   map_function,
                   map_iterdata,
                   reduce_function,
                   chunksize=None,
                   worker_processes=None,
                   extra_args=None,
                   extra_env=None,
                   map_runtime_memory=None,
                   obj_chunk_size=None,
                   obj_chunk_number=None,
                   reduce_runtime_memory=None,
                   chunk_size=None,
                   chunk_n=None,
                   timeout=None,
                   invoke_pool_threads=None,
                   reducer_one_per_object=False,
                   reducer_wait_local=False,
                   include_modules=[],
                   exclude_modules=[]):
        """
        Map the map_function over the data and apply the reduce_function across all futures.
        This method is executed all within CF.

        :param map_function: the function to map over the data
        :param map_iterdata:  An iterable of input data
        :param chunksize: Split map_iteradata in chunks of this size.
                          Lithops spawns 1 worker per resulting chunk. Default 1
        :param worker_processes: Number of concurrent/parallel processes in each worker Default 1
        :param reduce_function:  the function to reduce over the futures
        :param extra_env: Additional environment variables for action environment. Default None.
        :param extra_args: Additional arguments to pass to function activation. Default None.
        :param map_runtime_memory: Memory to use to run the map function. Default None (loaded from config).
        :param reduce_runtime_memory: Memory to use to run the reduce function. Default None (loaded from config).
        :param obj_chunk_size: the size of the data chunks to split each object. 'None' for processing
                               the whole file in one function activation.
        :param obj_chunk_number: Number of chunks to split each object. 'None' for processing the whole
                                 file in one function activation.
        :param remote_invocation: Enable or disable remote_invocation mechanism. Default 'False'
        :param timeout: Time that the functions have to complete their execution before raising a timeout.
        :param reducer_one_per_object: Set one reducer per object after running the partitioner
        :param reducer_wait_local: Wait for results locally
        :param invoke_pool_threads: Number of threads to use to invoke.
        :param include_modules: Explicitly pickle these dependencies.
        :param exclude_modules: Explicitly keep these modules from pickled dependencies.

        :return: A list with size `len(map_iterdata)` of futures.
        """
        self.last_call = 'map_reduce'
        map_job_id = self._create_job_id('M')

        runtime_meta = self.invoker.select_runtime(map_job_id,
                                                   map_runtime_memory)

        map_job = create_map_job(self.config,
                                 self.internal_storage,
                                 self.executor_id,
                                 map_job_id,
                                 map_function=map_function,
                                 iterdata=map_iterdata,
                                 chunksize=chunksize,
                                 worker_processes=worker_processes,
                                 runtime_meta=runtime_meta,
                                 runtime_memory=map_runtime_memory,
                                 extra_args=extra_args,
                                 extra_env=extra_env,
                                 chunk_size=chunk_size,
                                 chunk_n=chunk_n,
                                 obj_chunk_size=obj_chunk_size,
                                 obj_chunk_number=obj_chunk_number,
                                 include_modules=include_modules,
                                 exclude_modules=exclude_modules,
                                 execution_timeout=timeout,
                                 invoke_pool_threads=invoke_pool_threads)

        map_futures = self.invoker.run_job(map_job)
        self.futures.extend(map_futures)

        if reducer_wait_local:
            wait(fs=map_futures,
                 internal_storage=self.internal_storage,
                 job_monitor=self.job_monitor)

        reduce_job_id = map_job_id.replace('M', 'R')

        runtime_meta = self.invoker.select_runtime(reduce_job_id,
                                                   reduce_runtime_memory)

        reduce_job = create_reduce_job(
            self.config,
            self.internal_storage,
            self.executor_id,
            reduce_job_id,
            reduce_function,
            map_job,
            map_futures,
            runtime_meta=runtime_meta,
            runtime_memory=reduce_runtime_memory,
            reducer_one_per_object=reducer_one_per_object,
            extra_env=extra_env,
            include_modules=include_modules,
            exclude_modules=exclude_modules)

        reduce_futures = self.invoker.run_job(reduce_job)
        self.futures.extend(reduce_futures)

        for f in map_futures:
            f._produce_output = False

        return map_futures + reduce_futures
Example #3
0
    def wait(self,
             fs=None,
             throw_except=True,
             return_when=ALL_COMPLETED,
             download_results=False,
             timeout=None,
             threadpool_size=THREADPOOL_SIZE,
             wait_dur_sec=WAIT_DUR_SEC):
        """
        Wait for the Future instances (possibly created by different Executor instances)
        given by fs to complete. Returns a named 2-tuple of sets. The first set, named done,
        contains the futures that completed (finished or cancelled futures) before the wait
        completed. The second set, named not_done, contains the futures that did not complete
        (pending or running futures). timeout can be used to control the maximum number of
        seconds to wait before returning.

        :param fs: Futures list. Default None
        :param throw_except: Re-raise exception if call raised. Default True.
        :param return_when: One of `ALL_COMPLETED`, `ANY_COMPLETED`, `ALWAYS`
        :param download_results: Download results. Default false (Only get statuses)
        :param timeout: Timeout of waiting for results.
        :param threadpool_size: Number of threads to use. Default 64
        :param wait_dur_sec: Time interval between each check.

        :return: `(fs_done, fs_notdone)`
            where `fs_done` is a list of futures that have completed
            and `fs_notdone` is a list of futures that have not completed.
        :rtype: 2-tuple of list
        """
        futures = fs or self.futures
        if type(futures) != list:
            futures = [futures]

        # Start waiting for results
        try:
            wait(fs=futures,
                 internal_storage=self.internal_storage,
                 job_monitor=self.job_monitor,
                 download_results=download_results,
                 throw_except=throw_except,
                 return_when=return_when,
                 timeout=timeout,
                 threadpool_size=threadpool_size,
                 wait_dur_sec=wait_dur_sec)

        except Exception as e:
            self.invoker.stop()
            if not fs and is_notebook():
                del self.futures[len(self.futures) - len(futures):]
            if self.data_cleaner and not self.is_lithops_worker:
                self.clean(clean_cloudobjects=False, force=True)
            raise e

        finally:
            present_jobs = {f.job_key for f in futures}
            self.job_monitor.stop(present_jobs)
            if self.data_cleaner and not self.is_lithops_worker:
                self.clean(clean_cloudobjects=False)

        if download_results:
            fs_done = [f for f in futures if f.done]
            fs_notdone = [f for f in futures if not f.done]
        else:
            fs_done = [f for f in futures if f.success or f.done]
            fs_notdone = [f for f in futures if not f.success and not f.done]

        return fs_done, fs_notdone
Example #4
0
    def wait(
        self,
        fs: Optional[Union[ResponseFuture, FuturesList,
                           List[ResponseFuture]]] = None,
        throw_except: Optional[bool] = True,
        return_when: Optional[Any] = ALL_COMPLETED,
        download_results: Optional[bool] = False,
        timeout: Optional[int] = None,
        threadpool_size: Optional[int] = THREADPOOL_SIZE,
        wait_dur_sec: Optional[int] = WAIT_DUR_SEC
    ) -> Tuple[FuturesList, FuturesList]:
        """
        Wait for the Future instances (possibly created by different Executor instances)
        given by fs to complete. Returns a named 2-tuple of sets. The first set, named done,
        contains the futures that completed (finished or cancelled futures) before the wait
        completed. The second set, named not_done, contains the futures that did not complete
        (pending or running futures). timeout can be used to control the maximum number of
        seconds to wait before returning.

        :param fs: Futures list. Default None
        :param throw_except: Re-raise exception if call raised. Default True
        :param return_when: Percentage of done futures
        :param download_results: Download results. Default false (Only get statuses)
        :param timeout: Timeout of waiting for results
        :param threadpool_size: Number of threads to use. Default 64
        :param wait_dur_sec: Time interval between each check

        :return: `(fs_done, fs_notdone)` where `fs_done` is a list of futures that have completed and `fs_notdone` is a list of futures that have not completed.
        """
        futures = fs or self.futures
        if type(futures) != list and type(futures) != FuturesList:
            futures = [futures]

        # Start waiting for results
        try:
            wait(fs=futures,
                 internal_storage=self.internal_storage,
                 job_monitor=self.job_monitor,
                 download_results=download_results,
                 throw_except=throw_except,
                 return_when=return_when,
                 timeout=timeout,
                 threadpool_size=threadpool_size,
                 wait_dur_sec=wait_dur_sec)

            if self.data_cleaner and return_when == ALL_COMPLETED:
                present_jobs = {f.job_key for f in futures}
                self.compute_handler.clear(present_jobs)
                self.clean(clean_cloudobjects=False)

        except (KeyboardInterrupt, Exception) as e:
            self.invoker.stop()
            self.job_monitor.stop()
            if not fs and is_notebook():
                del self.futures[len(self.futures) - len(futures):]
            if self.data_cleaner:
                present_jobs = {f.job_key for f in futures}
                self.compute_handler.clear(present_jobs)
                self.clean(clean_cloudobjects=False, force=True)
            raise e

        if download_results:
            fs_done = [f for f in futures if f.done]
            fs_notdone = [f for f in futures if not f.done]
        else:
            fs_done = [f for f in futures if f.success or f.done]
            fs_notdone = [f for f in futures if not f.success and not f.done]

        return create_futures_list(fs_done, self), create_futures_list(
            fs_notdone, self)