Beispiel #1
0
 def _get_iterator(semaphore):
     """Yield ``(batch, shared_state)`` pairs from the loader, throttled by ``semaphore``.

     ``loader`` and ``self`` are taken from the enclosing scope. After each
     pair is handed out, one unit of the semaphore is acquired before the
     next batch is requested from the loader.
     """
     for batch in loader():
         label = _get_corpus_label(batch)
         yield batch, self._global_shared_state.get(label)
         # Once the semaphore value is 0 this call blocks, which pauses the
         # generator and therefore stops further batches from being loaded.
         semaphore.acquire()
    def process(self,
                loader,
                consumer,
                preprocess_exit_step=None,
                options=None):
        """Process every batch produced by ``loader`` and pass the outputs to ``consumer``.

        With 0 workers, batches are processed one after the other in the
        current process. Otherwise they are submitted to a process pool and
        their outputs are consumed in submission order.

        Args:
          loader: Callable returning an iterable of batches.
          consumer: Callable invoked with the outputs of each processed batch.
          preprocess_exit_step: Forwarded as ``exit_step`` to the batch
            processing function.
          options: Forwarded as ``options`` to the batch processing function.
        """

        logger.info('Start processing using %d worker(s)', self._num_workers)

        def _batch_kwargs(batch):
            # Keyword arguments passed along with a batch, in both the
            # sequential and the multi-process modes.
            label = _get_corpus_label(batch)
            return dict(
                options=options,
                config=self._config,
                process_type=self._pipeline_type,
                exit_step=preprocess_exit_step,
                override_label=label,
                shared_state=self._global_shared_state.get(label),
            )

        if self._num_workers == 0:
            # Sequential mode: the pipeline returned by one call is passed to
            # the next one.
            pipeline = None
            for batch in loader():
                outputs, pipeline = _process_batch(
                    pipeline, batch, **_batch_kwargs(batch))
                consumer(outputs)
            return

        # Because of the Python GIL (Global Interpreter Lock), process-based
        # workers are needed to get true parallelism. The downside is that
        # resources are duplicated for each worker, increasing the memory
        # usage. This is mitigated by the stream processing of the
        # loader/consumer which avoids loading the full corpus in memory.
        with multiprocessing.Pool(processes=self._num_workers) as pool:
            pending = collections.deque()

            for batch in loader():
                # Submit the batch to the pool and keep a handle on its result.
                handle = pool.apply_async(
                    _process_batch_on_worker,
                    args=(batch,),
                    kwds=_batch_kwargs(batch),
                )
                pending.append(handle)

                # Cap the number of in-flight batches: when the cap is hit,
                # block until the oldest one is done.
                if len(pending) == 2 * self._num_workers:
                    pending[0].wait()

                # Drain, oldest first, every result that is already available.
                while pending and pending[0].ready():
                    consumer(pending.popleft().get())

            # The loader is exhausted: wait for and consume what is left.
            while pending:
                consumer(pending.popleft().get())
Beispiel #3
0
    def process(self,
                loader,
                consumer,
                preprocess_exit_step=None,
                options=None,
                pipeline=None):
        """Process every batch produced by ``loader`` and pass the outputs to ``consumer``.

        With 0 workers, batches are processed one after the other in the
        current process. Otherwise they are distributed over a process pool
        with ``imap_unordered`` and each result is consumed as the pool
        returns it.

        Args:
          loader: Callable returning an iterable of batches.
          consumer: Callable invoked with the outputs of each processed batch.
          preprocess_exit_step: Forwarded as ``exit_step`` to the batch
            processing function.
          options: Forwarded as ``options`` to the batch processing function.
          pipeline: Initial pipeline passed to the first sequential call. It is
            only read when running with 0 workers.
        """

        if self._num_workers == 0:
            logger.info("Start processing")

            for batch in loader():
                label = _get_corpus_label(batch)
                outputs, pipeline = _process_batch(
                    pipeline,
                    batch,
                    options=options,
                    config=self._config,
                    process_type=self._pipeline_type,
                    exit_step=preprocess_exit_step,
                    shared_state=self._global_shared_state.get(label),
                )
                consumer(outputs)
            return

        logger.info("Start processing using %d worker(s)",
                    self._num_workers)

        # Bind the arguments that are identical for every batch; the pool only
        # has to supply the per-batch item.
        worker_func = functools.partial(
            _process_batch_on_worker,
            options=options,
            config=self._config,
            process_type=self._pipeline_type,
            exit_step=preprocess_exit_step,
        )

        def _throttled_batches(slots):
            # Feed (batch, shared_state) pairs to the pool. One slot is taken
            # after each pair is handed out; when no slot is left the call
            # blocks, so no more batches are loaded until a slot is released.
            for batch in loader():
                label = _get_corpus_label(batch)
                yield batch, self._global_shared_state.get(label)
                slots.acquire()

        # Because of the Python GIL (Global Interpreter Lock), process-based
        # workers are needed to get true parallelism. The downside is that
        # resources are duplicated for each worker, increasing the memory
        # usage. This is mitigated by the stream processing of the
        # loader/consumer which avoids loading the full corpus in memory.
        with multiprocessing.Pool(processes=self._num_workers) as pool:
            # The semaphore controls how many batches can be loaded in advance.
            slots = multiprocessing.Semaphore(2 * self._num_workers)

            for result in pool.imap_unordered(worker_func,
                                              _throttled_batches(slots)):
                # Free a slot first so that another batch can be loaded.
                slots.release()
                consumer(result)