Code example #1
# Imports this snippet relies on (assumed, not shown in the excerpt):
import numpy as np
from joblib import Parallel, delayed
from tqdm.auto import tqdm as tqdm_auto

def bootstrap_within_guides(s,
                            n_cells=100,
                            n_reps=10000,
                            statistic=np.mean,
                            n_jobs=1,
                            tqdm=False):
    rng = np.random.default_rng()
    guide_values = {k: g.values for k, g in s.groupby('sgRNA')}
    guides = list(guide_values)

    if tqdm:
        reps = tqdm_auto(range(n_reps))
    else:
        reps = range(n_reps)

    def bootstrap(guide_values, guides, n_cells, statistic):
        rep_guide = rng.choice(guides)
        vals = guide_values[rep_guide]
        return statistic(vals[rng.integers(len(vals), size=n_cells)])

    if n_jobs != 1:
        bootstrapped = Parallel(n_jobs=n_jobs)(
            delayed(bootstrap)(guide_values, guides, n_cells, statistic)
            for _ in reps)
    else:
        bootstrapped = [
            bootstrap(guide_values, guides, n_cells, statistic) for _ in reps
        ]

    return np.array(bootstrapped)
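A minimal usage sketch for the snippet above, on toy data; the Series construction and column names here are hypothetical, and numpy, pandas, joblib and tqdm are assumed installed.

import numpy as np
import pandas as pd

# Toy per-cell measurements labelled by guide RNA (hypothetical data).
rng = np.random.default_rng(0)
cells = pd.DataFrame({
    'sgRNA': np.repeat(['sg_A', 'sg_B', 'sg_C'], 50),
    'signal': rng.normal(size=150),
})
s = cells.set_index('sgRNA')['signal']

# Null distribution of the mean: each rep picks one guide at random and
# resamples `n_cells` cells from it with replacement.
null_means = bootstrap_within_guides(s, n_cells=50, n_reps=1000)
print(null_means.mean(), null_means.std())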
Code example #2

    def format_save_pdf_documents(self, output_folder: dataiku.Folder, output_df: pd.DataFrame) -> Tuple[int, int]:
        """Open PDF documents in a `dataiku.Folder`, draw text bounding polygons and save them to another folder"""
        df_iterator = (index_series_pair[1].to_dict() for index_series_pair in output_df.iterrows())
        len_iterator = len(output_df.index)
        api_results = []
        start = perf_counter()
        logging.info(f"Formatting and saving {len_iterator} PDF page(s) to output folder...")
        with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
            futures = [
                pool.submit(
                    self.format_save_pdf_document,
                    output_folder=output_folder,
                    pdf_path=row[self.doc_handler.SPLITTED_PATH_COLUMN],
                    response=safe_json_loads(row[self.api_column_names.response]),
                )
                for row in df_iterator
            ]
            for future in tqdm_auto(as_completed(futures), total=len_iterator):
                api_results.append(future.result())
        num_success = sum(api_results)
        num_error = len(api_results) - num_success
        logging.info(
            (
                f"Formatting and saving {len_iterator} PDF page(s) to output folder: "
                f"{num_success} succeeded, {num_error} failed in {(perf_counter() - start):.2f} seconds."
            )
        )
        return (num_success, num_error)
Code example #3
def tqdm(*args, disable=TqdmDisableOption.default, **kwargs):
    from tqdm.notebook import tqdm as tqdm_notebook  # pylint: disable=import-outside-toplevel
    from tqdm.auto import tqdm as tqdm_auto  # pylint: disable=import-outside-toplevel
    # To tqdm_notebook, None means do not display. To standard tqdm, None means
    # display only when connected to a TTY.
    if disable == TqdmDisableOption.default:
        disable = False if tqdm_auto == tqdm_notebook else None
    return tqdm_auto(*args, disable=disable, **kwargs)
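A brief usage sketch of the wrapper above: in a notebook the default resolves to `disable=False` (always show the widget), while in a plain script it resolves to `disable=None` (show only when attached to a TTY). The loop body is illustrative.

# Default behaviour picked automatically depending on the environment.
for item in tqdm(range(1_000)):
    pass  # do work on item

# Silencing the bar explicitly still works as with plain tqdm.
for item in tqdm(range(1_000), disable=True):
    pass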
Code example #4
    def split_all_documents(
        self,
        path_df: pd.DataFrame,
        input_folder: dataiku.Folder,
        output_folder: dataiku.Folder,
        path_column: AnyStr = PATH_COLUMN,
    ) -> pd.DataFrame:
        """Split several PDF or TIFF document files into multiple pages and save them as files in another folder

        Args:
            path_df: DataFrame with one column named `path_column` with all the PDF/TIFF file paths
            input_folder: `dataiku.Folder` where the input PDF/TIFF files are stored
            output_folder: `dataiku.Folder` where files will be saved
            path_column: Name of the path column in the input dataframe

        Returns:
            DataFrame with two columns:
                1. `path_column`: Paths of the input documents
                2. `self.SPLITTED_PATH_COLUMN`: Paths of the splitted files

        """
        start = perf_counter()
        logging.info(
            f"Splitting {len(path_df.index)} document(s) and saving each page to output folder..."
        )
        results = []
        with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
            futures = [
                pool.submit(self.split_document,
                            input_folder=input_folder,
                            output_folder=output_folder,
                            input_path=input_path)
                for input_path in path_df[path_column]
            ]
            for future in tqdm_auto(as_completed(futures),
                                    total=len(path_df.index)):
                results.append(future.result())
        num_success = sum(
            [result[self.OUTPUT_PATH_LIST_KEY][0] != "" for result in results])
        num_error = len(results) - num_success
        num_pages = sum(
            [len(result[self.OUTPUT_PATH_LIST_KEY])
             for result in results]) - num_error
        if num_pages == 0:
            raise DocumentSplitError("Could not split any document")
        logging.info((
            f"Splitting {len(path_df.index)} document(s) and saving each page to output folder: "
            f"{num_success} document(s) succeeded generating {num_pages} page(s), "
            f"{num_error} document(s) failed in {(perf_counter() - start):.2f} seconds."
        ))
        output_df = pd.DataFrame([
            OrderedDict([(path_column, result[self.INPUT_PATH_KEY]),
                         (self.SPLITTED_PATH_COLUMN, output_path)])
            for result in results
            for output_path in result[self.OUTPUT_PATH_LIST_KEY]
        ])
        return output_df
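A rough sketch of how the input `path_df` could be assembled from a `dataiku.Folder` listing before calling the method above; the folder ids and the `doc_handler` variable are hypothetical.

import dataiku
import pandas as pd

input_folder = dataiku.Folder("input_documents")
output_folder = dataiku.Folder("split_pages")

# One row per PDF/TIFF file found in the input folder.
path_df = pd.DataFrame({"path": input_folder.list_paths_in_partition()})

split_df = doc_handler.split_all_documents(
    path_df=path_df,
    input_folder=input_folder,
    output_folder=output_folder,
    path_column="path",
)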
Code example #5
    def merge_all_documents(
        self,
        path_df: pd.DataFrame,
        input_folder: dataiku.Folder,
        output_folder: dataiku.Folder,
        path_column: AnyStr = PATH_COLUMN,
    ) -> pd.DataFrame:
        """Merge several PDF or TIFF documents after splitting by `self.split_all_documents`

        Bring balance to the force.

        Args:
            path_df: DataFrame with two columns - cf. output of `self.split_all_documents`
                1. `path_column`: Paths of the input documents
                2. `self.SPLITTED_PATH_COLUMN`: Paths of the splitted files
            input_folder:  `dataiku.Folder` where the input PDF/TIFF files are stored
            output_folder: `dataiku.Folder` where the merged PDF/TIFF file will be saved
            path_column: Name of the path column in the input dataframe

        """
        output_df_list = path_df.groupby(path_column)[
            self.SPLITTED_PATH_COLUMN].apply(list).reset_index()
        start = perf_counter()
        logging.info(
            f"Merging and saving {len(path_df.index)} page(s) of {len(output_df_list.index)} document(s)..."
        )
        results = []
        with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
            futures = [
                pool.submit(
                    self.merge_document,
                    input_folder=input_folder,
                    output_folder=output_folder,
                    input_path_list=row[1],
                    output_path=row[0],
                ) for row in output_df_list.itertuples(index=False)
            ]
            for future in tqdm_auto(as_completed(futures),
                                    total=len(output_df_list.index)):
                results.append(future.result())
        num_success = sum(
            [1 if output_path != "" else 0 for output_path in results])
        num_error = len(results) - num_success
        logging.info((
            f"Merging and saving {len(path_df.index)} page(s) of {len(output_df_list.index)} document(s)... "
            f"{num_success} document(s) succeeded, {num_error} failed in {(perf_counter() - start):.2f} seconds."
        ))
        page_numbers = path_df[self.SPLITTED_PATH_COLUMN].astype(str).apply(
            self.extract_page_number_from_path)
        path_df.insert(loc=1,
                       column=self.PAGE_NUMBER_COLUMN,
                       value=page_numbers)
        del path_df[self.SPLITTED_PATH_COLUMN]
        return path_df
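Sketch of how this pairs with `split_all_documents` in a pipeline: split into pages, run the per-page processing, then merge page paths back to one row per document. The `doc_handler` and `merged_folder` names and the middle step are hypothetical.

# 1. Split each document into per-page files (see split_all_documents above).
split_df = doc_handler.split_all_documents(path_df, input_folder, output_folder)

# 2. ... per-page processing keyed on SPLITTED_PATH_COLUMN goes here ...

# 3. Merge the page files back; the returned frame gains a page number column
#    and drops SPLITTED_PATH_COLUMN.
merged_df = doc_handler.merge_all_documents(
    split_df, input_folder=output_folder, output_folder=merged_folder)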
Code example #6

    def format_save_images(
        self,
        output_folder: dataiku.Folder,
        output_df: pd.DataFrame = None,
        path_column: AnyStr = PATH_COLUMN,
        verbose: bool = True,
    ) -> Tuple[int, int]:
        """Generic method to apply `self.format_save_image` to all images using an `output_df` with API responses

        Do not override this method!

        """
        if output_df is None:
            output_df = self.output_df
        df_iterator = (index_series_pair[1].to_dict()
                       for index_series_pair in output_df.iterrows())
        len_iterator = len(output_df.index)
        if verbose:
            logging.info(
                f"Formatting and saving {len_iterator} image(s) to output folder..."
            )
        start = perf_counter()
        api_results = []
        with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
            futures = [
                pool.submit(
                    self.format_save_image,
                    output_folder=output_folder,
                    image_path=row[path_column],
                    response=safe_json_loads(
                        row[self.api_column_names.response]),
                ) for row in df_iterator
            ]
            for future in tqdm_auto(as_completed(futures), total=len_iterator):
                api_results.append(future.result())
        num_success = sum(api_results)
        num_error = len(api_results) - num_success
        if verbose:
            logging.info((
                f"Formatting and saving {len_iterator} image(s) to output folder: "
                f"{num_success} image(s) succeeded, {num_error} failed in {(perf_counter() - start):.2f} seconds."
            ))
        return (num_success, num_error)
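The "do not override" note applies to this batching wrapper only; `format_save_image` is the hook meant to be customised per API. A minimal subclass sketch, where the parent class name and the `input_folder` attribute are assumptions:

class MyApiFormatter(GenericImageAPIFormatter):  # hypothetical parent exposing format_save_images
    def format_save_image(self, output_folder: dataiku.Folder, image_path: str, response: dict) -> bool:
        """Draw one API response onto one image and save it; return True on success."""
        try:
            image_bytes = self.input_folder.get_download_stream(image_path).read()
            # ... draw bounding boxes from `response` onto the image (API-specific) ...
            with output_folder.get_writer(image_path) as writer:
                writer.write(image_bytes)
            return True
        except (ValueError, OSError):
            return False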
Code example #7
File: cl_monitor.py Project: funkelab/daisy
    def _update_state(self, task_id, task_state):

        if task_id not in self.progresses:
            total = task_state.total_block_count
            self.progresses[task_id] = tqdm_auto(total=total,
                                                 desc=task_id + " ▶",
                                                 unit='blocks',
                                                 leave=True)

        self.progresses[task_id].set_postfix({
            '⧗': task_state.pending_count,
            '▶': task_state.processing_count,
            '✔': task_state.completed_count,
            '✗': task_state.failed_count,
            '∅': task_state.orphaned_count
        })

        completed = task_state.completed_count
        delta = completed - self.progresses[task_id].n
        self.progresses[task_id].update(delta)
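The same postfix/update pattern works on a bare `tqdm_auto` bar outside daisy; a standalone sketch with made-up counts:

from tqdm.auto import tqdm as tqdm_auto

bar = tqdm_auto(total=500, desc="blockwise ▶", unit="blocks", leave=True)
for done in range(0, 501, 50):
    bar.set_postfix({"✔": done, "⧗": 500 - done})
    bar.update(done - bar.n)  # advance by the delta, mirroring _update_state above
bar.close()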
Code example #8
def make_progress_bar(*args, **kwargs):
    """Create iterable as progress bar if available.

    Return a simple iterable, or a tqdm_notebook progress bar when the prerequisites are met.

    Returns
    -------
    iterable or tqdm_notebook
        tqdm_notebook based progress bar or simple iterable
    """
    pbar = args[0]
    try:
        from tqdm.auto import tqdm as tqdm_auto

        pbar = tqdm_auto(*args, **kwargs)
    except Exception as e:
        logging.warning(
            "No prerequisites (tqdm) installed for interactive progress bar, "
            "continuing without one. See the output in the console "
            "or check installation instructions."
            f"{e}")
    return pbar
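Usage sketch: the first positional argument doubles as the fallback iterable when tqdm is not installed, so the caller can always iterate over the return value.

for record in make_progress_bar(range(10_000), desc="processing"):
    pass  # work on record

# Without tqdm installed, the warning above is logged and the plain
# range(10_000) is iterated with no progress bar.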
Code example #9
def bootstrap_cells(s,
                    n_cells=100,
                    n_reps=10000,
                    statistic=np.mean,
                    n_jobs=1,
                    tqdm=False):
    rng = np.random.default_rng()
    vals = s.values

    def bootstrap(vals, n_cells, statistic):
        return statistic(vals[rng.integers(len(vals), size=n_cells)])

    if tqdm:
        reps = tqdm_auto(range(n_reps))
    else:
        reps = range(n_reps)

    if n_jobs != 1:
        bootstrapped = Parallel(n_jobs=n_jobs)(
            delayed(bootstrap)(vals, n_cells, statistic) for _ in reps)
    else:
        bootstrapped = [bootstrap(vals, n_cells, statistic) for _ in reps]

    return np.array(bootstrapped)
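Usage mirrors `bootstrap_within_guides` above but ignores guide identity; a quick sketch with the parallel and progress options enabled (joblib and tqdm assumed installed, `s` being any pandas Series of per-cell values):

null_medians = bootstrap_cells(
    s,
    n_cells=200,
    n_reps=5_000,
    statistic=np.median,
    n_jobs=4,      # joblib workers
    tqdm=True,     # wrap the reps in a tqdm_auto progress bar
)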
Code example #10

    def format_save_images(self, output_folder: dataiku.Folder):
        partition = output_folder.writePartition if output_folder.writePartition else ""
        output_folder.clear_partition(partition)
        df_iterator = (i[1].to_dict() for i in self.output_df.iterrows())
        len_iterator = len(self.output_df.index)
        logging.info("Saving bounding boxes to output folder...")
        api_results = []
        with ThreadPoolExecutor(max_workers=self.parallel_workers) as pool:
            futures = [
                pool.submit(
                    self.format_save_image,
                    output_folder=output_folder,
                    image_path=row[IMAGE_PATH_COLUMN],
                    response=safe_json_loads(
                        row[self.api_column_names.response]),
                ) for row in df_iterator
            ]
            for f in tqdm_auto(as_completed(futures), total=len_iterator):
                api_results.append(f.result())
        num_success = sum(api_results)
        num_error = len(api_results) - num_success
        logging.info(
            "Saving bounding boxes to output folder: {} images succeeded, {} failed"
            .format(num_success, num_error))
Code example #11
File: util.py Project: JayKimBravekjh/numpyro
def progress_bar_factory(num_samples, num_chains):
    """Factory that builds a progress bar decorator along
    with the `set_tqdm_description` and `close_tqdm` functions
    """

    if num_samples > 20:
        print_rate = int(num_samples / 20)
    else:
        print_rate = 1

    remainder = num_samples % print_rate

    tqdm_bars = {}
    finished_chains = []
    for chain in range(num_chains):
        tqdm_bars[chain] = tqdm_auto(range(num_samples), position=chain)
        tqdm_bars[chain].set_description(
            "Compiling.. ",
            refresh=True,
        )

    def _update_tqdm(arg, transform, device):
        chain = int(str(device)[4:])
        tqdm_bars[chain].set_description(
            f"Running chain {chain}",
            refresh=False,
        )
        tqdm_bars[chain].update(arg)

    def _close_tqdm(arg, transform, device):
        chain = int(str(device)[4:])
        tqdm_bars[chain].update(arg)
        finished_chains.append(chain)
        if len(finished_chains) == num_chains:
            for chain in range(num_chains):
                tqdm_bars[chain].close()

    def _update_progress_bar(iter_num):
        """Updates tqdm progress bar of a JAX loop only if the iteration number is a multiple of the print_rate
        Usage: carry = progress_bar((iter_num, print_rate), carry)
        """

        _ = lax.cond(
            iter_num == 1,
            lambda _: host_callback.id_tap(
                _update_tqdm, 0, result=iter_num, tap_with_device=True),
            lambda _: iter_num,
            operand=None,
        )
        _ = lax.cond(
            iter_num % print_rate == 0,
            lambda _: host_callback.id_tap(_update_tqdm,
                                           print_rate,
                                           result=iter_num,
                                           tap_with_device=True),
            lambda _: iter_num,
            operand=None,
        )
        _ = lax.cond(
            iter_num == num_samples,
            lambda _: host_callback.id_tap(
                _close_tqdm, remainder, result=iter_num, tap_with_device=True),
            lambda _: iter_num,
            operand=None,
        )

    def progress_bar_fori_loop(func):
        """Decorator that adds a progress bar to `body_fun` used in `lax.fori_loop`.
        Note that `body_fun` must be looping over a tuple whose first element is `np.arange(num_samples)`.
        This means that `iter_num` is the current iteration number
        """
        def wrapper_progress_bar(i, vals):
            result = func(i, vals)
            _update_progress_bar(i + 1)
            return result

        return wrapper_progress_bar

    return progress_bar_fori_loop
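A rough sketch of how the returned decorator might be attached to a `lax.fori_loop` body, following the docstrings above; the body function and sample count are illustrative, and this assumes a JAX version that still provides `host_callback`, as the original project did.

from jax import lax

num_samples, num_chains = 1_000, 1
progress_bar_fori_loop = progress_bar_factory(num_samples, num_chains)

@progress_bar_fori_loop
def body_fun(i, carry):
    return carry + 1  # placeholder per-sample computation

final_carry = lax.fori_loop(0, num_samples, body_fun, 0)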
Code example #12

def api_parallelizer(input_df: pd.DataFrame,
                     api_call_function: Callable,
                     api_exceptions: Union[Exception, Tuple[Exception]],
                     column_prefix: AnyStr,
                     parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
                     api_support_batch: bool = DEFAULT_API_SUPPORT_BATCH,
                     batch_size: int = DEFAULT_BATCH_SIZE,
                     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
                     verbose: bool = DEFAULT_VERBOSE,
                     **api_call_function_kwargs) -> pd.DataFrame:
    """
    Apply an API call function in parallel to a pandas.DataFrame.
    The DataFrame is passed to the function as row dictionaries.
    Parallelism works by:
    - (default) sending multiple concurrent threads
    - if the API supports it, sending batches of rows
    """
    df_iterator = (i[1].to_dict() for i in input_df.iterrows())
    len_iterator = len(input_df.index)
    log_msg = "Calling remote API endpoint with {} rows...".format(
        len_iterator)
    if api_support_batch:
        log_msg += ", chunked by {}".format(batch_size)
        df_iterator = chunked(df_iterator, batch_size)
        len_iterator = math.ceil(len_iterator / batch_size)
    logging.info(log_msg)
    api_column_names = build_unique_column_names(input_df.columns,
                                                 column_prefix)
    pool_kwargs = api_call_function_kwargs.copy()
    more_kwargs = [
        "api_call_function",
        "error_handling",
        "api_exceptions",
        "api_column_names",
    ]
    for k in more_kwargs:
        pool_kwargs[k] = locals()[k]
    for k in ["fn", "row", "batch"]:  # Reserved pool keyword arguments
        pool_kwargs.pop(k, None)
    api_results = []
    with ThreadPoolExecutor(max_workers=parallel_workers) as pool:
        if api_support_batch:
            futures = [
                pool.submit(api_call_batch, batch=batch, **pool_kwargs)
                for batch in df_iterator
            ]
        else:
            futures = [
                pool.submit(api_call_single_row, row=row, **pool_kwargs)
                for row in df_iterator
            ]
        for f in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(f.result())
    if api_support_batch:
        api_results = flatten(api_results)
    output_df = convert_api_results_to_df(input_df, api_results,
                                          api_column_names, error_handling,
                                          verbose)
    num_api_error = sum(output_df[api_column_names.response] == "")
    num_api_success = len(input_df.index) - num_api_error
    logging.info(
        "Remote API call results: {} rows succeeded, {} rows failed.".format(
            num_api_success, num_api_error))
    return output_df
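A hypothetical call sketch: `api_call_function` receives each row as a dict plus any extra keyword arguments forwarded through `**api_call_function_kwargs`; every name below is illustrative rather than part of the plugin.

def call_text_api(row, api_client=None, text_column="text", **kwargs):
    """Call a remote endpoint for one row and return its raw response."""
    return api_client.annotate(text=row[text_column])

output_df = api_parallelizer(
    input_df=input_df,
    api_call_function=call_text_api,
    api_exceptions=(ConnectionError, TimeoutError),
    column_prefix="text_api",
    parallel_workers=4,
    api_client=my_client,       # forwarded to call_text_api
    text_column="review_text",
)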
Code example #13
File: tqdm.py Project: saponas/hail
def tqdm(*args, disable=TQDM_DEFAULT_DISABLE, **kwargs):
    return tqdm_auto(*args, disable=disable, **kwargs)
Code example #14
def parallelizer(
    input_df: pd.DataFrame,
    function: Callable,
    exceptions: Union[Exception, Tuple[Exception]],
    column_prefix: AnyStr,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
    batch_support: bool = DEFAULT_BATCH_SUPPORT,
    batch_size: int = DEFAULT_BATCH_SIZE,
    error_handling: ErrorHandling = ErrorHandling.LOG,
    verbose: bool = DEFAULT_VERBOSE,
    **function_kwargs,
) -> pd.DataFrame:
    """Apply a function to a pandas.DataFrame with parallelization, batching, error handling and progress tracking

    The DataFrame is iterated on and passed to the function as dictionaries, row-by-row or by batches of rows.
    This iterative process is accelerated by the use of concurrent threads and is tracked with a progress bar.
    Errors are caught if they match the `exceptions` parameter and are automatically logged.
    Once the whole DataFrame has been iterated on, results and errors are added as additional columns.

    Args:
        input_df: Input dataframe which will be iterated on
        function: Function taking a dict as input and returning a dict
            If `batch_support` is True, the function works on a list of dicts
            For instance, a function to call an API or do some enrichment
        exceptions: Tuple of Exception classes to catch
        column_prefix: Column prefix to add to the output columns for the `function` responses and errors
        parallel_workers: Number of concurrent threads
        batch_support: If True, send batches of rows to the `function`
            Else (default) send rows as dicts to the function
        batch_size: Number of rows to include in each batch
            Taken into account if `batch_support` is True
        error_handling: If ErrorHandling.LOG (default), log the error message as a warning
            and return the row with error keys.
            Else fail if there is any error.
        verbose: If True, log additional information on errors
            Else (default) log the error message and the error type
        **function_kwargs: Arbitrary keyword arguments passed to the `function`

    Returns:
        Input dataframe with additional columns:
        - response from the `function`
        - error message if any
        - error type if any

    """
    df_iterator = (index_series_pair[1].to_dict()
                   for index_series_pair in input_df.iterrows())
    len_iterator = len(input_df.index)
    start = perf_counter()
    if batch_support:
        logging.info(
            f"Applying function {function.__name__} in parallel to {len_iterator} row(s)"
            + f" using batch size of {batch_size}...")
        df_iterator = chunked(df_iterator, batch_size)
        len_iterator = math.ceil(len_iterator / batch_size)
    else:
        logging.info(
            f"Applying function {function.__name__} in parallel to {len_iterator} row(s)..."
        )
    column_names = build_unique_column_names(input_df.columns, column_prefix)
    pool_kwargs = {
        **{
            "function": function,
            "error_handling": error_handling,
            "exceptions": exceptions,
            "column_names": column_names,
        },
        **function_kwargs.copy(),
    }
    for kwarg in ["fn", "row", "batch"]:  # Reserved pool keyword arguments
        pool_kwargs.pop(kwarg, None)
    if not batch_support and "batch_response_parser" in pool_kwargs.keys():
        pool_kwargs.pop("batch_response_parser", None)
    results = []
    with ThreadPoolExecutor(max_workers=parallel_workers) as pool:
        if batch_support:
            futures = [
                pool.submit(apply_function_to_batch,
                            batch=batch,
                            **pool_kwargs) for batch in df_iterator
            ]
        else:
            futures = [
                pool.submit(apply_function_to_row, row=row, **pool_kwargs)
                for row in df_iterator
            ]
        for future in tqdm_auto(as_completed(futures), total=len_iterator):
            results.append(future.result())
    if batch_support:
        results = flatten(results)
    output_df = convert_results_to_df(input_df, results, column_names,
                                      error_handling, verbose)
    num_error = sum(output_df[column_names.response] == "")
    num_success = len(input_df.index) - num_error
    logging.info((
        f"Applying function in parallel: {num_success} row(s) succeeded, {num_error} failed "
        f"in {(perf_counter() - start):.2f} seconds."))
    return output_df
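Sketch of a single-row call and of reading back the added columns. The exact output column names come from `build_unique_column_names`, so the prefix + suffix naming used below is an assumption, as is the `detect_language` helper.

def enrich_row(row: dict, **kwargs) -> dict:
    return {"language": detect_language(row["text"])}  # hypothetical helper

output_df = parallelizer(
    input_df=input_df,
    function=enrich_row,
    exceptions=(ValueError, KeyError),
    column_prefix="enrich",
)

# Rows that raised one of the listed exceptions keep an empty response column
# and get the error message/type columns populated instead.
failed = output_df[output_df["enrich_response"] == ""]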