Example #1
# Imports required by the examples below; count_records is assumed to be a
# project helper that returns the number of rows in a dataiku.Dataset.
import logging
import math
from time import perf_counter
from typing import Callable

import dataiku
from tqdm import tqdm
def process_dataset_chunks(input_dataset: dataiku.Dataset,
                           output_dataset: dataiku.Dataset,
                           func: Callable,
                           chunksize: int = 1000,
                           **kwargs) -> None:
    """Read a dataset by chunks, process each dataframe chunk with a function and write back to another dataset.

    Pass keyword arguments on to the function, add a tqdm progress bar and generic logging.
    Write chunks directly to the output_dataset, so that only one chunk needs to be held in memory at a time.

    Args:
        input_dataset: Input dataiku.Dataset instance
        output_dataset: Output dataiku.Dataset instance
        func: The function to apply to each pandas.DataFrame chunk of the `input_dataset`.
            This function must accept a pandas.DataFrame through the keyword argument `df`,
            and output another pandas.DataFrame
        chunksize: Number of rows of each chunk of pandas.DataFrame fed to `func`
        **kwargs: Optional keyword arguments fed to `func`

    Raises:
        ValueError: If the input dataset is empty or if pandas cannot read it without type inference

    """
    input_count_records = count_records(input_dataset)
    if input_count_records == 0:
        raise ValueError("Input dataset has no records")
    logging.info(
        f"Processing dataset {input_dataset.name} of {input_count_records} rows by chunks of {chunksize}..."
    )
    start = perf_counter()
    # First, initialize output schema if not present. Required to show the real error if `iter_dataframes` fails.
    if not output_dataset.read_schema(raise_if_empty=False):
        df = input_dataset.get_dataframe(limit=5, infer_with_pandas=False)
        output_df = func(df=df, **kwargs)
        output_dataset.write_schema_from_dataframe(output_df)
    with output_dataset.get_writer() as writer:
        df_iterator = input_dataset.iter_dataframes(chunksize=chunksize,
                                                    infer_with_pandas=False)
        len_iterator = math.ceil(input_count_records / chunksize)
        for i, df in tqdm(enumerate(df_iterator),
                          total=len_iterator,
                          unit="chunk",
                          mininterval=1.0):
            output_df = func(df=df, **kwargs)
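            # On the first chunk, write the output schema; drop and recreate the output only if it is not partitioned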
            if i == 0:
                output_dataset.write_schema_from_dataframe(
                    output_df,
                    dropAndCreate=bool(not output_dataset.writePartition))
            writer.write_dataframe(output_df)
    logging.info(
        f"Processing dataset {input_dataset.name} of {input_count_records} rows: "
        + f"Done in {perf_counter() - start:.2f} seconds.")
Example #2

def process_dataset_chunks(
    input_dataset: dataiku.Dataset, output_dataset: dataiku.Dataset, func: Callable, chunksize: int = 10000, **kwargs
) -> None:
    """
    Read a dataset by chunks, process each dataframe chunk with a function and write back to another dataset.
    Automatically adds a tqdm progress bar and generic logging.
    """
    logging.info("Processing dataframe chunks of size {:d})...".format(chunksize))
    with output_dataset.get_writer() as writer:
        df_iterator = input_dataset.iter_dataframes(chunksize=chunksize, infer_with_pandas=False)
        len_iterator = math.ceil(count_records(input_dataset) / chunksize)
        for i, df in tqdm(enumerate(df_iterator), total=len_iterator):
            output_df = func(df=df, **kwargs)
            if i == 0:
                if output_dataset.writePartition is None or output_dataset.writePartition == "":
                    output_dataset.write_schema_from_dataframe(output_df, dropAndCreate=True)
                else:
                    output_dataset.write_schema_from_dataframe(output_df)
            writer.write_dataframe(output_df)
    logging.info("Processing dataframe chunks: Done!")