import logging
import math
from time import perf_counter
from typing import Callable

import dataiku
from tqdm import tqdm

# NOTE: `count_records` is assumed to be a helper defined elsewhere in this module,
# returning the number of records of a dataiku.Dataset.


def process_dataset_chunks(
    input_dataset: dataiku.Dataset,
    output_dataset: dataiku.Dataset,
    func: Callable,
    chunksize: int = 1000,
    **kwargs,
) -> None:
    """Read a dataset by chunks, process each DataFrame chunk with a function and write back to another dataset.

    Passes keyword arguments to the function, adds a tqdm progress bar and generic logging.
    Writes chunks directly to the output_dataset, so that only one chunk needs to be held in memory at a time.

    Args:
        input_dataset: Input dataiku.Dataset instance
        output_dataset: Output dataiku.Dataset instance
        func: The function to apply to the `input_dataset` by chunks of pandas.DataFrame.
            This function must take a pandas.DataFrame as first input argument
            and output another pandas.DataFrame.
        chunksize: Number of rows of each chunk of pandas.DataFrame fed to `func`
        **kwargs: Optional keyword arguments fed to `func`

    Raises:
        ValueError: If the input dataset is empty or if pandas cannot read it without type inference

    """
    input_count_records = count_records(input_dataset)
    if input_count_records == 0:
        raise ValueError("Input dataset has no records")
    logging.info(
        f"Processing dataset {input_dataset.name} of {input_count_records} rows by chunks of {chunksize}..."
    )
    start = perf_counter()
    # First, initialize the output schema if not present. Required to show the real error if `iter_dataframes` fails.
    if not output_dataset.read_schema(raise_if_empty=False):
        df = input_dataset.get_dataframe(limit=5, infer_with_pandas=False)
        output_df = func(df=df, **kwargs)
        output_dataset.write_schema_from_dataframe(output_df)
    with output_dataset.get_writer() as writer:
        df_iterator = input_dataset.iter_dataframes(chunksize=chunksize, infer_with_pandas=False)
        len_iterator = math.ceil(input_count_records / chunksize)
        for i, df in tqdm(enumerate(df_iterator), total=len_iterator, unit="chunk", mininterval=1.0):
            output_df = func(df=df, **kwargs)
            if i == 0:
                # Drop and recreate the output schema unless writing to a partition
                output_dataset.write_schema_from_dataframe(
                    output_df, dropAndCreate=bool(not output_dataset.writePartition)
                )
            writer.write_dataframe(output_df)
    logging.info(
        f"Processing dataset {input_dataset.name} of {input_count_records} rows: "
        + f"Done in {perf_counter() - start:.2f} seconds."
    )
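# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): shows how
# `process_dataset_chunks` can be called from a Dataiku Python recipe.
# The dataset names ("reviews", "reviews_with_length"), the column name
# ("review_text") and the `add_text_length` chunk function are hypothetical,
# for illustration only.
# ---------------------------------------------------------------------------
import dataiku
import pandas as pd


def add_text_length(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """Example chunk function: takes a pandas.DataFrame and returns a pandas.DataFrame."""
    df["text_length"] = df[text_column].astype(str).str.len()
    return df


if __name__ == "__main__":
    input_dataset = dataiku.Dataset("reviews")  # hypothetical input dataset
    output_dataset = dataiku.Dataset("reviews_with_length")  # hypothetical output dataset
    process_dataset_chunks(
        input_dataset=input_dataset,
        output_dataset=output_dataset,
        func=add_text_length,
        chunksize=10000,
        text_column="review_text",  # forwarded to `add_text_length` via **kwargs
    )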