import logging
import math
from time import perf_counter
from typing import Callable

import dataiku
from tqdm import tqdm

# NOTE: `count_records` is assumed to be a helper defined elsewhere in this module,
# returning the number of records of a dataiku.Dataset.


def process_dataset_chunks(
    input_dataset: dataiku.Dataset,
    output_dataset: dataiku.Dataset,
    func: Callable,
    chunksize: int = 1000,
    **kwargs,
) -> None:
    """Read a dataset by chunks, process each DataFrame chunk with a function and write back to another dataset.

    Passes keyword arguments to the function, adds a tqdm progress bar and generic logging.
    Writes chunks directly to the output_dataset, so that only one chunk needs to be held in memory at a time.

    Args:
        input_dataset: Input dataiku.Dataset instance
        output_dataset: Output dataiku.Dataset instance
        func: The function to apply to the `input_dataset` by chunks of pandas.DataFrame.
            This function must take a pandas.DataFrame as first input argument
            and output another pandas.DataFrame.
        chunksize: Number of rows of each chunk of pandas.DataFrame fed to `func`
        **kwargs: Optional keyword arguments fed to `func`

    Raises:
        ValueError: If the input dataset is empty or if pandas cannot read it without type inference

    """
    input_count_records = count_records(input_dataset)
    if input_count_records == 0:
        raise ValueError("Input dataset has no records")
    logging.info(
        f"Processing dataset {input_dataset.name} of {input_count_records} rows by chunks of {chunksize}..."
    )
    start = perf_counter()
    # First, initialize the output schema if not present. Required to show the real error if `iter_dataframes` fails.
    if not output_dataset.read_schema(raise_if_empty=False):
        df = input_dataset.get_dataframe(limit=5, infer_with_pandas=False)
        output_df = func(df=df, **kwargs)
        output_dataset.write_schema_from_dataframe(output_df)
    with output_dataset.get_writer() as writer:
        df_iterator = input_dataset.iter_dataframes(chunksize=chunksize, infer_with_pandas=False)
        len_iterator = math.ceil(input_count_records / chunksize)
        for i, df in tqdm(enumerate(df_iterator), total=len_iterator, unit="chunk", mininterval=1.0):
            output_df = func(df=df, **kwargs)
            if i == 0:
                # Drop and recreate the output schema unless writing to a partition
                output_dataset.write_schema_from_dataframe(
                    output_df, dropAndCreate=bool(not output_dataset.writePartition)
                )
            writer.write_dataframe(output_df)
    logging.info(
        f"Processing dataset {input_dataset.name} of {input_count_records} rows: "
        + f"Done in {perf_counter() - start:.2f} seconds."
    )
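# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): shows how
# `process_dataset_chunks` can be called from a Dataiku Python recipe.
# The dataset names ("reviews", "reviews_with_length"), the column name
# ("review_text") and the `add_text_length` chunk function are hypothetical,
# for illustration only.
# ---------------------------------------------------------------------------
import dataiku
import pandas as pd


def add_text_length(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """Example chunk function: takes a pandas.DataFrame and returns a pandas.DataFrame."""
    df["text_length"] = df[text_column].astype(str).str.len()
    return df


if __name__ == "__main__":
    input_dataset = dataiku.Dataset("reviews")  # hypothetical input dataset
    output_dataset = dataiku.Dataset("reviews_with_length")  # hypothetical output dataset
    process_dataset_chunks(
        input_dataset=input_dataset,
        output_dataset=output_dataset,
        func=add_text_length,
        chunksize=10000,
        text_column="review_text",  # forwarded to `add_text_length` via **kwargs
    )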