def __init__(
    self,
    input_df: pd.DataFrame,
    column_prefix: AnyStr = "api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
):
    self.input_df = input_df
    self.column_prefix = column_prefix
    self.error_handling = error_handling
    self.api_column_names = build_unique_column_names(input_df, column_prefix)
    self.column_description_dict = {
        v: API_COLUMN_NAMES_DESCRIPTION_DICT[k] for k, v in self.api_column_names._asdict().items()
    }
def __init__(
    self,
    input_df: pd.DataFrame,
    input_folder: dataiku.Folder = None,
    column_prefix: AnyStr = "api",
    error_handling: ErrorHandling = ErrorHandling.LOG,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
    **kwargs,
):
    store_attr()
    self.output_df = None  # initialization before calling format_df
    self.api_column_names = build_unique_column_names(input_df.keys(), column_prefix)
    self.column_description_dict = {
        column_name: API_COLUMN_NAMES_DESCRIPTION_DICT[key]
        for key, column_name in self.api_column_names._asdict().items()
    }
    self.column_description_dict[PATH_COLUMN] = "Path of the file relative to the input folder"
def __init__(
    self,
    input_df: pd.DataFrame,
    input_folder: dataiku.Folder = None,
    column_prefix: AnyStr = "api",
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
):
    self.input_df = input_df
    self.input_folder = input_folder
    self.output_df = None  # initialization before calling format_df
    self.column_prefix = column_prefix
    self.error_handling = error_handling
    self.parallel_workers = parallel_workers
    self.api_column_names = build_unique_column_names(input_df.keys(), column_prefix)
    self.column_description_dict = {
        v: API_COLUMN_NAMES_DESCRIPTION_DICT[k] for k, v in self.api_column_names._asdict().items()
    }
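# --- Illustrative sketch, not the library's actual implementation ---
# The three constructors above rely on `build_unique_column_names` returning a
# namedtuple (its `_asdict()` keys index into `API_COLUMN_NAMES_DESCRIPTION_DICT`,
# and the parallelizer below reads a `response` field). The sketch below shows one
# plausible shape of that helper under those assumptions; the field names other than
# `response` and the description strings are placeholders, not the real constants.
from collections import namedtuple
from typing import AnyStr, List

ApiColumnNameTuple = namedtuple("ApiColumnNameTuple", ["response", "error_message", "error_type"])

# Placeholder descriptions standing in for API_COLUMN_NAMES_DESCRIPTION_DICT
API_COLUMN_NAMES_DESCRIPTION_SKETCH = {
    "response": "Raw response from the API",
    "error_message": "Error message returned by the API, if any",
    "error_type": "Error type, if any",
}


def build_unique_column_names_sketch(existing_names: List[AnyStr], column_prefix: AnyStr) -> ApiColumnNameTuple:
    """Return prefixed output column names, suffixed with '_' until they no longer collide with input columns"""
    unique_names = {}
    for field in ApiColumnNameTuple._fields:
        name = f"{column_prefix}_{field}"
        while name in existing_names:
            name += "_"
        unique_names[field] = name
    return ApiColumnNameTuple(**unique_names)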
def api_parallelizer(
    input_df: pd.DataFrame,
    api_call_function: Callable,
    api_exceptions: Union[Exception, Tuple[Exception]],
    column_prefix: AnyStr,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
    api_support_batch: bool = DEFAULT_API_SUPPORT_BATCH,
    batch_size: int = DEFAULT_BATCH_SIZE,
    error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
    verbose: bool = DEFAULT_VERBOSE,
    **api_call_function_kwargs,
) -> pd.DataFrame:
    """
    Apply an API call function in parallel to a pandas.DataFrame.
    The DataFrame is passed to the function as row dictionaries.
    Parallelism works by:
    - (default) sending multiple concurrent threads
    - if the API supports it, sending batches of rows
    """
    df_iterator = (i[1].to_dict() for i in input_df.iterrows())
    len_iterator = len(input_df.index)
    log_msg = "Calling remote API endpoint with {} rows...".format(len_iterator)
    if api_support_batch:
        log_msg += ", chunked by {}".format(batch_size)
        df_iterator = chunked(df_iterator, batch_size)
        len_iterator = math.ceil(len_iterator / batch_size)
    logging.info(log_msg)
    api_column_names = build_unique_column_names(input_df.columns, column_prefix)
    pool_kwargs = api_call_function_kwargs.copy()
    more_kwargs = [
        "api_call_function",
        "error_handling",
        "api_exceptions",
        "api_column_names",
    ]
    for k in more_kwargs:
        pool_kwargs[k] = locals()[k]
    for k in ["fn", "row", "batch"]:  # Reserved pool keyword arguments
        pool_kwargs.pop(k, None)
    api_results = []
    with ThreadPoolExecutor(max_workers=parallel_workers) as pool:
        if api_support_batch:
            futures = [pool.submit(api_call_batch, batch=batch, **pool_kwargs) for batch in df_iterator]
        else:
            futures = [pool.submit(api_call_single_row, row=row, **pool_kwargs) for row in df_iterator]
        for f in tqdm_auto(as_completed(futures), total=len_iterator):
            api_results.append(f.result())
    if api_support_batch:
        api_results = flatten(api_results)
    output_df = convert_api_results_to_df(input_df, api_results, api_column_names, error_handling, verbose)
    num_api_error = sum(output_df[api_column_names.response] == "")
    num_api_success = len(input_df.index) - num_api_error
    logging.info("Remote API call results: {} rows succeeded, {} rows failed.".format(num_api_success, num_api_error))
    return output_df
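# --- Illustrative usage sketch for `api_parallelizer` ---
# `call_mock_api` and `sample_df` below are hypothetical, not part of the library.
# The sketch assumes the call function receives each row as a `row` keyword argument
# together with the extra keyword arguments forwarded through **api_call_function_kwargs;
# if the real wrapper passes the row differently, adjust the signature accordingly.
import pandas as pd

sample_df = pd.DataFrame({"text": ["hello", "world"]})


def call_mock_api(row=None, text_column="text", **kwargs):
    """Pretend API call: echo the text column back as a JSON-like string"""
    return '{"echo": "%s"}' % row[text_column]


output_df = api_parallelizer(
    input_df=sample_df,
    api_call_function=call_mock_api,
    api_exceptions=(ValueError, RuntimeError),  # exceptions to catch and log instead of failing
    column_prefix="mock_api",
    parallel_workers=4,
    text_column="text",  # forwarded to `call_mock_api` via **api_call_function_kwargs
)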
def parallelizer(
    input_df: pd.DataFrame,
    function: Callable,
    exceptions: Union[Exception, Tuple[Exception]],
    column_prefix: AnyStr,
    parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
    batch_support: bool = DEFAULT_BATCH_SUPPORT,
    batch_size: int = DEFAULT_BATCH_SIZE,
    error_handling: ErrorHandling = ErrorHandling.LOG,
    verbose: bool = DEFAULT_VERBOSE,
    **function_kwargs,
) -> pd.DataFrame:
    """Apply a function to a pandas.DataFrame with parallelization, batching, error handling and progress tracking

    The DataFrame is iterated on and passed to the function as dictionaries, row-by-row or by batches of rows.
    This iterative process is accelerated by the use of concurrent threads and is tracked with a progress bar.
    Errors are caught if they match the `exceptions` parameter and automatically logged.
    Once the whole DataFrame has been iterated on, results and errors are added as additional columns.

    Args:
        input_df: Input dataframe which will be iterated on
        function: Function taking a dict as input and returning a dict
            If `batch_support` is True, the function works on lists of dicts
            For instance, a function to call an API or do some enrichment
        exceptions: Tuple of Exception classes to catch
        column_prefix: Column prefix to add to the output columns for the `function` responses and errors
        parallel_workers: Number of concurrent threads
        batch_support: If True, send batches of rows to the `function`
            Else (default) send rows as dicts to the function
        batch_size: Number of rows to include in each batch
            Taken into account if `batch_support` is True
        error_handling: If ErrorHandling.LOG (default), log the error message as a warning
            and return the row with error keys.
            Else fail if there is any error.
        verbose: If True, log additional information on errors
            Else (default) log only the error message and the error type
        **function_kwargs: Arbitrary keyword arguments passed to the `function`

    Returns:
        Input dataframe with additional columns:
        - response from the `function`
        - error message, if any
        - error type, if any
    """
    df_iterator = (index_series_pair[1].to_dict() for index_series_pair in input_df.iterrows())
    len_iterator = len(input_df.index)
    start = perf_counter()
    if batch_support:
        logging.info(
            f"Applying function {function.__name__} in parallel to {len_iterator} row(s)"
            + f" using batch size of {batch_size}..."
        )
        df_iterator = chunked(df_iterator, batch_size)
        len_iterator = math.ceil(len_iterator / batch_size)
    else:
        logging.info(f"Applying function {function.__name__} in parallel to {len_iterator} row(s)...")
    column_names = build_unique_column_names(input_df.columns, column_prefix)
    pool_kwargs = {
        **{
            "function": function,
            "error_handling": error_handling,
            "exceptions": exceptions,
            "column_names": column_names,
        },
        **function_kwargs.copy(),
    }
    for kwarg in ["fn", "row", "batch"]:  # Reserved pool keyword arguments
        pool_kwargs.pop(kwarg, None)
    if not batch_support and "batch_response_parser" in pool_kwargs.keys():
        pool_kwargs.pop("batch_response_parser", None)
    results = []
    with ThreadPoolExecutor(max_workers=parallel_workers) as pool:
        if batch_support:
            futures = [pool.submit(apply_function_to_batch, batch=batch, **pool_kwargs) for batch in df_iterator]
        else:
            futures = [pool.submit(apply_function_to_row, row=row, **pool_kwargs) for row in df_iterator]
        for future in tqdm_auto(as_completed(futures), total=len_iterator):
            results.append(future.result())
    if batch_support:
        results = flatten(results)
    output_df = convert_results_to_df(input_df, results, column_names, error_handling, verbose)
    num_error = sum(output_df[column_names.response] == "")
    num_success = len(input_df.index) - num_error
    logging.info(
        f"Applying function in parallel: {num_success} row(s) succeeded, {num_error} failed "
        f"in {(perf_counter() - start):.2f} seconds."
    )
    return output_df
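# --- Illustrative usage sketch for `parallelizer` ---
# `enrich_row` and `sample_df` below are hypothetical. The sketch assumes the wrapped
# function receives each row as a `row` keyword argument plus any keyword arguments
# forwarded through **function_kwargs, and returns a dict that ends up in the
# prefixed response column of the output DataFrame.
import pandas as pd

sample_df = pd.DataFrame({"city": ["Paris", "Lyon"]})


def enrich_row(row=None, suffix="", **kwargs):
    """Pretend enrichment: append a suffix to the city name"""
    return {"enriched_city": row["city"] + suffix}


output_df = parallelizer(
    input_df=sample_df,
    function=enrich_row,
    exceptions=(KeyError, ValueError),  # exceptions to catch and log as error columns
    column_prefix="enrich",
    parallel_workers=2,
    suffix=", France",  # forwarded to `enrich_row` via **function_kwargs
)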