def fetch_data(
    database: Database,
    queries: List[Type[QueryBuilder]],
    dimensions: Iterable[Field],
    share_dimensions: Iterable[Field] = (),
    reference_groups=(),
) -> Tuple[int, pd.DataFrame]:
    """Run the given queries against `database` and reduce the results.

    Date-typed dimensions are handed to pandas via `parse_dates` so their
    columns are parsed as datetimes.  Returns the largest row count seen
    across the individual result sets together with the reduced DataFrame.
    """
    sql_statements = [str(query) for query in queries]

    # Map each date dimension's selected alias to the datetime format that
    # pandas should use when parsing that column.
    parse_dates = {}
    for dimension in dimensions:
        base_dimension = find_field_in_modified_field(dimension)
        if base_dimension.data_type == DataType.date:
            parse_dates[alias_selector(base_dimension.alias)] = PANDAS_TO_DATETIME_FORMAT

    result_frames = database.fetch_dataframes(*sql_statements, parse_dates=parse_dates)

    # `default=0` covers the case of zero result sets.
    max_rows_returned = max((len(frame) for frame in result_frames), default=0)
    logger.info('max_rows_returned',
                extra={'row_count': max_rows_returned, 'database': str(database)})

    return max_rows_returned, reduce_result_set(
        result_frames, reference_groups, dimensions, share_dimensions)
def fetch_data(
    database: Database,
    queries: Union[Sized, Iterable],
    dimensions: Iterable[Field],
    share_dimensions: Iterable[Field] = (),
    reference_groups=(),
):
    """Run the given queries with their limits capped at the database's
    maximum result set size, then reduce the results into one frame.

    Each query's effective limit is ``min(existing_limit, max_result_set_size)``;
    a query with no limit gets exactly ``max_result_set_size``.
    """
    capped_queries = []
    for query in queries:
        # Bug fix: `query._limit or float("inf")` treated an explicit
        # LIMIT 0 (falsy) as "unlimited" and replaced it with the database
        # cap.  Only a missing (None) limit means "unlimited".
        existing_limit = query._limit if query._limit is not None else float("inf")
        capped_queries.append(
            str(query.limit(min(existing_limit, database.max_result_set_size))))

    results = database.fetch_dataframes(*capped_queries)

    return reduce_result_set(results, reference_groups, dimensions, share_dimensions)
def fetch_data(
    database: Database,
    queries: List[Type[QueryBuilder]],
    dimensions: Iterable[Field],
    share_dimensions: Iterable[Field] = (),
    reference_groups=(),
) -> Tuple[int, pd.DataFrame]:
    """Run the queries, truncate oversized result sets to the database's
    maximum result set size, and reduce everything into one DataFrame.

    Returns the largest *pre-truncation* row count together with the
    reduced frame.  Date-typed dimensions are parsed as datetimes.
    """
    sql_statements = [str(query) for query in queries]

    # Map each date dimension's selected alias to the datetime format that
    # pandas should use when parsing that column.
    parse_dates = {}
    for dimension in dimensions:
        base_dimension = find_field_in_modified_field(dimension)
        if base_dimension.data_type == DataType.date:
            parse_dates[alias_selector(base_dimension.alias)] = PANDAS_TO_DATETIME_FORMAT

    frames = database.fetch_dataframes(*sql_statements, parse_dates=parse_dates)

    row_cap = database.max_result_set_size
    max_rows_returned = 0
    for frame in frames:
        n_rows = len(frame)
        max_rows_returned = max(max_rows_returned, n_rows)
        if n_rows > row_cap:
            # Record the oversized (pre-truncation) row count, then trim
            # the frame down to the cap in place.
            logger.warning('row_count_over_max',
                           extra={'row_count': n_rows, 'database': str(database)})
            frame.drop(frame.index[row_cap:], inplace=True)

    logger.info('max_rows_returned',
                extra={'row_count': max_rows_returned, 'database': str(database)})

    return max_rows_returned, reduce_result_set(
        frames, reference_groups, dimensions, share_dimensions)