def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: bivariate relations (scatter matrix).
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package version and a dump of the configuration.

    Raises:
        ValueError: if ``df`` is empty.
    """
    if not isinstance(df, pd.DataFrame):
        # Non-DataFrame input only triggers a warning; execution continues.
        warnings.warn("df is not of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    disable_progress_bar = not config["progress_bar"].get(bool)

    # Multiprocessing of Describe 1D for each column
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # `DataFrame.items()` is used because `iteritems()` was removed in
    # pandas 2.0; both yield the same (column, series) tuples.
    args = list(df.items())
    series_description = {}

    with tqdm(total=len(args), desc="variables", disable=disable_progress_bar) as pbar:
        if pool_size == 1:
            for arg in args:
                column, description = multiprocess_1d(arg)
                series_description[column] = description
                pbar.update()
        else:
            # Store the original order: `imap_unordered` yields results in
            # completion order, not in column order.
            original_order = {
                column: position for position, (column, _) in enumerate(args)
            }

            # TODO: use `Pool` for Linux-based systems
            with multiprocessing.pool.ThreadPool(pool_size) as executor:
                for column, description in executor.imap_unordered(
                    multiprocess_1d, args
                ):
                    series_description[column] = description
                    pbar.set_postfix({"feature_name": column})
                    pbar.update()

            # Restore the original order
            series_description = dict(
                sorted(
                    series_description.items(),
                    key=lambda item: original_order.get(item[0]),
                )
            )

    # Pass the descriptions through sort_column_names with the configured
    # `sort` setting.
    sort = config["sort"].get(str)
    series_description = sort_column_names(series_description, sort)

    # Mapping from column name to variable type
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Transform the series_description in a DataFrame
    variable_stats = pd.DataFrame(series_description)

    # Get correlations
    correlations = calculate_correlations(df, variables)

    # Scatter matrix
    scatter_matrix = get_scatter_matrix(df, variables)

    # Table statistics
    with tqdm(total=1, desc="table", disable=disable_progress_bar) as pbar:
        table_stats = describe_table(df, variable_stats)
        pbar.update(1)

    # missing diagrams
    missing = get_missing_diagrams(df, table_stats)

    # Messages: three stages (table, variables, correlations), one progress
    # tick per stage.
    with tqdm(total=3, desc="warnings", disable=disable_progress_bar) as pbar:
        pbar.set_description_str("warnings [table]")
        messages = check_table_messages(table_stats)
        pbar.update()

        pbar.set_description_str("warnings [variables]")
        for col, description in series_description.items():
            messages += check_variable_messages(col, description)
        pbar.update()

        pbar.set_description_str("warnings [correlations]")
        messages += check_correlation_messages(correlations)
        # Group the messages by their type (compared as strings).
        messages.sort(key=lambda message: str(message.message_type))
        pbar.update()

    with tqdm(total=1, desc="package", disable=disable_progress_bar) as pbar:
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.dump(),
        }
        pbar.update()

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }
def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package version and a dump of the configuration.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
        ValueError: if ``df`` is empty.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    # Multiprocessing of Describe 1D for each column
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # `DataFrame.items()` is used because `iteritems()` was removed in
    # pandas 2.0; both yield the same (column, series) tuples.
    if pool_size == 1:
        series_description = dict(itertools.starmap(multiprocess_1d, df.items()))
    else:
        with multiprocessing.pool.ThreadPool(pool_size) as executor:
            series_description = dict(
                executor.starmap(multiprocess_1d, df.items())
            )

    # Mapping from column name to variable type
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Get correlations
    correlations = calculate_correlations(df, variables)

    # Check correlations between numerical variables
    if (
        config["check_correlation_pearson"].get(bool) is True
        and "pearson" in correlations
    ):
        # Overwrites the description with "CORR" series.
        # Each threshold gets its own name so the lambdas below never share
        # a rebound closure variable.
        pearson_threshold = config["correlation_threshold_pearson"].get(float)
        update(
            series_description,
            perform_check_correlation(
                correlations["pearson"],
                lambda x: x > pearson_threshold,
                Variable.S_TYPE_CORR,
            ),
        )

    # Check correlations between categorical variables
    if (
        config["check_correlation_cramers"].get(bool) is True
        and "cramers" in correlations
    ):
        # Overwrites the description with "CORR" series
        cramers_threshold = config["correlation_threshold_cramers"].get(float)
        update(
            series_description,
            perform_check_correlation(
                correlations["cramers"],
                lambda x: x > cramers_threshold,
                Variable.S_TYPE_CORR,
            ),
        )

    # Check recoded
    if config["check_recoded"].get(bool) is True and "recoded" in correlations:
        # Overwrites the description with "RECODED" series
        update(
            series_description,
            perform_check_correlation(
                correlations["recoded"],
                lambda x: x == 1,
                Variable.S_TYPE_RECODED,
            ),
        )

    # Transform the series_description in a DataFrame
    variable_stats = pd.DataFrame(series_description)

    # Table statistics
    table_stats = describe_table(df, variable_stats)

    # missing diagrams
    missing = get_missing_diagrams(df, table_stats)

    # Messages
    messages = check_table_messages(table_stats)
    for col, description in series_description.items():
        messages += check_variable_messages(col, description)

    package = {
        "pandas_profiling_version": __version__,
        "pandas_profiling_config": config.dump(),
    }

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }
def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: bivariate relations (scatter matrix).
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package version and a dump of the configuration.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
        ValueError: if ``df`` is empty.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    # Multiprocessing of Describe 1D for each column
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # `DataFrame.items()` is used because `iteritems()` was removed in
    # pandas 2.0; both yield the same (column, series) tuples.
    if pool_size == 1:
        series_description = dict(itertools.starmap(multiprocess_1d, df.items()))
    else:
        with multiprocessing.pool.ThreadPool(pool_size) as executor:
            series_description = dict(
                executor.starmap(multiprocess_1d, df.items())
            )

    # Mapping from column name to variable type
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Get correlations
    correlations = calculate_correlations(df, variables)

    # Scatter matrix
    scatter_matrix = get_scatter_matrix(df, variables)

    # Transform the series_description in a DataFrame
    variable_stats = pd.DataFrame(series_description)

    # Table statistics
    table_stats = describe_table(df, variable_stats)

    # missing diagrams
    missing = get_missing_diagrams(df, table_stats)

    # Messages: table-level first, then per variable, then correlations
    messages = check_table_messages(table_stats)
    for col, description in series_description.items():
        messages += check_variable_messages(col, description)
    messages += check_correlation_messages(correlations)

    package = {
        "pandas_profiling_version": __version__,
        "pandas_profiling_config": config.dump(),
    }

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }