def describe(
    config: Settings,
    df: pd.DataFrame,
    summarizer: BaseSummarizer,
    typeset: VisionsTypeset,
    sample: Optional[dict] = None,
) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        config: report Settings object
        df: DataFrame.
        summarizer: summarizer object, forwarded to get_series_descriptions
        typeset: visions typeset, forwarded to get_series_descriptions
        sample: optional, dict with custom sample. It must provide a "data"
            entry; "name" and "caption" are optional and default to None.
            The dict is only read, never modified.

    Returns:
        This function returns a dictionary containing:
            - analysis: title, start/end timestamps and duration of the run.
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: scatter matrix of the numeric columns.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package details.
            - sample: dataset samples.
            - duplicates: duplicate rows.

    Raises:
        ValueError: if df is None.
    """
    if df is None:
        raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")

    # A non-DataFrame input is only reported, not rejected.
    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")

    disable_progress_bar = not config.progress_bar

    date_start = datetime.utcnow()

    correlation_names = [
        correlation_name
        for correlation_name in [
            "pearson",
            "spearman",
            "kendall",
            "phi_k",
            "cramers",
        ]
        if config.correlations[correlation_name].calculate
    ]

    # 8 fixed steps below call pbar.update() once each; the per-column and
    # per-correlation steps account for the remainder of the total.
    number_of_tasks = 8 + len(df.columns) + len(correlation_names)

    with tqdm(
        total=number_of_tasks,
        desc="Summarize dataset",
        disable=disable_progress_bar,
    ) as pbar:
        series_description = get_series_descriptions(
            config, df, summarizer, typeset, pbar
        )

        pbar.set_postfix_str("Get variable types")
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        supported_columns = [
            column
            for column, type_name in variables.items()
            if type_name != "Unsupported"
        ]
        interval_columns = [
            column
            for column, type_name in variables.items()
            if type_name == "Numeric"
        ]
        pbar.update()

        # Get correlations
        correlations = {}
        for correlation_name in correlation_names:
            pbar.set_postfix_str(f"Calculate {correlation_name} correlation")
            correlations[correlation_name] = calculate_correlation(
                config, df, correlation_name, series_description
            )
            pbar.update()

        # Drop correlations for which no result was produced (None)
        correlations = {
            key: value for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_matrix = get_scatter_matrix(config, df, interval_columns)
        pbar.update()

        # Table statistics
        pbar.set_postfix_str("Get table statistics")
        table_stats = get_table_stats(config, df, series_description)
        pbar.update()

        # Missing diagrams
        pbar.set_postfix_str("Get missing diagrams")
        missing = get_missing_diagrams(config, df, table_stats)
        pbar.update()

        # Sample
        pbar.set_postfix_str("Take sample")
        if sample is None:
            samples = get_sample(config, df)
        else:
            # Read the optional keys with .get() so that the caller's dict is
            # left untouched (previously missing keys were written back into it).
            samples = [
                Sample(
                    id="custom",
                    data=sample["data"],
                    name=sample.get("name"),
                    caption=sample.get("caption"),
                )
            ]
        pbar.update()

        # Duplicates
        pbar.set_postfix_str("Locating duplicates")
        metrics, duplicates = get_duplicates(config, df, supported_columns)
        # The duplicate metrics are merged into the table-level statistics.
        table_stats.update(metrics)
        pbar.update()

        # Messages
        pbar.set_postfix_str("Get messages/warnings")
        messages = get_messages(config, table_stats, series_description, correlations)
        pbar.update()

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.json(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()

    analysis = {
        "title": config.title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
        # Sample
        "sample": samples,
        # Duplicates
        "duplicates": duplicates,
    }
def describe(
    config: Settings,
    df: pd.DataFrame,
    summarizer: BaseSummarizer,
    typeset: VisionsTypeset,
    sample: Optional[dict] = None,
) -> dict:
    """Build the full description of a DataFrame for the profiling report.

    The progress bar starts with the five fixed steps and its total is
    extended as the number of columns, correlations, scatter plots and
    missing diagrams becomes known.

    Args:
        config: report Settings object
        df: DataFrame.
        summarizer: summarizer object
        typeset: visions typeset
        sample: optional, dict with custom sample

    Returns:
        A dictionary with the following entries:
            - analysis: title, start/end timestamps and duration of the run.
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: scatter plots, keyed by x column and then y column.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - alerts: direct special attention to these patterns in your data.
            - package: package details.
            - sample: dataset samples.
            - duplicates: duplicate rows.

    Raises:
        ValueError: if df is None.
    """
    if df is None:
        raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")

    check_dataframe(df)
    df = preprocess(config, df)

    with tqdm(
        total=5,
        desc="Summarize dataset",
        disable=not config.progress_bar,
        position=0,
    ) as pbar:
        date_start = datetime.utcnow()

        # Per-variable descriptions: one progress step per column.
        pbar.total += len(df.columns)
        series_description = get_series_descriptions(
            config, df, summarizer, typeset, pbar
        )

        # Classify the columns by their detected type.
        pbar.set_postfix_str("Get variable types")
        pbar.total += 1
        supported_columns = []
        interval_columns = []
        for column, description in series_description.items():
            type_name = description["type"]
            if type_name != "Unsupported":
                supported_columns.append(column)
            if type_name == "Numeric":
                interval_columns.append(column)
        pbar.update()

        # Correlations: compute every active one, then keep the non-None results.
        active_correlations = get_active_correlations(config)
        pbar.total += len(active_correlations)
        computed = {}
        for correlation_name in active_correlations:
            tracked_correlation = progress(
                calculate_correlation,
                pbar,
                f"Calculate {correlation_name} correlation",
            )
            computed[correlation_name] = tracked_correlation(
                config, df, correlation_name, series_description
            )
        correlations = {
            name: matrix for name, matrix in computed.items() if matrix is not None
        }

        # Scatter matrix: seed one inner dict per x column, then fill every pair.
        pbar.set_postfix_str("Get scatter matrix")
        scatter_tasks = get_scatter_tasks(config, interval_columns)
        pbar.total += len(scatter_tasks)
        scatter_matrix: Dict[Any, Dict[Any, Any]] = {}
        for x, y in scatter_tasks:
            scatter_matrix[x] = {y: None}
        for x, y in scatter_tasks:
            tracked_scatter = progress(get_scatter_plot, pbar, f"scatter {x}, {y}")
            scatter_matrix[x][y] = tracked_scatter(config, df, x, y, interval_columns)

        # Table statistics
        tracked_table_stats = progress(
            get_table_stats, pbar, "Get dataframe statistics"
        )
        table_stats = tracked_table_stats(config, df, series_description)

        # Missing diagrams: diagrams that come back as None are left out.
        missing_map = get_missing_active(config, table_stats)
        pbar.total += len(missing_map)
        missing = {}
        for name, settings in missing_map.items():
            tracked_missing = progress(
                get_missing_diagram, pbar, f"Missing diagram {name}"
            )
            diagram = tracked_missing(config, df, settings)
            if diagram is not None:
                missing[name] = diagram

        # Sample
        pbar.set_postfix_str("Take sample")
        samples = get_sample(config, df) if sample is None else get_custom_sample(sample)
        pbar.update()

        # Duplicates; their metrics are merged into the table statistics.
        tracked_duplicates = progress(get_duplicates, pbar, "Detecting duplicates")
        metrics, duplicates = tracked_duplicates(config, df, supported_columns)
        table_stats.update(metrics)

        # Alerts
        tracked_alerts = progress(get_alerts, pbar, "Get alerts")
        alerts = tracked_alerts(config, table_stats, series_description, correlations)

        # Reproduction details
        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.json(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()

    return {
        # Analysis metadata
        "analysis": {
            "title": config.title,
            "date_start": date_start,
            "date_end": date_end,
            "duration": date_end - date_start,
        },
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Alerts
        "alerts": alerts,
        # Package
        "package": package,
        # Sample
        "sample": samples,
        # Duplicates
        "duplicates": duplicates,
    }
def describe(title: str, df: pd.DataFrame, sample: Optional[dict] = None) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Settings (progress bar, enabled correlations) are read from the `config`
    object that this function references from the enclosing module scope.

    Args:
        title: report title
        df: DataFrame.
        sample: optional, dict with custom sample. It must provide a "data"
            entry; "name" and "caption" are optional and default to None.
            The dict is only read, never modified.

    Returns:
        This function returns a dictionary containing:
            - analysis: title, start/end timestamps and duration of the run.
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: scatter matrix.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package details.
            - sample: dataset samples.
            - duplicates: duplicate rows.

    Raises:
        ValueError: if df is None or if df is empty.
    """
    if df is None:
        raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")

    # A non-DataFrame input is only reported, not rejected.
    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    disable_progress_bar = not config["progress_bar"].get(bool)

    date_start = datetime.utcnow()

    correlation_names = [
        correlation_name
        for correlation_name in [
            "pearson",
            "spearman",
            "kendall",
            "phi_k",
            "cramers",
        ]
        if config["correlations"][correlation_name]["calculate"].get(bool)
    ]

    # 9 fixed steps below call pbar.update() once each; the per-column and
    # per-correlation steps account for the remainder of the total.
    number_of_tasks = 9 + len(df.columns) + len(correlation_names)

    with tqdm(
        total=number_of_tasks, desc="归纳数据集", disable=disable_progress_bar
    ) as pbar:
        series_description = get_series_descriptions(df, pbar)

        pbar.set_postfix_str("获取变量类型")
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        pbar.update()

        # Transform the series_description in a DataFrame
        pbar.set_postfix_str("Get variable statistics")
        variable_stats = pd.DataFrame(series_description)
        pbar.update()

        # Get correlations
        correlations = {}
        for correlation_name in correlation_names:
            pbar.set_postfix_str(f"计算 {correlation_name} 相关性")
            correlations[correlation_name] = calculate_correlation(
                df, variables, correlation_name
            )
            pbar.update()

        # Drop correlations for which no result was produced (None)
        correlations = {
            key: value for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_matrix = get_scatter_matrix(df, variables)
        pbar.update()

        # Table statistics
        pbar.set_postfix_str("Get table statistics")
        table_stats = get_table_stats(df, variable_stats)
        pbar.update()

        # Missing diagrams
        pbar.set_postfix_str("Get missing diagrams")
        missing = get_missing_diagrams(df, table_stats)
        pbar.update()

        # Sample
        pbar.set_postfix_str("Take sample")
        if sample is None:
            samples = get_sample(df)
        else:
            # Read the optional keys with .get() so that the caller's dict is
            # left untouched (previously missing keys were written back into it).
            samples = [
                Sample(
                    "custom",
                    sample["data"],
                    sample.get("name"),
                    sample.get("caption"),
                )
            ]
        pbar.update()

        # Duplicates: only columns with a supported type are passed along
        pbar.set_postfix_str("Locating duplicates")
        supported_columns = [
            key
            for key, value in series_description.items()
            if value["type"] != Variable.S_TYPE_UNSUPPORTED
        ]
        duplicates = get_duplicates(df, supported_columns)
        pbar.update()

        # Messages
        pbar.set_postfix_str("Get messages/warnings")
        messages = get_messages(table_stats, series_description, correlations)
        pbar.update()

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.dump(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()

    analysis = {
        "title": title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
        # Sample
        "sample": samples,
        # Duplicates
        "duplicates": duplicates,
    }