def create_dataset_info_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = '') -> Tuple[Text, None]:
    """Create the top dataset info section without a section title.

    No additional info will be generated.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), not used, kept for signature consistency

    Returns:
        Tuple[Text, None]
    """
    target = analysis_tracker.get_target_attribute().name
    ml_problem = analysis_tracker.get_job_config().ml_type
    numerical_attributes = analysis_tracker.get_num_attribute_names()
    categorical_attributes = analysis_tracker.get_cat_attribute_names()

    content = template.DATASET_INFO_TEMPLATE.format(
        location=analysis_tracker.get_job_config().datasource.location,
        numerical_attributes=len(numerical_attributes),
        categorical_attributes=len(categorical_attributes),
        target_name=target,
        ml_problem_type=ml_problem)

    return content, None

def create_information_gain_section(analysis_tracker: AnalysisTracker,
                                    figure_base_path: str) -> Union[str, None]:
    """Construct information gain section content for categorical attributes.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), the folder for holding figures

    Returns:
        Union[str, None]
    """
    # extract the information gain analysis result
    # each pair of categorical attributes will have one corresponding analysis
    info_analysis = analysis_tracker.get_analysis(
        run_metadata_pb2.Analysis.Name.Name(
            run_metadata_pb2.Analysis.INFORMATION_GAIN))

    if info_analysis:
        return utils.create_no_order_pair_metric_section(
            analysis_list=info_analysis,
            same_match_value=0.0,
            table_name="Information-Gain",
            figure_base_path=figure_base_path)

    return None

def create_anova_section(
    analysis_tracker: AnalysisTracker
) -> Union[Tuple[str, List[str]], None]:
    """Construct anova section content.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis

    Returns:
        Union[Tuple[str, List[str]], None], (section_content, List[warning])
    """
    warnings = []
    # extract the anova analysis result
    # each pair of numerical and categorical attributes will have
    # one corresponding analysis
    anova_analysis = analysis_tracker.get_analysis(
        run_metadata_pb2.Analysis.Name.Name(run_metadata_pb2.Analysis.ANOVA))

    if anova_analysis:
        table_content = utils.create_order_pair_metric_section(
            analysis_list=anova_analysis, same_match_value='NA')
        for analysis in anova_analysis:
            corr_check = recommendation.check_p_value(analysis)
            if corr_check:
                warnings.append(corr_check)
        if warnings:
            table_content = table_content + utils.create_warning_notes(
                warnings)
        return table_content, warnings

    return None

def create_table_descriptive_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, None], Tuple[None, None]]:
    """Construct descriptive table section content for categorical attributes.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), not used, kept for signature consistency

    Returns:
        Union[Tuple[Text, None], Tuple[None, None]]
    """
    # extract the descriptive table analysis result
    # each pair of categorical attributes will have one corresponding analysis
    analysis_results = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.TABLE_DESCRIPTIVE))

    if analysis_results:
        content = []
        for analysis in analysis_results:
            attributes = [item.name for item in analysis.features][::-1]
            section_title = template.SUB_SUB_SUB_SECTION_TITLE.format(
                content="{} / {}".format(attributes[0], attributes[1]))
            analysis_content_str = utils.create_table_from_table_metric(
                analysis.tmetrics[0])
            content.extend([section_title, analysis_content_str, "\n<br/>\n"])
        return ''.join(content), None

    return None, None

def create_table_descriptive_section(
        analysis_tracker: AnalysisTracker) -> Union[str, None]:
    """Construct descriptive table section content for categorical attributes.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis

    Returns:
        Union[str, None]
    """
    # extract the descriptive table analysis result
    # each pair of categorical attributes will have one corresponding analysis
    analysis_results = analysis_tracker.get_analysis(
        run_metadata_pb2.Analysis.Name.Name(
            run_metadata_pb2.Analysis.TABLE_DESCRIPTIVE))

    if analysis_results:
        content = []
        for analysis in analysis_results:
            attributes = [item.name for item in analysis.features][::-1]
            section_title = template.SUB_SUB_SUB_SECTION_TITLE.format(
                content="{} / {}".format(attributes[0], attributes[1]))
            analysis_content_str = utils.create_table_from_TableMetric(
                analysis.tmetrics[0])
            content.extend([section_title, analysis_content_str, "\n<br/>\n"])
        return ''.join(content)

    return None

def create_information_gain_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text) -> Union[Tuple[Text, None], Tuple[None, None]]:
    """Construct information gain section content for categorical attributes.

    No additional info will be generated.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), the folder for holding figures

    Returns:
        Union[Tuple[Text, None], Tuple[None, None]]
    """
    # extract the information gain analysis result
    # each pair of categorical attributes will have one corresponding analysis
    info_analysis = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.INFORMATION_GAIN))

    if info_analysis:
        content = utils.create_no_order_pair_metric_section(
            analysis_list=info_analysis,
            same_match_value=0.0,
            table_name="Information-Gain",
            figure_base_path=figure_base_path)
        return content, None

    return None, None

def create_dataset_info_section(
        analysis_tracker: AnalysisTracker) -> Union[str, None]:
    """Create the top dataset info section without section title.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis

    Returns:
        Union[str, None]
    """
    target = analysis_tracker.get_target().name
    ml_problem = analysis_tracker.metadata.ml_type
    numerical_attributes = analysis_tracker.get_numerical_attributes()
    categorical_attributes = analysis_tracker.get_categorical_attributes()

    return template.DATASET_INFO_TEMPLATE.format(
        location=analysis_tracker.metadata.datasource.location,
        numerical_attributes=len(numerical_attributes),
        categorical_attributes=len(categorical_attributes),
        target_name=target,
        ml_problem_type=ml_problem)

def create_pearson_correlation_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: str) -> Union[Tuple[str, List[str]], None]:
    """Construct correlation section content for numerical attributes.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), the folder for holding figures

    Returns:
        Union[Tuple[str, List[str]], None], (section_content, List[warning])
    """
    warnings = []
    # extract the correlation analysis result
    # each pair of numerical attributes will have one corresponding analysis
    corr_analysis = analysis_tracker.get_analysis(
        run_metadata_pb2.Analysis.Name.Name(
            run_metadata_pb2.Analysis.PEARSON_CORRELATION))

    if corr_analysis:
        table_content = utils.create_no_order_pair_metric_section(
            analysis_list=corr_analysis,
            same_match_value=1.0,
            table_name="Correlation",
            figure_base_path=figure_base_path)
        for analysis in corr_analysis:
            # correlation condition check
            corr_check = recommendation.check_pearson_correlation(analysis)
            if corr_check:
                warnings.append(corr_check)
        if warnings:
            table_content = table_content + utils.create_warning_notes(
                warnings)
        return table_content, warnings

    return None

def create_chi_square_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Construct chi-square section content.

    If the chi-square test is not performed, None will be returned.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), not used, kept for signature consistency

    Returns:
        Union[Tuple[Text, List[Text]], Tuple[None, None]],
        (section_content, List[warning])
    """
    warnings = []
    # extract the chi-square analysis result
    # each pair of categorical attributes will have
    # one corresponding analysis
    chi_square_analysis = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.CHI_SQUARE))

    if chi_square_analysis:
        table_content = utils.create_no_order_pair_metric_section(
            analysis_list=chi_square_analysis,
            same_match_value='NA',
            figure_base_path='NA')
        for analysis in chi_square_analysis:
            corr_check = recommendation.check_p_value(analysis)
            if corr_check:
                warnings.append(corr_check)
        if warnings:
            table_content = table_content + utils.create_warning_notes(
                warnings)
        return table_content, warnings

    return None, None

def create_anova_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Construct anova section content.

    If anova test is not performed, None will be returned.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), the folder for holding figures

    Returns:
        Union[Tuple[Text, List[Text]], Tuple[None, None]],
        (section_content, List[warning])
    """
    warnings = []
    # extract the anova analysis result
    # each pair of numerical and categorical attributes will have
    # one corresponding analysis
    anova_analysis = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.ANOVA))

    if anova_analysis:
        table_content = utils.create_order_pair_metric_section(
            analysis_list=anova_analysis, same_match_value='NA')
        for analysis in anova_analysis:
            corr_check = recommendation.check_p_value(analysis)
            if corr_check:
                warnings.append(corr_check)
        if warnings:
            table_content = table_content + utils.create_warning_notes(
                warnings)
        return table_content, warnings

    return None, None

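# NOTE: utils.create_warning_notes() is called by several section creators
# above but is not shown in this excerpt. The sketch below is a hypothetical
# illustration of its role -- turning the collected warning strings into a
# block that can be appended to the section content -- and may differ from the
# actual implementation in utils.
def create_warning_notes_sketch(warnings: List[Text]) -> Text:
    """Hypothetical example: format warning strings as markdown note lines."""
    notes = ['\n> **Warning:** {}'.format(warning) for warning in warnings]
    return ''.join(notes) + '\n'
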
class Run:
    """Class of main interface for running analysis"""

    _run_metadata = run_metadata_pb2.AnalysisRun()

    def __init__(self, config_params: argparse.ArgumentParser):
        # Parameter from CLI
        self._config_params = config_params
        self._run_metadata.timestamp_sec = time.time()
        # Load data definition
        self._metadata_def = metadata_loader.load_metadata_def(
            self._config_params)
        self._run_metadata.datasource.CopyFrom(self._metadata_def.datasource)
        self.tracker = AnalysisTracker(self._metadata_def)

        self.report_path = self._config_params.report_path
        self.figure_path = os.path.join(os.path.dirname(self.report_path),
                                        'figure')
        if not os.path.exists(self.figure_path):
            os.makedirs(self.figure_path)

        logging.info(self._metadata_def.datasource)

    def _run_descriptive(self):
        """Run descriptive analysis for both numerical and categorical
        attributes."""
        analyzer = descriptive_analysis.DescriptiveAnalysis(
            self._metadata_def,
            preprocessor_factory.PreprocessorFactory.new_preprocessor(
                self._config_params))

        analyses = list()
        analyses.extend(analyzer.run_numerical_descriptive())
        analyses.extend(
            analyzer.run_numerical_histograms(
                self._metadata_def.histogram_bin))
        analyses.extend(analyzer.run_categorical_descriptive())
        vc_limit = self._metadata_def.value_counts_limit
        analyses.extend(analyzer.run_value_counts(vc_limit))

        for item in analyses:
            self.tracker.add_analysis(item)

        return analyses

    def _categorical_cardinality_check(self):
        """Check whether the cardinality of each categorical column is within
        the specified threshold."""

        def _get_cardinality(attribute):
            descrip_analysis = self.tracker.get_attribute_analysis(
                attribute_name=attribute.name,
                analysis_name=run_metadata_pb2.Analysis.Name.Name(
                    run_metadata_pb2.Analysis.DESCRIPTIVE))
            for metric in descrip_analysis[0].smetrics:
                if metric.name == run_metadata_pb2.ScalarMetric.CARDINALITY:
                    return metric.value
            return None

        valid_list = []
        for att in self._metadata_def.categorical_attributes:
            cardinality = _get_cardinality(att)
            if cardinality <= self._metadata_def.general_cardinality_limit:
                valid_list.append(att)

        self._metadata_def.update_low_card_categorical(valid_list)

    def _run_qualitative(self):
        """Run correlation qualitative analysis for combinations of numerical
        and categorical attributes."""
        analyzer = qualitative_analysis.QualitativeAnalysis(
            self._metadata_def,
            preprocessor_factory.PreprocessorFactory.new_preprocessor(
                self._config_params))

        analyses = list()
        if self._metadata_def.contingency_table_run:
            analyses.extend(analyzer.run_contigency_table())
        if self._metadata_def.table_descriptive_run:
            analyses.extend(analyzer.run_categorical_numerical_descriptive())

        for item in analyses:
            self.tracker.add_analysis(item)

        return analyses

    def _run_quantitative(self):
        """Run correlation quantitative analysis for combinations of numerical
        and categorical attributes."""
        analyzer = quantitative_analysis.QuantitativeAnalysis(
            self._metadata_def,
            preprocessor_factory.PreprocessorFactory.new_preprocessor(
                self._config_params))

        analyses = []
        if self._metadata_def.pearson_corr_run:
            analyses.extend(analyzer.run_pearson_correlation())
        if self._metadata_def.information_gain_run:
            analyses.extend(analyzer.run_information_gain())
        if self._metadata_def.chi_square_run:
            analyses.extend(analyzer.run_chi_square())
        if self._metadata_def.anova_run:
            analyses.extend(analyzer.run_anova())

        for item in analyses:
            self.tracker.add_analysis(item)

        return analyses

    def run_exploratory_data_analysis(self):
        """Run the main exploratory data analysis loop."""
        self._run_metadata.analyses.extend(self._run_descriptive())
        self._categorical_cardinality_check()
        self._run_metadata.analyses.extend(self._run_qualitative())
        self._run_metadata.analyses.extend(self._run_quantitative())

        # pylint: disable-msg=logging-format-interpolation
        logging.info(
            "Numerical attributes: {}\nCategorical attributes: {}".format(
                self.tracker.get_numerical_attributes(),
                self.tracker.get_categorical_attributes()))
        logging.info('All analysis:\n{}'.format(
            self.tracker.get_all_analysis_unique_names()))

        report = report_generator.create_report_md_content(
            analysis_tracker=self.tracker, figure_base_path=self.figure_path)
        logging.info(report)

        with open(self.report_path, 'w') as wf:
            wf.write(report)

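# NOTE: hypothetical usage sketch. This excerpt does not show how Run is
# invoked. Assuming config_params is the parsed CLI namespace carrying at
# least report_path (plus whatever metadata_loader and the preprocessor
# factory expect), an entry point could look roughly like the following.
# Every flag name other than --report_path is an assumption.
def main_sketch():
    parser = argparse.ArgumentParser(
        description='Run exploratory data analysis and write a report.')
    parser.add_argument('--report_path', required=True,
                        help='Path of the generated markdown report.')
    # parse_known_args keeps the sketch tolerant of additional flags
    config_params, _ = parser.parse_known_args()
    Run(config_params).run_exploratory_data_analysis()
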
class AnalysisRun:
    """Class of main interface for running analysis"""

    _analysis_run_metadata = analysis_entity_pb2.AnalysisRun()

    def __init__(self, config_params: argparse.ArgumentParser):
        # Parameter from CLI
        self._config_params = config_params
        self._analysis_run_metadata.timestamp_sec = time.time()
        # Load data definition
        self._job_config = job_config_loader.load_job_config(
            self._config_params)
        self._analysis_run_metadata.datasource.CopyFrom(
            self._job_config.datasource)
        self.tracker = AnalysisTracker(self._job_config)

        self.report_path = self._config_params.report_path
        self.figure_path = os.path.join(os.path.dirname(self.report_path),
                                        'figure')
        if not os.path.exists(self.figure_path):
            os.makedirs(self.figure_path)

        logging.info(self._job_config.datasource)

    def run_parallel_analysis_tasks(self, analysis_task):
        """Run parallel analysis task."""
        task_result = _parallel_runner(
            tasks=analysis_task,
            num_parallel=self._config_params.parallel_thread)

        analysis_list = list()
        for result in task_result:
            if isinstance(result, Analysis):
                analysis_list.append(result)
            else:
                analysis_list.extend(result)

        for analysis in analysis_list:
            self.tracker.add_analysis(analysis)

        return analysis_list

    def _run_descriptive(self):
        """Run descriptive analysis for both numerical and categorical
        attributes."""
        analyzer = descriptive_analysis.DescriptiveAnalysis(
            self._job_config,
            preprocessor_factory.PreprocessorFactory.new_preprocessor(
                self._config_params))

        analysis_tasks = list()
        analysis_tasks.extend(analyzer.numerical_descriptive_tasks())
        h_bin = self._job_config.histogram_bin
        analysis_tasks.extend(analyzer.numerical_histograms_tasks(h_bin))
        analysis_tasks.extend(analyzer.categorical_descriptive_tasks())
        vc_limit = self._job_config.value_counts_limit
        analysis_tasks.extend(analyzer.value_counts_tasks(vc_limit))

        return self.run_parallel_analysis_tasks(analysis_tasks)

    def _categorical_cardinality_check(self):
        """Check whether the cardinality of each categorical column is within
        the specified threshold."""

        def _get_cardinality(attribute):
            descrip_analysis = self.tracker.get_analysis_by_attribute_and_name(
                attribute_name=attribute.name,
                analysis_name=Analysis.Name.Name(Analysis.DESCRIPTIVE))
            for metric in descrip_analysis[0].smetrics:
                if metric.name == ScalarMetric.CARDINALITY:
                    return metric.value
            return None

        valid_list = []
        for att in self._job_config.categorical_attributes:
            cardinality = _get_cardinality(att)
            if cardinality <= self._job_config.general_cardinality_limit:
                valid_list.append(att)

        self._job_config.update_low_card_categorical(valid_list)

    def _qualitative_tasks(self):
        """Build qualitative correlation analysis tasks for combinations of
        numerical and categorical attributes."""
        analyzer = qualitative_analysis.QualitativeAnalysis(
            self._job_config,
            preprocessor_factory.PreprocessorFactory.new_preprocessor(
                self._config_params))

        analysis_tasks = list()
        if self._job_config.contingency_table_run:
            analysis_tasks.extend(analyzer.contingency_table_tasks())
        if self._job_config.table_descriptive_run:
            analysis_tasks.extend(
                analyzer.categorical_numerical_descriptive_tasks())

        return analysis_tasks

    def _quantitative_tasks(self):
        """Build quantitative correlation analysis tasks for combinations of
        numerical and categorical attributes."""
        analyzer = quantitative_analysis.QuantitativeAnalysis(
            self._job_config,
            preprocessor_factory.PreprocessorFactory.new_preprocessor(
                self._config_params))

        analysis_tasks = list()
        if self._job_config.pearson_corr_run:
            analysis_tasks.extend(analyzer.pearson_correlation_tasks())
        if self._job_config.information_gain_run:
            analysis_tasks.extend(analyzer.information_gain_tasks())
        if self._job_config.chi_square_run:
            analysis_tasks.extend(
                analyzer.chi_square_tasks(self._config_params.sampling_rate))
        if self._job_config.anova_run:
            analysis_tasks.extend(
                analyzer.anova_tasks(self._config_params.sampling_rate))

        return analysis_tasks

    def _generate_and_write_report(self):
        # generate markdown report
        md_report = report_generator.create_md_report(
            analysis_tracker=self.tracker,
            figure_base_path=self.figure_path,
            config_params=self._config_params)
        md_report_path = os.path.join(self.report_path, MD_FILE_NAME)
        # write report to a file
        with open(md_report_path, 'w') as wf:
            wf.write(md_report)
        logging.debug(md_report)
        logging.info('Markdown report generated successfully.')

        # generate html report
        html_report = report_generator.create_html_report_from_markdown(
            markdown_content=md_report)
        html_report_path = os.path.join(self.report_path, HTML_FILE_NAME)
        with open(html_report_path, 'w') as wf:
            wf.write(html_report)
        logging.debug(html_report)
        logging.info('HTML report generated successfully.')

        # generate pdf file if wkhtmltopdf is installed
        if which('wkhtmltopdf'):
            pdf_file_path = os.path.join(self.report_path, PDF_FILE_NAME)
            call(['wkhtmltopdf', '--enable-local-file-access',
                  html_report_path, pdf_file_path])
        else:
            logging.info(
                "wkhtmltopdf is not detected, pdf report won't be generated.")

    def _export_analysis_results(self):
        result_dict = self.tracker.export_to_dict()
        report_folder = os.path.dirname(self.report_path)
        export_file = os.path.join(report_folder, AR_FILE_NAME)
        with open(export_file, 'wb') as wf:
            pickle.dump(result_dict, wf)

    def run_exploratory_data_analysis(self):
        """Run the main exploratory data analysis loop."""
        self._analysis_run_metadata.analyses.extend(self._run_descriptive())
        self._categorical_cardinality_check()

        non_descriptive_tasks = self._qualitative_tasks()
        non_descriptive_tasks.extend(self._quantitative_tasks())
        self._analysis_run_metadata.analyses.extend(
            self.run_parallel_analysis_tasks(non_descriptive_tasks))

        # pylint: disable-msg=logging-format-interpolation
        logging.info("""Numerical attributes: {}
Categorical attributes: {}""".format(
            self.tracker.get_num_attribute_names(),
            self.tracker.get_cat_attribute_names()))
        logging.info('All analysis:\n{}'.format(
            self.tracker.get_all_analysis_unique_names()))

        self._generate_and_write_report()

        # export the analysis results
        if self._config_params.export_result:
            self._export_analysis_results()

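# NOTE: _parallel_runner() is used by AnalysisRun.run_parallel_analysis_tasks()
# above but is not shown in this excerpt. The sketch below is an illustrative
# assumption of what such a helper could look like, assuming each task is a
# zero-argument callable that returns either an Analysis proto or a list of
# them; the real implementation may differ (e.g. process- rather than
# thread-based parallelism).
from concurrent import futures


def _parallel_runner_sketch(tasks, num_parallel):
    """Hypothetical example: run analysis tasks in a thread pool."""
    with futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
        # submit every task and collect results in submission order
        running = [executor.submit(task) for task in tasks]
        return [job.result() for job in running]
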
def create_descriptive_section(analysis_tracker: AnalysisTracker,
                               figure_base_path: str) -> (str, List[str]):
    """Create descriptive section of the report.

    Args:
        analysis_tracker: (AnalysisTracker)
        figure_base_path: (string), the folder for holding figures

    Returns:
        (str, List[str]), (section_content, List[warnings])
    """
    numerical_attributes = analysis_tracker.get_numerical_attributes()
    categorical_attributes = analysis_tracker.get_categorical_attributes()

    # holders for section content and warnings based on descriptive analysis
    contents = []
    warnings = []
    section_template = template.TABLE_DESCRIPTIVE_TEMPLATE

    for att in numerical_attributes:
        # base analysis is one holding basic descriptive statistics
        base_analysis = analysis_tracker.get_attribute_analysis(
            att, run_metadata_pb2.Analysis.Name.Name(
                run_metadata_pb2.Analysis.DESCRIPTIVE))[0]
        # additional analysis is one holding histogram for numerical attribute
        additional_analysis = analysis_tracker.get_attribute_analysis(
            att, run_metadata_pb2.Analysis.Name.Name(
                run_metadata_pb2.Analysis.HISTOGRAM))[0]
        contents.append(
            utils.create_table_descriptive_row_from_analysis(
                attribute_name=att,
                base_analysis=base_analysis,
                additional_analysis=additional_analysis,
                figure_base_path=figure_base_path))
        # check missing value condition
        missing_check = recommendation.check_missing(att, base_analysis)
        if missing_check:
            warnings.append(missing_check)

    for att in categorical_attributes:
        # base analysis is one holding basic descriptive statistics
        base_analysis = analysis_tracker.get_attribute_analysis(
            att, run_metadata_pb2.Analysis.Name.Name(
                run_metadata_pb2.Analysis.DESCRIPTIVE))[0]
        # additional analysis is one holding value counts
        # for categorical attribute
        additional_analysis = analysis_tracker.get_attribute_analysis(
            att, run_metadata_pb2.Analysis.Name.Name(
                run_metadata_pb2.Analysis.VALUE_COUNTS))[0]
        contents.append(
            utils.create_table_descriptive_row_from_analysis(
                attribute_name=att,
                base_analysis=base_analysis,
                additional_analysis=additional_analysis,
                figure_base_path=figure_base_path))
        # check missing value condition
        missing_check = recommendation.check_missing(att, base_analysis)
        if missing_check:
            warnings.append(missing_check)
        # check cardinality condition
        cardinality_check = recommendation.check_cardinality(
            att, base_analysis)
        if cardinality_check:
            warnings.append(cardinality_check)

    table_content = section_template.format(row_content=''.join(contents))
    if warnings:
        table_content = table_content + utils.create_warning_notes(warnings)

    return table_content, warnings

def create_target_highlight_section(
    analysis_tracker: AnalysisTracker
) -> Union[Tuple[str, List[str]], None]:
    """Create the section highlighting the correlation analysis performed
    between the target and other attributes.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis

    Returns:
        Union[Tuple[str, List[str]], None], (section_content, List[warning])
    """

    # pylint: disable-msg=too-many-locals
    def _other_attribute_name(target_name: str,
                              analysis: run_metadata_pb2.Analysis) -> str:
        attribute_name = [
            att.name for att in analysis.features if att.name != target_name
        ][0]
        return attribute_name

    def _check_analysis(analysis_list: List[List[run_metadata_pb2.Analysis]]):
        for item in analysis_list:
            for analysis in item:
                if analysis.name in checking_map:
                    if checking_map[analysis.name](analysis):
                        yield _other_attribute_name(target, analysis)

    def _consolidate_analysis(metric_names, analysis_tracker):
        revised_names = []
        analysis_list = []
        for name in metric_names:
            analysis = analysis_tracker.get_attribute_analysis(target, name)
            if analysis:
                revised_names.append(name)
                analysis_list.append(analysis)
        return revised_names, analysis_list

    checking_map = {
        run_metadata_pb2.Analysis.ANOVA:
            recommendation.check_p_value,
        run_metadata_pb2.Analysis.PEARSON_CORRELATION:
            recommendation.check_pearson_correlation,
        run_metadata_pb2.Analysis.CHI_SQUARE:
            recommendation.check_p_value
    }

    target = analysis_tracker.get_target().name
    ml_problem = analysis_tracker.metadata.ml_type
    recommend_features = []

    # pylint: disable-msg=no-else-return
    if ml_problem == c.metadata.ml_type.NULL:
        return None
    else:
        if ml_problem == c.metadata.ml_type.REGRESSION:
            # Correlation for numerical attributes
            # ANOVA for categorical attributes
            numerical_metric_names = [
                run_metadata_pb2.Analysis.Name.Name(
                    run_metadata_pb2.Analysis.PEARSON_CORRELATION)
            ]
            categorical_metric_names = [
                run_metadata_pb2.Analysis.Name.Name(
                    run_metadata_pb2.Analysis.ANOVA)
            ]
        elif ml_problem == c.metadata.ml_type.CLASSIFICATION:
            # ANOVA for numerical attributes
            # IG and Chi-square for categorical attributes
            numerical_metric_names = [
                run_metadata_pb2.Analysis.Name.Name(
                    run_metadata_pb2.Analysis.ANOVA)
            ]
            categorical_metric_names = [
                run_metadata_pb2.Analysis.Name.Name(
                    run_metadata_pb2.Analysis.INFORMATION_GAIN),
                run_metadata_pb2.Analysis.Name.Name(
                    run_metadata_pb2.Analysis.CHI_SQUARE)
            ]
        else:
            raise ValueError('The ML problem type is not supported')

        section_content = [
            "**Target:** {}\n".format(analysis_tracker.get_target().name)
        ]

        r_numerical_metrics, r_numerical_analysis = \
            _consolidate_analysis(numerical_metric_names, analysis_tracker)
        r_categorical_metrics, r_categorical_analysis = \
            _consolidate_analysis(categorical_metric_names, analysis_tracker)

        if r_numerical_metrics:
            section_content.append(
                template.SUB_SUB_SECTION_TITLE.format(
                    content="Numerical features and target"))
            # recommendation based on checking results
            recommend_features.extend(_check_analysis(r_numerical_analysis))
            numerical_highlight = utils.create_target_metrics_highlight(
                target_name=target,
                metric_name_list=r_numerical_metrics,
                metric_analysis_list=r_numerical_analysis)
            section_content.append(numerical_highlight)

        if r_categorical_metrics:
            section_content.append(
                template.SUB_SUB_SECTION_TITLE.format(
                    content="Categorical features and target"))
            # recommendation based on checking results
            recommend_features.extend(_check_analysis(r_categorical_analysis))
            categorical_highlight = utils.create_target_metrics_highlight(
                target_name=target,
                metric_name_list=r_categorical_metrics,
                metric_analysis_list=r_categorical_analysis)
            section_content.append(categorical_highlight)

        return ''.join(section_content), recommend_features

def create_target_highlight_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Create the section highlighting the correlation analysis performed
    between the target and other attributes.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), not used, kept for signature consistency

    Returns:
        Union[Tuple[Text, List[Text]], Tuple[None, None]],
        (section_content, List[warning])
    """

    # pylint: disable-msg=too-many-locals
    def _other_attribute_name(target_name: str,
                              analysis: analysis_entity_pb2.Analysis) -> str:
        attribute_name = [
            att.name for att in analysis.features if att.name != target_name
        ][0]
        return attribute_name

    def _check_analysis(
            analysis_list: List[List[analysis_entity_pb2.Analysis]]):
        for item in analysis_list:
            for analysis in item:
                if analysis.name in checking_map:
                    if checking_map[analysis.name](analysis):
                        yield _other_attribute_name(target, analysis)

    def _consolidate_analysis(metric_names, analysis_tracker):
        revised_names = []
        analysis_list = []
        for name in metric_names:
            analysis = analysis_tracker.get_analysis_by_attribute_and_name(
                target, name)
            if analysis:
                revised_names.append(name)
                analysis_list.append(analysis)
        return revised_names, analysis_list

    checking_map = {
        Analysis.ANOVA: recommendation.check_p_value,
        Analysis.PEARSON_CORRELATION:
            recommendation.check_pearson_correlation,
        Analysis.CHI_SQUARE: recommendation.check_p_value
    }

    target = analysis_tracker.get_target_attribute().name
    ml_problem = analysis_tracker.get_job_config().ml_type

    # pylint: disable-msg=no-else-return
    if ml_problem == c.ml_type.NULL:
        return None, None
    else:
        if ml_problem == c.ml_type.REGRESSION:
            target_type = c.datasource.TYPE_NUMERICAL
            # Correlation for numerical attributes
            # ANOVA for categorical attributes
            numerical_metric_names = [
                Analysis.Name.Name(Analysis.PEARSON_CORRELATION)
            ]
            categorical_metric_names = [Analysis.Name.Name(Analysis.ANOVA)]
        elif ml_problem == c.ml_type.CLASSIFICATION:
            target_type = c.datasource.TYPE_CATEGORICAL
            # ANOVA for numerical attributes
            # IG and Chi-square for categorical attributes
            numerical_metric_names = [Analysis.Name.Name(Analysis.ANOVA)]
            categorical_metric_names = [
                Analysis.Name.Name(Analysis.INFORMATION_GAIN),
                Analysis.Name.Name(Analysis.CHI_SQUARE)
            ]
        else:
            raise ValueError('The ML problem type is not supported')

        recommend_features = []
        section_content = []

        r_numerical_metrics, r_numerical_analysis = \
            _consolidate_analysis(numerical_metric_names, analysis_tracker)
        r_categorical_metrics, r_categorical_analysis = \
            _consolidate_analysis(categorical_metric_names, analysis_tracker)

        if r_numerical_metrics:
            section_content.append(
                template.SUB_SUB_SECTION_TITLE.format(
                    content="Numerical features and target"))
            # recommendation based on checking results
            recommend_features.extend(_check_analysis(r_numerical_analysis))
            numerical_highlight = utils.create_target_metrics_highlight(
                target_name=target,
                metric_name_list=r_numerical_metrics,
                metric_analysis_list=r_numerical_analysis)
            section_content.append(numerical_highlight)

        if r_categorical_metrics:
            section_content.append(
                template.SUB_SUB_SECTION_TITLE.format(
                    content="Categorical features and target"))
            # recommendation based on checking results
            recommend_features.extend(_check_analysis(r_categorical_analysis))
            categorical_highlight = utils.create_target_metrics_highlight(
                target_name=target,
                metric_name_list=r_categorical_metrics,
                metric_analysis_list=r_categorical_analysis)
            section_content.append(categorical_highlight)

        if not section_content:
            return None, None
        else:
            target_str_template = template.TARGET_HEADLINE_TEMPLATE
            target_str = target_str_template.format(target=target,
                                                    target_type=target_type)
            section_content.insert(0, target_str)
            return ''.join(section_content), recommend_features

def create_descriptive_section(
        analysis_tracker: AnalysisTracker,
        figure_base_path: Text) -> Tuple[Text, List[Text]]:
    """Create descriptive section of the report.

    Checking based on the descriptive results will be performed, e.g.,
    missing values and high cardinality.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), the folder for holding figures

    Returns:
        Tuple[Text, List[Text]], (section_content, List[warnings])
    """
    numerical_attributes = analysis_tracker.get_num_attribute_names()
    categorical_attributes = analysis_tracker.get_cat_attribute_names()

    # holders for section content and warnings based on descriptive analysis
    contents = []
    warnings = []
    section_template = template.TABLE_DESCRIPTIVE_TEMPLATE

    for att in numerical_attributes:
        # base analysis is one holding basic descriptive statistics
        base_analysis = analysis_tracker.get_analysis_by_attribute_and_name(
            att, Analysis.Name.Name(Analysis.DESCRIPTIVE))[0]
        # additional analysis is one holding histogram for numerical attribute
        additional_analysis = \
            analysis_tracker.get_analysis_by_attribute_and_name(
                att, Analysis.Name.Name(Analysis.HISTOGRAM))[0]
        contents.append(
            utils.create_table_descriptive_row_from_analysis(
                attribute_name=att,
                base_analysis=base_analysis,
                additional_analysis=additional_analysis,
                figure_base_path=figure_base_path))
        # check missing value condition
        missing_check = recommendation.check_missing(att, base_analysis)
        if missing_check:
            warnings.append(missing_check)

    for att in categorical_attributes:
        # base analysis is one holding basic descriptive statistics
        base_analysis = analysis_tracker.get_analysis_by_attribute_and_name(
            att, Analysis.Name.Name(Analysis.DESCRIPTIVE))[0]
        # additional analysis is one holding value counts
        # for categorical attribute
        additional_analysis = \
            analysis_tracker.get_analysis_by_attribute_and_name(
                att, Analysis.Name.Name(Analysis.VALUE_COUNTS))[0]
        contents.append(
            utils.create_table_descriptive_row_from_analysis(
                attribute_name=att,
                base_analysis=base_analysis,
                additional_analysis=additional_analysis,
                figure_base_path=figure_base_path))
        # check missing value condition
        missing_check = recommendation.check_missing(att, base_analysis)
        if missing_check:
            warnings.append(missing_check)
        # check cardinality condition
        cardinality_check = recommendation.check_cardinality(
            att, base_analysis)
        if cardinality_check:
            warnings.append(cardinality_check)

    # finally all the descriptive analysis results are organised in a table
    table_content = section_template.format(row_content=''.join(contents))
    if warnings:
        table_content = table_content + utils.create_warning_notes(warnings)

    return table_content, warnings

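# NOTE: illustrative sketch only. The newer section creators above share a
# uniform (analysis_tracker, figure_base_path) -> (content, extra) signature,
# which is why otherwise unused parameters are kept "for signature
# consistency". A report driver could therefore iterate over them generically,
# roughly as below. SECTION_CREATORS and build_report_body() are hypothetical
# examples, not the actual report_generator API, which may add section titles
# and ordering of its own.
SECTION_CREATORS = [
    create_dataset_info_section,
    create_descriptive_section,
    create_table_descriptive_section,
    create_information_gain_section,
    create_chi_square_section,
    create_anova_section,
    create_target_highlight_section,
]


def build_report_body(analysis_tracker: AnalysisTracker,
                      figure_base_path: Text) -> Tuple[Text, List[Text]]:
    """Hypothetical helper: concatenate section contents, collect warnings."""
    contents = []
    warnings = []
    for creator in SECTION_CREATORS:
        content, extra = creator(analysis_tracker, figure_base_path)
        if content:
            contents.append(content)
        # the second element is either a list of warnings or None
        if extra:
            warnings.extend(extra)
    return '\n'.join(contents), warnings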