コード例 #1
0
def create_dataset_info_section(
        analysis_tracker: AnalysisTracker,
        figure_base_path: Text = '') -> Tuple[Text, None]:
    """Render the dataset overview placed at the top of the report.

    Only the body is produced (no section title) and no additional info
    accompanies it.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), not used, kept for signature consistency

    Returns:
        Tuple[Text, None], the rendered overview and a None placeholder
    """
    target_name = analysis_tracker.get_target_attribute().name
    problem_type = analysis_tracker.get_job_config().ml_type
    numerical_names = analysis_tracker.get_num_attribute_names()
    categorical_names = analysis_tracker.get_cat_attribute_names()

    # Collect the template fields first so the format call stays short.
    template_fields = {
        'location': analysis_tracker.get_job_config().datasource.location,
        'numerical_attributes': len(numerical_names),
        'categorical_attributes': len(categorical_names),
        'target_name': target_name,
        'ml_problem_type': problem_type,
    }
    return template.DATASET_INFO_TEMPLATE.format(**template_fields), None
コード例 #2
0
def create_information_gain_section(analysis_tracker: AnalysisTracker,
                                    figure_base_path: str) -> Union[str, None]:
    """Build the information gain section for categorical attributes.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), the folder for holding figures

    Returns:
        Union[str, None], the section content, or None when the tracker
        holds no information gain analysis
    """
    # One analysis is expected per pair of categorical attributes.
    analysis_name = run_metadata_pb2.Analysis.Name.Name(
        run_metadata_pb2.Analysis.INFORMATION_GAIN)
    ig_results = analysis_tracker.get_analysis(analysis_name)

    # Nothing tracked under that name: there is no section to render.
    if not ig_results:
        return None

    return utils.create_no_order_pair_metric_section(
        analysis_list=ig_results,
        same_match_value=0.0,
        table_name="Information-Gain",
        figure_base_path=figure_base_path)
コード例 #3
0
def create_anova_section(
        analysis_tracker: AnalysisTracker
) -> Union[Tuple[str, List[str]], None]:
    """Build the anova section of the report.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis

    Returns:
        Union[Tuple[str, List[str]], None], (section_content, List[warning]),
        or None when the tracker holds no anova analysis
    """
    # One analysis is expected per (numerical, categorical) attribute pair.
    anova_results = analysis_tracker.get_analysis(
        run_metadata_pb2.Analysis.Name.Name(run_metadata_pb2.Analysis.ANOVA))
    if not anova_results:
        return None

    section = utils.create_order_pair_metric_section(
        analysis_list=anova_results, same_match_value='NA')

    # Keep only the p-value checks that actually produced a note.
    check_results = (
        recommendation.check_p_value(item) for item in anova_results)
    warnings = [note for note in check_results if note]

    if warnings:
        section = section + utils.create_warning_notes(warnings)

    return section, warnings
コード例 #4
0
def create_table_descriptive_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, None], Tuple[None, None]]:
    """Build the descriptive table section from the TABLE_DESCRIPTIVE results.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), not used, kept for signature consistency

    Returns:
        Union[Tuple[Text, None], Tuple[None, None]], (section_content, None),
        or (None, None) when the tracker holds no such analysis
    """
    table_results = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.TABLE_DESCRIPTIVE))
    if not table_results:
        return None, None

    pieces = []
    for result in table_results:
        # The sub-section title lists the features in reverse stored order.
        feature_names = [feature.name for feature in result.features]
        feature_names.reverse()
        pair_label = "{} / {}".format(feature_names[0], feature_names[1])

        title = template.SUB_SUB_SUB_SECTION_TITLE.format(content=pair_label)
        # Only the first table metric of each analysis is rendered.
        table = utils.create_table_from_table_metric(result.tmetrics[0])
        pieces += [title, table, "\n<br/>\n"]

    return ''.join(pieces), None
コード例 #5
0
def create_table_descriptive_section(
        analysis_tracker: AnalysisTracker) -> Union[str, None]:
    """Build the descriptive table section from the TABLE_DESCRIPTIVE results.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis

    Returns:
        Union[str, None], the section content, or None when the tracker
        holds no such analysis
    """
    analysis_name = run_metadata_pb2.Analysis.Name.Name(
        run_metadata_pb2.Analysis.TABLE_DESCRIPTIVE)
    table_results = analysis_tracker.get_analysis(analysis_name)
    if not table_results:
        return None

    pieces = []
    for result in table_results:
        # The sub-section title lists the features in reverse stored order.
        feature_names = [feature.name for feature in result.features]
        feature_names.reverse()
        pair_label = "{} / {}".format(feature_names[0], feature_names[1])

        pieces.append(
            template.SUB_SUB_SUB_SECTION_TITLE.format(content=pair_label))
        # Only the first table metric of each analysis is rendered.
        pieces.append(utils.create_table_from_TableMetric(result.tmetrics[0]))
        pieces.append("\n<br/>\n")

    return ''.join(pieces)
コード例 #6
0
def create_information_gain_section(
        analysis_tracker: AnalysisTracker,
        figure_base_path: Text) -> Union[Tuple[Text, None], Tuple[None, None]]:
    """Build the information gain section for categorical attributes.

    No additional info is generated, so the second tuple element is
    always None.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), the folder for holding figures

    Returns:
        Union[Tuple[Text, None], Tuple[None, None]], (section_content, None),
        or (None, None) when the tracker holds no information gain analysis
    """
    # One analysis is expected per pair of categorical attributes.
    ig_results = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.INFORMATION_GAIN))

    # Nothing tracked under that name: there is no section to render.
    if not ig_results:
        return None, None

    section = utils.create_no_order_pair_metric_section(
        analysis_list=ig_results,
        same_match_value=0.0,
        table_name="Information-Gain",
        figure_base_path=figure_base_path)
    return section, None
コード例 #7
0
  def __init__(self, config_params: argparse.ArgumentParser):
    """Load the job configuration and prepare the output locations.

    Args:
      config_params: parsed CLI parameters; `report_path` is read here and
        the whole object is handed to the job config loader.
        NOTE(review): annotated as ArgumentParser although it is used like
        a parsed namespace - confirm the intended type.
    """
    # Parameter from CLI
    self._config_params = config_params
    self._analysis_run_metadata.timestamp_sec = time.time()

    # Load data definition
    self._job_config = job_config_loader.load_job_config(self._config_params)
    self._analysis_run_metadata.datasource.CopyFrom(self._job_config.datasource)
    self.tracker = AnalysisTracker(self._job_config)

    # Figures are stored in a 'figure' folder next to the report path.
    self.report_path = self._config_params.report_path
    self.figure_path = os.path.join(os.path.dirname(self.report_path), 'figure')
    # exist_ok replaces the former exists()/makedirs() pair, which could fail
    # when the folder appeared between the check and the creation.
    os.makedirs(self.figure_path, exist_ok=True)

    logging.info(self._job_config.datasource)
コード例 #8
0
def create_dataset_info_section(
        analysis_tracker: AnalysisTracker) -> Union[str, None]:
    """Render the dataset overview placed at the top of the report.

    Only the body is produced, without a section title.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis

    Returns:
        Union[str, None], the rendered overview
    """
    target_name = analysis_tracker.get_target().name
    problem_type = analysis_tracker.metadata.ml_type
    numerical_names = analysis_tracker.get_numerical_attributes()
    categorical_names = analysis_tracker.get_categorical_attributes()

    # Collect the template fields first so the format call stays short.
    template_fields = {
        'location': analysis_tracker.metadata.datasource.location,
        'numerical_attributes': len(numerical_names),
        'categorical_attributes': len(categorical_names),
        'target_name': target_name,
        'ml_problem_type': problem_type,
    }
    return template.DATASET_INFO_TEMPLATE.format(**template_fields)
コード例 #9
0
def create_pearson_correlation_section(
        analysis_tracker: AnalysisTracker,
        figure_base_path: str) -> Union[Tuple[str, List[str]], None]:
    """Build the correlation section for numerical attributes.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), the folder for holding figures

    Returns:
        Union[Tuple[str, List[str]], None], (section_content, List[warning]),
        or None when the tracker holds no pearson correlation analysis
    """
    # One analysis is expected per pair of numerical attributes.
    analysis_name = run_metadata_pb2.Analysis.Name.Name(
        run_metadata_pb2.Analysis.PEARSON_CORRELATION)
    correlation_results = analysis_tracker.get_analysis(analysis_name)
    if not correlation_results:
        return None

    section = utils.create_no_order_pair_metric_section(
        analysis_list=correlation_results,
        same_match_value=1.0,
        table_name="Correlation",
        figure_base_path=figure_base_path)

    # Keep only the correlation checks that actually produced a note.
    check_results = (
        recommendation.check_pearson_correlation(item)
        for item in correlation_results)
    warnings = [note for note in check_results if note]

    if warnings:
        section = section + utils.create_warning_notes(warnings)

    return section, warnings
コード例 #10
0
def create_chi_square_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Build the chi-square section of the report.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), not used, kept for signature consistency

    Returns:
        Union[Tuple[Text, List[Text]], Tuple[None, None]],
        (section_content, List[warning]), or (None, None) when the tracker
        holds no chi-square analysis
    """
    # One analysis is expected per pair of categorical attributes.
    chi_square_results = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.CHI_SQUARE))
    if not chi_square_results:
        return None, None

    # The literal 'NA' is handed over as the figure path; the function's own
    # figure_base_path argument is deliberately ignored.
    section = utils.create_no_order_pair_metric_section(
        analysis_list=chi_square_results,
        same_match_value='NA',
        figure_base_path='NA')

    # Keep only the p-value checks that actually produced a note.
    check_results = (
        recommendation.check_p_value(item) for item in chi_square_results)
    warnings = [note for note in check_results if note]

    if warnings:
        section = section + utils.create_warning_notes(warnings)

    return section, warnings
コード例 #11
0
def create_anova_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Build the anova section of the report.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis
        figure_base_path: (string), accepted but not read by this function

    Returns:
        Union[Tuple[Text, List[Text]], Tuple[None, None]],
        (section_content, List[warning]), or (None, None) when the tracker
        holds no anova analysis
    """
    # One analysis is expected per (numerical, categorical) attribute pair.
    anova_results = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.ANOVA))
    if not anova_results:
        return None, None

    section = utils.create_order_pair_metric_section(
        analysis_list=anova_results, same_match_value='NA')

    # Keep only the p-value checks that actually produced a note.
    check_results = (
        recommendation.check_p_value(item) for item in anova_results)
    warnings = [note for note in check_results if note]

    if warnings:
        section = section + utils.create_warning_notes(warnings)

    return section, warnings
コード例 #12
0
class Run:
    """Class of main interface for running analysis.

    Each instance owns its own AnalysisRun metadata message, its own
    AnalysisTracker and the report/figure locations derived from the CLI
    parameters.
    """

    def __init__(self, config_params: argparse.ArgumentParser):
        """Load the metadata definition and prepare the output locations.

        Args:
            config_params: parsed CLI parameters; `report_path` is read here
                and the whole object is handed to the metadata loader and
                the preprocessor factory.
                NOTE(review): annotated as ArgumentParser although it is
                used like a parsed namespace - confirm the intended type.
        """
        # Parameter from CLI
        self._config_params = config_params

        # The metadata message is created per instance: as a class attribute
        # it was shared, so the analyses collected by one Run leaked into
        # every other Run created in the same process.
        self._run_metadata = run_metadata_pb2.AnalysisRun()
        self._run_metadata.timestamp_sec = time.time()

        # Load data definition
        self._metadata_def = metadata_loader.load_metadata_def(
            self._config_params)
        self._run_metadata.datasource.CopyFrom(self._metadata_def.datasource)
        self.tracker = AnalysisTracker(self._metadata_def)

        # Figures are stored in a 'figure' folder next to the report file.
        self.report_path = self._config_params.report_path
        self.figure_path = os.path.join(os.path.dirname(self.report_path),
                                        'figure')
        # exist_ok replaces the former exists()/makedirs() pair, which could
        # fail when the folder appeared between the check and the creation.
        os.makedirs(self.figure_path, exist_ok=True)

        logging.info(self._metadata_def.datasource)

    def _run_descriptive(self):
        """Run descriptive analysis for both numerical and categorical
        attributes.

        Returns:
            The list of produced analyses; each one is also registered with
            the tracker.
        """
        analyzer = descriptive_analysis.DescriptiveAnalysis(
            self._metadata_def,
            preprocessor_factory.PreprocessorFactory.new_preprocessor(
                self._config_params))

        analyses = []

        analyses.extend(analyzer.run_numerical_descriptive())
        analyses.extend(
            analyzer.run_numerical_histograms(
                self._metadata_def.histogram_bin))

        analyses.extend(analyzer.run_categorical_descriptive())
        vc_limit = self._metadata_def.value_counts_limit
        analyses.extend(analyzer.run_value_counts(vc_limit))

        for item in analyses:
            self.tracker.add_analysis(item)

        return analyses

    def _categorical_cardinality_check(self):
        """Keep only the categorical attributes whose cardinality is within
        the configured limit.

        The cardinality is read from the DESCRIPTIVE analysis tracked for
        each attribute. Attributes without a cardinality metric are skipped
        with a warning, because their cardinality cannot be compared with
        the limit.
        """
        def _get_cardinality(attribute):
            descrip_analysis = self.tracker.get_attribute_analysis(
                attribute_name=attribute.name,
                analysis_name=run_metadata_pb2.Analysis.Name.Name(
                    run_metadata_pb2.Analysis.DESCRIPTIVE))
            # No descriptive analysis tracked: cardinality is unknown.
            if not descrip_analysis:
                return None
            for metric in descrip_analysis[0].smetrics:
                if metric.name == run_metadata_pb2.ScalarMetric.CARDINALITY:
                    return metric.value
            return None

        cardinality_limit = self._metadata_def.general_cardinality_limit
        valid_list = []

        for att in self._metadata_def.categorical_attributes:
            cardinality = _get_cardinality(att)
            if cardinality is None:
                # Comparing None with the limit would raise TypeError.
                logging.warning(
                    'No cardinality metric found for attribute %s, '
                    'it is excluded from the low cardinality list.',
                    att.name)
                continue
            if cardinality <= cardinality_limit:
                valid_list.append(att)

        self._metadata_def.update_low_card_categorical(valid_list)

    def _run_qualitative(self):
        """Run the qualitative analyses enabled in the metadata definition
        (contingency table, table descriptive).

        Returns:
            The list of produced analyses; each one is also registered with
            the tracker.
        """
        analyzer = qualitative_analysis.QualitativeAnalysis(
            self._metadata_def,
            preprocessor_factory.PreprocessorFactory.new_preprocessor(
                self._config_params))

        analyses = []
        if self._metadata_def.contingency_table_run:
            analyses.extend(analyzer.run_contigency_table())
        if self._metadata_def.table_descriptive_run:
            analyses.extend(analyzer.run_categorical_numerical_descriptive())

        for item in analyses:
            self.tracker.add_analysis(item)

        return analyses

    def _run_quantitative(self):
        """Run the quantitative analyses enabled in the metadata definition
        (pearson correlation, information gain, chi-square, anova).

        Returns:
            The list of produced analyses; each one is also registered with
            the tracker.
        """
        analyzer = quantitative_analysis.QuantitativeAnalysis(
            self._metadata_def,
            preprocessor_factory.PreprocessorFactory.new_preprocessor(
                self._config_params))

        analyses = []
        if self._metadata_def.pearson_corr_run:
            analyses.extend(analyzer.run_pearson_correlation())
        if self._metadata_def.information_gain_run:
            analyses.extend(analyzer.run_information_gain())
        if self._metadata_def.chi_square_run:
            analyses.extend(analyzer.run_chi_square())
        if self._metadata_def.anova_run:
            analyses.extend(analyzer.run_anova())

        for item in analyses:
            self.tracker.add_analysis(item)

        return analyses

    def run_exploratory_data_analysis(self):
        """Run the main exploratory data analysis loop.

        Runs the descriptive analyses, filters the categorical attributes by
        cardinality, runs the qualitative and quantitative analyses, then
        writes the markdown report to `report_path`.
        """

        self._run_metadata.analyses.extend(self._run_descriptive())
        # The cardinality filter reads the descriptive results collected
        # just above, so it has to run after them.
        self._categorical_cardinality_check()
        self._run_metadata.analyses.extend(self._run_qualitative())
        self._run_metadata.analyses.extend(self._run_quantitative())

        # Arguments are passed lazily; the logged text is unchanged.
        logging.info(
            "Numerical attributes: %s\nCategorical attributes: %s",
            self.tracker.get_numerical_attributes(),
            self.tracker.get_categorical_attributes())
        logging.info('All analysis:\n%s',
                     self.tracker.get_all_analysis_unique_names())

        report = report_generator.create_report_md_content(
            analysis_tracker=self.tracker, figure_base_path=self.figure_path)

        logging.info(report)
        with open(self.report_path, 'w') as wf:
            wf.write(report)
コード例 #13
0
class AnalysisRun:
  """Class of main interface for running analysis.

  Each instance owns its own AnalysisRun metadata message, its own
  AnalysisTracker and the report/figure locations derived from the CLI
  parameters.
  """

  def __init__(self, config_params: argparse.ArgumentParser):
    """Load the job configuration and prepare the output locations.

    Args:
      config_params: parsed CLI parameters; `report_path` is read here, and
        `parallel_thread`, `sampling_rate` and `export_result` are read by
        other methods.
        NOTE(review): annotated as ArgumentParser although it is used like
        a parsed namespace - confirm the intended type.
    """
    # Parameter from CLI
    self._config_params = config_params

    # The metadata message is created per instance: as a class attribute it
    # was shared, so the analyses collected by one AnalysisRun leaked into
    # every other instance created in the same process.
    self._analysis_run_metadata = analysis_entity_pb2.AnalysisRun()
    self._analysis_run_metadata.timestamp_sec = time.time()

    # Load data definition
    self._job_config = job_config_loader.load_job_config(self._config_params)
    self._analysis_run_metadata.datasource.CopyFrom(self._job_config.datasource)
    self.tracker = AnalysisTracker(self._job_config)

    # NOTE(review): report_path is treated as a file path here and in
    # _export_analysis_results (dirname is taken), but as a directory in
    # _generate_and_write_report (file names are joined onto it) - confirm
    # which one is intended.
    self.report_path = self._config_params.report_path
    self.figure_path = os.path.join(os.path.dirname(self.report_path), 'figure')
    # exist_ok replaces the former exists()/makedirs() pair, which could fail
    # when the folder appeared between the check and the creation.
    os.makedirs(self.figure_path, exist_ok=True)

    logging.info(self._job_config.datasource)

  def run_parallel_analysis_tasks(self, analysis_task):
    """Run the given analysis tasks through the parallel runner.

    Args:
      analysis_task: the tasks handed to `_parallel_runner`.

    Returns:
      A flat list of the resulting analyses; each one is also registered
      with the tracker.
    """
    task_result = _parallel_runner(
        tasks=analysis_task,
        num_parallel=self._config_params.parallel_thread)

    analysis_list = []
    for result in task_result:
      # A task yields either a single Analysis or an iterable of them.
      if isinstance(result, Analysis):
        analysis_list.append(result)
      else:
        analysis_list.extend(result)

    for analysis in analysis_list:
      self.tracker.add_analysis(analysis)

    return analysis_list

  def _run_descriptive(self):
    """Run descriptive analysis for both numerical and categorical
    attributes.

    Returns:
      The list of analyses produced by the parallel runner.
    """
    analyzer = descriptive_analysis.DescriptiveAnalysis(
        self._job_config,
        preprocessor_factory.PreprocessorFactory.new_preprocessor(
            self._config_params))

    analysis_tasks = []
    analysis_tasks.extend(analyzer.numerical_descriptive_tasks())

    h_bin = self._job_config.histogram_bin
    analysis_tasks.extend(analyzer.numerical_histograms_tasks(h_bin))

    analysis_tasks.extend(analyzer.categorical_descriptive_tasks())

    vc_limit = self._job_config.value_counts_limit
    analysis_tasks.extend(analyzer.value_counts_tasks(vc_limit))

    return self.run_parallel_analysis_tasks(analysis_tasks)

  def _categorical_cardinality_check(self):
    """Keep only the categorical attributes whose cardinality is within the
    configured limit.

    The cardinality is read from the DESCRIPTIVE analysis tracked for each
    attribute. Attributes without a cardinality metric are skipped with a
    warning, because their cardinality cannot be compared with the limit.
    """

    def _get_cardinality(attribute):
      descrip_analysis = self.tracker.get_analysis_by_attribute_and_name(
          attribute_name=attribute.name,
          analysis_name=Analysis.Name.Name(Analysis.DESCRIPTIVE)
      )
      # No descriptive analysis tracked: cardinality is unknown.
      if not descrip_analysis:
        return None
      for metric in descrip_analysis[0].smetrics:
        if metric.name == ScalarMetric.CARDINALITY:
          return metric.value
      return None

    cardinality_limit = self._job_config.general_cardinality_limit
    valid_list = []

    for att in self._job_config.categorical_attributes:
      cardinality = _get_cardinality(att)
      if cardinality is None:
        # Comparing None with the limit would raise TypeError.
        logging.warning(
            'No cardinality metric found for attribute %s, '
            'it is excluded from the low cardinality list.', att.name)
        continue
      if cardinality <= cardinality_limit:
        valid_list.append(att)

    self._job_config.update_low_card_categorical(valid_list)

  def _qualitative_tasks(self):
    """Collect the qualitative analysis tasks enabled in the job config
    (contingency table, table descriptive).

    Returns:
      The list of tasks; nothing is executed here.
    """
    analyzer = qualitative_analysis.QualitativeAnalysis(
        self._job_config,
        preprocessor_factory.PreprocessorFactory.new_preprocessor(
            self._config_params))

    analysis_tasks = []

    if self._job_config.contingency_table_run:
      analysis_tasks.extend(analyzer.contingency_table_tasks())
    if self._job_config.table_descriptive_run:
      analysis_tasks.extend(analyzer.categorical_numerical_descriptive_tasks())

    return analysis_tasks

  def _quantitative_tasks(self):
    """Collect the quantitative analysis tasks enabled in the job config
    (pearson correlation, information gain, chi-square, anova).

    Returns:
      The list of tasks; nothing is executed here.
    """
    analyzer = quantitative_analysis.QuantitativeAnalysis(
        self._job_config,
        preprocessor_factory.PreprocessorFactory.new_preprocessor(
            self._config_params))

    analysis_tasks = []

    if self._job_config.pearson_corr_run:
      analysis_tasks.extend(analyzer.pearson_correlation_tasks())
    if self._job_config.information_gain_run:
      analysis_tasks.extend(analyzer.information_gain_tasks())
    if self._job_config.chi_square_run:
      analysis_tasks.extend(
          analyzer.chi_square_tasks(self._config_params.sampling_rate))
    if self._job_config.anova_run:
      analysis_tasks.extend(
          analyzer.anova_tasks(self._config_params.sampling_rate))

    return analysis_tasks

  def _generate_and_write_report(self):
    """Write the markdown and html reports, plus a pdf when wkhtmltopdf is
    found on the system."""
    # generate markdown report
    md_report = report_generator.create_md_report(
        analysis_tracker=self.tracker,
        figure_base_path=self.figure_path,
        config_params=self._config_params)
    md_report_path = os.path.join(self.report_path, MD_FILE_NAME)
    # write report to a file
    with open(md_report_path, 'w') as wf:
      wf.write(md_report)
    logging.debug(md_report)
    logging.info('Markdown report generated successfully.')

    # generate html report
    html_report = report_generator.create_html_report_from_markdown(
        markdown_content=md_report)
    html_report_path = os.path.join(self.report_path, HTML_FILE_NAME)
    with open(html_report_path, 'w') as wf:
      wf.write(html_report)
    logging.debug(html_report)
    logging.info('HTML report generated successfully.')

    # generate pdf file if wkhtmltopdf is installed
    if which('wkhtmltopdf'):
      pdf_file_path = os.path.join(self.report_path, PDF_FILE_NAME)
      # presumably subprocess.call, whose return value is the exit status -
      # verify against the file's imports
      return_code = call(['wkhtmltopdf', '--enable-local-file-access',
                          html_report_path, pdf_file_path])
      # The exit status used to be ignored; a failed conversion is now
      # visible in the log.
      if return_code:
        logging.warning(
            'wkhtmltopdf exited with code %s, pdf report may be missing.',
            return_code)
    else:
      logging.info(
          'wkhtmltopdf is not detected, pdf report wont be generated.')

  def _export_analysis_results(self):
    """Pickle the tracker's exported dictionary next to the report path."""
    result_dict = self.tracker.export_to_dict()
    report_folder = os.path.dirname(self.report_path)
    export_file = os.path.join(report_folder, AR_FILE_NAME)
    with open(export_file, 'wb') as wf:
      pickle.dump(result_dict, wf)

  def run_exploratory_data_analysis(self):
    """Run the main exploratory data analysis loop.

    Runs the descriptive analyses, filters the categorical attributes by
    cardinality, runs the qualitative and quantitative tasks together,
    writes the reports and, when `export_result` is set, exports the
    analysis results.
    """

    self._analysis_run_metadata.analyses.extend(self._run_descriptive())

    # The cardinality filter reads the descriptive results collected just
    # above, so it has to run after them.
    self._categorical_cardinality_check()

    non_descriptive_tasks = self._qualitative_tasks()
    non_descriptive_tasks.extend(self._quantitative_tasks())
    self._analysis_run_metadata.analyses.extend(
        self.run_parallel_analysis_tasks(non_descriptive_tasks))

    # Arguments are passed lazily; the logged text (including the indented
    # second line) is unchanged.
    logging.info(
        'Numerical attributes: %s\n    Categorical attributes: %s',
        self.tracker.get_num_attribute_names(),
        self.tracker.get_cat_attribute_names())
    logging.info('All analysis:\n%s',
                 self.tracker.get_all_analysis_unique_names())

    self._generate_and_write_report()

    # export the analysis results
    if self._config_params.export_result:
      self._export_analysis_results()
コード例 #14
0
def create_descriptive_section(analysis_tracker: AnalysisTracker,
                               figure_base_path: str) -> (str, List[str]):
    """Build the descriptive section of the report.

    One table row is produced per numerical attribute, followed by one per
    categorical attribute. Every attribute goes through the missing value
    check; categorical attributes also go through the cardinality check.

    Args:
        analysis_tracker: (AnalysisTracker)
        figure_base_path: (string), the folder for holding figures

    Returns:
        (str, List[str]), (section_content, List[warnings])
    """
    numerical_attributes = analysis_tracker.get_numerical_attributes()
    categorical_attributes = analysis_tracker.get_categorical_attributes()

    # holders for the table rows and the warnings raised by the checks
    rows = []
    warnings = []

    def _append_row(att, companion_kind, with_cardinality_check):
        """Add the row of one attribute and record its warnings."""
        # the base analysis holds the basic descriptive statistics
        base = analysis_tracker.get_attribute_analysis(
            att,
            run_metadata_pb2.Analysis.Name.Name(
                run_metadata_pb2.Analysis.DESCRIPTIVE))[0]
        # the companion analysis is the histogram or the value counts
        companion = analysis_tracker.get_attribute_analysis(
            att, run_metadata_pb2.Analysis.Name.Name(companion_kind))[0]

        rows.append(
            utils.create_table_descriptive_row_from_analysis(
                attribute_name=att,
                base_analysis=base,
                additional_analysis=companion,
                figure_base_path=figure_base_path))

        missing_note = recommendation.check_missing(att, base)
        if missing_note:
            warnings.append(missing_note)

        if with_cardinality_check:
            cardinality_note = recommendation.check_cardinality(att, base)
            if cardinality_note:
                warnings.append(cardinality_note)

    for att in numerical_attributes:
        _append_row(att, run_metadata_pb2.Analysis.HISTOGRAM, False)

    for att in categorical_attributes:
        _append_row(att, run_metadata_pb2.Analysis.VALUE_COUNTS, True)

    table_content = template.TABLE_DESCRIPTIVE_TEMPLATE.format(
        row_content=''.join(rows))

    if warnings:
        table_content = table_content + utils.create_warning_notes(warnings)

    return table_content, warnings
コード例 #15
0
def create_target_highlight_section(
        analysis_tracker: AnalysisTracker
) -> Union[Tuple[str, List[str]], None]:
    """Build the section highlighting the analyses between the target and
    the other attributes.

    Args:
        analysis_tracker: (AnalysisTracker), holder for all the analysis

    Returns:
        Union[Tuple[str, List[str]], None], the section content and the
        names of the non-target attributes whose analysis passed its
        recommendation check; None when the ML problem type is NULL

    Raises:
        ValueError: the ML problem type is neither NULL, REGRESSION nor
            CLASSIFICATION
    """

    # pylint: disable-msg=too-many-locals
    def _other_attribute_name(target_name: str,
                              analysis: run_metadata_pb2.Analysis) -> str:
        # the feature of the analysis that is not the target itself
        candidates = [
            feature.name for feature in analysis.features
            if feature.name != target_name
        ]
        return candidates[0]

    def _flagged_attributes(
            grouped: List[List[run_metadata_pb2.Analysis]]):
        # yield the attribute of every analysis whose check is truthy
        for group in grouped:
            for analysis in group:
                checker = checking_map.get(analysis.name)
                if checker is not None and checker(analysis):
                    yield _other_attribute_name(target, analysis)

    def _collect(metric_names):
        # keep only the metric names the tracker has results for
        found = [
            (name, analysis_tracker.get_attribute_analysis(target, name))
            for name in metric_names
        ]
        kept = [(name, analysis) for name, analysis in found if analysis]
        return ([name for name, _ in kept],
                [analysis for _, analysis in kept])

    checking_map = {
        run_metadata_pb2.Analysis.ANOVA: recommendation.check_p_value,
        run_metadata_pb2.Analysis.PEARSON_CORRELATION:
        recommendation.check_pearson_correlation,
        run_metadata_pb2.Analysis.CHI_SQUARE: recommendation.check_p_value
    }

    target = analysis_tracker.get_target().name
    ml_problem = analysis_tracker.metadata.ml_type

    # without an ML problem there is nothing to highlight
    if ml_problem == c.metadata.ml_type.NULL:
        return None

    if ml_problem == c.metadata.ml_type.REGRESSION:
        # Correlation for numerical attributes
        # ANOVA for categorical attributes
        numerical_kinds = [run_metadata_pb2.Analysis.PEARSON_CORRELATION]
        categorical_kinds = [run_metadata_pb2.Analysis.ANOVA]
    elif ml_problem == c.metadata.ml_type.CLASSIFICATION:
        # ANOVA for numerical attributes
        # IG and Chi-square for categorical attributes
        numerical_kinds = [run_metadata_pb2.Analysis.ANOVA]
        categorical_kinds = [
            run_metadata_pb2.Analysis.INFORMATION_GAIN,
            run_metadata_pb2.Analysis.CHI_SQUARE
        ]
    else:
        raise ValueError('The ML problem type is not supported')

    numerical_metric_names = [
        run_metadata_pb2.Analysis.Name.Name(kind) for kind in numerical_kinds
    ]
    categorical_metric_names = [
        run_metadata_pb2.Analysis.Name.Name(kind)
        for kind in categorical_kinds
    ]

    section_content = [
        "**Target:** {}\n".format(analysis_tracker.get_target().name)
    ]

    numerical_names, numerical_groups = _collect(numerical_metric_names)
    categorical_names, categorical_groups = _collect(
        categorical_metric_names)

    recommend_features = []
    sub_sections = (
        ("Numerical features and target", numerical_names,
         numerical_groups),
        ("Categorical features and target", categorical_names,
         categorical_groups),
    )
    for heading, metric_names, metric_groups in sub_sections:
        if not metric_names:
            continue
        section_content.append(
            template.SUB_SUB_SECTION_TITLE.format(content=heading))
        # recommendation based on checking results
        recommend_features.extend(_flagged_attributes(metric_groups))
        section_content.append(
            utils.create_target_metrics_highlight(
                target_name=target,
                metric_name_list=metric_names,
                metric_analysis_list=metric_groups))

    return ''.join(section_content), recommend_features
コード例 #16
0
def create_target_highlight_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Build the section highlighting the analysis between the target and the
  other attributes.

  The metrics looked up depend on the ML problem type of the job config:
  Pearson correlation (numerical) and ANOVA (categorical) for regression;
  ANOVA (numerical), information gain and chi-square (categorical) for
  classification.

  Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), not used, for signature consistence

  Returns:
      (section_content, attribute_names) where attribute_names lists the
      non-target attribute of every analysis whose registered check returned
      a truthy value; (None, None) when the ML problem type is NULL or no
      analysis was found for any of the metrics.

  Raises:
      ValueError: the ML problem type is not NULL, REGRESSION or
      CLASSIFICATION.
  """

    # pylint: disable-msg=too-many-locals
    # analysis type -> predicate deciding whether the analysis is noteworthy
    checkers = {
        Analysis.ANOVA: recommendation.check_p_value,
        Analysis.PEARSON_CORRELATION: recommendation.check_pearson_correlation,
        Analysis.CHI_SQUARE: recommendation.check_p_value
    }

    target = analysis_tracker.get_target_attribute().name
    ml_problem = analysis_tracker.get_job_config().ml_type

    def _partner_of_target(analysis: analysis_entity_pb2.Analysis) -> str:
        # name of the first feature in the analysis that is not the target
        partner_names = [
            feature.name for feature in analysis.features
            if feature.name != target
        ]
        return partner_names[0]

    def _flagged_attributes(
            grouped_analyses: List[List[analysis_entity_pb2.Analysis]]):
        # collect the partner attribute of every analysis passing its check;
        # analysis types without a registered checker are ignored
        flagged = []
        for group in grouped_analyses:
            for analysis in group:
                checker = checkers.get(analysis.name)
                if checker is not None and checker(analysis):
                    flagged.append(_partner_of_target(analysis))
        return flagged

    def _available(metric_names):
        # keep only the metrics for which the tracker holds an analysis
        # involving the target, together with those analyses
        kept_names, kept_analyses = [], []
        for metric in metric_names:
            found = analysis_tracker.get_analysis_by_attribute_and_name(
                target, metric)
            if not found:
                continue
            kept_names.append(metric)
            kept_analyses.append(found)
        return kept_names, kept_analyses

    # nothing to highlight when no ML problem is specified
    if ml_problem == c.ml_type.NULL:
        return None, None

    if ml_problem == c.ml_type.REGRESSION:
        target_type = c.datasource.TYPE_NUMERICAL
        # numerical attributes -> correlation, categorical -> ANOVA
        numerical_metric_names = [
            Analysis.Name.Name(Analysis.PEARSON_CORRELATION)
        ]
        categorical_metric_names = [Analysis.Name.Name(Analysis.ANOVA)]
    elif ml_problem == c.ml_type.CLASSIFICATION:
        target_type = c.datasource.TYPE_CATEGORICAL
        # numerical attributes -> ANOVA, categorical -> IG and chi-square
        numerical_metric_names = [Analysis.Name.Name(Analysis.ANOVA)]
        categorical_metric_names = [
            Analysis.Name.Name(Analysis.INFORMATION_GAIN),
            Analysis.Name.Name(Analysis.CHI_SQUARE)
        ]
    else:
        raise ValueError('The ML problem type is not supported')

    recommend_features = []
    section_content = []

    # both lookups are done up front, before any content is rendered
    groups = (
        ("Numerical features and target",
         _available(numerical_metric_names)),
        ("Categorical features and target",
         _available(categorical_metric_names)),
    )

    for title, (metric_names, metric_analyses) in groups:
        if not metric_names:
            continue
        section_content.append(
            template.SUB_SUB_SECTION_TITLE.format(content=title))
        # recommendation based on checking results
        recommend_features.extend(_flagged_attributes(metric_analyses))
        section_content.append(
            utils.create_target_metrics_highlight(
                target_name=target,
                metric_name_list=metric_names,
                metric_analysis_list=metric_analyses))

    if not section_content:
        return None, None

    # the headline is only emitted when there is something to show below it
    headline = template.TARGET_HEADLINE_TEMPLATE.format(
        target=target, target_type=target_type)
    return ''.join([headline] + section_content), recommend_features
コード例 #17
0
def create_descriptive_section(
        analysis_tracker: AnalysisTracker,
        figure_base_path: Text) -> Tuple[Text, List[Text]]:
    """Create descriptive section of the report. Checking based on the descriptive
  results will be performed, e.g., missing values and high cardinality.

  One table row is produced per attribute, numerical attributes first and
  categorical attributes after. Each row combines the DESCRIPTIVE analysis
  with the HISTOGRAM analysis (numerical) or the VALUE_COUNTS analysis
  (categorical). Every attribute is checked for missing values; categorical
  attributes are additionally checked for cardinality.

  NOTE(review): the first result returned by the tracker is taken
  unconditionally for each lookup; if the tracker can return an empty result
  for an attribute this indexing fails - confirm these analyses always exist.

  Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), the folder for holding figures

  Returns:
      Tuple[Text, List[Text]], (section_content, List[warnings]); the warning
      notes are also appended to section_content when there are any.
  """

    numerical_attributes = analysis_tracker.get_num_attribute_names()
    categorical_attributes = analysis_tracker.get_cat_attribute_names()

    # holders for section content and warnings based on descriptive analysis
    rows = []
    warning_notes = []

    section_template = template.TABLE_DESCRIPTIVE_TEMPLATE

    # analysis names are loop invariant, resolve them once
    descriptive_name = Analysis.Name.Name(Analysis.DESCRIPTIVE)
    histogram_name = Analysis.Name.Name(Analysis.HISTOGRAM)
    value_counts_name = Analysis.Name.Name(Analysis.VALUE_COUNTS)

    def _add_attribute_row(att, additional_name):
        """Append the table row and the missing-value warning for one
        attribute, and return its descriptive analysis."""
        # base analysis is one holding basic descriptive statistics
        base_analysis = analysis_tracker.get_analysis_by_attribute_and_name(
            att, descriptive_name)[0]
        # additional analysis holds the histogram / value counts shown
        # next to the statistics
        additional_analysis = \
            analysis_tracker.get_analysis_by_attribute_and_name(
                att, additional_name)[0]
        rows.append(
            utils.create_table_descriptive_row_from_analysis(
                attribute_name=att,
                base_analysis=base_analysis,
                additional_analysis=additional_analysis,
                figure_base_path=figure_base_path))
        # check missing value condition
        missing_check = recommendation.check_missing(att, base_analysis)
        if missing_check:
            warning_notes.append(missing_check)
        return base_analysis

    for att in numerical_attributes:
        _add_attribute_row(att, histogram_name)

    for att in categorical_attributes:
        base_analysis = _add_attribute_row(att, value_counts_name)
        # check cardinality condition, only meaningful for categorical data
        cardinality_check = recommendation.check_cardinality(
            att, base_analysis)
        if cardinality_check:
            warning_notes.append(cardinality_check)

    # finally all the descriptive analysis result will be organised in a table
    table_content = section_template.format(row_content=''.join(rows))

    if warning_notes:
        table_content += utils.create_warning_notes(warning_notes)

    return table_content, warning_notes