Exemple #1
0
    def test_add_training_analysis(self):
        self.name = "training-analysis"
        self.report_name = "Training Analysis Report"
        self.tag_number = 1
        ## Create Report
        self.report = Report(name=self.report_name)
        ### Add Header Level 2
        self.report.detail.add_header_level_2(text='Deep Learning Training')
        ### Learning Curve
        with open(
                prepare_input_path(
                    data_path='sample_data/learning_curve.json'), 'r') as f:
            learning_curve = json.load(f)

        print('best_idx:', learning_curve['best_idx'])
        print(
            'history [first 2 samples]:', {
                k: learning_curve['history'][k]
                for k in list(learning_curve['history'].keys())[:2]
            })
        print('benchmark_metric:', learning_curve['benchmark_metric'])
        print('benchmark_threshold:', learning_curve['benchmark_threshold'])
        print('training_params:', learning_curve['training_params'])

        self.report.detail.add_learning_curve(
            history=learning_curve['history'],
            best_idx=learning_curve['best_idx'],
            benchmark_metric=learning_curve['benchmark_metric'],
            benchmark_threshold=learning_curve['benchmark_threshold'],
            training_params=learning_curve['training_params'])
Exemple #2
0
    def test_add_overview(self):
        self.name = "overview"
        self.report_name = "Overview Report"
        self.tag_number = 1
        ## Create Report
        self.report = Report(name=self.report_name)

        ### Create Cover Section
        self.report.overview.add_new_page()
        self.report.overview.add_section_title(text="Overview")
        self.report.overview.add_paragraph(text="This is summary Info")
        model_info = [('Model ID', '12345678'), ('Model Version', 'v6'),
                      ('Scenario ID', '111222333444555'),
                      ('Notes', 'This model is created as a beta version.')]
        self.report.overview.add_model_info_summary(model_info=model_info)

        timing = [('Data Preprocessing', 1000), ('Feature Engineering', 10000),
                  ('Training', 200200), ('Evaluation', 30303)]
        self.report.overview.add_training_timing(timing=timing)

        data_summary = [('training', 10000), ('validation', 2000),
                        ('testing', 1000)]
        self.report.overview.add_data_set_summary(data_summary=data_summary)

        with open(
                prepare_input_path(
                    data_path='sample_data/evaluation_result_summary.json'),
                'r') as f:
            evaluation_result_data = json.load(f)
        print(evaluation_result_data)

        self.report.overview.add_evaluation_result_summary(
            evaluation_result=evaluation_result_data)
        print(self.report.overview.contents)
        ### added 7 component but length of overview is 8,
        ### there is a default component - new page at the beginning (init)
        self.assertEqual(len(self.report.overview.contents), 8)
Exemple #3
0
    def test_add_feature_analysis(self):
        self.name = "feature-analysis"
        self.report_name = "Feature Analysis Report"
        self.page_number = 4
        ## Create Report
        self.report = Report(name=self.report_name)
        ### Create Feature Analysis Section as new page
        self.report.detail.add_new_page()
        self.report.detail.add_section_title("Example for Feature Analysis ")
        ### Add Header Level 1
        self.report.detail.add_header_level_1(text='Feature Analysis')
        self.assertEqual(len(self.report.detail.contents), 3)

        ### Add Header Level 2
        self.report.detail.add_header_level_2(text='Feature Importance')
        self.assertEqual(len(self.report.detail.contents), 4)
        ### Feature Importance
        path = Path(prepare_input_path(data_path='sample_data/model.pkl'))
        model = pd.read_pickle(str(path))
        path = Path(prepare_input_path(data_path='sample_data/train_data.csv'))
        data = pd.read_csv(str(path))
        # -- csv including header --
        feature_names = data.columns

        fi = FeatureInterpreter(feature_names=feature_names)
        rank = fi.get_feature_importance_ranking(trained_model=model,
                                                 train_x=data,
                                                 method='default')
        self.report.detail.add_feature_importance(importance_ranking=rank,
                                                  importance_threshold=0.005)
        self.assertEqual(len(self.report.detail.contents), 5)

        ### Create Performance Analysis Section as new page
        self.report.detail.add_new_page()
        self.report.detail.add_section_title(
            "Example for Performance Analysis ")
        self.report.detail.add_paragraph(
            text='this is dummy model, no trained with titanic dataset')
        ### Add Header Level 1
        self.report.detail.add_header_level_1(text='Performance Analysis')
        ### Add Header Level 2
        self.report.detail.add_header_level_2(text='Hyperparameter Tuning')
        self.assertEqual(len(self.report.detail.contents), 10)

        ### Hyperparameter Tuning
        with open(
                prepare_input_path(
                    data_path='sample_data/hyperparameter_tuning.json'),
                'r') as f:
            hyperparameter_tuning = json.load(f)
        print('search_space:', hyperparameter_tuning['search_space'])
        print('best_idx:', hyperparameter_tuning['best_idx'])
        self.assertEqual(hyperparameter_tuning['best_idx'], '3')
        print(
            'history [first 2 samples]:', {
                k: hyperparameter_tuning['history'][k]
                for k in list(hyperparameter_tuning['history'].keys())[:2]
            })
        print('benchmark_metric:', hyperparameter_tuning['benchmark_metric'])
        self.assertEqual(hyperparameter_tuning['benchmark_metric'], 'accuracy')
        print('benchmark_threshold:',
              hyperparameter_tuning['benchmark_threshold'])
        self.assertEqual(hyperparameter_tuning['benchmark_threshold'], 0.8)
        print('non_hyperopt_score:',
              hyperparameter_tuning['non_hyperopt_score'])

        self.report.detail.add_hyperparameter_tuning(
            history=hyperparameter_tuning['history'],
            best_idx=hyperparameter_tuning['best_idx'],
            search_space=hyperparameter_tuning['search_space'],
            benchmark_metric=hyperparameter_tuning['benchmark_metric'],
            benchmark_threshold=hyperparameter_tuning['benchmark_threshold'],
            non_hyperopt_score=hyperparameter_tuning['non_hyperopt_score'])
        print(self.report.detail.contents)
        self.assertEqual(len(self.report.detail.contents), 11)
Exemple #4
0
    def test_add_data_analysis(self):
        self.name = "data-analysis"
        self.report_name = "Data Analysis Report"
        self.page_number = 10
        ## Create Report
        self.report = Report(name=self.report_name)
        ### Create Data Analysis Section
        self.report.detail.add_section_title("Example for Data Analysis ")
        ### Add Header Level 1
        self.report.detail.add_header_level_1(text='Data Analysis')
        self.assertEqual(len(self.report.detail.contents), 2)

        ### Load Data
        file_name = prepare_input_path(data_path='sample_data/titanic.csv')
        data = pd.read_csv(file_name)
        ### Add dummy birthday to demonstrate datetime presentation
        bday = []
        for i in range(len(data)):
            year = np.random.randint(low=1960, high=1979)
            month = np.random.randint(low=1, high=12)
            day = np.random.randint(low=1, high=28)
            bday.append("%s" % (10000 * year + 100 * month + day))
        data['Birthday'] = bday

        label_column = 'Survived'

        ### Get data types - scenario where no metadata provided
        feature, valid_feature_names, valid_feature_types, meta = \
            DataUtil.get_column_types(data=data, threshold=0.3, label=label_column)
        # -- Cast Data to String --
        non_numeric_features = [
            name for name, _type in list(
                zip(valid_feature_names, valid_feature_types))
            if _type != DATATYPE.NUMBER
        ]
        if label_column is not None:
            non_numeric_features += [label_column]
        DataUtil.cast_type_to_string(data=data,
                                     feature_names=non_numeric_features)

        ### Add Header Level 2
        self.report.detail.add_header_level_2(
            text='Data Class (Label) Distribution')
        ### Add Label distribution
        label_distributions = DataUtil.get_label_distribution(
            data=data, label=label_column)
        self.report.detail.add_data_set_distribution(label_distributions)
        self.assertEqual(len(self.report.detail.contents), 4)

        ### Get Data Stats
        stats = DataUtil.get_data_statistics(data=data,
                                             feature_names=valid_feature_names,
                                             feature_types=valid_feature_types,
                                             label=label_column)

        ### Add Header Level 2
        self.report.detail.add_header_level_2(text='Data Field Attribute')
        ### Data Field Attribute
        self.report.detail.add_data_attributes(meta)
        self.assertEqual(len(self.report.detail.contents), 6)

        ### Add Header Level 2
        self.report.detail.add_header_level_2(text='Data Missing Value Check')
        ### Missing value count
        missing_count, total_count = \
            DataUtil.get_missing_value_count(data=data,
                                             feature_names=valid_feature_names,
                                             feature_types=valid_feature_types)
        print(missing_count)
        self.assertEqual(missing_count['Age'], 177)
        self.assertEqual(missing_count['Embarked'], 2)
        print(total_count)
        self.assertEqual(total_count['Name'], 891)
        self.assertEqual(total_count['Embarked'], 891)
        self.assertEqual(len(self.report.detail.contents), 7)
        self.report.detail.add_data_missing_value(
            missing_count=dict(missing_count), total_count=total_count)
        self.assertEqual(len(self.report.detail.contents), 8)

        ### Add Header Level 2
        self.report.detail.add_header_level_2(text='Data Field Distribution')
        ### Data Field Distribution Desc
        self.report.detail.add_paragraph(
            text=
            'This section displays distribution for categorical fields, numerical fields and text fields.'
        )
        self.assertEqual(len(self.report.detail.contents), 10)

        ### Add Header Level 3
        self.report.detail.add_header_level_3(
            text='Categorical Field Distribution')
        print(feature[DATATYPE.CATEGORY])
        self.assertEqual(len(feature[DATATYPE.CATEGORY]), 5)
        ### Categorical field distribution
        for field_name in feature[DATATYPE.CATEGORY]:
            labelled_stats, all_stats = stats[field_name]
            self.report.detail.add_categorical_field_distribution(
                field_name=field_name, field_distribution=labelled_stats)
        self.assertEqual(len(self.report.detail.contents), 16)

        ### Add Header Level 3
        self.report.detail.add_header_level_3(
            text='Numerical Field Distribution')
        print(feature[DATATYPE.NUMBER])
        self.assertEqual(len(feature[DATATYPE.NUMBER]), 2)
        ### Numerical field distribution
        for field_name in feature[DATATYPE.NUMBER]:
            labelled_stats, all_stats = stats[field_name]
            self.report.detail.add_numeric_field_distribution(
                field_name=field_name, field_distribution=labelled_stats)
        self.assertEqual(len(self.report.detail.contents), 19)

        ### Add Header Level 3
        self.report.detail.add_header_level_3(text='Text Field Distribution')
        print(feature[DATATYPE.FREETEXT])
        self.assertEqual(len(feature[DATATYPE.FREETEXT]), 1)
        ### Text field distribution
        for field_name in feature[DATATYPE.FREETEXT]:
            labelled_stats, all_stats = stats[field_name]
            self.report.detail.add_text_field_distribution(
                field_name=field_name, field_distribution=labelled_stats)
        self.assertEqual(len(self.report.detail.contents), 21)

        ### Add Header Level 3
        self.report.detail.add_header_level_3(
            text='Datetime Field Distribution')
        print(feature[DATATYPE.DATETIME])
        self.assertEqual(len(feature[DATATYPE.DATETIME]), 1)
        ### Datetime field distribution
        for field_name in feature[DATATYPE.DATETIME]:
            labelled_stats, all_stats = stats[field_name]
            self.report.detail.add_datetime_field_distribution(
                field_name=field_name, field_distribution=labelled_stats)

        print(self.report.detail.contents)
        self.assertEqual(len(self.report.detail.contents), 23)