コード例 #1
0
 def run(self, store) -> Report:
     """Run the statistical tests on the precalculated text metadata.

     The two metadata frames are read from ``store`` via
     ``self.metadata_precalculation`` and passed through
     ``self.adjust_dataset_sizes``. For every top-level column and every
     text metadata type, the categorical or numerical statistical test is
     applied to the matching ``(column, metadata name)`` series and the
     resulting p-value is stored under that key.

     :param store: object indexable by ``self.metadata_precalculation``.
     :return: a ``StatisticalReport`` titled "Text Metadata Check".
     :raises UnknownMetadataReturnColumnTypeError: if a metadata type is
         neither categorical nor numerical.
     """
     df1, df2 = store[self.metadata_precalculation]
     part1, part2 = self.adjust_dataset_sizes(df1, df2)
     categorical_check = CategoricalStatisticalCheck()
     numerical_check = NumericalStatisticalCheck()
     pvalues = pd.DataFrame(columns=df1.columns, index=['pvalue'])

     for column in df1.columns.levels[0]:
         for mdtype in self.metadata_precalculation.text_metadata_types:
             # Choose the test by the metadata's return type first, so an
             # unknown type fails before any series is looked up.
             return_type = mdtype.metadata_return_type()
             if return_type == ColumnType.categorical:
                 selected_check = categorical_check
             elif return_type == ColumnType.numerical:
                 selected_check = numerical_check
             else:
                 raise UnknownMetadataReturnColumnTypeError(mdtype)

             key = (column, mdtype.metadata_name())
             pvalues[key] = [
                 selected_check.statistical_test(part1[key], part2[key])
             ]

     significant_columns = self.significant_columns(pvalues)
     header = self.explanation_header(
         numerical_check.statistical_test_name(),
         categorical_check.statistical_test_name(),
         any_significant=len(significant_columns) > 0)
     return StatisticalReport(
         "Text Metadata Check",
         examined_columns=sorted(df1.columns.levels[0]),
         shifted_columns=sorted(significant_columns),
         explanation=self.explain(pvalues),
         explanation_header=header,
         information={'test_results': pvalues},
         figures=self.metadata_figure(pvalues, part1, part2))
コード例 #2
0
 def metadata_plot(figure, tile, column, mdtype, df1, df2):
     """Plot one (column, metadata type) pair with the matching check's plot.

     The series is addressed by the tuple ``(column, mdtype.metadata_name())``.
     Categorical metadata is drawn with
     ``CategoricalStatisticalCheck.column_plot`` and numerical metadata with
     ``NumericalStatisticalCheck.column_plot``.

     :raises UnknownMetadataReturnColumnTypeError: if the metadata return
         type is neither categorical nor numerical.
     """
     plot_key = (column, mdtype.metadata_name())
     return_type = mdtype.metadata_return_type()
     if return_type == ColumnType.categorical:
         check_class = CategoricalStatisticalCheck
     elif return_type == ColumnType.numerical:
         check_class = NumericalStatisticalCheck
     else:
         raise UnknownMetadataReturnColumnTypeError(mdtype)
     check_class.column_plot(figure, tile, plot_key, df1, df2)
コード例 #3
0
 def test_size_adjustment(self):
     """adjust_dataset_sizes returns frames of the expected lengths.

     The inputs have 20 and 12 rows; each case pairs a configured check
     with the expected ``(len(result1), len(result2))``.
     """
     df1 = pd.DataFrame([0] * 20)
     df2 = pd.DataFrame([0] * 12)
     cases = [
         (CategoricalStatisticalCheck(sample_size=10), (10, 10)),
         (CategoricalStatisticalCheck(sample_size=15), (15, 12)),
         (NumericalStatisticalCheck(use_equal_dataset_sizes=True), (12, 12)),
         (TextMetadataStatisticalCheck(sample_size=15, use_equal_dataset_sizes=True), (12, 12)),
     ]
     for check, solution in cases:
         expected_len1, expected_len2 = solution
         with self.subTest(solution=solution):
             adjusted1, adjusted2 = check.adjust_dataset_sizes(df1, df2)
             self.assertEqual(len(adjusted1), expected_len1)
             self.assertEqual(len(adjusted2), expected_len2)
コード例 #4
0
 def test_paired_total_ratios_figure_looks_right(self, mock_plt):
     """paired_total_ratios_plot draws via DataFrame.plot and labels the axes.

     ``pd.DataFrame.plot`` is patched; the test checks that it was called,
     that title, x/y labels and y-axis inversion were applied to its return
     value, and that ``mock_plt.show`` was not called.
     """
     # NOTE(review): MagicMock takes `spec=`; `autospec=` here only sets an
     # attribute on the mock. Confirm whether spec'ing was intended.
     mock_figure = MagicMock(autospec=Figure)
     mock_axes = MagicMock(autospec=Axes)
     with mock.patch.object(pd.DataFrame, 'plot') as mock_plot:
         CategoricalStatisticalCheck.paired_total_ratios_plot(
             mock_figure,
             mock_axes,
             'vaccination_reaction',
             self.df1_significant,
             self.df2_significant,
         )

     self.assertTrue(mock_plot.called)
     plotted_axes = mock_plot.return_value
     for axes_call in (plotted_axes.set_title,
                       plotted_axes.set_xlabel,
                       plotted_axes.set_ylabel,
                       plotted_axes.invert_yaxis):
         self.assertTrue(axes_call.called)
     self.assertFalse(mock_plt.show.called)
コード例 #5
0
 def test_column_tuples_are_handled_by_categorical_visualization(self):
     """A tuple column key is accepted and rendered into the plot title.

     Both frames use a two-level column index (``column``, ``metadata``).
     The histogram helper is patched to return the mocked axes, and the
     title set on those axes is checked.
     """
     column_key = ('text', 'category')
     cols = pd.MultiIndex.from_product([['text'], ['category']],
                                       names=['column', 'metadata'])
     df1 = pd.DataFrame(columns=cols)
     df2 = pd.DataFrame(columns=cols)
     # NOTE(review): each list holds ONE repeated string ('latinlatinlatin');
     # confirm whether ['latin'] * 3 was meant.
     df1[column_key] = ['latin' * 3]
     df2[column_key] = ['arabic' * 3]

     mock_figure = MagicMock(autospec=Figure)
     mock_axes = MagicMock(autospec=Axes)
     histogram_patch = mock.patch.object(
         categorical_statistical_check.vis,
         'plot_categorical_horizontal_ratio_histogram',
         return_value=mock_axes)
     with histogram_patch:
         CategoricalStatisticalCheck.paired_total_ratios_plot(
             mock_figure, mock_axes, column_key, df1, df2)

     mock_axes.set_title.assert_called_once_with(
         "Column: '('text', 'category')'", fontsize='x-large')
コード例 #6
0
 def test_figure_function_is_collected(self):
     """column_figure returns one entry if any column is significant, else none.

     Each case pairs the expected result length with the list of
     significant columns passed in.
     """
     column_names = ('col1', 'col2')
     df1 = pd.DataFrame.from_dict(
         {name: ['value'] * 100 for name in column_names})
     df2 = pd.DataFrame.from_dict(
         {name: ['value'] * 200 for name in column_names})
     check = CategoricalStatisticalCheck()

     cases = [
         (1, ['col1', 'col2']),
         (1, ['col1']),
         (0, []),
     ]
     for solution, sig_cols in cases:
         with self.subTest(solution=solution, sig_cols=sig_cols):
             figures = check.column_figure(
                 significant_columns=sig_cols, df1=df1, df2=df2)
             self.assertEqual(solution, len(figures))
コード例 #7
0
 def test_all_plot_functions_are_called_and_plot_is_shown(self, mock_plt):
     """plot_all_columns sizes the figure, calls every plot function, and shows.

     Runs for both the categorical and the numerical check with three
     ``PlotData`` entries; the expected figure size is ``(12, height)``.
     """
     plot_data = [PlotData(MagicMock(), row_count) for row_count in (1, 2, 3)]
     checks_with_height = [
         (CategoricalStatisticalCheck(), 30.0),
         (NumericalStatisticalCheck(), 30.0),
     ]
     for check, height in checks_with_height:
         with self.subTest(check=check):
             check.plot_all_columns(plot_data)
             mock_plt.figure.assert_called_with(
                 figsize=(12, height), tight_layout=True)
             for plot_function, _rows in plot_data:
                 self.assertTrue(plot_function.called)
             mock_plt.show.assert_called_with()
コード例 #8
0
 def test_plot_type(self, mock_plot, mock_subplot):
     """column_plot creates a subplot on the given tile and triggers a plot.

     Checks that ``mock_subplot`` was called with ``(figure, tile)`` and
     that ``mock_plot`` was called.
     """
     figure = MagicMock(autospec=Figure)
     tile = MagicMock()
     check = CategoricalStatisticalCheck()
     check.column_plot(
         figure,
         tile,
         'vaccination_reaction',
         self.df1_significant,
         self.df2_significant,
     )
     mock_subplot.assert_called_with(figure, tile)
     self.assertTrue(mock_plot.called)
コード例 #9
0
 def test_column_order_in_report(self):
     """Examined and shifted columns are reported in the same order.

     Both checks run on two frames whose 'abc' and 'def' columns have
     swapped values; 'abc' must come first in both report lists.
     """
     column_names = ['abc', 'def']
     df1 = pd.DataFrame([[1, 0]] * 10, columns=column_names)
     df2 = pd.DataFrame([[0, 1]] * 10, columns=column_names)
     store = Store(df1, df2)
     for check in (CategoricalStatisticalCheck(), NumericalStatisticalCheck()):
         with self.subTest(check=check):
             report = check.run(store)
             self.assertEqual('abc', report.examined_columns[0])
             self.assertEqual('abc', report.shifted_columns[0])
             self.assertEqual(report.examined_columns, report.shifted_columns)
コード例 #10
0
 def test_compliance_with_detector(self):
     """The categorical check runs through Detector on identical data.

     With two identical single-column frames, the first report must list
     one examined column, no shifted columns, no explanation, and a
     'test_results' frame holding a p-value of 1.0.
     """
     df1 = pd.DataFrame([0] * 10)
     df2 = pd.DataFrame([0] * 10)
     detector = Detector(df1=df1, df2=df2, log_print=False)
     detector.run(CategoricalStatisticalCheck())

     report = detector.check_reports[0]
     self.assertEqual(1, len(report.examined_columns))
     self.assertEqual(0, len(report.shifted_columns))
     self.assertEqual(0, len(report.explanation))
     expected_pvalues = pd.DataFrame([1.0], index=['pvalue'])
     assert_frame_equal(expected_pvalues,
                        report.information['test_results'])
コード例 #11
0
 def test_correct_number_of_plot_functions(self):
     """plot_data returns one entry per significant column.

     Cases cover two, one and zero significant columns; the data frames
     are mocks.
     """
     for sig_cols in [['col1', 'col2'], ['col1'], []]:
         with self.subTest(sig_cols=sig_cols):
             result = CategoricalStatisticalCheck().plot_data(sig_cols, MagicMock(), MagicMock())
             # Assert inside the subTest so every case is verified; after
             # the loop only the last case ([]) would ever be checked.
             self.assertEqual(len(sig_cols), len(result))
コード例 #12
0
 def test_grid_is_created(self, mock_grid, mock_plt_figure):
     """plot_all_columns requests a GridSpec of the expected shape.

     Three ``PlotData`` entries with row counts 1, 2 and 3 are expected
     to produce a ``GridSpec(6, 1)`` call for both checks.
     """
     plot_data = [PlotData(MagicMock(), row_count) for row_count in (1, 2, 3)]
     checks_with_shape = [
         (CategoricalStatisticalCheck(), (6, 1)),
         (NumericalStatisticalCheck(), (6, 1)),
     ]
     for check, (grid_rows, grid_cols) in checks_with_shape:
         with self.subTest(check=check):
             check.plot_all_columns(plot_data)
             mock_grid.GridSpec.assert_called_with(grid_rows, grid_cols)
コード例 #13
0
 def test_significant(self):
     """The significant fixture yields one examined, shifted and explained column."""
     store = Store(self.df1_significant, self.df2_significant)
     report = CategoricalStatisticalCheck().run(store)
     for reported in (report.examined_columns,
                      report.shifted_columns,
                      report.explanation):
         self.assertEqual(1, len(reported))