def run(self, store) -> Report:
    """Test every (column, metadata type) pair and build the report.

    The two metadata frames are read from ``store`` using
    ``self.metadata_precalculation`` and passed through
    ``self.adjust_dataset_sizes``. For every first-level column of the first
    frame and every text metadata type, the statistical test matching the
    metadata's return type is run on the adjusted frames, and the result is
    stored under that ``(column, metadata name)`` key.

    :param store: object indexed with ``self.metadata_precalculation`` to
        obtain the two metadata frames
    :return: a ``StatisticalReport`` titled "Text Metadata Check"
    :raises UnknownMetadataReturnColumnTypeError: if a metadata type reports
        a return type that is neither categorical nor numerical
    """
    df1, df2 = store[self.metadata_precalculation]
    part1, part2 = self.adjust_dataset_sizes(df1, df2)

    categorical_check = CategoricalStatisticalCheck()
    numerical_check = NumericalStatisticalCheck()

    text_columns = df1.columns.levels[0]
    pvalues = pd.DataFrame(columns=df1.columns, index=['pvalue'])

    for column in text_columns:
        for mdtype in self.metadata_precalculation.text_metadata_types:
            # Pick the check first so an unknown return type fails before
            # any data is touched.
            return_type = mdtype.metadata_return_type()
            if return_type == ColumnType.categorical:
                selected_check = categorical_check
            elif return_type == ColumnType.numerical:
                selected_check = numerical_check
            else:
                raise UnknownMetadataReturnColumnTypeError(mdtype)

            key = (column, mdtype.metadata_name())
            pvalues[key] = [selected_check.statistical_test(part1[key], part2[key])]

    significant_columns = self.significant_columns(pvalues)

    examined = sorted(text_columns)
    shifted = sorted(significant_columns)
    explanation = self.explain(pvalues)
    header = self.explanation_header(
        numerical_check.statistical_test_name(),
        categorical_check.statistical_test_name(),
        any_significant=len(significant_columns) > 0)
    figures = self.metadata_figure(pvalues, part1, part2)

    return StatisticalReport(
        "Text Metadata Check",
        examined_columns=examined,
        shifted_columns=shifted,
        explanation=explanation,
        explanation_header=header,
        information={'test_results': pvalues},
        figures=figures)
def metadata_plot(figure, tile, column, mdtype, df1, df2):
    """Draw the plot for one (column, metadata type) pair.

    The plotting routine is chosen by the metadata type's return type:
    categorical metadata goes to ``CategoricalStatisticalCheck.column_plot``
    and numerical metadata to ``NumericalStatisticalCheck.column_plot``.

    :param figure: forwarded unchanged to the selected ``column_plot``
    :param tile: forwarded unchanged to the selected ``column_plot``
    :param column: first element of the column key
    :param mdtype: metadata type providing ``metadata_name()`` and
        ``metadata_return_type()``
    :param df1: first frame, forwarded to the selected ``column_plot``
    :param df2: second frame, forwarded to the selected ``column_plot``
    :raises UnknownMetadataReturnColumnTypeError: if the return type is
        neither categorical nor numerical
    """
    key = (column, mdtype.metadata_name())
    return_type = mdtype.metadata_return_type()

    if return_type == ColumnType.categorical:
        draw = CategoricalStatisticalCheck.column_plot
    elif return_type == ColumnType.numerical:
        draw = NumericalStatisticalCheck.column_plot
    else:
        raise UnknownMetadataReturnColumnTypeError(mdtype)

    draw(figure, tile, key, df1, df2)
def test_size_adjustment(self):
    """Check the lengths returned by ``adjust_dataset_sizes``.

    A 20-row and a 12-row frame are passed to differently configured checks;
    each case lists the expected lengths of the two returned frames.
    """
    twenty_rows = pd.DataFrame([0] * 20)
    twelve_rows = pd.DataFrame([0] * 12)

    cases = [
        (CategoricalStatisticalCheck(sample_size=10), (10, 10)),
        (CategoricalStatisticalCheck(sample_size=15), (15, 12)),
        (NumericalStatisticalCheck(use_equal_dataset_sizes=True), (12, 12)),
        (TextMetadataStatisticalCheck(sample_size=15, use_equal_dataset_sizes=True), (12, 12)),
    ]

    for check, solution in cases:
        with self.subTest(solution=solution):
            adjusted1, adjusted2 = check.adjust_dataset_sizes(twenty_rows, twelve_rows)
            expected_len1, expected_len2 = solution
            self.assertEqual(len(adjusted1), expected_len1)
            self.assertEqual(len(adjusted2), expected_len2)
def test_paired_total_ratios_figure_looks_right(self, mock_plt):
    """Check that the paired ratio plot is drawn and labelled but not shown.

    ``pd.DataFrame.plot`` is patched; the test verifies that it was called,
    that title, axis labels and y-axis inversion were applied to its return
    value, and that ``show`` was never called on ``mock_plt``.
    """
    # NOTE(review): MagicMock has no `autospec` parameter; extra keyword
    # arguments configure attributes on the mock, so these mocks are
    # presumably not restricted to the Figure/Axes API -- confirm whether
    # `spec=` was intended.
    figure = MagicMock(autospec=Figure)
    axes = MagicMock(autospec=Axes)

    with mock.patch.object(pd.DataFrame, 'plot') as mock_plot:
        CategoricalStatisticalCheck.paired_total_ratios_plot(
            figure, axes, 'vaccination_reaction',
            self.df1_significant, self.df2_significant)

    self.assertTrue(mock_plot.called)

    plotted_axes = mock_plot.return_value
    for method_name in ('set_title', 'set_xlabel', 'set_ylabel', 'invert_yaxis'):
        self.assertTrue(getattr(plotted_axes, method_name).called)

    self.assertFalse(mock_plt.show.called)
def test_column_tuples_are_handled_by_categorical_visualization(self):
    """Check that a tuple column key is rendered into the plot title.

    Both frames use a two-level column index with the single key
    ``('text', 'category')``. The histogram helper is patched to return the
    mocked axes, and the test asserts the title set on those axes.
    """
    column_key = ('text', 'category')
    cols = pd.MultiIndex.from_product([['text'], ['category']],
                                      names=['column', 'metadata'])

    df1 = pd.DataFrame(columns=cols)
    df2 = pd.DataFrame(columns=cols)
    # Each frame gets one row whose value is the word repeated three times.
    # NOTE(review): possibly three rows (['latin'] * 3) were intended --
    # confirm; the title assertion does not depend on it.
    df1[column_key] = ['latin' * 3]
    df2[column_key] = ['arabic' * 3]

    # NOTE(review): `autospec=` is presumably stored as a plain attribute
    # here rather than acting as a spec -- confirm whether `spec=` was meant.
    figure = MagicMock(autospec=Figure)
    axes = MagicMock(autospec=Axes)

    with mock.patch.object(categorical_statistical_check.vis,
                           'plot_categorical_horizontal_ratio_histogram',
                           return_value=axes):
        CategoricalStatisticalCheck.paired_total_ratios_plot(
            figure, axes, column_key, df1, df2)

    axes.set_title.assert_called_once_with("Column: '('text', 'category')'", fontsize='x-large')
def test_figure_function_is_collected(self):
    """Check how many figures ``column_figure`` returns.

    The expected result length is 1 when at least one significant column is
    passed and 0 when the list of significant columns is empty.
    """
    column_names = ('col1', 'col2')
    df1 = pd.DataFrame.from_dict({name: ['value'] * 100 for name in column_names})
    df2 = pd.DataFrame.from_dict({name: ['value'] * 200 for name in column_names})
    check = CategoricalStatisticalCheck()

    cases = [
        (1, ['col1', 'col2']),
        (1, ['col1']),
        (0, []),
    ]
    for solution, sig_cols in cases:
        with self.subTest(solution=solution, sig_cols=sig_cols):
            figures = check.column_figure(significant_columns=sig_cols, df1=df1, df2=df2)
            self.assertEqual(solution, len(figures))
def test_all_plot_functions_are_called_and_plot_is_shown(self, mock_plt):
    """Check that ``plot_all_columns`` calls every plot function and shows it.

    For both check types the figure is expected to be created with a
    ``figsize`` of ``(12, 30.0)`` and ``tight_layout=True``, every plot
    function in the plot data must have been called, and ``show`` must have
    been called without arguments.
    """
    plot_data = [PlotData(MagicMock(), rows) for rows in (1, 2, 3)]
    checks_with_height = [
        (CategoricalStatisticalCheck(), 30.0),
        (NumericalStatisticalCheck(), 30.0),
    ]

    for check, height in checks_with_height:
        with self.subTest(check=check):
            check.plot_all_columns(plot_data)

            mock_plt.figure.assert_called_with(figsize=(12, height), tight_layout=True)
            for plot_function, _ in plot_data:
                self.assertTrue(plot_function.called)
            mock_plt.show.assert_called_with()
def test_plot_type(self, mock_plot, mock_subplot):
    """Check that ``column_plot`` creates a subplot and calls the plot mock.

    The subplot mock must have been called with the figure and tile that
    were handed to ``column_plot``, and the plot mock must have been called.
    """
    # NOTE(review): `autospec=` is presumably stored as a plain attribute on
    # the mock rather than acting as a spec -- confirm whether `spec=` was
    # intended.
    figure = MagicMock(autospec=Figure)
    tile = MagicMock()
    check = CategoricalStatisticalCheck()

    check.column_plot(figure, tile, 'vaccination_reaction',
                      self.df1_significant, self.df2_significant)

    mock_subplot.assert_called_with(figure, tile)
    self.assertTrue(mock_plot.called)
def test_column_order_in_report(self):
    """Check the column order in the reports of both check types.

    With columns ``abc`` and ``def`` whose values are swapped between the
    two frames, ``abc`` must come first in both the examined and the shifted
    columns, and the two lists must be equal.
    """
    column_names = ['abc', 'def']
    df1 = pd.DataFrame([[1, 0]] * 10, columns=column_names)
    df2 = pd.DataFrame([[0, 1]] * 10, columns=column_names)
    store = Store(df1, df2)

    checks = [CategoricalStatisticalCheck(), NumericalStatisticalCheck()]
    for check in checks:
        with self.subTest(check=check):
            report = check.run(store)
            self.assertEqual('abc', report.examined_columns[0])
            self.assertEqual('abc', report.shifted_columns[0])
            self.assertEqual(report.examined_columns, report.shifted_columns)
def test_compliance_with_detector(self):
    """Check the report produced when the check is run through ``Detector``.

    Two identical single-column frames are expected to give one examined
    column, no shifted columns, an empty explanation and a test-result frame
    holding a single p-value of 1.0.
    """
    df1 = pd.DataFrame([0] * 10)
    df2 = pd.DataFrame([0] * 10)

    detector = Detector(df1=df1, df2=df2, log_print=False)
    detector.run(CategoricalStatisticalCheck())
    report = detector.check_reports[0]

    self.assertEqual(1, len(report.examined_columns))
    self.assertEqual(0, len(report.shifted_columns))
    self.assertEqual(0, len(report.explanation))

    expected_results = pd.DataFrame([1.0], index=['pvalue'])
    assert_frame_equal(expected_results, report.information['test_results'])
def test_correct_number_of_plot_functions(self):
    """Check that ``plot_data`` returns one entry per significant column."""
    column_selections = (['col1', 'col2'], ['col1'], [])

    for sig_cols in column_selections:
        with self.subTest(sig_cols=sig_cols):
            check = CategoricalStatisticalCheck()
            entries = check.plot_data(sig_cols, MagicMock(), MagicMock())
            self.assertEqual(len(sig_cols), len(entries))
def test_grid_is_created(self, mock_grid, mock_plt_figure):
    """Check the grid shape requested by ``plot_all_columns``.

    With plot data entries carrying the values 1, 2 and 3, both check types
    are expected to call ``GridSpec`` with ``(6, 1)``.
    """
    plot_data = [PlotData(MagicMock(), rows) for rows in (1, 2, 3)]
    checks_with_shape = [
        (CategoricalStatisticalCheck(), (6, 1)),
        (NumericalStatisticalCheck(), (6, 1)),
    ]

    for check, (n_rows, n_cols) in checks_with_shape:
        with self.subTest(check=check):
            check.plot_all_columns(plot_data)
            mock_grid.GridSpec.assert_called_with(n_rows, n_cols)
def test_significant(self):
    """Check that the significant fixture frames yield one shifted column.

    The report must list exactly one examined column, one shifted column
    and one explanation entry.
    """
    store = Store(self.df1_significant, self.df2_significant)
    report = CategoricalStatisticalCheck().run(store)

    single_entry_collections = (
        report.examined_columns,
        report.shifted_columns,
        report.explanation,
    )
    for collection in single_entry_collections:
        self.assertEqual(1, len(collection))