def test_all_plot_functions_are_called_and_plot_is_shown(self, mock_plt):
    """Check that ``plot_all_metadata`` calls every plot function and shows the figure.

    ``mock_plt`` is presumably injected by a patch decorator that is not
    visible in this chunk -- confirm against the surrounding file.
    """
    # One mocked plot function per entry, paired with the row counts 1, 2 and 3.
    plot_data = [PlotData(MagicMock(), required_rows) for required_rows in (1, 2, 3)]

    TextMetadataStatisticalCheck.plot_all_metadata(plot_data)

    mock_plt.figure.assert_called_with(figsize=(12.0, 30.0), tight_layout=True)
    for plot_function, _ in plot_data:
        self.assertTrue(plot_function.called)
    mock_plt.show.assert_called_with()
def test_figure_function_is_collected(self):
    """Check how many entries ``metadata_figure`` returns for varying p-values.

    Each case sets the two p-values of the ``text`` column to either 0.001 or
    0.05 and compares the length of the result with the expected count.
    """
    sample = {'text': ['blub'] * 10}
    df1 = pd.DataFrame.from_dict(sample)
    df2 = pd.DataFrame.from_dict(sample)
    cols = pd.MultiIndex.from_product([df1.columns, ['num_chars', 'num_words']],
                                      names=['column', 'metadata'])
    pvalues = pd.DataFrame(columns=cols, index=['pvalue'])
    check = TextMetadataStatisticalCheck()

    # (expected result length, number of metadata given the low p-value)
    cases = [(1, 2), (1, 1), (0, 0)]
    for expected_length, low_count in cases:
        num_chars_p, num_words_p = [0.001] * low_count + [0.05] * (2 - low_count)
        # The same frame is reused and overwritten for every case.
        pvalues[('text', 'num_chars')] = num_chars_p
        pvalues[('text', 'num_words')] = num_words_p
        with self.subTest(solution=expected_length, pvalues=pvalues):
            result = check.metadata_figure(pvalues=pvalues, df1=df1, df2=df2)
            self.assertEqual(expected_length, len(result))
def test_correct_number_of_plot_data(self):
    """Check that ``plot_data`` returns one entry per metadata with the low p-value.

    The two p-values of the ``text`` column are set to 0.001 or 0.05, and the
    length of the result must equal the number of 0.001 entries.
    """
    sample = {'text': ['blub'] * 10}
    df1 = pd.DataFrame.from_dict(sample)
    df2 = pd.DataFrame.from_dict(sample)
    cols = pd.MultiIndex.from_product([df1.columns, ['num_chars', 'num_words']],
                                      names=['column', 'metadata'])
    pvalues = pd.DataFrame(columns=cols, index=['pvalue'])
    check = TextMetadataStatisticalCheck()

    for low_count in (2, 1, 0):
        num_chars_p, num_words_p = [0.001] * low_count + [0.05] * (2 - low_count)
        # The same frame is reused and overwritten for every case.
        pvalues[('text', 'num_chars')] = num_chars_p
        pvalues[('text', 'num_words')] = num_words_p
        with self.subTest(num_sig_metadata=low_count, pvalues=pvalues):
            result = check.plot_data(['text'], pvalues, df1, df2)
            self.assertEqual(low_count, len(result))
def test_not_significant(self):
    """Check that a text column compared with its own reversal is not reported as shifted.

    Both frames hold the same ``self.poems`` entries, the second in reversed order.
    """
    original = pd.DataFrame.from_dict({'text': self.poems})
    reordered = pd.DataFrame.from_dict({'text': list(reversed(self.poems))})

    check = TextMetadataStatisticalCheck()
    result = check.run(Store(original, reordered))

    # One column examined, nothing shifted, nothing to explain.
    self.assertEqual(1, len(result.examined_columns))
    self.assertEqual(0, len(result.shifted_columns))
    self.assertEqual(0, len(result.explanation))
def test_column_order_in_report(self):
    """Check the order in which columns appear in the report.

    The frames are built with ``text`` before ``abc``; the report is expected
    to list ``abc`` first in both the examined and the shifted columns.
    """
    first_frame = pd.DataFrame.from_dict({'text': self.poems, 'abc': self.poems})
    second_frame = pd.DataFrame.from_dict({'text': self.phrases, 'abc': self.phrases})

    check = TextMetadataStatisticalCheck([NumCharsMetadata()])
    result = check.run(Store(first_frame, second_frame))

    self.assertEqual('abc', result.examined_columns[0])
    self.assertEqual('abc', result.shifted_columns[0])
    self.assertEqual(result.examined_columns, result.shifted_columns)
def test_significant(self):
    """Check that ``self.poems`` versus ``self.phrases`` is reported as a shift.

    The check runs with four metadata types and is expected to report one
    examined column, one shifted column and one explanation entry.
    """
    metadata_types = [
        NumCharsMetadata(),
        NumWordsMetadata(),
        DistinctWordsRatioMetadata(),
        LanguagePerParagraph(),
    ]
    poems_frame = pd.DataFrame.from_dict({'text': self.poems})
    phrases_frame = pd.DataFrame.from_dict({'text': self.phrases})

    store = Store(poems_frame, phrases_frame)
    result = TextMetadataStatisticalCheck(metadata_types).run(store)

    self.assertEqual(1, len(result.examined_columns))
    self.assertEqual(1, len(result.shifted_columns))
    self.assertEqual(1, len(result.explanation))
def test_size_adjustment(self):
    """Check the lengths returned by ``adjust_dataset_sizes`` for several check configurations.

    The inputs have 20 and 12 rows; each case pairs a configured check with
    the expected ``(len(first), len(second))`` after adjustment.
    """
    df1 = pd.DataFrame([0] * 20)
    df2 = pd.DataFrame([0] * 12)

    cases = [
        (CategoricalStatisticalCheck(sample_size=10), (10, 10)),
        (CategoricalStatisticalCheck(sample_size=15), (15, 12)),
        (NumericalStatisticalCheck(use_equal_dataset_sizes=True), (12, 12)),
        (TextMetadataStatisticalCheck(sample_size=15, use_equal_dataset_sizes=True), (12, 12)),
    ]
    for check, solution in cases:
        with self.subTest(solution=solution):
            expected_first, expected_second = solution
            result1, result2 = check.adjust_dataset_sizes(df1, df2)
            self.assertEqual(len(result1), expected_first)
            self.assertEqual(len(result2), expected_second)
def test_compliance_with_detector(self):
    """Run the check through ``Detector`` on two identical frames and verify the report.

    The report is expected to list one examined column, no shifted columns,
    no explanation, and a ``test_results`` frame whose p-values are all 1.0.
    """
    sentences = [
        'This is a very important text.',
        'It contains information.',
        'Brilliant ideas are written down.',
        'Read it.',
        'You will become a lot smarter.',
        'Or you will waste your time.',
        'Come on, figure it out!',
        'Perhaps it will at least entertain you.',
        'Do not be afraid.',
        'Be brave!',
    ]
    # Both data sets carry exactly the same sentences in the same order.
    df1 = pd.DataFrame.from_dict({'text': list(sentences)})
    df2 = pd.DataFrame.from_dict({'text': list(sentences)})

    detector = Detector(df1=df1, df2=df2, log_print=False)
    detector.run(TextMetadataStatisticalCheck())

    metadata_names = ['distinct_words', 'num_chars', 'num_words']
    column_index = pd.MultiIndex.from_product([['text'], metadata_names],
                                              names=['column', 'metadata'])
    solution = pd.DataFrame([[1.0, 1.0, 1.0]], columns=column_index, index=['pvalue'])

    report = detector.check_reports[0]
    self.assertEqual(1, len(report.examined_columns))
    self.assertEqual(0, len(report.shifted_columns))
    self.assertEqual(0, len(report.explanation))
    assert_frame_equal(solution, report.information['test_results'])
def test_infer_language_is_set(self):
    """Check that ``infer_language=True`` reaches the language-aware metadata types.

    The check is built with an ``UnknownWordRatioMetadata`` and a
    ``StopwordRatioMetadata``; every instance of those types found in
    ``check.metadata_precalculation.text_metadata_types`` must have a truthy
    ``infer_language`` attribute.
    """
    check = TextMetadataStatisticalCheck([UnknownWordRatioMetadata(), StopwordRatioMetadata()],
                                         infer_language=True)
    language_aware_types = (UnknownWordRatioMetadata, StopwordRatioMetadata)
    md_with_lang = [mdtype for mdtype in check.metadata_precalculation.text_metadata_types
                    if isinstance(mdtype, language_aware_types)]

    # Guard against a vacuous pass: with an empty list the loop below would
    # assert nothing and the test would succeed without checking anything.
    self.assertTrue(md_with_lang, 'no language-aware metadata types found on the check')
    for mdtype in md_with_lang:
        self.assertTrue(mdtype.infer_language)
def test_language_can_be_set(self):
    """Check that ``language='fr'`` reaches the language-aware metadata types.

    The check is built with an ``UnknownWordRatioMetadata`` and a
    ``StopwordRatioMetadata``; every instance of those types found in
    ``check.metadata_precalculation.text_metadata_types`` must report
    ``'fr'`` as its ``language``.
    """
    check = TextMetadataStatisticalCheck([UnknownWordRatioMetadata(), StopwordRatioMetadata()],
                                         language='fr')
    language_aware_types = (UnknownWordRatioMetadata, StopwordRatioMetadata)
    md_with_lang = [mdtype for mdtype in check.metadata_precalculation.text_metadata_types
                    if isinstance(mdtype, language_aware_types)]

    # Guard against a vacuous pass: with an empty list the loop below would
    # assert nothing and the test would succeed without checking anything.
    self.assertTrue(md_with_lang, 'no language-aware metadata types found on the check')
    for mdtype in md_with_lang:
        self.assertEqual('fr', mdtype.language)
def test_significant_metadata(self):
    """Check that ``significant_metadata_names`` keeps only p-values below the threshold.

    With ``significance=0.01``, the metadata with p-value 0.001 must be in
    the result and the one with p-value 0.2 must not.
    """
    pvalues = pd.DataFrame({'num_chars': [0.001], 'distinct_words_ratio': [0.2]},
                           index=['pvalue'])
    check = TextMetadataStatisticalCheck(significance=0.01)

    result = check.significant_metadata_names(pvalues)

    self.assertIn('num_chars', result)
    self.assertNotIn('distinct_words_ratio', result)
def test_correct_visualization_is_chosen_numerical(self):
    """Check that ``metadata_plot`` calls the numerical column plot for ``NumCharsMetadata``.

    ``NumericalStatisticalCheck.column_plot`` is patched for the duration of
    the call, and the test asserts that the patched attribute was called.
    """
    figure = MagicMock(spec=Figure)
    tile = MagicMock()
    metadata_type = NumCharsMetadata()

    with mock.patch.object(NumericalStatisticalCheck, 'column_plot') as mock_plot:
        # The data frames are not needed here because the plot is mocked.
        TextMetadataStatisticalCheck.metadata_plot(figure, tile, 'text', metadata_type, None, None)

    self.assertTrue(mock_plot.called)