コード例 #1
0
 def test_all_plot_functions_are_called_and_plot_is_shown(self, mock_plt):
     """plot_all_metadata must create the figure, call each plot function and show the plot.

     ``mock_plt`` is supplied by the caller (presumably a ``mock.patch`` decorator
     on this method that replaces the plotting module -- not visible here).
     """
     # Three plot entries, each a fresh mock callable paired with 1, 2 and 3.
     plot_data = [PlotData(MagicMock(), second_field) for second_field in (1, 2, 3)]

     TextMetadataStatisticalCheck.plot_all_metadata(plot_data)

     mock_plt.figure.assert_called_with(figsize=(12.0, 30.0), tight_layout=True)
     # Only the callable of each entry matters here; the second field is ignored.
     for plot_function, _unused in plot_data:
         self.assertTrue(plot_function.called)
     mock_plt.show.assert_called_with()
コード例 #2
0
 def test_figure_function_is_collected(self):
     """metadata_figure must return one entry when any p-value is 0.001, else none.

     Each case is ``(expected length of the result, number of metadata given a
     p-value of 0.001)``; the remaining metadata get a p-value of 0.05.
     """
     df1 = pd.DataFrame.from_dict({'text': ['blub'] * 10})
     df2 = pd.DataFrame.from_dict({'text': ['blub'] * 10})
     cols = pd.MultiIndex.from_product([df1.columns, ['num_chars', 'num_words']],
                                       names=['column', 'metadata'])
     check = TextMetadataStatisticalCheck()
     # One 'pvalue' row whose cells are overwritten for every case below.
     pvalues = pd.DataFrame(columns=cols, index=['pvalue'])
     cases = [(1, 2), (1, 1), (0, 0)]
     for expected_length, low_pvalue_count in cases:
         chars_pvalue, words_pvalue = [0.001] * low_pvalue_count + [0.05] * (2 - low_pvalue_count)
         pvalues[('text', 'num_chars')] = chars_pvalue
         pvalues[('text', 'num_words')] = words_pvalue
         with self.subTest(solution=expected_length, pvalues=pvalues):
             figures = check.metadata_figure(pvalues=pvalues, df1=df1, df2=df2)
             self.assertEqual(expected_length, len(figures))
コード例 #3
0
 def test_correct_number_of_plot_data(self):
     """plot_data must return as many entries as there are metadata with p-value 0.001.

     The first ``num_sig_metadata`` metadata get a p-value of 0.001 and the rest
     0.05; the length of the result is expected to equal ``num_sig_metadata``.
     """
     df1 = pd.DataFrame.from_dict({'text': ['blub'] * 10})
     df2 = pd.DataFrame.from_dict({'text': ['blub'] * 10})
     metadata_names = ['num_chars', 'num_words']
     cols = pd.MultiIndex.from_product([df1.columns, metadata_names], names=['column', 'metadata'])
     check = TextMetadataStatisticalCheck()
     # One 'pvalue' row whose cells are overwritten for every case below.
     pvalues = pd.DataFrame(columns=cols, index=['pvalue'])
     for num_sig_metadata in (2, 1, 0):
         values = [0.001] * num_sig_metadata + [0.05] * (2 - num_sig_metadata)
         for metadata_name, pvalue in zip(metadata_names, values):
             pvalues[('text', metadata_name)] = pvalue
         with self.subTest(num_sig_metadata=num_sig_metadata, pvalues=pvalues):
             collected = check.plot_data(['text'], pvalues, df1, df2)
             self.assertEqual(num_sig_metadata, len(collected))
コード例 #4
0
 def test_not_significant(self):
     """Comparing the poems with the same poems in reverse order must report no shift.

     One column is examined, none is reported as shifted and the explanation
     is empty.
     """
     poems_in_reverse = list(reversed(self.poems))
     store = Store(pd.DataFrame.from_dict({'text': self.poems}),
                   pd.DataFrame.from_dict({'text': poems_in_reverse}))

     report = TextMetadataStatisticalCheck().run(store)

     self.assertEqual(1, len(report.examined_columns))
     self.assertEqual(0, len(report.shifted_columns))
     self.assertEqual(0, len(report.explanation))
コード例 #5
0
 def test_column_order_in_report(self):
     """The report must list 'abc' first, although the frames define 'text' first.

     Both columns carry the same data (poems vs. phrases), and the examined and
     shifted column lists are expected to be equal.
     """
     poems_frame = pd.DataFrame.from_dict({'text': self.poems, 'abc': self.poems})
     phrases_frame = pd.DataFrame.from_dict({'text': self.phrases, 'abc': self.phrases})
     check = TextMetadataStatisticalCheck([NumCharsMetadata()])

     report = check.run(Store(poems_frame, phrases_frame))

     self.assertEqual('abc', report.examined_columns[0])
     self.assertEqual('abc', report.shifted_columns[0])
     self.assertEqual(report.examined_columns, report.shifted_columns)
コード例 #6
0
 def test_significant(self):
     """Comparing poems with phrases must report the single text column as shifted.

     One column is examined, one is reported as shifted and the explanation
     has exactly one entry.
     """
     df1 = pd.DataFrame.from_dict({'text': self.poems})
     df2 = pd.DataFrame.from_dict({'text': self.phrases})
     store = Store(df1, df2)
     metadata_types = [
         NumCharsMetadata(),
         NumWordsMetadata(),
         DistinctWordsRatioMetadata(),
         LanguagePerParagraph(),
     ]

     report = TextMetadataStatisticalCheck(metadata_types).run(store)

     self.assertEqual(1, len(report.examined_columns))
     self.assertEqual(1, len(report.shifted_columns))
     self.assertEqual(1, len(report.explanation))
コード例 #7
0
 def test_size_adjustment(self):
     """adjust_dataset_sizes must return frames of the expected lengths.

     A 20-row and a 12-row frame are passed to several differently configured
     checks; each case pairs a check with the expected
     ``(len(result1), len(result2))``.
     """
     df1 = pd.DataFrame([0] * 20)
     df2 = pd.DataFrame([0] * 12)
     cases = [
         (CategoricalStatisticalCheck(sample_size=10), (10, 10)),
         (CategoricalStatisticalCheck(sample_size=15), (15, 12)),
         (NumericalStatisticalCheck(use_equal_dataset_sizes=True), (12, 12)),
         (TextMetadataStatisticalCheck(sample_size=15, use_equal_dataset_sizes=True), (12, 12)),
     ]
     for check, solution in cases:
         # Two cases share the expected sizes (12, 12); labelling the subtest
         # with the check class as well makes a failing case identifiable.
         with self.subTest(check=type(check).__name__, solution=solution):
             result1, result2 = check.adjust_dataset_sizes(df1, df2)
             # Expected value first, matching the other tests in this file.
             self.assertEqual(solution[0], len(result1))
             self.assertEqual(solution[1], len(result2))
コード例 #8
0
 def test_compliance_with_detector(self):
     """Running the check through a Detector on two identical frames must report no shift.

     The stored 'test_results' frame is expected to hold a p-value of 1.0 for
     'distinct_words', 'num_chars' and 'num_words' of the 'text' column.
     """
     sentences = ['This is a very important text.',
                  'It contains information.', 'Brilliant ideas are written down.',
                  'Read it.', 'You will become a lot smarter.',
                  'Or you will waste your time.', 'Come on, figure it out!',
                  'Perhaps it will at least entertain you.', 'Do not be afraid.',
                  'Be brave!']
     # Each frame gets its own copy of the list so the two inputs stay independent.
     df1 = pd.DataFrame.from_dict({'text': list(sentences)})
     df2 = pd.DataFrame.from_dict({'text': list(sentences)})

     detector = Detector(df1=df1, df2=df2, log_print=False)
     detector.run(TextMetadataStatisticalCheck())
     report = detector.check_reports[0]

     expected_columns = pd.MultiIndex.from_product(
         [['text'], ['distinct_words', 'num_chars', 'num_words']],
         names=['column', 'metadata'])
     expected_results = pd.DataFrame([[1.0, 1.0, 1.0]], columns=expected_columns, index=['pvalue'])

     self.assertEqual(1, len(report.examined_columns))
     self.assertEqual(0, len(report.shifted_columns))
     self.assertEqual(0, len(report.explanation))
     assert_frame_equal(expected_results, report.information['test_results'])
コード例 #9
0
 def test_infer_language_is_set(self):
     """infer_language=True must be set on the language-dependent metadata types.

     Only metadata objects whose exact type is UnknownWordRatioMetadata or
     StopwordRatioMetadata are inspected.
     """
     types_with_language = (UnknownWordRatioMetadata, StopwordRatioMetadata)
     check = TextMetadataStatisticalCheck([UnknownWordRatioMetadata(), StopwordRatioMetadata()], infer_language=True)
     for candidate in check.metadata_precalculation.text_metadata_types:
         # Exact type match on purpose: subclasses are not inspected.
         if type(candidate) not in types_with_language:
             continue
         self.assertTrue(candidate.infer_language)
コード例 #10
0
 def test_language_can_be_set(self):
     """language='fr' must be set on the language-dependent metadata types.

     Only metadata objects whose exact type is UnknownWordRatioMetadata or
     StopwordRatioMetadata are inspected.
     """
     types_with_language = (UnknownWordRatioMetadata, StopwordRatioMetadata)
     check = TextMetadataStatisticalCheck([UnknownWordRatioMetadata(), StopwordRatioMetadata()], language='fr')
     for candidate in check.metadata_precalculation.text_metadata_types:
         # Exact type match on purpose: subclasses are not inspected.
         if type(candidate) not in types_with_language:
             continue
         self.assertEqual('fr', candidate.language)
コード例 #11
0
 def test_significant_metadata(self):
     """With significance 0.01, only the metadata with p-value 0.001 must be returned.

     'num_chars' (0.001) is expected in the result; 'distinct_words_ratio'
     (0.2) is not.
     """
     check = TextMetadataStatisticalCheck(significance=0.01)
     pvalues = pd.DataFrame(
         [[0.001, 0.2]],
         columns=['num_chars', 'distinct_words_ratio'],
         index=['pvalue'],
     )

     significant_names = check.significant_metadata_names(pvalues)

     self.assertIn('num_chars', significant_names)
     self.assertNotIn('distinct_words_ratio', significant_names)
コード例 #12
0
 def test_correct_visualization_is_chosen_numerical(self):
     """metadata_plot must call NumericalStatisticalCheck.column_plot for NumCharsMetadata."""
     figure = MagicMock(spec=Figure)
     tile = MagicMock()
     # column_plot is replaced only for the duration of the metadata_plot call.
     with mock.patch.object(NumericalStatisticalCheck, 'column_plot') as column_plot_mock:
         TextMetadataStatisticalCheck.metadata_plot(figure, tile, 'text', NumCharsMetadata(), None, None)
     self.assertTrue(column_plot_mock.called)