def test_warning_tf(self):
    """Profile diamonds.csv with most column profilers off and read each data label.

    The test has no explicit assertions; it passes when building the
    profile, generating the report and looking up ``data_label`` for every
    entry of ``data_stats`` completes without raising.
    NOTE(review): the name suggests this guards against TensorFlow
    warnings, but nothing here checks for warnings -- confirm intent.
    """
    # The data directory sits one level above this file's directory.
    root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    csv_path = os.path.join(root_dir, "data", "csv/diamonds.csv")
    dataset = dp.Data(csv_path)

    disabled_features = {
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "chi2_homogeneity.is_enabled": False,
        "datetime.is_enabled": False,
    }
    options = dp.ProfilerOptions()
    options.structured_options.set(disabled_features)

    profiler = dp.StructuredProfiler(dataset, options=options)
    data_stats = profiler.report()["data_stats"]

    # Collected only to exercise the lookups; the values are not inspected.
    columns = list(range(len(data_stats)))
    predictions = [data_stats[idx]["data_label"] for idx in columns]
def test_null_list(self, *mocks):
    """Check the missing-values matrix drawn for a list of three ``None`` values.

    Expects one axes, a single legend entry labelled ``"None"``, one patch
    at (0.1, -0.5) that is 0.8 wide and 3 tall, an x tick labelled ``"0"``
    and the standard axis labels.

    :param mocks: patched objects injected by decorators outside this view;
        they are not used in the body.
    """
    all_null = [None, None, None]
    null_profiler = dp.StructuredProfiler(all_null, options=self.options)

    figure = graphs.plot_missing_values_matrix(null_profiler)
    self.assertIsInstance(figure, plt.Figure)
    self.assertEqual(1, len(figure.axes))

    axes = figure.axes[0]
    handles, legend_labels = axes.get_legend_handles_labels()
    self.assertEqual(['"None"'], legend_labels)

    # (xy, width, height) for each legend patch, in legend order.
    expected_geometry = [((0.1, -0.5), 0.8, 3)]
    for handle, (xy, width, height) in zip(handles, expected_geometry):
        np.testing.assert_almost_equal(xy, handle.xy)
        self.assertEqual(width, handle.get_width())
        self.assertEqual(height, handle.get_height())

    tick_texts = [tick.get_text() for tick in axes.get_xticklabels()]
    self.assertListEqual(['"0"'], tick_texts)
    self.assertEqual("column name", axes.get_xlabel())
    self.assertEqual("row index", axes.get_ylabel())
def test_warning_tf_run_dp_multiple_times(self):
    """Profile diamonds.csv three times in a row with most profilers disabled.

    Each pass loads the data afresh, builds new options, creates a profiler,
    generates the report and reads ``data_label`` for every ``data_stats``
    entry. There are no explicit assertions; the test passes when no pass
    raises.
    """
    # The data directory sits one level above this file's directory.
    root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    csv_path = os.path.join(root_dir, "data", "csv/diamonds.csv")

    for run_index in range(3):
        print("running dp =============================", run_index)
        dataset = dp.Data(csv_path)

        options = dp.ProfilerOptions()
        options.structured_options.set(
            {
                "text.is_enabled": False,
                "int.is_enabled": False,
                "float.is_enabled": False,
                "order.is_enabled": False,
                "category.is_enabled": False,
                "chi2_homogeneity.is_enabled": False,
                "datetime.is_enabled": False,
            }
        )

        profiler = dp.StructuredProfiler(dataset, options=options)
        data_stats = profiler.report()["data_stats"]

        # Collected only to exercise the lookups; the values are not inspected.
        columns = list(range(len(data_stats)))
        predictions = [data_stats[col]["data_label"] for col in columns]
def test_2_null_types_multicol(self, *mocks):
    """Check the missing-values matrix for four columns mixing ``None`` and ``""``.

    Verifies the legend labels, the position and size of each legend patch,
    the quoted column names used as x tick labels, and the axis labels.

    :param mocks: patched objects injected by decorators outside this view;
        they are not used in the body.
    """
    column_names = ["integer", "str", "float", "datetime"]
    rows = [
        [None, "", 1.0, "1/2/2021"],
        [3, None, 3.5, ""],
        [1, None, 1.0, "2/5/2020"],
        [None, 1, 10.0, "3/5/2020"],
    ]
    frame = pd.DataFrame(rows, columns=column_names, dtype=object)
    frame_profiler = dp.StructuredProfiler(frame, options=self.options)

    figure = graphs.plot_missing_values_matrix(frame_profiler)
    self.assertIsInstance(figure, plt.Figure)
    self.assertEqual(1, len(figure.axes))

    axes = figure.axes[0]
    handles, legend_labels = axes.get_legend_handles_labels()
    self.assertEqual(['"None"', '"None"', '""', '"None"', '""'], legend_labels)

    # (xy, width, height) for each legend patch, in legend order.
    expected_geometry = [
        ((0.1, -0.5), 0.8, 1),
        ((0.1, 2.5), 0.8, 1),
        ((1.1, -0.5), 0.8, 1),
        ((1.1, 0.5), 0.8, 2),
        ((3.1, 0.5), 0.8, 1),
    ]
    for handle, (xy, width, height) in zip(handles, expected_geometry):
        np.testing.assert_almost_equal(xy, handle.xy)
        self.assertEqual(width, handle.get_width())
        self.assertEqual(height, handle.get_height())

    tick_texts = [tick.get_text() for tick in axes.get_xticklabels()]
    self.assertListEqual(
        ['"integer"', '"str"', '"float"', '"datetime"'], tick_texts
    )
    self.assertEqual("column name", axes.get_xlabel())
    self.assertEqual("row index", axes.get_ylabel())
def test_no_data(self, *mocks):
    """Expect a ``UserWarning`` when plotting missing values for an empty profile.

    :param mocks: patched objects injected by decorators outside this view;
        they are not used in the body.
    """
    empty_profiler = dp.StructuredProfiler([], options=self.options)
    # Treated as a regular expression by assertWarnsRegex.
    warning_pattern = (
        "There was no data in the profiles to plot missing column values."
    )
    with self.assertWarnsRegex(UserWarning, warning_pattern):
        graphs.plot_missing_values_matrix(empty_profiler)
def test_empty_profiler(self, plot_col_mock, plt_mock):
    """Expect a warning and a ``None`` result when plotting histograms of an empty profiler.

    :param plot_col_mock: patched object injected from outside this view;
        not used in the body.
    :param plt_mock: patched object injected from outside this view;
        not used in the body.
    """
    warning_pattern = (
        "No plots were constructed because no int or float columns "
        "were found in columns"
    )
    with self.assertWarnsRegex(Warning, warning_pattern):
        # The profiler is built inside the context, so warnings raised
        # during its construction are captured here as well.
        empty_profiler = dp.StructuredProfiler(data=None, options=self.options)
        fig = graphs.plot_histograms(empty_profiler)
    self.assertIsNone(fig)
def setUpClass(cls):
    """Create the DataFrame, profiler options and profiler shared by the tests.

    The options disable the data labeler and multiprocessing.
    NOTE(review): no ``@classmethod`` decorator is visible in this view;
    presumably it sits on the preceding line -- confirm.
    """
    column_names = ['int', 'str', 'float', 'datetime']
    rows = [
        [1, 'a', 1.0, '1/2/2021'],
        [None, 'b', None, '1/2/2020'],
        [3, 'c', 3.5, '1/2/2022'],
        [4, 'd', 4.5, '1/2/2023'],
        [5, 'e', 6.0, '5/2/2020'],
        [None, 'f', None, '1/5/2020'],
        [1, 'g', 1.0, '2/5/2020'],
        [None, 1, 10.0, '3/5/2020'],
    ]
    cls.data = pd.DataFrame(rows, columns=column_names)

    shared_options = dp.ProfilerOptions()
    shared_options.set({"data_labeler.is_enabled": False})
    shared_options.set({"multiprocess.is_enabled": False})
    cls.options = shared_options

    cls.profiler = dp.StructuredProfiler(cls.data, options=cls.options)
def test_2_null_types_multicol(self, *mocks):
    """Plot missing values for a 4x4 object frame containing ``None`` and ``''`` nulls.

    Asserts on the figure type, the number of axes, the legend labels, the
    geometry of each legend patch, the x tick labels and both axis labels.

    :param mocks: patched objects injected by decorators outside this view;
        they are not used in the body.
    """
    header = ['integer', 'str', 'float', 'datetime']
    records = [
        [None, '', 1.0, '1/2/2021'],
        [3, None, 3.5, ''],
        [1, None, 1.0, '2/5/2020'],
        [None, 1, 10.0, '3/5/2020'],
    ]
    table = pd.DataFrame(records, columns=header, dtype=object)
    table_profiler = dp.StructuredProfiler(table, options=self.options)

    fig = graphs.plot_missing_values_matrix(table_profiler)
    self.assertIsInstance(fig, plt.Figure)
    self.assertEqual(1, len(fig.axes))

    ax = fig.axes[0]
    legend_patches, legend_texts = ax.get_legend_handles_labels()
    self.assertEqual(['"None"', '"None"', '""', '"None"', '""'], legend_texts)

    # Expected anchor points and heights, in legend order; every patch
    # is expected to be 0.8 wide.
    expected_xy = [(0.1, -0.5), (0.1, 2.5), (1.1, -0.5), (1.1, 0.5), (3.1, 0.5)]
    expected_heights = [1, 1, 1, 2, 1]
    expected_width = 0.8
    for rect, xy, height in zip(legend_patches, expected_xy, expected_heights):
        np.testing.assert_almost_equal(xy, rect.xy)
        self.assertEqual(expected_width, rect.get_width())
        self.assertEqual(height, rect.get_height())

    shown_ticks = [label.get_text() for label in ax.get_xticklabels()]
    self.assertListEqual(
        ['"integer"', '"str"', '"float"', '"datetime"'], shown_ticks)
    self.assertEqual('column name', ax.get_xlabel())
    self.assertEqual('row index', ax.get_ylabel())
def test_warning_tf_multiple_dp_with_update(self):
    """Build two profilers over diamonds.csv, then update the first with new data.

    Both profilers use freshly created options with the same set of
    features disabled. The first profiler is then updated with the second
    ``dp.Data`` object. There are no explicit assertions; the test passes
    when nothing raises.
    """
    # The data directory sits one level above this file's directory.
    root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    csv_path = os.path.join(root_dir, "data", "csv/diamonds.csv")

    disabled_features = {
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
        "chi2_homogeneity.is_enabled": False,
        "correlation.is_enabled": False,
    }

    def _load_inputs():
        # Load the CSV and build a fresh options object for one profiler.
        dataset = dp.Data(csv_path)
        options = dp.ProfilerOptions()
        options.structured_options.set(dict(disabled_features))
        return dataset, options

    first_data, first_options = _load_inputs()
    print("running dp1")
    profile1 = dp.StructuredProfiler(first_data, options=first_options)

    second_data, second_options = _load_inputs()
    print("running dp2")
    profile2 = dp.StructuredProfiler(second_data, options=second_options)

    # The first profiler is updated with the second Data object.
    profile1.update_profile(second_data)
def test_warning_tf_run_dp_merge(self):
    """Build two profilers over diamonds.csv and merge them with ``+``.

    Both profilers are created from separately loaded data and separately
    created options that disable the same features. There are no explicit
    assertions; the test passes when construction and the merge do not
    raise.
    """
    # The data directory sits one level above this file's directory.
    root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    csv_path = os.path.join(root_dir, 'data', 'csv/diamonds.csv')

    feature_switches = {
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
        "chi2_homogeneity.is_enabled": False,
        "correlation.is_enabled": False,
    }

    first_data = dp.Data(csv_path)
    first_options = dp.ProfilerOptions()
    first_options.structured_options.set(dict(feature_switches))
    print('running dp1')
    profile1 = dp.StructuredProfiler(first_data, options=first_options)

    second_data = dp.Data(csv_path)
    second_options = dp.ProfilerOptions()
    second_options.structured_options.set(dict(feature_switches))
    print('running dp2')
    profile2 = dp.StructuredProfiler(second_data, options=second_options)

    # The merged result is not inspected; only the merge itself is exercised.
    merged_profile = profile1 + profile2
def setUpClass(cls):
    """Prepare the class-level DataFrame, options and profiler used by the tests.

    The data labeler and multiprocessing are switched off on the options
    before the profiler is built.
    NOTE(review): no ``@classmethod`` decorator is visible in this view;
    presumably it sits on the preceding line -- confirm.
    """
    cls.data = pd.DataFrame(
        data=[
            [1, "a", 1.0, "1/2/2021"],
            [None, "b", None, "1/2/2020"],
            [3, "c", 3.5, "1/2/2022"],
            [4, "d", 4.5, "1/2/2023"],
            [5, "e", 6.0, "5/2/2020"],
            [None, "f", None, "1/5/2020"],
            [1, "g", 1.0, "2/5/2020"],
            [None, 1, 10.0, "3/5/2020"],
        ],
        columns=["int", "str", "float", "datetime"],
    )

    cls.options = dp.ProfilerOptions()
    # Applied one flag per call, in this order.
    for flag in ("data_labeler.is_enabled", "multiprocess.is_enabled"):
        cls.options.set({flag: False})

    cls.profiler = dp.StructuredProfiler(cls.data, options=cls.options)