Python StructuredProfiler Examples, dataprofiler.StructuredProfiler Python Examples

Example #1

0

Show file

File: test_integration_struct_data_labeler.py Project: JGSweets/data-profiler

    def test_warning_tf(self):
        """Profile ``csv/diamonds.csv`` once and gather each column's label.

        Every structured option listed below is switched off before the
        profiler is built. The visible body makes no assertion; it only
        requires that profiling and report generation run to completion.
        NOTE(review): the name suggests this guards against TensorFlow
        warnings -- confirm against the full test module.
        """
        # The 'data' directory sits next to this test file's parent directory.
        this_file = os.path.realpath(__file__)
        test_root_path = os.path.dirname(os.path.dirname(this_file))
        path = os.path.join(test_root_path, 'data', 'csv/diamonds.csv')
        data = dp.Data(path)

        disabled_features = {
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "datetime.is_enabled": False
        }
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set(disabled_features)

        results = dp.StructuredProfiler(data, options=profile_options).report()

        # One positional index and one 'data_label' per entry of 'data_stats'.
        column_stats = results['data_stats']
        columns = list(range(len(column_stats)))
        predictions = [column_stats[idx]['data_label'] for idx in columns]

Example #2

0

Show file

    def test_null_list(self, *mocks):
        """Check the missing-values matrix drawn for a list of three ``None``.

        Asserts that the returned object is a ``plt.Figure`` with exactly one
        axes, that the legend holds the single label ``"None"``, that the
        compared patch sits at (0.1, -0.5) with width 0.8 and height 3, and
        that the x tick text and both axis labels match the expected strings.
        """
        profiler = dp.StructuredProfiler([None, None, None],
                                         options=self.options)

        figure = graphs.plot_missing_values_matrix(profiler)
        self.assertIsInstance(figure, plt.Figure)
        self.assertEqual(1, len(figure.axes))

        axes = figure.axes[0]
        handles, legend_labels = axes.get_legend_handles_labels()
        self.assertEqual(['"None"'], legend_labels)

        expected_patch_values = [
            {"xy": (0.1, -0.5), "width": 0.8, "height": 3},
        ]

        # zip() stops at the shorter sequence, so only handles that have a
        # matching expectation are compared.
        for handle, spec in zip(handles, expected_patch_values):
            np.testing.assert_almost_equal(spec["xy"], handle.xy)
            self.assertEqual(spec["width"], handle.get_width())
            self.assertEqual(spec["height"], handle.get_height())

        tick_texts = [tick.get_text() for tick in axes.get_xticklabels()]
        self.assertListEqual(['"0"'], tick_texts)
        self.assertEqual("column name", axes.get_xlabel())
        self.assertEqual("row index", axes.get_ylabel())

Example #3

0

Show file

File: test_integration_struct_data_labeler.py Project: capitalone/DataProfiler

    def test_warning_tf_run_dp_multiple_times(self):
        """Profile ``csv/diamonds.csv`` three times in a row.

        Each pass loads the data afresh, builds a new ``ProfilerOptions`` with
        the options below disabled, profiles, and collects the per-column
        ``data_label`` values from the report. The visible body makes no
        assertion; it only requires that every pass completes.
        NOTE(review): the name suggests this guards against TensorFlow
        warnings on repeated runs -- confirm against the full test module.
        """
        # The "data" directory sits next to this test file's parent directory.
        this_file = os.path.realpath(__file__)
        test_root_path = os.path.dirname(os.path.dirname(this_file))
        path = os.path.join(test_root_path, "data", "csv/diamonds.csv")

        disabled_features = {
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "datetime.is_enabled": False,
        }

        for run_idx in range(3):
            print("running dp =============================", run_idx)
            data = dp.Data(path)

            profile_options = dp.ProfilerOptions()
            # Hand over a fresh dict on every pass, as the original literal did.
            profile_options.structured_options.set(dict(disabled_features))

            profile = dp.StructuredProfiler(data, options=profile_options)
            results = profile.report()

            # One positional index and one "data_label" per "data_stats" entry.
            column_stats = results["data_stats"]
            columns = list(range(len(column_stats)))
            predictions = [column_stats[idx]["data_label"] for idx in columns]

Example #4

0

Show file

    def test_2_null_types_multicol(self, *mocks):
        """Check the missing-values matrix for a four-column object frame.

        The frame mixes ``None`` and empty-string cells. Asserts that the
        result is a ``plt.Figure`` with one axes, that the legend labels are
        the five expected ``"None"`` / ``""`` entries, that each compared
        patch has the expected position, width and height, and that the
        x tick text and both axis labels match.
        """
        rows = [
            [None, "", 1.0, "1/2/2021"],
            [3, None, 3.5, ""],
            [1, None, 1.0, "2/5/2020"],
            [None, 1, 10.0, "3/5/2020"],
        ]
        column_names = ["integer", "str", "float", "datetime"]
        data = pd.DataFrame(rows, columns=column_names, dtype=object)

        profiler = dp.StructuredProfiler(data, options=self.options)

        figure = graphs.plot_missing_values_matrix(profiler)
        self.assertIsInstance(figure, plt.Figure)
        self.assertEqual(1, len(figure.axes))

        axes = figure.axes[0]
        handles, legend_labels = axes.get_legend_handles_labels()
        self.assertEqual(
            ['"None"', '"None"', '""', '"None"', '""'], legend_labels
        )

        expected_patch_values = [
            {"xy": (0.1, -0.5), "width": 0.8, "height": 1},
            {"xy": (0.1, 2.5), "width": 0.8, "height": 1},
            {"xy": (1.1, -0.5), "width": 0.8, "height": 1},
            {"xy": (1.1, 0.5), "width": 0.8, "height": 2},
            {"xy": (3.1, 0.5), "width": 0.8, "height": 1},
        ]

        # zip() stops at the shorter sequence, so only handles that have a
        # matching expectation are compared.
        for handle, spec in zip(handles, expected_patch_values):
            np.testing.assert_almost_equal(spec["xy"], handle.xy)
            self.assertEqual(spec["width"], handle.get_width())
            self.assertEqual(spec["height"], handle.get_height())

        tick_texts = [tick.get_text() for tick in axes.get_xticklabels()]
        self.assertListEqual(
            ['"integer"', '"str"', '"float"', '"datetime"'], tick_texts
        )
        self.assertEqual("column name", axes.get_xlabel())
        self.assertEqual("row index", axes.get_ylabel())

Example #5

0

Show file

 def test_no_data(self, *mocks):
     """Expect a ``UserWarning`` when plotting a profiler built from ``[]``."""
     expected_message = (
         "There was no data in the profiles to plot "
         "missing column values."
     )
     # The profiler is built outside the warning context, so only warnings
     # raised by the plotting call are checked against the pattern.
     empty_profiler = dp.StructuredProfiler([], options=self.options)
     with self.assertWarnsRegex(UserWarning, expected_message):
         graphs.plot_missing_values_matrix(empty_profiler)

Example #6

0

Show file

File: test_graphs.py Project: JGSweets/data-profiler

 def test_empty_profiler(self, plot_col_mock, plt_mock):
     """Expect a warning and a ``None`` result for a profiler with no data.

     NOTE(review): ``plot_col_mock`` and ``plt_mock`` are presumably injected
     by patch decorators that are not visible here -- confirm.
     """
     expected_message = (
         "No plots were constructed"
         " because no int or float columns "
         "were found in columns"
     )
     # Both the profiler construction and the plotting call run inside the
     # warning context.
     with self.assertWarnsRegex(Warning, expected_message):
         empty_profiler = dp.StructuredProfiler(
             data=None, options=self.options)
         fig = graphs.plot_histograms(empty_profiler)
     self.assertIsNone(fig)

Example #7

0

Show file

File: test_graphs.py Project: JGSweets/data-profiler

 def setUpClass(cls):
     """Store a sample frame, profiler options and a profiler on the class.

     The frame has eight rows over the columns ``int``, ``str``, ``float``
     and ``datetime``. The options have ``data_labeler`` and
     ``multiprocess`` disabled before the profiler is built.
     NOTE(review): presumably decorated with ``@classmethod`` above this
     line; the decorator is not visible here -- confirm.
     """
     rows = [
         [1, 'a', 1.0, '1/2/2021'],
         [None, 'b', None, '1/2/2020'],
         [3, 'c', 3.5, '1/2/2022'],
         [4, 'd', 4.5, '1/2/2023'],
         [5, 'e', 6.0, '5/2/2020'],
         [None, 'f', None, '1/5/2020'],
         [1, 'g', 1.0, '2/5/2020'],
         [None, 1, 10.0, '3/5/2020'],
     ]
     cls.data = pd.DataFrame(
         rows, columns=['int', 'str', 'float', 'datetime'])

     options = dp.ProfilerOptions()
     cls.options = options
     # Each flag is applied through its own set() call, in this order.
     for flag in ("data_labeler.is_enabled", "multiprocess.is_enabled"):
         options.set({flag: False})

     cls.profiler = dp.StructuredProfiler(cls.data, options=options)

Example #8

0

Show file

File: test_graphs.py Project: JGSweets/data-profiler

    def test_2_null_types_multicol(self, *mocks):
        """Check the missing-values matrix for a four-column object frame.

        The frame mixes ``None`` and empty-string cells. Asserts that the
        result is a ``plt.Figure`` with one axes, that the legend labels are
        the five expected ``"None"`` / ``""`` entries, that each compared
        patch has the expected position, width and height, and that the
        x tick text and both axis labels match.
        """
        rows = [
            [None, '', 1.0, '1/2/2021'],
            [3, None, 3.5, ''],
            [1, None, 1.0, '2/5/2020'],
            [None, 1, 10.0, '3/5/2020'],
        ]
        column_names = ['integer', 'str', 'float', 'datetime']
        data = pd.DataFrame(rows, columns=column_names, dtype=object)

        profiler = dp.StructuredProfiler(data, options=self.options)

        figure = graphs.plot_missing_values_matrix(profiler)
        self.assertIsInstance(figure, plt.Figure)
        self.assertEqual(1, len(figure.axes))

        axes = figure.axes[0]
        handles, legend_labels = axes.get_legend_handles_labels()
        self.assertEqual(
            ['"None"', '"None"', '""', '"None"', '""'], legend_labels)

        expected_patch_values = [
            {'xy': (0.1, -0.5), 'width': 0.8, 'height': 1},
            {'xy': (0.1, 2.5), 'width': 0.8, 'height': 1},
            {'xy': (1.1, -0.5), 'width': 0.8, 'height': 1},
            {'xy': (1.1, 0.5), 'width': 0.8, 'height': 2},
            {'xy': (3.1, 0.5), 'width': 0.8, 'height': 1},
        ]

        # zip() stops at the shorter sequence, so only handles that have a
        # matching expectation are compared.
        for handle, spec in zip(handles, expected_patch_values):
            np.testing.assert_almost_equal(spec['xy'], handle.xy)
            self.assertEqual(spec['width'], handle.get_width())
            self.assertEqual(spec['height'], handle.get_height())

        tick_texts = [tick.get_text() for tick in axes.get_xticklabels()]
        self.assertListEqual(
            ['"integer"', '"str"', '"float"', '"datetime"'], tick_texts)
        self.assertEqual('column name', axes.get_xlabel())
        self.assertEqual('row index', axes.get_ylabel())

Example #9

0

Show file

File: test_integration_struct_data_labeler.py Project: capitalone/DataProfiler

    def test_warning_tf_multiple_dp_with_update(self):
        """Build two profilers on ``csv/diamonds.csv`` and update the first.

        Each profiler gets its own freshly loaded data and its own
        ``ProfilerOptions`` with the options below disabled. The first
        profiler is then updated with the second load of the data. The
        visible body makes no assertion; it only requires that these steps
        complete.
        NOTE(review): the name suggests this guards against TensorFlow
        warnings -- confirm against the full test module.
        """
        # The "data" directory sits next to this test file's parent directory.
        this_file = os.path.realpath(__file__)
        test_root_path = os.path.dirname(os.path.dirname(this_file))
        path = os.path.join(test_root_path, "data", "csv/diamonds.csv")

        disabled_features = {
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False,
        }

        # First profiler: its own data load and its own options object.
        first_data = dp.Data(path)
        first_options = dp.ProfilerOptions()
        first_options.structured_options.set(dict(disabled_features))
        print("running dp1")
        profile1 = dp.StructuredProfiler(first_data, options=first_options)

        # Second profiler: built independently from a second data load.
        second_data = dp.Data(path)
        second_options = dp.ProfilerOptions()
        second_options.structured_options.set(dict(disabled_features))
        print("running dp2")
        profile2 = dp.StructuredProfiler(second_data, options=second_options)

        # The update is fed the second load of the dataset.
        profile1.update_profile(second_data)

Example #10

0

Show file

File: test_integration_struct_data_labeler.py Project: JGSweets/data-profiler

    def test_warning_tf_run_dp_merge(self):
        """Build two profilers on ``csv/diamonds.csv`` and add them together.

        Each profiler gets its own freshly loaded data and its own
        ``ProfilerOptions`` with the options below disabled. The two
        profilers are then combined with ``+``. The visible body makes no
        assertion; it only requires that these steps complete.
        NOTE(review): the name suggests this guards against TensorFlow
        warnings -- confirm against the full test module.
        """
        # The 'data' directory sits next to this test file's parent directory.
        this_file = os.path.realpath(__file__)
        test_root_path = os.path.dirname(os.path.dirname(this_file))
        path = os.path.join(test_root_path, 'data', 'csv/diamonds.csv')

        disabled_features = {
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False
        }

        # First profiler: its own data load and its own options object.
        first_data = dp.Data(path)
        first_options = dp.ProfilerOptions()
        first_options.structured_options.set(dict(disabled_features))
        print('running dp1')
        profile1 = dp.StructuredProfiler(first_data, options=first_options)

        # Second profiler: built independently from a second data load.
        second_data = dp.Data(path)
        second_options = dp.ProfilerOptions()
        second_options.structured_options.set(dict(disabled_features))
        print('running dp2')
        profile2 = dp.StructuredProfiler(second_data, options=second_options)

        profile = profile1 + profile2

Example #11

0

Show file

 def setUpClass(cls):
     """Store a sample frame, profiler options and a profiler on the class.

     The frame has eight rows over the columns ``int``, ``str``, ``float``
     and ``datetime``. The options have ``data_labeler`` and
     ``multiprocess`` disabled before the profiler is built.
     NOTE(review): presumably decorated with ``@classmethod`` above this
     line; the decorator is not visible here -- confirm.
     """
     column_names = ["int", "str", "float", "datetime"]
     sample_rows = [
         [1, "a", 1.0, "1/2/2021"],
         [None, "b", None, "1/2/2020"],
         [3, "c", 3.5, "1/2/2022"],
         [4, "d", 4.5, "1/2/2023"],
         [5, "e", 6.0, "5/2/2020"],
         [None, "f", None, "1/5/2020"],
         [1, "g", 1.0, "2/5/2020"],
         [None, 1, 10.0, "3/5/2020"],
     ]
     cls.data = pd.DataFrame(sample_rows, columns=column_names)

     profiler_options = dp.ProfilerOptions()
     cls.options = profiler_options
     # Each flag is applied through its own set() call, in this order.
     for flag in ("data_labeler.is_enabled", "multiprocess.is_enabled"):
         profiler_options.set({flag: False})

     cls.profiler = dp.StructuredProfiler(
         cls.data, options=profiler_options
     )