Esempio n. 1
0
    def test_profiled_max(self):
        """Check that ``max`` follows the largest datetime across updates."""

        def date_linspace(start, end, steps):
            # Evenly spaced datetimes beginning at ``start``. The spacing is
            # (end - start) / steps, so ``end`` itself is never produced.
            step = (end - start) / steps
            offsets = list(range(steps)) * np.array([step] * steps)
            return start + offsets

        samples = pd.Series(
            date_linspace(datetime.datetime.min, datetime.datetime.max, 11)
        )
        # Strip the sub-second part, then give the profiler plain strings.
        samples = samples.apply(
            lambda ts: ts - datetime.timedelta(microseconds=ts.microsecond)
        )
        samples = samples.apply(str)

        # Start with everything except the last (largest) sample.
        all_but_last = samples[:-1]
        datetime_profile = DateTimeColumn(all_but_last.name)
        datetime_profile.update(all_but_last)
        self.assertEqual(datetime_profile.max, samples.iloc[-2])

        # Feeding the full series brings in the last sample as the new max.
        datetime_profile.update(samples)
        self.assertEqual(datetime_profile.max, samples.iloc[-1])

        # A null together with an earlier value leaves the max unchanged.
        earlier_with_null = pd.Series([np.nan, samples.iloc[3]])
        datetime_profile.update(earlier_with_null)
        self.assertEqual(datetime_profile.max, samples.iloc[-1])

        # Slice (rather than index) so the single value stays a Series.
        datetime_profile.update(samples[1:2])
        self.assertEqual(datetime_profile.max, samples.iloc[-1])
Esempio n. 2
0
    def test_profiled_date_time_formats(self):
        """
        Verify that every generated datetime format is reported by the
        profiler, both for a single update and for chunked updates.
        :return:
        """

        def build_samples(formats):
            # Append the generated samples for each format, in list order.
            combined = pd.Series([], dtype=object)
            for fmt in formats:
                combined = pd.concat(
                    [combined, self._generate_datetime_data(fmt)]
                )
            return combined

        # Formats carrying a time component.
        date_formats_1 = [
            "%Y-%m-%d %H:%M:%S",      # e.g. 2013-03-05 15:43:30
            "%Y-%m-%dT%H:%M:%S",      # e.g. 2013-03-06T15:43:30
            "%Y-%m-%dT%H:%M:%S.%fZ",  # e.g. 2013-03-06T15:43:30.123456Z
            "%m/%d/%y %H:%M",         # e.g. 03/10/13 15:43
            "%m/%d/%Y %H:%M",         # e.g. 03/08/2013 15:43
            "%Y%m%dT%H%M%S",          # e.g. 20130306T154330
            "%H:%M:%S.%f",            # e.g. 05:46:30.258509
        ]
        df_1 = build_samples(date_formats_1)

        # Date-only formats.
        date_formats_2 = [
            "%Y-%m-%d",   # e.g. 2013-03-07
            "%m/%d/%Y",   # e.g. 03/08/2013
            "%m/%d/%y",   # e.g. 03/10/13
            "%B %d, %Y",  # e.g. March 09, 2013
            "%b %d, %Y",  # e.g. Mar 11, 2013
            "%d%b%y",     # e.g. 12Mar13
            "%b-%d-%y",   # e.g. Mar-13-13
            "%m%d%Y",     # e.g. 03142013
        ]
        df_2 = build_samples(date_formats_2)

        date_formats_all = date_formats_1 + date_formats_2

        # Single update over both groups at once.
        df_all = pd.concat([df_1, df_2])
        datetime_profile = DateTimeColumn(df_all.name)
        datetime_profile.update(df_all)
        six.assertCountEqual(
            self, date_formats_all, set(datetime_profile.date_formats)
        )

        # Chunked updates: first group alone, then the second on top of it.
        datetime_profile = DateTimeColumn(df_1.name)
        datetime_profile.update(df_1)
        six.assertCountEqual(
            self, date_formats_1, set(datetime_profile.date_formats)
        )

        datetime_profile.update(df_2)
        six.assertCountEqual(
            self, date_formats_all, datetime_profile.date_formats
        )
Esempio n. 3
0
    def test_base_case(self):
        """An empty series yields no matches, min/max, formats or ratio."""
        empty = pd.Series([], dtype=object)
        profiler = DateTimeColumn(empty.name)
        # Updated twice on purpose: repeated empty input must change nothing.
        for _ in range(2):
            profiler.update(empty)

        self.assertEqual(profiler.match_count, 0)
        self.assertIsNone(profiler.min)
        self.assertIsNone(profiler.max)
        self.assertListEqual([], profiler.date_formats)
        self.assertIsNone(profiler.data_type_ratio)
Esempio n. 4
0
    def _test_datetime_detection_helper(self, date_formats):
        """Assert that each given format is the first one the profiler reports.

        For every entry of ``date_formats`` samples are produced with
        ``self._generate_datetime_data`` and fed to a fresh
        ``DateTimeColumn``; the first element of its ``date_formats`` must
        equal the format used for generation.
        """
        for expected_format in date_formats:
            samples = self._generate_datetime_data(expected_format)

            column = DateTimeColumn(samples.name)
            column.update(samples)

            detected = column.date_formats[0]
            self.assertEqual(expected_format, detected)
Esempio n. 5
0
    def test_report(self):
        """``report`` equals ``profile`` for both ``remove_disabled_flag`` values."""
        raw_values = [
            2.5, 12.5, "2013-03-10 15:43:30", 5, "03/10/13 15:43",
            "Mar 11, 2013"
        ]
        series = pd.Series(raw_values).apply(str)
        # The column is never updated here; the series only supplies its name.
        profile = DateTimeColumn(series.name)

        from_property = profile.profile
        keeping_disabled = profile.report(remove_disabled_flag=False)
        dropping_disabled = profile.report(remove_disabled_flag=True)

        self.assertDictEqual(from_property, keeping_disabled)
        self.assertDictEqual(from_property, dropping_disabled)
Esempio n. 6
0
 def test_day_suffixes(self):
     """
     Profiles dates whose day number carries a suffix (st, nd, rd, th).
     :return:
     """
     samples = pd.Series([
         "Mar 1st, 2020", "Feb 22nd, 2019", "October 23rd, 2018",
         "12thMar13"
     ]).apply(str)

     profiler = DateTimeColumn(samples.name)
     profiler.update(samples)

     # All four entries must match; min/max are reported as the input text.
     self.assertEqual("Mar 1st, 2020", profiler.max)
     self.assertEqual("12thMar13", profiler.min)
     self.assertEqual(4, profiler.match_count)
Esempio n. 7
0
 def test_datetime_column_with_wrong_options(self):
     """Constructing with a non-DateTimeOptions ``options`` raises ValueError."""
     expected_message = (
         "DateTimeColumn parameter 'options' must be"
         " of type DateTimeOptions."
     )
     with self.assertRaisesRegex(ValueError, expected_message):
         # Only the constructor's failure matters, so the result is not
         # bound to a name (the original left an unused local here).
         DateTimeColumn("Datetime", options="wrong_data_type")
Esempio n. 8
0
    def test_add(self):
        """Adding two profiles merges min, max and formats; a non-profile
        operand raises TypeError."""
        # First profile: a set of datetime strings.
        first_values = [
            "2013-03-5 15:43:30",
            "2013-03-6T15:43:30",
            "2013-03-6T15:43:30.123456Z",
            "03/10/2013 15:43",
            "3/8/2013 15:43",
            "%2013036T154330",
            "05:46:30.258509",
        ]
        first_series = pd.Series(first_values).apply(str)
        profile1 = DateTimeColumn(first_series.name)
        profile1.update(first_series)

        # Second profile: numbers mixed with datetime strings.
        second_values = [
            2.5, 12.5, "2013-03-10 15:23:20", 5, "03/10/2013 15:23",
            "Mar 12, 2013"
        ]
        second_series = pd.Series(second_values).apply(str)
        profile2 = DateTimeColumn(second_series.name)
        profile2.update(second_series)

        merged_profile = profile1 + profile2

        # Internal datetime objects backing the merged min and max.
        expected_min_obj = datetime.datetime.strptime(
            "05:46:30.258509", "%H:%M:%S.%f"
        )
        expected_max_obj = datetime.datetime.strptime(
            "2013-03-12", "%Y-%m-%d"
        )
        self.assertEqual(expected_min_obj, merged_profile._dt_obj_min)
        self.assertEqual(expected_max_obj, merged_profile._dt_obj_max)

        # Public min and max keep the original string representation.
        self.assertEqual("05:46:30.258509", merged_profile.min)
        self.assertEqual("Mar 12, 2013", merged_profile.max)

        # The merged format list, compared without regard to order.
        expected_formats = [
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%S.%fZ",
            "%m/%d/%Y %H:%M",
            "%H:%M:%S.%f",
            "%b %d, %Y",
        ]
        self.assertCountEqual(expected_formats, merged_profile.date_formats)

        # Adding anything other than a DateTimeColumn must raise TypeError.
        invalid_operand = "example_string"
        with self.assertRaises(TypeError) as exc:
            profile1 + invalid_operand

        expected_message = (
            "Unsupported operand type(s) for +: "
            "'DateTimeColumn' and '{}'"
        ).format(type(invalid_operand).__name__)
        self.assertEqual(str(exc.exception), expected_message)
Esempio n. 9
0
    def test_profile(self):
        """Validate the profile's contents and the accumulated update time."""
        data = [
            2.5, 12.5, '2013-03-10 15:43:30', 5, '03/10/13 15:43',
            'Mar 11, 2013'
        ]
        df = pd.Series(data).apply(str)
        profiler = DateTimeColumn(df.name)
        expected_profile = dict(
            min='03/10/13 15:43',
            max='Mar 11, 2013',
            histogram=None,
            format=[
                '%Y-%m-%d %H:%M:%S',
                "%m/%d/%y %H:%M",
                "%b %d, %Y",
            ],
            times=defaultdict(float, {'datetime': 1.0})
        )
        # The mocked clock returns 1.0, 2.0, 3.0, 4.0 on successive calls
        # (values are popped from the end of the list).
        time_array = [float(i) for i in range(4, 0, -1)]
        with mock.patch('time.time', side_effect=lambda: time_array.pop()):
            # Validate that the times dictionary is empty
            self.assertEqual(defaultdict(float), profiler.profile['times'])

            # Validate the time in the datetime class has the expected time.
            profiler.update(df)
            expected = defaultdict(float, {'datetime': 1.0})
            self.assertEqual(expected, profiler.profile['times'])

            profile = profiler.profile
            # assertCountEqual on two dicts iterates them, i.e. it compares
            # the key sets only. Keep that check, then verify the values
            # explicitly so the expected contents are actually enforced.
            self.assertCountEqual(expected_profile, profile)
            self.assertEqual(expected_profile['min'], profile['min'])
            self.assertEqual(expected_profile['max'], profile['max'])
            self.assertEqual(expected_profile['histogram'],
                             profile['histogram'])
            # Compared without regard to order.
            self.assertCountEqual(expected_profile['format'],
                                  profile['format'])
            self.assertEqual(expected_profile['times'], profile['times'])

            # Validate time in datetime class has expected time after second
            # update
            profiler.update(df)
            expected = defaultdict(float, {'datetime': 2.0})
            self.assertEqual(expected, profiler.profile['times'])
Esempio n. 10
0
    def test_data_ratio(self):
        """``data_type_ratio`` is None before any update, 0.5 after the first
        batch and 4/9 after the second."""
        first_batch = pd.Series([
            2.5, 12.5, '2013-03-5 15:43:30', 5, '03/10/13 15:43', 'Mar 11, 2013'
        ]).apply(str)
        second_batch = pd.Series([None, '10/20/13', 'nan'])

        profiler = DateTimeColumn(first_batch.name)
        self.assertEqual(profiler.data_type_ratio, None)

        # (batch to feed, ratio expected afterwards), applied in order.
        steps = [
            (first_batch, 0.5),
            (second_batch, 4/9.0),
        ]
        for batch, expected_ratio in steps:
            profiler.update(batch)
            self.assertEqual(profiler.data_type_ratio, expected_ratio)
Esempio n. 11
0
    def test_null_add(self):
        """Merging with a null-only profile keeps the other's min and max."""
        series = pd.Series([None, "2014-12-18", "2015-07-21"])

        # One profile sees only the null entry, the other only the dates.
        null_profile = DateTimeColumn(name="date")
        date_profile = DateTimeColumn(name="date")
        null_profile.update(series[:1])
        date_profile.update(series[1:])

        # Check both operand orders: nulls on the left, then on the right.
        operand_orders = (
            (null_profile, date_profile),
            (date_profile, null_profile),
        )
        for left, right in operand_orders:
            merged_profile = left + right
            self.assertEqual("2014-12-18", merged_profile.min)
            self.assertEqual("2015-07-21", merged_profile.max)
Esempio n. 12
0
    def test_data_ratio(self):
        """Track ``data_type_ratio`` from None through two successive updates."""
        mixed_values = [
            2.5, 12.5, "2013-03-5 15:43:30", 5, "03/10/13 15:43",
            "Mar 11, 2013"
        ]
        mixed_series = pd.Series(mixed_values).apply(str)
        profiler = DateTimeColumn(mixed_series.name)

        # Nothing profiled yet, so no ratio is available.
        ratio_before = profiler.data_type_ratio
        self.assertEqual(ratio_before, None)

        # First batch of six values.
        profiler.update(mixed_series)
        ratio_after_first = profiler.data_type_ratio
        self.assertEqual(ratio_after_first, 0.5)

        # Three further values bring the expected ratio to 4/9.
        follow_up = pd.Series([None, "10/20/13", "nan"])
        profiler.update(follow_up)
        ratio_after_second = profiler.data_type_ratio
        self.assertEqual(ratio_after_second, 4 / 9.0)
    def test_warning_for_bad_dates(self):
        """A four-digit year records no warning; a two-digit year raises a
        RuntimeWarning with the expected message."""
        four_digit_year = pd.Series(['03/10/2013 15:43'])
        profiler = DateTimeColumn(four_digit_year.name)

        # Record anything emitted while profiling the four-digit year.
        with warnings.catch_warnings(record=True) as recorded:
            profiler.update(four_digit_year)
        self.assertEqual(len(recorded), 0)

        two_digit_year = pd.Series(['03/10/13 15:43'])
        expected_message = (
            "Years provided were in two digit format. As a result, "
            "datetime assumes dates < 69 are for 2000s and above "
            "are for the 1990s. "
            "https://stackoverflow.com/questions/37766353/"
            "pandas-to-datetime-parsing-wrong-year"
        )
        with self.assertWarns(RuntimeWarning) as r_warning:
            profiler.update(two_digit_year)
        self.assertEqual(str(r_warning.warning), expected_message)
Esempio n. 14
0
    def test_diff(self):
        """Validate ``diff`` for min, max and formats, and its TypeError."""
        data1 = [None, 'Mar 12, 2013', "2013-05-18", "2014-03-01"]
        df1 = pd.Series(data1).apply(str)
        profiler1 = DateTimeColumn(df1.name)
        profiler1.update(df1)

        data2 = [
            2.5, 12.5, '2013-03-10 15:43:30', 5, '03/10/14 15:43',
            'Mar 11, 2013'
        ]
        df2 = pd.Series(data2).apply(str)
        profiler2 = DateTimeColumn(df2.name)
        profiler2.update(df2)

        expected_diff = {
            'min': "+1 days 08:16:30",
            'max': "-9 days 15:43:00",
        }
        # Format groups are compared as sets, so they are kept apart from
        # the dict that is checked with assertDictEqual.
        expected_unique1 = ['%Y-%m-%d']
        expected_shared = ['%b %d, %Y']
        expected_unique2 = ['%Y-%m-%d %H:%M:%S', '%m/%d/%y %H:%M']

        diff = profiler1.diff(profiler2)
        # Named ``format_diff`` so the builtin ``format`` is not shadowed.
        format_diff = diff.pop('format')
        unique1 = format_diff[0]
        shared = format_diff[1]
        unique2 = format_diff[2]
        self.assertDictEqual(expected_diff, diff)
        self.assertEqual(set(expected_unique1), set(unique1))
        self.assertEqual(set(expected_shared), set(shared))
        self.assertEqual(set(expected_unique2), set(unique2))

        # Assert type error is properly called
        with self.assertRaises(TypeError) as exc:
            profiler1.diff("Inproper input")
        self.assertEqual(str(exc.exception),
                         "Unsupported operand type(s) for diff: "
                         "'DateTimeColumn' and 'str'")