def test_profiled_max(self):
    """Check that ``max`` follows the largest value seen across updates."""

    def date_linspace(start, end, steps):
        # Evenly spaced datetimes: start, start + step, ... (``steps`` values,
        # ``end`` itself is not included).
        step = (end - start) / steps
        offsets = list(range(steps)) * np.array([step] * steps)
        return start + offsets

    raw_dates = pd.Series(
        date_linspace(datetime.datetime.min, datetime.datetime.max, 11)
    )
    # Drop the microsecond component before turning every value into text.
    truncated = raw_dates.apply(
        lambda value: value - datetime.timedelta(microseconds=value.microsecond)
    )
    date_strings = truncated.apply(str)

    all_but_last = date_strings[:-1]
    column = DateTimeColumn(all_but_last.name)

    # Everything except the final entry: max is the second-to-last value.
    column.update(all_but_last)
    self.assertEqual(column.max, date_strings.iloc[-2])

    # Full series: max moves to the final entry.
    column.update(date_strings)
    self.assertEqual(column.max, date_strings.iloc[-1])

    # A NaN plus an earlier date must not change max.
    column.update(pd.Series([np.nan, date_strings.iloc[3]]))
    self.assertEqual(column.max, date_strings.iloc[-1])

    # Slice (rather than index) so the single element stays a Series.
    column.update(date_strings[1:2])
    self.assertEqual(column.max, date_strings.iloc[-1])
def test_profiled_date_time_formats(self):
    """
    Checks whether the profiler properly determines all datetime formats.

    The formats are split into two groups so the test can profile them
    all at once and also in two successive chunks.

    :return:
    """

    def generate_samples(formats):
        # Collect the generated samples for every format and concatenate
        # once, instead of re-copying the growing series on each iteration.
        pieces = [pd.Series([], dtype=object)]
        pieces.extend(
            self._generate_datetime_data(date_format)
            for date_format in formats
        )
        return pd.concat(pieces)

    date_formats_1 = [
        "%Y-%m-%d %H:%M:%S",  # 2013-03-5 15:43:30
        "%Y-%m-%dT%H:%M:%S",  # 2013-03-6T15:43:30
        "%Y-%m-%dT%H:%M:%S.%fZ",  # 2013-03-6T15:43:30.123456Z
        "%m/%d/%y %H:%M",  # 03/10/13 15:43
        "%m/%d/%Y %H:%M",  # 3/8/2013 15:43
        "%Y%m%dT%H%M%S",  # 20130306T154330
        "%H:%M:%S.%f",  # 05:46:30.258509
    ]
    df_1 = generate_samples(date_formats_1)

    date_formats_2 = [
        "%Y-%m-%d",  # 2013-03-7
        "%m/%d/%Y",  # 3/8/2013
        "%m/%d/%y",  # 03/10/13
        "%B %d, %Y",  # March 9, 2013
        "%b %d, %Y",  # Mar 11, 2013
        "%d%b%y",  # 12Mar13
        "%b-%d-%y",  # Mar-13-13
        "%m%d%Y",  # 03142013
    ]
    df_2 = generate_samples(date_formats_2)

    date_formats_all = date_formats_1 + date_formats_2
    df_all = pd.concat([df_1, df_2])

    # Profile everything in a single update.
    datetime_profile = DateTimeColumn(df_all.name)
    datetime_profile.update(df_all)
    self.assertCountEqual(
        date_formats_all, set(datetime_profile.date_formats)
    )

    # Test chunks
    datetime_profile = DateTimeColumn(df_1.name)
    datetime_profile.update(df_1)
    self.assertCountEqual(
        date_formats_1, set(datetime_profile.date_formats)
    )

    # No set() here: after the second chunk the list itself is compared,
    # so a repeated format entry would fail the check.
    datetime_profile.update(df_2)
    self.assertCountEqual(date_formats_all, datetime_profile.date_formats)
def test_base_case(self):
    """Updating with an empty series leaves no matches, bounds or formats."""
    empty = pd.Series([], dtype=object)
    column = DateTimeColumn(empty.name)

    # Updated twice on purpose: a repeated empty update must change nothing.
    for _ in range(2):
        column.update(empty)

    self.assertEqual(column.match_count, 0)
    self.assertIsNone(column.min)
    self.assertIsNone(column.max)
    self.assertListEqual([], column.date_formats)
    self.assertIsNone(column.data_type_ratio)
def _test_datetime_detection_helper(self, date_formats):
    """
    Profile generated samples for each format and check the first
    detected format.

    :param date_formats: iterable of strftime-style format strings
    :return:
    """
    for expected_format in date_formats:
        # A fresh profile per format, fed only samples of that format.
        samples = self._generate_datetime_data(expected_format)
        column = DateTimeColumn(samples.name)
        column.update(samples)

        first_detected = column.date_formats[0]
        self.assertEqual(expected_format, first_detected)
def test_report(self):
    """``report`` must equal ``profile`` for either remove_disabled_flag."""
    raw_values = [
        2.5,
        12.5,
        "2013-03-10 15:43:30",
        5,
        "03/10/13 15:43",
        "Mar 11, 2013",
    ]
    series = pd.Series(raw_values).apply(str)
    # The column is built from the series name only; it is never updated
    # with the data in this test.
    column = DateTimeColumn(series.name)

    from_property = column.profile
    with_disabled = column.report(remove_disabled_flag=False)
    without_disabled = column.report(remove_disabled_flag=True)

    self.assertDictEqual(from_property, with_disabled)
    self.assertDictEqual(from_property, without_disabled)
def test_day_suffixes(self):
    """
    Tests datetime examples whose day number carries a suffix
    (st, nd, rd, th).

    :return:
    """
    samples = [
        "Mar 1st, 2020",
        "Feb 22nd, 2019",
        "October 23rd, 2018",
        "12thMar13",
    ]
    series = pd.Series(samples).apply(str)

    column = DateTimeColumn(series.name)
    column.update(series)

    # All four samples must match, and min/max are reported as the
    # original strings.
    self.assertEqual("Mar 1st, 2020", column.max)
    self.assertEqual("12thMar13", column.min)
    self.assertEqual(4, column.match_count)
def test_datetime_column_with_wrong_options(self):
    """Constructing with a non-DateTimeOptions ``options`` raises ValueError."""
    with self.assertRaisesRegex(
        ValueError,
        "DateTimeColumn parameter 'options' must be"
        " of type DateTimeOptions.",
    ):
        # Only the exception from the constructor matters, so the result
        # is deliberately not bound to a name.
        DateTimeColumn("Datetime", options="wrong_data_type")
def test_add(self):
    """Merge two profiles with ``+``; check bounds, formats and the TypeError."""
    # Samples for the first profile. The "%2013036T154330" entry has no
    # corresponding format in the expected list further down.
    first_samples = [
        "2013-03-5 15:43:30",
        "2013-03-6T15:43:30",
        "2013-03-6T15:43:30.123456Z",
        "03/10/2013 15:43",
        "3/8/2013 15:43",
        "%2013036T154330",
        "05:46:30.258509",
    ]
    first_series = pd.Series(first_samples).apply(str)
    profile1 = DateTimeColumn(first_series.name)
    profile1.update(first_series)

    # Samples for the second profile: numbers mixed with date strings.
    second_samples = [
        2.5,
        12.5,
        "2013-03-10 15:23:20",
        5,
        "03/10/2013 15:23",
        "Mar 12, 2013",
    ]
    second_series = pd.Series(second_samples).apply(str)
    profile2 = DateTimeColumn(second_series.name)
    profile2.update(second_series)

    merged_profile = profile1 + profile2

    # Internal datetime objects backing min and max.
    expected_min_dt = datetime.datetime.strptime(
        "05:46:30.258509", "%H:%M:%S.%f"
    )
    expected_max_dt = datetime.datetime.strptime("2013-03-12", "%Y-%m-%d")
    self.assertEqual(expected_min_dt, merged_profile._dt_obj_min)
    self.assertEqual(expected_max_dt, merged_profile._dt_obj_max)

    # Public min and max keep the original string representation.
    self.assertEqual("05:46:30.258509", merged_profile.min)
    self.assertEqual("Mar 12, 2013", merged_profile.max)

    # The merged profile carries the formats of both operands.
    expected_formats = [
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%m/%d/%Y %H:%M",
        "%H:%M:%S.%f",
        "%b %d, %Y",
    ]
    self.assertCountEqual(expected_formats, merged_profile.date_formats)

    # Adding something that is not a DateTimeColumn must raise TypeError.
    not_a_profile = "example_string"
    with self.assertRaises(TypeError) as exc:
        profile1 + not_a_profile

    # Checked after the block: code following the raising statement
    # inside the ``with`` would never run.
    self.assertEqual(
        str(exc.exception),
        "Unsupported operand type(s) for +: "
        "'DateTimeColumn' and '{}'".format(not_a_profile.__class__.__name__),
    )
def test_profile(self):
    """Check the profile contents and the accumulated ``times`` entry."""
    raw_values = [
        2.5,
        12.5,
        "2013-03-10 15:43:30",
        5,
        "03/10/13 15:43",
        "Mar 11, 2013",
    ]
    series = pd.Series(raw_values).apply(str)
    profiler = DateTimeColumn(series.name)

    expected_profile = {
        "min": "03/10/13 15:43",
        "max": "Mar 11, 2013",
        "histogram": None,
        "format": [
            "%Y-%m-%d %H:%M:%S",
            "%m/%d/%y %H:%M",
            "%b %d, %Y",
        ],
        "times": defaultdict(float, {"datetime": 1.0}),
    }

    # Values are popped from the end, so the patched time.time() returns
    # 1.0, 2.0, 3.0, 4.0 on successive calls.
    clock_ticks = [4.0, 3.0, 2.0, 1.0]
    with mock.patch("time.time", side_effect=lambda: clock_ticks.pop()):
        # Before any update the times dictionary is empty.
        self.assertEqual(defaultdict(float), profiler.profile["times"])

        # The first update records 1.0 under 'datetime'.
        profiler.update(series)
        after_first = defaultdict(float, {"datetime": 1.0})
        self.assertEqual(after_first, profiler.profile["times"])

        # NOTE(review): assertCountEqual iterates both arguments, so if
        # ``profile`` is a plain dict only the keys are compared here,
        # not the values -- confirm that is intended.
        actual_profile = profiler.profile
        self.assertCountEqual(expected_profile, actual_profile)

        # A second update adds to the recorded time.
        profiler.update(series)
        after_second = defaultdict(float, {"datetime": 2.0})
        self.assertEqual(after_second, profiler.profile["times"])
def test_data_ratio(self):
    """Check ``data_type_ratio`` before any update and after two updates."""
    # NOTE(review): a second ``test_data_ratio`` with the same assertions
    # is defined later in this file. If both live in the same class, the
    # later definition replaces this one -- confirm and remove one.
    data = [
        2.5, 12.5, '2013-03-5 15:43:30', 5, '03/10/13 15:43',
        'Mar 11, 2013'
    ]
    df = pd.Series(data).apply(str)

    profiler = DateTimeColumn(df.name)
    # No data seen yet: the ratio is None (identity check, not ==).
    self.assertIsNone(profiler.data_type_ratio)

    # Three of the six entries are date strings.
    profiler.update(df)
    self.assertEqual(profiler.data_type_ratio, 0.5)

    # One more date string among three new entries: expected 4 / 9.
    profiler.update(pd.Series([None, '10/20/13', 'nan']))
    self.assertEqual(profiler.data_type_ratio, 4 / 9.0)
def test_null_add(self):
    """Merging with a null-only profile keeps min/max in either order."""
    series = pd.Series([None, "2014-12-18", "2015-07-21"])
    nulls_only = series[:1]
    dates_only = series[1:]

    null_profile = DateTimeColumn(name="date")
    date_profile = DateTimeColumn(name="date")
    null_profile.update(nulls_only)
    date_profile.update(dates_only)

    # Null-only profile as the left operand.
    merged = null_profile + date_profile
    self.assertEqual("2014-12-18", merged.min)
    self.assertEqual("2015-07-21", merged.max)

    # Null-only profile as the right operand.
    merged = date_profile + null_profile
    self.assertEqual("2014-12-18", merged.min)
    self.assertEqual("2015-07-21", merged.max)
def test_data_ratio(self):
    """Check ``data_type_ratio`` before any update and after two updates."""
    # NOTE(review): another ``test_data_ratio`` with the same assertions
    # is defined earlier in this file. If both live in the same class,
    # this definition replaces the earlier one -- confirm and remove one.
    data = [
        2.5,
        12.5,
        "2013-03-5 15:43:30",
        5,
        "03/10/13 15:43",
        "Mar 11, 2013",
    ]
    df = pd.Series(data).apply(str)

    profiler = DateTimeColumn(df.name)
    # No data seen yet: the ratio is None (identity check, not ==).
    self.assertIsNone(profiler.data_type_ratio)

    # Three of the six entries are date strings.
    profiler.update(df)
    self.assertEqual(profiler.data_type_ratio, 0.5)

    # One more date string among three new entries: expected 4 / 9.
    profiler.update(pd.Series([None, "10/20/13", "nan"]))
    self.assertEqual(profiler.data_type_ratio, 4 / 9.0)
def test_warning_for_bad_dates(self):
    """Four-digit years update silently; two-digit years warn."""
    four_digit_year = pd.Series(['03/10/2013 15:43'])
    profiler = DateTimeColumn(four_digit_year.name)

    # Record every warning raised during the update; none are expected.
    with warnings.catch_warnings(record=True) as caught:
        profiler.update(four_digit_year)
        self.assertEqual(len(caught), 0)

    two_digit_year = pd.Series(['03/10/13 15:43'])
    with self.assertWarns(RuntimeWarning) as r_warning:
        profiler.update(two_digit_year)

    # The captured warning is inspected once the block has exited.
    expected_message = (
        "Years provided were in two digit format. As a result, "
        "datetime assumes dates < 69 are for 2000s and above "
        "are for the 1990s. "
        "https://stackoverflow.com/questions/37766353/"
        "pandas-to-datetime-parsing-wrong-year"
    )
    self.assertEqual(str(r_warning.warning), expected_message)
def test_diff(self):
    """Check ``diff`` between two profiles and its TypeError for bad input."""
    data1 = [None, 'Mar 12, 2013', "2013-05-18", "2014-03-01"]
    df1 = pd.Series(data1).apply(str)
    profiler1 = DateTimeColumn(df1.name)
    profiler1.update(df1)

    data2 = [
        2.5, 12.5, '2013-03-10 15:43:30', 5, '03/10/14 15:43',
        'Mar 11, 2013'
    ]
    df2 = pd.Series(data2).apply(str)
    profiler2 = DateTimeColumn(df2.name)
    profiler2.update(df2)

    expected_diff = {
        'min': "+1 days 08:16:30",
        'max': "-9 days 15:43:00",
    }
    # The 'format' entry is a three-part list:
    # [only in profiler1, shared, only in profiler2].
    expected_unique1 = ['%Y-%m-%d']
    expected_shared = ['%b %d, %Y']
    expected_unique2 = ['%Y-%m-%d %H:%M:%S', '%m/%d/%y %H:%M']

    diff = profiler1.diff(profiler2)
    # Named ``format_diff`` so the builtin ``format`` is not shadowed.
    format_diff = diff.pop('format')
    unique1 = format_diff[0]
    shared = format_diff[1]
    unique2 = format_diff[2]

    # With 'format' removed, the rest must match exactly.
    self.assertDictEqual(expected_diff, diff)
    # Format groups are compared as sets, so order within a group is ignored.
    self.assertEqual(set(expected_unique1), set(unique1))
    self.assertEqual(set(expected_shared), set(shared))
    self.assertEqual(set(expected_unique2), set(unique2))

    # Assert type error is properly called
    with self.assertRaises(TypeError) as exc:
        profiler1.diff("Inproper input")
    self.assertEqual(
        str(exc.exception),
        "Unsupported operand type(s) for diff: "
        "'DateTimeColumn' and 'str'",
    )