def test_correlate_date_max_min(self):
    """Check correlate_dates against explicit max_date / min_date bounds.

    A single source date is correlated twice: once with a ``max_date``
    earlier than the source and once with a ``min_date`` later than it.
    In both cases the expected output is the bound itself.
    """
    builder = DataBuilderTools()
    source_dates = ['10/01/2017']

    # Upper bound below the source date.
    capped = builder.correlate_dates(source_dates, max_date='05/01/2017')
    self.assertEqual(['05-01-2017T00:00:00'], capped)

    # Lower bound above the source date.
    floored = builder.correlate_dates(source_dates, min_date='12/01/2017')
    self.assertEqual(['12-01-2017T00:00:00'], floored)
def test_correlate_numbers_max_min(self):
    """Check correlate_numbers against explicit max_value / min_value bounds.

    The single input value ``2`` is expected to come back as the bound
    when ``max_value`` is below it or ``min_value`` is above it.
    """
    builder = DataBuilderTools()
    numbers = [2]

    # Upper bound below the input value.
    capped = builder.correlate_numbers(numbers, max_value=1)
    self.assertEqual([1], capped)

    # Lower bound above the input value.
    floored = builder.correlate_numbers(numbers, min_value=3)
    self.assertEqual([3], floored)
def test_associate_custom(self):
    """Exercise associate_custom with exec'd statements and an evaluated expression.

    Three scenarios are covered:
    1. an exec'd ``df.loc`` assignment using the ``new_value`` keyword,
    2. an exec'd ``np.where`` replacement, also checked on the source frame,
    3. an expression (no ``use_exec``) that formats a column named by ``header``.
    """
    builder = DataBuilderTools()

    frame = pd.DataFrame()
    frame['cat'] = builder.get_category(list('MFU'), size=10, seed=31)
    frame['values'] = builder.get_number(10, size=10, seed=31)

    # Scenario 1: exec a .loc assignment that writes `new_value` for category 'U'.
    loc_statement = "df.loc[df['cat'] == 'U', 'values'] = new_value"
    expected = [
        5.0, np.nan, 1.0, np.nan, np.nan, 6.0, np.nan, np.nan, 5.0, 3.0
    ]
    outcome = builder.associate_custom(df=frame,
                                       code_str=loc_statement,
                                       use_exec=True,
                                       new_value=None)
    self.assertEqual(expected, list(outcome['values']))

    # Scenario 2: regenerate the numbers, then exec an np.where replacement.
    frame['values'] = builder.get_number(10, size=10, seed=31)
    where_statement = "df['values'] = np.where(df['cat'] == 'U', None, df['values'])"
    outcome = builder.associate_custom(df=frame,
                                       code_str=where_statement,
                                       use_exec=True,
                                       new_value=None)
    expected = [None, 2, 0, 4, None, None, 3, 5, 4, 2]
    self.assertEqual(expected, list(outcome['values']))
    # The frame passed in is expected to carry the same values.
    self.assertEqual(expected, list(frame['values']))

    # Scenario 3: evaluate an expression against the column given by `header`.
    staff = pd.DataFrame()
    staff['sid'] = [1000]
    outcome = builder.associate_custom(
        df=staff,
        code_str="df[header].apply(lambda x: f'CU_{x}')",
        header='sid')
    self.assertEqual(['CU_1000'], list(outcome))
def test_example(self):
    """Build a small accounts frame end to end (no assertions; must not raise)."""
    builder = DataBuilderTools()
    accounts = pd.DataFrame()
    accounts['account_id'] = builder.unique_identifiers(1000, 2000, size=150)

    # Weighting pattern loaded towards low values that then tails off
    # exponentially.
    leading_weights = [0.01, 0.8, 1, 3, 9, 8, 3, 2, 1]
    decaying_tail = np.flip(np.exp(np.arange(-5, 0.0, 0.2)).round(2))
    value_distribution = leading_weights + list(decaying_tail)

    # Apply the weighting to a balance range of 0 to 1000.
    accounts['balance'] = builder.get_number(0,
                                             1000,
                                             weight_pattern=value_distribution,
                                             size=150)

    # Derive interest from the balance with a multiply action.
    accounts['interest'] = builder.correlate_numbers(accounts['balance'],
                                                     spread=0,
                                                     offset=0.02,
                                                     action='multiply',
                                                     precision=2)
def test_correlation_dates(self):
    """Check seeded correlate_dates output for mixed and generated inputs.

    The first input list mixes date strings with ``None``, an empty string
    and a non-date string; the expected outputs keep those three entries
    unchanged. The second half generates dates with ``get_datetime`` and
    correlates them with a two-day spread on either side.
    """
    builder = DataBuilderTools()
    day_first_format = "%d-%m-%Y"
    mixed_input = ['10/01/2017', '12/01/2017', None, '', 'Fred']

    # Spread plus a three-year offset.
    shifted = builder.correlate_dates(mixed_input,
                                      lower_spread={'days': 2},
                                      upper_spread={'days': 2},
                                      offset={'years': 3},
                                      date_format=day_first_format,
                                      seed=99)
    self.assertEqual(['10-01-2020', '12-01-2020', None, '', 'Fred'], shifted)

    # Only the output format and seed are supplied.
    reformatted = builder.correlate_dates(mixed_input,
                                          date_format=day_first_format,
                                          seed=99)
    self.assertEqual(['11-01-2017', '13-01-2017', None, '', 'Fred'],
                     reformatted)

    # Generate five seeded dates within 2010 (called on the class).
    generated = DataBuilderTools.get_datetime('01/01/2010',
                                              '31/12/2010',
                                              date_format=day_first_format,
                                              seed=99,
                                              size=5)
    self.assertEqual([
        '17-07-2010', '07-07-2010', '06-08-2010', '07-06-2010', '23-02-2010'
    ], generated)

    # Correlate the generated dates with a +/- two day spread.
    jittered = builder.correlate_dates(generated,
                                       lower_spread={'days': 2},
                                       upper_spread={'days': 2},
                                       seed=99,
                                       date_format=day_first_format)
    self.assertEqual([
        '18-07-2010', '08-07-2010', '06-08-2010', '06-06-2010', '22-02-2010'
    ], jittered)
def test_correlation_dates_attributes(self):
    """Check the day-difference range produced by correlate_dates offsets.

    100 join dates are generated, correlated with an offset (and, in the
    second case, asymmetric spreads), and the minimum and maximum
    differences between the correlated and the original dates are
    asserted in whole days.
    """
    tools = DataBuilderTools()
    rows = 100
    df_staff = pd.DataFrame()
    df_staff['joined'] = tools.get_datetime(start='01/01/2008',
                                            until='07/01/2019',
                                            date_format='%d-%m-%Y',
                                            size=rows)

    def to_timestamp(value):
        """Parse a day-first date value; unparseable values are coerced."""
        # NOTE(review): `infer_datetime_format` is documented as deprecated
        # from pandas 2.0; kept so parsing is unchanged on older pandas.
        return pd.to_datetime(value,
                              errors='coerce',
                              infer_datetime_format=True,
                              dayfirst=True)

    def offset_limits(before, after):
        """Return (min, max) of the pairwise ``after - before`` differences."""
        # zip() would silently truncate, so require matching lengths first.
        self.assertEqual(len(before), len(after))
        diffs = [
            to_timestamp(new) - to_timestamp(old)
            for old, new in zip(before, after)
        ]
        return min(diffs), max(diffs)

    control = cleaner.list_formatter(df_staff['joined'])

    # Offset only.
    result = tools.correlate_dates(df_staff['joined'], offset={'days': 7})
    min_diff, max_diff = offset_limits(control, result)
    # assertEqual replaces the assertEquals alias removed in Python 3.12.
    self.assertEqual(7, max_diff.days)
    self.assertEqual(6, min_diff.days)

    # Offset with asymmetric lower/upper spreads.
    result = tools.correlate_dates(df_staff['joined'],
                                   offset={'days': 7},
                                   lower_spread={'days': 3},
                                   upper_spread={'days': 5})
    min_diff, max_diff = offset_limits(control, result)
    self.assertEqual(11, max_diff.days)
    self.assertEqual(4, min_diff.days)
def test_dates(self):
    """Exercise get_datetime: size, range edges, formatting, ignore_time, at_most."""
    # Create the tools instance locally, as the sibling tests do, so this
    # test does not depend on a `tools` name defined outside the method.
    tools = DataBuilderTools()

    result = tools.get_datetime('01/01/2018',
                                '01/01/2019',
                                as_num=True,
                                size=1000)
    self.assertEqual(1000, len(result))

    # get the values at the edge
    value_min = tools.get_datetime('01/01/2018',
                                   '01/01/2018',
                                   as_num=True,
                                   size=1)[0]
    value_max = tools.get_datetime('01/01/2019',
                                   '01/01/2019',
                                   as_num=True,
                                   size=1)[0]
    self.assertLessEqual(value_min, min(result))
    self.assertGreaterEqual(value_max, max(result))

    # identical start and end with no format: expects one UTC-aware datetime
    result = tools.get_datetime('01/01/2018', '01/01/2018')
    self.assertEqual([
        datetime.datetime(2018, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
    ], result)

    # test date_format
    result = tools.get_datetime('01/01/2018',
                                '01/01/2018',
                                date_format="%Y-%m-%d")
    self.assertEqual(['2018-01-01'], result)

    # test ignore time
    result = tools.get_datetime('01/01/2018T01:01:01',
                                '01/01/2018T23:59:59',
                                ignore_time=True)
    self.assertEqual([
        datetime.datetime(2018, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
    ], result)

    # test at_most
    result = tools.get_datetime('01/01/2018',
                                '01/04/2018',
                                ignore_time=True,
                                at_most=1,
                                size=3,
                                date_format="%Y-%m-%d")
    self.assertEqual(3, len(result))
    # Order is not asserted, only the set of dates.
    self.assertCountEqual(['2018-01-01', '2018-01-03', '2018-01-02'], result)
def test_associate_dataset(self):
    """Exercise associate_canonical on a list, a small frame and the titanic set.

    Each scenario pairs a list of association rules with an ``actions``
    mapping keyed by rule position, then compares the output with a
    fixed control list.
    """
    builder = DataBuilderTools()

    # --- Scenario 1: plain list of category codes --------------------------
    genders = ['M', 'F', 'M', 'M', 'U', 'F']
    list_rules = [{'expect': 'c', 'value': code} for code in ('M', 'F')]
    list_actions = {0: {'action': 'Male'}, 1: {'action': 'Female'}}
    outcome = builder.associate_canonical(genders,
                                          associations=list_rules,
                                          actions=list_actions,
                                          default_header='_default')
    self.assertEqual(['Male', 'Female', 'Male', 'Male', 'U', 'Female'],
                     outcome)

    # --- Scenario 2: two-column frame, mixed action types ------------------
    people = pd.DataFrame()
    people['gender'] = ['M', 'F', 'M', 'M', 'U', 'F', 'U']
    people['age'] = [20, 22, 18, 43, 34, 57, 22]

    frame_rules = [
        {
            'age': {'expect': 'n', 'value': [24, 100]},
            'gender': {'expect': 'category', 'value': ['M']},
        },
        {
            'age': {'expect': 'n', 'value': [0, 23]},
        },
        {
            'age': {'expect': 'n', 'value': [24, 100]},
            'gender': {'expect': 'category', 'value': ['F']},
        },
    ]
    frame_actions = {
        # constant value
        0: {'action': 'Dad'},
        # method name with kwargs referencing the 'age' column
        1: {
            'action': 'correlate_numbers',
            'kwargs': {'values': {'_header': 'age'}, 'offset': 100},
        },
        # direct column reference
        2: {'action': {'_header': 'age'}},
    }
    outcome = builder.associate_canonical(people,
                                          associations=frame_rules,
                                          actions=frame_actions,
                                          default_value='Unknown')
    self.assertEqual([120, 122, 118, 'Dad', 'Unknown', 57, 122], outcome)

    # --- Scenario 3: seaborn titanic dataset -------------------------------
    titanic = sns.load_dataset('titanic')

    # One rule per (sex, survived) combination, in the order
    # male/0, male/1, female/0, female/1.
    titanic_rules = [{
        'sex': {'expect': 'category', 'value': [sex]},
        'survived': {'expect': 'number', 'value': survived},
    } for sex in ('male', 'female') for survived in (0, 1)]

    # Every rule maps to the same action; a fresh dict is built per key.
    titanic_actions = {
        position: {
            'action': 'correlate_numbers',
            'kwargs': {'values': {'_header': 'age'}, 'fill_nulls': True},
        }
        for position in range(4)
    }
    outcome = builder.associate_canonical(titanic,
                                          associations=titanic_rules,
                                          actions=titanic_actions,
                                          default_value=99)
    expected_head = [
        22.0, 38.0, 26.0, 35.0, 35.0, None, 54.0, 2.0, 27.0, 14.0
    ]
    self.assertEqual(expected_head, outcome[:10])
def test_associate_timeseries(self):
    """Build a staff frame and derive a 'registered' column (no assertions).

    The association rules split full-time/part-time staff by join date;
    each rule's action is looked up by its position in the rule list.
    """
    builder = DataBuilderTools()
    rows = 100

    staff = pd.DataFrame()
    staff['sid'] = builder.unique_identifiers(from_value=10000000,
                                              to_value=99999999,
                                              size=rows)
    staff['staff_type'] = builder.get_category(
        selection=['contractor', 'part-time', 'full-time'],
        weight_pattern=[1, 3, 6],
        size=rows)
    staff['joined'] = builder.get_datetime(start='01/01/2008',
                                           until='07/01/2019',
                                           date_format='%d-%m-%Y',
                                           size=rows)

    # Rule 0: joined up to the end of 2013; rule 1: joined from then on.
    joined_early = {
        'joined': {'expect': 'date', 'value': ['01/01/2000', '31/12/2013']},
        'staff_type': {
            'expect': 'category',
            'value': ['full-time', 'part-time'],
        },
    }
    joined_late = {
        'joined': {'expect': 'date', 'value': ['31/12/2013', '31/12/2100']},
        'staff_type': {
            'expect': 'category',
            'value': ['full-time', 'part-time'],
        },
    }

    # Action 0 generates a date in a fixed window; action 1 correlates
    # against the 'joined' column.
    registration_actions = {
        0: {
            'action': 'get_datetime',
            'kwargs': {'start': "05/01/2014", 'until': "16/01/2014"},
        },
        1: {
            'action': 'correlate_dates',
            'kwargs': {
                'dates': {'_header': 'joined'},
                'offset': {'days': 9},
                'lower_spread': 4,
            },
        },
    }

    staff['registered'] = builder.associate_canonical(
        staff,
        associations=[joined_early, joined_late],
        actions=registration_actions,
        default_value=None)
def test_runs(self):
    """Smoke test: constructing DataBuilderTools must not raise."""
    _ = DataBuilderTools()
def test_correlate_numbers(self):
    """Exercise correlate_numbers across its keyword options.

    Covers offset, seeded spread with precision, quantity, min_value,
    integer precision, fill_nulls, keep_zero, and the empty / all-null
    input cases. Each case compares against a fixed control list.
    """
    builder = DataBuilderTools()

    floats = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0]

    # Constant offset.
    outcome = builder.correlate_numbers(floats, offset=0.5)
    self.assertEqual([1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 0.5],
                     outcome)

    # Seeded spread, one decimal place.
    outcome = builder.correlate_numbers(floats,
                                        spread=1,
                                        precision=1,
                                        seed=31)
    self.assertEqual([0.7, 1.7, 3.2, 2.1, 5.7, 5.9, 8.0, 9.4, 8.4, -0.3],
                     outcome)

    # Quantity below 1 with a seed.
    outcome = builder.correlate_numbers(floats, quantity=0.9, seed=31)
    self.assertEqual([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, np.nan, 0.0],
                     outcome)

    # Seeded spread combined with a minimum value.
    ones = [1., 1., 1., 1., 1., 1., 1.]
    outcome = builder.correlate_numbers(ones,
                                        spread=1,
                                        seed=31,
                                        min_value=1)
    self.assertEqual([1.244, 1.676, 2.009, 2.405, 1.497, 1.257, 1.026],
                     outcome)

    # Integer input, quantity below 1, precision 0.
    integers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    outcome = builder.correlate_numbers(integers,
                                        quantity=0.9,
                                        seed=31,
                                        precision=0)
    self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8, None, 0], outcome)

    # Null-like entries (None, NaN, the string 'None') with fill_nulls.
    with_nulls = [1, None, 5, 5, np.nan, 5, 3, 'None', 3, 3]
    outcome = builder.correlate_numbers(with_nulls, fill_nulls=True, seed=99)
    self.assertEqual([1, 3, 5, 5, 3, 5, 3, 5, 3, 3], outcome)

    # keep_zero: zeros in the input are expected to remain zero.
    sparse = [0, 0, 0, 1, 0, 2, 4, 0, 1]
    outcome = builder.correlate_numbers(sparse,
                                        spread=2,
                                        keep_zero=True,
                                        seed=99,
                                        precision=0)
    self.assertEqual([0, 0, 0, -2, 0, 7, 7, 0, -1], outcome)

    # Offset applied together with fill_nulls.
    mostly_null = [None, None, 1]
    outcome = builder.correlate_numbers(mostly_null,
                                        offset=10,
                                        fill_nulls=True)
    self.assertEqual([11, 11, 11], outcome)

    # Edge cases: empty input, then input with nothing but nulls.
    outcome = builder.correlate_numbers([], offset=10)
    self.assertEqual(0, len(outcome))

    all_null = [None, None]
    outcome = builder.correlate_numbers(all_null, offset=10, fill_nulls=True)
    self.assertEqual(all_null, outcome)
def test_correlation_category(self):
    """Exercise correlate_categories for category, number and date inputs.

    Each scenario supplies a ``correlations`` list and an ``actions``
    mapping keyed by position in that list, then compares the output
    with a fixed control list.
    """
    builder = DataBuilderTools()

    # --- No actions: output is expected to equal the input -----------------
    gender_codes = ['F', 'M', 'U']
    sample = builder.get_category(gender_codes,
                                  weight_pattern=[5, 3, 2],
                                  size=10)
    outcome = builder.correlate_categories(sample,
                                           correlations=gender_codes,
                                           actions={},
                                           value_type='Category')
    self.assertEqual(sample, outcome)

    # --- Constant action and a method-name action --------------------------
    category_actions = {
        0: {'action': 'V'},
        1: {
            'action': 'get_category',
            'kwargs': {
                'selection': [0, 1],
                'weight_pattern': [6, 4],
                'seed': 101,
            },
        },
    }
    sample = builder.get_category(gender_codes,
                                  weight_pattern=[5, 3, 2],
                                  size=10,
                                  seed=101)
    self.assertEqual(['M', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'M'],
                     sample)
    outcome = builder.correlate_categories(sample,
                                           correlations=gender_codes,
                                           actions=category_actions,
                                           value_type='C',
                                           seed=101)
    self.assertEqual([0, 'V', 'V', 'V', 'V', 'V', 'V', 0, 'V', 0], outcome)

    # --- Numeric ranges and a single value ---------------------------------
    number_bands = [[1, 5], 6, [7, 9]]
    band_actions = {position: {'action': position} for position in range(3)}
    sample = builder.get_number(5,
                                8,
                                weight_pattern=[[1, 0], [0, 1]],
                                size=10,
                                seed=31)
    self.assertEqual([5, 5, 6, 5, 5, 7, 7, 7, 6, 6], sample)
    outcome = builder.correlate_categories(sample,
                                           correlations=number_bands,
                                           actions=band_actions,
                                           value_type='number',
                                           seed=101)
    self.assertEqual([0, 0, 1, 0, 0, 2, 2, 2, 1, 1], outcome)

    # --- Time-of-day ranges ------------------------------------------------
    time_bands = [['11:00', '11:29'], ['11:30', '11:30'], ['11:31', '11:59']]
    punctuality = {
        0: {'action': 'Early'},
        1: {'action': 'On-time'},
        2: {'action': 'Late'},
    }
    arrivals = ['11:23', '11:30', '11:45', '11:02', '11:31']
    outcome = builder.correlate_categories(arrivals,
                                           correlations=time_bands,
                                           actions=punctuality,
                                           value_type='date',
                                           seed=101)
    self.assertEqual(['Early', 'On-time', 'Late', 'Early', 'Late'], outcome)

    # --- Empty-dict action and empty-dict 'values' kwarg -------------------
    # Expected output keeps 11 as is and offsets 22 by 100.
    offset_bands = [[10, 20], [21, 30]]
    offset_actions = {
        0: {'action': {}},
        1: {
            'action': 'correlate_numbers',
            'kwargs': {'values': {}, 'offset': 100},
        },
    }
    outcome = builder.correlate_categories([11, 22],
                                           correlations=offset_bands,
                                           actions=offset_actions,
                                           value_type='number',
                                           seed=101)
    self.assertEqual([11, 122], outcome)