def test_correlate_date_max_min(self):
     """Check correlate_dates with a max_date and then a min_date bound.

     In both calls the asserted output is the bound value itself, rendered
     as ``DD-MM-YYYYT00:00:00``.
     """
     builder = DataBuilderTools()
     source_dates = ['10/01/2017']

     # Upper bound supplied: the expected output is the max_date value.
     capped = builder.correlate_dates(source_dates, max_date='05/01/2017')
     self.assertEqual(['05-01-2017T00:00:00'], capped)

     # Lower bound supplied: the expected output is the min_date value.
     floored = builder.correlate_dates(source_dates, min_date='12/01/2017')
     self.assertEqual(['12-01-2017T00:00:00'], floored)
 def test_correlate_numbers_max_min(self):
     """Check correlate_numbers with a max_value and then a min_value bound.

     The single input value lies outside each bound, and the asserted output
     is the bound itself.
     """
     builder = DataBuilderTools()
     numbers = [2]

     # 2 is above max_value=1, so 1 is expected back.
     capped = builder.correlate_numbers(numbers, max_value=1)
     self.assertEqual([1], capped)

     # 2 is below min_value=3, so 3 is expected back.
     floored = builder.correlate_numbers(numbers, min_value=3)
     self.assertEqual([3], floored)
    def test_associate_custom(self):
        """Exercise associate_custom in exec mode and in expression mode.

        Two exec-style snippets overwrite 'values' where 'cat' is 'U'; a
        third call passes an expression plus a ``header`` keyword and checks
        the returned values.
        """
        builder = DataBuilderTools()
        frame = pd.DataFrame()
        frame['cat'] = builder.get_category(['M', 'F', 'U'], size=10, seed=31)
        frame['values'] = builder.get_number(10, size=10, seed=31)

        # 1) exec a .loc assignment that writes `new_value` (None) into 'values'
        outcome = builder.associate_custom(
            df=frame,
            code_str="df.loc[df['cat'] == 'U', 'values'] = new_value",
            use_exec=True,
            new_value=None)
        expected = [
            5.0, np.nan, 1.0, np.nan, np.nan, 6.0, np.nan, np.nan, 5.0, 3.0
        ]
        self.assertEqual(expected, list(outcome['values']))

        # 2) regenerate 'values', then exec an np.where based replacement
        frame['values'] = builder.get_number(10, size=10, seed=31)
        where_snippet = (
            "df['values'] = np.where(df['cat'] == 'U', None, df['values'])")
        outcome = builder.associate_custom(df=frame,
                                           code_str=where_snippet,
                                           use_exec=True,
                                           new_value=None)
        expected = [None, 2, 0, 4, None, None, 3, 5, 4, 2]
        # The same values are expected on the returned object and on the
        # frame that was passed in.
        self.assertEqual(expected, list(outcome['values']))
        self.assertEqual(expected, list(frame['values']))

        # 3) expression mode (no use_exec): the call's result is compared
        staff = pd.DataFrame()
        staff['sid'] = [1000]
        outcome = builder.associate_custom(
            df=staff,
            code_str="df[header].apply(lambda x: f'CU_{x}')",
            header='sid')
        self.assertEqual(['CU_1000'], list(outcome))
    def test_example(self):
        """Build a small accounts frame end to end; no assertions are made.

        The test only checks that the chained builder calls run without
        raising.
        """
        builder = DataBuilderTools()
        accounts = pd.DataFrame()
        accounts['account_id'] = builder.unique_identifiers(1000,
                                                            2000,
                                                            size=150)

        # Weighting pattern loaded towards low values: a hand-written head
        # followed by an exponentially decaying tail.
        head_weights = [0.01, 0.8, 1, 3, 9, 8, 3, 2, 1]
        rising_tail = np.exp(np.arange(-5, 0.0, 0.2)).round(2)
        weights = head_weights + list(np.flip(rising_tail))

        # Apply the weighting to balances in the range 0 to 1000.
        accounts['balance'] = builder.get_number(0,
                                                 1000,
                                                 weight_pattern=weights,
                                                 size=150)

        # Derive 'interest' from 'balance' via a multiply action.
        accounts['interest'] = builder.correlate_numbers(accounts['balance'],
                                                         spread=0,
                                                         offset=0.02,
                                                         action='multiply',
                                                         precision=2)
 def test_correlation_dates(self):
     """Exercise seeded correlate_dates calls on mixed and generated dates."""
     builder = DataBuilderTools()
     day_month_year = "%d-%m-%Y"

     # Parseable dates mixed with None, '' and a non-date string; the
     # expected lists show those three entries coming back unchanged.
     raw_dates = ['10/01/2017', '12/01/2017', None, '', 'Fred']

     with_offset = builder.correlate_dates(raw_dates,
                                           lower_spread={'days': 2},
                                           upper_spread={'days': 2},
                                           offset={'years': 3},
                                           date_format=day_month_year,
                                           seed=99)
     self.assertEqual(['10-01-2020', '12-01-2020', None, '', 'Fred'],
                      with_offset)

     # Same input with only the format and the seed supplied.
     defaults_only = builder.correlate_dates(raw_dates,
                                             date_format=day_month_year,
                                             seed=99)
     self.assertEqual(['11-01-2017', '13-01-2017', None, '', 'Fred'],
                      defaults_only)

     # Generate five seeded dates within 2010 and pin them first.
     generated = DataBuilderTools.get_datetime('01/01/2010',
                                               '31/12/2010',
                                               date_format=day_month_year,
                                               seed=99,
                                               size=5)
     self.assertEqual([
         '17-07-2010', '07-07-2010', '06-08-2010', '07-06-2010',
         '23-02-2010'
     ], generated)

     # Then spread the generated dates by up to two days either side.
     jittered = builder.correlate_dates(generated,
                                        lower_spread={'days': 2},
                                        upper_spread={'days': 2},
                                        seed=99,
                                        date_format=day_month_year)
     self.assertEqual([
         '18-07-2010', '08-07-2010', '06-08-2010', '06-06-2010',
         '22-02-2010'
     ], jittered)
    def test_correlation_dates_attributes(self):
        """Check the day differences produced by correlate_dates.

        Generates ``rows`` joined dates, correlates them with an offset (and
        then an offset plus spreads) and asserts the smallest and largest
        whole-day difference between each output and its input.
        """
        tools = DataBuilderTools()
        rows = 100
        df_staff = pd.DataFrame()
        df_staff['joined'] = tools.get_datetime(start='01/01/2008',
                                                until='07/01/2019',
                                                date_format='%d-%m-%Y',
                                                size=rows)

        def offset_limits(before, after):
            """Return (min, max) of ``after[i] - before[i]`` for i < rows.

            Both sides are parsed day-first; values that cannot be parsed
            are coerced rather than raising. Indexing (not zip) is used so
            a result shorter than ``rows`` fails loudly.
            """
            diffs = [
                pd.to_datetime(after[i], errors='coerce', dayfirst=True) -
                pd.to_datetime(before[i], errors='coerce', dayfirst=True)
                for i in range(rows)
            ]
            return min(diffs), max(diffs)

        control = cleaner.list_formatter(df_staff['joined'])

        # Offset only: the asserted differences are 6 and 7 days.
        result = tools.correlate_dates(df_staff['joined'], offset={'days': 7})
        min_diff, max_diff = offset_limits(control, result)
        # assertEqual, not the assertEquals alias removed in Python 3.12
        self.assertEqual(7, max_diff.days)
        self.assertEqual(6, min_diff.days)

        # Offset plus asymmetric spreads: asserted differences are 4 and 11.
        result = tools.correlate_dates(df_staff['joined'],
                                       offset={'days': 7},
                                       lower_spread={'days': 3},
                                       upper_spread={'days': 5})
        min_diff, max_diff = offset_limits(control, result)
        self.assertEqual(11, max_diff.days)
        self.assertEqual(4, min_diff.days)
 def test_dates(self):
     """Exercise get_datetime: size, range edges, formatting and options.

     Covers numeric output (``as_num``), the default datetime output,
     ``date_format`` strings, ``ignore_time`` and ``at_most``.
     """
     # Create the builder locally, as the sibling tests do; the original
     # body used `tools` without defining it in this method.
     tools = DataBuilderTools()
     result = tools.get_datetime('01/01/2018',
                                 '01/01/2019',
                                 as_num=True,
                                 size=1000)
     self.assertEqual(1000, len(result))
     # get the values at the edge: a zero-width range yields the numeric
     # value of that single date
     value_min = tools.get_datetime('01/01/2018',
                                    '01/01/2018',
                                    as_num=True,
                                    size=1)[0]
     value_max = tools.get_datetime('01/01/2019',
                                    '01/01/2019',
                                    as_num=True,
                                    size=1)[0]
     self.assertLessEqual(value_min, min(result))
     self.assertGreaterEqual(value_max, max(result))
     # default output: a one-element list holding a UTC-aware datetime
     result = tools.get_datetime('01/01/2018', '01/01/2018')
     self.assertEqual([
         datetime.datetime(2018, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
     ], result)
     # test date_format
     result = tools.get_datetime('01/01/2018',
                                 '01/01/2018',
                                 date_format="%Y-%m-%d")
     self.assertEqual(['2018-01-01'], result)
     # test ignore time: expected value is midnight of the same day
     result = tools.get_datetime('01/01/2018T01:01:01',
                                 '01/01/2018T23:59:59',
                                 ignore_time=True)
     self.assertEqual([
         datetime.datetime(2018, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
     ], result)
     # test at_most: three results, each expected date appearing once
     result = tools.get_datetime('01/01/2018',
                                 '01/04/2018',
                                 ignore_time=True,
                                 at_most=1,
                                 size=3,
                                 date_format="%Y-%m-%d")
     self.assertEqual(3, len(result))
     self.assertCountEqual(['2018-01-01', '2018-01-03', '2018-01-02'],
                           result)
    def test_associate_dataset(self):
        """Run associate_canonical on a list, a small frame and titanic.

        Each scenario pairs a list of association rules with an
        index-keyed action mapping and compares the output to a control.
        """
        builder = DataBuilderTools()

        # --- Scenario 1: plain list input with a default header --------
        genders = ['M', 'F', 'M', 'M', 'U', 'F']
        gender_rules = [{'expect': 'c', 'value': code} for code in ('M', 'F')]
        gender_actions = {0: {'action': 'Male'}, 1: {'action': 'Female'}}
        mapped = builder.associate_canonical(genders,
                                             associations=gender_rules,
                                             actions=gender_actions,
                                             default_header='_default')
        self.assertEqual(['Male', 'Female', 'Male', 'Male', 'U', 'Female'],
                         mapped)

        # --- Scenario 2: two-column frame, mixed literal/method actions -
        def age_between(low, high):
            # numeric rule on the 'age' column
            return {'expect': 'n', 'value': [low, high]}

        def gender_is(code):
            # category rule on the 'gender' column
            return {'expect': 'category', 'value': [code]}

        people = pd.DataFrame()
        people['gender'] = ['M', 'F', 'M', 'M', 'U', 'F', 'U']
        people['age'] = [20, 22, 18, 43, 34, 57, 22]
        people_rules = [
            {'age': age_between(24, 100), 'gender': gender_is('M')},
            {'age': age_between(0, 23)},
            {'age': age_between(24, 100), 'gender': gender_is('F')},
        ]
        people_actions = {
            0: {'action': 'Dad'},
            1: {
                'action': 'correlate_numbers',
                'kwargs': {'values': {'_header': 'age'}, 'offset': 100},
            },
            2: {'action': {'_header': 'age'}},
        }
        mapped = builder.associate_canonical(people,
                                             associations=people_rules,
                                             actions=people_actions,
                                             default_value='Unknown')
        self.assertEqual([120, 122, 118, 'Dad', 'Unknown', 57, 122], mapped)

        # --- Scenario 3: seaborn titanic, every sex/survived combination -
        titanic = sns.load_dataset('titanic')
        # Rule order: male/0, male/1, female/0, female/1.
        titanic_rules = [{
            'sex': {'expect': 'category', 'value': [sex]},
            'survived': {'expect': 'number', 'value': survived},
        } for sex in ('male', 'female') for survived in (0, 1)]
        # All four rules share the same action; a fresh dict is built per key.
        titanic_actions = {
            position: {
                'action': 'correlate_numbers',
                'kwargs': {'values': {'_header': 'age'}, 'fill_nulls': True},
            }
            for position in range(4)
        }
        mapped = builder.associate_canonical(titanic,
                                             associations=titanic_rules,
                                             actions=titanic_actions,
                                             default_value=99)
        first_ten = [22.0, 38.0, 26.0, 35.0, 35.0, None, 54.0, 2.0, 27.0, 14.0]
        self.assertEqual(first_ten, mapped[:10])
    def test_associate_timeseries(self):
        """Build a staff frame and derive 'registered' via associate_canonical.

        No assertions are made; the test passes when the calls complete
        without raising.
        """
        builder = DataBuilderTools()
        row_count = 100

        staff = pd.DataFrame()
        staff['sid'] = builder.unique_identifiers(from_value=10000000,
                                                  to_value=99999999,
                                                  size=row_count)
        staff['staff_type'] = builder.get_category(
            selection=['contractor', 'part-time', 'full-time'],
            weight_pattern=[1, 3, 6],
            size=row_count)
        staff['joined'] = builder.get_datetime(start='01/01/2008',
                                               until='07/01/2019',
                                               date_format='%d-%m-%Y',
                                               size=row_count)

        def permanent_staff_joined(start, end):
            # One rule: 'joined' within [start, end] for non-contractor staff.
            return {
                'joined': {'expect': 'date', 'value': [start, end]},
                'staff_type': {
                    'expect': 'category',
                    'value': ['full-time', 'part-time'],
                },
            }

        rules = [
            permanent_staff_joined('01/01/2000', '31/12/2013'),
            permanent_staff_joined('31/12/2013', '31/12/2100'),
        ]

        # Rule 0 draws a date from a fixed window; rule 1 correlates the
        # row's own 'joined' date.
        rule_actions = {
            0: {
                'action': 'get_datetime',
                'kwargs': {'start': "05/01/2014", 'until': "16/01/2014"},
            },
            1: {
                'action': 'correlate_dates',
                'kwargs': {
                    'dates': {'_header': 'joined'},
                    'offset': {'days': 9},
                    'lower_spread': 4,
                },
            },
        }

        staff['registered'] = builder.associate_canonical(
            staff,
            associations=rules,
            actions=rule_actions,
            default_value=None)
 def test_runs(self):
     """Smoke test: constructing DataBuilderTools must not raise."""
     _ = DataBuilderTools()
 def test_correlate_numbers(self):
     """Cover the main correlate_numbers options against fixed controls.

     Exercises offset, seeded spread with precision, quantity, min_value,
     fill_nulls, keep_zero and the empty / all-None inputs.
     """
     builder = DataBuilderTools()

     # Float series reused by the first three calls.
     floats = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0]

     shifted = builder.correlate_numbers(floats, offset=0.5)
     self.assertEqual([1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 0.5],
                      shifted)

     spread_out = builder.correlate_numbers(floats,
                                            spread=1,
                                            precision=1,
                                            seed=31)
     self.assertEqual([0.7, 1.7, 3.2, 2.1, 5.7, 5.9, 8.0, 9.4, 8.4, -0.3],
                      spread_out)

     # quantity=0.9: the control expects exactly one entry to become nan.
     thinned = builder.correlate_numbers(floats, quantity=0.9, seed=31)
     self.assertEqual([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, np.nan, 0.0],
                      thinned)

     # Seeded spread with a floor: every control value is >= min_value.
     ones = [1., 1., 1., 1., 1., 1., 1.]
     floored = builder.correlate_numbers(ones,
                                         spread=1,
                                         seed=31,
                                         min_value=1)
     self.assertEqual([1.244, 1.676, 2.009, 2.405, 1.497, 1.257, 1.026],
                      floored)

     # Integer input with precision=0: the dropped entry is expected as None.
     integers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
     thinned = builder.correlate_numbers(integers,
                                         quantity=0.9,
                                         seed=31,
                                         precision=0)
     self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8, None, 0], thinned)

     # fill_nulls: None, nan and the string 'None' are all expected filled.
     gappy = [1, None, 5, 5, np.nan, 5, 3, 'None', 3, 3]
     filled = builder.correlate_numbers(gappy, fill_nulls=True, seed=99)
     self.assertEqual([1, 3, 5, 5, 3, 5, 3, 5, 3, 3], filled)

     # keep_zero: zeros in the input are expected to stay zero.
     sparse = [0, 0, 0, 1, 0, 2, 4, 0, 1]
     kept = builder.correlate_numbers(sparse,
                                      spread=2,
                                      keep_zero=True,
                                      seed=99,
                                      precision=0)
     self.assertEqual([0, 0, 0, -2, 0, 7, 7, 0, -1], kept)

     # fill_nulls combined with an offset.
     mostly_null = [None, None, 1]
     filled = builder.correlate_numbers(mostly_null,
                                        offset=10,
                                        fill_nulls=True)
     self.assertEqual([11, 11, 11], filled)

     # exception testing: empty input gives an empty result
     nothing = []
     empty_result = builder.correlate_numbers(nothing, offset=10)
     self.assertEqual(0, len(empty_result))

     # all-None input: the result is compared against the input list itself
     all_null = [None, None]
     unchanged = builder.correlate_numbers(all_null,
                                           offset=10,
                                           fill_nulls=True)
     self.assertEqual(all_null, unchanged)
    def test_correlation_category(self):
        """Exercise correlate_categories for category, number and date types.

        Each scenario supplies ``correlations`` (what to match) and an
        index-keyed ``actions`` mapping (what to emit), then compares the
        output with a control list.
        """
        builder = DataBuilderTools()

        # --- No actions: output is expected to equal the input ----------
        codes = ['F', 'M', 'U']
        no_actions = {}
        sample = builder.get_category(codes, weight_pattern=[5, 3, 2], size=10)
        outcome = builder.correlate_categories(sample,
                                               correlations=codes,
                                               actions=no_actions,
                                               value_type='Category')
        self.assertEqual(sample, outcome)

        # --- Literal action for index 0, method action for index 1 ------
        mixed_actions = {
            0: {'action': 'V'},
            1: {
                'action': 'get_category',
                'kwargs': {
                    'selection': [0, 1],
                    'weight_pattern': [6, 4],
                    'seed': 101,
                },
            },
        }
        sample = builder.get_category(codes,
                                      weight_pattern=[5, 3, 2],
                                      size=10,
                                      seed=101)
        # Pin the seeded input before correlating it.
        self.assertEqual(['M', 'F', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'M'],
                         sample)
        outcome = builder.correlate_categories(sample,
                                               correlations=codes,
                                               actions=mixed_actions,
                                               value_type='C',
                                               seed=101)
        self.assertEqual([0, 'V', 'V', 'V', 'V', 'V', 'V', 0, 'V', 0], outcome)

        # --- Numeric correlations: ranges and a single value ------------
        number_bands = [[1, 5], 6, [7, 9]]
        band_index = {0: {'action': 0}, 1: {'action': 1}, 2: {'action': 2}}
        sample = builder.get_number(5,
                                    8,
                                    weight_pattern=[[1, 0], [0, 1]],
                                    size=10,
                                    seed=31)
        self.assertEqual([5, 5, 6, 5, 5, 7, 7, 7, 6, 6], sample)
        outcome = builder.correlate_categories(sample,
                                               correlations=number_bands,
                                               actions=band_index,
                                               value_type='number',
                                               seed=101)
        self.assertEqual([0, 0, 1, 0, 0, 2, 2, 2, 1, 1], outcome)

        # --- Date (time-of-day) windows mapped to labels ----------------
        time_windows = [['11:00', '11:29'], ['11:30', '11:30'],
                        ['11:31', '11:59']]
        punctuality = {
            0: {'action': 'Early'},
            1: {'action': 'On-time'},
            2: {'action': 'Late'},
        }
        arrivals = ['11:23', '11:30', '11:45', '11:02', '11:31']
        outcome = builder.correlate_categories(arrivals,
                                               correlations=time_windows,
                                               actions=punctuality,
                                               value_type='date',
                                               seed=101)
        self.assertEqual(['Early', 'On-time', 'Late', 'Early', 'Late'],
                         outcome)

        # --- Empty-dict placeholders: the control shows the first value
        # passing through unchanged and the second gaining the offset ----
        ranges = [[10, 20], [21, 30]]
        passthrough_actions = {
            0: {'action': {}},
            1: {
                'action': 'correlate_numbers',
                'kwargs': {'values': {}, 'offset': 100},
            },
        }
        pair = [11, 22]
        outcome = builder.correlate_categories(pair,
                                               correlations=ranges,
                                               actions=passthrough_actions,
                                               value_type='number',
                                               seed=101)
        self.assertEqual([11, 122], outcome)