def noise(self, sample):
    """
    Adds noise to the duplicate rows

    A `DataSet` is built from a copy of `sample`, and the columns to
    distort are drawn with
    `sample_dataset.sample(self.percentage, columns=True)`. Each drawn
    column is then handled according to its dtype:
        - 'float' / 'int' in the dtype name: `generate_random_float` /
          `generate_random_int` is applied through
          `self.apply_func_to_column`, with `low` and `high` taken from
          the column's min and max in `self.dataset`
        - object / str: `messy_spaces` is applied through
          `self.apply_func_to_column`
        - any other dtype: the column is left as it is

    Parameters:
        sample (list or obj): `dataset.Dataset.sample`

    Returns
        sample (list or obj): distorted rows

    TODO:
        - implement more noise options than just random
    """
    sample_dataset = DataSet(sample.copy())
    columns = sample_dataset.sample(self.percentage, columns=True)

    if sample_dataset.data_type == 'pandas':
        # Replace the index inherited from the sampled rows with a fresh
        # default one; the old index is discarded, not kept as a column.
        sample_dataset.records = \
            sample_dataset.records.reset_index(drop=True)

    for column in columns:
        col = sample_dataset.column_idx(column)
        col_type = sample_dataset.column_dtype(col)
        type_name = str(col_type)

        if 'float' in type_name:
            func = generate_random_float
        elif 'int' in type_name:
            func = generate_random_int
        else:
            func = None

        if func is not None:
            # Bounds come from the full dataset, not from the sample.
            low = self.dataset.column_agg(col, min)
            high = self.dataset.column_agg(col, max)
            if low == high:
                # Constant column: widen the range by one so that
                # low and high differ.
                high += 1
            kwargs = {'low': low, 'high': high}
            # Pass the copy explicitly, like the string branch does, so
            # the noise lands on the records that are returned below.
            # NOTE(review): assumes apply_func_to_column targets another
            # dataset when `dataset` is omitted - confirm.
            self.apply_func_to_column(
                lambda x: func(x, **kwargs), col, dataset=sample_dataset)
        elif col_type in [object, str]:
            self.apply_func_to_column(
                messy_spaces, col, dataset=sample_dataset)

    return sample_dataset.records
def test_column_dtype(input_obj, column, col_type, kwargs):
    """
    Checks that `DataSet.column_dtype` reports the expected type

    Parameters:
        input_obj (obj): data handed to the `DataSet` constructor
        column (str or obj): column to inspect; a `str` is first
            translated with `DataSet.column_idx`, anything else is
            passed to `column_dtype` unchanged
        col_type (obj): value `column_dtype` is expected to equal
        kwargs (dict): extra keyword arguments for the `DataSet`
            constructor
    """
    dataset = DataSet(input_obj, **kwargs)
    # Only string references go through column_idx.
    col_ref = dataset.column_idx(column) if isinstance(column, str) else column
    actual_type = dataset.column_dtype(col_ref)
    assert actual_type == col_type