コード例 #1
0
ファイル: test_dataset.py プロジェクト: myd1/datafuzz
def test_sample(input_obj, percentage, columns, kwargs):
    """Check that ``DataSet.sample`` yields one item taken from the input.

    The membership check depends on the arguments: when ``columns`` is
    truthy the sampled item is compared against the first input row (no
    ``kwargs``) or against the positions ``0, 1, 2`` (with ``kwargs``);
    otherwise the sampled row itself must be present in ``input_obj``.
    """
    dataset = DataSet(input_obj, **kwargs)
    drawn = dataset.sample(percentage, columns=columns)

    # Every case is expected to produce exactly one sampled item.
    assert len(drawn) == 1

    if columns:
        # Column sampling: the expected pool depends on whether extra
        # constructor kwargs were supplied.
        if kwargs:
            assert drawn[0] in [0, 1, 2]
        else:
            assert drawn[0] in input_obj[0]
        return

    if isinstance(drawn, pd.DataFrame):
        # Turn the single sampled DataFrame row into a dict for comparison.
        row_dicts = list(drawn.T.to_dict().values())
        assert row_dicts[0] in input_obj
    else:
        assert drawn[0] in input_obj
コード例 #2
0
ファイル: duplicator.py プロジェクト: myd1/datafuzz
    def noise(self, sample):
        """ Adds noise to the duplicate rows

            A copy of `sample` is wrapped in a `DataSet`, a subset of its
            columns is picked with `sample(self.percentage, columns=True)`,
            and each picked column is distorted according to its dtype:

            - dtype name containing 'float' -> `generate_random_float`
            - dtype name containing 'int'   -> `generate_random_int`
            - `object` / `str`              -> `messy_spaces`

            Columns of any other dtype are left untouched. The numeric
            generators receive `low` / `high` keyword arguments taken from
            the min and max of the same column in `self.dataset`.

            Parameters:
                sample (list or obj): `dataset.Dataset.sample`

            Returns:
                sample (list or obj): distorted rows

            TODO:
                - implement more noise options than just random

        """
        # Work on a copy so the caller's rows are not modified here.
        sample_dataset = DataSet(sample.copy())
        columns = sample_dataset.sample(self.percentage, columns=True)
        if sample_dataset.data_type == 'pandas':
            # Renumber rows from 0, discarding the index carried over from
            # the frame the sample was taken from.
            sample_dataset.records = \
                sample_dataset.records.reset_index(drop=True)

        for column in columns:
            col = sample_dataset.column_idx(column)
            col_type = sample_dataset.column_dtype(col)
            type_name = str(col_type)

            if 'float' in type_name:
                func = generate_random_float
            elif 'int' in type_name:
                func = generate_random_int
            else:
                func = None

            if func:
                kwargs = {
                    'low': self.dataset.column_agg(col, min),
                    'high': self.dataset.column_agg(col, max)
                }
                # Widen a degenerate range (constant column); presumably the
                # random generators need low != high -- TODO confirm.
                if kwargs['low'] == kwargs['high']:
                    kwargs['high'] += 1

                # Target the copy explicitly, as the string branch does;
                # otherwise the numeric noise never reaches the records
                # returned below.
                self.apply_func_to_column(lambda x: func(x, **kwargs),
                                          col,
                                          dataset=sample_dataset)
            elif col_type in [object, str]:
                self.apply_func_to_column(messy_spaces,
                                          col,
                                          dataset=sample_dataset)
        return sample_dataset.records