Ejemplo n.º 1
0
def test_init(input_obj):
    """ Check that a `Duplicator` can be built on top of a `DataSet`.

        List inputs are loaded with `pandas=False`; every other input
        uses the `DataSet` defaults.
    """
    loader_kwargs = {'pandas': False} if isinstance(input_obj, list) else {}
    dataset = DataSet(input_obj, **loader_kwargs)
    duper = Duplicator(dataset, percentage=50)
    assert isinstance(duper, Duplicator)
    assert duper.num_rows > 0
Ejemplo n.º 2
0
def test_append(input_obj, rows, shape, kwargs):
    """ Append `rows` to a `DataSet` and check the resulting dimensions.

        For numpy / pandas backed records the `.shape` attribute is
        compared with `shape`; otherwise the row count and the length of
        the first row are compared with `shape[0]` and `shape[1]`.
    """
    data = DataSet(input_obj, **kwargs)
    data.append(rows)
    if data.data_type not in ['numpy', 'pandas']:
        records = data.records
        assert len(records) == shape[0]
        assert len(records[0]) == shape[1]
        return
    assert data.records.shape == shape
Ejemplo n.º 3
0
def test_sample(input_obj, percentage, columns, kwargs):
    """ Draw a sample from a `DataSet` and check it comes from the input.

        The sample must hold exactly one element. When sampling columns
        the element is looked up in the first input row (no `kwargs`) or
        in the indexes `[0, 1, 2]` (with `kwargs`); when sampling rows it
        is looked up in `input_obj` itself.
    """
    data = DataSet(input_obj, **kwargs)
    sample = data.sample(percentage, columns=columns)
    assert len(sample) == 1

    if columns:
        if kwargs:
            assert sample[0] in [0, 1, 2]
        else:
            assert sample[0] in input_obj[0]
    elif isinstance(sample, pd.DataFrame):
        # Turn the single sampled row into a dict before the lookup.
        first_row = list(sample.T.to_dict().values())[0]
        assert first_row in input_obj
    else:
        assert sample[0] in input_obj
Ejemplo n.º 4
0
def test_set_value(input_obj, cols, val):
    """ Check that `NoiseMaker.set_value` writes `val` into each column.

        After calling `set_value(val)`, every column listed in
        `noizer.columns` must contain `val` at least once, whatever the
        backing data type of the records is.
    """
    loader_kwargs = {'pandas': False} if isinstance(input_obj, list) else {}
    dataset = DataSet(input_obj, **loader_kwargs)
    noizer = NoiseMaker(dataset, columns=cols, percentage=50,
                        noise=['random'])
    noizer.set_value(val)

    for col in noizer.columns:
        kind = noizer.dataset.data_type
        records = noizer.dataset.records
        if kind == 'pandas':
            matching = records[records.iloc[:, col] == val]
            assert matching.shape[0] >= 1
        elif kind == 'numpy':
            assert val in records[:, col]
        else:
            column_values = [row[col] for row in records]
            assert val in column_values
Ejemplo n.º 5
0
def test_output_sql(input_obj, kwargs):
    """ Write a `DataSet` to a SQL database and read the rows back.

        The database at `kwargs['db_uri']` is queried for everything in
        the `test` table; two rows are expected and the first row (minus
        its first column, the id) must hold the same values as the first
        input record.

        The SQLite file is removed in a `finally` block so that a failing
        assertion does not leave a stale database behind for later runs.
    """
    db_uri = kwargs.get('db_uri')
    try:
        data = DataSet(input_obj, **kwargs)
        assert isinstance(data, DataSet)
        assert data.db_uri == db_uri
        data.to_output()

        db = dataset_db.connect(db_uri)
        rows = list(db.query('select * from test;'))
        assert len(rows) == 2
        # removing id
        assert sorted(list(rows[0].values())[1:]) == sorted(
            list(input_obj[0].values()))
    finally:
        # Clean up even when one of the assertions above failed.
        if db_uri is not None:
            db_path = db_uri.replace('sqlite:///', '')
            if os.path.exists(db_path):
                os.remove(db_path)
Ejemplo n.º 6
0
def test_run_strategy(input_obj, cols, noise, limits):
    """ Run a `NoiseMaker` strategy and check that the records changed.

        List inputs are loaded with `pandas=False`. After `run_strategy`
        the dataset records must differ from the dataset input.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    from collections.abc import Iterable

    if isinstance(input_obj, list):
        dataset = DataSet(input_obj, pandas=False)
    else:
        dataset = DataSet(input_obj)
    noizer = NoiseMaker(dataset, columns=cols, percentage=50, noise=noise,
                        limits=limits)
    noizer.run_strategy()
    assert isinstance(noizer, NoiseMaker)
    assert isinstance(noizer.columns, Iterable)
    if dataset.data_type == 'pandas':
        assert not dataset.records.equals(dataset.input)
    elif dataset.data_type == 'numpy':
        assert not np.array_equal(dataset.records, dataset.input)
    else:
        assert dataset.records != dataset.input
Ejemplo n.º 7
0
def test_init(input_obj, cols):
    """ Check the attributes of a freshly built `Fuzzer`.

        A percentage of 50 passed to the constructor is expected to be
        exposed as `.5` on the instance.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    from collections.abc import Iterable

    dataset = DataSet(input_obj)
    fuzzer = Fuzzer(dataset, columns=cols, percentage=50)
    assert isinstance(fuzzer, Fuzzer)
    assert isinstance(fuzzer.columns, Iterable)
    assert fuzzer.num_rows > 0
    assert fuzzer.percentage == .5
Ejemplo n.º 8
0
def test_build_strategy(input_dict, output, percent, cols):
    """ Build a strategy from `input_dict` and check its configuration.

        The strategy is built against a random 20x5 numpy dataset. Its
        type and percentage are always checked; the number of columns is
        only checked when `cols` is truthy.
    """
    random_data = np.random.rand(20, 5)
    strategy_obj = build_strategy(input_dict, DataSet(random_data))
    assert isinstance(strategy_obj, output)
    assert strategy_obj.percentage == percent
    if not cols:
        return
    assert len(strategy_obj.columns) == cols
Ejemplo n.º 9
0
def test_run_strategy(input_obj, cols):
    """ Run a `Fuzzer` strategy and check that the records changed.

        List inputs are loaded with `pandas=False`. After `run_strategy`
        the dataset records must differ from the dataset input.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    from collections.abc import Iterable

    if isinstance(input_obj, list):
        dataset = DataSet(input_obj, pandas=False)
    else:
        dataset = DataSet(input_obj)
    fuzzer = Fuzzer(dataset, columns=cols, percentage=50)
    fuzzer.run_strategy()
    assert isinstance(fuzzer, Fuzzer)
    assert isinstance(fuzzer.columns, Iterable)
    # TODO: need to add test that types are not equal as sometimes they test as equals
    if dataset.data_type == 'pandas':
        assert not dataset.records.equals(dataset.input)
    elif dataset.data_type == 'numpy':
        assert not np.array_equal(dataset.records, dataset.input)
    else:
        assert dataset.records != dataset.input
Ejemplo n.º 10
0
def test_init_from_obj(input_obj):
    """ Build a `DataSet` from an in-memory object and check its state.

        Checks that `records` / `input` / `original` match the source
        object, that `records` is a different object while `original` is
        the very same object, and that the dataset has length two, is
        iterable and supports integer indexing consistently with
        iteration.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    from collections.abc import Iterable

    data = DataSet(input_obj)
    if isinstance(input_obj, list):
        assert data.records.equals(pd.DataFrame(input_obj))
        assert data.original == input_obj
    elif isinstance(input_obj, pd.DataFrame):
        assert data.records.equals(input_obj)
        assert data.input.equals(input_obj)
        assert data.original.equals(input_obj)
    else:
        assert np.array_equal(data.original, input_obj)
        assert np.array_equal(data.records, input_obj)
        assert np.array_equal(data.input, input_obj)
    assert data.records is not input_obj
    assert data.original is input_obj
    assert len(data) == 2
    assert isinstance(data, Iterable)

    # `count` ends up as the number of rows yielded by iteration.
    count = 0
    for idx, line in enumerate(data):
        if data.data_type == 'pandas':
            assert data[idx].equals(data.records.iloc[idx, :])
        elif data.data_type == 'numpy':
            assert np.array_equal(data[idx], data.records[idx, :])
        else:
            assert data[idx] == line
        count = idx + 1
    assert count == 2
Ejemplo n.º 11
0
def test_helper_methods():
    """ Check that each `Fuzzer` helper returns one of the known fuzzers.

        `fuzz_random`, `fuzz_str` and `fuzz_numeric` must each return a
        member of their respective list of candidate functions.
    """
    fuzzer = Fuzzer(DataSet([[1, 2, 3]]))

    picked_random = fuzzer.fuzz_random()
    assert picked_random in [sql, metachars, files, delimiter, emoji]

    picked_str = fuzzer.fuzz_str()
    assert picked_str in [add_format, change_encoding, to_bytes, insert_boms]

    picked_numeric = fuzzer.fuzz_numeric()
    assert picked_numeric in [nanify, bigints, hexify]
Ejemplo n.º 12
0
def test_output(input_obj, output, output_type, kwargs):
    """ Check `DataSet.to_output` for each supported output type.

        When `output` is a `file://` target, the stored output filename
        must be the target without that prefix. The value returned by
        `to_output` must be an instance of `output_type`, and its content
        is then checked according to that type.
    """
    data = DataSet(input_obj, output=output, **kwargs)
    if output.startswith('file://'):
        assert data.output_filename == output.replace('file://', '')

    to_output = data.to_output()
    assert isinstance(to_output, output_type)

    if output_type == list:
        if isinstance(input_obj, list):
            assert input_obj == to_output
        else:
            assert input_obj.tolist() == to_output
    elif output_type == pd.DataFrame:
        assert to_output.shape == (2, 3)
    elif output_type == np.ndarray:
        if not kwargs:
            assert np.array_equal(to_output[0, :], [1, 2, 5])
        else:
            assert to_output[0] == {'a': 1, 'b': 2, 'd': 5}
    elif output_type == str:
        # A string result is treated as a path that must exist.
        assert os.path.exists(to_output)
Ejemplo n.º 13
0
def test_init_no_pandas(input_obj, shape):
    """ Build a `DataSet` with `pandas=False` and check list storage.

        String inputs are turned into `file://` sources first. Both
        `records` and `input` must be lists, the dimensions must match
        `shape`, and `original` must equal the source that was passed in.
    """
    source = input_obj
    if isinstance(source, str):
        source = 'file://{}'.format(source)

    data = DataSet(source, pandas=False)
    assert isinstance(data.records, list)
    assert isinstance(data.input, list)

    n_rows, n_cols = len(data.records), len(data.records[0])
    assert n_rows == shape[0]
    assert n_cols == shape[1]
    assert data.original == source
Ejemplo n.º 14
0
def test_init_sql(input_obj, kwargs):
    """ Build a `DataSet` from SQL settings and check what it stored.

        `db_uri` and `query` must be kept as given. The loaded input is
        expected to be a DataFrame unless a `pandas` entry is present in
        `kwargs`, in which case a list is expected.
    """
    data = DataSet(input_obj, **kwargs)
    assert isinstance(data, DataSet)
    assert data.db_uri == kwargs.get('db_uri')
    assert data.query == kwargs.get('query')

    pandas_unset = kwargs.get('pandas') is None
    expected_type = pd.DataFrame if pandas_unset else list
    assert isinstance(data.input, expected_type)
Ejemplo n.º 15
0
def test_init(input_obj, cols):
    """ Check the attributes of a freshly built `NoiseMaker`.

        A percentage of 50 passed to the constructor is expected to be
        exposed as `.5` on the instance. Building a `NoiseMaker` without
        a `noise` setting must raise.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    from collections.abc import Iterable

    dataset = DataSet(input_obj)
    noizer = NoiseMaker(dataset, columns=cols, percentage=50,
                        noise=['random'])
    assert isinstance(noizer, NoiseMaker)
    assert isinstance(noizer.columns, Iterable)
    assert noizer.num_rows > 0
    assert noizer.percentage == .5

    with pytest.raises(Exception):
        NoiseMaker(dataset, columns=cols, percentage=50)
Ejemplo n.º 16
0
def test_duplicate_with_noise(input_obj):
    """ Run a noisy `Duplicator` and check no exact duplicates remain.

        With `add_noise=True` the records must differ from the input and
        must not contain any row that appears more than once.
    """
    loader_kwargs = {'pandas': False} if isinstance(input_obj, list) else {}
    dataset = DataSet(input_obj, **loader_kwargs)
    duper = Duplicator(dataset, percentage=50, add_noise=True)
    duper.run_strategy()
    assert isinstance(duper, Duplicator)

    kind = dataset.data_type
    if kind == 'pandas':
        assert not dataset.records.equals(dataset.input)
        assert not dataset.records.duplicated().any()
    elif kind == 'numpy':
        unique_rows = np.unique(dataset.records, axis=0)
        assert not np.array_equal(unique_rows, dataset.input)
        assert not np.array_equal(dataset.records, dataset.input)
    else:
        # Rows are compared through their string form so they can be
        # counted.
        row_counts = collections.Counter(str(row) for row in dataset.records)
        assert not any(seen > 1 for seen in row_counts.values())
        assert dataset.records != dataset.input
Ejemplo n.º 17
0
def test_init_from_file(input_file, shape):
    """ Build a `DataSet` from a `file://` source and check what is loaded.

        Both `records` and `input` must be DataFrames of the expected
        `shape`; the original source and the input filename (source
        without the `file://` prefix) must be kept on the dataset.
    """
    filename = 'file://{}'.format(input_file)
    data = DataSet(filename)

    for frame in (data.records, data.input):
        assert frame is not input_file
        assert isinstance(frame, pd.DataFrame)
        assert frame.shape == shape

    assert data.data_type == 'pandas'
    assert data.original == filename
    assert data.input_filename == filename.replace('file://', '')
Ejemplo n.º 18
0
def test_duplicate(input_obj):
    """ Run a `Duplicator` and check that duplicate rows were added.

        After `run_strategy` the records must differ from the input,
        contain at least one repeated row and be longer than the input.
    """
    loader_kwargs = {'pandas': False} if isinstance(input_obj, list) else {}
    dataset = DataSet(input_obj, **loader_kwargs)
    duper = Duplicator(dataset, percentage=50)
    duper.run_strategy()
    assert isinstance(duper, Duplicator)

    kind = dataset.data_type
    if kind == 'pandas':
        assert not dataset.records.equals(dataset.input)
        assert dataset.records.duplicated().any()
        assert len(dataset) > dataset.input.shape[0]
    elif kind == 'numpy':
        # De-duplicating the records must give back the input values.
        unique = np.unique(dataset.records, axis=0)
        assert sorted(unique.ravel()) == sorted(dataset.input.ravel())
        assert not np.array_equal(dataset.records, dataset.input)
        assert len(dataset) > dataset.input.shape[0]
    else:
        # Rows are compared through their string form so they can be
        # counted.
        row_counts = collections.Counter(str(row) for row in dataset.records)
        assert any(seen > 1 for seen in row_counts.values())
        assert dataset.records != dataset.input
        assert len(dataset) > len(dataset.input)
Ejemplo n.º 19
0
def fuzz_from_parser(parser):
    """ Fuzz a dataset as described by a strategy parser.

        A `DataSet` is created from `parser.input` together with the
        parser's output, db_uri, query and table settings. Every strategy
        listed by the parser is built and run against that dataset; a
        strategy that raises is logged and skipped so that the remaining
        strategies still run.

        Arguments:
            parser (`parsers.StrategyCLIParser` or
                    `parsers.StrategyYAMLParser`): strategy parser

        Returns:
            dataset.to_output()
    """
    source = parser.input
    dataset_options = {
        'output': parser.output,
        'db_uri': parser.db_uri,
        'query': parser.query,
        'table': parser.table,
    }
    dataset = DataSet(source, **dataset_options)

    for strategy in parser.strategies:
        runner = build_strategy(strategy, dataset)
        try:
            runner.run_strategy()
        except Exception:
            # Best effort: report the failure and move on.
            logging.exception('Error running strategy: %s', strategy)

    return dataset.to_output()
Ejemplo n.º 20
0
    def noise(self, sample):
        """ Adds noise to the duplicate rows

            A copy of `sample` is wrapped in its own `DataSet`, a subset
            of its columns is picked with `self.percentage`, and each
            picked column is distorted: float / int columns get random
            values drawn between the min and max of the same column in
            `self.dataset`, object / str columns get `messy_spaces`.

            Parameters:
                sample (list or obj): `dataset.Dataset.sample`

            Returns
                sample (list or obj): distorted rows

            TODO:
                - implement more noise options than just random

        """
        sample_dataset = DataSet(sample.copy())
        columns = sample_dataset.sample(self.percentage, columns=True)
        if sample_dataset.data_type == 'pandas':
            sample_dataset.records = \
                sample_dataset.records.reset_index(drop=True)

        for column in columns:
            col = sample_dataset.column_idx(column)
            col_type = sample_dataset.column_dtype(col)
            func = None

            if 'float' in str(col_type):
                func = generate_random_float
            elif 'int' in str(col_type):
                func = generate_random_int
            if func:
                kwargs = {
                    'low': self.dataset.column_agg(col, min),
                    'high': self.dataset.column_agg(col, max)
                }
                # Widen a degenerate range so low and high differ.
                if kwargs.get('low') == kwargs.get('high'):
                    kwargs['high'] += 1

                # Target the sample copy explicitly, as the string branch
                # below does; the rows returned come from
                # `sample_dataset`, so the noise has to be applied there.
                # NOTE(review): assumes `apply_func_to_column` falls back
                # to another dataset when `dataset` is omitted - confirm.
                self.apply_func_to_column(lambda x: func(x, **kwargs),
                                          col,
                                          dataset=sample_dataset)
            elif col_type in [object, str]:
                self.apply_func_to_column(messy_spaces,
                                          col,
                                          dataset=sample_dataset)
        return sample_dataset.records
Ejemplo n.º 21
0
def test_column_dtype(input_obj, column, col_type, kwargs):
    """ Check the dtype reported by `DataSet.column_dtype` for a column.

        A column given by name is first translated with `column_idx`.
    """
    data = DataSet(input_obj, **kwargs)
    col_idx = data.column_idx(column) if isinstance(column, str) else column
    assert data.column_dtype(col_idx) == col_type
Ejemplo n.º 22
0
def test_init_errors(input_obj, error):
    """ Check that building a `DataSet` from `input_obj` raises `error`. """
    expected_failure = pytest.raises(error)
    with expected_failure:
        DataSet(input_obj)
Ejemplo n.º 23
0
def test_output_errors(input_obj, output):
    """ Check that an unsupported output raises `NotImplementedError`.

        Both building the `DataSet` and calling `to_output` happen inside
        the `raises` block, so the error may come from either step.
    """
    with pytest.raises(NotImplementedError):
        DataSet(input_obj, output=output).to_output()
Ejemplo n.º 24
0
def test_bad_io(input_obj, output, kwargs):
    """ Check that a bad input / output combination raises.

        Inputs containing `'data'` are turned into `file://` sources.
        Everything, including that membership check, runs inside the
        `raises` block, so an error at any step satisfies the test.
    """
    with pytest.raises(Exception):
        source = input_obj
        if 'data' in source:
            source = 'file://{}'.format(source)
        DataSet(source, output=output, **kwargs).to_output()
Ejemplo n.º 25
0
def test_column_agg(input_obj, column, agg, result, kwargs):
    """ Check the value `DataSet.column_agg` computes for a column.

        A column given by name is first translated with `column_idx`.
    """
    data = DataSet(input_obj, **kwargs)
    col_idx = data.column_idx(column) if isinstance(column, str) else column
    assert data.column_agg(col_idx, agg) == result