def test_generator_generate_invalid_domain(domain):
    """
    test generator class generate method with invalid domain value

    a "specific" generator is asked for 10 rows with the given domain and
    the call is expected to raise ValueError
    """
    specific_gen = Generator(data_type="specific")
    call_kwargs = {"num_rows": 10, "domain": domain}

    with pytest.raises(ValueError):
        specific_gen.generate(**call_kwargs)
def test_generator_random_num_columns(num_columns):
    """
    test generator class generate method num_columns parameter

    the second entry of the generated data's shape must equal the
    requested number of columns
    """
    frame = Generator(data_type="random").generate(
        num_rows=10,
        num_columns=num_columns,
    )
    column_count = frame.shape[1]

    assert column_count == num_columns
def test_generator_num_rows(num_rows, domain):
    """
    test generator class generate method num_rows parameter

    the first entry of the generated data's shape must equal the
    requested number of rows for a "specific" generator
    """
    frame = Generator(data_type="specific").generate(
        num_rows=num_rows,
        domain=domain,
    )
    row_count = frame.shape[0]

    assert row_count == num_rows
def test_generator_generate_random_invalid_params():
    """
    test generator class generate method with invalid parameter values

    passes both `metadata` and `num_columns` to a "random" generator and
    expects ValueError
    """
    # NOTE(review): presumably metadata and num_columns are mutually
    # exclusive -- confirm against Generator.generate
    conflicting_kwargs = {
        "num_rows": 10,
        "metadata": {"mock": int},
        "num_columns": 5,
    }
    random_gen = Generator(data_type="random")

    with pytest.raises(ValueError):
        random_gen.generate(**conflicting_kwargs)
def store(  # pylint: disable=too-many-arguments,too-many-branches
    self,
    data_dir: Optional[str] = None,
    file_name: Optional[str] = None,
    extension: Optional[str] = None,
    num_rows: Optional[int] = 500,
    **kwargs,
) -> str:
    """
    generate data and write it to a file. handles the `.csv`, `.json`,
    `.parquet` and `.xlsx` file formats

    *** parameters ***

    *data_dir* : directory to store generated file in
                 (if None will use `self.default_dir`)

    *file_name*: name of file to generate, without the extension
                 (if None will use `self.default_fn`)

    *extension*: format / extension of file (if None one is picked at
                 random from `SUPPORTED_EXTENSIONS`)

    *num_rows*: passed to the generator's `generate` call as `num_rows`

    *kwargs*: any other keyword arguments are forwarded unchanged to the
              generator's `generate` call

    *** returns ***

    path of the file that was written, as a string

    *** raises ***

    ValueError if `extension` is not in `SUPPORTED_EXTENSIONS`, or has no
    writer in this method

    ***
    """
    # fall back to the instance defaults for anything left unspecified
    file_name = self.default_fn if file_name is None else file_name
    data_dir = self.default_dir if data_dir is None else data_dir
    if isinstance(data_dir, str):
        data_dir = pathlib.Path(data_dir)

    # pick a format when none was requested, then validate it
    extension = (
        random.choice(SUPPORTED_EXTENSIONS) if extension is None else extension
    )
    if extension not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"invalid extension: {str(extension)}. mok supports: {str(SUPPORTED_EXTENSIONS)}",  # pylint: disable=line-too-long
        )

    destination = str(data_dir.joinpath(f"{file_name}.{extension}"))

    # build the data with a generator of this instance's data type
    frame = Generator(data_type=self.data_type).generate(
        num_rows=num_rows,
        **kwargs,
    )

    logger.info("storing to: %s", destination)

    # extension -> name of the writer method to call on the generated data
    writer_by_extension = {
        "csv": "to_csv",
        "json": "to_json",
        "parquet": "to_parquet",
        "xlsx": "to_excel",
    }
    writer_name = writer_by_extension.get(extension)
    if writer_name is None:
        raise ValueError(
            f"unsupported extension: {str(extension)}. mok batch supports: {str(SUPPORTED_EXTENSIONS)}",  # pylint: disable=line-too-long
        )
    getattr(frame, writer_name)(destination)

    return destination
def test_generator_generate_data_type(data_type):
    """
    test generator class generate method with invalid data_type value

    a valid "random" generator is built first, its `data_type` attribute is
    then overwritten with the given value, and generate is expected to
    raise ValueError
    """
    tampered_gen = Generator(data_type="random")
    # NOTE(review): assigned after construction, presumably so that the
    # value reaches generate without going through __init__ -- confirm
    tampered_gen.data_type = data_type

    call_kwargs = {"num_rows": 10, "num_columns": 5}
    with pytest.raises(ValueError):
        tampered_gen.generate(**call_kwargs)
def test_generator_missing(num_rows, missing):
    """
    test generator class generate method missing parameter

    without `missing` the generated data must contain no NA cells; with
    `missing` it must contain at least one. the row count must match
    `num_rows` in both cases
    """
    random_gen = Generator(data_type="random")

    # baseline: no `missing` argument
    complete = random_gen.generate(num_rows=num_rows, num_columns=5)
    complete_rows = complete.shape[0]
    complete_na_cells = complete.isna().sum().sum()
    assert complete_rows == num_rows
    assert complete_na_cells == 0

    # same request with `missing` supplied
    sparse = random_gen.generate(
        num_rows=num_rows,
        num_columns=5,
        missing=missing,
    )
    sparse_rows = sparse.shape[0]
    sparse_na_cells = sparse.isna().sum().sum()
    assert sparse_rows == num_rows
    assert sparse_na_cells != 0
def test_generator_random_between_range(num_range, col_type):
    """
    test generator class generate method min_val & max_val parameters

    generates a single column of `col_type` and checks that every value in
    every generated column lies between `num_range[0]` and `num_range[1]`,
    using `Series.between` with its default bounds handling
    """
    lower = num_range[0]
    upper = num_range[1]

    generator = Generator(data_type="random")
    data = generator.generate(
        num_rows=10,
        num_columns=1,
        min_val=lower,
        max_val=upper,
        col_types=[col_type],
    )

    # iterate the column names directly and assert per column, so a
    # failure names the offending column
    for column in generator.column_names:
        in_between = data[column].between(lower, upper).all()
        assert in_between, (
            f"column {column!r} has values outside [{lower}, {upper}]"
        )
def test_generator_random_length(str_len):
    """
    test generator class generate method min_len & max_len parameters

    generates a single `str` column and checks that the length of every
    value in every generated column lies between `str_len[0]` and
    `str_len[1]`, using `Series.between` with its default bounds handling
    """
    shortest = str_len[0]
    longest = str_len[1]

    generator = Generator(data_type="random")
    data = generator.generate(
        num_rows=10,
        num_columns=1,
        min_len=shortest,
        max_len=longest,
        col_types=[str],
    )

    # iterate the column names directly and assert per column, so a
    # failure names the offending column
    for column in generator.column_names:
        lengths = data[column].str.len()
        in_between = lengths.between(shortest, longest).all()
        assert in_between, (
            f"column {column!r} has string lengths outside "
            f"[{shortest}, {longest}]"
        )
def test_generator_metadata(metadata):
    """
    test generator class generate method metadata parameter

    generates 10 rows from `metadata` and checks that every value in every
    generated column has a type that is either one of the metadata values
    or the `NUMPY_TYPES` counterpart of one of them
    """
    generator = Generator(data_type="random")
    data = generator.generate(
        num_rows=10,
        metadata=metadata,
    )

    numpy_types = [
        NUMPY_TYPES[col_type]
        for col_type in metadata.values()
        if col_type in NUMPY_TYPES
    ]
    # built once instead of on every cell; the check is deliberately the
    # same as before: a value may match ANY declared type, not only the
    # type declared for its own column
    accepted_types = tuple(metadata.values()) + tuple(numpy_types)

    # iterate the column names directly and assert per column, so a
    # failure names the offending column
    for column in generator.column_names:
        correct_type = data[column].apply(
            lambda value: type(value) in accepted_types
        ).all()
        assert correct_type, (
            f"column {column!r} holds values of an unexpected type"
        )
def test_generator_init_invalid(data_type):
    """
    test generator class init method with invalid parameter values

    constructing a Generator with the given data_type is expected to raise
    ValueError
    """
    init_kwargs = {"data_type": data_type}

    with pytest.raises(ValueError):
        Generator(**init_kwargs)
def test_generator_init(data_type):
    """
    test generator class init method with valid parameters

    construction must succeed and yield a Generator instance
    """
    assert isinstance(Generator(data_type=data_type), Generator)