def test_partial_cols(self):
    """Sampling with ``sample_cols=['col1']`` must only expose ``col1`` in the generated rows."""
    with sample_file_with_cols(['col1', 'col2']) as csv_path:
        profiler = SamplingProfiler(
            name='sampler_test',
            num_iterations=1000,
            file_path=csv_path,
            sample_cols=['col1'],
        )
        model = StarSchemaModel([profiler])
        model.generate_all_datasets()

        sampled = model.datasets['sampler_test']
        assert len(sampled) == 1000

        # Each dataset value is a list of row dicts; check every row of every list.
        all_rows = (row for rows in sampled.values() for row in rows)
        for row in all_rows:
            assert row['col1']
            assert 'col2' not in row
def test_from_dict_and_all_cols(self):
    """A profiler built via ``from_dict`` without ``sample_cols`` must carry every column through."""
    with sample_file_with_cols(['col1', 'col2'], val='testing') as csv_path:
        config = {
            'name': 'sampler_test',
            'num_iterations': 1000,
            'file_path': csv_path,
        }
        profiler = SamplingProfiler.from_dict(config)
        model = StarSchemaModel([profiler])
        model.generate_all_datasets()

        sampled = model.datasets['sampler_test']
        assert len(sampled) == 1000

        # Each dataset value is a list of row dicts; check every row of every list.
        all_rows = (row for rows in sampled.values() for row in rows)
        for row in all_rows:
            assert row['col1'] == 'testing'
            assert row['col2'] == 'testing'
def main():
    """
    Build a customer / product / orders schema, generate its datasets and export them.

    Relies on the module-level settings ``num_iterations``, ``scale_factor`` and ``folder``.
    """
    num_products = get_num_products(num_iterations, scale_factor)

    # DIMS
    customer_dim = ('naive_type2_scd', {
        'name': 'customer',
        'entity_generator': generate_customer,
        'num_iterations': num_iterations,
        'mutation_rate': 0.3,  # Will update mutate cols 30% of the time
        'mutating_cols': ['address'],  # Only address will update
    })
    product_dim = ('naive', {
        'name': 'product',
        'entity_generator': generate_product,
        'num_iterations': num_products,
    })

    # FACTS
    orders_fact = ('naive', {
        'name': 'orders',
        'entity_generator': generate_order,
        'num_iterations': num_iterations * scale_factor,
        'relations': [{'name': 'customer'}, {'name': 'product'}],
    })

    schema = [customer_dim, product_dim, orders_fact]

    dummy_data = StarSchemaModel.from_list(schema)
    dummy_data.generate_all_datasets(print_progress=True)
    dummy_data.to_csv(folder)
    dummy_data.to_pickled_pyschema(folder)
    print("Done")
def to_dbt_schema(self, path='', name=''):
    """
    Write a dbt seed schema file (``<name>.yml``) listing the column types of every dataset.

    Column types are derived by passing the Python type of each value in a
    dataset's first available row through ``self.convert_pytype``. A dataset
    with no rows is written with an empty ``column_types`` mapping instead of
    raising.

    :param path: directory the YAML file is written to; it is handed to
        ``StarSchemaModel.create_path`` before writing
    :param name: file name without the ``.yml`` extension; defaults to name of adapter
    """
    StarSchemaModel.create_path(path)
    if not name:
        name = self.name

    schemas = {}
    for model_name, uids in self.model.datasets.items():
        # Use the first non-empty row list; ``None`` signals that the dataset
        # has no rows at all (previously this raised StopIteration/IndexError).
        first_row = next((rows[0] for rows in uids.values() if rows), None)
        column_types = {}
        if first_row is not None:
            column_types = {
                col_name: self.convert_pytype(type(val))
                for col_name, val in first_row.items()
            }
        schemas[model_name] = {'column_types': column_types}

    with open(os.path.join(path, name + ".yml"), "w") as f:
        yaml.dump(schemas, f, default_flow_style=False)
def sample_file_with_cols(cols, num_its=10, val=None, name='test'):
    """
    Generate a throwaway CSV with the given columns, yield its path, then delete it.

    This is a generator with a single ``yield``; callers in this file use it in
    a ``with`` statement, so it is presumably wrapped by a context-manager
    decorator outside this block.

    :param cols: column names every generated row will contain
    :param num_its: number of iterations passed to the profiler
    :param val: value stored in every column; when falsy (including ``None``)
        a fresh ``random.random()`` is used per column instead
    :param name: profiler name; the CSV is expected at ``name + '.csv'``
    """
    def gen_func():
        return {col: random.random() if not val else val for col in cols}

    profiler = NaiveProfiler(name=name, num_iterations=num_its, generator_funtion=gen_func)
    gen = StarSchemaModel([profiler])
    gen.generate_all_datasets()
    gen.to_csv()

    csv_path = name + '.csv'
    try:
        yield csv_path
    finally:
        # Always clean up, even when the consumer's block raises (e.g. a
        # failing assert in a test); otherwise the CSV would be left behind.
        os.remove(csv_path)
def main():
    """
    Build a multi-currency order schema, generate its datasets and export them.

    Relies on the module-level settings ``num_iterations``, ``scale_factor``,
    ``num_currencies``, ``num_days`` and ``folder``.
    """
    num_products = get_num_products(num_iterations, scale_factor)

    # DIMS
    customer_dim = ('naive', {
        'name': 'customer',  # the name of the entity/table
        'entity_generator': generate_customer,  # function that defines entity
        'num_iterations': num_iterations,  # How many times to run that function
    })
    product_dim = ('naive', {
        'name': 'product',
        'entity_generator': generate_product,
        'num_iterations': num_products,
    })
    currency_dim = ('naive', {
        'name': 'currency',
        'entity_generator': generate_currency,
        'num_iterations': num_currencies,
    })

    # FACTS
    orders_fact = ('naive', {
        'name': 'orders',
        'entity_generator': generate_order,
        'num_iterations': num_iterations * scale_factor,
        'relations': [{'name': 'customer'}, {'name': 'currency'}],
    })
    order_item_fact = ('naive', {
        'name': 'order_item',
        'entity_generator': generate_order_item,
        'num_iterations': num_iterations * scale_factor,
        # Number of facts per iteration (e.g. 3 items 1 order)
        'num_entities_per_iteration': lambda: random.randint(1, 3),
        # Each iteration has the same entity link for one_to_many relations
        # (e.g. one order_id per order_item).
        # For many_to_many this link is sampled - if unique_per_fact then it is
        # sampled without replacement.
        # In this example an order has multiple order items, each linked to a
        # unique_per_fact product within that order.
        # If an order could have multiple of the same product then
        # unique_per_fact would be false.
        'relations': [
            {'name': 'orders', 'unique': True},
            {'name': 'product', 'type': 'many_to_many', 'unique': True},
        ],
    })
    currency_conversion_fact = ('naive', {
        'name': 'currency_conversion',
        'entity_generator': generate_currency_conv,
        'num_iterations': num_currencies,
        # We get one record per currency per day
        'num_entities_per_iteration': num_days,
        # Here the default type is one_to_many - in this case there will be a
        # unique value for each iteration, sampled from the source table.
        # Note this will fail if there are more iterations than values in the
        # original table.
        'relations': [{'name': 'currency', 'unique': True}],
    })

    schema = [
        customer_dim,
        product_dim,
        currency_dim,
        orders_fact,
        order_item_fact,
        currency_conversion_fact,
    ]

    dummy_data = StarSchemaModel.from_list(schema)
    dummy_data.generate_all_datasets(print_progress=True)
    dummy_data.to_csv(folder)
    dummy_data.to_pickled_pyschema(folder)
    print("Done")
def main():
    """
    Build the multi-currency schema with a type-2 SCD customer dimension,
    write the datasets as CSV and emit dbt schemas through both adapters.

    Relies on the module-level settings ``num_iterations``, ``scale_factor``,
    ``num_currencies``, ``num_days``, ``low_date``, ``high_date``,
    ``data_path`` and ``schema_path``.
    """
    num_products = get_num_products(num_iterations, scale_factor)

    # DIMS
    customer_dim = ('naive_type2_scd', {
        'name': 'customer',
        'min_valid_from': low_date,
        'max_valid_from': high_date,
        'entity_generator': generate_customer,
        'num_iterations': num_iterations,
        'mutation_rate': 0.1,  # Will update mutate cols 10% of the time
        'mutating_cols': ['address'],  # Only address will update
    })
    product_dim = ('naive', {
        'name': 'product',
        'entity_generator': generate_product,
        'num_iterations': num_products,
    })
    currency_dim = ('naive', {
        'name': 'currency',
        'entity_generator': generate_currency,
        'num_iterations': num_currencies,
    })

    # FACTS
    orders_fact = ('naive', {
        'name': 'orders',
        'entity_generator': generate_order,
        'num_iterations': num_iterations * scale_factor,
        'relations': [{'name': 'customer'}, {'name': 'currency'}],
    })
    order_item_fact = ('naive', {
        'name': 'order_item',
        'entity_generator': generate_order_item,
        'num_iterations': num_iterations * scale_factor,
        # Number of facts per iteration (e.g. 3 items 1 order)
        'num_entities_per_iteration': lambda: random.randint(1, 3),
        # Each iteration has the same entity link for one_to_many relations
        # (e.g. one order_id per order_item).
        # For many_to_many this link is sampled - if unique_per_fact then it is
        # sampled without replacement.
        # In this example an order has multiple order items, each linked to a
        # unique_per_fact product within that order.
        # If an order could have multiple of the same product then
        # unique_per_fact would be false.
        'relations': [
            {'name': 'orders', 'unique': True},
            {'name': 'product', 'type': 'many_to_many', 'unique': True},
        ],
    })
    currency_conversion_fact = ('naive', {
        'name': 'currency_conversion',
        'entity_generator': generate_currency_conv,
        'num_iterations': num_currencies,
        # We get one record per currency per day
        'num_entities_per_iteration': num_days,
        # Here the default type is one_to_many - in this case there will be a
        # unique value for each iteration, sampled from the source table.
        # Note this will fail if there are more iterations than values in the
        # original table.
        'relations': [{'name': 'currency', 'unique': True}],
    })

    schema = [
        customer_dim,
        product_dim,
        currency_dim,
        orders_fact,
        order_item_fact,
        currency_conversion_fact,
    ]

    dummy_data = StarSchemaModel.from_list(schema)
    dummy_data.generate_all_datasets(print_progress=True)
    dummy_data.to_csv(data_path)

    # Emit one dbt schema per supported warehouse adapter.
    padapter = PostgresSchemaAdapter(dummy_data)
    padapter.to_dbt_schema(path=schema_path)
    bqadapter = BigquerySchemaAdapter(dummy_data)
    bqadapter.to_dbt_schema(path=schema_path)
    print("Done")