Exemple #1
0
    def test_partial_cols(self):
        """Sampling with ``sample_cols=['col1']`` must expose col1 and omit col2."""
        with sample_file_with_cols(['col1', 'col2']) as csv_path:
            profiler = SamplingProfiler(
                name='sampler_test',
                num_iterations=1000,
                file_path=csv_path,
                sample_cols=['col1'],
            )
            model = StarSchemaModel([profiler])
            model.generate_all_datasets()
            sampled = model.datasets['sampler_test']

            # One entry per iteration is expected in the generated dataset.
            assert len(sampled) == 1000

            # Walk every generated row, whichever key it is stored under.
            all_rows = (row for rows in sampled.values() for row in rows)
            for row in all_rows:
                assert row['col1']
                assert 'col2' not in row
Exemple #2
0
    def test_from_dict_and_all_cols(self):
        """A profiler built via ``from_dict`` without ``sample_cols`` yields every column."""
        with sample_file_with_cols(['col1', 'col2'], val='testing') as csv_path:
            config = {
                'name': 'sampler_test',
                'num_iterations': 1000,
                'file_path': csv_path,
            }
            profiler = SamplingProfiler.from_dict(config)
            model = StarSchemaModel([profiler])
            model.generate_all_datasets()
            sampled = model.datasets['sampler_test']

            assert len(sampled) == 1000

            # Both columns of the source file must carry the fixed value.
            all_rows = (row for rows in sampled.values() for row in rows)
            for row in all_rows:
                assert row['col1'] == 'testing'
                assert row['col2'] == 'testing'
Exemple #3
0
def main():
    """Generate a customer/product/orders model and export it.

    Builds the schema list, hands it to ``StarSchemaModel.from_list``,
    generates every dataset with progress output, then calls ``to_csv`` and
    ``to_pickled_pyschema`` with the module-level ``folder``.
    """
    num_products = get_num_products(num_iterations, scale_factor)

    # --- Dimensions ---
    customer_dim = ('naive_type2_scd', {
        'name': 'customer',
        'entity_generator': generate_customer,
        'num_iterations': num_iterations,
        # Mutating columns are updated 30% of the time ...
        'mutation_rate': 0.3,
        # ... and only the address is allowed to change.
        'mutating_cols': ['address'],
    })
    product_dim = ('naive', {
        'name': 'product',
        'entity_generator': generate_product,
        'num_iterations': num_products,
    })

    # --- Facts ---
    orders_fact = ('naive', {
        'name': 'orders',
        'entity_generator': generate_order,
        'num_iterations': num_iterations * scale_factor,
        'relations': [{'name': 'customer'}, {'name': 'product'}],
    })

    schema = [customer_dim, product_dim, orders_fact]

    dummy_data = StarSchemaModel.from_list(schema)
    dummy_data.generate_all_datasets(print_progress=True)
    dummy_data.to_csv(folder)
    dummy_data.to_pickled_pyschema(folder)
    print("Done")
    def to_dbt_schema(self, path='', name=''):
        """ Writes a dbt schema YAML file (usable by seed schemas) and returns it

        For every dataset in ``self.model.datasets`` the column types are
        derived from the first available row by passing each value's Python
        type through ``self.convert_pytype``. The resulting mapping is dumped
        to ``<path>/<name>.yml``.

        :param path: directory the file is written to (passed to
            ``StarSchemaModel.create_path`` first)
        :param name: file name without extension, defaults to name of adapter
        :return: dict mapping each model name to ``{'column_types': {...}}``
        """
        StarSchemaModel.create_path(path)

        if not name:
            name = self.name

        schemas = {}
        for model_name, uids in self.model.datasets.items():
            # Take the first row of the first non-empty group. A dataset with
            # no rows gets an empty column mapping instead of raising
            # StopIteration / IndexError.
            first_row = next((rows[0] for rows in uids.values() if rows), {})
            column_types = {
                col_name: self.convert_pytype(type(val))
                for col_name, val in first_row.items()
            }
            schemas[model_name] = {'column_types': column_types}

        with open(os.path.join(path, name + ".yml"), "w") as f:
            yaml.dump(schemas, f, default_flow_style=False)

        # The docstring has always promised the schema dict; hand it back.
        return schemas
Exemple #5
0
def sample_file_with_cols(cols, num_its=10, val=None, name='test'):
    """Generate a temporary CSV with the given columns and yield its file name.

    A ``NaiveProfiler`` dataset called ``name`` is generated and written with
    ``StarSchemaModel.to_csv()``; the yielded path is ``name + '.csv'``. The
    file is removed once the generator is resumed or closed, including when
    the consumer raises.

    NOTE(review): callers use this in a ``with`` statement, so it is presumably
    wrapped by ``contextlib.contextmanager`` outside this view - confirm.

    :param cols: column names every generated row contains
    :param num_its: number of iterations passed to the profiler
    :param val: fixed value for every column; random floats when ``None``
    :param name: dataset name, also used as the CSV file stem
    """
    def gen_func():
        # Compare against None so falsy fixed values (0, '', False) are kept
        # instead of being silently replaced by random numbers.
        return {col: random.random() if val is None else val for col in cols}

    # Keyword spelling ``generator_funtion`` is the one the call site has
    # always used; left untouched.
    profiler = NaiveProfiler(name=name, num_iterations=num_its, generator_funtion=gen_func)
    gen = StarSchemaModel([profiler])
    gen.generate_all_datasets()
    gen.to_csv()

    csv_path = name + '.csv'
    try:
        yield csv_path
    finally:
        # Clean up even if the body of the ``with`` block failed.
        os.remove(csv_path)
Exemple #6
0
def main():
    """Generate the customer/product/currency/orders model and export it.

    Every entity is described by a spec dict for the ``'naive'`` profiler.
    The specs are assembled into the schema list consumed by
    ``StarSchemaModel.from_list``. After generation the data is written with
    ``to_csv`` and ``to_pickled_pyschema`` using the module-level ``folder``.
    """
    num_products = get_num_products(num_iterations, scale_factor)

    # --- Dimensions ---
    customer = {
        'name': 'customer',  # the name of the entity/table
        'entity_generator': generate_customer,  # function that defines entity
        'num_iterations': num_iterations,  # how many times to run that function
    }
    product = {
        'name': 'product',
        'entity_generator': generate_product,
        'num_iterations': num_products,
    }
    currency = {
        'name': 'currency',
        'entity_generator': generate_currency,
        'num_iterations': num_currencies,
    }

    # --- Facts ---
    orders = {
        'name': 'orders',
        'entity_generator': generate_order,
        'num_iterations': num_iterations * scale_factor,
        'relations': [{'name': 'customer'}, {'name': 'currency'}],
    }

    # Each iteration has the same entity link for one_to_many relations
    # (e.g. one order_id per order_item). For many_to_many the link is
    # sampled; with the uniqueness flag (called unique_per_fact in the
    # original notes) it is sampled without replacement. Here an order has
    # several order items, each linked to a distinct product within that
    # order. If an order could contain the same product more than once the
    # flag would be false.
    order_item = {
        'name': 'order_item',
        'entity_generator': generate_order_item,
        'num_iterations': num_iterations * scale_factor,
        # Number of facts per iteration (e.g. 3 items 1 order)
        'num_entities_per_iteration': lambda: random.randint(1, 3),
        'relations': [
            {'name': 'orders', 'unique': True},
            {'name': 'product', 'type': 'many_to_many', 'unique': True},
        ],
    }

    # The default relation type is one_to_many: every iteration gets a unique
    # value sampled from the source table. Note this will fail if there are
    # more iterations than values in the original table.
    currency_conversion = {
        'name': 'currency_conversion',
        'entity_generator': generate_currency_conv,
        'num_iterations': num_currencies,
        # We get one record per currency per day
        'num_entities_per_iteration': num_days,
        'relations': [{'name': 'currency', 'unique': True}],
    }

    specs = (customer, product, currency, orders, order_item,
             currency_conversion)
    schema = [('naive', spec) for spec in specs]

    dummy_data = StarSchemaModel.from_list(schema)
    dummy_data.generate_all_datasets(print_progress=True)
    dummy_data.to_csv(folder)
    dummy_data.to_pickled_pyschema(folder)
    print("Done")
def main():
    """Generate the model with a mutating customer dimension and emit dbt schemas.

    The schema list is built entry by entry, fed to
    ``StarSchemaModel.from_list`` and generated. The data is written with
    ``to_csv(data_path)``, after which the Postgres and BigQuery adapters each
    write their dbt schema under ``schema_path``.
    """
    num_products = get_num_products(num_iterations, scale_factor)

    schema = []

    # --- Dimensions ---
    schema.append(('naive_type2_scd', dict(
        name='customer',
        min_valid_from=low_date,
        max_valid_from=high_date,
        entity_generator=generate_customer,
        num_iterations=num_iterations,
        mutation_rate=0.1,  # Mutating columns are updated 10% of the time
        mutating_cols=['address'],  # Only address will update
    )))
    schema.append(('naive', dict(
        name='product',
        entity_generator=generate_product,
        num_iterations=num_products,
    )))
    schema.append(('naive', dict(
        name='currency',
        entity_generator=generate_currency,
        num_iterations=num_currencies,
    )))

    # --- Facts ---
    schema.append(('naive', dict(
        name='orders',
        entity_generator=generate_order,
        num_iterations=num_iterations * scale_factor,
        relations=[{'name': 'customer'}, {'name': 'currency'}],
    )))

    # Each iteration has the same entity link for one_to_many relations
    # (e.g. one order_id per order_item). For many_to_many the link is
    # sampled; with the uniqueness flag (called unique_per_fact in the
    # original notes) it is sampled without replacement. Here an order has
    # several order items, each linked to a distinct product within that
    # order. If an order could contain the same product more than once the
    # flag would be false.
    schema.append(('naive', dict(
        name='order_item',
        entity_generator=generate_order_item,
        num_iterations=num_iterations * scale_factor,
        # Number of facts per iteration (e.g. 3 items 1 order)
        num_entities_per_iteration=lambda: random.randint(1, 3),
        relations=[
            {'name': 'orders', 'unique': True},
            {'name': 'product', 'type': 'many_to_many', 'unique': True},
        ],
    )))

    # The default relation type is one_to_many: every iteration gets a unique
    # value sampled from the source table. Note this will fail if there are
    # more iterations than values in the original table.
    schema.append(('naive', dict(
        name='currency_conversion',
        entity_generator=generate_currency_conv,
        num_iterations=num_currencies,
        num_entities_per_iteration=num_days,  # One record per currency per day
        relations=[{'name': 'currency', 'unique': True}],
    )))

    dummy_data = StarSchemaModel.from_list(schema)
    dummy_data.generate_all_datasets(print_progress=True)
    dummy_data.to_csv(data_path)

    # Emit one dbt schema file per target warehouse, Postgres first.
    PostgresSchemaAdapter(dummy_data).to_dbt_schema(path=schema_path)
    BigquerySchemaAdapter(dummy_data).to_dbt_schema(path=schema_path)

    print("Done")