Esempio n. 1
0
def test_evaluate_tables_from_demo():
    """Fit SDV on hand-built demo metadata and run ``evaluate`` on the samples.

    The metadata is rebuilt table by table from the raw demo tables, SDV is
    fitted and sampled, and ``evaluate`` is then called once per table and
    once on the full set of tables. No score is asserted: the test only
    requires that every call completes without raising.
    """
    tables = load_demo(metadata=False)

    timestamp_spec = {'type': 'datetime', 'format': '%Y-%m-%d'}

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table(
        'sessions',
        data=tables['sessions'],
        primary_key='session_id',
        parent='users',
        foreign_key='user_id',
    )
    new_meta.add_table(
        'transactions',
        tables['transactions'],
        fields_metadata={'timestamp': timestamp_spec},
        primary_key='transaction_id',
        parent='sessions',
    )

    sdv = SDV()
    sdv.fit(new_meta, tables=tables)
    sampled = sdv.sample_all()

    # Single-table evaluation, one call per table known to the metadata.
    table_scores = {}
    for name in new_meta.get_tables():
        table_scores[name] = evaluate(
            sampled[name],
            real=tables[name],
            metadata=new_meta,
            table_name=name,
        )

    # Evaluation of all the tables at once.
    evaluate(sampled, real=tables, metadata=new_meta)
Esempio n. 2
0
def test_build_demo_metadata_from_tables():
    """Rebuild the demo metadata from the raw tables and compare it.

    A fresh ``Metadata`` object is populated one table at a time from the
    demo tables; its dict representation must be equal to ``DEMO_METADATA``.
    """
    tables = load_demo(metadata=False)

    # Explicit datetime description for the transactions ``timestamp`` field.
    timestamp_spec = {'type': 'datetime', 'format': '%Y-%m-%dT%H:%M'}

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table(
        'sessions',
        data=tables['sessions'],
        primary_key='session_id',
        parent='users',
        foreign_key='user_id',
    )
    new_meta.add_table(
        'transactions',
        tables['transactions'],
        fields_metadata={'timestamp': timestamp_spec},
        primary_key='transaction_id',
        parent='sessions',
    )

    assert DEMO_METADATA == new_meta.to_dict()
Esempio n. 3
0
def test_sdv():
    """Fit SDV on the demo dataset and exercise the ``sample`` variants.

    Checks sampling everything, sampling ``users`` together with its
    children, and sampling each table on its own with
    ``sample_children=False``.
    """
    metadata, tables = load_demo(metadata=True)
    expected_tables = {'users', 'sessions', 'transactions'}

    sdv = SDV()
    sdv.fit(metadata, tables)

    # No arguments: all three tables are expected back.
    everything = sdv.sample()
    assert set(everything.keys()) == expected_tables
    assert len(everything['users']) == 10

    # Starting from ``users``: the child tables are expected as well.
    with_children = sdv.sample('users', reset_primary_keys=True)
    assert set(with_children.keys()) == expected_tables
    assert len(with_children['users']) == 10

    # One table at a time: shape and column names must match the real table.
    for name in ('users', 'sessions', 'transactions'):
        single = sdv.sample(name, sample_children=False)
        real = tables[name]

        assert single.shape == real.shape
        assert set(single.columns) == set(real.columns)
Esempio n. 4
0
def test_sdv_multiparent():
    """Fit SDV on the ``got_families`` demo and exercise the ``sample`` variants.

    Checks sampling everything, sampling ``characters`` together with its
    children, and sampling each table on its own with
    ``sample_children=False``.
    """
    metadata, tables = load_demo('got_families', metadata=True)

    sdv = SDV()
    sdv.fit(metadata, tables)

    # No arguments: all three tables are expected back.
    everything = sdv.sample()
    assert set(everything.keys()) == {'characters', 'families', 'character_families'}
    assert len(everything['characters']) == 7

    # Starting from ``characters``: only that table and its child are expected,
    # and the child must still carry the ``family_id`` column.
    with_children = sdv.sample('characters', reset_primary_keys=True)
    assert set(with_children.keys()) == {'characters', 'character_families'}
    assert len(with_children['characters']) == 7
    assert 'family_id' in with_children['character_families']

    # One table at a time: shape and column names must match the real table.
    for name in ('characters', 'families', 'character_families'):
        single = sdv.sample(name, sample_children=False)
        real = tables[name]

        assert single.shape == real.shape
        assert set(single.columns) == set(real.columns)
Esempio n. 5
0
    def test_integration(self):
        """Fit, sample and evaluate the demo dataset end to end.

        No value is asserted; the test passes as long as fitting, sampling,
        ``evaluate`` and the three report methods all run without raising.
        """
        metadata, real_tables = load_demo(metadata=True)

        model = SDV()
        model.fit(metadata, real_tables)
        sampled_tables = model.sample_all(20)

        report = evaluate(metadata, real_tables, sampled_tables)

        # Call every report view exposed by the evaluation result.
        report.overall()
        report.details()
        report.highlights()
Esempio n. 6
0
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    The origin of this test can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194

    The ``age`` field of the ``users`` table is re-declared as categorical,
    SDV is fitted with that edited metadata, and the dtypes of every sampled
    table are required to match the dtypes of the corresponding real table.
    """
    metadata, tables = load_demo(metadata=True)

    # Override the ``age`` field type on the exported metadata dict.
    metadata_dict = metadata.to_dict()
    metadata_dict['tables']['users']['fields']['age'] = {'type': 'categorical'}

    # Fit on the edited dict rather than on the original ``metadata`` object:
    # the previous code edited ``metadata_dict`` but never used it, so the
    # categorical override was not guaranteed to reach the model.
    # NOTE(review): relies on ``SDV.fit`` accepting a metadata dict - confirm
    # against the SDV version in use.
    sdv = SDV()
    sdv.fit(metadata_dict, tables)
    sampled = sdv.sample()

    for name, table in tables.items():
        assert (sampled[name].dtypes == table.dtypes).all()
Esempio n. 7
0
"""
Running the SDV basic tutorial using their example dataset.
"""
from sdv import load_demo
from sdv import SDV

# Load the demo tables together with their metadata and show the metadata.
metadata, tables = load_demo(metadata=True)
print(metadata)

# Fit a fresh SDV instance on the demo data, then save it to ``sdv.pkl``.
sdv = SDV()
sdv.fit(metadata, tables)
print('done fit')
sdv.save('sdv.pkl')