def test_evaluate_tables_from_demo():
    """Build a Metadata by hand from the demo tables, then fit, sample and evaluate.

    Covers per-table evaluation (with ``table_name``) as well as evaluation of
    the whole multi-table dataset at once.
    """
    tables = load_demo(metadata=False)

    # Recreate the demo schema manually instead of loading the bundled metadata.
    metadata = Metadata()
    metadata.add_table('users', data=tables['users'], primary_key='user_id')
    metadata.add_table(
        'sessions',
        data=tables['sessions'],
        primary_key='session_id',
        parent='users',
        foreign_key='user_id',
    )
    timestamp_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%d',
        },
    }
    metadata.add_table(
        'transactions',
        tables['transactions'],
        fields_metadata=timestamp_fields,
        primary_key='transaction_id',
        parent='sessions',
    )

    sdv = SDV()
    sdv.fit(metadata, tables=tables)
    sampled = sdv.sample_all()

    # Score each table individually, then the dataset as a whole.
    table_scores = {
        name: evaluate(sampled[name], real=tables[name], metadata=metadata, table_name=name)
        for name in metadata.get_tables()
    }

    evaluate(sampled, real=tables, metadata=metadata)
def _score_dataset(dataset, datasets_path, output):
    """Fit, sample and evaluate a single dataset, storing the outcome in ``output``.

    Args:
        dataset: Name of the dataset to benchmark.
        datasets_path: Directory that contains ``<dataset>/metadata.json``;
            if ``None``, the dataset is loaded through ``load_demo``.
        output: Mutable mapping updated in place with either
            ``{'dataset', 'score'}`` on success or ``{'dataset', 'error'}``
            on failure.
    """
    started = datetime.now()
    try:
        if datasets_path is None:
            metadata, tables = load_demo(dataset, metadata=True)
        else:
            metadata = Metadata(
                os.path.join(datasets_path, dataset, 'metadata.json'))
            tables = metadata.load_tables()

        sdv = SDV()
        LOGGER.info('Modeling dataset %s', dataset)
        sdv.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        sampled = sdv.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(sampled, metadata=metadata)

        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score, datetime.now() - started)
        output.update({
            'dataset': dataset,
            'score': score,
        })
    except Exception as ex:
        # Benchmark boundary: record the failure instead of aborting the run.
        error = '{}: {}'.format(type(ex).__name__, str(ex))
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error, datetime.now() - started)
        output.update({'dataset': dataset, 'error': error})
def test_sdv():
    """End-to-end fit/sample cycle on the default demo dataset."""
    metadata, tables = load_demo(metadata=True)

    sdv = SDV()
    sdv.fit(metadata, tables)

    # Sample all
    sampled = sdv.sample()
    assert set(sampled.keys()) == {'users', 'sessions', 'transactions'}
    assert len(sampled['users']) == 10

    # Sample with children
    sampled = sdv.sample('users', reset_primary_keys=True)
    assert set(sampled.keys()) == {'users', 'sessions', 'transactions'}
    assert len(sampled['users']) == 10

    # Sample without children: every table keeps its real shape and columns.
    for name in ('users', 'sessions', 'transactions'):
        frame = sdv.sample(name, sample_children=False)
        assert frame.shape == tables[name].shape
        assert set(frame.columns) == set(tables[name].columns)
def test_sdv_multiparent():
    """Fit/sample cycle on a dataset where one table has two parents."""
    metadata, tables = load_demo('got_families', metadata=True)

    sdv = SDV()
    sdv.fit(metadata, tables)

    # Sample all
    sampled = sdv.sample()
    assert set(sampled.keys()) == {'characters', 'families', 'character_families'}
    assert len(sampled['characters']) == 7

    # Sample with children: the second parent ('families') is not resampled,
    # but the child still carries its foreign key.
    sampled = sdv.sample('characters', reset_primary_keys=True)
    assert set(sampled.keys()) == {'characters', 'character_families'}
    assert len(sampled['characters']) == 7
    assert 'family_id' in sampled['character_families']

    # Sample without children: every table keeps its real shape and columns.
    for name in ('characters', 'families', 'character_families'):
        frame = sdv.sample(name, sample_children=False)
        assert frame.shape == tables[name].shape
        assert set(frame.columns) == set(tables[name].columns)
def run_example():
    """Example of usage of SDV for tables contanining more than one foreign key."""
    # Setup: metadata is read from disk, so fit() needs no explicit tables.
    vault = SDV('data/meta.json')
    vault.fit()

    # Run: sample every table and show a small preview of each.
    result = vault.sample_all()
    for name, table in result.items():
        print('Samples generated for table {}:\n{}\n'.format(name, table.head(5)))
def test_integration(self): metadata, tables = load_demo(metadata=True) sdv = SDV() sdv.fit(metadata, tables) synthetic = sdv.sample_all(20) metrics = evaluate(metadata, tables, synthetic) metrics.overall() metrics.details() metrics.highlights()
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    The origin of this tests can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194
    """
    metadata, tables = load_demo(metadata=True)

    # Force the integer 'age' column to be modeled as categorical — this is
    # exactly the scenario from issue #194.
    metadata_dict = metadata.to_dict()
    metadata_dict['tables']['users']['fields']['age'] = {'type': 'categorical'}

    sdv = SDV()
    # BUG FIX: fit on the modified metadata dict, not the original metadata —
    # otherwise the categorical override above is never exercised.
    # SDV.fit accepts a metadata dict as well as a Metadata instance.
    sdv.fit(metadata_dict, tables)
    sampled = sdv.sample()

    # The sampled dtypes must match the real ones, i.e. 'age' stays integer.
    for name, table in tables.items():
        assert (sampled[name].dtypes == table.dtypes).all()
def test_sdv_multi_foreign_key():
    """Ensure multi-foreign-key datasets are properly covered.

    Multi-foreign-key datasets are those that have one table
    with 2 foreign keys to the same parent.
    """
    metadata, tables = datasets.load_multi_foreign_key()

    sdv = SDV()
    sdv.fit(metadata, tables)

    # Sample all
    generated = sdv.sample()

    assert set(generated.keys()) == {'parent', 'child'}
    assert len(generated['parent']) == 10
def fit_save_model(mfile):
    """Fit an SDV model on a small hard-coded users table and save it to ``mfile``.

    Metadata is read from ``./user_table_metadata.json`` in the working
    directory.
    """
    sdv = SDV()

    # Original Data: a single-table dataset with some missing gender values.
    users = pd.DataFrame({
        'user_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        'country': ['USA', 'UK', 'ES', 'UK', 'USA', 'DE', 'BG', 'ES', 'FR', 'UK'],
        'gender': ['M', 'F', None, 'M', 'F', 'M', 'F', None, 'F', None],
        'age': [34, 23, 44, 22, 54, 57, 45, 41, 23, 30],
    })
    tables = {'users': users}

    with open('./user_table_metadata.json') as metadata_file:
        metadata = json.load(metadata_file)

    sdv.fit(metadata, tables)
    sdv.save(mfile)
"""Running the SDV basic tutorial using their example dataset."""
from sdv import SDV, load_demo

# Grab the demo data
metadata, tables = load_demo(metadata=True)
print(metadata)

# Run the basic fit and persist the trained model.
sdv = SDV()
sdv.fit(metadata, tables)
print("done fit")
sdv.save('sdv.pkl')
def createmodel(tables, mfile):
    """Fit an SDV model on ``tables`` and save it to ``mfile``.

    Metadata is read from ``./join_table_metadata.json`` in the working
    directory.
    """
    sdv = SDV()

    with open('./join_table_metadata.json') as metadata_file:
        metadata = json.load(metadata_file)

    sdv.fit(metadata, tables)
    sdv.save(mfile)