def test_evaluate_tables_from_demo():
    """Fit SDV on the demo tables and evaluate the sampled data.

    Builds a fresh ``Metadata`` object describing the ``users``,
    ``sessions`` and ``transactions`` demo tables, fits an ``SDV``
    instance on it, samples every table and then calls ``evaluate``
    once per table and once more on the whole set of tables.

    The test makes no assertions of its own; it only checks that the
    whole pipeline runs without raising.
    """
    real_tables = load_demo(metadata=False)

    # Describe the three demo tables and how they relate to each other.
    user_key = 'user_id'
    demo_meta = Metadata()
    demo_meta.add_table(
        'users',
        data=real_tables['users'],
        primary_key=user_key,
    )
    demo_meta.add_table(
        'sessions',
        data=real_tables['sessions'],
        primary_key='session_id',
        parent='users',
        foreign_key=user_key,
    )
    timestamp_spec = {'type': 'datetime', 'format': '%Y-%m-%d'}
    demo_meta.add_table(
        'transactions',
        real_tables['transactions'],
        fields_metadata={'timestamp': timestamp_spec},
        primary_key='transaction_id',
        parent='sessions',
    )

    model = SDV()
    model.fit(demo_meta, tables=real_tables)
    synthetic_tables = model.sample_all()

    # Per-table evaluation: one call for each table known to the metadata.
    table_scores = {
        name: evaluate(
            synthetic_tables[name],
            real=real_tables[name],
            metadata=demo_meta,
            table_name=name,
        )
        for name in demo_meta.get_tables()
    }

    # Whole-dataset evaluation over all tables at once.
    evaluate(synthetic_tables, real=real_tables, metadata=demo_meta)
def load_dataset(dataset, datasets_path=None, bucket=None):
    """Load the metadata of a dataset, filling in ``modality`` and ``name``.

    The dataset folder is resolved through ``_get_dataset_path`` and its
    ``metadata.json`` file is passed to ``Metadata``. When the resulting
    object has no ``modality`` attribute one is inferred:

    * ``'multi-table'`` if there is more than one table,
    * ``'timeseries'`` if the single table's metadata has a truthy value
      for any of the ``TIMESERIES_FIELDS`` keys,
    * ``'single-table'`` otherwise.

    When it has no ``name`` attribute, the final component of the dataset
    path is used. Both values are stored as attributes and inside
    ``metadata._metadata``.

    Args:
        dataset: Dataset identifier, forwarded to ``_get_dataset_path``.
        datasets_path: Forwarded to ``_get_dataset_path``.
        bucket: Forwarded to ``_get_dataset_path``.

    Returns:
        The ``Metadata`` instance built from the dataset's ``metadata.json``.
    """
    root = _get_dataset_path(dataset, datasets_path, bucket)
    metadata = Metadata(str(root / 'metadata.json'))
    table_names = metadata.get_tables()

    if not hasattr(metadata, 'modality'):
        if len(table_names) > 1:
            inferred = 'multi-table'
        else:
            only_table = metadata.get_table_meta(table_names[0])
            # Stop at the first time-series field that has a truthy value.
            has_timeseries_field = False
            for field in TIMESERIES_FIELDS:
                if only_table.get(field):
                    has_timeseries_field = True
                    break

            inferred = 'timeseries' if has_timeseries_field else 'single-table'

        metadata._metadata['modality'] = inferred
        metadata.modality = inferred

    if not hasattr(metadata, 'name'):
        folder_name = root.name
        metadata._metadata['name'] = folder_name
        metadata.name = folder_name

    return metadata
def load_dataset(dataset, datasets_path=None, bucket=None, aws_key=None, aws_secret=None,
                 max_columns=None):
    """Load the metadata of a dataset, optionally limiting its columns.

    The dataset folder is resolved through ``_get_dataset_path`` and its
    ``metadata.json`` file is parsed as JSON. If ``max_columns`` is truthy,
    the parsed content is handed to ``_apply_max_columns_to_metadata``
    before the ``Metadata`` object is built (presumably modifying it in
    place, since the return value is not used).

    When the resulting object has no ``modality`` attribute one is inferred:

    * ``'multi-table'`` if there is more than one table,
    * ``'timeseries'`` if the single table's metadata has a truthy value
      for any of the ``TIMESERIES_FIELDS`` keys,
    * ``'single-table'`` otherwise.

    When it has no ``name`` attribute, the final component of the dataset
    path is used. Both values are stored as attributes and inside
    ``metadata._metadata``.

    Args:
        dataset: Dataset identifier, forwarded to ``_get_dataset_path``.
        datasets_path: Forwarded to ``_get_dataset_path``.
        bucket: Forwarded to ``_get_dataset_path``.
        aws_key: Forwarded to ``_get_dataset_path``.
        aws_secret: Forwarded to ``_get_dataset_path``.
        max_columns: Optional column limit; ignored when falsy.

    Returns:
        The ``Metadata`` instance built from the dataset's metadata.

    Raises:
        ValueError: If ``max_columns`` is given for a dataset whose
            metadata declares more than one table.
    """
    root = _get_dataset_path(dataset, datasets_path, bucket, aws_key, aws_secret)

    with open(root / 'metadata.json') as metadata_file:
        raw_metadata = json.load(metadata_file)

    if max_columns:
        is_multi_table = len(raw_metadata['tables']) > 1
        if is_multi_table:
            raise ValueError('max_columns is not supported for multi-table datasets')

        _apply_max_columns_to_metadata(raw_metadata, max_columns)

    metadata = Metadata(raw_metadata, root)
    table_names = metadata.get_tables()

    if not hasattr(metadata, 'modality'):
        if len(table_names) > 1:
            inferred = 'multi-table'
        else:
            only_table = metadata.get_table_meta(table_names[0])
            # Stop at the first time-series field that has a truthy value.
            has_timeseries_field = False
            for field in TIMESERIES_FIELDS:
                if only_table.get(field):
                    has_timeseries_field = True
                    break

            inferred = 'timeseries' if has_timeseries_field else 'single-table'

        metadata._metadata['modality'] = inferred
        metadata.modality = inferred

    if not hasattr(metadata, 'name'):
        folder_name = root.name
        metadata._metadata['name'] = folder_name
        metadata.name = folder_name

    return metadata