def fit(self, metadata, tables=None, root_path=None):
    """Fit this SDV instance to the given dataset.

    Stores the metadata on the instance, validates it against ``tables``,
    models the database and finally builds the sampler from the fitted
    models.

    Args:
        metadata (dict, str or Metadata):
            Metadata dict, path to the metadata JSON file or ``Metadata``
            instance itself.
        tables (dict):
            Dictionary with the table names as key and ``pandas.DataFrame``
            instances as values. If ``None`` is given, the tables will be
            loaded from the paths indicated in ``metadata``.
            Defaults to ``None``.
        root_path (str or None):
            Path to the dataset directory. If ``None`` and metadata is a
            path, the metadata location is used. If ``None`` and metadata
            is a dict, the current working directory is used.
    """
    # Anything that is not already a Metadata instance gets wrapped in one.
    if not isinstance(metadata, Metadata):
        metadata = Metadata(metadata, root_path)

    self.metadata = metadata
    self.metadata.validate(tables)

    self.modeler = Modeler(self.metadata, self.model, self.model_kwargs)
    self.modeler.model_database(tables)

    fitted_models = self.modeler.models
    self.sampler = Sampler(self.metadata, fitted_models, self.model, self.model_kwargs)
def test__visualize_add_edges(self):
    """Draw one labelled edge on the plot for the single parent relationship."""
    # Setup
    metadata_mock = MagicMock(spec_set=Metadata)
    metadata_mock.get_tables.return_value = ['demo', 'other']
    # 'demo' has one parent ('other'); 'other' has none.
    metadata_mock.get_parents.side_effect = [{'other'}, set()]
    metadata_mock.get_foreign_key.return_value = 'fk'
    metadata_mock.get_primary_key.return_value = 'pk'

    digraph = Mock()

    # Run
    Metadata._visualize_add_edges(metadata_mock, digraph)

    # Asserts
    edge_label = ' {}.{} -> {}.{}'.format('demo', 'fk', 'other', 'pk')

    metadata_mock.get_tables.assert_called_once_with()
    metadata_mock.get_foreign_key.assert_called_once_with('other', 'demo')
    metadata_mock.get_primary_key.assert_called_once_with('other')
    assert metadata_mock.get_parents.call_args_list == [call('demo'), call('other')]

    digraph.edge.assert_called_once_with(
        'other',
        'demo',
        label=edge_label,
        arrowhead='crow'
    )
def test_add_table_with_no_fields_data(self):
    """Add a table passing only data, so every column gets analyzed.

    The field details returned by ``_get_field_details`` must be stored as
    the table's ``fields`` entry, and neither a primary key nor a
    relationship may be registered.
    """
    # Setup
    metadata = Mock(spec=Metadata)
    metadata.get_tables.return_value = ['a_table', 'b_table']
    metadata._metadata = {'tables': dict()}
    metadata._get_field_details.return_value = {
        'a_field': {'type': 'numerical', 'subtype': 'integer'},
        'b_field': {'type': 'boolean'},
        'c_field': {'type': 'categorical'}
    }

    # Run
    data = pd.DataFrame({
        'a_field': [0, 1],
        'b_field': [True, False],
        'c_field': ['a', 'b']
    })
    Metadata.add_table(metadata, 'x_table', data=data)

    # Asserts
    expected_table_meta = {
        'fields': {
            'a_field': {'type': 'numerical', 'subtype': 'integer'},
            'b_field': {'type': 'boolean'},
            'c_field': {'type': 'categorical'}
        }
    }
    assert metadata._metadata['tables']['x_table'] == expected_table_meta

    # Use real mock assertions: a bare ``call_count == 0`` expression is
    # evaluated and discarded, so it can never fail the test.
    metadata.set_primary_key.assert_not_called()
    metadata.add_relationship.assert_not_called()
def test_add_table_with_data_str(self, mock_read_csv):
    """Add a table whose data is given as a path string.

    The stored table metadata must contain the analyzed fields plus the
    ``path`` that was passed in, and neither a primary key nor a
    relationship may be registered.

    Args:
        mock_read_csv (Mock):
            Mock whose return value is used as the loaded CSV data
            (presumably injected by a ``patch`` decorator outside this
            block -- confirm against the decorator).
    """
    # Setup
    metadata = Mock(spec_set=Metadata)
    metadata.get_tables.return_value = ['a_table', 'b_table']
    metadata._metadata = {'tables': dict()}
    mock_read_csv.return_value = pd.DataFrame({
        'a_field': [0, 1],
        'b_field': [True, False],
        'c_field': ['a', 'b']
    })
    metadata._get_field_details.return_value = {
        'a_field': {'type': 'numerical', 'subtype': 'integer'},
        'b_field': {'type': 'boolean'},
        'c_field': {'type': 'categorical'}
    }

    # Run
    Metadata.add_table(metadata, 'x_table', data='/path/to/file.csv')

    # Asserts
    expected_table_meta = {
        'fields': {
            'a_field': {'type': 'numerical', 'subtype': 'integer'},
            'b_field': {'type': 'boolean'},
            'c_field': {'type': 'categorical'}
        },
        'path': '/path/to/file.csv'
    }
    assert metadata._metadata['tables']['x_table'] == expected_table_meta

    # Use real mock assertions: a bare ``call_count == 0`` expression is
    # evaluated and discarded, so it can never fail the test.
    metadata.set_primary_key.assert_not_called()
    metadata.add_relationship.assert_not_called()
def test_add_table_with_fields_metadata(self):
    """Add a table passing explicit fields metadata and no data.

    The given ``fields_metadata`` must be stored as the table's ``fields``
    entry, and neither a primary key nor a relationship may be registered.
    """
    # Setup
    metadata = Mock(spec=Metadata)
    metadata.get_tables.return_value = ['a_table', 'b_table']
    metadata._metadata = {'tables': dict()}

    # Run
    fields_metadata = {
        'a_field': {'type': 'numerical', 'subtype': 'integer'}
    }
    Metadata.add_table(metadata, 'x_table', fields_metadata=fields_metadata)

    # Asserts
    expected_table_meta = {
        'fields': {
            'a_field': {'type': 'numerical', 'subtype': 'integer'}
        }
    }
    assert metadata._metadata['tables']['x_table'] == expected_table_meta

    # Use real mock assertions: a bare ``call_count == 0`` expression is
    # evaluated and discarded, so it can never fail the test.
    metadata.set_primary_key.assert_not_called()
    metadata.add_relationship.assert_not_called()
def test_add_field(self):
    """Add an ``id``/``string`` field to a table that has no fields yet."""
    # Setup
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_tables.return_value = []
    metadata_mock._metadata = {
        'tables': {
            'a_table': {'fields': {}}
        }
    }

    # Run
    Metadata.add_field(metadata_mock, 'a_table', 'a_field', 'id', 'string', None)

    # Asserts
    new_field = {'type': 'id', 'subtype': 'string'}
    expected = {
        'tables': {
            'a_table': {
                'fields': {'a_field': new_field}
            }
        }
    }
    assert metadata_mock._metadata == expected
    metadata_mock._check_field.assert_called_once_with('a_table', 'a_field', exists=False)
def test_reverse_transform(self):
    """Check that ``reverse_transform`` hands the data to the table's transformer.

    The hyper transformer registered for table ``test`` is mocked, and the
    test asserts that its ``reverse_transform`` received the input frame
    unchanged as first positional argument.
    """
    # Setup
    ht_mock = Mock()
    ht_mock.reverse_transform.return_value = {
        'item 1': pd.Series([1.0, 2.0, None, 4.0, 5.0]),
        'item 2': pd.Series([1.1, None, 3.3, None, 5.5]),
        'item 3': pd.Series([None, 'bbb', 'ccc', 'ddd', None]),
        'item 4': pd.Series([True, False, None, False, True])
    }

    metadata = Mock(spec=Metadata)
    metadata._hyper_transformers = {
        'test': ht_mock
    }
    # ``np.object`` was only an alias of the builtin ``object``; the alias
    # is deprecated since NumPy 1.20 and removed in 1.24, so use the builtin.
    metadata._get_dtypes.return_value = {
        'item 1': int,
        'item 2': float,
        'item 3': object,
        'item 4': bool,
    }

    # Run
    data = pd.DataFrame({'foo': [0, 1]})
    Metadata.reverse_transform(metadata, 'test', data)

    # Asserts
    expected_call = pd.DataFrame({'foo': [0, 1]})
    pd.testing.assert_frame_equal(
        ht_mock.reverse_transform.call_args[0][0],
        expected_call
    )
def test__get_transformers_raise_valueerror(self):
    """Expect ``ValueError`` from ``_get_transformers`` for a ``str`` dtype."""
    # Setup
    unsupported_dtypes = {'string': str}

    # Run and asserts
    with pytest.raises(ValueError):
        Metadata._get_transformers(unsupported_dtypes, None)
def _load_demo_dataset(dataset_name, data_path):
    """Load the metadata and tables of a demo dataset.

    Args:
        dataset_name (str):
            Name of the dataset, forwarded to ``_get_dataset_path``.
        data_path (str):
            Data location, forwarded to ``_get_dataset_path``.

    Returns:
        tuple:
            The ``Metadata`` instance built from the dataset's
            ``metadata.json`` and a dict mapping each table name to its
            table after being passed through ``_dtypes64``.
    """
    dataset_path = _get_dataset_path(dataset_name, data_path)
    metadata_file = os.path.join(dataset_path, 'metadata.json')
    meta = Metadata(metadata=metadata_file)

    tables = {}
    for name, table in meta.load_tables().items():
        tables[name] = _dtypes64(table)

    return meta, tables
def __init__(self, metadata, root_path=None):
    """Store the metadata and start with empty primary key bookkeeping.

    Args:
        metadata (Metadata or object):
            A ``Metadata`` instance, or any value accepted as first
            argument of the ``Metadata`` constructor.
        root_path (str or None):
            Forwarded to the ``Metadata`` constructor when ``metadata`` is
            not already a ``Metadata`` instance. Defaults to ``None``.
    """
    # Anything that is not already a Metadata instance gets wrapped in one.
    if not isinstance(metadata, Metadata):
        metadata = Metadata(metadata, root_path)

    self.metadata = metadata
    self._primary_key_generators = {}
    self._remaining_primary_keys = {}
def test_add_table_already_exist(self):
    """Expect ``ValueError`` when adding a table name that ``get_tables`` already lists."""
    # Setup
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_tables.return_value = ['a_table', 'b_table']

    # Run and asserts
    with pytest.raises(ValueError):
        Metadata.add_table(metadata_mock, 'a_table')
def test_add_relationship_parent_no_exist(self):
    """Expect ``ValueError`` when one of the related tables is not in ``get_tables``."""
    # Setup: only 'a_table' is known, 'b_table' is missing.
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_tables.return_value = ['a_table']

    # Run and asserts
    with pytest.raises(ValueError):
        Metadata.add_relationship(metadata_mock, 'a_table', 'b_table')
def test_add_relationship_already_exist(self):
    """Expect ``ValueError`` when ``get_parents`` already reports ``b_table``."""
    # Setup
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_tables.return_value = ['a_table', 'b_table']
    metadata_mock.get_parents.return_value = {'b_table'}

    # Run and asserts
    with pytest.raises(ValueError):
        Metadata.add_relationship(metadata_mock, 'a_table', 'b_table')
def test_get_dtypes_error_id(self):
    """Expect ``ValueError`` for an ``id`` field that is neither primary nor foreign key."""
    # Setup: the only field is an id and the table declares no keys.
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_table_meta.return_value = {
        'fields': {
            'item': {'type': 'id'}
        }
    }
    metadata_mock._DTYPES = Metadata._DTYPES

    # Run and asserts
    with pytest.raises(ValueError):
        Metadata.get_dtypes(metadata_mock, 'test', ids=True)
def test_get_dtypes_error_subtype_id(self):
    """Expect ``ValueError`` for an ``id`` field declared with subtype ``boolean``."""
    # Setup
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_table_meta.return_value = {
        'fields': {
            'item': {'type': 'id', 'subtype': 'boolean'}
        }
    }
    metadata_mock._DTYPES = Metadata._DTYPES

    # Run and asserts
    with pytest.raises(ValueError):
        Metadata.get_dtypes(metadata_mock, 'test', ids=True)
def test_get_dtypes_error_invalid_type(self):
    """Expect ``ValueError`` for a field declared with type ``unknown``."""
    # Setup
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_table_meta.return_value = {
        'fields': {
            'item': {'type': 'unknown'}
        }
    }
    metadata_mock._DTYPES = Metadata._DTYPES

    # Run and asserts
    with pytest.raises(ValueError):
        Metadata.get_dtypes(metadata_mock, 'test')
def test_add_relationship_parent_no_primary_key(self):
    """Expect ``ValueError`` when ``get_primary_key`` returns ``None``."""
    # Setup: both tables exist and are unrelated, but no primary key is set.
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_tables.return_value = ['a_table', 'b_table']
    metadata_mock.get_parents.return_value = set()
    metadata_mock.get_children.return_value = set()
    metadata_mock.get_primary_key.return_value = None

    # Run and asserts
    with pytest.raises(ValueError):
        Metadata.add_relationship(metadata_mock, 'a_table', 'b_table')
def _tabular_metric(sdmetric, synthetic, real, metadata=None, details=False):
    """Run ``sdmetric.metrics`` and return either the metrics or their mean value.

    Args:
        sdmetric:
            Object exposing a ``metrics(metadata, real, synthetic)`` callable.
        synthetic:
            Synthetic data, forwarded to ``sdmetric.metrics``.
        real:
            Real data, forwarded to ``sdmetric.metrics``.
        metadata (Metadata or None):
            Metadata describing the data. If ``None``, a new ``Metadata`` is
            built with ``real`` registered under the table name ``None``
            and both datasets are wrapped in ``{None: data}`` dicts.
        details (bool):
            If ``True``, return the list of metrics instead of their mean.

    Returns:
        list or float:
            The metrics as a list when ``details`` is true, otherwise the
            ``np.mean`` of every metric's ``value``.
    """
    if metadata is None:
        # Single unnamed table: register it and key both datasets by None.
        metadata = Metadata()
        metadata.add_table(None, real)
        real, synthetic = {None: real}, {None: synthetic}

    computed = sdmetric.metrics(metadata, real, synthetic)
    if not details:
        values = [metric.value for metric in computed]
        return np.mean(values)

    return list(computed)
def test_get_dtypes_error_subtype_numerical(self):
    """Expect ``MetadataError`` for a ``numerical`` field with subtype ``boolean``."""
    # Setup
    metadata_mock = Mock(spec_set=Metadata)
    metadata_mock.get_table_meta.return_value = {
        'fields': {
            'item': {'type': 'numerical', 'subtype': 'boolean'}
        }
    }
    metadata_mock._DTYPES = Metadata._DTYPES

    # Run and asserts
    with pytest.raises(MetadataError):
        Metadata.get_dtypes(metadata_mock, 'test')
def test__get_dtypes_error_subtype_numerical(self):
    """Expect ``ValueError`` from ``_get_dtypes`` for a ``numerical`` field with subtype ``boolean``."""
    # Setup
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_table_meta.return_value = {
        'fields': {
            'item': {'type': 'numerical', 'subtype': 'boolean'}
        }
    }

    # Run and asserts
    with pytest.raises(ValueError):
        Metadata._get_dtypes(metadata_mock, 'test')
def _load_relational_dummy():
    """Build the small hard-coded relational demo dataset.

    Returns:
        tuple:
            A ``Metadata`` instance created from ``DEMO_METADATA`` and a
            dict with the ``users``, ``sessions`` and ``transactions``
            ``pandas.DataFrame`` tables, ten rows each.
    """
    users_df = pd.DataFrame({
        'user_id': list(range(10)),
        'country': ['US', 'UK', 'ES', 'UK', 'US', 'DE', 'BG', 'ES', 'FR', 'UK'],
        'gender': ['M', 'F', None, 'M', 'F', 'M', 'F', None, 'F', None],
        'age': [34, 23, 44, 22, 54, 57, 45, 41, 23, 30]
    })

    sessions_df = pd.DataFrame({
        'session_id': list(range(10)),
        'user_id': [0, 1, 1, 2, 4, 5, 6, 6, 6, 8],
        'device': [
            'mobile', 'tablet', 'tablet', 'mobile', 'mobile',
            'mobile', 'mobile', 'tablet', 'mobile', 'tablet'
        ],
        'os': [
            'android', 'ios', 'android', 'android', 'ios',
            'android', 'ios', 'ios', 'ios', 'ios'
        ]
    })

    timestamps = [
        '2019-01-01T12:34:32',
        '2019-01-01T12:42:21',
        '2019-01-07T17:23:11',
        '2019-01-10T11:08:57',
        '2019-01-10T21:54:08',
        '2019-01-11T11:21:20',
        '2019-01-22T14:44:10',
        '2019-01-23T10:14:09',
        '2019-01-27T16:09:17',
        '2019-01-29T12:10:48'
    ]
    transactions_df = pd.DataFrame({
        'transaction_id': list(range(10)),
        'session_id': [0, 0, 1, 3, 5, 5, 7, 8, 9, 9],
        'timestamp': timestamps,
        'amount': [100.0, 55.3, 79.5, 112.1, 110.0, 76.3, 89.5, 132.1, 68.0, 99.9],
        'approved': [True, True, True, False, False, True, True, False, True, True],
    })
    # Timestamps are created as strings and parsed afterwards.
    transactions_df['timestamp'] = pd.to_datetime(transactions_df['timestamp'])

    tables = {
        'users': users_df,
        'sessions': sessions_df,
        'transactions': transactions_df
    }

    return Metadata(DEMO_METADATA), tables
def test__get_graphviz_extension_none(self):
    """A ``None`` path must yield the ``(None, None)`` pair."""
    # Run
    extension_info = Metadata._get_graphviz_extension(None)

    # Asserts
    assert extension_info == (None, None)
def test_get_dtypes_with_ids(self):
    """With ``ids=True`` the primary key field is included in the dtypes."""
    # Setup
    fields = {
        'item 0': {'type': 'id', 'subtype': 'integer'},
        'item 1': {'type': 'numerical', 'subtype': 'integer'},
        'item 2': {'type': 'numerical', 'subtype': 'float'},
        'item 3': {'type': 'categorical'},
        'item 4': {'type': 'boolean'},
        'item 5': {'type': 'datetime'}
    }
    metadata_mock = Mock(spec_set=Metadata)
    metadata_mock.get_table_meta.return_value = {
        'fields': fields,
        'primary_key': 'item 0'
    }
    metadata_mock._DTYPES = Metadata._DTYPES

    # Run
    dtypes = Metadata.get_dtypes(metadata_mock, 'test', ids=True)

    # Asserts
    assert dtypes == {
        'item 0': 'int',
        'item 1': 'int',
        'item 2': 'float',
        'item 3': 'object',
        'item 4': 'bool',
        'item 5': 'datetime64',
    }
def test_add_table_with_fields_no_data(self):
    """Passing a list of field names without data stores an empty ``fields`` dict."""
    # Setup
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_tables.return_value = ['a_table', 'b_table']
    metadata_mock._metadata = {'tables': {}}

    # Run
    field_names = ['a_field', 'b_field']
    Metadata.add_table(metadata_mock, 'x_table', fields=field_names)

    # Asserts
    stored = metadata_mock._metadata['tables']['x_table']
    assert stored == {'fields': {}}
def test__get_graphviz_extension_valid(self):
    """A ``.png`` path is split into the base path and the ``png`` extension."""
    # Run
    extension_info = Metadata._get_graphviz_extension('/some/path.png')

    # Asserts
    assert extension_info == ('/some/path', 'png')
def test__dict_metadata(self):
    """List-based tables and fields are re-keyed by their ``name``."""
    # Setup
    field = {
        'ref': {'table': 'table_ref', 'field': 'field_ref'},
        'name': 'test_field'
    }
    list_metadata = {
        'tables': [
            {'name': 'test', 'use': True, 'fields': [field]}
        ]
    }

    # Run
    dict_metadata = Metadata._dict_metadata(list_metadata)

    # Asserts
    assert dict_metadata == {
        'tables': {
            'test': {
                'use': True,
                'name': 'test',
                'fields': {
                    'test_field': {
                        'ref': {'table': 'table_ref', 'field': 'field_ref'},
                        'name': 'test_field'
                    }
                }
            }
        }
    }
def test__get_pii_fields(self):
    """Only the field flagged with ``pii: True`` is reported, mapped to its category."""
    # Setup: 'bar' has a pii_category but no 'pii' flag.
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_table_meta.return_value = {
        'fields': {
            'foo': {
                'type': 'categorical',
                'pii': True,
                'pii_category': 'email'
            },
            'bar': {
                'type': 'categorical',
                'pii_category': 'email'
            }
        }
    }

    # Run
    pii_fields = Metadata._get_pii_fields(metadata_mock, 'test')

    # Asserts
    assert pii_fields == {'foo': 'email'}
def test__get_dtypes_no_ids(self):
    """Test get data types excluding ids.

    The ``id`` field (``item 0``) must be absent from the result, and every
    other field must map to the expected Python / NumPy type.
    """
    # Setup
    table_meta = {
        'fields': {
            'item 0': {'type': 'id', 'subtype': 'integer'},
            'item 1': {'type': 'numerical', 'subtype': 'integer'},
            'item 2': {'type': 'numerical', 'subtype': 'float'},
            'item 3': {'type': 'categorical'},
            'item 4': {'type': 'boolean'},
            'item 5': {'type': 'datetime'},
        }
    }
    metadata = Mock(spec=Metadata)
    metadata.get_table_meta.return_value = table_meta
    metadata._DTYPES = Metadata._DTYPES

    # Run
    result = Metadata._get_dtypes(metadata, 'test')

    # Asserts
    # ``np.object`` was only an alias of the builtin ``object``; the alias
    # is deprecated since NumPy 1.20 and removed in 1.24, so compare
    # against the builtin, which is equal on every NumPy version.
    expected = {
        'item 1': int,
        'item 2': float,
        'item 3': object,
        'item 4': bool,
        'item 5': np.datetime64,
    }
    assert result == expected
def test_load_tables(self):
    """``load_tables`` returns one loaded frame per requested table name."""
    # Setup
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_tables.side_effect = ['foo', 'bar', 'tar']
    # Each call to load_table yields the next frame, in order.
    metadata_mock.load_table.side_effect = [
        pd.DataFrame({'foo': [1, 2]}),
        pd.DataFrame({'bar': [3, 4]}),
        pd.DataFrame({'tar': [5, 6]})
    ]

    # Run
    requested = ['table 1', 'table 2', 'table 3']
    loaded = Metadata.load_tables(metadata_mock, tables=requested)

    # Asserts
    expected = {
        'table 1': pd.DataFrame({'foo': [1, 2]}),
        'table 2': pd.DataFrame({'bar': [3, 4]}),
        'table 3': pd.DataFrame({'tar': [5, 6]})
    }
    assert loaded.keys() == expected.keys()

    for table_name, frame in loaded.items():
        pd.testing.assert_frame_equal(frame, expected[table_name])
def test_get_foreign_key(self):
    """The child field whose ``ref`` points at the parent's primary key is returned."""
    # Setup: only 'a_field' references the parent's primary key.
    child_fields = {
        'a_field': {
            'ref': {'field': 'a_primary_key'},
            'name': 'a_field'
        },
        'p_field': {
            'ref': {'field': 'another_key_field'},
            'name': 'p_field'
        }
    }
    metadata_mock = Mock(spec=Metadata)
    metadata_mock.get_primary_key.return_value = 'a_primary_key'
    metadata_mock.get_fields.return_value = child_fields

    # Run
    foreign_key = Metadata.get_foreign_key(metadata_mock, 'parent', 'child')

    # Asserts
    assert foreign_key == 'a_field'
    metadata_mock.get_primary_key.assert_called_once_with('parent')
    metadata_mock.get_fields.assert_called_once_with('child')