Ejemplo n.º 1
0
    def test_add_table_with_fields_metadata(self):
        """Add a table passing explicit ``fields_metadata``.

        The given fields metadata must be stored as the ``fields`` entry of
        the new table, and neither a primary key nor a relationship should
        be registered since none was requested.
        """
        # Setup
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata._metadata = {'tables': dict()}

        # Run
        fields_metadata = {
            'a_field': {'type': 'numerical', 'subtype': 'integer'}
        }

        Metadata.add_table(metadata, 'x_table', fields_metadata=fields_metadata)

        # Asserts
        expected_table_meta = {
            'fields': {
                'a_field': {'type': 'numerical', 'subtype': 'integer'}
            }
        }

        assert metadata._metadata['tables']['x_table'] == expected_table_meta

        # Previously these were bare ``call_count == 0`` expressions whose
        # result was discarded, so nothing was actually verified.
        metadata.set_primary_key.assert_not_called()
        metadata.add_relationship.assert_not_called()
Ejemplo n.º 2
0
    def test_add_table_with_no_fields_data(self):
        """Add a table passing only ``data``, so all its columns are analyzed.

        ``_get_field_details`` is stubbed on the mock; whatever it returns
        must be stored as the ``fields`` entry of the new table. No primary
        key or relationship should be registered.
        """
        # Setup
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata._metadata = {'tables': dict()}
        metadata._get_field_details.return_value = {
            'a_field': {'type': 'numerical', 'subtype': 'integer'},
            'b_field': {'type': 'boolean'},
            'c_field': {'type': 'categorical'}
        }

        # Run
        data = pd.DataFrame({'a_field': [0, 1], 'b_field': [True, False], 'c_field': ['a', 'b']})

        Metadata.add_table(metadata, 'x_table', data=data)

        # Asserts
        expected_table_meta = {
            'fields': {
                'a_field': {'type': 'numerical', 'subtype': 'integer'},
                'b_field': {'type': 'boolean'},
                'c_field': {'type': 'categorical'}
            }
        }

        assert metadata._metadata['tables']['x_table'] == expected_table_meta

        # Previously these were bare ``call_count == 0`` expressions whose
        # result was discarded, so nothing was actually verified.
        metadata.set_primary_key.assert_not_called()
        metadata.add_relationship.assert_not_called()
Ejemplo n.º 3
0
    def test_add_table_with_data_str(self, mock_read_csv):
        """Add a table passing ``data`` as a path to a CSV file.

        ``mock_read_csv`` stands in for the CSV reader (the patch decorator
        is outside this snippet). The resulting table metadata must contain
        the analyzed fields plus the original ``path``. No primary key or
        relationship should be registered.
        """
        # Setup
        metadata = Mock(spec_set=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata._metadata = {'tables': dict()}
        mock_read_csv.return_value = pd.DataFrame({
            'a_field': [0, 1],
            'b_field': [True, False],
            'c_field': ['a', 'b']
        })
        metadata._get_field_details.return_value = {
            'a_field': {'type': 'numerical', 'subtype': 'integer'},
            'b_field': {'type': 'boolean'},
            'c_field': {'type': 'categorical'}
        }

        # Run
        Metadata.add_table(metadata, 'x_table', data='/path/to/file.csv')

        # Asserts
        expected_table_meta = {
            'fields': {
                'a_field': {'type': 'numerical', 'subtype': 'integer'},
                'b_field': {'type': 'boolean'},
                'c_field': {'type': 'categorical'}
            },
            'path': '/path/to/file.csv'
        }

        assert metadata._metadata['tables']['x_table'] == expected_table_meta

        # Previously these were bare ``call_count == 0`` expressions whose
        # result was discarded, so nothing was actually verified.
        metadata.set_primary_key.assert_not_called()
        metadata.add_relationship.assert_not_called()
Ejemplo n.º 4
0
def _validate_arguments(synth, real, metadata, root_path, table_name):
    """Validate arguments needed to compute descriptors values.

    If ``metadata`` is an instance of dict create the ``Metadata`` object.
    If ``metadata`` is ``None``, ``real`` has to be a ``pandas.DataFrane``.

    If ``real`` is ``None`` load all the tables and assert that ``synth`` is a ``dict``.
    Otherwise, ``real`` and ``synth`` must be of the same type.

    If ``synth`` is not a ``dict``, create a dictionary using the ``table_name``.

    Assert that ``synth`` and ``real`` must have the same tables.

    Args:
        synth (dict or pandas.DataFrame):
            Synthesized data.
        real (dict, pandas.DataFrame or None):
            Real data.
        metadata (str, dict, Metadata or None):
            Metadata instance or details needed to build it.
        root_path (str):
            Path to the metadata file.
        table_name (str):
            Table name used to prepare the metadata object, real and synth dict.

    Returns:
        tuple (dict, dict, Metadata):
            Processed tables and Metadata oject.
    """
    if isinstance(metadata, dict):
        metadata = Metadata(metadata, root_path)
    elif metadata is None:
        if not isinstance(real, pd.DataFrame):
            raise TypeError(
                'If metadata is None, `real` has to be a DataFrame')

        metadata = Metadata()
        metadata.add_table(table_name, data=real)

    if real is None:
        real = metadata.load_tables()
        if not isinstance(synth, dict):
            raise TypeError('If `real` is `None`, `synth` must be a dict')

    elif not isinstance(synth, type(real)):
        raise TypeError('`real` and `synth` must be of the same type')

    if not isinstance(synth, dict):
        synth = {table_name: synth}

    if not isinstance(real, dict):
        real = {table_name: real}

    if not set(real.keys()) == set(synth.keys()):
        raise ValueError(
            'real and synthetic dataset must have the same tables')

    return synth, real, metadata
Ejemplo n.º 5
0
    def test_add_table_already_exist(self):
        """Adding a table whose name is already registered raises ``ValueError``."""
        # Setup: the mocked metadata reports 'a_table' as an existing table.
        existing_tables = ['a_table', 'b_table']
        metadata_mock = Mock(spec=Metadata)
        metadata_mock.get_tables.return_value = existing_tables

        # Run / Assert
        with pytest.raises(ValueError):
            Metadata.add_table(metadata_mock, 'a_table')
Ejemplo n.º 6
0
def test_hma1_single_child_row_single_parent_row():
    """Check that ``HMA1`` handles exactly one child row for each parent row.

    According to the original test notes, ``HMA1`` does not learn the
    distribution of the child values when there is a single child row per
    parent, because those values would be ``0`` and distort the ``std``.

    Setup:
        - Build a parent table and a child table with one child row per parent row.
        - Describe both tables and their relationship in a ``sdv.Metadata``.
        - Instantiate ``HMA1`` with that metadata.

    Input:
        - The tables dictionary.
        - The ``sdv.Metadata`` instance.

    Output:
        - ``dict`` of sampled tables, 10 rows each, with unique keys.
    """

    # Setup
    parent_data = pd.DataFrame({
        'parent_id': range(5),
        'value': range(5)
    })
    child_data = pd.DataFrame({
        'parent_a': range(5),
        'value_a': range(5),
    })

    metadata = Metadata()
    metadata.add_table('parent_a', parent_data, primary_key='parent_id')
    metadata.add_table('child', child_data)
    metadata.add_relationship('parent_a', 'child', 'parent_a')

    model = HMA1(metadata)

    # Run
    model.fit({'parent_a': parent_data, 'child': child_data})
    sampled = model.sample(num_rows=10)

    # Assert
    assert len(sampled) == 2

    sampled_parent = sampled['parent_a']
    assert len(sampled_parent) == 10

    sampled_child = sampled['child']
    assert len(sampled_child) == 10

    # Every sampled key must be distinct.
    assert len(sampled_parent['parent_id'].unique()) == 10
    assert len(sampled_child['parent_a'].unique()) == 10
Ejemplo n.º 7
0
def _tabular_metric(sdmetric, synthetic, real, metadata=None, details=False):
    if metadata is None:
        metadata = Metadata()
        metadata.add_table(None, real)
        real = {None: real}
        synthetic = {None: synthetic}

    metrics = sdmetric.metrics(metadata, real, synthetic)
    if details:
        return list(metrics)

    return np.mean([metric.value for metric in metrics])
Ejemplo n.º 8
0
    def test_add_table_with_fields_no_data(self):
        """Adding a table with field names but no data stores an empty ``fields`` entry."""
        # Setup
        metadata_mock = Mock(spec=Metadata)
        metadata_mock.get_tables.return_value = ['a_table', 'b_table']
        metadata_mock._metadata = {'tables': dict()}

        # Run
        Metadata.add_table(metadata_mock, 'x_table', fields=['a_field', 'b_field'])

        # Asserts
        stored_table_meta = metadata_mock._metadata['tables']['x_table']

        assert stored_table_meta == {'fields': dict()}
Ejemplo n.º 9
0
    def test_add_table_with_primary_key(self):
        """Add a table passing a ``primary_key``.

        The table is stored with empty fields, ``set_primary_key`` is called
        once with the table name and the key, and no relationship is added.
        """
        # Setup
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata._metadata = {'tables': dict()}

        # Run
        Metadata.add_table(metadata, 'x_table', primary_key='id')

        # Asserts
        expected_table_meta = {'fields': dict()}

        assert metadata._metadata['tables']['x_table'] == expected_table_meta

        metadata.set_primary_key.assert_called_once_with('x_table', 'id')
        # Previously a bare ``call_count == 0`` expression whose result was
        # discarded, so nothing was actually verified.
        metadata.add_relationship.assert_not_called()
Ejemplo n.º 10
0
    def test_add_table_only_name(self):
        """Add a table passing only its name.

        The table is stored with empty fields and neither a primary key nor
        a relationship is registered.
        """
        # Setup
        metadata = Mock(spec=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata._metadata = {'tables': dict()}

        # Run
        Metadata.add_table(metadata, 'x_table')

        # Asserts
        expected_table_meta = {'fields': dict()}

        assert metadata._metadata['tables']['x_table'] == expected_table_meta

        # Previously these were bare ``call_count == 0`` expressions whose
        # result was discarded, so nothing was actually verified.
        metadata.set_primary_key.assert_not_called()
        metadata.add_relationship.assert_not_called()
Ejemplo n.º 11
0
    def test_add_table_with_foreign_key(self):
        """Add a table passing a ``parent``.

        The table is stored with empty fields, ``add_relationship`` is called
        once with the parent, the new table and a ``None`` foreign key, and
        no primary key is set.
        """
        # Setup
        metadata = Mock(spec_set=Metadata)
        metadata.get_tables.return_value = ['a_table', 'b_table']
        metadata._metadata = {'tables': dict()}

        # Run
        Metadata.add_table(metadata, 'x_table', parent='users')

        # Asserts
        expected_table_meta = {'fields': dict()}

        assert metadata._metadata['tables']['x_table'] == expected_table_meta

        # Previously a bare ``call_count == 0`` expression whose result was
        # discarded, so nothing was actually verified.
        metadata.set_primary_key.assert_not_called()
        metadata.add_relationship.assert_called_once_with(
            'users', 'x_table', None)