Exemple #1
0
def test___init___copies_metadata():
    """Check that ``__init__`` does not share the metadata it is given.

    Two models are created from one metadata object, each with a different
    ``default_distribution``, and both are fitted on the same data. If the
    metadata were shared rather than copied, the comparisons below could
    not all hold.

    Setup:
        - Load the ``student_placements`` demo table together with its metadata.

    Expected behavior:
        - Each model's ``_metadata`` compares unequal to the provided metadata
          and to the other model's ``_metadata``.
        - Every distribution reported by the first model is the gamma
          univariate, and every one reported by the second is the beta one.
    """
    # Setup
    metadata, data = load_tabular_demo('student_placements', metadata=True)

    # Run
    gamma_model = GaussianCopula(
        table_metadata=metadata,
        categorical_transformer='label_encoding',
        default_distribution='gamma',
    )
    gamma_model.fit(data)
    beta_model = GaussianCopula(
        table_metadata=metadata,
        categorical_transformer='label_encoding',
        default_distribution='beta',
    )
    beta_model.fit(data)

    # Assert
    assert gamma_model._metadata != metadata
    assert gamma_model._metadata != beta_model._metadata
    assert beta_model._metadata != metadata

    gamma = 'copulas.univariate.gamma.GammaUnivariate'
    beta = 'copulas.univariate.beta.BetaUnivariate'
    gamma_distributions = gamma_model.get_distributions().values()
    assert all(name == gamma for name in gamma_distributions)
    beta_distributions = beta_model.get_distributions().values()
    assert all(name == beta for name in beta_distributions)
Exemple #2
0
def test_constraints(tmpdir):
    """Fit with three constraints, round-trip via save/load, then validate samples.

    The model is fitted on the demo employees table, saved to ``test.pkl``
    under ``tmpdir``, loaded back, and asked for 10 rows. Every sampled row
    must be reported valid by each of the three constraints.
    """
    # Setup
    employees = load_tabular_demo()
    fixed_combinations = FixedCombinations(column_names=['company', 'department'])
    age_inequality = Inequality(low_column_name='age_when_joined', high_column_name='age')
    age_range = ScalarRange('age', 29, 50)
    model_path = tmpdir / 'test.pkl'

    # Run
    model = GaussianCopula(
        constraints=[fixed_combinations, age_inequality, age_range],
        min_value=None,
        max_value=None,
    )
    model.fit(employees)
    model.save(model_path)
    model = model.load(model_path)
    sampled = model.sample(10)

    # Assert
    for constraint in (age_inequality, age_range, fixed_combinations):
        assert all(constraint.is_valid(sampled))
Exemple #3
0
def test_unique_combination_constraint():
    """Smoke-test ``CopulaGAN`` with a ``UniqueCombinations`` constraint.

    There are no assertions: the test passes when fitting on the demo
    employees table and sampling 10 rows both complete without raising.
    """
    # Setup
    employees = load_tabular_demo()
    constraint = UniqueCombinations(
        columns=['company', 'department'],
        handling_strategy='transform',
    )
    copula_gan = CopulaGAN(constraints=[constraint])

    # Run
    copula_gan.fit(employees)
    copula_gan.sample(10)
Exemple #4
0
def test_fixed_combination_constraint():
    """Check that ``CTGAN`` samples satisfy a ``FixedCombinations`` constraint.

    The constraint covers the ``company`` and ``department`` columns of the
    demo employees table; all 10 sampled rows must be reported valid by it.
    """
    # Setup
    employees = load_tabular_demo()
    constraint = FixedCombinations(column_names=['company', 'department'])
    ctgan = CTGAN(constraints=[constraint])

    # Run
    ctgan.fit(employees)
    synthetic = ctgan.sample(10)

    # Assert
    validity = constraint.is_valid(synthetic)
    assert all(validity)
Exemple #5
0
def test_constraints_reject_sampling_zero_valid():
    """Ensure sampling still works when the first attempt yields no valid rows.

    Regression test for https://github.com/sdv-dev/SDV/issues/285.

    There are no assertions: the test passes when fitting and sampling
    complete without raising.
    """
    # Setup
    employees = load_tabular_demo()
    # Clear the module-level call record before building the constraint, so
    # state left over from an earlier use does not carry into this test.
    _IS_VALID_CALLED.clear()
    custom_constraint = CustomConstraint(is_valid=_is_valid)

    # Run
    model = GaussianCopula(constraints=[custom_constraint])
    model.fit(employees)
    model.sample(10)
Exemple #6
0
 def __init__(self, separation_char, input_data="", input_path=""):
     """Store the dataset selector and load the matching data.

     Args:
         separation_char: Forwarded unchanged to ``self.load_sparse_data``.
         input_data: Dataset selector. ``"demo"`` loads the
             ``student_placements`` demo table into ``self.data``;
             ``"ml-100k"``, ``"own"`` and ``"amazon"`` populate
             ``self.sparse_data`` via ``self.load_sparse_data``. Any other
             value only prints a hint and loads nothing.
         input_path: Stored as ``self.input_path`` only when ``input_data``
             is ``"own"``.
     """
     self.input_data = input_data

     if input_data == "demo":
         self.data = load_tabular_demo('student_placements')
         return

     if input_data not in ("ml-100k", "own", "amazon"):
         # Unknown selector: report it and leave the instance without data.
         print(
             "Dataset was not provided or implementation is not yet prepared to handle this dataset.\n"
             "Please try ml-100k or demo dataset."
         )
         return

     if input_data == "own":
         self.input_path = input_path
     elif input_data == "amazon":
         # NOTE(review): this replaces the selector stored in
         # ``self.input_data`` with a file path rather than setting
         # ``self.input_path`` -- confirm against ``load_sparse_data``
         # that this is intended.
         self.input_data = f"{conf.DATA_DIR}/amazon/amazon_ratings.csv"

     self.sparse_data = self.load_sparse_data(separation_char)
    def __init__(self, input_data="", input_path=""):
        """Set the activity thresholds and load the selected dataset.

        Args:
            input_data: Dataset selector. ``"demo"`` loads the
                ``student_placements`` demo table into ``self.data``;
                ``"ml-100k"`` and ``"own"`` fill the four active/inactive
                dense/sparse attributes from ``self.load_data()``. Any other
                value only prints a hint and loads nothing.
            input_path: Stored as ``self.input_path`` only when ``input_data``
                is ``"own"``.
        """
        self.input_data = input_data
        # Thresholds carried over with the original author's notes:
        # max_active was 30 for the "own" dataset; min_active was 5 for the
        # "own" dataset, and the value 350 was annotated "only 33 users".
        self.max_active = 1000
        self.min_active = 350

        if input_data == "demo":
            self.data = load_tabular_demo('student_placements')
            return

        if input_data not in ("ml-100k", "own"):
            # Unknown selector: report it and leave the instance without data.
            print(
                "Dataset was not provided or implementation is not yet prepared to handle this dataset.\n"
                "Please try ml-100k or demo dataset."
            )
            return

        if input_data == "own":
            self.input_path = input_path

        (
            self.active_data_dense,
            self.inactive_data_dense,
            self.active_data_sparse,
            self.inactive_data_sparse,
        ) = self.load_data()
Exemple #8
0
def test_constraints():
    """Smoke-test ``GaussianCopula`` with three constraints applied together.

    The constraints are a ``UniqueCombinations`` on company/department, a
    ``GreaterThan`` between ``age_when_joined`` and ``age`` handled by reject
    sampling, and a ``ColumnFormula`` for ``years_in_the_company``.

    There are no assertions: the test passes when fitting on the demo
    employees table and sampling 10 rows both complete without raising.
    """
    # Setup
    employees = load_tabular_demo()

    unique_combinations = UniqueCombinations(
        columns=['company', 'department'],
        handling_strategy='transform',
    )
    greater_than = GreaterThan(
        low='age_when_joined',
        high='age',
        handling_strategy='reject_sampling',
    )
    column_formula = ColumnFormula(
        column='years_in_the_company',
        formula=years_in_the_company,
        handling_strategy='transform',
    )

    # Run
    model = GaussianCopula(
        constraints=[unique_combinations, greater_than, column_formula],
    )
    model.fit(employees)
    model.sample(10)