def test___init___copies_metadata():
    """Check that ``GaussianCopula.__init__`` does not share the metadata it is given.

    Two models are built from the same metadata object with different
    ``default_distribution`` values and fitted on the same data.

    Expected behavior:
        - Neither model's ``_metadata`` compares equal to the provided metadata.
        - The two models' ``_metadata`` do not compare equal to each other.
        - Every distribution reported by the first model is the gamma
          univariate, and every one reported by the second is the beta
          univariate.
    """
    # Setup
    metadata, data = load_tabular_demo('student_placements', metadata=True)
    gamma_name = 'copulas.univariate.gamma.GammaUnivariate'
    beta_name = 'copulas.univariate.beta.BetaUnivariate'

    # Run
    # Each model is fitted before the next one is created, so any change made
    # to a shared metadata object during fitting would be visible below.
    gamma_model = GaussianCopula(
        table_metadata=metadata,
        categorical_transformer='label_encoding',
        default_distribution='gamma',
    )
    gamma_model.fit(data)

    beta_model = GaussianCopula(
        table_metadata=metadata,
        categorical_transformer='label_encoding',
        default_distribution='beta',
    )
    beta_model.fit(data)

    # Assert
    assert gamma_model._metadata != metadata
    assert gamma_model._metadata != beta_model._metadata
    assert beta_model._metadata != metadata

    for distribution in gamma_model.get_distributions().values():
        assert distribution == gamma_name

    for distribution in beta_model.get_distributions().values():
        assert distribution == beta_name
def test_constraints(tmpdir):
    """Fit, save, reload and sample a ``GaussianCopula`` that uses three constraints.

    The model is pickled to ``test.pkl`` under ``tmpdir`` and loaded back
    before sampling, so the assertions run against the reloaded model.

    Expected behavior:
        - All 10 sampled rows are valid for the inequality, scalar range and
          fixed combinations constraints.
    """
    # Setup
    employees = load_tabular_demo()
    fixed_combinations = FixedCombinations(column_names=['company', 'department'])
    age_inequality = Inequality(low_column_name='age_when_joined', high_column_name='age')
    age_range = ScalarRange('age', 29, 50)
    model_path = tmpdir / 'test.pkl'

    # Run
    gc = GaussianCopula(
        constraints=[fixed_combinations, age_inequality, age_range],
        min_value=None,
        max_value=None,
    )
    gc.fit(employees)
    gc.save(model_path)
    gc = gc.load(model_path)
    sampled = gc.sample(10)

    # Assert
    for constraint in (age_inequality, age_range, fixed_combinations):
        assert all(constraint.is_valid(sampled))
def test_unique_combination_constraint():
    """Smoke-test ``CopulaGAN`` with a ``UniqueCombinations`` constraint.

    The test only checks that fitting and sampling 10 rows complete without
    raising; the sampled output is not inspected.
    """
    # Setup
    employees = load_tabular_demo()
    company_department = UniqueCombinations(
        columns=['company', 'department'],
        handling_strategy='transform',
    )
    copula_gan = CopulaGAN(constraints=[company_department])

    # Run
    copula_gan.fit(employees)
    copula_gan.sample(10)
def test_fixed_combination_constraint():
    """Fit ``CTGAN`` with a ``FixedCombinations`` constraint and validate its samples.

    Expected behavior:
        - All 10 sampled rows are valid for the company/department constraint.
    """
    # Setup
    employees = load_tabular_demo()
    company_department = FixedCombinations(column_names=['company', 'department'])
    ctgan = CTGAN(constraints=[company_department])

    # Run
    ctgan.fit(employees)
    sampled = ctgan.sample(10)

    # Assert
    validity = company_department.is_valid(sampled)
    assert all(validity)
def test_constraints_reject_sampling_zero_valid():
    """Check that sampling works when the first attempt yields no valid rows.

    Regression test for https://github.com/sdv-dev/SDV/issues/285

    The constraint's validity function and the ``_IS_VALID_CALLED`` container
    it presumably relies on are defined elsewhere in the module; the container
    is cleared here so each run starts from a clean state.
    """
    # Setup
    employees = load_tabular_demo()
    _IS_VALID_CALLED.clear()
    custom_constraint = CustomConstraint(is_valid=_is_valid)
    model = GaussianCopula(constraints=[custom_constraint])

    # Run
    model.fit(employees)
    model.sample(10)
def __init__(self, separation_char, input_data="", input_path=""):
    """Select a dataset by name and load it.

    Args:
        separation_char: Forwarded unchanged to ``self.load_sparse_data``
            (presumably the field delimiter of the ratings file -- confirm
            against that method).
        input_data: Dataset selector. ``"demo"`` loads the
            ``student_placements`` demo table into ``self.data``;
            ``"ml-100k"``, ``"own"`` and ``"amazon"`` load sparse data into
            ``self.sparse_data``. Any other value only prints a message.
        input_path: Stored as ``self.input_path`` when ``input_data`` is
            ``"own"``; ignored otherwise.
    """
    self.input_data = input_data

    if input_data == "demo":
        self.data = load_tabular_demo('student_placements')
        return

    if input_data not in ("ml-100k", "own", "amazon"):
        # Unsupported selector: report it and leave the instance without data.
        print(
            "Dataset was not provided or implementation is not yet prepared to handle this dataset.\nPlease try ml-100k or demo dataset."
        )
        return

    if input_data == "own":
        self.input_path = input_path
    elif input_data == "amazon":
        # NOTE(review): for "amazon" the selector stored in ``self.input_data``
        # is replaced by the CSV path -- confirm ``load_sparse_data`` expects this.
        self.input_data = f"{conf.DATA_DIR}/amazon/amazon_ratings.csv"

    self.sparse_data = self.load_sparse_data(separation_char)
def __init__(self, input_data="", input_path=""):
    """Set the activity thresholds, then select a dataset by name and load it.

    Args:
        input_data: Dataset selector. ``"demo"`` loads the
            ``student_placements`` demo table into ``self.data``;
            ``"ml-100k"`` and ``"own"`` populate the four active/inactive,
            dense/sparse attributes from ``self.load_data()``. Any other
            value only prints a message.
        input_path: Stored as ``self.input_path`` when ``input_data`` is
            ``"own"``; ignored otherwise.
    """
    # Thresholds are assigned before loading so ``self.load_data`` can read them.
    # Earlier notes in the code: 30 (max) and 5 (min) were used for the "own"
    # dataset, and a minimum of 350 leaves only 33 users.
    self.max_active = 1000
    self.min_active = 350
    self.input_data = input_data

    if input_data == "demo":
        self.data = load_tabular_demo('student_placements')
        return

    if input_data not in ("ml-100k", "own"):
        # Unsupported selector: report it and leave the instance without data.
        print(
            "Dataset was not provided or implementation is not yet prepared to handle this dataset.\nPlease try ml-100k or demo dataset."
        )
        return

    if input_data == "own":
        self.input_path = input_path

    (
        self.active_data_dense,
        self.inactive_data_dense,
        self.active_data_sparse,
        self.inactive_data_sparse,
    ) = self.load_data()
def test_constraints():
    """Smoke-test ``GaussianCopula`` with three constraints and mixed handling strategies.

    Combines a ``UniqueCombinations`` and a ``ColumnFormula`` constraint using
    the ``transform`` strategy with a ``GreaterThan`` constraint using
    ``reject_sampling``. The test only checks that fitting and sampling 10
    rows complete without raising; the sampled output is not inspected.
    """
    # Setup
    employees = load_tabular_demo()
    constraints = [
        UniqueCombinations(
            columns=['company', 'department'],
            handling_strategy='transform',
        ),
        GreaterThan(
            low='age_when_joined',
            high='age',
            handling_strategy='reject_sampling',
        ),
        ColumnFormula(
            column='years_in_the_company',
            formula=years_in_the_company,
            handling_strategy='transform',
        ),
    ]
    model = GaussianCopula(constraints=constraints)

    # Run
    model.fit(employees)
    model.sample(10)