def test_criterion():
    """Every supported criterion is accepted; an unknown one raises ValueError."""
    for criterion in ("bic", "aic", "gini"):
        glmdisc.Glmdisc(criterion=criterion)
    with pytest.raises(ValueError):
        glmdisc.Glmdisc(criterion="toto")
def test_test():
    """`test` must be a boolean; non-boolean values raise ValueError."""
    for valid in (True, False):
        glmdisc.Glmdisc(test=valid)
    for invalid in ("some string", 12):
        with pytest.raises(ValueError):
            glmdisc.Glmdisc(test=invalid)
def test_validation():
    """`validation` must be a boolean; non-boolean values raise ValueError."""
    for valid in (True, False):
        glmdisc.Glmdisc(validation=valid)
    for invalid in ("some string", 12):
        with pytest.raises(ValueError):
            glmdisc.Glmdisc(validation=invalid)
def test_not_fit_nn():
    """An unfitted NN model raises NotFittedError, and fitting with fewer
    iterations than ``burn_in`` must leave the model unfitted as well.

    BUG FIX: renamed from ``test_not_fit`` — a second ``test_not_fit``
    defined later in this module shadowed this function, so pytest never
    collected or ran it.
    """
    model = glmdisc.Glmdisc(algorithm="NN", test=False, validation=False)
    with pytest.raises(glmdisc.NotFittedError):
        model._check_is_fitted()
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    # iter (11) < burn_in (20): the fit cannot complete a usable model.
    model = glmdisc.Glmdisc(algorithm="NN", test=False, validation=False,
                            burn_in=20)
    with pytest.raises(glmdisc.NotFittedError):
        model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
def test_validation_criterion(caplog):
    """Combining a penalized criterion with validation=True logs a warning
    that plain log-likelihood will be used instead.
    """
    expected = ('No need to penalize the log-likelihood when a validation set is used. '
                'Using log-likelihood instead.')
    glmdisc.Glmdisc(validation=True, criterion='aic')
    assert caplog.records[-1].message == expected
    glmdisc.Glmdisc(validation=True, criterion='bic')
    # BUG FIX: the original re-checked caplog.records[0] here, so the 'bic'
    # warning was never actually asserted; records[-1] is the newest record.
    assert caplog.records[-1].message == expected
def test_iter():
    """`iter` values outside the accepted range raise ValueError."""
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    for bad_iter in (-12, 100000000):
        with pytest.raises(ValueError):
            model = glmdisc.Glmdisc()
            model.fit(predictors_cont=x, predictors_qual=None, labels=y,
                      iter=bad_iter)
def test_plot_log(caplog):
    """plot() succeeds on valid column indices and logs an error message
    for out-of-range ones.
    """
    cont_msg = ("A single int (more than 0 and less than the "
                "number of columns in predictors_cont) must be "
                "provided for predictors_cont_number")
    qual_msg = ("A single int (more than 0 and less than the "
                "number of columns in predictors_qual) must be "
                "provided for predictors_qual_number")
    n = 500
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    cuts = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])
    model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)
    model.fit(predictors_cont=x, predictors_qual=xd, labels=y, iter=50)
    # Valid indices: no error messages in the last two records.
    model.plot(predictors_cont_number=0, predictors_qual_number=0)
    assert caplog.records[-2].message != cont_msg
    assert caplog.records[-1].message != qual_msg
    # Negative qualitative index: raises and logs the qualitative message.
    with pytest.raises(ValueError):
        model.plot(predictors_cont_number=0, predictors_qual_number=-1)
    assert caplog.records[-1].message == qual_msg
    # Negative continuous index: raises and logs the continuous message.
    with pytest.raises(ValueError):
        model.plot(predictors_cont_number=-1, predictors_qual_number=0)
    assert caplog.records[-1].message == cont_msg
def test_best_formula(caplog):
    """best_formula() returns one list per variable and logs what it found
    (or did not find) for each continuous and categorical variable.
    """
    n = 200
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    cuts = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])
    model = glmdisc.Glmdisc(validation=False, test=False)
    model.fit(predictors_cont=x, predictors_qual=xd, labels=y, iter=11)
    formula = model.best_formula()
    assert isinstance(formula, list)
    assert len(formula) == 2 * d
    for entry in formula:
        assert isinstance(entry, list)
    # The last four records report cut-points / regroupments per variable,
    # in order: continuous 0, continuous 1, categorical 0, categorical 1.
    expectations = (
        (-4, "No cut-points found for continuous variable 0",
         "Cut-points found for continuous variable 0"),
        (-3, "No cut-points found for continuous variable 1",
         "Cut-points found for continuous variable 1"),
        (-2, "No regroupments made for categorical variable 0",
         "Regroupments made for categorical variable 0"),
        (-1, "No regroupments made for categorical variable 1",
         "Regroupments made for categorical variable 1"),
    )
    for idx, absent, present in expectations:
        message = caplog.records[idx].message
        assert (absent in message) | (present in message)
def test_args_fit():
    """fit() rejects invalid predictor/label combinations with ValueError."""
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc()
    cuts = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])
    invalid_calls = (
        dict(predictors_cont=x, predictors_qual=None, labels=[]),
        dict(predictors_cont="blabla", predictors_qual=None, labels=[]),
        dict(predictors_cont=None, predictors_qual="blabla", labels=[]),
        dict(predictors_cont=None, predictors_qual=None, labels=y),
        dict(predictors_cont=x, predictors_qual=None, labels=y[0:50]),
        dict(predictors_cont=None, predictors_qual=xd, labels=y[0:50]),
    )
    for kwargs in invalid_calls:
        with pytest.raises(ValueError):
            model.fit(**kwargs)
def test_nan():
    """fit() tolerates NaN entries in the continuous predictors."""
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    # Inject missing values in two different rows/columns.
    x[0, 0] = np.nan
    x[90, 1] = np.nan
    model = glmdisc.Glmdisc(criterion="bic")
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
def test_calculate_shape_categorical():
    """_calculate_shape() reports dimensions for categorical-only data."""
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc()
    cuts = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])
    model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=11)
    continu_complete_case = model._calculate_shape()
    assert model.n == n
    assert model.d_cont == 0
    assert model.d_qual == d
    # No continuous predictors, hence no complete-case mask.
    assert continu_complete_case is None
    # A smaller m_start must also fit without error.
    model = glmdisc.Glmdisc(m_start=2)
    model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=11)
def test_discrete_data_val_cont():
    """discrete_data() on an NN model with a validation split returns the
    validation subset as a dense array.
    """
    n = 1000
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(algorithm="NN", validation=True, test=False)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=80)
    result = model.discrete_data()
    assert isinstance(result, np.ndarray)
    assert result.shape[0] == 400
def test_discretize_cont_nn():
    """discretize() works on continuous predictors with the NN algorithm.

    BUG FIX: renamed from ``test_discretize_cont`` — a second
    ``test_discretize_cont`` defined later in this module shadowed this
    function, so pytest never collected or ran it.
    """
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)
    # Seed both RNGs so the fit is reproducible.
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    model.discretize(predictors_cont=x, predictors_qual=None)
def test_predict_new():
    """predict() on unseen rows yields one probability row per sample,
    with entries in (0, 1) that sum to 1.
    """
    n = 200
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)
    # Train on the first half, predict on the second half.
    model.fit(predictors_cont=x[0:100], predictors_qual=None,
              labels=y[0:100], iter=11)
    results = model.predict(predictors_cont=x[100:200], predictors_qual=None)
    assert results.shape == (100, 2)
    assert (results > 0).all() and (results < 1).all()
    assert (results.sum(axis=1) == 1).all()
def test_discretize_cont():
    """discretize() output on continuous predictors can be re-encoded by
    the fitted best encoder.
    """
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(validation=False, test=False)
    # Seed both RNGs so the fit is reproducible.
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    emap = model.discretize(predictors_cont=x, predictors_qual=None)
    model.best_encoder_emap.transform(emap.astype(int).astype(str))
def test_discrete_data_test_cont():
    """discrete_data() on an NN model with a test split returns the test
    subset as a dense array.
    """
    n = 500
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=True)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=50)
    result = model.discrete_data()
    assert isinstance(result, np.ndarray)
    assert result.shape[0] == 200
def test_discrete_data_cont():
    """discrete_data() on a SEM model with no splits returns all rows as a
    sparse matrix.
    """
    n = 500
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(validation=False, test=False)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=50)
    result = model.discrete_data()
    assert isinstance(result, scipy.sparse.csr.csr_matrix)
    assert result.shape[0] == 500
def test_init():
    """Construction sets the documented defaults; a bad algorithm raises."""
    instance = glmdisc.Glmdisc()
    assert instance.algorithm == "SEM"
    instance = glmdisc.Glmdisc(algorithm="NN")
    assert instance.algorithm == "NN"
    assert instance.test
    assert instance.validation
    assert instance.criterion == "bic"
    assert instance.m_start == 20
    assert instance.criterion_iter == []
    assert instance.best_link == []
    assert instance.best_reglog is None
    assert instance.affectations == []
    assert instance.best_encoder_emap is None
    assert instance.performance == -np.inf
    # The three row-index arrays all start out empty.
    for attr in ("train", "validate", "test_rows"):
        np.testing.assert_array_equal(getattr(instance, attr), np.array([]))
    with pytest.raises(ValueError):
        glmdisc.Glmdisc(algorithm="toto")
def test_kwargs_fit(caplog):
    """Unknown keyword arguments to fit() are ignored with a log message
    when the algorithm is SEM.
    """
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(validation=False, test=False)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, toto="tata")
    assert "**kwargs not used for algorithm = 'SEM'" in caplog.records[0].message
def test_calculate_shape_continu():
    """_calculate_shape() reports dimensions for continuous-only data."""
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc()
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    continu_complete_case = model._calculate_shape()
    assert model.n == n
    assert model.d_cont == d
    assert model.d_qual == 0
    assert continu_complete_case.shape == (100, 2)
    # BUG FIX: the original asserted `continu_complete_case.all` — the bound
    # method object, which is always truthy, so nothing was checked. Call it
    # to actually verify every entry is a complete case (x has no NaN here).
    assert continu_complete_case.all()
def test_split():
    """The train/validate/test split is reproducible under a fixed seed and
    honours the `validation` and `test` constructor flags.
    """
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    # Two identically-seeded fits must produce identical splits.
    model = glmdisc.Glmdisc()
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    training = model.train
    validating = model.validate
    testing = model.test_rows
    model = glmdisc.Glmdisc()
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    np.testing.assert_array_equal(training, model.train)
    np.testing.assert_array_equal(validating, model.validate)
    np.testing.assert_array_equal(testing, model.test_rows)
    assert len(model.train) > 0
    assert len(model.validate) > 0
    assert len(model.test_rows) > 0
    # Disabling validation and/or test drops the corresponding subset.
    for kwargs, has_validate, has_test in (
            (dict(validation=False), False, True),
            (dict(test=False), True, False),
            (dict(validation=False, test=False), False, False)):
        model = glmdisc.Glmdisc(**kwargs)
        model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
        assert len(model.train) > 0
        if has_validate:
            assert len(model.validate) > 0
        else:
            assert model.validate is None
        if has_test:
            assert len(model.test_rows) > 0
        else:
            assert model.test_rows is None
def test_best_formula_no_cont():
    """best_formula() runs when only categorical predictors were fitted."""
    n = 200
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    cuts = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])
    model = glmdisc.Glmdisc(validation=False, test=False)
    model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=11)
    model.best_formula()
def test_kwargs():
    """NN fit accepts plot/optim/callbacks keyword arguments."""
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y,
              plot=True, optim=Adagrad(), callbacks=EarlyStopping())
    # NOTE(review): the supplied Adagrad optimizer appears to be ignored in
    # favour of Adam while the callback is honoured — confirm this is the
    # intended contract of fit()'s kwargs.
    assert isinstance(model.model_nn.optimizer,
                      tensorflow.python.keras.optimizer_v2.adam.Adam)
    assert isinstance(model.callbacks[-1],
                      tensorflow.python.keras.callbacks.EarlyStopping)
def test_discretize_qual_nn():
    """discretize() works on categorical predictors with the NN algorithm.

    BUG FIX: renamed from ``test_discretize_qual`` — a second
    ``test_discretize_qual`` defined later in this module shadowed this
    function, so pytest never collected or ran it.
    """
    n = 500
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    cuts = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])
    model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)
    # Seed both RNGs so the fit is reproducible.
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=50)
    model.discretize(predictors_cont=None, predictors_qual=xd)
def test_not_fit():
    """Search over seeds for a fit that ends unfitted, then assert that
    check_is_fitted() raises NotFittedError for it.
    """
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc()
    # NOTE(review): this uses the public check_is_fitted() while other tests
    # call _check_is_fitted() — confirm both entry points exist.
    for seed in range(100):
        random.seed(seed)
        np.random.seed(seed)
        model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
        try:
            model.check_is_fitted()
        except glmdisc.NotFittedError:
            with pytest.raises(glmdisc.NotFittedError):
                model.check_is_fitted()
            break
def test_calculate_criterion():
    """_calculate_criterion() behaves consistently across criteria:

    - with a validation set, bic and aic reduce to the same log-likelihood;
    - without validation, aic and bic differ by (log(n) - 2) * n_coefs;
    - gini always lies in [0, 1].
    """
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(algorithm="NN", criterion="bic")
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    # Random discretization map with 3 levels (1..3) per column.
    emap = np.resize(
        np.array([np.where(np.random.multinomial(1, pvals=[0.33, 0.33, 0.34]))[0][0] + 1
                  for _ in range(n * d)]),
        (n, d))
    current_encoder_emap = sk.preprocessing.OneHotEncoder()
    current_encoder_emap.fit(X=emap.astype(str))
    model_emap = sk.linear_model.LogisticRegression(solver='liblinear',
                                                    C=1e40,
                                                    tol=0.001,
                                                    max_iter=25,
                                                    warm_start=False)
    model_emap.fit(X=current_encoder_emap.transform(emap.astype(str)), y=y)
    modele_bic = model._calculate_criterion(emap, model_emap,
                                            current_encoder_emap)
    assert modele_bic < 0
    # With validation, aic equals bic (both are plain log-likelihood).
    model = glmdisc.Glmdisc(algorithm="NN", criterion="aic")
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert math.isclose(
        model._calculate_criterion(emap, model_emap, current_encoder_emap),
        modele_bic)
    # Without validation the penalties differ by (log(n) - 2) per coef.
    model = glmdisc.Glmdisc(algorithm="NN", validation=False)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    modele_bic = model._calculate_criterion(emap, model_emap,
                                            current_encoder_emap)
    model = glmdisc.Glmdisc(algorithm="NN", criterion="aic", validation=False)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert math.isclose(
        model._calculate_criterion(emap, model_emap, current_encoder_emap),
        modele_bic + (math.log(model.n) - 2) * model_emap.coef_.shape[1])
    # Gini stays within [0, 1] with and without validation.
    model = glmdisc.Glmdisc(algorithm="NN", criterion="gini")
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert 0 <= model._calculate_criterion(emap, model_emap,
                                           current_encoder_emap) <= 1
    model = glmdisc.Glmdisc(algorithm="NN", criterion="gini", validation=False)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert 0 <= model._calculate_criterion(emap, model_emap,
                                           current_encoder_emap) <= 1
def test_discretize_qual():
    """discretize() output on categorical predictors can be re-encoded by
    the fitted best encoder.
    """
    n = 500
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    cuts = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])
    model = glmdisc.Glmdisc(validation=False, test=False)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=50)
    emap = model.discretize(predictors_cont=None, predictors_qual=xd)
    model.best_encoder_emap.transform(emap.astype(int).astype(str))
def test_fit():
    """NN fit() runs for every criterion under each validation/test setup."""
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    for flags in (dict(validation=True),
                  dict(validation=False),
                  dict(test=False, validation=False)):
        for criterion in ("bic", "aic", "gini"):
            model = glmdisc.Glmdisc(algorithm="NN", criterion=criterion,
                                    **flags)
            model.fit(predictors_cont=x, predictors_qual=None, labels=y,
                      iter=20)
def test_discrete_data_test_qual():
    """discrete_data() on an NN model with categorical predictors and a
    test split returns the test subset as a dense array.
    """
    n = 500
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    cuts = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])
    model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=True)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=50)
    result = model.discrete_data()
    assert isinstance(result, np.ndarray)
    assert result.shape[0] == 200
def test_discrete_data_val_both():
    """discrete_data() with mixed predictors and a validation split returns
    the validation subset as a sparse matrix.
    """
    n = 1000
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    cuts = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])
    model = glmdisc.Glmdisc(validation=True, test=False)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=xd, labels=y, iter=100)
    result = model.discrete_data()
    assert isinstance(result, scipy.sparse.csr.csr_matrix)
    assert result.shape[0] == 400