Beispiel #1
0
def test_criterion():
    """Construct Glmdisc with each supported criterion and reject an unknown one.

    The three names "bic", "aic" and "gini" are expected to be accepted by
    the constructor; any other name is expected to raise ``ValueError``.
    """
    for accepted in ("bic", "aic", "gini"):
        glmdisc.Glmdisc(criterion=accepted)

    with pytest.raises(ValueError):
        glmdisc.Glmdisc(criterion="toto")
Beispiel #2
0
def test_test():
    """Check the ``test`` constructor argument: booleans pass, other types fail.

    A string and an int are both expected to raise ``ValueError``.
    """
    for flag in (True, False):
        glmdisc.Glmdisc(test=flag)

    for rejected in ("some string", 12):
        with pytest.raises(ValueError):
            glmdisc.Glmdisc(test=rejected)
Beispiel #3
0
def test_validation():
    """Check the ``validation`` constructor argument: booleans pass, others fail.

    A string and an int are both expected to raise ``ValueError``.
    """
    for flag in (True, False):
        glmdisc.Glmdisc(validation=flag)

    for rejected in ("some string", 12):
        with pytest.raises(ValueError):
            glmdisc.Glmdisc(validation=rejected)
Beispiel #4
0
def test_not_fit():
    """Expect NotFittedError from an unfitted NN model and from a short fit.

    First, ``_check_is_fitted`` is called on a freshly constructed model.
    Then a second model with ``burn_in=20`` is fitted with ``iter=11`` and the
    fit itself is expected to raise ``NotFittedError``.
    """
    unfitted = glmdisc.Glmdisc(algorithm="NN", test=False, validation=False)
    with pytest.raises(glmdisc.NotFittedError):
        unfitted._check_is_fitted()

    n_samples, n_features = 100, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    # NOTE(review): presumably the error comes from iter (11) being smaller
    # than burn_in (20) -- confirm against the Glmdisc implementation.
    short_run = glmdisc.Glmdisc(algorithm="NN", test=False, validation=False, burn_in=20)
    with pytest.raises(glmdisc.NotFittedError):
        short_run.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
Beispiel #5
0
def test_validation_criterion(caplog):
    """Check the log message emitted when a penalized criterion meets validation.

    For both 'aic' and 'bic', constructing a model with ``validation=True`` is
    expected to log that no penalization is needed.

    Args:
        caplog: pytest's log-capturing fixture.
    """
    expected = (
        'No need to penalize the log-likelihood when a validation set is used. '
        'Using log-likelihood instead.')

    for criterion in ('aic', 'bic'):
        # Start from an empty capture so that records[0] belongs to THIS
        # construction; otherwise the second case would silently re-check the
        # record produced by the first one.
        caplog.clear()
        glmdisc.Glmdisc(validation=True, criterion=criterion)
        assert caplog.records[0].message == expected
Beispiel #6
0
def test_iter():
    """Expect ``fit`` to raise ValueError for out-of-range ``iter`` values.

    Both a negative value and a very large value (100000000) are tried.
    """
    n_samples, n_features = 100, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    for bad_iter in (-12, 100000000):
        # Build the model outside the ``raises`` block so that only ``fit``
        # can satisfy the expectation; a ValueError from the constructor
        # would otherwise make the test pass for the wrong reason.
        model = glmdisc.Glmdisc()
        with pytest.raises(ValueError):
            model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=bad_iter)
Beispiel #7
0
def test_plot_log(caplog):
    """Check ``plot`` argument validation and the messages it logs.

    A valid call must not leave either validation message at the tail of the
    captured log; a negative column index for either argument must raise
    ``ValueError`` and leave the matching message as the last record.

    Args:
        caplog: pytest's log-capturing fixture.
    """
    cont_message = (
        "A single int (more than 0 and less than the "
        "number of columns in predictors_cont) must be "
        "provided for predictors_cont_number")
    qual_message = (
        "A single int (more than 0 and less than the "
        "number of columns in predictors_qual) must be "
        "provided for predictors_qual_number")

    n_samples = 500
    n_features = 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    # Build a 3-level categorical copy of x by binning every column.
    bin_edges = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for col in range(n_features):
        xd[:, col] = pd.cut(x[:, col], bins=bin_edges, labels=[0, 1, 2])

    nn_model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)
    nn_model.fit(predictors_cont=x, predictors_qual=xd, labels=y, iter=50)

    # Valid indices: neither validation message should have been logged last.
    nn_model.plot(predictors_cont_number=0, predictors_qual_number=0)
    assert caplog.records[-2].message != cont_message
    assert caplog.records[-1].message != qual_message

    # Invalid categorical index.
    with pytest.raises(ValueError):
        nn_model.plot(predictors_cont_number=0, predictors_qual_number=-1)
    assert caplog.records[-1].message == qual_message

    # Invalid continuous index.
    with pytest.raises(ValueError):
        nn_model.plot(predictors_cont_number=-1, predictors_qual_number=0)
    assert caplog.records[-1].message == cont_message
Beispiel #8
0
def test_best_formula(caplog):
    """Check the structure returned by ``best_formula`` and what it logs.

    The result must be a list of ``2 * d`` lists, and the last four log
    records must report, in order, the outcome for continuous variables 0 and
    1 and categorical variables 0 and 1 (either outcome is accepted).

    Args:
        caplog: pytest's log-capturing fixture.
    """
    n_samples = 200
    n_features = 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    # Build a 3-level categorical copy of x by binning every column.
    bin_edges = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for col in range(n_features):
        xd[:, col] = pd.cut(x[:, col], bins=bin_edges, labels=[0, 1, 2])

    model = glmdisc.Glmdisc(validation=False, test=False)
    model.fit(predictors_cont=x, predictors_qual=xd, labels=y, iter=11)
    formula = model.best_formula()

    assert isinstance(formula, list)
    assert len(formula) == 2 * n_features
    for entry in formula:
        assert isinstance(entry, list)

    # (record position, "nothing found" text, "something found" text)
    expected_logs = (
        (-4,
         "No cut-points found for continuous variable 0",
         "Cut-points found for continuous variable 0"),
        (-3,
         "No cut-points found for continuous variable 1",
         "Cut-points found for continuous variable 1"),
        (-2,
         "No regroupments made for categorical variable 0",
         "Regroupments made for categorical variable 0"),
        (-1,
         "No regroupments made for categorical variable 1",
         "Regroupments made for categorical variable 1"),
    )
    for position, unchanged_text, changed_text in expected_logs:
        logged = caplog.records[position].message
        assert unchanged_text in logged or changed_text in logged
Beispiel #9
0
def test_args_fit():
    """Expect ``fit`` to raise ValueError for each malformed argument set.

    Covered cases: empty labels, string predictors, no predictors at all, and
    labels whose length does not match the predictors.
    """
    n_samples = 100
    n_features = 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)
    model = glmdisc.Glmdisc()

    # Build a 3-level categorical copy of x by binning every column.
    bin_edges = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for col in range(n_features):
        xd[:, col] = pd.cut(x[:, col], bins=bin_edges, labels=[0, 1, 2])

    invalid_calls = (
        {"predictors_cont": x, "predictors_qual": None, "labels": []},
        {"predictors_cont": "blabla", "predictors_qual": None, "labels": []},
        {"predictors_cont": None, "predictors_qual": "blabla", "labels": []},
        {"predictors_cont": None, "predictors_qual": None, "labels": y},
        {"predictors_cont": x, "predictors_qual": None, "labels": y[0:50]},
        {"predictors_cont": None, "predictors_qual": xd, "labels": y[0:50]},
    )
    for fit_arguments in invalid_calls:
        with pytest.raises(ValueError):
            model.fit(**fit_arguments)
Beispiel #10
0
def test_nan():
    """Fit a model on continuous data containing NaN and expect no exception.

    One NaN is injected in each of the two columns before fitting.
    """
    n_samples, n_features = 100, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    for row, col in ((0, 0), (90, 1)):
        x[row, col] = np.nan

    bic_model = glmdisc.Glmdisc(criterion="bic")
    bic_model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
Beispiel #11
0
def test_calculate_shape_categorical():
    """Check ``_calculate_shape`` bookkeeping when only categorical data is fitted.

    After the fit, ``n`` must equal the sample count, ``d_cont`` must be 0,
    ``d_qual`` must equal the number of columns, and ``_calculate_shape`` must
    return ``None``. A second fit with ``m_start=2`` must simply run.
    """
    n_samples = 100
    n_features = 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)
    default_model = glmdisc.Glmdisc()

    # Build a 3-level categorical copy of x by binning every column.
    bin_edges = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for col in range(n_features):
        xd[:, col] = pd.cut(x[:, col], bins=bin_edges, labels=[0, 1, 2])

    default_model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=11)
    complete_cases = default_model._calculate_shape()
    assert default_model.n == n_samples
    assert default_model.d_cont == 0
    assert default_model.d_qual == n_features
    assert complete_cases is None

    small_start_model = glmdisc.Glmdisc(m_start=2)
    small_start_model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=11)
Beispiel #12
0
def test_discrete_data_val_cont():
    """Check ``discrete_data`` for an NN model fitted with a validation set.

    With 1000 continuous samples, the result is expected to be a NumPy array
    with 400 rows.
    """
    n_samples, n_features = 1000, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    nn_model = glmdisc.Glmdisc(algorithm="NN", validation=True, test=False)
    nn_model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=80)

    discretized = nn_model.discrete_data()
    assert isinstance(discretized, np.ndarray)
    assert discretized.shape[0] == 400
Beispiel #13
0
def test_discretize_cont():
    """Fit an NN model on continuous data, then discretize that same data.

    The test only checks that ``discretize`` runs without raising.
    """
    n_samples, n_features = 100, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)
    nn_model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)

    # Seed both RNGs right before fitting.
    random.seed(1)
    np.random.seed(1)
    nn_model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)

    nn_model.discretize(predictors_cont=x, predictors_qual=None)
def test_predict_new():
    """Fit on the first 100 rows and check predictions on the next 100.

    The prediction is expected to have shape (100, 2), to contain only values
    strictly between 0 and 1, and to have rows that sum to 1.
    """
    n = 200
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)
    model.fit(predictors_cont=x[0:100], predictors_qual=None, labels=y[0:100], iter=11)
    results = model.predict(predictors_cont=x[100:200], predictors_qual=None)
    assert results.shape == (100, 2)
    assert (results > 0).all() and (results < 1).all()
    # Compare with a tolerance: the row sums are floating-point values, so
    # exact equality with 1 can fail on rounding error alone.
    assert np.allclose(results.sum(axis=1), 1)
Beispiel #15
0
def test_discretize_cont():
    """Discretize continuous data with the default algorithm and encode it.

    The discretized output is cast to int, then to str, and passed to the
    model's ``best_encoder_emap``; the test only checks that this runs.
    """
    n_samples, n_features = 100, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)
    model = glmdisc.Glmdisc(validation=False, test=False)

    # Seed both RNGs right before fitting.
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)

    discretized = model.discretize(predictors_cont=x, predictors_qual=None)
    as_labels = discretized.astype(int).astype(str)
    model.best_encoder_emap.transform(as_labels)
Beispiel #16
0
def test_discrete_data_test_cont():
    """Check ``discrete_data`` for an NN model fitted with a test set.

    With 500 continuous samples, the result is expected to be a NumPy array
    with 200 rows.
    """
    n_samples, n_features = 500, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)
    nn_model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=True)

    # Seed both RNGs right before fitting.
    random.seed(1)
    np.random.seed(1)
    nn_model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=50)

    discretized = nn_model.discrete_data()
    assert isinstance(discretized, np.ndarray)
    assert discretized.shape[0] == 200
Beispiel #17
0
def test_discrete_data_cont():
    """Check ``discrete_data`` for the default algorithm without any split.

    With 500 continuous samples and neither validation nor test set, the
    result is expected to be a SciPy CSR sparse matrix with 500 rows.
    """
    n = 500
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(validation=False, test=False)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=50)
    result = model.discrete_data()
    # Use the public class path: the ``scipy.sparse.csr`` submodule is a
    # deprecated private namespace and is not guaranteed to stay importable.
    assert isinstance(result, scipy.sparse.csr_matrix)
    assert result.shape[0] == 500
Beispiel #18
0
def test_init():
    """Check constructor defaults and rejection of an unknown algorithm.

    The default algorithm must be "SEM". The remaining attribute checks are
    made on an instance built with ``algorithm="NN"``.
    """
    default_instance = glmdisc.Glmdisc()
    assert default_instance.algorithm == "SEM"

    nn_instance = glmdisc.Glmdisc(algorithm="NN")
    assert nn_instance.algorithm == "NN"
    assert nn_instance.test
    assert nn_instance.validation
    assert nn_instance.criterion == "bic"
    assert nn_instance.m_start == 20
    assert nn_instance.criterion_iter == []
    assert nn_instance.best_link == []
    assert nn_instance.best_reglog is None
    assert nn_instance.affectations == []
    assert nn_instance.best_encoder_emap is None
    assert nn_instance.performance == -np.inf

    # The three row-index containers must all start out empty.
    empty = np.array([])
    np.testing.assert_array_equal(nn_instance.train, empty)
    np.testing.assert_array_equal(nn_instance.validate, empty)
    np.testing.assert_array_equal(nn_instance.test_rows, empty)

    with pytest.raises(ValueError):
        glmdisc.Glmdisc(algorithm="toto")
Beispiel #19
0
def test_kwargs_fit(caplog):
    """Check that an unknown keyword passed to ``fit`` is reported in the log.

    The model uses the default algorithm; the first captured record is
    expected to mention that **kwargs are not used for 'SEM'.

    Args:
        caplog: pytest's log-capturing fixture.
    """
    n_samples, n_features = 100, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)
    sem_model = glmdisc.Glmdisc(validation=False, test=False)

    sem_model.fit(predictors_cont=x, predictors_qual=None, labels=y, toto="tata")

    first_message = caplog.records[0].message
    assert "**kwargs not used for algorithm = 'SEM'" in first_message
Beispiel #20
0
def test_calculate_shape_continu():
    """Check ``_calculate_shape`` bookkeeping when only continuous data is fitted.

    After the fit, ``n`` must equal the sample count, ``d_cont`` must equal the
    number of columns, ``d_qual`` must be 0, and the value returned by
    ``_calculate_shape`` must have shape (100, 2) with every entry truthy.
    """
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc()
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    continu_complete_case = model._calculate_shape()
    assert model.n == n
    assert model.d_cont == d
    assert model.d_qual == 0
    assert continu_complete_case.shape == (100, 2)
    # Call all(): asserting the bare bound method is always truthy and
    # therefore checks nothing.
    assert continu_complete_case.all()
Beispiel #21
0
def test_split():
    """Check the train / validation / test row splits produced by ``fit``.

    Two fits under the same seeds must yield identical splits. Disabling
    validation and/or test must leave the matching attribute set to ``None``
    while the training rows stay non-empty.
    """
    n_samples, n_features = 100, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    # Reference fit under a fixed seed.
    reference = glmdisc.Glmdisc()
    random.seed(1)
    np.random.seed(1)
    reference.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    reference_train = reference.train
    reference_validate = reference.validate
    reference_test = reference.test_rows

    # Same seed, fresh model: the splits must be reproduced exactly.
    repeated = glmdisc.Glmdisc()
    random.seed(1)
    np.random.seed(1)
    repeated.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    np.testing.assert_array_equal(reference_train, repeated.train)
    np.testing.assert_array_equal(reference_validate, repeated.validate)
    np.testing.assert_array_equal(reference_test, repeated.test_rows)
    assert len(repeated.train) > 0
    assert len(repeated.validate) > 0
    assert len(repeated.test_rows) > 0

    # No validation set.
    without_validation = glmdisc.Glmdisc(validation=False)
    without_validation.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert len(without_validation.train) > 0
    assert without_validation.validate is None
    assert len(without_validation.test_rows) > 0

    # No test set.
    without_test = glmdisc.Glmdisc(test=False)
    without_test.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert len(without_test.train) > 0
    assert len(without_test.validate) > 0
    assert without_test.test_rows is None

    # Neither validation nor test set.
    train_only = glmdisc.Glmdisc(validation=False, test=False)
    train_only.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert len(train_only.train) > 0
    assert train_only.validate is None
    assert train_only.test_rows is None
Beispiel #22
0
def test_best_formula_no_cont():
    """Call ``best_formula`` on a model fitted with categorical data only.

    The test only checks that the call runs without raising.
    """
    n_samples = 200
    n_features = 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    # Build a 3-level categorical copy of x by binning every column.
    bin_edges = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for col in range(n_features):
        xd[:, col] = pd.cut(x[:, col], bins=bin_edges, labels=[0, 1, 2])

    qual_only_model = glmdisc.Glmdisc(validation=False, test=False)
    qual_only_model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=11)
    qual_only_model.best_formula()
Beispiel #23
0
def test_kwargs():
    """Pass extra keyword arguments to an NN ``fit`` and inspect the result.

    ``plot``, ``optim`` and ``callbacks`` are forwarded to ``fit``; afterwards
    the optimizer of ``model_nn`` and the last entry of ``callbacks`` are
    type-checked.
    """
    n_samples, n_features = 100, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)
    nn_model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)

    extra_arguments = {
        "plot": True,
        "optim": Adagrad(),
        "callbacks": EarlyStopping(),
    }
    nn_model.fit(predictors_cont=x, predictors_qual=None, labels=y, **extra_arguments)

    # NOTE(review): an Adagrad optimizer is passed in, yet the assertion
    # expects Adam -- confirm whether ``optim`` is meant to be honoured.
    fitted_optimizer = nn_model.model_nn.optimizer
    assert isinstance(fitted_optimizer, tensorflow.python.keras.optimizer_v2.adam.Adam)

    last_callback = nn_model.callbacks[-1]
    assert isinstance(last_callback, tensorflow.python.keras.callbacks.EarlyStopping)
Beispiel #24
0
def test_discretize_qual():
    """Fit an NN model on categorical data, then discretize that same data.

    The test only checks that ``discretize`` runs without raising.
    """
    n_samples = 500
    n_features = 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    # Build a 3-level categorical copy of x by binning every column.
    bin_edges = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for col in range(n_features):
        xd[:, col] = pd.cut(x[:, col], bins=bin_edges, labels=[0, 1, 2])

    nn_model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=False)

    # Seed both RNGs right before fitting.
    random.seed(1)
    np.random.seed(1)
    nn_model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=50)

    nn_model.discretize(predictors_cont=None, predictors_qual=xd)
Beispiel #25
0
def test_not_fit():
    """Search seeds 0..99 for a fit that leaves the model reported as unfitted.

    The same model is refitted under each seed. On the first seed for which
    ``check_is_fitted`` raises ``NotFittedError``, the call is repeated inside
    ``pytest.raises`` to confirm the error, and the loop stops. If no seed
    triggers the error, the test finishes without asserting anything.
    """
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc()
    for i in range(100):
        # Reseed both RNGs so that each iteration is a distinct, repeatable run.
        random.seed(i)
        np.random.seed(i)
        model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
        try:
            model.check_is_fitted()
        except glmdisc.NotFittedError:
            # Found an unfitted run: assert the error is raised again, then stop.
            with pytest.raises(glmdisc.NotFittedError):
                model.check_is_fitted()
            break
Beispiel #26
0
def test_calculate_criterion():
    """Exercise ``_calculate_criterion`` for bic, aic and gini.

    A random discretization ``emap`` and a logistic regression fitted on its
    one-hot encoding are built once and reused. ``_calculate_criterion`` is
    then evaluated on freshly fitted NN models for each criterion, with the
    default ``validation`` setting and with ``validation=False``.
    """
    n = 100
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    model = glmdisc.Glmdisc(algorithm="NN", criterion="bic")
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    # Draw n * d values in {1, 2, 3} (index of the single multinomial success,
    # plus one) and arrange them as an (n, d) array. The draw uses the global
    # NumPy RNG, so it depends on the state left by the fit above.
    emap = np.resize(np.array([np.where(
        np.random.multinomial(1,
                              pvals=[0.33, 0.33, 0.34]))[0][0] + 1 for _ in range(n * d)]),
                     (n, d))

    # One-hot encoder fitted on the string form of the random discretization.
    current_encoder_emap = sk.preprocessing.OneHotEncoder()
    current_encoder_emap.fit(X=emap.astype(str))

    # Logistic regression on the one-hot encoded discretization; C=1e40 makes
    # the regularization term negligible.
    model_emap = sk.linear_model.LogisticRegression(solver='liblinear',
                                                    C=1e40,
                                                    tol=0.001,
                                                    max_iter=25,
                                                    warm_start=False)
    model_emap.fit(X=current_encoder_emap.transform(emap.astype(str)),
                   y=y)

    # bic with the default validation setting: expected to be negative.
    modele_bic = model._calculate_criterion(emap, model_emap, current_encoder_emap)
    assert modele_bic < 0

    # aic with the default validation setting: expected to equal the bic value.
    # NOTE(review): presumably because no penalty is applied when a validation
    # set is used -- confirm against the Glmdisc implementation.
    model = glmdisc.Glmdisc(algorithm="NN", criterion="aic")
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert math.isclose(model._calculate_criterion(emap, model_emap, current_encoder_emap), modele_bic)

    # Default criterion without validation: this value becomes the new
    # reference for the aic comparison below.
    model = glmdisc.Glmdisc(algorithm="NN", validation=False)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    modele_bic = model._calculate_criterion(emap, model_emap, current_encoder_emap)

    # aic without validation: expected to differ from the reference by
    # (log(n) - 2) times the number of regression coefficients.
    model = glmdisc.Glmdisc(algorithm="NN", criterion="aic", validation=False)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert math.isclose(model._calculate_criterion(emap,
                                                   model_emap,
                                                   current_encoder_emap), modele_bic + (
        math.log(model.n) - 2) * model_emap.coef_.shape[1])

    # gini with the default validation setting: expected to lie in [0, 1].
    model = glmdisc.Glmdisc(algorithm="NN", criterion="gini")
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert 0 <= model._calculate_criterion(emap, model_emap, current_encoder_emap) <= 1

    # gini without validation: expected to lie in [0, 1] as well.
    model = glmdisc.Glmdisc(algorithm="NN", criterion="gini", validation=False)
    model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=11)
    assert 0 <= model._calculate_criterion(emap, model_emap, current_encoder_emap) <= 1
Beispiel #27
0
def test_discretize_qual():
    """Discretize categorical data with the default algorithm and encode it.

    The discretized output is cast to int, then to str, and passed to the
    model's ``best_encoder_emap``; the test only checks that this runs.
    """
    n_samples = 500
    n_features = 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    # Build a 3-level categorical copy of x by binning every column.
    bin_edges = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for col in range(n_features):
        xd[:, col] = pd.cut(x[:, col], bins=bin_edges, labels=[0, 1, 2])

    model = glmdisc.Glmdisc(validation=False, test=False)

    # Seed both RNGs right before fitting.
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=50)

    discretized = model.discretize(predictors_cont=None, predictors_qual=xd)
    as_labels = discretized.astype(int).astype(str)
    model.best_encoder_emap.transform(as_labels)
Beispiel #28
0
def test_fit():
    """Fit NN models for every criterion under three split configurations.

    The configurations are, in order: validation enabled, validation
    disabled, and both test and validation disabled. Within each one the
    criteria "bic", "aic" and "gini" are fitted in turn. The test only checks
    that every fit runs without raising.
    """
    n_samples, n_features = 100, 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    split_configurations = (
        {"validation": True},
        {"validation": False},
        {"test": False, "validation": False},
    )
    for split_options in split_configurations:
        for criterion in ("bic", "aic", "gini"):
            nn_model = glmdisc.Glmdisc(algorithm="NN", criterion=criterion, **split_options)
            nn_model.fit(predictors_cont=x, predictors_qual=None, labels=y, iter=20)
Beispiel #29
0
def test_discrete_data_test_qual():
    """Check ``discrete_data`` for an NN model fitted on categorical data with a test set.

    With 500 samples, the result is expected to be a NumPy array with 200 rows.
    """
    n_samples = 500
    n_features = 2
    x, y, _ = glmdisc.Glmdisc.generate_data(n_samples, n_features)

    # Build a 3-level categorical copy of x by binning every column.
    bin_edges = [0, 0.333, 0.666, 1]
    xd = np.ndarray.copy(x)
    for col in range(n_features):
        xd[:, col] = pd.cut(x[:, col], bins=bin_edges, labels=[0, 1, 2])

    nn_model = glmdisc.Glmdisc(algorithm="NN", validation=False, test=True)

    # Seed both RNGs right before fitting.
    random.seed(1)
    np.random.seed(1)
    nn_model.fit(predictors_cont=None, predictors_qual=xd, labels=y, iter=50)

    discretized = nn_model.discrete_data()
    assert isinstance(discretized, np.ndarray)
    assert discretized.shape[0] == 200
Beispiel #30
0
def test_discrete_data_val_both():
    """Check ``discrete_data`` with continuous and categorical data and a validation set.

    With 1000 samples and the default algorithm, the result is expected to be
    a SciPy CSR sparse matrix with 400 rows.
    """
    n = 1000
    d = 2
    x, y, theta = glmdisc.Glmdisc.generate_data(n, d)
    cuts = ([0, 0.333, 0.666, 1])
    xd = np.ndarray.copy(x)

    # Build a 3-level categorical copy of x by binning every column.
    for i in range(d):
        xd[:, i] = pd.cut(x[:, i], bins=cuts, labels=[0, 1, 2])

    model = glmdisc.Glmdisc(validation=True, test=False)
    random.seed(1)
    np.random.seed(1)
    model.fit(predictors_cont=x, predictors_qual=xd, labels=y, iter=100)
    result = model.discrete_data()
    # Use the public class path: the ``scipy.sparse.csr`` submodule is a
    # deprecated private namespace and is not guaranteed to stay importable.
    assert isinstance(result, scipy.sparse.csr_matrix)
    assert result.shape[0] == 400