def test_cv_num_folds():
    """Check the fold count with the default setting and with an explicit one.

    Both the DataFrame iterator and the ``indices()`` iterator are fully
    consumed, and the number of items each one yields is compared against
    the expected number of folds.
    """
    default_cv = pbn.CrossValidation(df)
    n_frame_pairs = len(list(default_cv))
    n_index_pairs = len(list(default_cv.indices()))

    assert n_frame_pairs == 10, "Default number of folds must be 10."
    assert n_index_pairs == 10, "Default number of folds must be 10."

    # The test expects the second positional argument to set the fold count.
    five_fold_cv = pbn.CrossValidation(df, 5)
    n_frame_pairs = len(list(five_fold_cv))
    n_index_pairs = len(list(five_fold_cv.indices()))

    assert n_frame_pairs == 5, "Wrong number of folds"
    assert n_index_pairs == 5, "Wrong number of folds for the indices iterator."
def test_cv_disjoint_indices():
    """Check that each fold splits the rows into disjoint train/test indices.

    For every fold the test verifies that:

    * the train and test indices together are exactly ``0 .. SIZE - 1``;
    * the DataFrames yielded by iterating the cross validation contain the
      same values as ``df`` sliced with the indices from ``indices()``;
    * no index appears in both the train and the test set.
    """
    cv = pbn.CrossValidation(df)

    for fold_frames, fold_indices in zip(cv, cv.indices()):
        train_df, test_df = fold_frames
        train_indices, test_indices = fold_indices

        train_idx = np.asarray(train_indices)
        test_idx = np.asarray(test_indices)

        # Sorted together, the indices must enumerate every row of the data.
        all_idx = np.sort(np.hstack((train_idx, test_idx)))
        assert np.all(all_idx == np.arange(SIZE)), \
            "Not all the examples are included in the cross validation."

        train_values = train_df.to_pandas().to_numpy()
        expected_train = df.iloc[train_indices, :].to_numpy()
        assert np.all(train_values == expected_train), \
            "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator."

        test_values = test_df.to_pandas().to_numpy()
        expected_test = df.iloc[test_indices, :].to_numpy()
        assert np.all(test_values == expected_test), \
            "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator."

        # Removing the other set's indices must not drop any element.
        train_only = np.setdiff1d(train_idx, test_idx)
        assert train_only.shape == train_idx.shape, \
            "The train indices includes test indices"

        test_only = np.setdiff1d(test_idx, train_idx)
        assert test_only.shape == test_idx.shape, \
            "The test indices includes train indices"

        assert np.all(np.sort(train_only) == np.sort(train_idx)), \
            "The train indices includes test indices"
        assert np.all(np.sort(test_only) == np.sort(test_idx)), \
            "The test indices includes train indices"
def test_cv_fold():
    """Check that ``fold(i)`` matches the i-th pair yielded by iteration."""
    cv = pbn.CrossValidation(df)

    for fold_number, iterated_pair in enumerate(cv):
        iterated_train, iterated_test = iterated_pair
        direct_train, direct_test = cv.fold(fold_number)

        assert direct_train.equals(iterated_train), \
            "Train DataFrame fold() and __iter__ are not equal."
        assert direct_test.equals(iterated_test), \
            "Test DataFrame fold() and __iter__ are not equal."
def test_cv_seed():
    """Check that the ``seed`` argument controls how the folds are built.

    Two cross validations created with ``seed=0`` must yield equal
    DataFrames fold by fold, while ``seed=0`` and ``seed=1`` must yield
    different DataFrames in every fold.
    """
    reference_cv = pbn.CrossValidation(df, seed=0)

    # Materialize the reference folds before building the second object.
    reference_folds = list(reference_cv)

    repeated_cv = pbn.CrossValidation(df, seed=0)

    for reference_pair, repeated_pair in zip(reference_folds, repeated_cv):
        reference_train, reference_test = reference_pair
        repeated_train, repeated_test = repeated_pair

        assert reference_train.equals(repeated_train), \
            "Train CV DataFrames with the same seed are not equal."
        assert reference_test.equals(repeated_test), \
            "Test CV DataFrames with the same seed are not equal."

    other_seed_cv = pbn.CrossValidation(df, seed=1)

    for seed0_pair, seed1_pair in zip(repeated_cv, other_seed_cv):
        seed0_train, seed0_test = seed0_pair
        seed1_train, seed1_test = seed1_pair

        assert not seed0_train.equals(seed1_train), \
            "Train CV DataFrames with different seeds return the same result."
        assert not seed0_test.equals(seed1_test), \
            "Test CV DataFrames with different seeds return the same result."
def numpy_local_score(node_type, data, variable, evidence):
    """Accumulate a cross-validated test log-likelihood using NumPy and SciPy.

    The data is split with ``pbn.CrossValidation(data, 10, seed)`` (``seed``
    is a name defined outside this function). In every fold a model is
    fitted on the train rows and the log-density of the test rows is added
    to the running total.

    Parameters
    ----------
    node_type
        Compared against ``pbn.LinearGaussianCPDType()`` and
        ``pbn.CKDEType()`` to choose the model.
    data
        Dataset handed to ``pbn.CrossValidation``.
    variable
        Column of the scored variable: a ``str`` selects by label, anything
        else is used as a position.
    evidence
        List of evidence columns, addressed the same way as ``variable``.

    Returns
    -------
    The summed log-likelihood over all folds. It stays ``0`` when
    ``node_type`` equals neither of the two supported types.
    """

    def _split_columns(frame):
        """Return (node rows, variable column, evidence columns), nulls dropped."""
        pandas_frame = frame.to_pandas()
        if isinstance(variable, str):
            node = pandas_frame.loc[:, [variable] + evidence].dropna()
            return node, node.loc[:, variable], node.loc[:, evidence]

        # Positional selection: the variable comes first, the evidence after.
        node = pandas_frame.iloc[:, [variable] + evidence].dropna()
        return node, node.iloc[:, 0], node.iloc[:, 1:]

    def _kde_bandwidth(kde):
        """Scale ``scotts_factor()`` by ``(4 / (d + 2)) ** (1 / (d + 4))``."""
        return np.power(4 / (kde.d + 2), 1 / (kde.d + 4)) * kde.scotts_factor()

    total_loglik = 0
    for train_df, test_df in pbn.CrossValidation(data, 10, seed):
        node_data, variable_data, evidence_data = _split_columns(train_df)
        (test_node_data, test_variable_data,
         test_evidence_data) = _split_columns(test_df)

        if node_type == pbn.LinearGaussianCPDType():
            n_train = variable_data.shape[0]
            n_evidence = evidence_data.shape[1]

            # Least-squares fit with an intercept column prepended.
            design = np.column_stack(
                (np.ones(n_train), evidence_data.to_numpy()))
            beta, residuals, _, _ = np.linalg.lstsq(
                design, variable_data.to_numpy(), rcond=None)
            variance = residuals / (n_train - n_evidence - 1)

            means = beta[0] + np.sum(beta[1:] * test_evidence_data, axis=1)
            fold_logpdf = norm.logpdf(test_variable_data, means,
                                      np.sqrt(variance))
            total_loglik += fold_logpdf.sum()
        elif node_type == pbn.CKDEType():
            joint_kde = gaussian_kde(node_data.to_numpy().T,
                                     bw_method=_kde_bandwidth)
            if not evidence:
                total_loglik += np.sum(
                    joint_kde.logpdf(test_node_data.to_numpy().T))
            else:
                # Conditional density = joint / marginal of the evidence; the
                # marginal KDE reuses the joint KDE's covariance factor.
                marginal_kde = gaussian_kde(
                    evidence_data.to_numpy().T,
                    bw_method=joint_kde.covariance_factor())
                total_loglik += np.sum(
                    joint_kde.logpdf(test_node_data.to_numpy().T) -
                    marginal_kde.logpdf(test_evidence_data.to_numpy().T))

    return total_loglik
def test_cv_loc():
    """Check that ``loc()`` restricts the yielded DataFrames to chosen columns.

    Four selectors are exercised: a single name, a single position, a list
    of names and a list of positions. For each one, every fold must contain
    exactly the expected columns, in the expected order. The test expects
    position ``1`` to resolve to ``"b"`` and positions ``[0, 2]`` to
    ``["a", "c"]``.
    """
    cv = pbn.CrossValidation(df)

    train_message = "Only {} must be present in train DataFrame."
    test_message = "Only {} must be present in test DataFrame."

    # (selector, expected column names,
    #  wording used in the train column-count message,
    #  wording used in the remaining three messages)
    cases = (
        ("a", ["a"], 'column "a"', 'column "a"'),
        (1, ["b"], 'column "b"', 'column "b"'),
        (["b", "d"], ["b", "d"], 'columns ["b", "d"]', 'column ["b", "d"]'),
        ([0, 2], ["a", "c"], 'columns ["a", "c"]', 'column ["a", "c"]'),
    )

    for selector, expected_names, count_wording, wording in cases:
        expected_count = len(expected_names)

        for train_df, test_df in cv.loc(selector):
            assert train_df.num_columns == expected_count, \
                train_message.format(count_wording)
            assert test_df.num_columns == expected_count, \
                test_message.format(wording)

            train_schema = train_df.schema
            test_schema = test_df.schema

            assert train_schema.names == expected_names, \
                train_message.format(wording)
            assert test_schema.names == expected_names, \
                test_message.format(wording)
def test_cv_null():
    """Check how the cross validation handles rows that contain null values.

    NaNs are written at 100 randomly drawn row positions (fixed NumPy seed)
    in each of the columns ``a``, ``b``, ``c`` and ``d`` of a *copy* of
    ``df``. Two configurations are then verified:

    * default construction: only rows without any null may appear, the fold
      indices must cover exactly those rows, and the yielded DataFrames must
      match the data sliced with the indices;
    * ``include_null=True``: all ``SIZE`` rows must appear, the NaN positions
      must match, and the non-NaN values must match.

    In both cases the train and test indices of a fold must be disjoint.
    """

    def assert_disjoint(train_idx, test_idx):
        """Assert that the two index arrays do not share any element."""
        train_only = np.setdiff1d(train_idx, test_idx)
        test_only = np.setdiff1d(test_idx, train_idx)

        assert train_only.shape == train_idx.shape, \
            "The train indices includes test indices"
        assert test_only.shape == test_idx.shape, \
            "The test indices includes train indices"
        assert np.all(np.sort(train_only) == np.sort(train_idx)), \
            "The train indices includes test indices"
        assert np.all(np.sort(test_only) == np.sort(test_idx)), \
            "The test indices includes train indices"

    np.random.seed(0)
    a_null = np.random.randint(0, SIZE, size=100)
    b_null = np.random.randint(0, SIZE, size=100)
    c_null = np.random.randint(0, SIZE, size=100)
    d_null = np.random.randint(0, SIZE, size=100)

    # Work on a copy: assigning NaNs through an alias of the module-level
    # ``df`` would corrupt the data used by every other test in the module.
    df_null = df.copy()
    df_null.loc[df_null.index[a_null], 'a'] = np.nan
    df_null.loc[df_null.index[b_null], 'b'] = np.nan
    df_null.loc[df_null.index[c_null], 'c'] = np.nan
    df_null.loc[df_null.index[d_null], 'd'] = np.nan

    non_null = df_null.dropna()
    cv = pbn.CrossValidation(df_null)

    # Positions of the rows with no null at all. ``setdiff1d`` returns sorted
    # unique values and ignores the duplicates in its second argument. The
    # result does not depend on the fold, so it is computed once.
    null_rows = np.hstack((a_null, b_null, c_null, d_null))
    non_null_rows = np.setdiff1d(np.arange(SIZE), null_rows)

    for (train_df, test_df), (train_indices,
                              test_indices) in zip(cv, cv.indices()):
        assert non_null.shape[0] == (
            train_df.num_rows +
            test_df.num_rows), "CV did not remove null instances correctly."

        nptrain = np.asarray(train_indices)
        nptest = np.asarray(test_indices)
        combination = np.hstack((nptrain, nptest))

        assert np.all(
            np.sort(combination) == non_null_rows
        ), "Not all the examples are included in the cross validation."
        assert np.all(train_df.to_pandas().to_numpy() == df_null.iloc[train_indices, :].to_numpy()), \
            "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator."
        assert np.all(test_df.to_pandas().to_numpy() == df_null.iloc[test_indices, :].to_numpy()), \
            "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator."

        assert_disjoint(nptrain, nptest)

    cv_include_null = pbn.CrossValidation(df_null, include_null=True)

    for (train_df, test_df), (train_indices,
                              test_indices) in zip(cv_include_null,
                                                   cv_include_null.indices()):
        assert (train_df.num_rows + test_df.num_rows
                ) == SIZE, "CV with include_null=True must keep all the instances."

        nptrain = np.asarray(train_indices)
        nptest = np.asarray(test_indices)
        combination = np.hstack((nptrain, nptest))

        train_df_mat = train_df.to_pandas().to_numpy()
        train_indices_mat = df_null.iloc[train_indices, :].to_numpy()
        test_df_mat = test_df.to_pandas().to_numpy()
        test_indices_mat = df_null.iloc[test_indices, :].to_numpy()

        assert np.all(
            np.sort(combination) == np.arange(SIZE)
        ), "Not all the examples are included in the cross validation."

        # NaN never compares equal to itself, so the NaN layout and the
        # remaining values are checked separately.
        train_valid = ~np.isnan(train_df_mat)
        assert np.all(np.isnan(train_df_mat) == np.isnan(train_indices_mat)), \
            "The null values are wrongly specified in the train DataFrame."
        assert np.all(train_df_mat[train_valid] == train_indices_mat[train_valid]), \
            "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator."

        test_valid = ~np.isnan(test_df_mat)
        assert np.all(np.isnan(test_df_mat) == np.isnan(test_indices_mat)), \
            "The null values are wrongly specified in the test DataFrame."
        assert np.all(test_df_mat[test_valid] == test_indices_mat[test_valid]), \
            "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator."

        assert_disjoint(nptrain, nptest)