def test_setUp(self, tol=0.02):
        """Integration test: fit diamond's LogisticRegression on simulated
        logistic data and compare the estimated coefficients to the known
        true parameters generated by the R/lme4 script.

        tol: maximum allowed relative error between estimated and true betas.
        """
        # assumes working directory is diamond/
        folder = "diamond/integration_tests/logistic"

        simulated_data_loc = "%s/simulated_logistic_df.csv" % folder
        estimated_covariance_loc = "%s/simulated_logistic_covariance.csv" % folder
        resources_exist = os.path.exists(simulated_data_loc) and os.path.exists(estimated_covariance_loc)
        if not resources_exist:
            # Generate the fixtures once via R/lme4; later runs reuse the CSVs
            logging.info("Simulating data and estimating covariances in R")
            os.system("/usr/local/bin/Rscript %s/logistic_generate_and_fit.R" % folder)
        logging.info("Reading in training data and R::lme4-estimated covariance matrix")
        df_train = pd.read_csv(simulated_data_loc)
        df_estimated_covariance = pd.read_csv(estimated_covariance_loc)

        self.model = LogisticRegression(train_df=df_train,
                                        priors_df=df_estimated_covariance,
                                        copy=True,
                                        test_df=None)
        logging.info("Fitting model in diamond")
        self.formula = "y ~ 1 + x + (1 + x | level)"
        results = self.model.fit(self.formula, tol=1e-4, verbose=True)

        # the format of the coefficient vector is:
        # fixed effects, then [random intercept, random slope] for each level
        beta_hat = np.append(results["fixed_effects"].value.values,
                             pd.melt(results["level"], "level").sort_values(["level", "variable"]).value.values)

        beta_true = pd.read_csv("%s/simulated_logistic_true_parameters.csv" % folder)["x"].values
        rel_error = np.mean((beta_hat - beta_true) ** 2) / np.mean(abs(beta_true))
        if rel_error > tol:
            # logging.warn is a deprecated alias for logging.warning
            logging.warning("relative error = %f > tolerance = %f" % (rel_error, tol))
        else:
            logging.info("relative error = %f < tolerance = %f" % (rel_error, tol))
        # make sure the coefficients are very close
        self.assertTrue(rel_error < tol)
# Example 2
def fit_diamond_model(df_train):
    """Fit a diamond LogisticRegression with lme4-estimated variance priors.

    Note: df_train is mutated — helper columns ('row_index', 'intercept')
    are dropped in place before returning. Returns the fitted model.
    """
    logging.info('fitting diamond model')
    formula = 'target ~ 1 + (1|song_id) + (1|msno)'
    # Variance priors for the two random intercepts;
    # fit on a sample of data in R/lme4
    prior_spec = {
        'group': ['song_id', 'msno'],
        'var1': ['intercept', 'intercept'],
        'var2': [np.nan, np.nan],
        'vcov': [0.00845, 0.07268],
    }
    model = LogisticRegression(df_train, pd.DataFrame(prior_spec))
    model.fit(formula, tol=1e-5, verbose=False, max_its=200)
    # remove columns that fitting may have added to the caller's frame
    df_train.drop(['row_index', 'intercept'], axis=1, inplace=True, errors='ignore')
    return model
    def setUp(self):
        """Build a tiny 3-row dataset and priors for a random-slope model."""
        data = {"response": [0, 1, 1], "var_a": [21, 32, 10], "cyl": [4, 6, 4]}
        df = pd.DataFrame(data, index=[0, 1, 2])

        # Prior (co)variances: intercept variance, intercept/var_a covariance,
        # and var_a slope variance within cyl groups.
        # np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
        priors_data = {
            "grp": ["cyl", "cyl", "cyl"],
            "var1": ["intercept", "intercept", "var_a"],
            "var2": [np.nan, "var_a", np.nan],
            "vcov": [0.123, -1.42, 0.998]
        }
        priors_df = pd.DataFrame(priors_data, index=[0, 1, 2])

        self.formula = "response ~ 1 + var_a + (1 + var_a | cyl)"

        self.model = LogisticRegression(train_df=df,
                                        priors_df=priors_df,
                                        test_df=None)
class TestGLM(unittest.TestCase):
    def setUp(self):
        """Build a tiny 3-row dataset and priors for a random-slope model."""
        data = {"response": [0, 1, 1], "var_a": [21, 32, 10], "cyl": [4, 6, 4]}
        df = pd.DataFrame(data, index=[0, 1, 2])

        # Prior (co)variances: intercept variance, intercept/var_a covariance,
        # and var_a slope variance within cyl groups.
        # np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
        priors_data = {
            "grp": ["cyl", "cyl", "cyl"],
            "var1": ["intercept", "intercept", "var_a"],
            "var2": [np.nan, "var_a", np.nan],
            "vcov": [0.123, -1.42, 0.998]
        }
        priors_df = pd.DataFrame(priors_data, index=[0, 1, 2])

        self.formula = "response ~ 1 + var_a + (1 + var_a | cyl)"

        self.model = LogisticRegression(train_df=df,
                                        priors_df=priors_df,
                                        test_df=None)

    def test_parse_formula(self):
        """_parse_formula should populate the model's formula metadata."""
        self.model._parse_formula(self.formula)

        self.assertEqual(self.model.num_main, 2)
        self.assertEqual(self.model.response, "response")
        self.assertListEqual(self.model.main_effects, ["intercept", "var_a"])
        # one interaction block per level of cyl
        self.assertEqual(self.model.total_num_interactions,
                         self.model.train_df.cyl.nunique())
        self.assertListEqual(self.model.grouping_factors, ["cyl"])
        # dict.keys() returns a view in Python 3; materialize for assertListEqual
        self.assertListEqual(list(self.model.group_levels.keys()), ["cyl"])
        self.assertListEqual(list(self.model.group_levels["cyl"]), [4, 6])

    def test_create_penalty_matrix(self):
        """Penalty matrix should hold the inverse prior covariance per group."""
        self.model._parse_formula(self.formula)
        self.model._create_penalty_matrix()

        # 2x2 prior covariance of (intercept, var_a slope) within cyl,
        # assembled from the vcov entries set up in setUp
        expected_inv_cov_block = np.linalg.inv([[0.123, -1.42], [-1.42,
                                                                 0.998]])
        actual_inv_cov_block = self.model.sparse_inv_covs["cyl"]._block

        # dict.keys() returns a view in Python 3; materialize for assertListEqual
        self.assertListEqual(list(self.model.sparse_inv_covs.keys()),
                             ["main", "cyl"])
        self.assertEqual(self.model.sparse_inv_covs["cyl"]._num_blocks, 2)
        self.assertEqual(self.model.sparse_inv_covs["cyl"]._block_shape, 2)
        self.assertTrue((expected_inv_cov_block == actual_inv_cov_block).all())

    def test_create_main_design(self):
        """Main-effects design matrix: one intercept + var_a column per row."""
        self.model._parse_formula(self.formula)
        self.model._create_design_matrix()

        expected_design = [[1, float(row[1].var_a)]
                           for row in self.model.train_df.iterrows()]
        actual_design = self.model._create_main_design()

        # shape is (num observations, num main effects); the original compared
        # against cyl.nunique(), which only coincidentally equals num_main (2)
        self.assertEqual(
            actual_design.shape,
            (len(self.model.train_df), self.model.num_main))
        self.assertTrue((expected_design == actual_design.todense()).all())

    def test_create_inter_design(self):
        """Interaction design matrix: a (intercept, var_a) column pair per
        cyl level, nonzero only in the pair matching each row's level."""
        self.model._parse_formula(self.formula)
        self.model._create_design_matrix()

        # cyl has two levels (4 and 6): rows with cyl == 4 fill the first
        # column pair, rows with cyl == 6 fill the second
        expected_design = [[1, float(row[1].var_a), 0, 0] if row[1].cyl == 4
                           else [0, 0, 1, float(row[1].var_a)]
                           for row in self.model.train_df.iterrows()]
        actual_design = self.model._create_inter_design(g="cyl")

        # shape is (num observations, num variables * num levels) — the
        # assertion below checks rows first, columns second
        self.assertEqual(
            actual_design.shape,
            (len(self.model.train_df), 2 * self.model.train_df.cyl.nunique()))
        self.assertTrue((expected_design == actual_design.todense()).all())