Python Data Beispiele, regmod.data.Data Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: test_optimizer.py Projekt: zhengp0/regmod

def test_scipy_optimizer(seed):
    """Check `scipy_optimize` against a directly solved linear system.

    A Gaussian model with one plain covariate and one spline covariate is
    built on random data. The coefficients returned by `scipy_optimize` are
    compared with the solution of the linear system assembled from the
    model's first design matrix, the data weights and the observations.
    """
    np.random.seed(seed)
    num_obs = 20
    # Draw order (obs, cov0, cov1) matters for reproducibility under `seed`.
    column_names = ("obs", "cov0", "cov1")
    df = pd.DataFrame(
        {name: np.random.randn(num_obs) for name in column_names})
    data = Data(col_obs="obs", col_covs=["cov0", "cov1"], df=df)

    knots = np.linspace(0.0, 1.0, 5)
    spline_specs = SplineSpecs(knots=knots, degree=3, knots_type="rel_domain")

    variables = [
        Variable(name="cov0"),
        SplineVariable(name="cov1", spline_specs=spline_specs),
    ]
    model = GaussianModel(data, param_specs={"mu": {"variables": variables}})

    coefs = scipy_optimize(model)

    # Reference solution: solve (X^T * w) X b = (X^T * w) y, where X is the
    # first design matrix, w the data weights and y the observations.
    design = model.mat[0]
    weighted_design_t = design.T * model.data.weights
    lhs = weighted_design_t.dot(design)
    rhs = weighted_design_t.dot(model.data.obs)
    tr_coef = np.linalg.solve(lhs, rhs)

    assert np.allclose(coefs, tr_coef)

Beispiel #2

0

Datei anzeigen

def data():
    """Return a `Data` object with five random rows.

    The frame has one observation column ("obs") and two covariate columns
    ("cov0", "cov1"), each filled with standard-normal draws.
    """
    num_obs = 5
    # Columns are drawn in this order: obs, cov0, cov1.
    column_names = ("obs", "cov0", "cov1")
    frame = pd.DataFrame(
        {name: np.random.randn(num_obs) for name in column_names})
    return Data(col_obs="obs", col_covs=["cov0", "cov1"], df=frame)

Beispiel #3

0

Datei anzeigen

    def __init__(self, data: Data, **kwargs):
        """Validate the observations and cache the binomial counts.

        Parameters
        ----------
        data : Data
            Data object whose `col_obs` names two columns: the number of
            events first, the sample size second.
        **kwargs
            Passed through unchanged to the parent initializer.

        Raises
        ------
        ValueError
            If any observation fails the `>= 0` check, if `col_obs` does not
            have exactly two entries, or if any sample size is smaller than
            its number of events.
        """
        obs_non_negative = np.all(data.obs >= 0)
        if not obs_non_negative:
            raise ValueError(
                "Binomial model requires observations to be non-negative.")

        num_obs_cols = len(data.col_obs)
        if num_obs_cols != 2:
            raise ValueError(
                "Binomial model need 2 columns of observations, "
                "one for number of events, one for sample size.")

        # Second column minus first column, i.e. sample size minus events.
        size_minus_events = np.diff(data.get_cols(data.col_obs), axis=1)
        if any(size_minus_events < 0):
            raise ValueError(
                "Binomial model requires number of events less or equal than sample size."
            )

        events_col, sample_size_col = data.col_obs
        self.obs_1s = data.get_cols(events_col)
        # Non-events: sample size minus events, flattened to one dimension.
        self.obs_0s = np.diff(data.get_cols(data.col_obs), axis=1).ravel()
        self.obs_sample_sizes = data.get_cols(sample_size_col)

        super().__init__(data, **kwargs)

Beispiel #4

0

Datei anzeigen

Datei: test_binomialmodel.py Projekt: zhengp0/regmod

def data():
    """Return a `Data` object with five random binomial-style rows.

    "obs_1s" holds uniform draws scaled to [0, 10); "obs_sample_sizes" is
    always exactly one larger, so events never exceed the sample size. Two
    standard-normal covariates ("cov0", "cov1") are included as well.
    """
    num_obs = 5
    events = np.random.rand(num_obs) * 10
    columns = {"obs_1s": events, "obs_sample_sizes": events + 1.0}
    # Covariates are drawn after the events, cov0 first, then cov1.
    for cov_name in ("cov0", "cov1"):
        columns[cov_name] = np.random.randn(num_obs)
    frame = pd.DataFrame(columns)
    return Data(col_obs=["obs_1s", "obs_sample_sizes"],
                col_covs=["cov0", "cov1"],
                df=frame)

Beispiel #5

0

Datei anzeigen

    def get_mat(self, data: Data) -> np.ndarray:
        """Build the design matrix for this variable.

        The data is first validated with `check_data`; the matrix is then
        whatever `data.get_covs` returns for this variable's `name`.

        Parameters
        ----------
        data : Data
            Data object that provides the covariates.

        Returns
        -------
        np.ndarray
            Design matrix.
        """
        self.check_data(data)
        design_mat = data.get_covs(self.name)
        return design_mat

Beispiel #6

0

Datei anzeigen

    def get_mat(self, data: Data) -> np.ndarray:
        """Build the spline design matrix for this variable.

        The data is first validated with `check_data`. The covariate column
        named `name` is then passed to the spline's `design_mat` with both
        `l_extra` and `r_extra` enabled.

        Parameters
        ----------
        data : Data
            Data object that provides the covariates.

        Returns
        -------
        np.ndarray
            Design matrix.
        """
        self.check_data(data)
        extra_options = {"l_extra": True, "r_extra": True}
        covariate = data.get_cols(self.name)
        return self.spline.design_mat(covariate, **extra_options)

Beispiel #7

0

Datei anzeigen

    def check_data(self, data: Data):
        """Run the parent data check and create the spline if it is missing.

        When `spline` is still `None`, it is created from `spline_specs`
        using the covariate column `name`, and then attached to every
        `SplinePrior` among the linear uniform and Gaussian priors.

        Parameters
        ----------
        data : Data
            Data object to be checked.
        """
        super().check_data(data)
        if self.spline is not None:
            # Spline already exists, nothing to build or attach.
            return

        covariate = data.get_cols(self.name)
        self.spline = self.spline_specs.create_spline(covariate)

        all_linear_priors = self.linear_upriors + self.linear_gpriors
        spline_priors = [
            prior for prior in all_linear_priors
            if isinstance(prior, SplinePrior)
        ]
        for spline_prior in spline_priors:
            spline_prior.attach_spline(self.spline)

Beispiel #8

0

Datei anzeigen

def test_no_match_col_obs(df):
    """Observations are all NaN when the observation column is not in `df`."""
    df_without_obs = df.drop(columns=COL_OBS)
    data = Data(col_obs=COL_OBS, df=df_without_obs)
    obs_is_nan = np.isnan(data.obs)
    assert all(obs_is_nan)

Beispiel #9

0

Datei anzeigen

def test_mult_obs(df):
    """Two observation columns yield a two-column `obs` array."""
    obs_columns = [COL_OBS, COL_COVS[0]]
    data = Data(col_obs=obs_columns, df=df)
    assert len(data.col_obs) == 2
    expected_shape = (data.num_obs, 2)
    assert data.obs.shape == expected_shape

Beispiel #10

0

Datei anzeigen

def test_no_obs(df):
    """Accessing `obs` raises `ValueError` when no observation column is set."""
    data_without_obs = Data(col_covs=COL_COVS, df=df)
    with pytest.raises(ValueError):
        # Only the attribute access matters; the value is discarded.
        _ = data_without_obs.obs

Beispiel #11

0

Datei anzeigen

def test_attach_df(df):
    """A `Data` built without a frame is empty until `attach_df` is called."""
    column_args = (COL_OBS, COL_COVS, COL_WEIGHTS, COL_OFFSET)
    data = Data(*column_args)
    assert data.is_empty()

    data.attach_df(df)
    assert data.num_obs == NUM_OBS

Beispiel #12

0

Datei anzeigen

def test_post_init_fill_df(df_simple):
    """Unspecified columns are filled in: weights 1, offset 0, intercept 1."""
    data = Data(COL_OBS, df=df_simple)
    assert data.num_obs == NUM_OBS

    weights = data.weights
    assert all(weights == 1.0)

    offset = data.offset
    assert all(offset == 0.0)

    intercept = data.get_cols("intercept")
    assert all(intercept == 1.0)

Beispiel #13

0

Datei anzeigen

def test_post_init_empty():
    """A `Data` created with only an observation column name is empty."""
    empty_data = Data(COL_OBS)
    assert empty_data.is_empty()

Beispiel #14

0

Datei anzeigen

def test_init(df):
    """A fully specified `Data` reports the expected number of observations.

    The positional arguments are, in order: observation column, covariate
    columns, weights column, offset column and the data frame.
    """
    # The fourth positional slot is the offset column, so pass COL_OFFSET
    # there instead of repeating COL_WEIGHTS.
    data = Data(COL_OBS, COL_COVS, COL_WEIGHTS, COL_OFFSET, df)
    assert data.num_obs == NUM_OBS

Beispiel #15

0

Datei anzeigen

def data(df):
    """Return a `Data` object built from `df` and the module-level columns."""
    # NOTE(review): COL_WEIGHTS fills both the third and the fourth positional
    # argument - confirm whether the fourth was meant to be COL_OFFSET.
    column_args = (COL_OBS, COL_COVS, COL_WEIGHTS, COL_WEIGHTS)
    return Data(*column_args, df)

Beispiel #16

0

Datei anzeigen

Datei: logit_ratio.py Projekt: ihmeuw-msca/emmodel

def main():
    """Fit the IDR model and the location cascade, then plot every location.

    Steps, in order:

    1. Read `data_path`, keep the rows flagged by the `include` column and
       split them into national rows (three-character `ihme_loc_id`) and
       subnational rows (everything else).
    2. Fit an `ExcessMortalityModel` on the national rows and pin the IDR
       variable's coefficients to the fitted values with a `UniformPrior`
       whose lower and upper bounds coincide.
    3. Build one `Cascade` model per global / super region / region /
       national / subnational unit, link them and fit them.
    4. Predict with every cascade model and save one PDF per location into
       `results_path`.

    Returns
    -------
    list
        The cascade models as returned by `global_model.to_list()`.
    """
    # load data
    df_all = pd.read_csv(data_path)
    df_all = df_all[df_all.include].reset_index(drop=True)

    national_index = df_all.ihme_loc_id.str.len() == 3
    df_national = df_all[national_index].reset_index(drop=True)
    df_subnational = df_all[~national_index].reset_index(drop=True)

    df = df_national

    # create results folder; `exist_ok` avoids the check-then-create race and
    # `parents` lets a nested results path be created in one go
    results_path.mkdir(parents=True, exist_ok=True)

    # Fit global IDR model
    idr_model = ExcessMortalityModel(df, [idr_model_variables],
                                     col_obs="logit_ratio")
    idr_model.run_models()

    # attach data to create spline
    data = Data(col_obs="logit_ratio",
                col_covs=[
                    intercept_variable.name, idr_variable.name,
                    time_variable.name
                ])
    data.df = df
    idr_variable.check_data(data)
    time_variable.check_data(data)

    # fix idr coefficients: identical lower and upper bounds pin the values.
    # The first fitted coefficient is skipped (presumably the intercept -
    # TODO confirm against `idr_model_variables`).
    coefs = idr_model.results[0]["coefs"][1:]
    idr_variable.add_priors(UniformPrior(lb=coefs, ub=coefs))

    # getting location structure:
    # super region -> region -> national id -> list of subnational ids
    location_structure = {}
    for super_region in df.super_region_name.unique():
        regions = df[df.super_region_name == super_region].region_name.unique()
        location_structure[super_region] = {}
        for region in regions:
            nationals = list(df_national[df_national.region_name ==
                                         region].ihme_loc_id.unique())
            location_structure[super_region][region] = {}
            for national in nationals:
                # subnational ids start with their national id
                subnational_index = df_subnational.ihme_loc_id.str.startswith(
                    national)
                location_structure[super_region][region][national] = list(
                    df_subnational.ihme_loc_id[subnational_index].unique())

    # construct cascade model
    # global model
    global_model = Cascade(df, cascade_specs, level_id=0, name="Global")

    # super region model
    super_region_models = [
        Cascade(
            df[df.super_region_name == super_region].reset_index(drop=True),
            cascade_specs,
            level_id=1,
            name=super_region)
        for super_region in df.super_region_name.unique()
    ]

    # region model
    region_models = [
        Cascade(df[df.region_name == region].reset_index(drop=True),
                cascade_specs,
                level_id=2,
                name=region) for region in df.region_name.unique()
    ]

    # national model
    national_models = [
        Cascade(
            df_national[df_national.ihme_loc_id == national].reset_index(
                drop=True),
            cascade_specs,
            level_id=3,
            name=national)
        for national in df_national.ihme_loc_id.unique()
    ]

    # subnational model
    subnational_models = [
        Cascade(
            df_subnational[df_subnational.ihme_loc_id ==
                           subnational].reset_index(drop=True),
            cascade_specs,
            level_id=4,
            name=subnational)
        for subnational in df_subnational.ihme_loc_id.unique()
    ]

    # link all models together
    link_cascade_models(global_model, [
        super_region_models, region_models, national_models, subnational_models
    ], location_structure)

    # fit model
    global_model.run_models()

    # create plots
    model_list = global_model.to_list()

    # predict, keyed by cascade model name
    pred_dfs = {}
    for cmodel in model_list:
        pred_dfs[cmodel.name] = predict(cmodel.df, cmodel.model)

    # plot: one PDF per location, named after its id
    for loc_id in df_all.ihme_loc_id.unique():
        df_sub = df_all[df_all.ihme_loc_id == loc_id]
        loc_structure = [
            "Global", df_sub.super_region_name.values[0],
            df_sub.region_name.values[0]
        ]
        if len(loc_id) > 3:
            # subnational: add the parent national id (first three
            # characters) before the location itself
            loc_structure.extend([loc_id[:3], loc_id])
        else:
            loc_structure.append(loc_id)
        plot_model(df_all, pred_dfs, loc_structure)
        plt.savefig(results_path / f"{loc_id}.pdf", bbox_inches="tight")
        # close figures so they do not accumulate across iterations
        plt.close("all")

    # # create results dataframe
    # coefs = pd.concat([model.model.get_coefs_df() for model in model_list])
    # coefs["location"] = [model.name for model in model_list]

    # coefs.to_csv(results_path / "coefs.csv", index=False)

    # # create samples of the coefficient
    # for cmodel in model_list:
    #     df_coefs = sample_coefs(cmodel)
    #     df_coefs.to_csv(results_path / f"cdraws_{cmodel.level_id}_{cmodel.name}.csv", index=False)

    return model_list