Example #1
    def test_poisson_epil(self):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(cur_dir, "results", "epil.csv")
        data = pd.read_csv(fname)

        fam = Poisson()
        ind = Independence()
        md1 = GEE.from_formula("y ~ age + trt + base",
                               data,
                               groups=data["subject"],
                               cov_struct=ind,
                               family=fam)
        mdf1 = md1.fit()

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        md2 = GLM.from_formula("y ~ age + trt + base",
                               data,
                               family=families.Poisson())
        mdf2 = md2.fit(scale="X2")

        assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
        assert_almost_equal(mdf1.scale, mdf2.scale, decimal=6)
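These tests all exercise the same equivalence: a GEE with an Independence covariance structure reproduces GLM estimates. A minimal standalone sketch of that pattern on synthetic data (column names and seed are illustrative, not from the original test):

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.cov_struct import Independence
from statsmodels.genmod import families

rng = np.random.RandomState(0)
df = pd.DataFrame({"y": rng.poisson(3, size=100),
                   "age": rng.normal(size=100),
                   "subject": np.repeat(np.arange(20), 5)})

gee_fit = GEE.from_formula("y ~ age", groups="subject", data=df,
                           family=families.Poisson(),
                           cov_struct=Independence()).fit()
glm_fit = GLM.from_formula("y ~ age", data=df,
                           family=families.Poisson()).fit(scale="X2")
print(np.allclose(gee_fit.params, glm_fit.params, atol=1e-5))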
Example #2
    def test_poisson_epil(self):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(cur_dir, "results", "epil.csv")
        data = pd.read_csv(fname)

        fam = Poisson()
        ind = Independence()
        mod1 = GEE.from_formula("y ~ age + trt + base",
                                data["subject"],
                                data,
                                cov_struct=ind,
                                family=fam)
        rslt1 = mod1.fit()

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        mod2 = GLM.from_formula("y ~ age + trt + base",
                                data,
                                family=families.Poisson())
        rslt2 = mod2.fit(scale="X2")

        # don't use the wrapper; the assert_xxx helpers don't work on it
        rslt1 = rslt1._results
        rslt2 = rslt2._results

        assert_almost_equal(rslt1.params, rslt2.params, decimal=6)
        assert_almost_equal(rslt1.scale, rslt2.scale, decimal=6)
Example #3
    def test_poisson_epil(self):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(cur_dir, "results", "epil.csv")
        data = pd.read_csv(fname)

        fam = Poisson()
        ind = Independence()
        mod1 = GEE.from_formula("y ~ age + trt + base", data["subject"],
                                data, cov_struct=ind, family=fam)
        rslt1 = mod1.fit()

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        mod2 = GLM.from_formula("y ~ age + trt + base", data,
                               family=families.Poisson())
        rslt2 = mod2.fit(scale="X2")

        # don't use the wrapper; the assert_xxx helpers don't work on it
        rslt1 = rslt1._results
        rslt2 = rslt2._results

        assert_almost_equal(rslt1.params, rslt2.params, decimal=6)
        assert_almost_equal(rslt1.scale, rslt2.scale, decimal=6)
Example #4
    def setup_class(cls):
        from .test_diagnostic import get_duncan_data
        endog, exog, labels = get_duncan_data()
        data = pd.DataFrame(np.column_stack((endog, exog)),
                            columns='y const var1 var2'.split(),
                            index=labels)

        res0 = GLM.from_formula('y ~ const + var1 + var2 - 1', data).fit()
        res1 = OLS.from_formula('y ~ const + var1 + var2 - 1', data).fit()
        cls.infl1 = res1.get_influence()
        cls.infl0 = res0.get_influence()
Example #5
    def setup_class(cls):
        from .test_diagnostic import get_duncan_data
        endog, exog, labels = get_duncan_data()
        data = pd.DataFrame(np.column_stack((endog, exog)),
                            columns='y const var1 var2'.split(),
                            index=labels)

        res0 = GLM.from_formula('y ~ const + var1 + var2 - 1', data).fit()
        res1 = OLS.from_formula('y ~ const + var1 + var2 - 1', data).fit()
        cls.infl1 = res1.get_influence()
        cls.infl0 = res0.get_influence()
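Both fixtures compare influence diagnostics between GLM and OLS fits of the same data. For reference, a small self-contained sketch of what those `get_influence` objects expose (synthetic data; the attribute names are the standard statsmodels influence API):

import numpy as np
import pandas as pd
from statsmodels.regression.linear_model import OLS

rng = np.random.RandomState(0)
df = pd.DataFrame({"x": rng.normal(size=30)})
df["y"] = 2 * df["x"] + rng.normal(size=30)

infl = OLS.from_formula("y ~ x", df).fit().get_influence()
print(infl.hat_matrix_diag[:5])            # leverage of the first observations
print(infl.cooks_distance[0][:5])          # Cook's distance values
print(infl.resid_studentized_internal[:5]) # internally studentized residuals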
Example #6
    def setup_class(cls):
        nobs = 30
        np.random.seed(987128)
        x = np.random.randn(nobs, 3)
        y = x.sum(1) + np.random.randn(nobs)
        index = ['obs%02d' % i for i in range(nobs)]
        # add one extra column to check that it doesn't matter
        cls.data = pd.DataFrame(np.round(np.column_stack((y, x)), 4),
                                columns='y var1 var2 var3'.split(),
                                index=index)

        cls.res = GLM.from_formula('y ~ var1 + var2', data=cls.data).fit()
Example #7
    def setup_class(cls):
        nobs = 30
        np.random.seed(987128)
        x = np.random.randn(nobs, 3)
        y = x.sum(1) + np.random.randn(nobs)
        index = ['obs%02d' % i for i in range(nobs)]
        # add one extra column to check that it doesn't matter
        cls.data = pd.DataFrame(np.round(np.column_stack((y, x)), 4),
                                columns='y var1 var2 var3'.split(),
                                index=index)

        cls.res = GLM.from_formula('y ~ var1 + var2', data=cls.data).fit()
Example #8
def _train(elements, model_cfg):
    """Construct one model per building block type"""
    models = {}
    target = model_cfg.target
    for model in model_cfg.sections:
        # Construct model formula from configuration
        terms = " + ".join(["1"] + [f"C({f})" for f in model.factors])

        # Train model
        models[model.label] = GLM.from_formula(
            f"{target} ~ {terms}",
            family=Binomial(),
            data=filter_data(elements, {model_cfg.label_column: model.label}),
        ).fit(scale="X2")
    return models
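A hedged usage sketch for `_train`: `model_cfg` is treated here as a simple namespace with the `target`, `sections`, `label_column`, `label`, and `factors` attributes the function reads, and `filter_data` is assumed to subset rows by column value. All names below are illustrative, not from the original project.

from types import SimpleNamespace
import pandas as pd
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Binomial

def filter_data(df, conditions):
    # assumed helper: keep rows where each column equals the given value
    for col, val in conditions.items():
        df = df[df[col] == val]
    return df

elements = pd.DataFrame({
    "failed": [0, 1, 0, 1, 1, 0, 1, 0],
    "material": ["steel", "wood"] * 4,
    "block_type": ["beam"] * 8,
})
model_cfg = SimpleNamespace(
    target="failed",
    label_column="block_type",
    sections=[SimpleNamespace(label="beam", factors=["material"])],
)
models = _train(elements, model_cfg)  # one Binomial GLM per block type
print(models["beam"].summary())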
Example #9
    def setup_class(cls):
        from statsmodels.base._constraints import fit_constrained

        cls.res2 = results.results_noexposure_constraint
        cls.idx = [7, 3, 4, 5, 6, 0, 1]  # 2 is dropped baseline for categorical

        # example without offset
        formula = 'deaths ~ logpyears + smokes + C(agecat)'
        mod = GLM.from_formula(formula, data=data,
                               family=families.Poisson())

        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants)
        cls.constraints = lc
        cls.res1m = mod.fit_constrained(constr)
Example #10
    def setup_class(cls):
        from statsmodels.base._constraints import fit_constrained

        cls.res2 = results.results_noexposure_constraint
        cls.idx = [7, 3, 4, 5, 6, 0, 1]  # 2 is dropped baseline for categorical

        # example without offset
        formula = 'deaths ~ logpyears + smokes + C(agecat)'
        mod = GLM.from_formula(formula, data=data, family=families.Poisson())

        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants)
        cls.constraints = lc
        cls.res1m = mod.fit_constrained(constr)
Example #11
    def setup_class(cls):
        vs = Independence()
        family = families.Poisson()
        np.random.seed(987126)
        Y = np.exp(1 + np.random.normal(size=100))
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                            family=family, cov_struct=vs)
        cls.result1 = mod1.fit()

        mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
        cls.result2 = mod2.fit(disp=False)
Example #12
    def setup_class(cls):

        vs = Independence()
        family = families.Gaussian()
        np.random.seed(987126)
        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(np.arange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        md = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                              family=family, cov_struct=vs)
        cls.result1 = md.fit()

        cls.result2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D).fit()
Example #13
    def setup_class(cls):
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families
        from statsmodels.base._constraints import fit_constrained

        cls.res2 = results.results_exposure_constraint
        cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

        # example with offset
        formula = 'deaths ~ smokes + C(agecat)'
        mod = GLM.from_formula(formula, data=data,
                               family=families.Poisson(),
                               offset=np.log(data['pyears'].values))

        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants)
        cls.constraints = lc
        cls.res1m = mod.fit_constrained(constr)._results
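For context, the same constrained-fit pattern in a small standalone form: fit a Poisson GLM with a log-exposure offset and impose equality of two category coefficients. The toy data below only mirrors the column names used in the test.

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

rng = np.random.RandomState(0)
toy = pd.DataFrame({
    "deaths": rng.poisson(10, size=100),
    "smokes": rng.randint(0, 2, size=100),
    "agecat": rng.randint(1, 6, size=100),
    "pyears": rng.uniform(100.0, 1000.0, size=100),
})
mod = GLM.from_formula("deaths ~ smokes + C(agecat)", data=toy,
                       family=families.Poisson(),
                       offset=np.log(toy["pyears"]))
res = mod.fit_constrained("C(agecat)[T.4] = C(agecat)[T.5]")
# the constraint holds exactly in the fitted parameters
print(np.isclose(res.params["C(agecat)[T.4]"], res.params["C(agecat)[T.5]"]))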
Example #14
def test_wtd_patsy_missing():
    from statsmodels.datasets.cpunish import load
    import pandas as pd
    data = load()
    data.exog[0, 0] = np.nan
    data.endog[[2, 4, 6, 8]] = np.nan
    data.pandas = pd.DataFrame(data.exog, columns=data.exog_name)
    data.pandas['EXECUTIONS'] = data.endog
    weights = np.arange(1, len(data.endog)+1)
    formula = """EXECUTIONS ~ INCOME + PERPOVERTY + PERBLACK + VC100k96 +
                 SOUTH + DEGREE"""
    mod_missing = GLM.from_formula(formula, data=data.pandas,
                                   freq_weights=weights)
    assert_equal(mod_missing.freq_weights.shape[0],
                 mod_missing.endog.shape[0])
    assert_equal(mod_missing.freq_weights.shape[0],
                 mod_missing.exog.shape[0])
    assert_equal(mod_missing.freq_weights.shape[0], 12)
    keep_weights = np.array([2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17])
    assert_equal(mod_missing.freq_weights, keep_weights)
Example #15
def compute_chi2_null_test(model_results, data, dep_var, max_iter, l2_weight):
    """
    Likelihood-ratio test against the null (intercept-only) model:
    -2 * (loglike(null) - loglike(model)) ~ chi2(df_model)
    """
    null_formula = '%s ~ 1' % (dep_var)
    null_model = GLM.from_formula(null_formula,
                                  data,
                                  family=Binomial(link=logit()))
    null_model_results = null_model.fit_regularized(maxiter=max_iter,
                                                    method='elastic_net',
                                                    alpha=l2_weight,
                                                    L1_wt=0.0)
    model_loglike = model_results.model.loglike(model_results.params)
    null_model_loglike = null_model_results.model.loglike(
        null_model_results.params)
    llr = -2 * (null_model_loglike - model_loglike)
    model_df = model_results.model.df_model
    p_val = chi2.sf(llr, model_df)
    return llr, model_df, p_val
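A self-contained illustration of the same likelihood-ratio test, using plain (unregularized) fits to keep it short; `llf` and `df_model` are standard GLM results attributes:

import numpy as np
import pandas as pd
from scipy.stats import chi2
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Binomial

rng = np.random.RandomState(0)
df = pd.DataFrame({"x": rng.normal(size=200)})
df["y"] = (df["x"] + rng.normal(size=200) > 0).astype(int)

full = GLM.from_formula("y ~ x", df, family=Binomial()).fit()
null = GLM.from_formula("y ~ 1", df, family=Binomial()).fit()
llr = -2 * (null.llf - full.llf)     # likelihood-ratio statistic
p_val = chi2.sf(llr, full.df_model)  # upper-tail chi2 p-value
print(llr, p_val)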
Example #16
    def setup_class(cls):
        # adjusted for Gamma, not in test_gee.py
        vs = Independence()
        family = families.Gamma(link=links.log)
        np.random.seed(987126)
        # Y = np.random.normal(size=100)**2
        Y = np.exp(0.1 + np.random.normal(size=100))  # log-normal
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                family=family, cov_struct=vs)
        cls.result1 = mod1.fit()

        mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
        cls.result2 = mod2.fit(disp=False)
Example #17
    def setup_class(cls):
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families
        from statsmodels.base._constraints import fit_constrained

        cls.res2 = results.results_exposure_constraint
        cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

        # example with offset
        formula = 'deaths ~ smokes + C(agecat)'
        mod = GLM.from_formula(formula,
                               data=data,
                               family=families.Poisson(),
                               offset=np.log(data['pyears'].values))

        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants)
        cls.constraints = lc
        cls.res1m = mod.fit_constrained(constr)._results
Example #18
    def setup_class(cls):
        vs = Independence()
        family = families.Poisson()
        np.random.seed(987126)
        Y = np.exp(1 + np.random.normal(size=100))
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3",
                                groups,
                                D,
                                family=family,
                                cov_struct=vs)
        cls.result1 = mod1.fit()

        mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
        cls.result2 = mod2.fit(disp=False)
Example #19
    def setup_class(cls):

        vs = Independence()
        family = families.Gaussian()
        np.random.seed(987126)
        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(np.arange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        md = GEE.from_formula("Y ~ X1 + X2 + X3",
                              groups,
                              D,
                              family=family,
                              cov_struct=vs)
        cls.result1 = md.fit()

        cls.result2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D).fit()
Example #20
    def setup_class(cls):
        # adjusted for Gamma, not in test_gee.py
        vs = Independence()
        family = families.Gamma(link=links.log)
        np.random.seed(987126)
        # Y = np.random.normal(size=100)**2
        Y = np.exp(0.1 + np.random.normal(size=100))  # log-normal
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3",
                                groups,
                                D,
                                family=family,
                                cov_struct=vs)
        cls.result1 = mod1.fit()

        mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
        cls.result2 = mod2.fit(disp=False)
Example #21
    def test_poisson_epil(self):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(cur_dir, "results", "epil.csv")
        data = pd.read_csv(fname)

        fam = Poisson()
        ind = Independence()
        md1 = GEE.from_formula("y ~ age + trt + base", data,
                               groups=data["subject"], cov_struct=ind,
                               family=fam)
        mdf1 = md1.fit()

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        md2 = GLM.from_formula("y ~ age + trt + base", data,
                               family=families.Poisson())
        mdf2 = md2.fit(scale="X2")

        assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
        assert_almost_equal(mdf1.scale, mdf2.scale, decimal=6)
Example #22
def update_peaks_fit_regression(data, NE_date_ranges, NE_var, data_name_var,
                                round_date_var, peak_date_var,
                                peak_date_buffer, scalar_vars, formula,
                                max_iter, l2_weight, regression_type):
    """
    Randomly update peak times and fit regression.
    """
    NE_peak_dates_i = NE_date_ranges.apply(lambda x: np.random.choice(x, 1)[0])
    NE_peak_dates_i_df = NE_peak_dates_i.reset_index().rename(
        columns={0: peak_date_var})
    #         data_peak_dates_i = data_date_ranges.apply(lambda x: np.random.choice(x, 1)[0]).reset_index().rename(columns={0 : peak_date_var})
    data_i = pd.merge(data,
                      NE_peak_dates_i_df,
                      on=[NE_var, data_name_var],
                      how='inner')
    # reassign peaks
    data_i = data_i.assign(
        **{
            'pre_peak': (
                data_i.loc[:, round_date_var] <= data_i.loc[:, peak_date_var] -
                peak_date_buffer).astype(int),
            'post_peak': (
                data_i.loc[:, round_date_var] >= data_i.loc[:, peak_date_var] +
                peak_date_buffer).astype(int),
            'during_peak':
            ((data_i.loc[:, round_date_var] > data_i.loc[:, peak_date_var] -
              peak_date_buffer)
             & (data_i.loc[:, round_date_var] < data_i.loc[:, peak_date_var] +
                peak_date_buffer)).astype(int),
        })
    # add days since post-peak
    data_i = data_i.assign(
        **{
            'since_peak':
            data_i.loc[:, 'post_peak'] *
            (data_i.loc[:, round_date_var] - data_i.loc[:, peak_date_var])
        })
    # Z-norm all scalar vars
    scaler = StandardScaler()
    for v in scalar_vars:
        data_i = data_i.assign(
            **
            {v: scaler.fit_transform(data_i.loc[:, v].values.reshape(-1, 1))})
    model_full = GLM.from_formula(formula,
                                  data_i,
                                  family=Binomial(link=logit()))
    logging.debug(
        '%d/%d/%d pre/during/post data' %
        (data_i.loc[:, 'pre_peak'].sum(), data_i.loc[:, 'during_peak'].sum(),
         data_i.loc[:, 'post_peak'].sum()))
    if (regression_type == 'regularized_logit'):
        model_res_full = model_full.fit_regularized(maxiter=max_iter,
                                                    method='elastic_net',
                                                    alpha=l2_weight,
                                                    L1_wt=0.0)
        model_res_full_err = compute_err_data(model_res_full)
        err = model_res_full_err.loc[:, 'SE']
    else:
        model_res_full = model_full.fit()
        err = model_res_full.bse
    params = model_res_full.params
    return params, err, NE_peak_dates_i
Example #23
def run_regression(data,
                   formula,
                   regression_type,
                   dep_var='anchor',
                   out_dir='../../output',
                   split_var=None,
                   split_var_val=0):
    """
    Run logit regression on data with given formula 
    and write to file.
    Option: use regularized logit (reduce variable inflation).
    
    :param data: full data
    :param formula: regression formula
    :param regression_type: type of regression (logit|regularized_logit)
    :param dep_var: dependent variable
    :param out_dir: output directory
    :param split_var: optional variable to split data (e.g. only organization accounts)
    :param split_var_val: value of split value variable (if included)
    """
    l2_weight = 0.01
    max_iter = 100
    model_full = GLM.from_formula(formula, data, family=Binomial(link=logit()))
    if (regression_type == 'regularized_logit'):
        model_res_full = model_full.fit_regularized(maxiter=max_iter,
                                                    method='elastic_net',
                                                    alpha=l2_weight,
                                                    L1_wt=0.0)
    else:
        model_res_full = model_full.fit()

    ## summary stats
    model_res_full_err = compute_err_data(model_res_full)
    # write to file
    reg_out_str = 'anchor_%s_output_%s.tsv' % (regression_type,
                                               formula.replace(' ', ''))
    if (split_var is not None):
        reg_out_str = 'anchor_%s_output_%s_split_%s=%s.tsv' % (
            regression_type, formula.replace(' ',
                                             ''), split_var, split_var_val)
    res_out_file = os.path.join(out_dir, reg_out_str)
    model_res_full_err.to_csv(res_out_file, sep='\t', index=True)

    ## save coeffs to file => pretty-print as LaTeX
    # need many decimal places for multiple-comparison correction across variables
    pd.options.display.float_format = '{:,.5f}'.format
    tex_out_str = reg_out_str.replace('.tsv', '.tex')
    tex_res_out_file = os.path.join(out_dir, tex_out_str)
    model_res_full_err = model_res_full_err.assign(
        **{'coeff': model_res_full_err.index})
    tex_data_cols = ['coeff', 'mean', 'SE', 'p_val']
    model_res_full_err.to_latex(tex_res_out_file,
                                columns=tex_data_cols,
                                index=False)

    ## compute regression fit parameters => deviance, AIC, etc.
    # start with chi2 test against null model
    llr, model_df, p_val = compute_chi2_null_test(model_res_full, data,
                                                  dep_var, max_iter, l2_weight)
    logging.debug('N=%d, LLR=%.5f, df=%d, p-val=%.3E' %
                  (data.shape[0], llr, model_df, p_val))
    # variance inflation factor: are some of the covariates highly collinear?
    # for sanity we only look at non-categorical vars
    cat_var_matcher = re.compile(
        r'C\(.+\)\[T\..+\]|Intercept'
    )  # format="C(var_name)[T.var_val]" ("C(username)[T.barackobama]")
    non_cat_params = [
        param for param in model_res_full.params.index
        if cat_var_matcher.search(param) is None
    ]
    for param in non_cat_params:
        VIF_i = compute_VIF(model_res_full, param)
        logging.debug('VIF test: param=%s, VIF=%.3f' % (param, VIF_i))

    ## compute accuracy on k-fold classification
    ## we would use R-squared but that doesn't work for logistic regression
    # first get data into usable format
    n_splits = 10
    accs = k_fold_acc(model_full.exog, model_full.endog, k=n_splits)
    mean_acc = np.mean(accs)
    se_acc = np.std(accs) / n_splits**.5
    logging.debug('%d-fold mean accuracy = %.3f +/- %.3f' %
                  (n_splits, mean_acc, se_acc))
Example #24
def test_weights(data, dep_var, cat_vars, scalar_vars, l2_weights):
    indep_formula = ' + '.join(
        ['C(%s)' % (cat_var) for cat_var in cat_vars] + scalar_vars)
    formula = '%s ~ %s' % (dep_var, indep_formula)
    # convert raw data to exogenous data
    # need to do this to force train/test
    # to have same features
    # shuffle rows; np.random.shuffle(df.values) is a no-op for mixed dtypes,
    # since .values returns a copy there
    data_rand = data.sample(frac=1).reset_index(drop=True)
    model_dummy = GLM.from_formula(formula,
                                   data_rand,
                                   family=Binomial(link=logit()))
    exog = model_dummy.exog
    exog_names = model_dummy.exog_names
    endog = model_dummy.endog
    # generate cross validation folds
    cross_val_folds = 10
    N = data_rand.shape[0]
    cross_val_chunk_size = float(N) / cross_val_folds
    cross_val_fold_train_idx = [
        list(
            range(int(floor(i * cross_val_chunk_size)),
                  int(ceil((i + 1) * cross_val_chunk_size))))
        for i in range(cross_val_folds)
    ]
    cross_val_fold_test_idx = [
        list(range(0, int(ceil(i * cross_val_chunk_size)))) +
        list(range(int(floor((i + 1) * cross_val_chunk_size)), N))
        for i in range(cross_val_folds)
    ]
    max_iter = 100  # not set in the original snippet; assumed, matching run_regression above
    weight_likelihoods = []
    for l2_weight in l2_weights:
        print('testing weight = %.3f' % (l2_weight))
        likelihoods_l2 = []
        for i, (train_idx_i, test_idx_i) in enumerate(
                zip(cross_val_fold_train_idx, cross_val_fold_test_idx)):
            print('fold %d' % (i))
            train_XY = data_rand.iloc[train_idx_i, :]
            test_X = exog[test_idx_i, :]
            test_Y = endog[test_idx_i]
            # fit model
            model_i = GLM.from_formula(formula,
                                       train_XY,
                                       family=Binomial(link=logit()))
            model_res_i = model_i.fit_regularized(maxiter=max_iter,
                                                  method='elastic_net',
                                                  alpha=l2_weight,
                                                  L1_wt=0.)
            # add 0 params for missing coefficients
            # to match X shape
            model_res_i.params = model_res_i.params.loc[exog_names].fillna(
                0, inplace=False)
            # score test data
            likelihood_i = compute_log_likelihood(model_res_i.params, test_Y,
                                                  test_X)
            likelihoods_l2.append(likelihood_i)
        weight_likelihoods.append(likelihoods_l2)
    weight_likelihoods = pd.DataFrame(np.array(weight_likelihoods),
                                      index=l2_weights)
    mean_weight_likelihoods = weight_likelihoods.mean(axis=1)  # average over folds for each weight
    return mean_weight_likelihoods
Example #25
                             delimiter=",",
                             names=[
                                 "male", "female", "infant", "length",
                                 "diameter", "height", "whole_weight",
                                 "shucked_weight", "viscera_weight",
                                 "shell_weight", "rings"
                             ])
sample.describe()
sample_validation.describe()
N = len(sample)
N_valid = len(sample_validation)
print("{} samples to train and {} samples to validate".format(N, N_valid))

# Define the linear model as a linear combination of the data inputs.
model_glm = GLM.from_formula(
    'rings ~ male + female + infant + length + diameter + height + whole_weight + shucked_weight + viscera_weight + shell_weight',
    sample)
model = model_glm.fit()
print(model.summary())
print(model.params, file=open('coeficients/linear_regression', 'w'))
# Make the training-set predictions
prediccions = model.predict(sample.loc[:, "male":"shell_weight"])

# Compute the metrics to evaluate the model on the training data
MAE = np.sum(abs(sample.rings - prediccions)) / N
print("MAE:", MAE)

mean_square_error = np.sum((sample.rings - prediccions)**2) / N
print("MSE:", mean_square_error)

root_mse = np.sqrt(model.deviance / N)
print("RMSE:", root_mse)
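The snippet above scores only the training data; a continuation under the same assumptions (reusing the fitted `model`, the `sample_validation` frame, and `N_valid` loaded earlier in the example) would compute the matching validation metrics:

# Evaluate the same metrics on the held-out validation data
prediccions_valid = model.predict(sample_validation.loc[:, "male":"shell_weight"])
MAE_valid = np.sum(abs(sample_validation.rings - prediccions_valid)) / N_valid
MSE_valid = np.sum((sample_validation.rings - prediccions_valid) ** 2) / N_valid
print("validation MAE:", MAE_valid)
print("validation MSE:", MSE_valid)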