Beispiel #1
0
def fit_poisson_simulation(arrivals_departures):
    """Fit separate Poisson regressions for arrivals and departures.

    Both models use the same predictors: month, hour and the weekday dummy,
    each wrapped in patsy's ``C(..., Treatment)`` categorical coding.

    Args:
        arrivals_departures: data handed to ``patsy.dmatrices``; the formulas
            reference ``arrivals``, ``departures``, ``months``, ``hours`` and
            ``weekday_dummy``.

    Returns:
        list: ``[arrivals_results, departures_results]``.
    """
    # Build response / design matrices for each direction of travel.
    arr_response, arr_design = patsy.dmatrices(
        "arrivals ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)",
        arrivals_departures,
        return_type='dataframe')
    dep_response, dep_design = patsy.dmatrices(
        "departures ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)",
        arrivals_departures,
        return_type='dataframe')

    # Missing departure counts are treated as zero departures.
    dep_response[dep_response.isnull()] = 0

    # Fit quietly (disp=0) and hand both results back in a fixed order.
    arr_fit = sm.Poisson(arr_response, arr_design).fit(disp=0)
    dep_fit = sm.Poisson(dep_response, dep_design).fit(disp=0)

    return [arr_fit, dep_fit]
Beispiel #2
0
 def setupClass(cls):
     """Fit the unregularized and the L1-regularized Poisson models under test.

     ``res_unreg`` is fitted on the first ``cls.m`` columns of the randhie
     design matrix (constant prepended). ``res_reg`` is fitted on every column
     with an L1 penalty that is zero for those first ``cls.m`` columns and
     large for the remaining ones.
     """
     cls.kvars = 10  # Number of variables
     cls.m = 7  # Number of unregularized parameters
     dataset = sm.datasets.randhie.load()
     n_obs = len(dataset.exog)
     design = sm.add_constant(
         dataset.exog.view(float).reshape(n_obs, -1), prepend=True)

     # Unregularized reference fit on the leading columns only.
     unpenalized_design = design[:, :cls.m]
     unreg_model = sm.Poisson(dataset.endog, unpenalized_design)
     cls.res_unreg = unreg_model.fit(method="newton", disp=False)

     # Heavy penalty on every column past the first m, none on the others.
     penalty = 10 * len(dataset.endog) * np.ones(cls.kvars)
     penalty[:cls.m] = 0
     reg_model = sm.Poisson(dataset.endog, design)
     cls.res_reg = reg_model.fit_regularized(
         method='l1', alpha=penalty, disp=False, acc=1e-10, maxiter=2000,
         trim_mode='auto')
    def fit_poisson_using_goals(self, matches, team_name, scored):
        """Fit a Poisson regression of goals on the opponent's pre-match rating.

        Each match contributes one row ``[opponent_rating_before_match, 1]``
        (the trailing 1 is the intercept term) and one goal count: the goals
        scored by ``team_name`` when ``scored`` is truthy, otherwise the goals
        it conceded. A team that is not the home side is treated as the away
        side.

        Returns the fitted statsmodels results, optimized with
        ``self.OPTIMIZATION_METHOD``.
        """
        design_rows = []
        goal_counts = []

        for game in matches:
            playing_at_home = game.home_team == team_name

            # Opponent rating before the match = resulting rating minus change.
            if playing_at_home:
                opponent_rating = (game.away_team_resulting_rating -
                                   game.away_team_rating_change)
            else:
                opponent_rating = (game.home_team_resulting_rating -
                                   game.home_team_rating_change)
            design_rows.append([opponent_rating, 1])  # 1 is the a0 term.

            # Pick the score of whichever side the caller asked about.
            want_home_score = playing_at_home if scored else not playing_at_home
            if want_home_score:
                goal_counts.append(game.home_team_score)
            else:
                goal_counts.append(game.away_team_score)

        model = sm.Poisson(goal_counts, design_rows)
        return model.fit(method=self.OPTIMIZATION_METHOD)
Beispiel #4
0
def main(args):
    """Fit an intercept-only Poisson model per subject and write a CSV summary.

    For every ``(subject, counts)`` pair yielded by
    ``utils.event_partition_generator(args, num_days=7)`` this records the
    sample mean, variance and length of ``counts`` together with the fitted
    intercept and the negated log-likelihood. The table is written to
    ``<args.working_dir>/params_poisson_count.csv``.

    A failed fit is logged and stored as NaN so one bad subject does not abort
    the whole run.

    Args:
        args: run configuration; ``working_dir`` is read here and the whole
            object is forwarded to ``utils.event_partition_generator``.
    """
    logger.info('==================================')
    logger.info('COUNT POISSON FIT')

    columns = [
        'id', 'lambda', 'count_mean', 'count_var', 'count_len', 'loglikelihood'
    ]

    # Collect plain dict rows and build the frame once at the end. Writing
    # through ``df[col].values[i]`` is a chained assignment that depends on
    # the column dtype and stops working under pandas copy-on-write.
    rows = []
    for subj, counts in utils.event_partition_generator(args, num_days=7):
        count_mean = np.mean(counts)
        count_var = np.var(counts)
        count_len = len(counts)

        try:
            res = sm.Poisson(counts, np.ones_like(counts)).fit(disp=0)
            # NOTE(review): this is the fitted intercept of a log-link model,
            # i.e. presumably log(rate) rather than the rate itself - confirm
            # what downstream consumers of the 'lambda' column expect.
            lambda_param = np.asarray(res.params)[0]
            loglikelihood = -res.llf  # stored with the sign flipped
        except Exception:
            # Best effort: record NaNs for this subject and keep going.
            logger.info('Could not fit Poisson on %s', subj)
            lambda_param, loglikelihood = np.nan, np.nan

        rows.append({
            'id': subj,
            'lambda': lambda_param,
            'count_mean': count_mean,
            'count_var': count_var,
            'count_len': count_len,
            'loglikelihood': loglikelihood,
        })

    df_poisson_params = pd.DataFrame(rows, columns=columns)
    df_poisson_params.to_csv(os.path.join(args.working_dir,
                                          'params_poisson_count.csv'),
                             index=False)
def tiny_poisson(l):
    """Fit a Poisson model on a small sample and score its 99th percentile.

    Args:
        l: sequence of rows; element 0 of each row is the response and the
            remaining elements are the predictors.

    Returns:
        ``[model_results, ppf_obs, rmse]`` on success, where ``ppf_obs`` is the
        0.99 quantile of the fitted Poisson (capped at 150) and ``rmse``
        compares it with the observed response. When the response is all
        zeros, the fit hits a singular matrix, or an assertion fails, the value
        of ``return_zeros(ytr, reason)`` is returned instead. An empty list is
        returned when a ValueError is raised or a LinAlgError is not a
        singular-matrix error.
    """
    xtr = np.array([item[1:] for item in l])
    ytr = np.array([item[0] for item in l]).reshape(-1, 1)

    poi_res = []
    try:
        if np.count_nonzero(ytr) > 0:
            # method nm works without singular mat
            poi_mod = sm.Poisson(ytr, xtr).fit(
                method="nm", maxiter=10000, disp=0, maxfun=10000)
            poi_mean_pred = poi_mod.predict(xtr)
            poi_ppf_obs = stats.poisson.ppf(q=0.99, mu=poi_mean_pred)
            poi_ppf_obs[poi_ppf_obs > 150] = 150
            poi_rmse_tr = np.sqrt(mean_squared_error(ytr, poi_ppf_obs))
            poi_res = [poi_mod, poi_ppf_obs, poi_rmse_tr]
        else:
            poi_res = return_zeros(ytr, "AllZeros")

    except np.linalg.LinAlgError as e:
        if 'Singular matrix' in str(e):
            # Record how zero-heavy the sample was, then fall back to zeros.
            nzeros = len(ytr) - np.count_nonzero(ytr)
            prop = round((100 * nzeros) / len(ytr), 2)
            poi_prop_err_singmat.append(prop)
            # The fallback used to be bound to ``nb_res`` and was therefore
            # never returned; it has to land in ``poi_res``.
            poi_res = return_zeros(ytr, "Singular")
    except AssertionError:
        poi_res = return_zeros(ytr, "Assert")
    except ValueError:
        print("\t\t\tIgnored output containing np.nan or np.inf")
    return poi_res
Beispiel #6
0
 def setup(self):
     """Fit a fresh L1-regularized Poisson model before each test.

     The fit is repeated per test because the tests modify the results.
     """
     design = self.exog
     np.random.seed(987689)
     rate = np.exp(design.sum(1) - design.mean())
     counts = np.random.poisson(rate)
     self.results = sm.Poisson(counts, design).fit_regularized(
         method='l1', disp=0, alpha=10)
def core(X, Y, Z=None):
    '''
    Fit a Poisson model on log(Y / Z) and collect the result tables.

    X: predictor variables (a constant column is appended here)
    Y: observed target values, a Series
    Z: base (exposure) values; may be None or a Series of the same length
       as Y. When None, a base of 1 is used for every observation.

    Returns a dict with three DataFrames: the likelihood-ratio test and
    fit summary, the coefficient table, and actual vs. predicted values.
    '''
    X = sm.add_constant(X, prepend=False)
    X = X.rename(columns={'const': '截距'})

    if Z is None:
        # Build the default base on Y's own index. A Series with a fresh
        # RangeIndex would be aligned label-by-label in ``Y / Z`` and turn
        # every row of a non-default-indexed Y into NaN.
        Z = pd.Series(1, index=Y.index)

    ### divide by the base, then take the logarithm
    y = Y / Z
    y_log = np.log(y)

    # building the model
    # NOTE(review): the Poisson model is fitted to the log of the rate, not to
    # counts; this looks unusual - confirm it is intended.
    poisson_mod = sm.Poisson(y_log, X)
    res = poisson_mod.fit(method="bfgs")
    y_pre = res.predict(X)

    # Undo the log transform and the division by the base.
    Y_predict = np.exp(y_pre) * Z
    Y_predict.name = '预测值'
    df_predict_result = Y.to_frame(name='实际值').join(Y_predict)

    #model description
    # Round-trip the summary tables through HTML to get them as DataFrames;
    # table index 1 is used as the coefficient table.
    tables = res.summary().tables
    df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]
    dfinfo1 = df_list[1].fillna('Variables').set_index(0)
    dfinfo1 = dfinfo1.T.set_index('Variables').T
    dfinfo1.index.name = '项'
    dfinfo1.columns.name = '参数类型'
    dfinfo1.columns = ['回归系数', '标准误差', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)']
    dfinfo1['or值'] = np.exp(res.params)
    dfinfo1 = dfinfo1.round(3)

    R_Squared = r2_score(y_log, y_pre)

    tb2 = {
        'BIC': res.bic,
        'AIC': res.aic,
        'df': res.df_model,
        'p': res.llr_pvalue,
        '似然比卡方值': res.llr,
        'R²': R_Squared,
        'Pseud_R²': res.prsquared
    }
    dfinfo2 = pd.DataFrame([tb2]).round(3)
    dfinfo2 = dfinfo2.set_index('似然比卡方值')

    r = {
        '模型似然比检验和效果汇总': dfinfo2,
        'Poisson回归分析结果汇总': dfinfo1,
        '实际值与预测值': df_predict_result
    }

    return r
Beispiel #8
0
def fit_poisson(snowflake_connection,
                cfg,
                station,
                include_rebalance=False,
                time_interval='1H'):
    """Fit Poisson regressions for one station's arrivals and departures.

    Args:
        snowflake_connection: connection forwarded to
            ``stations.GetStationData``.
        cfg: configuration forwarded to ``stations.GetStationData``.
        station: mapping with ``"station_id"``, ``"latitude"`` and
            ``"longitude"`` entries.
        include_rebalance: forwarded to ``rebalance_station_poisson_data``.
        time_interval: interval string forwarded to
            ``rebalance_station_poisson_data``.

    Returns:
        list: ``[arrivals_results, departures_results]``.
    """
    # Use the correct delta data
    station_updates = stations.GetStationData(snowflake_connection, cfg,
                                              station["station_id"],
                                              station["latitude"],
                                              station["longitude"])
    # Forward the caller's choice; this was hard-coded to False, which made
    # the ``include_rebalance`` parameter a no-op.
    arrivals_departures = rebalance_station_poisson_data(
        station_updates,
        station["station_id"],
        time_interval,
        include_rebalance=include_rebalance)
    # Create design matrix for months, hours, and weekday vs. weekend.
    # We can't just create a "month" column to toss into our model, because it doesnt
    # understand what "June" is. Instead, we need to create a column for each month
    # and code each row according to what month it's in. Ditto for hours and weekday (=1).

    y_arr, X_arr = patsy.dmatrices(
        "arrivals ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)",
        arrivals_departures,
        return_type='dataframe')
    y_dep, X_dep = patsy.dmatrices(
        "departures ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)",
        arrivals_departures,
        return_type='dataframe')

    # Missing departure counts are treated as zero.
    y_dep[pd.isnull(y_dep)] = 0

    # Fit poisson distributions for arrivals and departures
    arr_poisson_model = sm.Poisson(y_arr, X_arr)
    arr_poisson_results = arr_poisson_model.fit(disp=0)

    dep_poisson_model = sm.Poisson(y_dep, X_dep)
    dep_poisson_results = dep_poisson_model.fit(disp=0)

    poisson_results = [arr_poisson_results, dep_poisson_results]

    return poisson_results
Beispiel #9
0
 def setup(self):
     """Fit a fresh BFGS Poisson model before each test.

     The fit is repeated per test because the tests modify the results.
     """
     design = self.exog
     np.random.seed(987689)
     rate = np.exp(design.sum(1) - design.mean())
     counts = np.random.poisson(rate)
     # starting close to the solution makes the optimizer converge faster
     initial_guess = np.array(
         [0.75334818, 0.99425553, 1.00494724, 1.00247112])
     poisson_model = sm.Poisson(counts, design)
     self.results = poisson_model.fit(
         start_params=initial_guess, method='bfgs', disp=0)
Beispiel #10
0
def test_poisson_newton():
    """GH 24: Newton is expected NOT to converge from this starting point."""
    n_obs = 10000
    np.random.seed(987689)
    design = sm.add_constant(np.random.randn(n_obs, 3), prepend=True)
    counts = np.random.poisson(np.exp(design.sum(1)))
    fitted = sm.Poisson(counts, design).fit(
        start_params=-np.ones(4), method='newton', disp=0)
    assert_(not fitted.mle_retvals['converged'])
Beispiel #11
0
def train_glm_sm_unconstrained(xtrain, ytrain, tmodel):
	"""Fit an unconstrained linear, logistic or Poisson model.

	Args:
		xtrain: design matrix.
		ytrain: response values.
		tmodel: "linear"/"lin", "logistic"/"log" or "poisson"/"poi".

	Returns:
		tuple: ``(model, params)`` - the unfitted statsmodels model object
		and the fitted parameter vector.

	Raises:
		ValueError: if ``tmodel`` is not one of the supported names.
	"""
	# Reject unknown model names up front; previously they fell through the
	# if/elif chain and crashed with an UnboundLocalError on ``model``.
	if tmodel in ("linear", "lin"):
		family = "linear"
	elif tmodel in ("logistic", "log"):
		family = "logistic"
	elif tmodel in ("poisson", "poi"):
		family = "poisson"
	else:
		raise ValueError("unknown tmodel: %r" % (tmodel,))

	# initalize model
	if family == "linear":
		model = sm.OLS(ytrain, xtrain)
	elif family == "logistic":
		model = sm.Logit(ytrain, xtrain)
	else:
		model = sm.Poisson(ytrain, xtrain)
	result = model.fit(disp=0)
	return model, result.params
Beispiel #12
0
 def test_pd_offset_exposure(self):
     """Smoke test: the count models fit with pandas offset/exposure inputs."""
     response = pd.DataFrame({'F': [0.0, 0.0, 0.0, 0.0, 1.0]})
     design = pd.DataFrame({'I': [1.0, 1.0, 1.0, 1.0, 1.0],
                            'C': [0.0, 1.0, 0.0, 1.0, 0.0]})
     exposure = pd.Series([1., 1, 1, 2, 1])
     offset = pd.Series([1, 1, 1, 2, 1])

     # Plain Poisson with a pandas offset.
     poisson_model = sm.Poisson(endog=response, exog=design, offset=offset)
     poisson_model.fit()

     # Zero-inflated Poisson with a pandas exposure, once per inflation link.
     for link in ('logit', 'probit'):
         zip_model = sm.ZeroInflatedPoisson(endog=response, exog=design["I"],
                                            exposure=exposure,
                                            inflation=link)
         zip_model.fit()
Beispiel #13
0
def test_poi_nb_zip_zinb_tiny_subset(meta, m):
    """Compare four count models on an 800-row random subset of ``m``.

    Rows are drawn with replacement using a fixed seed. Column 0 of ``m`` is
    the response and the remaining columns are predictors. Each model is
    fitted on 60% of the subset and its RMSE on the other 40% is printed.
    The two Poisson variants are scored on the 0.95 quantile of the fitted
    Poisson distribution, the two negative-binomial variants on the predicted
    mean.

    Args:
        meta: unused here.
        m: 2-D array, response in column 0.
    """
    # Predictor names, kept for the optional ``summary(xname=...)`` call.
    exog_names = r"rowid;latitude;longitude;target;dbuiltup;dforest;drecreation;dbrr;dwrl;dwrn;dwrr;dcamping;dcaravan;dcross;dgolf;dheem;dhaven;dsafari;dwater;attr;dbath;lu;lc;maxmeanhaz;maxstdhaz".split(";")[4:]

    np.random.seed(2)

    # ``high`` is exclusive, so it must be len(m) for the last row to be
    # drawable; len(m) - 1 silently excluded it.
    randint = np.random.randint(0, high=len(m), size=800)

    msel = m[randint, :]

    Y = msel[:, 0]
    X = msel[:, 1:]

    print("Msel shape: ", msel.shape)

    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, train_size=0.60, random_state=42)

    print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

    # A bare ``print`` is a no-op expression in Python 3; call it so the
    # blank separator line is actually written.
    print()
    print("Model: Poisson")
    poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50)
    poi_mean_pred = poi_mod.predict(xtest)
    poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred)
    poi_rmse = np.sqrt(mean_squared_error(ytest, poi_ppf_obs))
    print("RMSE Poisson: ", poi_rmse)

    print()
    print("Model: Neg. Binomial")
    nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(start_params=None, method='newton', maxiter=50)
    nb_pred = nb_mod.predict(xtest)
    nb_rmse = np.sqrt(mean_squared_error(ytest, nb_pred))
    print("RMSE Negative Binomial: ", nb_rmse)

    print()
    print("Model: Zero Inflated Poisson")
    zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(method="newton", maxiter=50)
    # The inflation part only has a constant, so predict with a column of ones.
    zip_mean_pred = zip_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1)))
    zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
    zip_rmse = np.sqrt(mean_squared_error(ytest, zip_ppf_obs))
    print("RMSE Zero-Inflated Poisson", zip_rmse)

    print()
    print("Model: Zero Inflated Neg. Binomial")
    zinb_mod = sm.ZeroInflatedNegativeBinomialP(ytrain, xtrain).fit(method="newton", maxiter=50)
    zinb_pred = zinb_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1)))
    zinb_rmse = np.sqrt(mean_squared_error(ytest, zinb_pred))
    print("RMSE Zero-Inflated Negative Binomial: ", zinb_rmse)
Beispiel #14
0
    def setup(self):
        """Fit a fresh BFGS Poisson model and set the predict keywords.

        The fit is repeated per test because the tests modify the results.
        """
        design = self.exog
        np.random.seed(987689)
        rate = np.exp(design.sum(1) - design.mean())
        counts = np.random.poisson(rate)
        # exposure/offset are deliberately not passed here (bug with default)
        poisson_model = sm.Poisson(counts, design)
        # starting close to the solution makes the optimizer converge faster
        initial_guess = np.array(
            [0.75334818, 0.99425553, 1.00494724, 1.00247112])
        self.results = poisson_model.fit(
            start_params=initial_guess, method='bfgs', disp=0)

        #TODO: temporary, fixed in master
        self.predict_kwds = {'exposure': 1, 'offset': 0}
Beispiel #15
0
def regression(df, a, b, c, d, distribution):
    """Estimate vaccine efficacy (VE) with a regression model and report it.

    Approach described at
    https://timeseriesreasoning.com/contents/estimation-of-vaccine-efficacy-using-logistic-regression/
    * Patsy carves out the X and y matrices for ``INFECTED ~ VACCINATED``.
    * The model chosen by ``distribution`` is fitted, and the VACCINATED
      coefficient and its confidence bounds are converted with ``VE_``.

    Args:
        df: data with ``INFECTED`` and ``VACCINATED`` columns.
        a: sick vax (unused here).
        b: sick unvax.
        c: total vax (unused here).
        d: total unvax.
        distribution: ``"logit"``, ``"poisson"`` or ``"neg_bin"``.

    Returns:
        None. The result line is written with ``stl.write``.

    Raises:
        ValueError: if ``distribution`` is not one of the supported names.
    """
    # Validate first so a typo fails with a clear message instead of an
    # UnboundLocalError on ``model`` after the data has been prepared.
    model_names = {
        "logit": "Logit",
        "poisson": "Poisson",
        "neg_bin": "NegativeBinomial",
    }
    if distribution not in model_names:
        raise ValueError(
            "unknown distribution %r; expected one of %s" %
            (distribution, sorted(model_names)))

    p_sick_unvax = b / d
    #Form the regression equation
    expr = 'INFECTED ~  VACCINATED'

    #We'll use Patsy to carve out the X and y matrices
    y_train, X_train = dmatrices(expr, df, return_type='dataframe')

    #Build and train the selected model. ``disp`` is a fit() option, not a
    #constructor option, so it is passed to fit() only.
    model_cls = getattr(sm, model_names[distribution])
    model = model_cls(endog=y_train, exog=X_train)

    results = model.fit(disp=False)
    params = results.params

    # Row 1 is the VACCINATED term (row 0 is the intercept). Use explicit
    # positional access; integer keys on a label-indexed Series are deprecated.
    VE = VE_(params.iloc[1], p_sick_unvax)

    conf = results.conf_int()
    coef_lower, coef_upper = conf.iloc[1, 0], conf.iloc[1, 1]
    prsquared = results.prsquared
    # The upper coefficient bound feeds VE_low and the lower bound VE_high,
    # exactly as before.
    VE_low, VE_high = VE_(coef_upper, p_sick_unvax), VE_(coef_lower, p_sick_unvax)
    stl.write(
        f"VE Regression {distribution}                       : {VE} % [{VE_low} , {VE_high}] | pseudo-R2 = {prsquared}"
    )
Beispiel #16
0
def tiny_poisson(l):
    """Fit a Poisson model on a small sample and return its predictions.

    Args:
        l: sequence of rows; element 0 of each row is the response and the
            remaining elements are the predictors.

    Returns:
        list: ``[model_results, mean_pred, ppf_obs]`` where ``mean_pred`` is
        the in-sample predicted mean and ``ppf_obs`` the 0.95 quantile of the
        fitted Poisson. Entries that were not reached before a
        ``LinAlgError`` stay ``None``.
    """
    mean_pred, ppf_obs, poi_mod = None, None, None
    xtr = np.array([item[1:] for item in l])
    ytr = np.array([item[0] for item in l]).reshape(-1, 1)
    try:
        poi_mod = sm.Poisson(ytr, xtr).fit()
        mean_pred = poi_mod.predict(xtr)  # or use a new x
        # The survival-function and pmf values that used to be computed here
        # were never used or returned, so they have been dropped.
        ppf_obs = stats.poisson.ppf(q=0.95, mu=mean_pred)
    except np.linalg.LinAlgError as e:
        if 'Singular matrix' in str(e):
            print("Ignored a singular matrix.")
    return [poi_mod, mean_pred, ppf_obs]
Beispiel #17
0
def poisson_reg(train_df, test_df):
    """Fit a Poisson regression on ``train_df`` and print in-sample errors.

    Prints the head of the response and design matrix, then the head of the
    predictions and absolute errors, and finally the mean absolute error.

    Args:
        train_df: DataFrame with a ``total_cases`` response column; every
            other column is used as a predictor. It is not modified.
        test_df: unused here.
    """
    y = train_df['total_cases']
    # Drop into a new frame rather than in place, so the caller's DataFrame
    # still has its ``total_cases`` column afterwards.
    design = add_constant(train_df.drop('total_cases', axis=1))

    print(y.head(10))
    print(design.head(10))

    poisson_model = sm.Poisson(y, design).fit()
    preds = poisson_model.predict(design)
    diff = abs(preds - y)
    print(preds.head(10))
    print(diff.head(10))
    print(np.mean(diff))
Beispiel #18
0
def fit(dataframe, target, city, station):
    ''' Train the Poisson process to predict bikes or spaces.

    Every column other than ``bikes`` and ``spaces`` is used as a
    Treatment-coded categorical predictor of ``target``.

    Args:
        dataframe: data containing the target and the feature columns.
        target: name of the response column.
        city: unused here.
        station: unused here.

    Returns:
        ``exp`` of the sum of all fitted parameters.
    '''
    features = [
        column for column in dataframe.columns
        if column not in ['bikes', 'spaces']
    ]
    # Create a GLM style formula (target ~ features). The contrast belongs
    # inside the C(...) call; 'C(x), Treatment' is not a valid patsy term.
    formula = '{0} ~ {1}'.format(
        target, ' + '.join(
            ['C({}, Treatment)'.format(feature) for feature in features]))
    y, X = dmatrices(formula, dataframe, return_type='dataframe')
    model = sm.Poisson(y, X)
    parameters = model.fit(disp=0).params
    estimatedLambda = np.exp(np.sum(parameters))
    return estimatedLambda
Beispiel #19
0
def test_poisson_predict():
    """GH 175: Poisson predict works with and without offset and exposure."""
    dataset = sm.datasets.randhie.load()
    design = sm.add_constant(dataset.exog, prepend=True)
    fitted = sm.Poisson(dataset.endog, design).fit(method='newton', disp=0)

    # Default predict() must match predict on the training design matrix.
    baseline = fitted.predict()
    assert_almost_equal(baseline, fitted.predict(design))

    # Neutral offset/exposure values leave the prediction unchanged.
    neutral = fitted.predict(design, offset=0, exposure=1)
    assert_almost_equal(baseline, neutral)

    # Doubling can be requested through exposure or through a log(2) offset.
    doubling_options = (
        {'offset': 0, 'exposure': 2},
        {'offset': np.log(2), 'exposure': 1},
    )
    for options in doubling_options:
        doubled = fitted.predict(design, **options)
        assert_almost_equal(2 * baseline, doubled)
Beispiel #20
0
def SPPoisson(context):
    """Fit a Poisson regression on the columns selected in ``context.args``.

    Reads ``inputData``, ``featureColumns``, ``labelColumn``, ``missing`` and
    ``method`` from ``context.args`` and returns the fitted results.
    """
    # Pull the node arguments out of the context
    args = context.args
    # args.inputData holds the data sent by the previous node
    frame = args.inputData

    feature_names = args.featureColumns
    label_name = args.labelColumn

    design = frame[feature_names].values
    target = frame[label_name].values

    model = sm.Poisson(target, design, missing=args.missing)
    return model.fit(method=args.method)
Beispiel #21
0
def tiny_poisson(l):
    """Fit a Newton-optimized Poisson model and score its 0.95 quantile.

    Element 0 of each row in ``l`` is the response; the rest are predictors.
    Returns ``[model_results, ppf_obs, rmse]``. Entries not reached before a
    handled error keep their defaults (``None``, ``None``, ``0``).
    """
    print("\t\tRunning Poisson")
    fitted = None
    upper_quantile = None
    rmse = 0
    predictors = np.array([row[1:] for row in l])
    response = np.array([row[0] for row in l]).reshape(-1, 1)
    try:
        fitted = sm.Poisson(response, predictors).fit(
            method="newton", maxiter=50, disp=0)
        expected = fitted.predict(predictors)
        # 0.95 quantile of the fitted Poisson at each in-sample mean
        upper_quantile = stats.poisson.ppf(q=0.95, mu=expected)
        rmse = np.sqrt(mean_squared_error(response, upper_quantile))
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' in str(err):
            print("\t\t\tIgnored a singular matrix.")
    except ValueError:
        print("\t\t\tIgnored output containing np.nan or np.inf")

    return [fitted, upper_quantile, rmse]
Beispiel #22
0
def train_glm_sm(xtrain, ytrain, tmodel, constraints=None):
	"""Fit a linear, logistic or Poisson model, optionally with constraints.

	With ``constraints`` the model is a ``sm.GLM`` of the matching family
	fitted through ``fit_constrained``. Without them the dedicated
	``OLS``/``Logit``/``Poisson`` classes are fitted with the Hessian
	computation skipped.

	Args:
		xtrain: design matrix.
		ytrain: response values.
		tmodel: "linear"/"lin", "logistic"/"log" or "poisson"/"poi".
		constraints: constraints forwarded to ``fit_constrained``, or None.

	Returns:
		tuple: ``(model, params)`` - the statsmodels model object and the
		fitted parameter vector.

	Raises:
		ValueError: if ``tmodel`` is not one of the supported names.
	"""
	# Reject unknown model names up front; previously they fell through the
	# if/elif chains and crashed with an UnboundLocalError.
	if tmodel in ("linear", "lin"):
		family = "linear"
	elif tmodel in ("logistic", "log"):
		family = "logistic"
	elif tmodel in ("poisson", "poi"):
		family = "poisson"
	else:
		raise ValueError("unknown tmodel: %r" % (tmodel,))

	# Identity test: ``!= None`` is evaluated element-wise for array-like
	# constraints and then has no single truth value.
	if constraints is not None:
		if family == "linear":
			model = sm.GLM(ytrain, xtrain, family=sm.families.Gaussian())
		elif family == "logistic":
			model = sm.GLM(ytrain, xtrain, family=sm.families.Binomial())
		else:
			model = sm.GLM(ytrain, xtrain, family=sm.families.Poisson())
		result = model.fit_constrained(constraints)
	else:
		if family == "linear":
			model = sm.OLS(ytrain, xtrain)
			result = model.fit(disp=0, skip_hessian=True)
		elif family == "logistic":
			model = sm.Logit(ytrain, xtrain)
			result = model.fit(disp=0, method="newton", skip_hessian=True)
		else:
			model = sm.Poisson(ytrain, xtrain)
			result = model.fit(disp=0, method="newton", skip_hessian=True)
	return model, result.params
Beispiel #23
0
def fit_model(papers_an, tm, comps_n):
    """Fit a regularized Poisson model of citation counts.

    The predictors are ``year``, ``is_comp`` and ``num_auth`` plus a constant.
    When ``comps_n`` is non-zero, ``comps_n`` PCA components computed from
    ``tm`` (all columns but the first, rows with missing values dropped) are
    merged in on ``article_id`` and added as predictors.

    Returns the result of ``sm.Poisson(...).fit_regularized(cov_type="HC1")``.
    """
    base_predictors = ["year", "is_comp", "num_auth"]

    if comps_n == 0:
        reg_data = papers_an.copy()
        predictor_columns = base_predictors
    else:
        reducer = PCA(n_components=comps_n)
        component_scores = reducer.fit_transform(tm.iloc[:, 1:].dropna())
        tm_pca = pd.DataFrame(component_scores).assign(
            article_id=tm['article_id'])

        # Component columns are integers; make every label a string.
        tm_pca.columns = [str(label) for label in tm_pca]

        reg_data = papers_an.merge(tm_pca, on='article_id')
        component_columns = tm_pca.drop(
            axis=1, labels=['article_id']).columns.tolist()
        predictor_columns = base_predictors + component_columns

    endog = reg_data["citation_count"].astype(float)
    exog = add_constant(reg_data[predictor_columns]).astype(float)
    return sm.Poisson(endog=endog, exog=exog).fit_regularized(cov_type="HC1")
Beispiel #24
0
def test_poi_nb_zip_zinb_raw_data(meta, m):
    """Compare four count models on the raw data in ``m``.

    Column 0 of ``m`` is the response and the remaining columns are
    predictors. Each model is fitted on 60% of the rows and its RMSE on the
    other 40% is printed. The two Poisson variants are scored on the 0.95
    quantile of the fitted Poisson distribution, the two negative-binomial
    variants on the predicted mean.

    Args:
        meta: unused here.
        m: 2-D array, response in column 0.
    """
    Y = m[:, 0]
    X = m[:, 1:]
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, train_size=0.60, random_state=77)

    print("Training with: ", xtrain.shape, ytrain.shape)
    print("Testing with: ", xtest.shape, ytest.shape)

    print()
    print("Model: Poisson")
    poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50)
    poi_mean_pred = poi_mod.predict(xtest)
    poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred)
    poi_rmse = np.sqrt(mean_squared_error(ytest, poi_ppf_obs))

    print("Model: Zero Inflated Poisson")
    zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(method="newton", maxiter=50)
    # The inflation part only has a constant, so predict with a column of ones.
    zip_mean_pred = zip_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1)))
    zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
    zip_rmse = np.sqrt(mean_squared_error(ytest, zip_ppf_obs))

    print("Model: Zero Inflated Neg. Binomial")
    zinb_mod = sm.ZeroInflatedNegativeBinomialP(ytrain, xtrain).fit(method="newton", maxiter=50)
    zinb_pred = zinb_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1)))
    zinb_rmse = np.sqrt(mean_squared_error(ytest, zinb_pred))

    # This section was a mislabelled second ZINB fit: it scored test-set
    # predictions against ``ytrain`` (wrong length) and left ``nb_rmse``,
    # which is printed below, undefined. Fit the negative binomial here.
    print()
    print("Model: Neg. Binomial")
    nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(start_params=None, method="newton", maxiter=50)
    nb_pred = nb_mod.predict(xtest)
    nb_rmse = np.sqrt(mean_squared_error(ytest, nb_pred))

    print("RMSE Poisson: ", poi_rmse)
    print("RMSE Negative Binomial: ", nb_rmse)
    print("RMSE Zero-Inflated Poisson", zip_rmse)
    print("RMSE Zero-Inflated Negative Binomial: ", zinb_rmse)
Beispiel #25
0
# Excerpt of a discrete-choice / count-model walkthrough. ``y``, ``X`` and
# ``df`` are defined outside this excerpt.

# binary outcome: fit a Logit model
m_logit = sm.Logit(y, X).fit()  # option: Probit
print(m_logit.summary2())  # estimation summary
y_pred = m_logit.predict(X)  # fitted/predicted values
# classify at the 0.5 threshold and tabulate against the observed outcome
print(confusion_matrix(y, (y_pred > .5).astype(int)))

# nominal data models (not tested)
y = df.y_nominal  # DV
mn_logit = sm.MNLogit(y, X).fit()
print(mn_logit.summary2())  # estimation summary
y_pred = mn_logit.predict(X)  # fitted/predicted values
# NOTE(review): MNLogit.predict presumably returns one probability column per
# category, so thresholding at 0.5 may not yield class labels - confirm.
print(confusion_matrix(y, (y_pred > .5).astype(int)))

# count data models (w/ exposure!)
# every model below receives the same exposure, df['x_timespan']
y = df.y_count  # DV

m_poiss = sm.Poisson(
    y, X, exposure=df['x_timespan'].values).fit()
print(m_poiss.summary2())

# negative binomial with the 'nb2' log-likelihood
m_NB2 = sm.NegativeBinomial(
    y, X, loglike_method='nb2', exposure=df['x_timespan'].values).fit()
print(m_NB2.summary2())

# negative binomial with the 'nb1' log-likelihood
m_NB1 = sm.NegativeBinomial(
    y, X, loglike_method='nb1', exposure=df['x_timespan'].values).fit()
print(m_NB1.summary2())

# NegativeBinomialP variant of the negative binomial model
m_NBP = sm.NegativeBinomialP(
    y, X, exposure=df['x_timespan'].values).fit()
print(m_NBP.summary2())

#endregion
# ``mlogit_mod`` is defined outside this excerpt; fit it and show the
# estimated coefficients.
mlogit_res = mlogit_mod.fit()
print(mlogit_res.params)

# ## Poisson
#
# Load the Rand data. Note that this example is similar to Cameron and
# Trivedi's `Microeconometrics` Table 20.5, but it is slightly different
# because of minor changes in the data.

rand_data = sm.datasets.randhie.load()
rand_exog = rand_data.exog
# append the constant as the last column (prepend=False)
rand_exog = sm.add_constant(rand_exog, prepend=False)

# Fit Poisson model:

poisson_mod = sm.Poisson(rand_data.endog, rand_exog)
poisson_res = poisson_mod.fit(method="newton")
print(poisson_res.summary())

# ## Negative Binomial
#
# The negative binomial model gives slightly different results.

# same endog/exog as the Poisson fit above; disp=False silences the
# optimizer output
mod_nbin = sm.NegativeBinomial(rand_data.endog, rand_exog)
res_nbin = mod_nbin.fit(disp=False)
print(res_nbin.summary())

# ## Alternative solvers
#
# The default method for fitting discrete data MLE models is Newton-
# Raphson. You can use other solvers by using the ``method`` argument:
Beispiel #27
0
    print 'cs', numdiff.approx_fprime_cs(test_params, loglike)
    print 'sm', hess(test_params)
    print 'fd', numdiff.approx_fprime1(test_params, score, epsilon)
    print 'cs', numdiff.approx_fprime_cs(test_params, score)

    #print 'fd', numdiff.approx_hess(test_params, loglike, epsilon) #TODO: bug
    '''
    Traceback (most recent call last):
      File "C:\Josef\eclipsegworkspace\statsmodels-josef-experimental-gsoc\scikits\statsmodels\sandbox\regression\test_numdiff.py", line 74, in <module>
        print 'fd', numdiff.approx_hess(test_params, loglike, epsilon)
      File "C:\Josef\eclipsegworkspace\statsmodels-josef-experimental-gsoc\scikits\statsmodels\sandbox\regression\numdiff.py", line 118, in approx_hess
        xh = x + h
    TypeError: can only concatenate list (not "float") to list
    '''
    hesscs = numdiff.approx_hess_cs(test_params, loglike)
    print 'cs', hesscs
    print maxabs(hess(test_params), hesscs)

    data = sm.datasets.anes96.load()
    exog = data.exog
    exog[:, 0] = np.log(exog[:, 0] + .1)
    exog = np.column_stack((exog[:, 0], exog[:, 2], exog[:, 5:8]))
    exog = sm.add_constant(exog)
    res1 = sm.MNLogit(data.endog, exog).fit(method="newton", disp=0)

    datap = sm.datasets.randhie.load()
    nobs = len(datap.endog)
    exogp = sm.add_constant(datap.exog.view(float).reshape(nobs, -1))
    modp = sm.Poisson(datap.endog, exogp)
    resp = modp.fit(method='newton', disp=0)
# In[16]:

# ``data_test`` and ``new_data`` are defined outside this excerpt.
np.shape(data_test)

# In[17]:

# scatter plots for conditional relationships
# one panel per (sex, age) combination, GDP per capita against suicide counts
g = sns.FacetGrid(new_data, row="sex", col="age", margin_titles=True)
g.map(plt.scatter, "gdp_per_capita ($)2", "suicides_no", edgecolor="w")

# ### Simple Poisson Regression

# In[18]:

# Poisson regression of suicide counts on sex, age and GDP per capita, with a
# constant added to the design matrix
model1 = sm.Poisson(endog=new_data['suicides_no'],
                    exog=sm.add_constant(
                        new_data[['sex', 'age', 'gdp_per_capita ($)2']]))
res1 = model1.fit()

# In[19]:

# same response and predictors, fitted through the GLM interface with a
# Poisson family
poisson1 = sm.GLM(new_data['suicides_no'],
                  sm.add_constant(
                      new_data[['sex', 'age', 'gdp_per_capita ($)2']]),
                  family=sm.families.Poisson()).fit()
print(poisson1.summary())

# In[20]:

#compute MSPE
y_pred1 = res1.predict(
import pickle

fname = 'try_shrink%d_ols.pickle' % shrinkit
fh = open(fname, 'w')
pickle.dump(results._results, fh)  #pickling wrapper doesn't work
fh.close()
fh = open(fname, 'r')
results2 = pickle.load(fh)
fh.close()
print results2.predict(xf)
print results2.model.predict(results.params, xf)

y_count = np.random.poisson(np.exp(x.sum(1) - x.mean()))
model = sm.Poisson(
    y_count,
    x)  #, exposure=np.ones(nobs), offset=np.zeros(nobs)) #bug with default
results = model.fit(method='bfgs')

results.summary()

print results.model.predict(results.params, xf, exposure=1, offset=0)

if shrinkit:
    results.remove_data()
else:
    #work around pickling bug
    results.mle_settings['callback'] = None

import pickle
Beispiel #30
0
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Synthetic data: 800 rows of 21 integer predictors drawn from [0, 99) and a
# 0/1 response.
X = np.random.randint(99, size=(800, 21))
Y = np.random.randint(2, size=(800, 1))

# 60/40 train/test split with a fixed random_state
xtrain, xtest, ytrain, ytest = train_test_split(X,
                                                Y,
                                                train_size=0.60,
                                                random_state=42)

print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

print("Model: Poisson")
poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50)
poi_mean_pred = poi_mod.predict(xtest)
# score the 0.95 quantile of the fitted Poisson, not the predicted mean
poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred)
poi_rmse = np.sqrt(mean_squared_error(ytest, poi_ppf_obs))

print("Model: Neg. Binomial")
nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(start_params=None,
                                                 method='newton',
                                                 maxiter=50)
# the negative binomial is scored on its predicted mean
nb_pred = nb_mod.predict(xtest)
nb_rmse = np.sqrt(mean_squared_error(ytest, nb_pred))

print(np.ones(len(xtest)).shape)

print("Model: Zero Inflated Poisson")
zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(method="newton",