import sys
from typing import Any, List

import statsmodels.api as sm

# `PandasDataFrame` and `StatsmodelsMixedLM` are type aliases defined
# elsewhere in the host project.


def train(self, features_indep_df: PandasDataFrame, feature_target: List,
          model_labels: List = [0, 1], **kwargs: Any) -> StatsmodelsMixedLM:
    """Perform the training, using the Mixed Linear Model.

    :param features_indep_df: the independent features, which are inputted into the model.
    :param feature_target: the target feature, which is being estimated.
    :param model_labels: the target labels (default [0, 1]).
    :param kwargs: any other arguments that the selected reader may accept.
    :return: the trained model.
    """
    self._logger.debug("Train " + __name__)
    if 'groups' not in kwargs.keys():
        self._logger.error(__name__ + " - function argument is missing: 'groups'.")
        sys.exit()

    groups = features_indep_df[kwargs['groups']]
    exog = features_indep_df.drop(kwargs['groups'], axis=1)
    exog['Intercept'] = 1
    model_train = sm.MixedLM(endog=feature_target, exog=exog, groups=groups,
                             exog_re=exog['Intercept'])
    model_train = model_train.fit()
    print(model_train.summary())
    return model_train
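# A self-contained sketch of the model that train() sets up, on toy data.
# The column names ('age', 'site') and the simulated data are illustrative
# assumptions, not from the original source.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
toy = pd.DataFrame({'age': rng.normal(50, 10, 200),
                    'site': rng.integers(0, 5, 200)})
target = 0.05 * toy['age'] + rng.normal(0, 1, 200)

# Mirrors the body of train(): one grouping column becomes `groups`,
# the rest become the fixed-effects design with an added intercept,
# and the intercept is also used as the random effect.
groups = toy['site']
exog = toy.drop('site', axis=1)
exog['Intercept'] = 1
fit = sm.MixedLM(endog=target, exog=exog, groups=groups,
                 exog_re=exog['Intercept']).fit()
print(fit.summary())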
def run_mm(trunc_data, out_data_array, exog_vars, groupVar, i):
    print(i)
    try:
        out_data_array = sm.MixedLM(trunc_data, exog_vars, groupVar).fit().resid
    except ValueError:
        print("Error %d" % i)
        out_data_array = np.zeros((len(exog_vars)))
    return out_data_array
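# A self-contained usage sketch for run_mm on simulated data; the shapes,
# names, and the per-column driver loop are assumptions about how the
# function is called, not from the original source.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
n_obs, n_voxels = 120, 3
exog_vars = sm.add_constant(rng.normal(size=(n_obs, 2)))   # design with intercept
groupVar = np.repeat(np.arange(6), n_obs // 6)             # 6 hypothetical subjects
data_matrix = rng.normal(size=(n_obs, n_voxels))           # one column per voxel

residuals = np.zeros_like(data_matrix)
for i in range(n_voxels):
    residuals[:, i] = run_mm(data_matrix[:, i], residuals[:, i],
                             exog_vars, groupVar, i)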
def run_one_lmm(genotypes, phenotypes, groups):
    try:
        intercept = np.ones(genotypes.size)
        genotypes = genotypes.copy()
        x = np.stack([intercept, genotypes], axis=1)
        # Return the p-value for the genotype coefficient
        return sm.MixedLM(phenotypes, x, groups).fit().pvalues[1]
    except np.linalg.LinAlgError:
        # Could not fit the model; return NaN
        return float('nan')
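# A usage sketch scanning several SNPs with run_one_lmm on simulated data.
# The genotype-matrix layout (samples x SNPs) and the family grouping are
# assumptions, not from the original source.
import numpy as np

rng = np.random.default_rng(2)
n_samples, n_snps = 200, 5
G = rng.integers(0, 3, size=(n_samples, n_snps)).astype(float)  # 0/1/2 allele counts
pheno = rng.normal(size=n_samples)
fam = np.repeat(np.arange(20), n_samples // 20)                 # 20 family groups

pvals = np.array([run_one_lmm(G[:, j], pheno, fam) for j in range(n_snps)])
print(pvals)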
def fit(self, y, X):
    """Fit the model.

    Args:
        y (pandas.DataFrame): The vector of endogenous variable.
        X (pandas.DataFrame): The matrix of exogenous variables.

    """
    # Retrieving the data
    y, X, groups = self._prepare_data(y, X)

    # Creating the MixedLM model from StatsModels and fitting it
    model = sm.MixedLM(y, X, groups)
    try:
        fitted = model.fit(reml=self._reml)
    except np.linalg.LinAlgError as e:
        raise StatsError(str(e))

    parameters = fitted.params.index

    # Results about the model fit
    out = {
        "MODEL": {
            "log_likelihood": fitted.llf,
            "nobs": X.shape[0],
            "random_effects": self._format_re(fitted.random_effects),
        },
    }

    # Getting the confidence intervals
    conf_ints = fitted.conf_int()

    for param in parameters:
        # If GWAS, check that inference could be done on the SNP
        if param == "SNPs" and np.isnan(fitted.pvalues[param]):
            raise StatsError("Inference did not converge.")

        out[param] = {
            "coef": fitted.params[param],
            "std_err": fitted.bse[param],
            "lower_ci": conf_ints.loc[param, 0],
            "upper_ci": conf_ints.loc[param, 1],
            "z_value": fitted.tvalues[param],
            "p_value": fitted.pvalues[param],
        }

    return out
def crude_mixedML2(df_merged, x_feature, y_feature, covars):
    # TODO: Replace covars variable with actual selection of individual features
    df_merged = df_merged.replace(-9, np.nan).replace('-9', np.nan) \
                         .replace(999, np.nan).replace(888, np.nan)

    split_covars = covars.split('|')
    print(split_covars)

    data = add_confound(df_merged, x_feature, y_feature, split_covars)
    data['intercept'] = 1
    print(data.columns)

    X = data[[x for x in data.columns if x != y_feature and x != 'CohortType']]
    Y = data[y_feature]

    if X.shape[0] > 2:
        reg = sm.MixedLM(Y, X, groups=data["CohortType"],
                         exog_re=X["intercept"]).fit()
        ret = reg.summary()
    else:
        ret = 'error'

    fit_string = y_feature + '~'
    for x in X.columns:
        fit_string += ' + ' + str(x)
    fit_string = fit_string.replace('~ +', '~') + ' + (1|CohortType)'

    header = '<div> <b> Linear Mixed Model with Random Intercept </b> </div>'
    header += '<div> <b> Number samples: </b> ' + str(X.shape[0]) + '</div>'
    header += '<div> <b> Model: </b>' + fit_string + '</div>'
    header += '<div> <b> Group: </b> CohortType '

    if ret == 'error':
        # too few samples to fit the model; return the header with the error flag
        return header + 'error'

    htmls = header + ret.tables[0].to_html() + ret.tables[1].to_html()
    return htmls
def run_per_voxel(df, from_regress, labels):
    y_predicted_all = np.zeros((df.shape[0], ))
    kf = KFold(n_splits=5, shuffle=True)

    data = pd.concat([df, from_regress], axis=1)
    data = data.dropna()
    indices = list(data.index)

    # reset valid indices
    from_regress = from_regress.loc[indices, ].reset_index(drop=True)
    df = df.loc[indices, ].reset_index(drop=True)

    for train_index, test_index in kf.split(df):
        # prepare data
        training_X = from_regress.loc[train_index, ].reset_index(drop=True)
        training_y = df.loc[train_index, ]['activations'].reset_index(drop=True)
        training_y_groups = df.loc[train_index, ]['subject_number'].reset_index(drop=True)

        testing_X = from_regress.loc[test_index, ].reset_index(drop=True)
        testing_y = df.loc[test_index, ]['activations'].reset_index(drop=True)

        md = sm.MixedLM(endog=training_y, exog=training_X,
                        groups=training_y_groups, exog_re=training_X)
        mdf = md.fit()
        print(mdf.summary())

        y_hat_test = mdf.predict(testing_X)
        print("PREDICTION")
        print(y_hat_test[:10])
        y_predicted_all[test_index] = y_hat_test

    y_true = df['activations']
    print("PREDICTED SHAPE")
    print(y_predicted_all.shape)
    print(y_predicted_all[:10])
    print("TRUE SHAPE")
    print(y_true.shape)
    print(y_true[:10])

    # root-mean-squared error between predictions and observed activations
    rmse = np.sqrt(np.mean((y_predicted_all - y_true) ** 2))
    print("RMSE: " + str(rmse))
    return rmse.astype(np.float32)
def processData(df_original, reg_mlme=True):
    """
    Description: core processing of the data. It is divided into two main steps:
        step 1 applies a VAR to the fixed effects of each actor;
        step 2 applies an LMEM to the whole dataset and learns 5 different
        models, one for each of the labels.
    Input: dataframe transformed with the whole history
    Output: dataframe with the forecast for each participant.
    """
    df_flat = df_original.reset_index()

    # Actors definition
    # count participants and remove user no. 10
    actors = df_flat.actorId.unique().tolist()
    if 10 in actors:
        actors.remove(10)  # remove user no. 10 (insufficient info)

    # Attributes definition
    # categoricals = ['MainActivity', 'lat', 'lng', 'weatherId']
    activities = list(df_original.iloc[:, 19:].columns.values)
    random_effects = activities + ['Steps']
    fixed_effects = ['pressure', 'temp', 'humidity', 'hr_min', 'hr_avc',
                     'hr_mean', 'hr_std', 'hr_max', 'timeframe']
    labels = ['Abilities', 'Challenge', 'Productivity', 'Stress', 'Flow']

    # Dataframes to remember the min, max, mean, and std for each user
    target_min = pd.DataFrame(np.nan, index=actors, columns=labels)
    target_max = pd.DataFrame(np.nan, index=actors, columns=labels)
    target_mean = pd.DataFrame(np.nan, index=actors, columns=labels)
    target_std = pd.DataFrame(np.nan, index=actors, columns=labels)

    # STEP 1) VAR on fixed effects
    # -------------------------------
    window = 5  # window to predict
    df_future = pd.DataFrame()  # prepare the future dataframe
    var_attributes = [item for item in fixed_effects if item not in ['timeframe']]

    for user in actors:
        print("7.1 ----- Forecasting actor ARLearn" + str(user))
        df_user = df_original.xs(user, level='actorId')
        df = df_user[var_attributes]
        VARres = VARprocess(df)
        forecasts = VARforecast(df, VARres, window)
        forecasts['actorId'] = user
        forecasts['timeframe'] = forecasts.index.hour
        df_future = df_future.append(forecasts)

        # also record min, max, mean, and std per target
        for target in labels:
            target_min[target][user] = min(df_user[target])
            target_max[target][user] = max(df_user[target])
            target_mean[target][user] = df_user[target].mean()
            target_std[target][user] = df_user[target].std()

    # add intercept term
    df_future['Intercept'] = 1
    df_future = df_future.reset_index().set_index(['index', 'actorId']).sort_index()
    # ------------------------------- end VAR

    # STEP 2) Linear Mixed Effect Model
    # -------------------------------
    data = df_flat
    data['intercept'] = 1  # set the intercept term
    LMEM_models = []  # create a list of models, for multi output
    exog = data[fixed_effects + ['intercept']]  # the attributes from which to predict
    exog_re = data[random_effects]  # random effects
    groups = data['actorId']  # group definition

    # Training phase: five models, one per label
    for target in labels:
        endog = data[target]  # endogenous, i.e. the values we want to predict
        if (reg_mlme == False) and os.path.exists('model_' + target + '.pickle'):
            LMEM_results = pickle.load(open('model_' + target + '.pickle', 'rb'))
            LMEM_models.append(LMEM_results)
        else:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                LMEM_model = sm.MixedLM(endog, exog, groups=groups, exog_re=exog_re)
                LMEM_results = LMEM_model.fit()
            LMEM_results.save('model_' + target + '.pickle', remove_data=False)
            LMEM_models.append(LMEM_results)

    # Coefficient importance: average the fixed-effect parameters over the models
    coeff = pd.DataFrame(index=range(0, len(exog.T)),
                         data={'coefficients': 0.0}, dtype='float').coefficients
    for i in range(0, len(coeff)):
        for j in range(0, len(LMEM_models)):
            coeff[i] = coeff[i] + LMEM_models[j].fe_params[i]
    coeff = coeff / len(LMEM_models)

    # Test phase for each of the five models
    df = df_future.reset_index()
    exog = df[fixed_effects]
    exog['intercept'] = 1
    for i in range(0, len(labels)):
        t = labels[i]
        df[t] = LMEM_models[i].predict(exog)
        # normalization per user: (x_max - x_min) * (x_i / 100) + x_min
        for u in actors:
            actual = df[df['actorId'] == u][t]
            rindex = df[df['actorId'] == u][t].index
            df.loc[rindex, t] = (target_max[t][u] - target_min[t][u]) * (actual / 100) + target_min[t][u]
        df[t] = df[t].astype('int')

    df = df.rename(columns={'index': 'timestamp'})
    return df
# This is one of the example data sets provided in the LMER R library.
# The outcome variable is the size of the tree, and the covariate used here
# is a time value. The data are grouped by tree.

data = sm.datasets.get_rdataset("Sitka", "MASS").data
endog = data["size"]
data["Intercept"] = 1
exog = data[["Intercept", "Time"]]

# Here is the statsmodels LME fit for a basic model with a random
# intercept. We are passing the endog and exog data directly to the LME
# init function as arrays. Also note that exog_re is specified explicitly
# in argument 4 as a random intercept (although this would also be the
# default if it were not specified).

md = sm.MixedLM(endog, exog, groups=data["tree"], exog_re=exog["Intercept"])
mdf = md.fit()
print(mdf.summary())

# Here is the same model fit in R using LMER:

# ```ipython
# %R
# data(Sitka, package="MASS")
# print(summary(lmer("size ~ Time + (1 | tree)", data=Sitka)))
# ```

# ```
# Linear mixed model fit by REML ['lmerMod']
# Formula: size ~ Time + (1 | tree)
#    Data: Sitka
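# For comparison, the same random-intercept model can be specified through
# statsmodels' formula interface; this is an equivalent alternative to the
# array-based call above, using the documented smf.mixedlm API.
import statsmodels.formula.api as smf

md_f = smf.mixedlm("size ~ Time", data, groups=data["tree"])
print(md_f.fit().summary())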
def causal_simulation(path, start_date, f_start_date, datafile="dataset_full.csv",
                      govpolicyfile="gov_dates_mandates.csv", num_date_omit=0,
                      print_graph=True):
    # NOTE: `horizon` is assumed to be defined at module scope.
    data = pd.read_csv(path + "/" + datafile)
    start_dt = datetime.strptime(start_date, '%m/%d/%y').strftime('%Y-%m-%d')
    print(start_dt)

    dateval = pd.date_range(start_dt, periods=horizon + 180).tolist()
    dates = pd.DataFrame({'dateval': dateval})
    dates['dateval'] = dates['dateval'].apply(
        lambda x: datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d'))
    data['dateval'] = data['date'].apply(
        lambda x: datetime.strptime(str(x), '%Y%m%d').strftime('%Y-%m-%d'))

    if num_date_omit > 0:
        temp_start_date = datetime.strptime(f_start_date, '%m/%d/%y') - timedelta(days=30)
        temp_start_date = int(temp_start_date.strftime('%Y%m%d'))
        print(temp_start_date)
        temp = data.loc[(data['confirmed'].isna()) & (data['date'] > temp_start_date),
                        ['location_name', 'confirmed', 'dateval']] \
                   .sort_values(by='dateval').reset_index(drop=True)
        temp_start_date = temp.loc[0, 'dateval']
        temp_start_training_date = (datetime.strptime(temp_start_date, '%Y-%m-%d')
                                    - timedelta(days=num_date_omit)).strftime('%Y-%m-%d')
        print(temp_start_training_date)
        temp_start_simulation_date = (datetime.strptime(temp_start_date, '%Y-%m-%d')
                                      - timedelta(days=num_date_omit - 1)).strftime('%Y-%m-%d')

    data['state'] = data['province_state']
    data = psql.sqldf("""
        select province_state, country_region as country, date, confirmed,
               recovered, death, population, TAVG/10 as TAVG,
               a1.location_name, a1.dateval,
               country_region ||'-'|| state as state,
               case when TAVG<=0 then 1 else 0 end as is_freezing,
               case when TAVG>0 and TAVG/10<20 then 1 else 0 end as is_cold,
               case when TAVG>=20 and TAVG/10<35 then 1 else 0 end as is_warm,
               case when TAVG>=35 then 1 else 0 end as is_hot,
               case when TAVG>=20 then 1 else 0 end as temp_th,
               case when julianday(a1.dateval)>julianday('2020-03-20') then 1 else 0 end as gov_action
        from data a1
        """).drop_duplicates()
    data['Intercept'] = 1.0
    data = data[(data['dateval'] >= start_dt)]
    data['holdout'] = np.where(
        (data['dateval'] >= datetime.strptime(f_start_date, '%m/%d/%y').strftime('%Y-%m-%d')), 1, 0)
    print(data)
    data_save = data.copy()

    # data smoothing to correct irregular data issues, like dropped cumulative values
    data1 = pd.DataFrame()
    for state in data['state'].drop_duplicates():
        dat = data[(data['state'] == state)].sort_values(by=['dateval'])
        if len(dat['dateval']) > 1:
            dat = dat.fillna(0)
            dat = dat.loc[dat['confirmed'].ne(0.0).idxmax():]
            rho_data = dat[dat['confirmed'] > 0].sort_values(by=['dateval'])
            rho_data = rho_data[0:30]
            # accumulate the positive day-over-day increments of each series
            zz1 = zz2 = zz3 = 0.0
            for s in range(len(rho_data['dateval'])):
                if s > 0:
                    if rho_data['confirmed'].values[s] - rho_data['confirmed'].values[s - 1] > 0:
                        zz1 += rho_data['confirmed'].values[s] - rho_data['confirmed'].values[s - 1]
                    if rho_data['recovered'].values[s] - rho_data['recovered'].values[s - 1] > 0:
                        zz2 += rho_data['recovered'].values[s] - rho_data['recovered'].values[s - 1]
                    if rho_data['death'].values[s] - rho_data['death'].values[s - 1] > 0:
                        zz3 += rho_data['death'].values[s] - rho_data['death'].values[s - 1]
            rho = 0.0
            if (zz2 + zz3) > 0.0:
                rho = (zz1 + zz2 + zz3) / (zz2 + zz3)
            print("R_0 for " + state + " : " + str(rho))

            dat['lag_confirmed'] = 0.0
            dat['lag_recovered'] = 0.0
            dat['lag_death'] = 0.0
            dat['lag_removed'] = 0.0
            dat['d_recovered'] = 0.0
            dat['d_death'] = 0.0
            dat['d_removed'] = 0.0
            N = dat['population'].values[0]
            dat['R_0'] = rho
            dat['removed'] = dat['death'] + dat['recovered']
            for t in range(len(dat)):
                if t > 0:
                    dat['lag_confirmed'].values[t] = dat['confirmed'].values[t - 1]
                    dat['lag_removed'].values[t] = dat['removed'].values[t - 1]
                    dat['lag_recovered'].values[t] = dat['recovered'].values[t - 1]
                    dat['lag_death'].values[t] = dat['death'].values[t - 1]
                    dat['d_recovered'].values[t] = dat['recovered'].values[t] - dat['recovered'].values[t - 1]
                    dat['d_death'].values[t] = dat['death'].values[t] - dat['death'].values[t - 1]
                    dat['d_removed'].values[t] = dat['removed'].values[t] - dat['removed'].values[t - 1]
            data1 = data1.append(dat, ignore_index=True)

    data = data1.copy()
    rb = np.mean(data[data['R_0'] > 0]['R_0'])
    data['R_0'] = np.where(data['R_0'] == 0, rb, data['R_0'])
    data = data.fillna(0)
    data.to_csv("output/simulation_output/input_data.csv")

    train_cols = ['dateval', 'Intercept', 'state', 'TAVG', 'gov_action',
                  'is_freezing', 'is_cold', 'is_warm', 'is_hot',
                  'lag_confirmed', 'lag_death', 'lag_recovered',
                  'd_death', 'd_recovered', 'd_removed', 'removed']
    if num_date_omit > 0:
        data_train = data[(data['removed'] > 0) &
                          ((data['holdout'] == 0) |
                           (data['dateval'] <= temp_start_training_date))][train_cols]
        print(temp_start_training_date)
        print(max(data_train['dateval']))
    else:
        data_train = data[(data['removed'] > 0) & (data['holdout'] == 0)][train_cols]

    endog = data_train['d_removed']
    exog = data_train[['Intercept', 'gov_action', 'TAVG', 'lag_confirmed']]
    model = sm.MixedLM(endog, exog,
                       exog_re=exog[['Intercept', 'lag_confirmed']],
                       groups=data_train["state"])
    po_results = model.fit()
    print(po_results.summary())

    # Get the per-state random-effect coefficients
    k = []
    v1 = []
    v2 = []
    v3 = []
    d = po_results.random_effects
    for i in d:
        my_str = ''.join((ch if ch in '0123456789.-' else ' ') for ch in str(d[i]))
        numbers = [float(tok) for tok in my_str.split()]
        v1 = v1 + [str(d[i]).split(" ")[0]]
        if str(numbers[0]).strip() == '':
            v2 = v2 + [0.00]
            v3 = v3 + [0.0]
        else:
            v2 = v2 + [numbers[0]]
            v3 = v3 + [numbers[1]]
        k = k + [i]

    r_combined = pd.DataFrame({'state': k, 'coef_name': v1,
                               'coef_value': v2, 're_lag_confirmed': v3})
    r_combined['fe_Intercept'] = po_results.fe_params['Intercept']
    r_combined['Intercept'] = r_combined['fe_Intercept'] + r_combined['coef_value']
    r_combined['lag_confirmed'] = po_results.fe_params['lag_confirmed'] + r_combined["re_lag_confirmed"]
    r_combined['gov_action'] = po_results.fe_params['gov_action']
    r_combined['TAVG'] = po_results.fe_params['TAVG']
    r_combined = r_combined.fillna(0.0)
    r_combined.to_csv("output/simulation_output/recover_coefs.csv")
    mean_beta = np.mean(r_combined[r_combined['lag_confirmed'] > 0]['lag_confirmed'])
    r_combined['lag_confirmed'] = np.where(r_combined['lag_confirmed'] < 0,
                                           mean_beta, r_combined['lag_confirmed'])

    # Get prediction and bias
    t_dat = generate_dataset(data, r_combined)
    if num_date_omit > 0:
        t_dat = t_dat[(t_dat['removed'] > 0) &
                      ((t_dat['holdout'] == 0) |
                       (t_dat['dateval'] <= temp_start_training_date))]
    else:
        t_dat = t_dat[(t_dat['removed'] > 0) & (t_dat['holdout'] == 0)]
    pred_on_train = runSimulator(data1=t_dat, coefsdfR=r_combined,
                                 sir_names=['susceptible', 'confirmed', 'death', 'removed'],
                                 xnamesr=['Intercept', 'gov_action', 'TAVG', 'lag_confirmed'],
                                 horizon1=60, date_gov_adjust=0,
                                 print_graph=print_graph)

    # == Adjust prediction of removed with bias
    pred_on_train['bias_removed'] = pred_on_train['pred_removed'] - pred_on_train['removed']
    mean_bias = pred_on_train.groupby('location_name')['bias_removed'].mean().reset_index()
    mean_bias.to_csv('output/simulation_output/bias.csv')
    loc_list = set(pred_on_train['location_name'])
    for loc in loc_list:
        bias = mean_bias.loc[mean_bias['location_name'] == loc, 'bias_removed'].iloc[0]
        if bias > 0:
            pred_on_train.loc[pred_on_train['location_name'] == loc, 'pred_removed'] -= bias
        elif bias < 0:
            pred_on_train.loc[pred_on_train['location_name'] == loc, 'pred_removed'] += bias
        loc_name = "".join(c for c in loc if c.isalnum())
        temp_for_plot = pred_on_train.loc[pred_on_train['location_name'] == loc,
                                          ['dateval', 'removed', 'pred_removed']]
        plt.figure(figsize=(12, 12))
        fig = temp_for_plot.plot(x='dateval', y=['removed', 'pred_removed'],
                                 rot=45, ax=plt.gca()).get_figure()
        fig.savefig(os.path.join('output/covid_plot/actual_pred/covid_plot_compare_' + loc_name + '.png'))
        fig.clf()

    pred_on_train['pred_removed'] = np.where(pred_on_train['pred_removed'] < 0, 0,
                                             pred_on_train['pred_removed'])
    pred_on_train.to_csv('output/simulation_output/adjusted_pred_on_train.csv')
    print(pred_on_train)

    # Adjust r_combined with bias
    mean_bias2 = mean_bias.copy()
    r_combined2 = r_combined.copy()
    mean_bias2['location_name'] = mean_bias2['location_name'].str.replace('[^a-zA-Z]', '')
    r_combined2['state'] = r_combined2['state'].str.replace('[^a-zA-Z]', '')
    loc_list = set(mean_bias2['location_name'])
    for loc in loc_list:
        bias = mean_bias2.loc[mean_bias2['location_name'] == loc, 'bias_removed'].iloc[0]
        if bias > 0:
            r_combined2.loc[r_combined2['state'] == loc, 'Intercept'] -= bias
        elif bias < 0:
            r_combined2.loc[r_combined2['state'] == loc, 'Intercept'] += bias
    r_combined = r_combined2

    # Build the per-state simulation inputs
    tti = 0
    states = data['state'].drop_duplicates().str.replace('[^a-zA-Z]', '')
    data2 = data.copy()
    data2['state'] = data2['state'].str.replace('[^a-zA-Z]', '')
    data3 = pd.DataFrame()
    for s in states:
        rc = r_combined[r_combined['state'] == s]
        dat = data2[data2['state'] == s]
        if len(dat) > 0 and len(rc) > 0:
            print(s)
            dat = dat.sort_values(by=['dateval'])
            beta = rc['lag_confirmed'].values[0]
            N = dat['population'].values[0]
            dat['susceptible'] = np.where(dat['holdout'] == 0, N + 0.0, 0.0)
            alpha = dat['R_0'].values[0] * beta
            dat['alpha'] = alpha
            if tti == 0:
                data3 = dat
            else:
                data3 = data3.append(dat, ignore_index=True)
            tti = tti + 1
    print(data3)
    print(data3['state'].drop_duplicates())

    # Run the simulator before and after the government-intervention adjustment
    date_start_sim = 20200510
    if num_date_omit > 0:
        sim_data = data3[(data3['holdout'] == 1) &
                         (data3['dateval'] >= temp_start_simulation_date)]
        print(temp_start_simulation_date)
        print(min(sim_data['dateval']))
    else:
        sim_data = data3[(data3['holdout'] == 1)]

    sim_data_output_after = runSimulator(
        data1=sim_data, coefsdfR=r_combined,
        sir_names=['susceptible', 'confirmed', 'death', 'removed'],
        xnamesr=['Intercept', 'gov_action', 'TAVG', 'lag_confirmed'],
        horizon1=60, date_gov_adjust=date_start_sim, print_graph=print_graph)
    sim_data_output_after.to_csv("output/simulation_output/simulations_after_adjust_at_"
                                 + str(date_start_sim) + "_omitlastD_" + str(num_date_omit) + ".csv")

    sim_data_output_before = runSimulator(
        data1=sim_data, coefsdfR=r_combined,
        sir_names=['susceptible', 'confirmed', 'death', 'removed'],
        xnamesr=['Intercept', 'gov_action', 'TAVG', 'lag_confirmed'],
        horizon1=60, date_gov_adjust=0, print_graph=print_graph)
    sim_data_output_before.to_csv("output/simulation_output/simulations_before_adjust_at_"
                                  + str(date_start_sim) + "_omitlastD_" + str(num_date_omit) + ".csv")

    sim_data_compare = sim_data_output_after.merge(
        sim_data_output_before,
        on=['index', 'province_state', 'country', 'date', 'dateval', 'location_name'],
        suffixes=('_after', '_before'))
    sim_data_compare['diff_susceptible'] = sim_data_compare['pred_susceptible_after'] - sim_data_compare['pred_susceptible_before']
    sim_data_compare['diff_confirmed'] = sim_data_compare['pred_confirmed_after'] - sim_data_compare['pred_confirmed_before']
    sim_data_compare['diff_removed'] = sim_data_compare['pred_removed_after'] - sim_data_compare['pred_removed_before']
    sim_data_compare = sim_data_compare.loc[:, ['province_state', 'country', 'date', 'dateval',
                                                'location_name', 'pred_susceptible_after',
                                                'pred_confirmed_after', 'pred_removed_after',
                                                'pred_susceptible_before', 'pred_confirmed_before',
                                                'pred_removed_before', 'diff_susceptible',
                                                'diff_confirmed', 'diff_removed']]
    sim_data_compare.to_csv("output/simulation_output/simulations_compare" + str(date_start_sim)
                            + "_omitlastD_" + str(num_date_omit) + ".csv")

    if print_graph == True:
        for location in sim_data_compare['location_name'].drop_duplicates():
            dat = sim_data_compare[(sim_data_compare['location_name'] == location)].sort_values(by=['date'])
            # plot results
            plt.figure(figsize=(15, 10))
            xtick_locator = AutoDateLocator()
            xtick_formatter = AutoDateFormatter(xtick_locator)
            date_list = pd.to_datetime(dat['dateval'])
            ax = plt.axes()
            ax.xaxis.set_major_locator(xtick_locator)
            ax.xaxis.set_major_formatter(xtick_formatter)
            plt.plot(date_list, dat['diff_susceptible'], 'b-')
            plt.plot(date_list, dat['diff_confirmed'], 'r--')
            plt.plot(date_list, dat['diff_removed'], 'g--')
            plt.xlabel('Time')
            plt.ylabel('Populations')
            plt.title('Compare Before/After Gov. Intervention Adjust at '
                      + ''.join(e for e in location if e.isalnum())
                      + ' : ' + str(date_start_sim))
            plt.legend(['Diff Susceptible', 'Diff Confirmed', 'Diff Removed'])
            plt.savefig(os.path.join('output/covid_plot/covid_plot_compare_' + "omitlastD_"
                                     + str(num_date_omit)
                                     + ''.join(e for e in location if e.isalnum())
                                     + '_' + str(date_start_sim) + '.png'))
            plt.clf()
            plt.close()
def run(opts):
    indexCol = opts.indexcolumns[0]

    # read csv(s)
    num_csv = len(opts.inputcsv)
    pdCSV = pd.read_csv(opts.inputcsv[0], delimiter=',', index_col=[indexCol])
    if num_csv > 1:
        for i in range(int(num_csv - 1)):
            tempCSV = pd.read_csv(opts.inputcsv[int(i + 1)], delimiter=',',
                                  index_col=[indexCol])
            pdCSV = pd.concat([pdCSV, tempCSV], axis=1, join_axes=[pdCSV.index])

    # interaction variables
    if opts.interactionvars:
        # standardize each variable appearing in an interaction term
        for int_terms in opts.interactionvars:
            interaction_vars = int_terms.split("*")
            for scale_var in interaction_vars:
                var_temp = scalearr(pdCSV[scale_var])
                var_tempname = '%s_std' % scale_var
                if var_tempname not in opts.exogenousvariables:
                    pdCSV[var_tempname] = var_temp
                    opts.exogenousvariables.append(var_tempname)
        # build the product terms from the standardized variables
        for int_terms in opts.interactionvars:
            interaction_vars = int_terms.split("*")
            for i, scale_var in enumerate(interaction_vars):
                if i == 0:
                    int_temp = pdCSV['%s_std' % scale_var]
                    int_tempname = '%s' % scale_var
                else:
                    int_temp = int_temp * pdCSV['%s_std' % scale_var]
                    int_tempname = int_tempname + '.X.' + scale_var
            if int_tempname not in opts.exogenousvariables:
                pdCSV[int_tempname] = int_temp
                opts.exogenousvariables.append(int_tempname)
            int_temp = None
        print(opts.exogenousvariables)

    # output column/variable names
    if opts.outputcolumnnames:
        for counter, roi in enumerate(pdCSV.columns):
            print("[%d] : %s" % (counter, roi))
        quit()

    # set grouping variables
    if opts.groupingvariable:
        if len(opts.groupingvariable) > 1:
            pdCSV = russiandolls(opts.groupingvariable, pdCSV)
            groupVar = 'group_list'
        else:
            groupVar = opts.groupingvariable[0]

    # stats functions
    if opts.outstats:
        if not opts.statsmodel:
            print("A statistical model must be specified. -m {model}")
            quit()
        if not opts.range:
            print("Range must be specified. -r {start} {stop}")
            quit()
        elif len(opts.range) != 2:
            print("Range must have start and stop. -r {start} {stop}")
            quit()
        else:
            roi_names = []
            t_values = []
            p_values = []
            icc_values = []
        if not opts.exogenousvariables:
            print("The exogenous (independent) variables must be specified. "
                  "e.g., -exog pred1 pred2 age")
            quit()

        if opts.mediation:
            medvars = ['%s' % opts.mediation[1], '%s' % opts.mediation[2]]
            exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV)
            # build null array
            pdCSV = omitmissing(pdDF=pdCSV,
                                endog_range=opts.range,
                                exogenous=strip_ones(exog_vars),
                                groups=medvars)
            if opts.statsmodel == 'mixedmodel' or opts.statsmodel == 'mm':
                pdCSV = omitmissing(pdDF=pdCSV,
                                    endog_range=opts.range,
                                    groups=opts.groupingvariable)
            # rebuild exog_vars with correct length
            exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                        opts.scaleexog == True)
            leftvar = pdCSV[opts.mediation[1]]
            rightvar = pdCSV[opts.mediation[2]]
            y = pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1]

            if opts.statsmodel == 'mixedmodel' or opts.statsmodel == 'mm':
                # ############### MM mediation ################
                t_valuesA = []
                t_valuesB = []
                if opts.mediation[0] == 'I':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((leftvar, rightvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    # path A
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], EXOG_A,
                                             pdCSV[groupVar]).fit()
                        roi_names.append(pdCSV.columns[i])
                        t_valuesA.append(mdl_fit.tvalues[1])
                    # path B
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], EXOG_B,
                                             pdCSV[groupVar]).fit()
                        t_valuesB.append(mdl_fit.tvalues[1])
                elif opts.mediation[0] == 'M':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((rightvar, leftvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    # path A
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], EXOG_A,
                                             pdCSV[groupVar]).fit()
                        roi_names.append(pdCSV.columns[i])
                        t_valuesA.append(mdl_fit.tvalues[1])
                    # path B
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], EXOG_B,
                                             pdCSV[groupVar]).fit()
                        t_valuesB.append(mdl_fit.tvalues[1])
                else:
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((rightvar, leftvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    # path A
                    mdl_fit = sm.MixedLM(rightvar, EXOG_A, pdCSV[groupVar]).fit()
                    t_valuesA = mdl_fit.tvalues[1]
                    # path B
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], exog_vars,
                                             pdCSV[groupVar]).fit()
                        roi_names.append(pdCSV.columns[i])
                        t_valuesB.append(mdl_fit.tvalues[1])
                z_values = special_calc_sobelz(np.array(t_valuesA),
                                               np.array(t_valuesB), alg="aroian")
                p_values = norm.sf(abs(z_values))
                p_FDR = multipletests(p_values, method='fdr_bh')[1]
            else:
                # ############### LM mediation ################
                if opts.mediation[0] == 'I':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((leftvar, rightvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    y = pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1]
                    # path A
                    t_valuesA = full_glm_results(y, EXOG_A, only_tvals=True)[1, :]
                    # path B
                    t_valuesB = full_glm_results(y, EXOG_B, only_tvals=True)[1, :]
                elif opts.mediation[0] == 'M':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((rightvar, leftvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    y = pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1]
                    # path A
                    t_valuesA = full_glm_results(y, EXOG_A, only_tvals=True)[1, :]
                    # path B
                    t_valuesB = full_glm_results(y, EXOG_B, only_tvals=True)[1, :]
                elif opts.mediation[0] == 'Y':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((rightvar, leftvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    y = pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1]
                    # path A
                    t_valuesA = sm.OLS(rightvar, EXOG_A).fit().tvalues[1]
                    # path B
                    t_valuesB = full_glm_results(y, EXOG_B, only_tvals=True)[1, :]
                else:
                    print("Error: Invalid mediation type.")
                    quit()
                z_values = special_calc_sobelz(np.array(t_valuesA),
                                               np.array(t_valuesB), alg="aroian")
                p_values = norm.sf(abs(z_values))
                p_FDR = multipletests(p_values, method='fdr_bh')[1]

            if opts.permutation:
                if opts.groupingvariable:
                    p_FWER = run_permutations_med(
                        endog_arr=y, exog_vars=exog_vars,
                        medtype=opts.mediation[0], leftvar=leftvar,
                        rightvar=rightvar, num_perm=int(opts.permutation[0]),
                        stat_arr=z_values, uniq_groups=pdCSV[groupVar],
                        return_permutations=True)
                else:
                    p_FWER = run_permutations_med(
                        endog_arr=y, exog_vars=exog_vars,
                        medtype=opts.mediation[0], leftvar=leftvar,
                        rightvar=rightvar, num_perm=int(opts.permutation[0]),
                        stat_arr=z_values, uniq_groups=None,
                        return_permutations=True)

            roi_names = []
            for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                roi_names.append(pdCSV.columns[i])

            columnnames = ['Zval', 'pval', 'pFDR']
            columndata = np.column_stack((z_values, p_values))
            columndata = np.column_stack((columndata, p_FDR))
            if opts.permutation:
                columnnames.append('pFWER')
                columndata = np.column_stack((columndata, p_FWER))
            pd_DF = pd.DataFrame(data=columndata, index=roi_names,
                                 columns=columnnames)
            pd_DF.to_csv(opts.outstats[0], index_label='ROI')
        else:
            if opts.statsmodel == 'mixedmodel' or opts.statsmodel == 'mm':
                # ############### MIXED MODEL ################
                exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV)
                # build null array
                pdCSV = omitmissing(pdDF=pdCSV,
                                    endog_range=opts.range,
                                    exogenous=strip_ones(exog_vars),
                                    groups=opts.groupingvariable)
                # rebuild exog_vars with correct length
                if opts.scaleexogwithingroup:
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                                opts.scaleexog == True,
                                                scale_groups=pdCSV[groupVar])
                else:
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                                opts.scaleexog == True)
                exog_re = None
                if opts.exogintercept:
                    exog_re = dmatrix("1+%s" % opts.exogintercept[0], pdCSV)
                for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                    mdl_fit = sm.MixedLM(endog=pdCSV[pdCSV.columns[i]],
                                         exog=exog_vars,
                                         groups=pdCSV[groupVar],
                                         exog_re=exog_re).fit()
                    roi_names.append(pdCSV.columns[i])
                    t_values.append(mdl_fit.tvalues[1:])
                    p_values.append(mdl_fit.pvalues[1:])
                    icc_values.append(
                        np.array(mdl_fit.cov_re / (mdl_fit.cov_re + mdl_fit.scale)))
                    if opts.plotresids:
                        os.system('mkdir -p resid_plots')
                        plot_residuals(residual=mdl_fit.resid,
                                       fitted=mdl_fit.fittedvalues,
                                       basename=('%s_mm_%s' % (str(i).zfill(4),
                                                               pdCSV.columns[i])),
                                       outdir='resid_plots/')
                p_values = np.array(p_values)
                t_values = np.array(t_values)
                p_FDR = np.zeros_like(p_values)
                p_values[np.isnan(p_values)] = 1
                for col in range(p_FDR.shape[1]):
                    p_FDR[:, col] = multipletests(p_values[:, col],
                                                  method='fdr_bh')[1]
                columnnames = []
                for colname in opts.exogenousvariables:
                    columnnames.append('tval_%s' % colname)
                if opts.exogintercept:
                    columnnames.append('tval_re1')
                    columnnames.append('tval_re1Xre2')
                    columnnames.append('tval_re2')
                else:
                    columnnames.append('tval_groupRE')
                for colname in opts.exogenousvariables:
                    columnnames.append('pval_%s' % colname)
                if opts.exogintercept:
                    columnnames.append('pval_re1')
                    columnnames.append('pval_re1Xre2')
                    columnnames.append('pval_re2')
                else:
                    columnnames.append('pval_groupRE')
                for colname in opts.exogenousvariables:
                    columnnames.append('pFDR_%s' % colname)
                if opts.exogintercept:
                    columnnames.append('pFDR_re1')
                    columnnames.append('pFDR_re1Xre2')
                    columnnames.append('pFDR_re2')
                else:
                    columnnames.append('pFDR_groupRE')
                if not opts.exogintercept:
                    columnnames.append('ICC_groupRE')
                columndata = np.column_stack((t_values, p_values))
                columndata = np.column_stack((columndata, p_FDR))
                if not opts.exogintercept:
                    columndata = np.column_stack(
                        (columndata, np.array(icc_values).flatten()))
                pd_DF = pd.DataFrame(data=columndata, index=roi_names,
                                     columns=columnnames)
                pd_DF.to_csv(opts.outstats[0], index_label='ROI')
            else:
                # ############### LINEAR MODEL ################
                exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV)
                # build null array
                pdCSV = omitmissing(pdDF=pdCSV,
                                    endog_range=opts.range,
                                    exogenous=strip_ones(exog_vars))
                # rebuild exog_vars with correct length
                if opts.scaleexogwithingroup:
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                                opts.scaleexog == True,
                                                scale_groups=pdCSV[groupVar])
                else:
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                                opts.scaleexog == True)
                y = np.array(pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1])
                if opts.plotresids:
                    f_values, t_values, p_values, R2, R2_adj, resids, fitted = \
                        full_glm_results(y, exog_vars, return_resids=True)
                else:
                    np.savetxt('temp_int.csv',
                               orthog_columns(strip_ones(exog_vars)),
                               delimiter=',')
                    f_values, t_values, p_values, R2, R2_adj = \
                        full_glm_results(y, exog_vars)
                if opts.permutation:
                    if opts.groupingvariable:
                        p_FWER = run_permutations(
                            endog_arr=y, exog_vars=exog_vars,
                            num_perm=int(opts.permutation[0]),
                            stat_arr=t_values, uniq_groups=pdCSV[groupVar],
                            return_permutations=True)
                    else:
                        p_FWER = run_permutations(
                            endog_arr=y, exog_vars=exog_vars,
                            num_perm=int(opts.permutation[0]),
                            stat_arr=t_values, uniq_groups=None,
                            return_permutations=True)
                    p_FWER = p_FWER[1:, :].T
                t_values = t_values[1:, :].T  # ignore intercept
                p_values = p_values[1:, :].T  # ignore intercept
                roi_names = []
                for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                    roi_names.append(pdCSV.columns[i])
                p_FDR = np.zeros_like(p_values)
                p_values[np.isnan(p_values)] = 1
                for col in range(p_FDR.shape[1]):
                    p_FDR[:, col] = multipletests(p_values[:, col],
                                                  method='fdr_bh')[1]
                columnnames = ['Fvalue', 'R2', 'R2adj']
                for colname in opts.exogenousvariables:
                    columnnames.append('tval_%s' % colname)
                for colname in opts.exogenousvariables:
                    columnnames.append('pval_%s' % colname)
                for colname in opts.exogenousvariables:
                    columnnames.append('pFDR_%s' % colname)
                columndata = np.column_stack((f_values[:, np.newaxis], R2))
                columndata = np.column_stack((columndata, R2_adj))
                columndata = np.column_stack((columndata, t_values))
                columndata = np.column_stack((columndata, p_values))
                columndata = np.column_stack((columndata, p_FDR))
                if opts.permutation:
                    for colname in opts.exogenousvariables:
                        columnnames.append('pFWER_%s' % colname)
                    columndata = np.column_stack((columndata, p_FWER))
                pd_DF = pd.DataFrame(data=columndata, index=roi_names,
                                     columns=columnnames)
                pd_DF.to_csv(opts.outstats[0], index_label='ROI')
                if opts.plotresids:
                    os.system('mkdir -p resid_plots')
                    for i, roi in enumerate(np.array(roi_names)):
                        plot_residuals(
                            residual=resids[:, i], fitted=fitted[:, i],
                            basename=('%s_lm_%s' % (str(i + int(opts.range[0])).zfill(4), roi)),
                            outdir='resid_plots/')

    if opts.savecsv:
        pdCSV.to_csv(opts.savecsv[0])
vc = {'x': '0 + x'}
md = smf.mixedlm("y ~ 1 + x", test_df, groups=test_df["unit"],
                 vc_formula=vc, re_formula="~ 1")  # random intercept
mdf = md.fit()
mdf.summary()
mdf.scale

reffs = mdf.random_effects
smf_b_i = [reffs[i][1] for i in range(1, 31)]
smf_beta_i = mdf.params['x'] + smf_b_i

md2 = sm.MixedLM(test_df["y"], test_df[["Intercept", "x"]],
                 groups=test_df["unit"],
                 exog_re=test_df[["Intercept", "x"]])
mdf2 = md2.fit()
mdf2.summary()

# Stage 2: trying to get it to work with Keras
enc = OneHotEncoder()
enc.fit(test_df['unit'].values.reshape(-1, 1))
unit_onehot = enc.transform(test_df['unit'].values.reshape(-1, 1))
unit_onehot = unit_onehot.toarray()               # random intercepts design
unit_x = np.dot(np.diag(test_df['x']), unit_onehot)  # random coefs design
n_units = unit_onehot.shape[1]
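# A plausible continuation for the Keras stage (an assumption, not from the
# original source): concatenate the one-hot random-intercept columns and the
# per-unit slope columns into a single random-effects design matrix that a
# downstream network could consume alongside the fixed-effects design.
Z = np.hstack([unit_onehot, unit_x])            # (n_obs, 2 * n_units) random-effects design
X_fixed = test_df[["Intercept", "x"]].values    # fixed-effects design
print(Z.shape, X_fixed.shape)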
    return mat, colnames

# Then we set up the variance components using the VCSpec class.

vcm = df.groupby("group1").apply(f).to_list()
mats = [x[0] for x in vcm]
colnames = [x[1] for x in vcm]
names = ["group2"]
vcs = VCSpec(names, [colnames], [mats])

# Finally we fit the model. It can be seen that the results of the
# two fits are identical.

oo = np.ones(df.shape[0])
model2 = sm.MixedLM(df.y, oo, exog_re=oo, groups=df.group1, exog_vc=vcs)
result2 = model2.fit()
print(result2.summary())

# ## Crossed analysis
#
# In a crossed analysis, the levels of one group can occur in any
# combination with the levels of the other group. The groups in
# Statsmodels MixedLM are always nested, but it is possible to fit a
# crossed model by having only one group, and specifying all random
# effects as variance components. Many, but not all, crossed models
# can be fit in this way. The function below generates a crossed data
# set with two levels of random structure.

def generate_crossed(n_group1=100,
csvfile = '..\\TreeNobXdate.csv'
col = 0  # use the first column from the file
SurveyYear = 2005
MaxYear = 1980
MinYear = 1875

# Generate an age-frequency, oldest ages first
data = GetAgeFreq(csvfile, col=col, SurveyYear=SurveyYear,
                  MaxYear=MaxYear, MinYear=MinYear)
nyear = len(data)

# Add a column of ones to represent the constant
yb = np.array([np.array([int(1), int(t)]) for t in range(nyear)])

glm = sm.GLM(data, yb, family=sm.families.Poisson(link=sm.families.links.Log()))
print(glm.fit().summary())
print('#############')

groups = [t for t in range(nyear)]
# Note: sm.MixedLM fits a linear (Gaussian) mixed model; it does not accept
# a GLM family or link, so none is passed here.
MLM = sm.MixedLM(data, yb, groups=groups)
print(MLM.fit().summary())
print('#############')
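# If a Poisson mixed model was actually intended above, statsmodels provides
# an approximate Bayesian version. A minimal sketch follows; the choice of a
# random intercept per decade is an assumption made purely for illustration.
from statsmodels.genmod.bayes_mixed_glm import PoissonBayesMixedGLM

decade = np.arange(nyear) // 10
n_dec = decade.max() + 1
exog_vc = (decade[:, None] == np.arange(n_dec)[None, :]).astype(float)  # indicator per decade
ident = np.zeros(n_dec, dtype=int)  # all columns share one variance parameter

pois_mm = PoissonBayesMixedGLM(data, yb, exog_vc, ident)
print(pois_mm.fit_vb().summary())  # variational Bayes fit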
def mixed_linear_modeling(df, x='bin', bic_diff=10, df_sims=None, colors=None):
    fig = plt.figure(figsize=(1.1 * len(df['variable'].unique()), 1.5))
    plt_nr = 1
    for param in df['variable'].unique():
        data = df.loc[df['variable'] == param, :]
        ax = fig.add_subplot(1, len(df['variable'].unique()), plt_nr)
        kwargs = {'linewidths': 0, 'markeredgewidth': 0.5,
                  'markeredgecolor': 'black', 'ecolor': 'black'}
        if ('level' in data.columns) & ~(x == 'level'):
            sns.pointplot(x=x, y='value', hue='level', units='subj_idx',
                          join=False, ci=66, scale=0.50, errwidth=1,
                          palette='Greys', data=data, zorder=1, **kwargs)
        else:
            sns.pointplot(x=x, y='value', units='subj_idx', join=False,
                          ci=66, scale=0.66, errwidth=1, color='grey',
                          data=data, zorder=1, **kwargs)
        if param == 'rt':
            plt.ylim(data['value'].mean() - 0.1, data['value'].mean() + 0.1)

        if len(data[x].unique()) > 2:
            # variables:
            data['intercept'] = 1
            data.loc[:, '{}_^2'.format(x)] = np.array(data.loc[:, x] ** 2)
            endog = data.loc[:, 'value'].astype(float)
            if ('level' in data.columns) & ~(x == 'level'):
                exog1 = data.loc[:, ['intercept', 'level', x]].astype(float)
                exog2 = data.loc[:, ['intercept', 'level', x,
                                     '{}_^2'.format(x)]].astype(float)
            else:
                exog1 = data.loc[:, ['intercept', x]].astype(float)
                exog2 = data.loc[:, ['intercept', x,
                                     '{}_^2'.format(x)]].astype(float)

            # model comparison: linear vs. quadratic effect of x, with random
            # slopes if both random-effects fits converge, otherwise with
            # random intercepts only
            try:
                md1 = sm.MixedLM(endog, exog1, data.loc[:, 'subj_idx'], exog_re=exog1)
                mdf1 = md1.fit(reml=False)
                md2 = sm.MixedLM(endog, exog2, data.loc[:, 'subj_idx'], exog_re=exog2)
                mdf2 = md2.fit(reml=False)
                if mdf1.converged & mdf2.converged:
                    random = True
                else:
                    md1 = sm.MixedLM(endog, exog1, data.loc[:, 'subj_idx'])
                    mdf1 = md1.fit(reml=False)
                    md2 = sm.MixedLM(endog, exog2, data.loc[:, 'subj_idx'])
                    mdf2 = md2.fit(reml=False)
                    random = False
                if (mdf1.bic - mdf2.bic) > bic_diff:
                    exog = exog2.copy()
                else:
                    exog = exog1.copy()
                # refit with REML:
                if random:
                    mdf = sm.MixedLM(endog, exog, groups=data.loc[:, 'subj_idx'],
                                     exog_re=exog).fit()
                else:
                    mdf = sm.MixedLM(endog, exog, groups=data.loc[:, 'subj_idx']).fit()
                print(mdf.summary())

                xx = np.sort(np.array([p.get_data()[0][0] for p in ax.lines]))
                if ('level' in data.columns) & ~(x == 'level'):
                    if (mdf1.bic - mdf2.bic) > bic_diff:
                        yy = np.concatenate([
                            mdf.params['intercept']
                            + (np.array(exog.groupby('level').mean().index) * mdf.params['level'])
                            + (b * mdf.params[x])
                            + ((b ** 2) * mdf.params['{}_^2'.format(x)])
                            for b in np.array(exog.groupby(x).mean().index)])
                        plt.title('p = {}\np1 = {}\np2 = {}'.format(
                            round(mdf.pvalues['level'], 3),
                            round(mdf.pvalues[x], 3),
                            round(mdf.pvalues['{}_^2'.format(x)], 3)), size=6)
                    else:
                        yy = np.concatenate([
                            mdf.params['intercept']
                            + (np.array(exog.groupby('level').mean().index) * mdf.params['level'])
                            + (b * mdf.params[x])
                            for b in np.array(exog.groupby(x).mean().index)])
                        plt.title('p = {}\np = {}'.format(
                            round(mdf.pvalues['level'], 3),
                            round(mdf.pvalues[x], 3)), size=6)
                    step = len(exog.groupby('level').mean().index)
                    for v in exog.groupby('level').mean().index:
                        plt.plot(xx[int(v)::step], yy[int(v)::step],
                                 lw=1, color='black')
                else:
                    if (mdf1.bic - mdf2.bic) > bic_diff:
                        yy = (mdf.params['intercept']
                              + (np.array(exog.groupby(x).mean().index) * mdf.params[x])
                              + ((np.array(exog.groupby(x).mean().index) ** 2)
                                 * mdf.params['{}_^2'.format(x)]))
                        plt.title('p1 = {}\np2 = {}'.format(
                            round(mdf.pvalues[x], 3),
                            round(mdf.pvalues['{}_^2'.format(x)], 3)), size=6)
                    else:
                        yy = (mdf.params['intercept']
                              + (np.array(exog.groupby(x).mean().index) * mdf.params[x]))
                        plt.title('p = {}'.format(round(mdf.pvalues[x], 3)), size=6)
                    plt.plot(xx, yy, lw=1, color='black')
            except Exception:
                pass
        else:
            t, p = sp.stats.ttest_rel(data.loc[data[x] == 0, 'value'],
                                      data.loc[data[x] == 1, 'value'])
            plt.title('p = {}'.format(round(p, 3)), size=6)

        if df_sims is not None:
            if ('level' in data.columns) & ~(x == 'level'):
                for df_sim, color in zip(df_sims, colors):
                    sns.pointplot(x=x, y='value', hue='level',
                                  palette=['blue' for _ in range(len(data['level'].unique()))],
                                  join=False, ci=None, markers='x', scale=0.66,
                                  data=df_sim.loc[df['variable'] == param, :],
                                  zorder=100)
            else:
                for df_sim, color in zip(df_sims, colors):
                    sns.pointplot(x=x, y='value', color='blue', join=False,
                                  ci=None, markers='x', scale=0.66,
                                  data=df_sim.loc[df['variable'] == param, :],
                                  zorder=100)
        try:
            plt.gca().get_legend().remove()
        except Exception:
            pass
        plt.xticks(ax.get_xticks(), list(np.array(ax.get_xticks(), dtype=int)))
        plt.ylabel(param)
        plt_nr += 1
    sns.despine(offset=2, trim=True)
    plt.tight_layout()
    return fig
# Fit the mixed regression model.
# cf. http://statsmodels.sourceforge.net/devel/mixed_linear.html
# An alternative specification with a random time slope was also considered:
# md = sm.MixedLM(y_train, X_train, exog_re=X_train_fake_time,
#                 groups=X_train_fake_song_ids, use_sqrt=True)
md = sm.MixedLM(y_train, X_train, groups=X_train_fake_song_ids, use_sqrt=True)
mdf = md.fit()
print(mdf.summary())

# predict each song separately and append the predictions
pred = list()
for ind_song in range(nb_test_song):
    deb = ind_song * NUM_FRAMES
    fin = deb + NUM_FRAMES
    pred_song = mdf.predict(X_test[deb:fin, :])
    pred.append(pred_song)
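# A hypothetical evaluation step (an assumption, not from the original
# source): concatenate the per-song predictions and score them against the
# held-out targets, assuming y_test aligns frame-wise with the predictions.
import numpy as np

pred_all = np.concatenate(pred)
rmse = np.sqrt(np.mean((pred_all - y_test) ** 2))
print("test RMSE:", rmse)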