Code example #1
File: _MixedLinearModel.py  Project: mesgarpour/ERMER
    def train(self,
              features_indep_df: PandasDataFrame,
              feature_target: List,
              model_labels: List = [0, 1],
              **kwargs: Any) -> StatsmodelsMixedLM:
        """Perform the training, using the Mixed Linear Model.
        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param model_labels: the target labels (default [0, 1]).
        :param kwargs: any other arguments that the selected reader may accept.
        :return: the trained model.
        """
        self._logger.debug("Train " + __name__)
        if 'groups' not in kwargs.keys():
            self._logger.error(__name__ + " - function argument is missing: 'groups'.")
            sys.exit()

        groups = features_indep_df[kwargs['groups']]
        exog = features_indep_df.drop(kwargs['groups'], axis=1)
        exog['Intercept'] = 1

        model_train = sm.MixedLM(endog=feature_target,
                                 exog=exog,
                                 groups=groups,
                                 exog_re=exog['Intercept'])
        model_train = model_train.fit()
        print(model_train.summary())
        return model_train
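For context, a minimal self-contained sketch of the same random-intercept pattern (the toy data and column names here are illustrative assumptions, not part of the project above):

import numpy as np
import pandas as pd
import statsmodels.api as sm

# Toy data: 20 observations in 4 groups (hypothetical).
rng = np.random.default_rng(0)
toy = pd.DataFrame({
    "x": rng.normal(size=20),
    "group": np.repeat(["a", "b", "c", "d"], 5),
})
toy["y"] = 1.0 + 2.0 * toy["x"] + rng.normal(size=20)

exog = toy[["x"]].copy()
exog["Intercept"] = 1  # explicit intercept column, as in the method above

model = sm.MixedLM(endog=toy["y"], exog=exog,
                   groups=toy["group"],
                   exog_re=exog["Intercept"])  # random intercept per group
print(model.fit().summary())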
Code example #2
def run_mm(trunc_data, out_data_array, exog_vars, groupVar, i):
	print(i)
	try:
		# endog, exog, and groups are passed positionally; keep the residuals
		out_data_array = sm.MixedLM(trunc_data, exog_vars, groupVar).fit().resid
	except ValueError:
		print("Error %d" % i)
		# fall back to a zero vector with one entry per observation
		out_data_array = np.zeros(len(exog_vars))
	return out_data_array
Code example #3
def run_one_lmm(genotypes, phenotypes, groups):
    try:
        intercept = np.ones(genotypes.size)
        genotypes = genotypes.copy()
        x = np.stack([intercept, genotypes], axis=1)
        # Return p-value for genotype coefficient
        return sm.MixedLM(phenotypes, x, groups).fit().pvalues[1]
    except np.linalg.LinAlgError:
        # Could not fit model, return NaN
        return float('nan')
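A usage sketch with hypothetical arrays (assumed shapes, not from the snippet's source project): map the function over the columns of a genotype matrix to get one p-value per variant.

import numpy as np

rng = np.random.default_rng(1)
geno_matrix = rng.integers(0, 3, size=(200, 50)).astype(float)  # 200 samples, 50 variants
phenotypes = rng.normal(size=200)
groups = np.repeat(np.arange(10), 20)  # 10 groups of 20 samples

# One p-value per variant; NaN where the fit failed.
pvals = np.array([run_one_lmm(geno_matrix[:, j], phenotypes, groups)
                  for j in range(geno_matrix.shape[1])])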
Code example #4
File: mixedlm.py  Project: yzharold/genetest
    def fit(self, y, X):
        """Fit the model.

        Args:
            y (pandas.DataFrame): The vector of the endogenous variable.
            X (pandas.DataFrame): The matrix of exogenous variables.

        """
        # Retrieving the data
        y, X, groups = self._prepare_data(y, X)

        # Creating the MixedLM model from StatsModels and fitting it
        model = sm.MixedLM(y, X, groups)
        try:
            fitted = model.fit(reml=self._reml)
        except np.linalg.LinAlgError as e:
            raise StatsError(str(e))

        parameters = fitted.params.index

        # Results about the model fit
        out = {
            "MODEL": {
                "log_likelihood": fitted.llf,
                "nobs": X.shape[0],
                "random_effects": self._format_re(fitted.random_effects),
            },
        }

        # Getting the confidence intervals
        conf_ints = fitted.conf_int()

        for param in parameters:
            # If GWAS, check that inference could be done on the SNP
            if param == "SNPs" and np.isnan(fitted.pvalues[param]):
                raise StatsError("Inference did not converge.")

            out[param] = {
                "coef": fitted.params[param],
                "std_err": fitted.bse[param],
                "lower_ci": conf_ints.loc[param, 0],
                "upper_ci": conf_ints.loc[param, 1],
                "z_value": fitted.tvalues[param],
                "p_value": fitted.pvalues[param],
            }

        return out
Code example #5
def crude_mixedML2(df_merged, x_feature, y_feature, covars):

    #TODO: Replace covars variable with actual selection of individual features

    df_merged = df_merged.replace(-9, np.nan).replace('-9', np.nan).replace(
        999, np.nan).replace(888, np.nan)

    split_covars = covars.split('|')

    print(split_covars)

    data = add_confound(df_merged, x_feature, y_feature, split_covars)

    data['intercept'] = 1

    print(data.columns)
    #data = data.select_dtypes(include = ['float','integer'])

    X = data[[x for x in data.columns if x != y_feature and x != 'CohortType']]

    Y = data[y_feature]

    if X.shape[0] <= 2:
        return 'error'

    reg = sm.MixedLM(Y,
                     X,
                     groups=data["CohortType"],
                     exog_re=X["intercept"]).fit()
    ret = reg.summary()

    fit_string = y_feature + '~'

    for x in X.columns:
        fit_string += ' + ' + str(x)

    fit_string = fit_string.replace('~ +', '~') + ' + (1|CohortType)'
    header = '<div> <b> Linear Mixed Model with Random Intercept </b> </div>'
    header += '<div> <b> Number of samples: </b> ' + str(X.shape[0]) + '</div>'
    header += '<div> <b> Model: </b> ' + fit_string + '</div>'
    header += '<div> <b> Group: </b> CohortType </div>'

    htmls = header + ret.tables[0].to_html() + ret.tables[1].to_html()
    return htmls
Code example #6
def run_per_voxel(df, from_regress, labels):
    y_predicted_all = np.zeros((df.shape[0], ))
    kf = KFold(n_splits=5, shuffle=True)
    data = pd.concat([df, from_regress], axis=1)
    data = data.dropna()
    indices = list(data.index)
    # reset valid indices
    from_regress = from_regress.loc[indices, ].reset_index(drop=True)
    df = df.loc[indices, ].reset_index(drop=True)

    for train_index, test_index in kf.split(df):

        # prepare data
        training_X = from_regress.loc[train_index, ].reset_index(drop=True)
        training_y = df.loc[train_index, ]['activations'].reset_index(
            drop=True)
        training_y_groups = df.loc[
            train_index, ]['subject_number'].reset_index(drop=True)

        testing_X = from_regress.loc[test_index, ].reset_index(drop=True)
        testing_y = df.loc[test_index, ]['activations'].reset_index(drop=True)
        testing_y_groups = df.loc[test_index, ]['subject_number'].reset_index(
            drop=True)

        md = sm.MixedLM(endog=training_y,
                        exog=training_X,
                        groups=training_y_groups,
                        exog_re=training_X)
        mdf = md.fit()
        print(mdf.summary())

        # predict on the held-out fold (fixed-effects-only predictions)
        y_hat_test = mdf.predict(testing_X)
        print("PREDICTION")
        print(y_hat_test[:10])
        y_predicted_all[test_index] = y_hat_test
    y_true = df['activations']
    print("PREDICTED SHAPE")
    print(y_predicted_all.shape)
    print(y_predicted_all[:10])
    print("TRUE SHAPE")
    print(y_true.shape)
    print(y_true[:10])
    rmse = np.sqrt(np.mean((y_predicted_all - y_true) ** 2))
    print("RMSE: " + str(rmse))
    return rmse.astype(np.float32)
Code example #7
def processData(df_original, reg_mlme=True):
    """
    Description: core processing of the data. It's divided in two main steps:
    step 1, apply VAR to the fixed effects wrt to each actor, step 2, apply LMEM
    to whole dataset and learn 5 different models for each of the labels 
    Input: dataframe transformed with the whole history
    Output: dataframe with the forecast for each participant.
    """
    df_flat = df_original.reset_index()
    # Actors definition
    # count participants and remove user 10
    actors = df_flat.actorId.unique().tolist()
    if 10 in actors: actors.remove(10)  # remove user no.10 (insufficient info)

    # Attributes definition
    #categoricals = ['MainActivity','lat','lng','weatherId']
    activities = list(df_original.iloc[:, 19:].columns.values)
    random_effects = activities + ['Steps']
    fixed_effects = [
        'pressure', 'temp', 'humidity', 'hr_min', 'hr_avc', 'hr_mean',
        'hr_std', 'hr_max', 'timeframe'
    ]
    labels = ['Abilities', 'Challenge', 'Productivity', 'Stress', 'Flow']

    # DataFrames to remember min and max for each user
    target_min = pd.DataFrame(np.nan, index=actors, columns=labels)
    target_max = pd.DataFrame(np.nan, index=actors, columns=labels)
    target_mean = pd.DataFrame(np.nan, index=actors, columns=labels)
    target_std = pd.DataFrame(np.nan, index=actors, columns=labels)

    # STEP 1) VAR on fixed effects
    #-------------------------------
    window = 5  # Windows to predict
    df_future = pd.DataFrame()  #prepare the future dataframe
    var_attributes = [
        item for item in fixed_effects if item not in ['timeframe']
    ]
    for user in actors:
        print "7.1 ----- Forecasting actor ARLearn" + str(user)
        df_user = df_original.xs(user, level='actorId')
        df = df_user[var_attributes]
        VARres = VARprocess(df)
        forecasts = VARforecast(df, VARres, window)
        #plt = forecasts.plot() # prediction plot
        #plt.axvline(forecasts.index[-window])
        forecasts['actorId'] = user
        forecasts['timeframe'] = forecasts.index.hour
        df_future = pd.concat([df_future, forecasts])
        # offtopic, add max and min
        for target in labels:
            target_min[target][user] = min(df_user[target])
            target_max[target][user] = max(df_user[target])
            target_mean[target][user] = df_user[target].mean()
            target_std[target][user] = df_user[target].std()
    # add intercept term
    df_future['Intercept'] = 1
    df_future = df_future.reset_index().set_index(['index',
                                                   'actorId']).sort_index()
    # ------------------------------- end VAR

    # STEP 2) Linear Mixed Effect Model
    # -------------------------------
    data = df_flat
    data['intercept'] = 1  # set the intercept term
    LMEM_models = []  # create a list of models, for multi output
    exog = data[fixed_effects +
                ['intercept']]  # the attributes from which to predict
    exog_re = data[random_effects]  # random effects
    groups = data['actorId']  # group definition

    # Training phase: one model per label
    for target in labels:
        endog = data[target]  # endogenous, ie the values we want to predict
        if ((reg_mlme == False)
                and os.path.exists('model_' + target + '.pickle')):
            LMEM_results = pickle.load(
                open('model_' + target + '.pickle', 'rb'))
            LMEM_models.append(LMEM_results)
        else:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                LMEM_model = sm.MixedLM(endog,
                                        exog,
                                        groups=groups,
                                        exog_re=exog_re)
                LMEM_results = LMEM_model.fit()
                LMEM_results.save('model_' + target + '.pickle',
                                  remove_data=False)
                LMEM_models.append(LMEM_results)

    # Coefficients importance averaging
    coeff = pd.DataFrame(index=range(0, len(exog.T)),
                         data={
                             'coefficients': 0.0
                         },
                         dtype='float').coefficients
    for i in range(0, len(coeff)):
        for j in range(0, len(LMEM_models)):
            coeff[i] = coeff[i] + LMEM_models[j].fe_params[i]
    coeff = coeff / len(LMEM_models)

    # Test phase for each of the five models
    df = df_future.reset_index()
    exog = df[fixed_effects].copy()
    exog['intercept'] = 1
    for i in range(0, len(labels)):
        t = labels[i]
        df[t] = LMEM_models[i].predict(exog)
        # normalization
        for u in actors:
            actual = df[df['actorId'] == u][t]
            rindex = df[df['actorId'] == u][t].index
            # Normalization (x_max-x_min)*(x_i/100)+x_min
            df.loc[rindex,
                   t] = (target_max[t][u] -
                         target_min[t][u]) * (actual / 100) + target_min[t][u]
        df[t] = df[t].astype('int')
    df = df.rename(columns={'index': 'timestamp'})
    return df
Code example #8
# This is one of the example data sets provided in the LMER R library.
# The outcome variable is the size of the tree, and the covariate used here
# is a time value.  The data are grouped by tree.

data = sm.datasets.get_rdataset("Sitka", "MASS").data
endog = data["size"]
data["Intercept"] = 1
exog = data[["Intercept", "Time"]]

# Here is the statsmodels LME fit for a basic model with a random
# intercept.  We are passing the endog and exog data directly to the LME
# init function as arrays.  Also note that exog_re is specified explicitly
# in argument 4 as a random intercept (although this would also be the
# default if it were not specified).

md = sm.MixedLM(endog, exog, groups=data["tree"], exog_re=exog["Intercept"])
mdf = md.fit()
print(mdf.summary())
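# For comparison, the same model can also be written with the statsmodels
# formula interface (an equivalent spelling, shown here for reference):

import statsmodels.formula.api as smf

mdf_f = smf.mixedlm("size ~ Time", data, groups=data["tree"]).fit()
print(mdf_f.summary())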

# Here is the same model fit in R using LMER:

# ```ipython
# %R
# data(Sitka, package="MASS")
# print(summary(lmer("size ~ Time + (1 | tree)", data=Sitka)))
# ```

# ```
# Linear mixed model fit by REML ['lmerMod']
# Formula: size ~ Time + (1 | tree)
#    Data: Sitka
Code example #9
def causal_simulation(path,start_date,f_start_date,datafile="dataset_full.csv",govpolicyfile="gov_dates_mandates.csv", num_date_omit=0, print_graph=True):
    data = pd.read_csv(path+"/"+datafile)
    
    start_dt = datetime.strptime(start_date, '%m/%d/%y').strftime('%Y-%m-%d')
    print(start_dt)
      
    dateval = pd.date_range(start_dt, periods=horizon+180).tolist()  # 'horizon' is assumed to be a module-level global
    dates = pd.DataFrame({'dateval': dateval})
    dates['dateval'] = dates['dateval'].apply(lambda x: datetime.strptime(str(x),'%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d') )
    data['dateval'] = data['date'].apply(lambda x: datetime.strptime(str(x),'%Y%m%d' ).strftime('%Y-%m-%d') )
    if num_date_omit > 0:
        temp_start_date = datetime.strptime(f_start_date, '%m/%d/%y') - timedelta(days=30)
        temp_start_date = int(temp_start_date.strftime('%Y%m%d'))
        print(temp_start_date)
        
        temp = data.loc[(data['confirmed'].isna())&(data['date']>temp_start_date), ['location_name','confirmed','dateval']].sort_values(by='dateval').reset_index(drop=True)
        temp_start_date = temp.loc[0, 'dateval']
        temp_start_training_date = (datetime.strptime(temp_start_date, '%Y-%m-%d') - timedelta(days=num_date_omit)).strftime('%Y-%m-%d')
        print(temp_start_training_date)
        temp_start_simulation_date = (datetime.strptime(temp_start_date, '%Y-%m-%d') - timedelta(days=num_date_omit-1)).strftime('%Y-%m-%d')
     
    data['state'] = data['province_state']
   
    data = psql.sqldf("""
    select province_state , country_region as country, date, confirmed,
       recovered, death, population, TAVG/10 as TAVG,
     a1.location_name, a1.dateval, country_region ||'-'||state as state,
    case when TAVG<=0 then 1 else 0 end as is_freezing,
    case when TAVG>0 and TAVG/10<20 then 1 else 0 end as is_cold, 
    case when TAVG>=20 and TAVG/10<35 then 1 else 0 end as is_warm,
    case when TAVG>=35 then 1 else 0 end as is_hot,
    case when TAVG>=20 then 1 else 0 end as temp_th,
     
      case when julianday(a1.dateval)>julianday('2020-03-20') then 1 else 0 end
       as gov_action
    from data a1   
    """).drop_duplicates()

   
    data['Intercept'] = 1.0
    data = data[(data['dateval']>=start_dt)]
    data['holdout'] = np.where((data['dateval']>=datetime.strptime(f_start_date, '%m/%d/%y').strftime('%Y-%m-%d')),1,0)

    print(data)
    
    data_save = data.copy()

    # data smoothing to correct irregular data issues: like dropped cumulative values
    data1 = pd.DataFrame()
     
    z = 0
    
    for state in data['state'].drop_duplicates():
            dat = data[(data['state']==state)].sort_values(by=['dateval'])
            if len(dat['dateval'])>1:
                dat = dat.fillna(0)
                dat = dat.loc[dat['confirmed'].ne(0.0).idxmax():]
                rho_data = dat[dat['confirmed']>0].sort_values(by=['dateval'])
                rho_data = rho_data[0:30]
                
                zz1 = 0.0
                zz2 = 0.0
                zz3 = 0.0
                for s in range(len(rho_data['dateval'])):
                    if (s>0):
                        if rho_data['confirmed'].values[s]-rho_data['confirmed'].values[s-1]>0:
                            zz1 = zz1 + rho_data['confirmed'].values[s]-rho_data['confirmed'].values[s-1]
                        if rho_data['recovered'].values[s]-rho_data['recovered'].values[s-1]>0:
                            zz2 = zz2 + rho_data['recovered'].values[s]-rho_data['recovered'].values[s-1]
                        if rho_data['death'].values[s]-rho_data['death'].values[s-1]>0:
                            zz3  = zz3 + rho_data['death'].values[s]-rho_data['death'].values[s-1]
                rho = 0.0
                if (zz2+zz3) >0.0:
                    rho = (zz1+zz2+zz3)/(zz2+zz3) 
                print("R_0 for "+state +" : "+str(rho))
                dat['lag_confirmed'] = 0.0
                dat['lag_recovered'] = 0.0
                dat['lag_death'] = 0.0
                dat['lag_removed'] = 0.0

                dat['d_recovered'] = 0.0
                dat['d_death'] = 0.0
                dat['d_removed'] = 0.0
                  
                dat['R_0'] = rho
                dat['removed'] = dat['death'] + dat['recovered']
                
                for t in range(len(dat)):
                    
                    if t > 0:
                        dat['lag_confirmed'].values[t] = dat['confirmed'].values[t-1]
                        dat['lag_removed'].values[t] = dat['removed'].values[t-1]
                        dat['lag_recovered'].values[t] = dat['recovered'].values[t-1]
                        dat['lag_death'].values[t] = dat['death'].values[t-1] 
                        dat['d_recovered'].values[t] =  dat['recovered'].values[t] - dat['recovered'].values[t-1] 
                        dat['d_death'].values[t] = dat['death'].values[t]-dat['death'].values[t-1]
                        dat['d_removed'].values[t] = dat['removed'].values[t]-dat['removed'].values[t-1]

                data1 = pd.concat([data1, dat], ignore_index=True)
                z = z +1
                         

    data = data1.copy()
    rb = np.mean(data[data['R_0']>0]['R_0'])
    data['R_0'] = np.where(data['R_0']==0,rb,data['R_0'])
    data = data.fillna(0)   
    data.to_csv("output/simulation_output/input_data.csv")
    if num_date_omit > 0:
        data_train = data[(data['removed']>0) & ((data['holdout']==0) | (data['dateval']<=temp_start_training_date))][['dateval','Intercept','state','TAVG','gov_action','is_freezing','is_cold','is_warm','is_hot','lag_confirmed','lag_death','lag_recovered','d_death','d_recovered','d_removed','removed']]
        print(temp_start_training_date)
        print(max(data_train['dateval']))
    else:
        data_train = data[(data['removed']>0) & (data['holdout']==0)][['dateval','Intercept','state','TAVG','gov_action','is_freezing','is_cold','is_warm','is_hot','lag_confirmed','lag_death','lag_recovered','d_death','d_recovered','d_removed','removed']]
    
    endog =data_train['d_removed']
    exog = data_train[[ 'Intercept','gov_action','TAVG','lag_confirmed']]
    model = sm.MixedLM(endog, exog, exog_re=exog[[ 'Intercept','lag_confirmed']],  groups=data_train["state"])
    po_results = model.fit()
    print(po_results.summary())

    # Get Coefficient
 
    k = []
    v1 = []
    v2 = []
    v3 = []
    d = po_results.random_effects
    
    for i in d:
        # d[i] is a pandas Series of random-effect estimates for group i
        vals = d[i].values
        v1 = v1 + [d[i].index[0]]
        v2 = v2 + [float(vals[0])]
        v3 = v3 + [float(vals[1]) if len(vals) > 1 else 0.0]
        k = k + [i]
        
    r_combined = pd.DataFrame({'state':k,'coef_name':v1,'coef_value':v2,'re_lag_confirmed':v3})
    
    r_combined['fe_Intercept'] = po_results.fe_params['Intercept']

    r_combined['Intercept'] = r_combined['fe_Intercept']+r_combined['coef_value']
   
    r_combined['lag_confirmed'] = po_results.fe_params['lag_confirmed'] +r_combined["re_lag_confirmed"]
    r_combined['gov_action'] = po_results.fe_params['gov_action']
    r_combined['TAVG'] = po_results.fe_params['TAVG']
    
    r_combined = r_combined.fillna(0.0)
    r_combined.to_csv("output/simulation_output/recover_coefs.csv")
    
    mean_beta = np.mean(r_combined[r_combined['lag_confirmed']>0]['lag_confirmed'])
    r_combined['lag_confirmed'] = np.where(r_combined['lag_confirmed']<0,mean_beta,r_combined['lag_confirmed'])
    
    # Get Prediction and Bias

    t_dat = generate_dataset(data, r_combined)
    if num_date_omit > 0:
        t_dat = t_dat[(t_dat['removed']>0) & ((t_dat['holdout']==0) | (t_dat['dateval']<=temp_start_training_date))]
    else:
        t_dat = t_dat[(t_dat['removed']>0) & (t_dat['holdout']==0)]

    pred_on_train = runSimulator(data1=t_dat,
    coefsdfR=r_combined,
    sir_names=['susceptible','confirmed','death','removed'],
    xnamesr=['Intercept','gov_action','TAVG','lag_confirmed'],
    horizon1=60, date_gov_adjust=0, print_graph=print_graph)

    # == Adjust Prediction of Removed with Bias

    pred_on_train['bias_removed'] = pred_on_train['pred_removed'] - pred_on_train['removed']
    # pred_on_train.to_csv('output/simulation_output/pred_on_train.csv')
    mean_bias = pred_on_train.groupby('location_name')['bias_removed'].mean().reset_index()
    mean_bias.to_csv('output/simulation_output/bias.csv')

    loc_list = set(pred_on_train['location_name'])
    for loc in loc_list:
        bias = mean_bias.loc[mean_bias['location_name']==loc, 'bias_removed'].iloc[0]
        if bias > 0:
            pred_on_train.loc[pred_on_train['location_name']==loc, 'pred_removed'] = pred_on_train.loc[pred_on_train['location_name']==loc, 'pred_removed']-bias
        elif bias < 0:
            pred_on_train.loc[pred_on_train['location_name']==loc, 'pred_removed'] = pred_on_train.loc[pred_on_train['location_name']==loc, 'pred_removed']+bias

        loc_name = "".join(c for c in loc if c.isalnum())
        temp_for_plot = pred_on_train.loc[pred_on_train['location_name']==loc, ['dateval', 'removed', 'pred_removed']]
        plt.figure(figsize=(12,12))
        fig = temp_for_plot.plot(x='dateval', y=['removed', 'pred_removed'], rot=45, ax=plt.gca()).get_figure()
        fig.savefig(os.path.join('output/covid_plot/actual_pred/covid_plot_compare_'+loc_name+'.png'))
        fig.clf()

    pred_on_train['pred_removed'] = np.where(pred_on_train['pred_removed']<0, 0, pred_on_train['pred_removed'])


    pred_on_train.to_csv('output/simulation_output/adjusted_pred_on_train.csv')

    print(pred_on_train)

    # Adjust R-Combined with Bias

    mean_bias2 = mean_bias.copy()
    r_combined2 = r_combined.copy()
    mean_bias2['location_name'] = mean_bias2['location_name'].str.replace('[^a-zA-Z]', '', regex=True)
    r_combined2['state'] = r_combined2['state'].str.replace('[^a-zA-Z]', '', regex=True)
    loc_list = set(mean_bias2['location_name'])
    for loc in loc_list:
        bias = mean_bias2.loc[mean_bias2['location_name']==loc, 'bias_removed'].iloc[0]
        if bias > 0:
            r_combined2.loc[r_combined2['state']==loc, 'Intercept'] = r_combined2.loc[r_combined2['state']==loc, 'Intercept']-bias
        elif bias < 0:
            r_combined2.loc[r_combined2['state']==loc, 'Intercept'] = r_combined2.loc[r_combined2['state']==loc, 'Intercept']+bias

    r_combined = r_combined2


    tti = 0
     
    states = data['state'].drop_duplicates().str.replace('[^a-zA-Z]', '', regex=True)
    data2 = data.copy()
    data2['state'] = data2['state'].str.replace('[^a-zA-Z]', '', regex=True)
    data3 = pd.DataFrame()
    for s in states:
        rc = r_combined[r_combined['state']==s]
        #print(rc)
        dat = data2[data2['state']==s]
         
        if len(dat)>0 and len(rc)>0:
            print(s)
            dat = dat.sort_values(by=['dateval'])
            #dat = dat.reset_index()
            beta = rc['lag_confirmed'].values[0]
            N = dat['population'].values[0]
            dat['susceptible'] = np.where(dat['holdout']==0, N+0.0,0.0)
            
            alpha = dat['R_0'].values[0]*beta
            dat['alpha'] = alpha

            
            if tti==0:
               data3 = dat
            else:
                data3 = pd.concat([data3, dat], ignore_index=True)
            tti = tti +1     

    print(data3)
    print(data3['state'].drop_duplicates())
     
    # data3.to_csv(path+"/before_sim_data_test.csv")
    #runDynamicSimulator 
    #runSimulator(data1,coefsdfR,sir_names,xnamesr,horizon1)
    date_start_sim = 20200510

    if num_date_omit > 0:
        sim_data = data3[(data3['holdout']==1)&(data3['dateval']>=temp_start_simulation_date)]
        print(temp_start_simulation_date)
        print(min(sim_data['dateval']))
    else:
        sim_data = data3[(data3['holdout']==1)]
    
    sim_data_output_after = runSimulator(data1=sim_data,
    coefsdfR=r_combined,
    sir_names=['susceptible','confirmed','death','removed'],
    xnamesr=['Intercept','gov_action','TAVG','lag_confirmed'],
    horizon1=60, date_gov_adjust=date_start_sim, print_graph=print_graph)
    sim_data_output_after.to_csv("output/simulation_output/simulations_after_adjust_at_"+str(date_start_sim)+"_omitlastD_"+str(num_date_omit)+".csv")

    sim_data_output_before = runSimulator(data1=sim_data,
    coefsdfR=r_combined,
    sir_names=['susceptible','confirmed','death','removed'],
    xnamesr=['Intercept','gov_action','TAVG','lag_confirmed'],
    horizon1=60, date_gov_adjust=0, print_graph=print_graph)
    sim_data_output_before.to_csv("output/simulation_output/simulations_before_adjust_at_"+str(date_start_sim)+"_omitlastD_"+str(num_date_omit)+".csv")

    sim_data_compare = sim_data_output_after.merge(sim_data_output_before, on=['index', 'province_state', 'country','date','dateval','location_name'], suffixes=('_after', '_before'))
    sim_data_compare['diff_susceptible'] = sim_data_compare['pred_susceptible_after'] - sim_data_compare['pred_susceptible_before']
    sim_data_compare['diff_confirmed'] = sim_data_compare['pred_confirmed_after'] - sim_data_compare['pred_confirmed_before']
    sim_data_compare['diff_removed'] = sim_data_compare['pred_removed_after'] - sim_data_compare['pred_removed_before']

    sim_data_compare = sim_data_compare.loc[:, ['province_state', 'country','date','dateval','location_name','pred_susceptible_after', 'pred_confirmed_after', 'pred_removed_after', 'pred_susceptible_before', 'pred_confirmed_before', 'pred_removed_before', 'diff_susceptible', 'diff_confirmed', 'diff_removed']]
    sim_data_compare.to_csv("output/simulation_output/simulations_compare"+str(date_start_sim)+"_omitlastD_"+str(num_date_omit)+".csv")

    if print_graph:
        for location in sim_data_compare['location_name'].drop_duplicates():
            dat = sim_data_compare[(sim_data_compare['location_name']==location)].sort_values(by=['date'])
            # plot results
            plt.figure(1)
            plt.figure(figsize=(15,10))
            xtick_locator = AutoDateLocator()
            xtick_formatter = AutoDateFormatter(xtick_locator)
            date_list = pd.to_datetime(dat['dateval'])
            ax = plt.axes()
            ax.xaxis.set_major_locator(xtick_locator)
            ax.xaxis.set_major_formatter(xtick_formatter)
            plt.plot(date_list,dat['diff_susceptible'],'b-')
            plt.plot(date_list,dat['diff_confirmed'],'r--')
            plt.plot(date_list,dat['diff_removed'],'g--')
            plt.xlabel('Time')
            plt.ylabel('Populations')
            plt.title('Compare Before/After Gov. Intervention Adjust at '+''.join(e for e in location if e.isalnum())+' : '+str(date_start_sim))
            plt.legend(['Diff Susceptibles','Diff Confirmed','Diff Removed'])
            plt.savefig(os.path.join('output/covid_plot/covid_plot_compare_'+"omitlastD_"+str(num_date_omit)+''.join(e for e in location if e.isalnum())+'_'+str(date_start_sim)+'.png'))
            plt.clf()
            plt.close()
Code example #10
def run(opts):

    indexCol = opts.indexcolumns[0]

    # read csv(s)
    num_csv = len(opts.inputcsv)
    pdCSV = pd.read_csv(opts.inputcsv[0], delimiter=',', index_col=[indexCol])
    if num_csv > 1:
        for i in range(int(num_csv - 1)):
            tempCSV = pd.read_csv(opts.inputcsv[int(i + 1)],
                                  delimiter=',',
                                  index_col=[indexCol])
            pdCSV = pd.concat([pdCSV, tempCSV],
                              axis=1).reindex(pdCSV.index)

    # Interaction Variables
    if opts.interactionvars:
        for int_terms in opts.interactionvars:
            interaction_vars = int_terms.split("*")
            for scale_var in interaction_vars:
                var_temp = scalearr(pdCSV[scale_var])
                var_tempname = '%s_std' % scale_var
                if var_tempname in opts.exogenousvariables:
                    pass
                else:
                    pdCSV[var_tempname] = var_temp
                    opts.exogenousvariables.append(var_tempname)
        for int_terms in opts.interactionvars:
            interaction_vars = int_terms.split("*")
            for i, scale_var in enumerate(interaction_vars):
                if i == 0:
                    int_temp = pdCSV['%s_std' % scale_var]
                    int_tempname = '%s' % scale_var
                else:
                    int_temp = int_temp * pdCSV['%s_std' % scale_var]
                    int_tempname = int_tempname + '.X.' + scale_var
            if int_tempname in opts.exogenousvariables:
                pass
            else:
                pdCSV[int_tempname] = int_temp
                opts.exogenousvariables.append(int_tempname)
            int_temp = None
        print(opts.exogenousvariables)

    # output column/variable names.
    if opts.outputcolumnnames:
        for counter, roi in enumerate(pdCSV.columns):
            print("[%d] : %s" % (counter, roi))
        quit()

    # set grouping variables
    if opts.groupingvariable:
        if len(opts.groupingvariable) > 1:
            pdCSV = russiandolls(opts.groupingvariable, pdCSV)
            groupVar = 'group_list'
        else:
            groupVar = opts.groupingvariable[0]

    # stats functions

    if opts.outstats:
        if not opts.statsmodel:
            print("A statistical model must be specificed. -m {model}")
            quit()
        if not opts.range:
            print("Range must be specfied. -r {start} {stop}")
            quit()
        elif len(opts.range) != 2:
            print("Range must have start and stop. -r {start} {stop}")
            quit()
        else:
            roi_names = []
            t_values = []
            p_values = []
            icc_values = []
            if not opts.exogenousvariables:
                print(
                    "The exogenous (independent) variables must be specified, e.g., -exog pred1 pred2 age"
                )
                quit()

            if opts.mediation:
                medvars = ['%s' % opts.mediation[1], '%s' % opts.mediation[2]]
                exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV)
                # build null array
                pdCSV = omitmissing(pdDF=pdCSV,
                                    endog_range=opts.range,
                                    exogenous=strip_ones(exog_vars),
                                    groups=medvars)
                if opts.statsmodel == 'mixedmodel' or opts.statsmodel == 'mm':
                    pdCSV = omitmissing(pdDF=pdCSV,
                                        endog_range=opts.range,
                                        groups=opts.groupingvariable)
                # rebuild exog_vars with correct length
                exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                            opts.scaleexog == True)
                leftvar = pdCSV[opts.mediation[1]]
                rightvar = pdCSV[opts.mediation[2]]
                y = pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1]

                if opts.statsmodel == 'mixedmodel' or opts.statsmodel == 'mm':
                    t_valuesA = []
                    t_valuesB = []
                    ################ MM mediation ################
                    if opts.mediation[0] == 'I':
                        EXOG_A = sm.add_constant(
                            np.column_stack((leftvar, strip_ones(exog_vars))))
                        EXOG_B = np.column_stack((leftvar, rightvar))
                        EXOG_B = sm.add_constant(
                            np.column_stack((EXOG_B, strip_ones(exog_vars))))
                        #pathA
                        for i in range(int(opts.range[0]),
                                       int(opts.range[1]) + 1):
                            mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]],
                                                 EXOG_A,
                                                 pdCSV[groupVar]).fit()
                            roi_names.append(pdCSV.columns[i])
                            t_valuesA.append(mdl_fit.tvalues[1])
                        #pathB
                        for i in range(int(opts.range[0]),
                                       int(opts.range[1]) + 1):
                            mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]],
                                                 EXOG_B,
                                                 pdCSV[groupVar]).fit()
                            t_valuesB.append(mdl_fit.tvalues[1])
                    elif opts.mediation[0] == 'M':
                        EXOG_A = sm.add_constant(
                            np.column_stack((leftvar, strip_ones(exog_vars))))
                        EXOG_B = np.column_stack((rightvar, leftvar))
                        EXOG_B = sm.add_constant(
                            np.column_stack((EXOG_B, strip_ones(exog_vars))))
                        #pathA
                        for i in range(int(opts.range[0]),
                                       int(opts.range[1]) + 1):
                            mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]],
                                                 EXOG_A,
                                                 pdCSV[groupVar]).fit()
                            roi_names.append(pdCSV.columns[i])
                            t_valuesA.append(mdl_fit.tvalues[1])
                        #pathB
                        for i in range(int(opts.range[0]),
                                       int(opts.range[1]) + 1):
                            mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]],
                                                 EXOG_B,
                                                 pdCSV[groupVar]).fit()
                            t_valuesB.append(mdl_fit.tvalues[1])
                    else:
                        EXOG_A = sm.add_constant(
                            np.column_stack((leftvar, strip_ones(exog_vars))))
                        EXOG_B = np.column_stack((rightvar, leftvar))
                        EXOG_B = sm.add_constant(
                            np.column_stack((EXOG_B, strip_ones(exog_vars))))

                        #pathA
                        mdl_fit = sm.MixedLM(rightvar, EXOG_A,
                                             pdCSV[groupVar]).fit()
                        t_valuesA = mdl_fit.tvalues[1]

                        #pathB
                        for i in range(int(opts.range[0]),
                                       int(opts.range[1]) + 1):
                            mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]],
                                                 exog_vars,
                                                 pdCSV[groupVar]).fit()
                            roi_names.append(pdCSV.columns[i])
                            t_valuesB.append(mdl_fit.tvalues[1])

                    z_values = special_calc_sobelz(np.array(t_valuesA),
                                                   np.array(t_valuesB),
                                                   alg="aroian")
                    p_values = norm.sf(abs(z_values))
                    p_FDR = multipletests(p_values, method='fdr_bh')[1]

                else:
                    ################ LM mediation ################
                    if opts.mediation[0] == 'I':
                        EXOG_A = sm.add_constant(
                            np.column_stack((leftvar, strip_ones(exog_vars))))
                        EXOG_B = np.column_stack((leftvar, rightvar))
                        EXOG_B = sm.add_constant(
                            np.column_stack((EXOG_B, strip_ones(exog_vars))))

                        y = pdCSV.iloc[:,
                                       int(opts.range[0]):int(opts.range[1]) +
                                       1]
                        #pathA
                        t_valuesA = full_glm_results(y,
                                                     EXOG_A,
                                                     only_tvals=True)[1, :]
                        #pathB
                        t_valuesB = full_glm_results(y,
                                                     EXOG_B,
                                                     only_tvals=True)[1, :]

                    elif opts.mediation[0] == 'M':
                        EXOG_A = sm.add_constant(
                            np.column_stack((leftvar, strip_ones(exog_vars))))
                        EXOG_B = np.column_stack((rightvar, leftvar))
                        EXOG_B = sm.add_constant(
                            np.column_stack((EXOG_B, strip_ones(exog_vars))))

                        y = pdCSV.iloc[:,
                                       int(opts.range[0]):int(opts.range[1]) +
                                       1]
                        #pathA
                        t_valuesA = full_glm_results(y,
                                                     EXOG_A,
                                                     only_tvals=True)[1, :]
                        #pathB
                        t_valuesB = full_glm_results(y,
                                                     EXOG_B,
                                                     only_tvals=True)[1, :]

                    elif opts.mediation[0] == 'Y':
                        EXOG_A = sm.add_constant(
                            np.column_stack((leftvar, strip_ones(exog_vars))))
                        EXOG_B = np.column_stack((rightvar, leftvar))
                        EXOG_B = sm.add_constant(
                            np.column_stack((EXOG_B, strip_ones(exog_vars))))

                        y = pdCSV.iloc[:,
                                       int(opts.range[0]):int(opts.range[1]) +
                                       1]
                        #pathA
                        t_valuesA = sm.OLS(rightvar, EXOG_A).fit().tvalues[1]
                        #pathB
                        t_valuesB = full_glm_results(y,
                                                     EXOG_B,
                                                     only_tvals=True)[1, :]

                    else:
                        print("Error: Invalid mediation type.")
                        quit()
                    z_values = special_calc_sobelz(np.array(t_valuesA),
                                                   np.array(t_valuesB),
                                                   alg="aroian")
                    p_values = norm.sf(abs(z_values))
                    p_FDR = multipletests(p_values, method='fdr_bh')[1]

                    if opts.permutation:
                        if opts.groupingvariable:
                            p_FWER = run_permutations_med(
                                endog_arr=y,
                                exog_vars=exog_vars,
                                medtype=opts.mediation[0],
                                leftvar=leftvar,
                                rightvar=rightvar,
                                num_perm=int(opts.permutation[0]),
                                stat_arr=z_values,
                                uniq_groups=pdCSV[groupVar],
                                return_permutations=True)
                        else:
                            p_FWER = run_permutations_med(
                                endog_arr=y,
                                exog_vars=exog_vars,
                                medtype=opts.mediation[0],
                                leftvar=leftvar,
                                rightvar=rightvar,
                                num_perm=int(opts.permutation[0]),
                                stat_arr=z_values,
                                uniq_groups=None,
                                return_permutations=True)

                    roi_names = []
                    for i in range(int(opts.range[0]),
                                   int(opts.range[1]) + 1):
                        roi_names.append(pdCSV.columns[i])

                columnnames = []
                columnnames.append('Zval')
                columnnames.append('pval')
                columnnames.append('pFDR')
                columndata = np.column_stack((z_values, p_values))
                columndata = np.column_stack((columndata, p_FDR))
                if opts.permutation:
                    columnnames.append('pFWER')
                    columndata = np.column_stack((columndata, p_FWER))
                pd_DF = pd.DataFrame(data=columndata,
                                     index=roi_names,
                                     columns=columnnames)
                pd_DF.to_csv(opts.outstats[0], index_label='ROI')

            else:
                ################ MIXED MODEL ################
                if opts.statsmodel == 'mixedmodel' or opts.statsmodel == 'mm':
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV)

                    # build null array
                    pdCSV = omitmissing(pdDF=pdCSV,
                                        endog_range=opts.range,
                                        exogenous=strip_ones(exog_vars),
                                        groups=opts.groupingvariable)
                    # rebuild exog_vars with correct length
                    if opts.scaleexogwithingroup:
                        exog_vars = create_exog_mat(
                            opts.exogenousvariables,
                            pdCSV,
                            opts.scaleexog == True,
                            scale_groups=pdCSV[groupVar])
                    else:
                        exog_vars = create_exog_mat(opts.exogenousvariables,
                                                    pdCSV,
                                                    opts.scaleexog == True)

                    exog_re = None
                    if opts.exogintercept:
                        exog_re = dmatrix("1+%s" % opts.exogintercept[0],
                                          pdCSV)

                    for i in range(int(opts.range[0]),
                                   int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(endog=pdCSV[pdCSV.columns[i]],
                                             exog=exog_vars,
                                             groups=pdCSV[groupVar],
                                             exog_re=exog_re).fit()
                        roi_names.append(pdCSV.columns[i])
                        t_values.append(mdl_fit.tvalues[1:])
                        p_values.append(mdl_fit.pvalues[1:])
                        icc_values.append(
                            np.array(mdl_fit.cov_re /
                                     (mdl_fit.cov_re + mdl_fit.scale)))
                        if opts.plotresids:
                            os.system('mkdir -p resid_plots')
                            plot_residuals(
                                residual=mdl_fit.resid,
                                fitted=mdl_fit.fittedvalues,
                                basename=('%s_mm_%s' %
                                          (str(i).zfill(4), pdCSV.columns[i])),
                                outdir='resid_plots/')
                    p_values = np.array(p_values)
                    t_values = np.array(t_values)
                    p_FDR = np.zeros_like(p_values)

                    p_values[np.isnan(p_values)] = 1
                    for col in range(p_FDR.shape[1]):
                        p_FDR[:, col] = multipletests(p_values[:, col],
                                                      method='fdr_bh')[1]

                    columnnames = []
                    for colname in opts.exogenousvariables:
                        columnnames.append('tval_%s' % colname)
                    if opts.exogintercept:
                        columnnames.append('tval_re1')
                        columnnames.append('tval_re1Xre2')
                        columnnames.append('tval_re2')
                    else:
                        columnnames.append('tval_groupRE')

                    for colname in opts.exogenousvariables:
                        columnnames.append('pval_%s' % colname)
                    if opts.exogintercept:
                        columnnames.append('pval_re1')
                        columnnames.append('pval_re1Xre2')
                        columnnames.append('pval_re2')
                    else:
                        columnnames.append('pval_groupRE')

                    for colname in opts.exogenousvariables:
                        columnnames.append('pFDR_%s' % colname)
                    if opts.exogintercept:
                        columnnames.append('pFDR_re1')
                        columnnames.append('pFDR_re1Xre2')
                        columnnames.append('pFDR_re2')
                    else:
                        columnnames.append('pFDR_groupRE')

                    if not opts.exogintercept:
                        columnnames.append('ICC_groupRE')
                    columndata = np.column_stack((t_values, p_values))
                    columndata = np.column_stack((columndata, p_FDR))
                    if not opts.exogintercept:
                        columndata = np.column_stack(
                            (columndata, np.array(icc_values).flatten()))
                    pd_DF = pd.DataFrame(data=columndata,
                                         index=roi_names,
                                         columns=columnnames)
                    pd_DF.to_csv(opts.outstats[0], index_label='ROI')
                else:
                    ################ LINEAR MODEL ################
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV)
                    # build null array
                    pdCSV = omitmissing(pdDF=pdCSV,
                                        endog_range=opts.range,
                                        exogenous=strip_ones(exog_vars))
                    # rebuild exog_vars with correct length
                    if opts.scaleexogwithingroup:
                        exog_vars = create_exog_mat(
                            opts.exogenousvariables,
                            pdCSV,
                            opts.scaleexog == True,
                            scale_groups=pdCSV[groupVar])
                    else:
                        exog_vars = create_exog_mat(opts.exogenousvariables,
                                                    pdCSV,
                                                    opts.scaleexog == True)
                    y = np.array(
                        pdCSV.iloc[:,
                                   int(opts.range[0]):int(opts.range[1]) + 1])

                    if opts.plotresids:
                        f_values, t_values, p_values, R2, R2_adj, resids, fitted = full_glm_results(
                            y, exog_vars, return_resids=True)
                    else:
                        np.savetxt('temp_int.csv',
                                   orthog_columns(strip_ones(exog_vars)),
                                   delimiter=',')
                        f_values, t_values, p_values, R2, R2_adj = full_glm_results(
                            y, exog_vars)

                    if opts.permutation:
                        if opts.groupingvariable:
                            p_FWER = run_permutations(
                                endog_arr=y,
                                exog_vars=exog_vars,
                                num_perm=int(opts.permutation[0]),
                                stat_arr=t_values,
                                uniq_groups=pdCSV[groupVar],
                                return_permutations=True)
                        else:
                            p_FWER = run_permutations(endog_arr=y,
                                                      exog_vars=exog_vars,
                                                      num_perm=int(
                                                          opts.permutation[0]),
                                                      stat_arr=t_values,
                                                      uniq_groups=None,
                                                      return_permutations=True)
                        p_FWER = p_FWER[1:, :].T

                    t_values = t_values[1:, :].T  # ignore intercept
                    p_values = p_values[1:, :].T  # ignore intercept

                    roi_names = []
                    for i in range(int(opts.range[0]),
                                   int(opts.range[1]) + 1):
                        roi_names.append(pdCSV.columns[i])

                    p_FDR = np.zeros_like(p_values)
                    p_values[np.isnan(p_values)] = 1
                    for col in range(p_FDR.shape[1]):
                        p_FDR[:, col] = multipletests(p_values[:, col],
                                                      method='fdr_bh')[1]

                    columnnames = []
                    columnnames.append('Fvalue')
                    columnnames.append('R2')
                    columnnames.append('R2adj')
                    for colname in opts.exogenousvariables:
                        columnnames.append('tval_%s' % colname)
                    for colname in opts.exogenousvariables:
                        columnnames.append('pval_%s' % colname)
                    for colname in opts.exogenousvariables:
                        columnnames.append('pFDR_%s' % colname)

                    columndata = np.column_stack((f_values[:, np.newaxis], R2))
                    columndata = np.column_stack((columndata, R2_adj))
                    columndata = np.column_stack((columndata, t_values))
                    columndata = np.column_stack((columndata, p_values))
                    columndata = np.column_stack((columndata, p_FDR))
                    if opts.permutation:
                        for colname in opts.exogenousvariables:
                            columnnames.append('pFWER_%s' % colname)
                        columndata = np.column_stack((columndata, p_FWER))
                    pd_DF = pd.DataFrame(data=columndata,
                                         index=roi_names,
                                         columns=columnnames)
                    pd_DF.to_csv(opts.outstats[0], index_label='ROI')

                    if opts.plotresids:
                        os.system('mkdir -p resid_plots')
                        for i, roi in enumerate(np.array(roi_names)):
                            plot_residuals(
                                residual=resids[:, i],
                                fitted=fitted[:, i],
                                basename=(
                                    '%s_lm_%s' %
                                    (str(i + int(opts.range[0])).zfill(4),
                                     roi)),
                                outdir='resid_plots/')

    if opts.savecsv:
        pdCSV.to_csv(opts.savecsv[0])
Code example #11
vc = {'x': '0 + x'}

md = smf.mixedlm("y ~ 1 + x", test_df, groups = test_df["unit"],
                 vc_formula = vc,
                 re_formula = "~ 1") # random intercept
        
mdf = md.fit()
mdf.summary()
mdf.scale

reffs = mdf.random_effects
smf_b_i = [reffs[i][1] for i in range(1, 31)]  # assumes units are labelled 1..30
smf_beta_i = mdf.params['x'] + smf_b_i

md2 = sm.MixedLM(test_df["y"], test_df[["Intercept", "x"]],
                groups = test_df["unit"],
                exog_re = test_df[["Intercept", "x"]])

mdf2 = md2.fit()
mdf2.summary()

# Stage2: Trying to get it to work with keras

enc = OneHotEncoder()
enc.fit(test_df['unit'].values.reshape(-1, 1))
unit_onehot = enc.transform(test_df['unit'].values.reshape(-1, 1))

unit_onehot = unit_onehot.toarray() # random intercepts design
unit_x = np.dot(np.diag(test_df['x']), unit_onehot) # random coefs design

n_units = unit_onehot.shape[1]
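# A possible sanity check before the keras stage (a sketch that treats the
# per-unit effects as fixed effects, so it ignores the mixed model's
# shrinkage and only approximates the estimates above):

X_full = np.column_stack([np.ones(len(test_df)),  # global intercept
                          test_df['x'].values,    # global slope
                          unit_onehot[:, 1:],     # per-unit intercept offsets
                          unit_x[:, 1:]])         # per-unit slope offsets
coef, *_ = np.linalg.lstsq(X_full, test_df['y'].values, rcond=None)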
Code example #12
    return mat, colnames


# Then we set up the variance components using the VCSpec class.

vcm = df.groupby("group1").apply(f).to_list()
mats = [x[0] for x in vcm]
colnames = [x[1] for x in vcm]
names = ["group2"]
vcs = VCSpec(names, [colnames], [mats])

# Finally we fit the model.  It can be seen that the results of the
# two fits are identical.

oo = np.ones(df.shape[0])
model2 = sm.MixedLM(df.y, oo, exog_re=oo, groups=df.group1, exog_vc=vcs)
result2 = model2.fit()
print(result2.summary())

# ## Crossed analysis

# In a crossed analysis, the levels of one group can occur in any
# combination with the levels of another group.  The groups in
# Statsmodels MixedLM are always nested, but it is possible to fit a
# crossed model by having only one group, and specifying all random
# effects as variance components.  Many, but not all crossed models
# can be fit in this way.  The function below generates a crossed data
# set with two levels of random structure.


def generate_crossed(n_group1=100,
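# As a minimal illustration of the crossed pattern described above (a
# sketch with made-up data, not part of the original example): keep a
# single group and move both factors into variance components through
# the formula interface.

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
n = 400
dc = pd.DataFrame({
    "a": rng.integers(0, 10, n),  # first crossed factor
    "b": rng.integers(0, 10, n),  # second crossed factor
})
dc["y"] = (rng.normal(size=10)[dc["a"].values]
           + rng.normal(size=10)[dc["b"].values]
           + rng.normal(size=n))
dc["one"] = 1  # a single group containing every observation

vcf = {"a": "0 + C(a)", "b": "0 + C(b)"}
result = smf.mixedlm("y ~ 1", dc, groups="one", vc_formula=vcf).fit()
print(result.summary())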
Code example #13
csvfile = '..\\TreeNobXdate.csv'
col = 0  #Use first column from file
SurveyYear = 2005
MaxYear = 1980
MinYear = 1875

#Generate an age-frequency.  Oldest-ages first
data = GetAgeFreq(csvfile,
                  col=col,
                  SurveyYear=SurveyYear,
                  MaxYear=MaxYear,
                  MinYear=MinYear)
nyear = len(data)

#Add a column of ones to represent the constant
yb = np.column_stack([np.ones(nyear, dtype=int), np.arange(nyear)])

glm = sm.GLM(data,
             yb,
             family=sm.families.Poisson())  # log is the default Poisson link
print(glm.fit().summary())
print('#############')
groups = [t for t in range(nyear)]
# Note: sm.MixedLM fits a linear (Gaussian) mixed model and does not accept
# family/links arguments.  Also, with one observation per group the random
# intercept is confounded with the residual, so this fit is illustrative only.
# For a Poisson mixed model, see the sketch after this example.
MLM = sm.MixedLM(data,
                 yb,
                 groups=groups)
print(MLM.fit().summary())
print('#############')
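# For an actual Poisson mixed model, statsmodels offers Bayesian mixed
# GLMs; a minimal sketch (df_counts and the column names 'count', 't',
# 'year' are hypothetical, not the data above):

import statsmodels.genmod.bayes_mixed_glm as bmg

model = bmg.PoissonBayesMixedGLM.from_formula(
    "count ~ 1 + t",          # fixed effects
    {"year": "0 + C(year)"},  # variance components (the random effects)
    df_counts)
result = model.fit_vb()       # variational Bayes fit
print(result.summary())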
Code example #14
def mixed_linear_modeling(df, x='bin', bic_diff=10, df_sims=None, colors=None):

    fig = plt.figure(figsize=(1.1 * len(df['variable'].unique()), 1.5))
    plt_nr = 1

    for param in df['variable'].unique():

        data = df.loc[df['variable'] == param, :].copy()

        ax = fig.add_subplot(1, len(df['variable'].unique()), plt_nr)

        # sns.barplot(x='variable', y='value', hue='bin', units='subj_idx', palette='Reds', ci=None, data=df)
        # sns.barplot(x='variable', y='value', hue='bin', units='subj_idx', palette='Reds', ci=66, data=df)
        kwargs = {
            'linewidths': 0,
            'markeredgewidth': 0.5,
            'markeredgecolor': 'black',
            'ecolor': 'black'
        }
        if ('level' in data.columns) and (x != 'level'):
            sns.pointplot(x=x,
                          y='value',
                          hue='level',
                          units='subj_idx',
                          join=False,
                          ci=66,
                          scale=0.50,
                          errwidth=1,
                          palette='Greys',
                          data=data,
                          zorder=1,
                          **kwargs)
        else:
            sns.pointplot(x=x,
                          y='value',
                          units='subj_idx',
                          join=False,
                          ci=66,
                          scale=0.66,
                          errwidth=1,
                          color='grey',
                          data=data,
                          zorder=1,
                          **kwargs)
        # sns.stripplot(x='variable', y='value', hue='bin', color='grey', size=2, jitter=False, dodge=True, data=df)
        # locs = np.sort(np.array([p.get_x() + p.get_width() / 2. for p in ax.patches]))

        if param == 'rt':
            plt.ylim(data['value'].mean() - 0.1, data['value'].mean() + 0.1)

        if len(data[x].unique()) > 2:
            # variables:
            data['intercept'] = 1
            data.loc[:, '{}_^2'.format(x)] = np.array(data.loc[:, x]**2)

            # # zscore:
            # for subj in data['subj_idx'].unique():
            #     ind = data['subj_idx']==subj
            #     data.loc[ind,x] = (data.loc[ind,x] - data.loc[ind,x].mean()) / data.loc[ind,x].std()
            #     data.loc[ind,'{}_^2'.format(x)] = (data.loc[ind,'{}_^2'.format(x)]  - data.loc[ind,'{}_^2'.format(x)].mean()) / data.loc[ind,'{}_^2'.format(x)].std()

            endog = data.loc[:, 'value'].astype(float)
            if ('level' in data.columns) and (x != 'level'):
                exog1 = data.loc[:, ['intercept', 'level', x]].astype(float)
                exog2 = data.loc[:,
                                 ['intercept', 'level', x, '{}_^2'.
                                  format(x)]].astype(float)
            else:
                exog1 = data.loc[:, ['intercept', x]].astype(float)
                exog2 = data.loc[:,
                                 ['intercept', x, '{}_^2'.format(x)]].astype(
                                     float)

            # comparison:
            try:
                md1 = sm.MixedLM(endog,
                                 exog1,
                                 data.loc[:, 'subj_idx'],
                                 exog_re=exog1)
                mdf1 = md1.fit(reml=False)
                md2 = sm.MixedLM(endog,
                                 exog2,
                                 data.loc[:, 'subj_idx'],
                                 exog_re=exog2)
                mdf2 = md2.fit(reml=False)
                if mdf1.converged & mdf2.converged:
                    random = True
                else:
                    md1 = sm.MixedLM(
                        endog,
                        exog1,
                        data.loc[:, 'subj_idx'],
                    )
                    mdf1 = md1.fit(reml=False)
                    md2 = sm.MixedLM(
                        endog,
                        exog2,
                        data.loc[:, 'subj_idx'],
                    )
                    mdf2 = md2.fit(reml=False)
                    random = False
                if (mdf1.bic - mdf2.bic) > bic_diff:
                    exog = exog2.copy()
                else:
                    exog = exog1.copy()

                # refit with reml:
                if random:
                    mdf = sm.MixedLM(endog,
                                     exog,
                                     groups=data.loc[:, 'subj_idx'],
                                     exog_re=exog).fit()
                else:
                    mdf = sm.MixedLM(endog,
                                     exog,
                                     groups=data.loc[:, 'subj_idx']).fit()
                print(mdf.summary())
                xx = np.sort(np.array([p.get_data()[0][0] for p in ax.lines]))
                if ('level' in data.columns) and (x != 'level'):
                    if (mdf1.bic - mdf2.bic) > bic_diff:
                        yy = np.concatenate([
                            mdf.params['intercept'] +
                            (np.array(exog.groupby('level').mean().index) *
                             mdf.params['level']) + (b * mdf.params[x]) +
                            ((b**2) * mdf.params['{}_^2'.format(x)])
                            for b in np.array(exog.groupby(x).mean().index)
                        ])
                        plt.title('p = {}\np1 = {}\np2 = {}'.format(
                            round(mdf.pvalues['level'], 3),
                            round(mdf.pvalues[x], 3),
                            round(mdf.pvalues['{}_^2'.format(x)], 3)),
                                  size=6)
                    else:
                        yy = np.concatenate([
                            mdf.params['intercept'] +
                            (np.array(exog.groupby('level').mean().index) *
                             mdf.params['level']) + (b * mdf.params[x])
                            for b in np.array(exog.groupby(x).mean().index)
                        ])
                        plt.title('p = {}\np = {}'.format(
                            round(mdf.pvalues['level'], 3),
                            round(mdf.pvalues[x], 3)),
                                  size=6)
                    n_levels = len(exog.groupby('level').mean().index)
                    for v in exog.groupby('level').mean().index:
                        plt.plot(xx[int(v)::n_levels],
                                 yy[int(v)::n_levels],
                                 lw=1,
                                 color='black')
                else:
                    if (mdf1.bic - mdf2.bic) > bic_diff:
                        yy = mdf.params['intercept'] + (np.array(
                            exog.groupby(x).mean().index) * mdf.params[x]) + (
                                (np.array(exog.groupby(x).mean().index)**2) *
                                mdf.params['{}_^2'.format(x)])
                        plt.title('p1 = {}\np2 = {}'.format(
                            round(mdf.pvalues[x], 3),
                            round(mdf.pvalues['{}_^2'.format(x)], 3)),
                                  size=6)
                    else:
                        yy = mdf.params['intercept'] + (np.array(
                            exog.groupby(x).mean().index) * mdf.params[x])
                        plt.title('p = {}'.format(round(mdf.pvalues[x], 3)),
                                  size=6)
                    plt.plot(xx, yy, lw=1, color='black')
            except Exception:
                # model fitting failed or did not converge; skip the fit line
                pass
        else:

            t, p = sp.stats.ttest_rel(data.loc[data[x] == 0, 'value'],
                                      data.loc[data[x] == 1, 'value'])
            plt.title('p = {}'.format(round(p, 3)), size=6)

        if df_sims is not None:
            if ('level' in data.columns) and (x != 'level'):
                for df_sim, color in zip(df_sims, colors):
                    # use the color paired with each simulation dataframe
                    sns.pointplot(
                        x=x,
                        y='value',
                        hue='level',
                        palette=[
                            color for _ in range(len(data['level'].unique()))
                        ],
                        join=False,
                        ci=None,
                        markers='x',
                        scale=0.66,
                        data=df_sim.loc[df['variable'] == param, :],
                        zorder=100)
            else:
                for df_sim, color in zip(df_sims, colors):
                    sns.pointplot(x=x,
                                  y='value',
                                  color=color,
                                  join=False,
                                  ci=None,
                                  markers='x',
                                  scale=0.66,
                                  data=df_sim.loc[df['variable'] == param, :],
                                  zorder=100)
        try:
            plt.gca().get_legend().remove()
        except AttributeError:
            pass  # no legend on this axis

        plt.xticks(ax.get_xticks(), list(np.array(ax.get_xticks(), dtype=int)))
        plt.ylabel(param)

        plt_nr += 1

    sns.despine(offset=2, trim=True)
    plt.tight_layout()
    return fig
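
# Hypothetical usage (assumed names): df_long in long format with columns
# 'variable', 'value', 'subj_idx' and a 'bin' column.
# fig = mixed_linear_modeling(df_long, x='bin', bic_diff=10)
# fig.savefig('mlm_panels.pdf')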
Code Example #15
# for ind in xrange(y_test.shape[0]):
#     # print ind, fake_id, (ind+1) % NUM_FRAMES
#     X_test_fake_song_ids[ind] = fake_id
#     if (ind+1) % NUM_FRAMES == 0:
#         fake_id += 1

### add column of ones to data to account for the bias:
# X_train = add_intercept(X_train)
# print X_train.shape
# print X_train[0:10]

# Fit regression model
# cf http://statsmodels.sourceforge.net/devel/mixed_linear.html
# md = smf.mixedlm(y_train, X_train, groups=X_train_fake_song_ids)
# md = sm.MixedLM(y_train, X_train, exog_re=X_train_fake_time, groups=X_train_fake_song_ids, use_sqrt=True)
md = sm.MixedLM(y_train, X_train, groups=X_train_fake_song_ids, use_sqrt=True)
mdf = md.fit()

print(mdf.summary())

# X_test = add_intercept(X_test)
pred = mdf.predict(X_test)  # whole-set prediction (overwritten below)
# print pred

pred = list()
# predict each song separately and append predictions
for ind_song in range(nb_test_song):
    deb = ind_song * NUM_FRAMES
    fin = deb + NUM_FRAMES
    pred_song = mdf.predict(X_test[deb:fin, :])
    pred.append(pred_song)
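
# Presumably the per-song predictions would then be stacked, e.g.:
# pred = np.concatenate(pred)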